### Feature Engineering for Deep Learning


In [1]:
import pyspark
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [2]:
spark = (
    SparkSession.builder.appName("concrec-rank")
    .config("spark.driver.memory", "11g")
    .getOrCreate()
)

24/04/29 23:06:21 WARN Utils: Your hostname, Jiarongs-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.12.14 instead (on interface en0)
24/04/29 23:06:21 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/04/29 23:06:21 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [32]:
anime_df = spark.read.csv(
    "../anime-data/parsed_anime.csv", header=True, inferSchema=True
)

In [33]:
anime_df.printSchema()

root
 |-- anime_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- genre: string (nullable = true)
 |-- type: string (nullable = true)
 |-- episodes: string (nullable = true)
 |-- rating: double (nullable = true)
 |-- members: integer (nullable = true)
 |-- japanese_title: string (nullable = true)
 |-- aired: string (nullable = true)
 |-- image_url: string (nullable = true)
 |-- aired_from: string (nullable = true)
 |-- aired_to: integer (nullable = true)



In [34]:
# cast aired_from into int
from pyspark.sql.types import IntegerType

anime_df = anime_df.withColumn("aired_from", col("aired_from").cast("int"))

In [35]:
anime_df.printSchema()

root
 |-- anime_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- genre: string (nullable = true)
 |-- type: string (nullable = true)
 |-- episodes: string (nullable = true)
 |-- rating: double (nullable = true)
 |-- members: integer (nullable = true)
 |-- japanese_title: string (nullable = true)
 |-- aired: string (nullable = true)
 |-- image_url: string (nullable = true)
 |-- aired_from: integer (nullable = true)
 |-- aired_to: integer (nullable = true)



In [36]:
anime_df.show(5)

+--------+--------------------+--------------------+-----+--------+------+-------+--------------------------+--------------------+--------------------+----------+----------+
|anime_id|                name|               genre| type|episodes|rating|members|            japanese_title|               aired|           image_url|aired_from|  aired_to|
+--------+--------------------+--------------------+-----+--------+------+-------+--------------------------+--------------------+--------------------+----------+----------+
|   32281|      Kimi no Na wa.|Drama, Romance, S...|Movie|       1|  9.37| 200630|                君の名は。|        Aug 26, 2016|https://cdn.myani...|1472140800|1472140800|
|    5114|Fullmetal Alchemi...|Action, Adventure...|   TV|      64|  9.26| 793665|鋼の錬金術師 FULLMETAL ...|Apr 5, 2009 to Ju...|https://cdn.myani...|1238860800|1278172800|
|   28977|            Gintama°|Action, Comedy, H...|   TV|      51|  9.25| 114262|                     銀魂°|Apr 8, 2015 to Ma...|https://cdn.m

In [37]:
rating_df = spark.read.csv("../anime-data/rating.csv", header=True, inferSchema=True)

                                                                                

In [38]:
rating_df.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- anime_id: integer (nullable = true)
 |-- rating: integer (nullable = true)



In [39]:
# valid rating only
rating_df = rating_df.filter(rating_df["rating"] > 0)

In [40]:
rating_df.show(5)

+-------+--------+------+
|user_id|anime_id|rating|
+-------+--------+------+
|      1|    8074|    10|
|      1|   11617|    10|
|      1|   11757|    10|
|      1|   15451|    10|
|      2|   11771|    10|
+-------+--------+------+
only showing top 5 rows



## Merge rating with anime


In [41]:
merged_df = rating_df.join(
    anime_df.select(
        "anime_id",
        "name",
        "genre",
        "type",
        "episodes",
        "rating",
        "members",
        "aired_from",
        "aired_to",
    ).withColumnRenamed("rating", "all_rating"),
    on=["anime_id"],
    how="left",
)

In [42]:
merged_df.show(5)

+--------+-------+------+--------------------+--------------------+----+--------+----------+-------+----------+----------+
|anime_id|user_id|rating|                name|               genre|type|episodes|all_rating|members|aired_from|  aired_to|
+--------+-------+------+--------------------+--------------------+----+--------+----------+-------+----------+----------+
|    8074|      1|    10|Highschool of the...|Action, Ecchi, Ho...|  TV|      12|      7.46| 535892|1278259200|1284912000|
|   11617|      1|    10|     High School DxD|Comedy, Demons, E...|  TV|      12|       7.7| 398660|1325779200|1332432000|
|   11757|      1|    10|    Sword Art Online|Action, Adventure...|  TV|      25|      7.83| 893100|1341676800|1356192000|
|   15451|      1|    10| High School DxD New|Action, Comedy, D...|  TV|      12|      7.87| 266657|1373126400|1379779200|
|   11771|      2|    10|    Kuroko no Basket|Comedy, School, S...|  TV|      25|      8.46| 338315|1333814400|1348243200|
+--------+------

## Build Label

In [43]:
like_threshold = 7.5

def build_label(df):
    return df.withColumn("label", when(col("rating") >= like_threshold, 1).otherwise(0))

In [44]:
labeled_df = build_label(merged_df)
labeled_df.show(5)

+--------+-------+------+--------------------+--------------------+----+--------+----------+-------+----------+----------+-----+
|anime_id|user_id|rating|                name|               genre|type|episodes|all_rating|members|aired_from|  aired_to|label|
+--------+-------+------+--------------------+--------------------+----+--------+----------+-------+----------+----------+-----+
|    8074|      1|    10|Highschool of the...|Action, Ecchi, Ho...|  TV|      12|      7.46| 535892|1278259200|1284912000|    1|
|   11617|      1|    10|     High School DxD|Comedy, Demons, E...|  TV|      12|       7.7| 398660|1325779200|1332432000|    1|
|   11757|      1|    10|    Sword Art Online|Action, Adventure...|  TV|      25|      7.83| 893100|1341676800|1356192000|    1|
|   15451|      1|    10| High School DxD New|Action, Comedy, D...|  TV|      12|      7.87| 266657|1373126400|1379779200|    1|
|   11771|      2|    10|    Kuroko no Basket|Comedy, School, S...|  TV|      25|      8.46| 3383

## Sliding Window

这里要在df的每个row上，额外增加和用户相关的信息。比如该用户最爱的电影类型、该用户看过多少电影、平均打分是多少
为了防止泄露未来信息，需把所有评分按照时间顺序排序，然后用滑动窗口聚合
理论应该使用评分时间，但是由于没有这个数据，所以采用电影上映时间

In [45]:
from pyspark.sql.window import Window
from pyspark.sql import functions as F
import pyspark.sql.types as types

In [46]:
windowSpec = Window.partitionBy("user_id").orderBy("aired_from").rowsBetween(-100, -1)

In [49]:
# 帮助方法：对于某一列，在聚合的时候，如果用户不喜欢这个电影，则不聚合这个电影的信息
likedMoviesCol = lambda cname: when(col("label") == 1, col(cname)).otherwise(lit(None))

@udf(types.ArrayType(types.StringType()))
def most_liked_genres(gen_strs):
    """
    gen_strs = ["Action, Adventure, Drama", "Comedy, Drama, School"]
    """
    gens = [s.split(",") for s in gen_strs]
    gens = [x for l in gens for x in l]  # flatten
    gens = [s.strip() for s in gens]

    gen_set = set(gens)
    count_occur = lambda gen, l: len([g for g in l if g == gen])
    gen_with_occur = [(gen, count_occur(gen, gens)) for gen in gen_set]
    gen_with_occur.sort(key=lambda x: x[1], reverse=True)

    # pick 3 most liked genres
    return [x[0] for x in gen_with_occur[:5]]

In [50]:
NUMBER_PRECISION = 2

feat_df = (
    labeled_df.withColumn("user_rating_cnt", count(lit(1)).over(windowSpec))
    .withColumn("user_rating_ave", mean(col("rating")).over(windowSpec))
    .withColumn("user_rating_ave", F.round(col("user_rating_ave"), NUMBER_PRECISION))
    .withColumn("user_rating_std", stddev(col("rating")).over(windowSpec))
    .withColumn("user_rating_std", F.round(col("user_rating_std"), NUMBER_PRECISION))
    .withColumn(
        "user_aired_from_ave", mean(likedMoviesCol("aired_from")).over(windowSpec)
    )
    .withColumn("user_aired_from_ave", F.round(col("user_aired_from_ave"), 0))
    .withColumn("user_aired_to_ave", mean(likedMoviesCol("aired_to")).over(windowSpec))
    .withColumn("user_aired_to_ave", F.round(col("user_aired_to_ave"), 0))
    .withColumn(
        "user_liked_genres",
        most_liked_genres(collect_list(likedMoviesCol("genre")).over(windowSpec)),
    )
)

In [51]:
feat_df.printSchema()

root
 |-- anime_id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- rating: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- genre: string (nullable = true)
 |-- type: string (nullable = true)
 |-- episodes: string (nullable = true)
 |-- all_rating: double (nullable = true)
 |-- members: integer (nullable = true)
 |-- aired_from: integer (nullable = true)
 |-- aired_to: integer (nullable = true)
 |-- label: integer (nullable = false)
 |-- user_rating_cnt: long (nullable = false)
 |-- user_rating_ave: double (nullable = true)
 |-- user_rating_std: double (nullable = true)
 |-- user_aired_from_ave: double (nullable = true)
 |-- user_aired_to_ave: double (nullable = true)
 |-- user_liked_genres: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [52]:
feat_df.select(
    "anime_id",
    "user_id",
    "rating",
    #                'user_rating_cnt', 'user_rating_ave', 'user_rating_std',
    #                'user_aired_from_ave', 'user_aired_to_ave'
    "genre",
    "user_liked_genres",
).head(10)

                                                                                

[Row(anime_id=392, user_id=28, rating=10, genre='Action, Comedy, Demons, Fantasy, Martial Arts, School, Shounen', user_liked_genres=[]),
 Row(anime_id=20, user_id=28, rating=9, genre='Action, Comedy, Martial Arts, Shounen, Super Power', user_liked_genres=['Martial Arts', 'Shounen', 'School', 'Demons', 'Action']),
 Row(anime_id=226, user_id=28, rating=9, genre='Action, Drama, Horror, Psychological, Romance, Seinen, Supernatural', user_liked_genres=['Martial Arts', 'Shounen', 'Action', 'Comedy', 'School']),
 Row(anime_id=4224, user_id=28, rating=10, genre='Comedy, Romance, School, Slice of Life', user_liked_genres=['Action', 'Martial Arts', 'Shounen', 'Comedy', 'School']),
 Row(anime_id=7054, user_id=28, rating=10, genre='Comedy, Romance, School, Shoujo', user_liked_genres=['Action', 'Comedy', 'Martial Arts', 'Shounen', 'School']),
 Row(anime_id=6547, user_id=28, rating=10, genre='Action, Comedy, Drama, School, Supernatural', user_liked_genres=['Comedy', 'School', 'Action', 'Romance', 'M

## Encoding

将数值型和分类型特征分布进行encode表达


1. Genres: multi-hot


In [53]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder
import pyspark.sql.types as types
from pyspark.ml.linalg import SparseVector
import numpy as np

In [54]:
# 1. parse genre to list
@udf(returnType="array<string>")
def genre_to_list(gen_str):
    if gen_str is None:
        return []

    gens = gen_str.split(",")
    return [gen.strip() for gen in gens]


genres_df = feat_df.withColumn("genres", genre_to_list(col("genre"))).drop("genre")

In [55]:
genres_df.head(5)

                                                                                

[Row(anime_id=392, user_id=28, rating=10, name='Yuu☆Yuu☆Hakusho', type='TV', episodes='112', all_rating=8.47, members=195017, aired_from=718646400, aired_to=789408000, label=1, user_rating_cnt=0, user_rating_ave=None, user_rating_std=None, user_aired_from_ave=None, user_aired_to_ave=None, user_liked_genres=[], genres=['Action', 'Comedy', 'Demons', 'Fantasy', 'Martial Arts', 'School', 'Shounen']),
 Row(anime_id=20, user_id=28, rating=9, name='Naruto', type='TV', episodes='220', all_rating=7.81, members=683297, aired_from=1033574400, aired_to=1170864000, label=1, user_rating_cnt=1, user_rating_ave=10.0, user_rating_std=None, user_aired_from_ave=718646400.0, user_aired_to_ave=789408000.0, user_liked_genres=['Martial Arts', 'Shounen', 'School', 'Demons', 'Action'], genres=['Action', 'Comedy', 'Martial Arts', 'Shounen', 'Super Power']),
 Row(anime_id=226, user_id=28, rating=9, name='Elfen Lied', type='TV', episodes='13', all_rating=7.85, members=623511, aired_from=1090684800, aired_to=10979

In [56]:
feat_df.printSchema()

root
 |-- anime_id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- rating: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- genre: string (nullable = true)
 |-- type: string (nullable = true)
 |-- episodes: string (nullable = true)
 |-- all_rating: double (nullable = true)
 |-- members: integer (nullable = true)
 |-- aired_from: integer (nullable = true)
 |-- aired_to: integer (nullable = true)
 |-- label: integer (nullable = false)
 |-- user_rating_cnt: long (nullable = false)
 |-- user_rating_ave: double (nullable = true)
 |-- user_rating_std: double (nullable = true)
 |-- user_aired_from_ave: double (nullable = true)
 |-- user_aired_to_ave: double (nullable = true)
 |-- user_liked_genres: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [57]:
def encode_genres_col(index_mapping_broadcasted):
    @udf(returnType="array<int>")
    def encode_genres_col(genres, max_genre_index):
        """
        用已经训练好的string index mapping对genres数组进行encode
        """
        if genres is None:
            genres = []
        gen_vec = [index_mapping_broadcasted.value.get(gen) for gen in genres]
        gen_vec = list(set(gen_vec))  # dedup

        # convert genre vector to multi-hot
        fill = np.ones(len(gen_vec), dtype=np.int32)
        sorted_index = np.sort(gen_vec)
        multihot_vec = SparseVector(max_genre_index + 1, sorted_index, fill)
        return multihot_vec.toArray().astype(np.int32).tolist()

    return encode_genres_col


def multi_hot_encode_genres(featdf):
    df = featdf.withColumn("genre_item", explode(col("genres")))

    genre_string_indexer = StringIndexer(inputCol="genre_item", outputCol="genre_index")
    indexer_model = genre_string_indexer.fit(df)

    # get mapping from string indexer
    gens_df = spark.createDataFrame([{"genre_item": g} for g in indexer_model.labels])
    mapping_df = indexer_model.transform(gens_df).collect()
    mapping_dict = {row.genre_item: int(row.genre_index) for row in mapping_df}
    max_genre_index = __builtin__.max(mapping_dict.values())
    broadcasted = spark.sparkContext.broadcast(mapping_dict)

    encode_fn = encode_genres_col(broadcasted)

    return featdf.withColumn(
        "genres_multihot", encode_fn(col("genres"), lit(max_genre_index))
    ).withColumn(
        "user_liked_genres_multihot",
        encode_fn(col("user_liked_genres"), lit(max_genre_index)),
    )

In [58]:
genre_encoded_df = multi_hot_encode_genres(genres_df)

                                                                                

In [59]:
genre_encoded_df.printSchema()

root
 |-- anime_id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- rating: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- type: string (nullable = true)
 |-- episodes: string (nullable = true)
 |-- all_rating: double (nullable = true)
 |-- members: integer (nullable = true)
 |-- aired_from: integer (nullable = true)
 |-- aired_to: integer (nullable = true)
 |-- label: integer (nullable = false)
 |-- user_rating_cnt: long (nullable = false)
 |-- user_rating_ave: double (nullable = true)
 |-- user_rating_std: double (nullable = true)
 |-- user_aired_from_ave: double (nullable = true)
 |-- user_aired_to_ave: double (nullable = true)
 |-- user_liked_genres: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- genres: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- genres_multihot: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- user_liked_genres_multihot: array (nu

2. min max scaler for numeric features


In [60]:
from pyspark.ml.feature import MinMaxScaler, VectorAssembler
from pyspark.ml import Pipeline

In [61]:
@udf(types.FloatType())
def extract_float(l):
    r = __builtin__.round(l[0], NUMBER_PRECISION)

    return float(r)


def min_max_scale(featdf, col):
    output_col = f"{col}_min_max"
    vec_assembler = VectorAssembler(
        inputCols=[col], outputCol=f"{col}_vec", handleInvalid="keep"
    )
    min_max_scaler = MinMaxScaler(inputCol=f"{col}_vec", outputCol=output_col)
    pipeline = Pipeline(stages=[vec_assembler, min_max_scaler])

    return (
        pipeline.fit(featdf)
        .transform(featdf)
        .drop(f"{col}_vec")
        .withColumn(output_col, extract_float(F.col(output_col)))
    )

In [62]:
scaled_df = genre_encoded_df

In [63]:
scaled_df = min_max_scale(scaled_df, "all_rating")
scaled_df = min_max_scale(scaled_df, "members")
scaled_df = min_max_scale(scaled_df, "aired_from")
scaled_df = min_max_scale(scaled_df, "aired_to")
scaled_df = min_max_scale(scaled_df, "user_rating_ave")
scaled_df = min_max_scale(scaled_df, "user_rating_std")
scaled_df = min_max_scale(scaled_df, "user_aired_from_ave")
scaled_df = min_max_scale(scaled_df, "user_aired_to_ave")

                                                                                

In [64]:
scaled_df.select(
    "anime_id", "user_id", "user_aired_from_ave", "user_aired_from_ave_min_max"
).show(1000)

[Stage 68:====>                                                   (1 + 11) / 12]

+--------+-------+-------------------+---------------------------+
|anime_id|user_id|user_aired_from_ave|user_aired_from_ave_min_max|
+--------+-------+-------------------+---------------------------+
|     392|     28|               NULL|                        NaN|
|      20|     28|         7.186464E8|                       0.76|
|     226|     28|         8.761104E8|                       0.81|
|    4224|     28|         9.476352E8|                       0.83|
|    7054|     28|        1.0164456E9|                       0.86|
|    6547|     28|         1.067184E9|                       0.87|
|    8460|     28|         1.101024E9|                       0.88|
|    9989|     28|      1.128281143E9|                       0.89|
|    9919|     28|        1.1500956E9|                        0.9|
|   10620|     28|        1.1670816E9|                        0.9|
|   11757|     28|        1.1821824E9|                       0.91|
|     523|     31|               NULL|                        

                                                                                

Pick useful features

In [65]:
scaled_df.printSchema()

root
 |-- anime_id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- rating: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- type: string (nullable = true)
 |-- episodes: string (nullable = true)
 |-- all_rating: double (nullable = true)
 |-- members: integer (nullable = true)
 |-- aired_from: integer (nullable = true)
 |-- aired_to: integer (nullable = true)
 |-- label: integer (nullable = false)
 |-- user_rating_cnt: long (nullable = false)
 |-- user_rating_ave: double (nullable = true)
 |-- user_rating_std: double (nullable = true)
 |-- user_aired_from_ave: double (nullable = true)
 |-- user_aired_to_ave: double (nullable = true)
 |-- user_liked_genres: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- genres: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- genres_multihot: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- user_liked_genres_multihot: array (nu

In [66]:
output_df = scaled_df.select(
    "anime_id",
    "user_id",
    "label",
    "all_rating_min_max",
    "members_min_max",
    "aired_from_min_max",
    "aired_to_min_max",
    "genres_multihot",
    "user_rating_ave_min_max",
    "user_rating_std_min_max",
    "user_aired_from_ave_min_max",
    "user_aired_to_ave_min_max",
    "user_liked_genres_multihot",
)

In [67]:
output_df.printSchema()

root
 |-- anime_id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- label: integer (nullable = false)
 |-- all_rating_min_max: float (nullable = true)
 |-- members_min_max: float (nullable = true)
 |-- aired_from_min_max: float (nullable = true)
 |-- aired_to_min_max: float (nullable = true)
 |-- genres_multihot: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- user_rating_ave_min_max: float (nullable = true)
 |-- user_rating_std_min_max: float (nullable = true)
 |-- user_aired_from_ave_min_max: float (nullable = true)
 |-- user_aired_to_ave_min_max: float (nullable = true)
 |-- user_liked_genres_multihot: array (nullable = true)
 |    |-- element: integer (containsNull = true)



Output

In [68]:
output_df.fillna(0).write.mode("overwrite").save("../anime-data/dnn_feat_eng")
#     .format('csv').option("header", "true") \

                                                                                

In [3]:
spark.stop()