# Wide-and-Deep ML: Feature Engineering

In this notebook, we will engineer the features we will use to build the wide-and-deep collaborative filter recommender.

## 1. Feature Preparation

- One-hot encoding the timestamp won't work, since it only records the time at which the rating was given.
- `movieId` is a unique identifier, so it is popped off.

In [1]:
# import modules

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

In [26]:
# load the user-movie interaction table from disk and preview its first rows
raw_path = '../data/user_movie_interaction.csv'
df0 = pd.read_csv(raw_path)
df0.head()

Unnamed: 0.1,Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,0,1,2,3.5,2005-04-02 23:53:47,Jumanji (1995),Adventure|Children|Fantasy
1,1,5,2,3.0,1996-12-25 15:26:09,Jumanji (1995),Adventure|Children|Fantasy
2,2,13,2,3.0,1996-11-27 08:19:02,Jumanji (1995),Adventure|Children|Fantasy
3,3,29,2,3.0,1996-06-23 20:36:14,Jumanji (1995),Adventure|Children|Fantasy
4,4,34,2,3.0,1996-10-28 13:29:44,Jumanji (1995),Adventure|Children|Fantasy


In [3]:
# (number of rows, number of columns) of the table as loaded from the CSV
df0.shape

(71554, 7)

In [27]:
# remove unwanted column
# `Unnamed: 0` just repeats the row number in the preview above -- presumably a
# row index written by an earlier `to_csv` call (TODO confirm).
# `errors='ignore'` keeps this cell re-runnable: a plain `del df0['Unnamed: 0']`
# raises KeyError the second time, once the column is already gone.
df0 = df0.drop(columns=['Unnamed: 0'], errors='ignore')
df0.dtypes

userId         int64
movieId        int64
rating       float64
timestamp     object
title         object
genres        object
dtype: object

In [28]:
# store `genres` with the pandas categorical dtype; all other columns keep
# their current dtypes
df0 = df0.astype({'genres': 'category'})
df0.dtypes

userId          int64
movieId         int64
rating        float64
timestamp      object
title          object
genres       category
dtype: object

In [29]:
# scale rating column
# Divides every rating by MAX_RATING using vectorized column arithmetic instead
# of a per-element Python lambda (same float64 result).
# assumes ratings are on a 0-5 scale so the result lies in [0, 1] -- TODO confirm
# NOTE: the column is overwritten, so re-running this cell on its own divides
# the ratings again; re-run from the data-loading cell instead.
MAX_RATING = 5.0
df0['rating'] = df0['rating'] / MAX_RATING

In [30]:
# order the interactions by user first and, within each user, by rating time
# (both ascending). `timestamp` is still a string column at this point; the
# 'YYYY-MM-DD HH:MM:SS' format shown in the preview sorts chronologically as text.
sort_keys = ['userId', 'timestamp']
df1 = df0.sort_values(by=sort_keys, ascending=True)
display(df1)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
2433,1,924,0.7,2004-09-10 03:06:38,2001: A Space Odyssey (1968),Adventure|Drama|Sci-Fi
2349,1,919,0.7,2004-09-10 03:07:01,"Wizard of Oz, The (1939)",Adventure|Children|Fantasy|Musical
6081,1,2683,0.7,2004-09-10 03:07:30,Austin Powers: The Spy Who Shagged Me (1999),Action|Adventure|Comedy
5253,1,1584,0.7,2004-09-10 03:07:36,Contact (1997),Drama|Sci-Fi
2634,1,1079,0.8,2004-09-10 03:07:45,"Fish Called Wanda, A (1988)",Comedy|Crime
...,...,...,...,...,...,...
3688,500,1200,0.8,2012-05-16 15:34:34,Aliens (1986),Action|Adventure|Horror|Sci-Fi
47936,500,162,0.8,2012-05-16 15:35:08,Crumb (1994),Documentary
63087,500,3095,0.8,2012-05-16 15:35:14,"Grapes of Wrath, The (1940)",Drama
4829,500,1291,0.8,2012-05-16 15:35:20,Indiana Jones and the Last Crusade (1989),Action|Adventure


In [31]:
# timestamp is not useful information
# It was only needed for the sort above, so it is removed from df1.
# `errors='ignore'` keeps this cell re-runnable: a plain `del df1['timestamp']`
# raises KeyError the second time, once the column is already gone.
df1 = df1.drop(columns=['timestamp'], errors='ignore')

In [32]:
# genre relevance
# share of all rows in df1 that carry each distinct `genres` value (the full
# genre combination, computed over the whole table rather than per user)
genre_share = df1['genres'].value_counts(normalize=True)

# look the share up for every row; `.to_numpy()` drops the genre-labelled index
# so the values are assigned positionally
df1['genre_freq'] = genre_share.loc[df1['genres']].to_numpy()

# weight each (already scaled) rating by the share of its genre combination
df1['user_genre_rating'] = df1['rating'] * df1['genre_freq']

In [33]:
# preview the first rows, including the new `genre_freq` and `user_genre_rating` columns
df1.head()

Unnamed: 0,userId,movieId,rating,title,genres,genre_freq,user_genre_rating
2433,1,924,0.7,2001: A Space Odyssey (1968),Adventure|Drama|Sci-Fi,0.004458,0.003121
2349,1,919,0.7,"Wizard of Oz, The (1939)",Adventure|Children|Fantasy|Musical,0.001174,0.000822
6081,1,2683,0.7,Austin Powers: The Spy Who Shagged Me (1999),Action|Adventure|Comedy,0.003158,0.002211
5253,1,1584,0.7,Contact (1997),Drama|Sci-Fi,0.00362,0.002534
2634,1,1079,0.8,"Fish Called Wanda, A (1988)",Comedy|Crime,0.01111,0.008888


In [34]:
# count the missing values in each column (this cell only reports; no rows are dropped)
# NOTE: a bare `df1.dropna()` would not change df1 -- its result has to be
# assigned back (`df1 = df1.dropna()`) if rows ever need to be removed.
df1.isna().sum()

userId               0
movieId              0
rating               0
title                0
genres               0
genre_freq           0
user_genre_rating    0
dtype: int64

In [35]:
# standardize the `genres` column: replace the vertical-bar separators with
# spaces and make every letter lowercase. The column is overwritten in df1.
def _normalize_genres(raw):
    """Return `raw` with every '|' replaced by a space, in lowercase."""
    return raw.replace('|', ' ').lower()


df1['genres'] = df1['genres'].apply(_normalize_genres)
df1.head()

Unnamed: 0,userId,movieId,rating,title,genres,genre_freq,user_genre_rating
2433,1,924,0.7,2001: A Space Odyssey (1968),adventure drama sci-fi,0.004458,0.003121
2349,1,919,0.7,"Wizard of Oz, The (1939)",adventure children fantasy musical,0.001174,0.000822
6081,1,2683,0.7,Austin Powers: The Spy Who Shagged Me (1999),action adventure comedy,0.003158,0.002211
5253,1,1584,0.7,Contact (1997),drama sci-fi,0.00362,0.002534
2634,1,1079,0.8,"Fish Called Wanda, A (1988)",comedy crime,0.01111,0.008888


In [36]:
# split dataset into training, validation and testing subsets
# A fixed seed makes both shuffles -- and therefore the saved CSV files --
# identical on every Restart & Run All.
SPLIT_SEED = 42

# hold out 20% of all rows for testing
train, test = train_test_split(df1, test_size=.2, random_state=SPLIT_SEED)
# then hold out 20% of the remaining rows (16% of all rows) for validation
train, val = train_test_split(train, test_size=.2, random_state=SPLIT_SEED)

# the three subsets together must account for every row of df1
assert len(train) + len(val) + len(test) == len(df1)

# preview shape of datasets
print(f"{len(train)} train examples")
print(f"{len(val)} validation examples")
print(f"{len(test)} test examples")

# save train, test, validation samples to csv
# NOTE: `to_csv` also writes the DataFrame index as an unnamed first column,
# which pandas labels 'Unnamed: 0' when the file is read back.
train.to_csv('../data/user_movie_interaction_train.csv')
val.to_csv('../data/user_movie_interaction_val.csv')
test.to_csv('../data/user_movie_interaction_test.csv')

45794 train examples
11449 validation examples
14311 test examples
