## 1. Importing Required External & Internal Libraries

In [1]:
# Imports
import pandas as pd
import numpy as np
from IPython.core.magic import register_cell_magic

In [2]:
@register_cell_magic
def skip(line, cell):
    return

In [3]:
import sys
sys.path.append('../framework')
from framework import *

## 2. Preparing the Dataset in a Pandas DataFrame

In [4]:
item_train = pd.read_csv('./data/anime-dataset-2023.csv', delimiter=',')
item_train.head()

Unnamed: 0,anime_id,Name,English name,Other name,Score,Genres,Synopsis,Type,Episodes,Aired,...,Studios,Source,Duration,Rating,Rank,Popularity,Favorites,Scored By,Members,Image URL
0,1,Cowboy Bebop,Cowboy Bebop,カウボーイビバップ,8.75,"Action, Award Winning, Sci-Fi","Crime is timeless. By the year 2071, humanity ...",TV,26.0,"Apr 3, 1998 to Apr 24, 1999",...,Sunrise,Original,24 min per ep,R - 17+ (violence & profanity),41.0,43,78525,914193.0,1771505,https://cdn.myanimelist.net/images/anime/4/196...
1,5,Cowboy Bebop: Tengoku no Tobira,Cowboy Bebop: The Movie,カウボーイビバップ 天国の扉,8.38,"Action, Sci-Fi","Another day, another bounty—such is the life o...",Movie,1.0,"Sep 1, 2001",...,Bones,Original,1 hr 55 min,R - 17+ (violence & profanity),189.0,602,1448,206248.0,360978,https://cdn.myanimelist.net/images/anime/1439/...
2,6,Trigun,Trigun,トライガン,8.22,"Action, Adventure, Sci-Fi","Vash the Stampede is the man with a $$60,000,0...",TV,26.0,"Apr 1, 1998 to Sep 30, 1998",...,Madhouse,Manga,24 min per ep,PG-13 - Teens 13 or older,328.0,246,15035,356739.0,727252,https://cdn.myanimelist.net/images/anime/7/203...
3,7,Witch Hunter Robin,Witch Hunter Robin,Witch Hunter ROBIN (ウイッチハンターロビン),7.25,"Action, Drama, Mystery, Supernatural",Robin Sena is a powerful craft user drafted in...,TV,26.0,"Jul 3, 2002 to Dec 25, 2002",...,Sunrise,Original,25 min per ep,PG-13 - Teens 13 or older,2764.0,1795,613,42829.0,111931,https://cdn.myanimelist.net/images/anime/10/19...
4,8,Bouken Ou Beet,Beet the Vandel Buster,冒険王ビィト,6.94,"Adventure, Fantasy, Supernatural",It is the dark century and the people are suff...,TV,52.0,"Sep 30, 2004 to Sep 29, 2005",...,Toei Animation,Manga,23 min per ep,PG - Children,4240.0,5126,14,6413.0,15001,https://cdn.myanimelist.net/images/anime/7/215...


### 2a. Drop labels that contain word values / unneeded features:

In [5]:
item_train = item_train.drop(columns=['English name', 'Other name', 'Synopsis', 'Type', 'Aired', 'Premiered', 'Status', 'Producers', 'Licensors', 'Studios', 'Source', 'Duration', 'Rating', 'Rank', 'Popularity', 'Favorites', 'Members', 'Image URL'])
item_train.head()

Unnamed: 0,anime_id,Name,Score,Genres,Episodes,Scored By
0,1,Cowboy Bebop,8.75,"Action, Award Winning, Sci-Fi",26.0,914193.0
1,5,Cowboy Bebop: Tengoku no Tobira,8.38,"Action, Sci-Fi",1.0,206248.0
2,6,Trigun,8.22,"Action, Adventure, Sci-Fi",26.0,356739.0
3,7,Witch Hunter Robin,7.25,"Action, Drama, Mystery, Supernatural",26.0,42829.0
4,8,Bouken Ou Beet,6.94,"Adventure, Fantasy, Supernatural",52.0,6413.0


### 2b. Extract Unique Genres, Compile into a List of Genres

In [6]:
# Compile all genres into a single list
genres = item_train['Genres'].str.split(', ').explode()

# Get unique genres
genres = genres.unique()

### 2c. One-Hot Encoding for Genres into the DataFrame

NOTE: The dataset contains some genres that have __sensitive / potentiallly sensitive content__.  I believe that AI/ML should be used in an __ethical manner__.

Therefore, __remove__ the genres that contain sensitive / possibly sensitive content. 

In [7]:
# One-Hot Encode
for genre in genres:
    item_train[genre] = item_train['Genres'].apply(lambda x: 1 if genre in x else 0) #lambda to define short function

# Remove original genre label, drop sensitive content
item_train = item_train.drop(columns=["Genres", "Ecchi", "Hentai", "Erotica"])
item_train.head()

Unnamed: 0,anime_id,Name,Score,Episodes,Scored By,Action,Award Winning,Sci-Fi,Adventure,Drama,...,Comedy,Romance,Slice of Life,Suspense,Gourmet,Avant Garde,Horror,Girls Love,Boys Love,UNKNOWN
0,1,Cowboy Bebop,8.75,26.0,914193.0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5,Cowboy Bebop: Tengoku no Tobira,8.38,1.0,206248.0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,6,Trigun,8.22,26.0,356739.0,1,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
3,7,Witch Hunter Robin,7.25,26.0,42829.0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,8,Bouken Ou Beet,6.94,52.0,6413.0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Remove sensitive content from genre list
genres = genres[~np.isin(genres, ["Ecchi", "Hentai", "Erotica"])]
print(genres)


['Action' 'Award Winning' 'Sci-Fi' 'Adventure' 'Drama' 'Mystery'
 'Supernatural' 'Fantasy' 'Sports' 'Comedy' 'Romance' 'Slice of Life'
 'Suspense' 'Gourmet' 'Avant Garde' 'Horror' 'Girls Love' 'Boys Love'
 'UNKNOWN']


In [9]:
# Remove Name from item_train dataset because we don't want to feed that in the NN, just there to see the initial df
item_train = item_train.drop(columns=["Name"])

### 2d. Replace all 'UNKNOWN' values in `Scored By` and `Episodes` Columns
In addition, convert any numeric values read by the code as a String to be numeric so it can be computed.

In [10]:
contains_unknown_scoredby = item_train['Scored By'].str.contains('UNKNOWN')
contains_unknown_episodes = item_train['Episodes'].str.contains('UNKNOWN')
contains_unknown_score = item_train['Score'].str.contains('UNKNOWN')

print(contains_unknown_scoredby)
print(contains_unknown_episodes)
print(contains_unknown_score)

0        False
1        False
2        False
3        False
4        False
         ...  
24900     True
24901     True
24902     True
24903     True
24904     True
Name: Scored By, Length: 24905, dtype: bool
0        False
1        False
2        False
3        False
4        False
         ...  
24900    False
24901    False
24902    False
24903    False
24904    False
Name: Episodes, Length: 24905, dtype: bool
0        False
1        False
2        False
3        False
4        False
         ...  
24900     True
24901     True
24902     True
24903     True
24904     True
Name: Score, Length: 24905, dtype: bool


In [11]:
# Replace `UNKNOWN` across item DataFrame
item_train = item_train.replace('UNKNOWN', 0)
item_train = item_train.replace('NaN', 0)

In [12]:
# Filter for string values
string_values = item_train['Scored By'].dropna().apply(lambda x: x if isinstance(x, str) else None)

# Drop the None values
string_values = string_values.dropna()

print(string_values)

0        914193.0
1        206248.0
2        356739.0
3         42829.0
4          6413.0
           ...   
24590       129.0
24635       152.0
24729       114.0
24831       320.0
24856       194.0
Name: Scored By, Length: 15692, dtype: object


In [13]:
# Convert 'Scored By' and 'Episode' to numeric, handling non-numeric values
item_train['Scored By'] = pd.to_numeric(item_train['Scored By'], errors='coerce')
item_train['Episodes'] = pd.to_numeric(item_train['Episodes'], errors='coerce')
item_train['Score'] = pd.to_numeric(item_train['Score'], errors='coerce')

# Filter for string values
string_values = item_train['Scored By'].dropna().apply(lambda x: x if isinstance(x, str) else None)

# Drop the None values
string_values = string_values.dropna()

print(item_train.isnull().sum())
print(string_values)

anime_id         0
Score            0
Episodes         0
Scored By        0
Action           0
Award Winning    0
Sci-Fi           0
Adventure        0
Drama            0
Mystery          0
Supernatural     0
Fantasy          0
Sports           0
Comedy           0
Romance          0
Slice of Life    0
Suspense         0
Gourmet          0
Avant Garde      0
Horror           0
Girls Love       0
Boys Love        0
UNKNOWN          0
dtype: int64
Series([], Name: Scored By, dtype: object)


We'll also load a y_train (true values) from the user's rating to train the neural network with. This allows us to compute losses (MSE).

In [14]:
print(item_train.shape)
y_train = item_train['Score']
print(y_train.shape)

(24905, 23)
(24905,)


## Repeat Step 2's process for the user_train

NOTE: I separated these two loading processes because user_train data is significantly larger than anime_data

In [15]:
user_train = pd.read_csv("./data/users-score-2023.csv")
user_original = user_train #Store the original dataset
user_train.head()

Unnamed: 0,user_id,Username,anime_id,Anime Title,rating
0,1,Xinil,21,One Piece,9
1,1,Xinil,48,.hack//Sign,7
2,1,Xinil,320,A Kite,5
3,1,Xinil,49,Aa! Megami-sama!,8
4,1,Xinil,304,Aa! Megami-sama! Movie,8


Once again, drop unnecessary features / qualitative descriptions from the dataset for user_train:

In [16]:
user_train = user_train.drop(columns=["Username", "Anime Title"])
user_train.head()

Unnamed: 0,user_id,anime_id,rating
0,1,21,9
1,1,48,7
2,1,320,5
3,1,49,8
4,1,304,8


In [17]:
# Replace UNKNOWN values in the user_train to avoid numeric exceptions
user_train = user_train.replace('UNKNOWN', 0)
y_train = y_train.replace('UNKNOWN', 0)

## 4. Comparison of One-Hot Encoded Genres vs. User Preference Rating (Weighted Average)

What we use to compare the two vectors is a weighted average of the user which will give a more accurate prediction of A's genre preference and the one-hot encoded genres. This allows us to make a prediction on how each value of the user's preference vector aligns/compares to the one-hot encoded, or genres, that exist in the anime.

### 4a. Preparation of Dataset:

#### Step 1: Apply Bayesian Average to item_train (Animes) and then Merge Datasets on anime_id
Implementing a Bayesian average of the user ratings can help reflect ratings more accurately in genre preferences.

What does a Bayesian average compute?

- Bayesian average is good for computing a kind of weighted average for ranking data based on a confidence/reliability factor
- For example, if a originally unrated item immediately gets a positive rating of 5 stars, you don't want to make it shoot up in value, as that is inaccurate since there is only 1 rating

In [18]:
# Step 1: Apply Bayesian Average
item_train = Bayesian_Rating(item_train)

In [19]:
# Step 1: Merge datasets on anime_id
merged_df = pd.merge(user_train, item_train, on="anime_id")
merged_df.head()

Unnamed: 0,user_id,anime_id,rating,Score,Episodes,Scored By,Action,Award Winning,Sci-Fi,Adventure,...,Romance,Slice of Life,Suspense,Gourmet,Avant Garde,Horror,Girls Love,Boys Love,UNKNOWN,Bayesian Rating
0,1,21,9,8.69,0.0,1226493.0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,8.619186
1,1,48,7,6.95,26.0,83009.0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,6.40701
2,1,320,5,6.54,2.0,33411.0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,5.630107
3,1,49,8,7.29,5.0,26400.0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,5.926462
4,1,304,8,7.54,1.0,22479.0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,5.93307


#### Step 2: Calculate Weighted Genre Scores for Each User

In [20]:
# Get the list of genre columns using `genre`
# First, mulitply each genre value by the user's rating for that anime, to give weight to the genres of that anime
for genre_column in genres:
    merged_df[genre_column] *= merged_df['Bayesian Rating']

merged_df.head()

Unnamed: 0,user_id,anime_id,rating,Score,Episodes,Scored By,Action,Award Winning,Sci-Fi,Adventure,...,Romance,Slice of Life,Suspense,Gourmet,Avant Garde,Horror,Girls Love,Boys Love,UNKNOWN,Bayesian Rating
0,1,21,9,8.69,0.0,1226493.0,8.619186,0.0,0.0,8.619186,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.619186
1,1,48,7,6.95,26.0,83009.0,0.0,0.0,0.0,6.40701,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.40701
2,1,320,5,6.54,2.0,33411.0,5.630107,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.630107
3,1,49,8,7.29,5.0,26400.0,0.0,0.0,0.0,0.0,...,5.926462,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.926462
4,1,304,8,7.54,1.0,22479.0,0.0,0.0,0.0,0.0,...,5.93307,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.93307


#### Step 3: Apply Weighted Average with Bayesian Ratings

In [21]:
user_genre_scores = weighted_average(merged_df, genres) # Get the user_genre_scores

#### Step 4: Append Average Genre Scores to User Dataset

In [22]:
# Append the current user_items with the new inferred genre scores, without duplicating
user_train = pd.merge(user_train[['user_id']].drop_duplicates(), user_genre_scores, on='user_id') 

# Ordered list
user_train = user_train.reset_index(drop=True)
print(user_train.shape)
print(y_train.shape)
user_train.head()

(270033, 20)
(24905,)


Unnamed: 0,user_id,Action,Award Winning,Sci-Fi,Adventure,Drama,Mystery,Supernatural,Fantasy,Sports,Comedy,Romance,Slice of Life,Suspense,Gourmet,Avant Garde,Horror,Girls Love,Boys Love,UNKNOWN
0,1,0.423087,0.146563,0.264716,0.247974,0.346414,0.088505,0.137129,0.213116,0.070307,0.373709,0.365949,0.049828,0.050717,0.003535,0.022778,0.059515,0.002438,0.0,0.0
1,4,0.493301,0.041293,0.192259,0.167162,0.220241,0.142479,0.227153,0.279958,0.046194,0.281855,0.274672,0.032381,0.031466,0.010046,0.0,0.060575,0.0,0.011146,0.014115
2,9,0.533885,0.092524,0.297905,0.261819,0.369363,0.032676,0.180937,0.239785,0.032358,0.485288,0.446481,0.011507,0.019832,0.0,0.019832,0.078537,0.01373,0.0,0.0
3,20,0.587981,0.208474,0.200164,0.443596,0.251733,0.107298,0.136772,0.397092,0.069755,0.202925,0.134392,0.028708,0.09138,0.008268,0.009782,0.05548,0.0,0.0,0.0
4,23,0.725014,0.056231,0.279638,0.258751,0.192413,0.109075,0.123981,0.339428,0.095523,0.245119,0.133775,0.007553,0.041426,0.004171,0.00427,0.048904,0.002968,0.0,0.003796


### Step 5: Scaling of Shapes
An issue that we currently have in our dataset is that for the model to be trained on the same y_train, all trains require the same amount of examples -- this allows us to initialize parameters and spread into minibatches. Right now, m of item_train = m of y_train, but now we need it to also = m of user_train.

In [23]:
user_train = user_train[:y_train.shape[0]]
print(user_train.shape)
user_train.head()

(24905, 20)


Unnamed: 0,user_id,Action,Award Winning,Sci-Fi,Adventure,Drama,Mystery,Supernatural,Fantasy,Sports,Comedy,Romance,Slice of Life,Suspense,Gourmet,Avant Garde,Horror,Girls Love,Boys Love,UNKNOWN
0,1,0.423087,0.146563,0.264716,0.247974,0.346414,0.088505,0.137129,0.213116,0.070307,0.373709,0.365949,0.049828,0.050717,0.003535,0.022778,0.059515,0.002438,0.0,0.0
1,4,0.493301,0.041293,0.192259,0.167162,0.220241,0.142479,0.227153,0.279958,0.046194,0.281855,0.274672,0.032381,0.031466,0.010046,0.0,0.060575,0.0,0.011146,0.014115
2,9,0.533885,0.092524,0.297905,0.261819,0.369363,0.032676,0.180937,0.239785,0.032358,0.485288,0.446481,0.011507,0.019832,0.0,0.019832,0.078537,0.01373,0.0,0.0
3,20,0.587981,0.208474,0.200164,0.443596,0.251733,0.107298,0.136772,0.397092,0.069755,0.202925,0.134392,0.028708,0.09138,0.008268,0.009782,0.05548,0.0,0.0,0.0
4,23,0.725014,0.056231,0.279638,0.258751,0.192413,0.109075,0.123981,0.339428,0.095523,0.245119,0.133775,0.007553,0.041426,0.004171,0.00427,0.048904,0.002968,0.0,0.003796


#### Step 6: Normalization of `item_train`, `user_train`, and `y_train` Values
The item_train and user_train will be scaled using a self-implemented z-score normalization.

The y_train will be scaled using a self-implemented MinMaxScaler.

In [24]:
# What's inside item train that's making values so big???
item_train.head()

Unnamed: 0,anime_id,Score,Episodes,Scored By,Action,Award Winning,Sci-Fi,Adventure,Drama,Mystery,...,Romance,Slice of Life,Suspense,Gourmet,Avant Garde,Horror,Girls Love,Boys Love,UNKNOWN,Bayesian Rating
0,1,8.75,26.0,914193.0,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,8.654271
1,5,8.38,1.0,206248.0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,8.014284
2,6,8.22,26.0,356739.0,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,8.00885
3,7,7.25,26.0,42829.0,1,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,6.261687
4,8,6.94,52.0,6413.0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,4.760508


In [25]:
item_train = np_zscore_normalization(item_train, genres)
item_train.head()

Unnamed: 0,anime_id,Score,Episodes,Scored By,Action,Award Winning,Sci-Fi,Adventure,Drama,Mystery,...,Romance,Slice of Life,Suspense,Gourmet,Avant Garde,Horror,Girls Love,Boys Love,UNKNOWN,Bayesian Rating
0,1,8.75,26.0,914193.0,2.309401,2.309401,2.309401,-0.433013,-0.433013,-0.433013,...,-0.433013,-0.433013,-0.433013,-0.433013,-0.433013,-0.433013,-0.433013,-0.433013,-0.433013,8.654271
1,5,8.38,1.0,206248.0,2.915476,-0.342997,2.915476,-0.342997,-0.342997,-0.342997,...,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,8.014284
2,6,8.22,26.0,356739.0,2.309401,-0.433013,2.309401,2.309401,-0.433013,-0.433013,...,-0.433013,-0.433013,-0.433013,-0.433013,-0.433013,-0.433013,-0.433013,-0.433013,-0.433013,8.00885
3,7,7.25,26.0,42829.0,1.936492,-0.516398,-0.516398,-0.516398,1.936492,1.936492,...,-0.516398,-0.516398,-0.516398,-0.516398,-0.516398,-0.516398,-0.516398,-0.516398,-0.516398,6.261687
4,8,6.94,52.0,6413.0,-0.433013,-0.433013,-0.433013,2.309401,-0.433013,-0.433013,...,-0.433013,-0.433013,-0.433013,-0.433013,-0.433013,-0.433013,-0.433013,-0.433013,-0.433013,4.760508


In [26]:
user_train = np_zscore_normalization(user_train, genres)
user_train.head()

Unnamed: 0,user_id,Action,Award Winning,Sci-Fi,Adventure,Drama,Mystery,Supernatural,Fantasy,Sports,Comedy,Romance,Slice of Life,Suspense,Gourmet,Avant Garde,Horror,Girls Love,Boys Love,UNKNOWN
0,1,1.926569,-0.030386,0.805778,0.6873,1.383957,-0.441262,-0.09715,0.440609,-0.570053,1.577122,1.522206,-0.714983,-0.70869,-1.042593,-0.906415,-0.646427,-1.050358,-1.067612,-1.067612
1,4,2.709723,-0.689349,0.445903,0.257176,0.656324,0.07156,0.708301,1.105396,-0.652499,1.119662,1.065644,-0.75637,-0.763248,-0.924331,-0.999873,-0.544355,-0.999873,-0.916058,-0.893731
2,9,2.063147,-0.39884,0.746807,0.545515,1.145415,-0.732683,0.094341,0.422605,-0.734453,1.79206,1.575593,-0.850764,-0.804329,-0.914953,-0.804329,-0.476861,-0.838365,-0.914953,-0.914953
3,20,2.670545,0.332999,0.281816,1.781219,0.599451,-0.290187,-0.108644,1.494782,-0.521432,0.298821,-0.123303,-0.774258,-0.388232,-0.900152,-0.89083,-0.609357,-0.95108,-0.95108,-0.95108
4,23,3.376858,-0.487722,0.803243,0.682545,0.299208,-0.18236,-0.096228,1.148742,-0.260673,0.603771,-0.039635,-0.769007,-0.573273,-0.788551,-0.787981,-0.530059,-0.795507,-0.812655,-0.790718


In [27]:
# Min-Max Scaler
y_train_arr = y_train.to_numpy()
y_train_arr = y_train_arr[:, np.newaxis] #Make into 2D array
y_train_arr = MinMaxScaler(y_train_arr, -1, 1)
print(y_train_arr.shape)

(24905, 1)


Convert `item_train` and `user_train` into np arrays:

In [28]:
# Ensure we drop all unneeded values in data before training
pd.options.display.max_columns = None
item_train = item_train.drop(columns=["anime_id", "Score", "Episodes", "Scored By", "Bayesian Rating"])
print(item_train.shape)
item_train.head()

(24905, 19)


Unnamed: 0,Action,Award Winning,Sci-Fi,Adventure,Drama,Mystery,Supernatural,Fantasy,Sports,Comedy,Romance,Slice of Life,Suspense,Gourmet,Avant Garde,Horror,Girls Love,Boys Love,UNKNOWN
0,2.309401,2.309401,2.309401,-0.433013,-0.433013,-0.433013,-0.433013,-0.433013,-0.433013,-0.433013,-0.433013,-0.433013,-0.433013,-0.433013,-0.433013,-0.433013,-0.433013,-0.433013,-0.433013
1,2.915476,-0.342997,2.915476,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997
2,2.309401,-0.433013,2.309401,2.309401,-0.433013,-0.433013,-0.433013,-0.433013,-0.433013,-0.433013,-0.433013,-0.433013,-0.433013,-0.433013,-0.433013,-0.433013,-0.433013,-0.433013,-0.433013
3,1.936492,-0.516398,-0.516398,-0.516398,1.936492,1.936492,1.936492,-0.516398,-0.516398,-0.516398,-0.516398,-0.516398,-0.516398,-0.516398,-0.516398,-0.516398,-0.516398,-0.516398,-0.516398
4,-0.433013,-0.433013,-0.433013,2.309401,-0.433013,-0.433013,2.309401,2.309401,-0.433013,-0.433013,-0.433013,-0.433013,-0.433013,-0.433013,-0.433013,-0.433013,-0.433013,-0.433013,-0.433013


In [29]:
user_train = user_train.drop(columns=["user_id"])
print(user_train.shape)
user_train.head()

(24905, 19)


Unnamed: 0,Action,Award Winning,Sci-Fi,Adventure,Drama,Mystery,Supernatural,Fantasy,Sports,Comedy,Romance,Slice of Life,Suspense,Gourmet,Avant Garde,Horror,Girls Love,Boys Love,UNKNOWN
0,1.926569,-0.030386,0.805778,0.6873,1.383957,-0.441262,-0.09715,0.440609,-0.570053,1.577122,1.522206,-0.714983,-0.70869,-1.042593,-0.906415,-0.646427,-1.050358,-1.067612,-1.067612
1,2.709723,-0.689349,0.445903,0.257176,0.656324,0.07156,0.708301,1.105396,-0.652499,1.119662,1.065644,-0.75637,-0.763248,-0.924331,-0.999873,-0.544355,-0.999873,-0.916058,-0.893731
2,2.063147,-0.39884,0.746807,0.545515,1.145415,-0.732683,0.094341,0.422605,-0.734453,1.79206,1.575593,-0.850764,-0.804329,-0.914953,-0.804329,-0.476861,-0.838365,-0.914953,-0.914953
3,2.670545,0.332999,0.281816,1.781219,0.599451,-0.290187,-0.108644,1.494782,-0.521432,0.298821,-0.123303,-0.774258,-0.388232,-0.900152,-0.89083,-0.609357,-0.95108,-0.95108,-0.95108
4,3.376858,-0.487722,0.803243,0.682545,0.299208,-0.18236,-0.096228,1.148742,-0.260673,0.603771,-0.039635,-0.769007,-0.573273,-0.788551,-0.787981,-0.530059,-0.795507,-0.812655,-0.790718


In [30]:
item_train_arr = item_train.to_numpy()
print(item_train_arr)

[[ 2.30940108  2.30940108  2.30940108 ... -0.4330127  -0.4330127
  -0.4330127 ]
 [ 2.91547595 -0.34299717  2.91547595 ... -0.34299717 -0.34299717
  -0.34299717]
 [ 2.30940108 -0.4330127   2.30940108 ... -0.4330127  -0.4330127
  -0.4330127 ]
 ...
 [ 1.93649167 -0.51639778  1.93649167 ... -0.51639778 -0.51639778
  -0.51639778]
 [-0.23570226 -0.23570226 -0.23570226 ... -0.23570226 -0.23570226
   4.24264069]
 [-0.23570226 -0.23570226 -0.23570226 ... -0.23570226 -0.23570226
   4.24264069]]


In [31]:
user_train_arr = user_train.to_numpy()
print(user_train_arr)

[[ 1.92656912 -0.03038588  0.80577761 ... -1.05035833 -1.06761183
  -1.06761183]
 [ 2.7097229  -0.68934902  0.44590297 ... -0.99987349 -0.91605804
  -0.89373091]
 [ 2.06314672 -0.39884008  0.74680702 ... -0.83836508 -0.91495303
  -0.91495303]
 ...
 [ 2.28294445 -0.50613473  0.42389465 ... -0.80834486 -0.85640667
  -0.85640667]
 [ 1.67457119  0.13916972  1.54116107 ... -1.00236376 -1.00236376
  -1.00236376]
 [ 3.00192563  0.22584842  1.17296254 ... -0.77402595 -0.77402595
  -0.77402595]]


#### Part 6: Splitting into Train and Test Sets
Implemented using a self-made train_test_split function:

In [32]:
# Initial split into train (80%) and test (20%) sets
item_train_arr, item_test_arr = train_test_split(item_train_arr, train_size=0.80, random_state=1, shuffle=True)
user_train_arr, user_test_arr = train_test_split(user_train_arr, train_size=0.80, random_state=1, shuffle=True)
y_train_arr, y_test_arr       = train_test_split(y_train_arr,    train_size=0.80, random_state=1, shuffle=True)

# Split again for cv-sets (10%) and test (10%) sets
item_test_arr, item_cv_arr = train_test_split(item_test_arr, train_size=0.50, random_state=1, shuffle=True)
user_test_arr, user_cv_arr = train_test_split(user_test_arr, train_size=0.50, random_state=1, shuffle=True)
y_test_arr, y_cv_arr = train_test_split(y_test_arr, train_size=0.50, random_state=1, shuffle=True)

print(f"Item training data shape: {item_train_arr.shape}")
print(f"Item test data shape: {item_test_arr.shape}")
print(f"Item cv data shape: {item_cv_arr.shape}")

Item training data shape: (19924, 19)
Item test data shape: (2490, 19)
Item cv data shape: (2491, 19)


## 5. Building a Deep Learning Neural Network to Compute User and Item Vectors

Here, two neural networks based on the user and the item will be used to compute the properties of the user and the item. Then, by taking the dot product of these two vectors, a comparison/prediction can be made based on these two values.

The Neural Network implemented will be self made.

<figure>
    <center> <img src="./images/RecSysNN.png"   style="width:500px;height:280px;" ></center>
</figure>

In [33]:
# Checking shapes
print("Item: ", item_train_arr.shape)
print("User: ", user_train_arr.shape)

Item:  (19924, 19)
User:  (19924, 19)


In [34]:
# Get the Sequential calculation class
from Sequential import *
sequential = Sequential()

# Get Adam initialization from model
from model import init_Adam

#### Define the Model Function
The model will attempt to implement Adam optimization (inspired by Andrew Ng's DL Course) to try to speed up computational time.

The hyperparameters of choice will be:

- beta1 = 0.9 --> dW
- beta2 = 0.999 --> dW^2
- epsilon = 1 * 10^-8

During testing with mini-batch regular gradient descent, training one epoch took ~2 mins.

<figure>
    <center> <img src="./images/Adam.png"   style="width:500px;height:280px;" ></center>
</figure>

In addition, we will introduce L2 Normalization onto the parameters. Then, a dot product of the two vectors will provide a value that points to the similarity between the two values.

<figure>
    <center> <img src="./images/L2Norm.png"   style="width:500px;height:280px;" ></center>
</figure>


In [35]:
# Model training imports
from model import *
from matplotlib import pyplot as plt

In [36]:
# @Deprecated: Original model for training only a single model, not RecSys
%%skip

def model(name, X, Y, layer_dimensions, learning_rate = 0.01, mini_batch_size = 128, beta1 = 0.9, beta2 = 0.999, epsilon = 1e-8, epochs = 50, print_cost = True, decay = None, decay_rate = 0.6):
    L = len(layer_dimensions) #Number of layers in the neural network
    loss_history = []
    t = 0. #Counter for Adam update    
    seed = 10      
    m = X.shape[0] #Number of training exs
    learning_rate_initial = learning_rate #Initialize the original learning rate

    # Initialize the parameters. layer_dimensions control the input/output of the model
    params = init_params(layer_dimensions)

    # Initialize the cache (as empty dict)
    cache = dict()

    # Initialize Adam optimizer (v, s dicts)
    v, s = init_Adam(params)
    
    # Optimization
    for i in range(epochs):
        
        # Define the minibatches
        seed += 1 #increment the seed so reshuffle different minibatches each time. Therefore different datasets to work with
        minibatches = generate_minibatches(X, Y, mini_batch_size, seed)
        total_loss = 0.

        # Iterate the optimization process for every minibatch
        for minibatch in minibatches:

            # Get the X and Y's from the minibatch -- remember mini-batch repesent shuffled X, Y'set
            (X_minibatch, Y_minibatch) = minibatch

            # Sequential (calculation)
                # X: np.ndarray, Y: np.ndarray, params: dict, 
                # v: dict, s: dict, t: int, learning_rate = None, cache = None,
                # beta1 = 0.9, beta2 = 0.999,  epsilon = 1e-8

            a5, cache, loss, updated_params, updated_v, updated_s, updated_t = sequential.Sequential(X=X_minibatch, Y=Y_minibatch, params=params, v=v, s=s, t=t, learning_rate=learning_rate, cache=cache,
            beta1=beta1, beta2=beta2, epsilon=epsilon)

            # Adjust new parameters
            v = updated_v
            s = updated_s
            t = updated_t
            total_loss += loss
            params = updated_params

        avg_loss = total_loss / (m / mini_batch_size)

        # Apply the decay function to adjust the learning_rate
        if decay:
            learning_rate = decay(learning_rate_initial, i, decay_rate)

        # Print the cost every epoch
        if print_cost is True and (i % 5 == 0 or i == 0):
            print(name+" loss at epoch %i: %f" %(i, avg_loss))
            if decay:
                print(f'lr at epoch {i}: {learning_rate}')
        if print_cost is True and (i % 5 == 0 or i == 0):
            loss_history.append(avg_loss)

    # plot the cost
    plt.plot(loss_history)
    plt.ylabel('Loss')
    plt.xlabel('Epochs')
    plt.title("Learning rate = " + str(learning_rate))
    plt.show()
    
    # Return the final optimized parameters and output layers for prediction
    return a5, params, loss_history

UsageError: Line magic function `%%skip` not found.


In [37]:
#Define the full model for Adam, basically our tf model.fit function
def RecSys_model_Adam(X_user, X_item, Y, X_user_cv, X_item_cv, Y_cv, layer_dims_user, layer_dims_item, learning_rate = 0.01, mini_batch_size = 128, beta1 = 0.9, beta2 = 0.999, epsilon = 1e-8, epochs = 5000, print_cost = True, decay = None, decay_rate = 1.0):
    L = len(layer_dims_user) #Number of layers in the neural network
    train_loss_history = []
    cv_loss_history = []
    t = 0. #Counter for Adam update    
    seed = 10      
    m = X_user.shape[0] #Number of training exs. X_user = X_item
    learning_rate_initial = learning_rate #Initialize the original learning rate (for decay)

    # Initialize the parameters. layer_dimensions control the input/output of the model
    params_u = init_params(layer_dims_user)
    params_i = init_params(layer_dims_item)

    # Initialize Adam optimizer (v, s dicts)
    v_u, s_u = init_Adam(params_u)
    v_i, s_i = init_Adam(params_i)

    # Optimization
    for i in range(epochs):
        
        # Define the minibatches
        seed += 1 #increment the seed so reshuffle different minibatches each time. Therefore different datasets to work with
        minibatches = create_minibatches(X_user, X_item, Y, mini_batch_size, seed)
        train_loss = 0.
        cv_loss = 0.

        # Iterate the optimization process for every minibatch
        for minibatch in minibatches:

            # Get the X and Y's from the minibatch -- remember mini-batch repesent shuffled X, Y'set
            (X_minibatch_user, X_minibatch_item, Y_minibatch) = minibatch

            # Sequential (calculation)
                # X_u: np.ndarray, X_i: np.ndarray, Y: np.ndarray, params_u: dict, params_i: dict, v_u: dict, v_i: dict, 
                # s_u: dict, s_i: dict, t: float, learning_rate: float, cache_i = dict(), cache_u = dict(),
                # beta1 = 0.9, beta2 = 0.999, epsilon = 1e-8

            params_u, v_u, s_u, params_i, v_i, s_i, y_pred, loss, t = sequential.RecSys_Sequential(
            X_minibatch_user, X_minibatch_item, Y_minibatch, params_u, params_i, v_u, v_i, s_u, s_i, t, learning_rate)

            # Adjust new parameters
            train_loss += loss

            # Find CV Loss
            cv_loss = ValidationLoss(X_user_cv, X_item_cv, params_u, params_i, Y_cv, cv_loss)

        #Compute the avg loss over the entire training & cv sets
        avg_train_loss = train_loss / (m) 
        avg_cv_loss = cv_loss / (X_user_cv.shape[0])

        # Apply the decay function to adjust the learning_rate
        if decay:
            learning_rate = decay(learning_rate_initial, i, decay_rate)

        # Print the cost every epoch
        if print_cost is True:
            print(f'Losses at epoch {i}: Train Loss: {avg_train_loss} | CV Loss: {avg_cv_loss}')
            if decay:
                print(f'lr at epoch {i}: {learning_rate}')
            print()
        if print_cost is True:
            train_loss_history.append(avg_train_loss)
            cv_loss_history.append(avg_cv_loss)

    # plot the cost
    plt.plot(train_loss_history, color='blue', label="Training Loss")
    plt.plot(cv_loss_history, color="red", label='Validation Loss')
    plt.ylabel('Loss')
    plt.xlabel('Epochs')
    plt.title(f'Loss Histories After {epochs} Epochs')
    plt.legend()
    plt.show()
    
    # Return the final optimized parameters and output layers for prediction
    return y_pred, params_u, params_i, train_loss_history, cv_loss_history


### Define the layer dimensions for the user and item
Good rule of thumb for picking a neural network architecture:

From: https://stats.stackexchange.com/questions/181/how-to-choose-the-number-of-hidden-layers-and-nodes-in-a-feedforward-neural-netw
𝑁𝑖 = number of input neurons (features).

𝑁𝑜 = number of output neurons.

𝑁𝑠 = number of samples in training data set.

𝛼 = an arbitrary scaling factor usually 2-10. Recommended 2 to prevent overfitting issues.

Some other rules of thumb include:

The number of hidden neurons should be between the size of the input layer and the size of the output layer.
The number of hidden neurons should be 2/3 the size of the input layer, plus the size of the output layer.
The number of hidden neurons should be less than twice the size of the input layer.
In addition, we will apply a scheduled learning rate decay function to optimize learning. During the first few steps of gradient descent, the model is able to get away with large initial steps & learning rates. Therefore, having learning rate decay to allow us to control a decrease of the lr gives us a way to speed up initial learning epochs while not overshooting in later epochs.

Chosen NN Architecture: [input, 256, 128, 64, 32]

Chosen Epochs = 50

In [38]:
# Specify the layer dimensions (i.e. num of inputs + num of units per layer)
layer_dims_user = [user_train_arr.shape[1], 256, 128, 64, 32]
layer_dims_item = [item_train_arr.shape[1], 256, 128, 64, 32]

In [39]:
# Trying mini-batch Adam, don't clear the output

# Model training for Adam
prediction, user_params, item_params, loss_history = RecSys_model_Adam(user_train_arr, item_train_arr, y_train_arr, user_cv_arr, item_cv_arr, y_cv_arr, layer_dims_user, layer_dims_item, mini_batch_size=64, learning_rate=0.01, epochs=300, decay=schedule_lr_decay, decay_rate=1.0)

Losses at epoch 0: Train Loss: [nan] | CV Loss: [nan]
lr at epoch 0: 0.01

Losses at epoch 1: Train Loss: [nan] | CV Loss: [nan]
lr at epoch 1: 0.009990009990009992

Losses at epoch 2: Train Loss: [nan] | CV Loss: [nan]
lr at epoch 2: 0.00998003992015968



KeyboardInterrupt: 

In [None]:
print(np.argmin(loss_history))
print(np.min(loss_history))

#### Trying the Model with Mini-Batch GD (Not Adam)
Using standard gradient descent algorithm to see if we get better results on accuracy and lower loss.

The issue with the current Adam implementation is that loss goes down and then comes back up and oscillates with more epochs. This might be because there isn't enough variance to make Adam work effectively in our dataset.

In [40]:
#Define the full model, basically our tf model.fit function
from model import *
from framework import l2_normalize
from matplotlib import pyplot as plt

def RecSys_model_gd(X_user, X_item, Y, layer_dims_user, layer_dims_item, learning_rate = 0.001, mini_batch_size = 128, beta1 = 0.9, beta2 = 0.999, epsilon = 1e-8, epochs = 5000, print_cost = True, decay = None, decay_rate = 0.8):
    L = len(layer_dims_user) #Number of layers in the neural network
    loss_history = [] 
    seed = 10      
    m = X_user.shape[0] #Number of training exs. X_user = X_item
    learning_rate_initial = learning_rate #Initialize the original learning rate (for decay)

    # Initialize the parameters. layer_dimensions control the input/output of the model
    params_u = init_params(layer_dims_user)
    params_i = init_params(layer_dims_item)
    
    # Optimization
    for i in range(epochs):
        # Define the minibatches
        seed += 1 #increment the seed so reshuffle different minibatches each time. Therefore different datasets to work with
        minibatches = create_minibatches(X_user, X_item, Y, mini_batch_size, seed)
        total_loss = 0.
        total_mae = 0.

        # Iterate the optimization process for every minibatch
        for minibatch in minibatches:

            # Get the X and Y's from the minibatch -- remember mini-batch repesent shuffled X, Y'set
            (X_minibatch_user, X_minibatch_item, Y_minibatch) = minibatch

            # Refresh ForwardProp caches on every iteration
            cache_u = dict()
            cache_i = dict()
                
            # 1a. Forward prop with user 
            a1_u, cache_u = ForwardProp(X_minibatch_user, params_u, relu, 1, cache_u) #cache_u empty on the 1st layer
            a2_u, cache_u = ForwardProp(a1_u, params_u, relu, 2, cache_u) 
            a3_u, cache_u = ForwardProp(a2_u, params_u, relu, 3, cache_u)
            a4_u, cache_u = ForwardProp(a3_u, params_u, linear, 4, cache_u)

            # 1b. Forward prop with items
            a1_i, cache_i = ForwardProp(X_minibatch_item, params_i, relu, 1, cache_i) #cache_i empty on the 1st layer
            a2_i, cache_i = ForwardProp(a1_i, params_i, relu, 2, cache_i) 
            a3_i, cache_i = ForwardProp(a2_i, params_i, relu, 3, cache_i)
            a4_i, cache_i = ForwardProp(a3_i, params_i, linear, 4, cache_i)

            # 1c. Transpose back output layers
            a4_u = a4_u.T
            a4_i = a4_i.T

            # print(f'a4_u shape: {a4_u.shape}, a4_i shape: {a4_i.shape}')

            # 2. L2 Normalization of vectors
            a4_u = l2_normalize(vector=a4_u, axis=1)
            a4_i = l2_normalize(vector=a4_i, axis=1)

            # 3. Current prediction (dot product):
                # Initialize an empty array to store the dot product predictions
            y_pred = np.zeros((a4_u.shape[0], 1))

            # Compute dot product predictions into y_pred
            for j in range(a4_u.shape[0]):
                y_pred[j] = np.dot(a4_u[j], a4_i[j])

            # print(y_pred.shape)
            # y_pred = np.mean(y_pred, axis=1)
            # print(y_pred)

            # print(f'a4_u shape: {a4_u.shape}, a4_i shape: {a4_i.shape}, y_pred shape: {y_pred.shape}')

            # 4. Using the result of the NN (a3) to compute loss with MSE:
            total_loss += MSECost(y_pred.T, Y_minibatch)

            # 5. Apply backprop to find derivative/gradients
            gradients_u = Backprop(X_minibatch_user, Y_minibatch, cache_u)
            gradients_i = Backprop(X_minibatch_item, Y_minibatch, cache_i)

            # print(f"db3: {gradients_u['db3']}")
            # print(f"db4: {gradients_u['db4']}")

            # 6. Update parameters using standard GD for user and item
            # print(f"Before update - b3: {params_u['b3']}")
            # print(f"Before update - b4: {params_u['b4']}")

            params_u = gradient_descent(params=params_u, gradients=gradients_u, learning_rate=learning_rate, num_layers=4)
            params_i = gradient_descent(params=params_i, gradients=gradients_i, learning_rate=learning_rate, num_layers=4)

            # print(f"After update - b3: {params_u['b3']}")
            # print(f"After update - b4: {params_u['b4']}")

            # Calculate mean absolute error for accuracy
            total_mae += np.mean(np.abs(y_pred - Y_minibatch)) #Accuracy

        avg_loss = total_loss / m
        avg_mae = total_mae / m
        accuracy = (1 - (avg_mae / (np.max(Y_minibatch) - np.min(Y_minibatch))))*100 # Get the % accuracy from MAE

        # Apply the decay function to adjust the learning_rate
        if decay:
            learning_rate = decay(learning_rate_initial, i, decay_rate)

        # Print the cost every epoch
        if print_cost is True:
            print(f"Loss at epoch {i}: {avg_loss}")
            print(f'Accuracy at epoch {i}: {accuracy}%')
            
            if decay:
                print(f'lr at epoch {i}: {learning_rate}')
        if print_cost is True:
            loss_history.append(avg_loss)

    # plot the cost
    plt.plot(loss_history)
    plt.ylabel('Loss')
    plt.xlabel('Epochs')
    plt.title("Learning rate = " + str(learning_rate))
    plt.show()
    
    # Return the final optimized parameters and output layers for prediction
    return y_pred, params_u, params_i, loss_history

In [42]:
trained = False #Change here

if (trained == False):
    # Model training with standard GD
    prediction_gd, user_params_gd, item_params_gd, loss_history_gd = RecSys_model_gd(user_train_arr, item_train_arr, y_train_arr, layer_dims_user, layer_dims_item, learning_rate=0.001, epochs=5000, decay=schedule_lr_decay, decay_rate=1.0)

Loss at epoch 0: [nan]
Accuracy at epoch 0: nan%
lr at epoch 0: 0.001
Loss at epoch 1: [nan]
Accuracy at epoch 1: nan%
lr at epoch 1: 0.0009990009990009992
Loss at epoch 2: [nan]
Accuracy at epoch 2: nan%
lr at epoch 2: 0.000998003992015968


KeyboardInterrupt: 

In [None]:
pass

#Further unpack results
user_output, user_trained_params, user_loss_history = user_results[0], user_results[1], user_results[2]
item_output, item_trained_params, item_loss_history = item_results[0], item_results[1], item_results[2]

#### Storing the Trained Models
The parameters of each model is stored using `pickle`. Why not just use `.txt` files to save the user_params and item_params?

- `.pkl` serializes Python objects into a byte stream that can be easily saved to a file or transmitted over a network
- `.pkl` allows serialization of almost any data type (nparray, dict, list) while `.txt` can just store text data. Therefore when reusing, data types of objects can be retained without conversions.
- `.pkl` is stored in binary format which is faster to read/write than text formats

In [43]:
%pip install pickle
import pickle

[31mERROR: Could not find a version that satisfies the requirement pickle (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for pickle[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


In [46]:
if (trained == False):
    # Store the user_model parameters
    with open('./pickle/user_model.pkl', 'wb') as file:
        pickle.dump(user_params_gd, file)

    # Store the item_model parameters
    with open('./pickle/item_model.pkl', 'wb') as file:
        pickle.dump(item_params_gd, file)

#### Loading the Models (if required)

In [None]:
trained = True

if (trained == True):
    user_params, item_params = dict(), dict()

    # Load the user_model parameters
    with open('./pickle/user_model.pkl', 'rb') as file:
        user_params = pickle.load(file)

    # Store the item_model parameters
    with open('./pickle/item_model.pkl', 'rb') as file:
        item_params = pickle.load(file)

#### Evaluating the Model's Accuracy on Test Sets

In [None]:
from model import predict, evaluate
predictions = predict(item_test_arr, user_test_arr, user_params_gd, item_params_gd)
evaluate(predictions.T, y_test_arr)

print("y_pred sample:", predictions)
print("y_test sample:", y_test_arr)
print("Max of y_test:", np.max(y_test_arr))
print("Min of y_test:", np.min(y_test_arr))

## 6. Predictions
We can make a prediction for a new user or an existing user. Since we want to recommend animes to users based on genres they like, we are most likely going to use an algorithm to recommend to new users by having them rate genres on a scale.

To find similar items from our large catalogue, a squared distance measure between items can be used, but that would be for already existing users who have data in the catalogue. This may be a future integration to add to the model.