## 1. Importing Required External & Internal Libraries

In [2]:
# Imports
import pandas as pd
import numpy as np
from numpy import genfromtxt
import csv
from collections import defaultdict

In [3]:
import sys
sys.path.append('../framework')
from framework import *

## 2. Preparing the Dataset in a Pandas DataFrame

In [4]:
# Load the anime catalogue from a path relative to the notebook's working
# directory; the CSV must be present under ./data/ for this cell to run.
item_train = pd.read_csv('./data/anime-dataset-2023.csv', delimiter=',')
# Preview the first rows to see which columns are available.
item_train.head()

Unnamed: 0,anime_id,Name,English name,Other name,Score,Genres,Synopsis,Type,Episodes,Aired,...,Studios,Source,Duration,Rating,Rank,Popularity,Favorites,Scored By,Members,Image URL
0,1,Cowboy Bebop,Cowboy Bebop,カウボーイビバップ,8.75,"Action, Award Winning, Sci-Fi","Crime is timeless. By the year 2071, humanity ...",TV,26.0,"Apr 3, 1998 to Apr 24, 1999",...,Sunrise,Original,24 min per ep,R - 17+ (violence & profanity),41.0,43,78525,914193.0,1771505,https://cdn.myanimelist.net/images/anime/4/196...
1,5,Cowboy Bebop: Tengoku no Tobira,Cowboy Bebop: The Movie,カウボーイビバップ 天国の扉,8.38,"Action, Sci-Fi","Another day, another bounty—such is the life o...",Movie,1.0,"Sep 1, 2001",...,Bones,Original,1 hr 55 min,R - 17+ (violence & profanity),189.0,602,1448,206248.0,360978,https://cdn.myanimelist.net/images/anime/1439/...
2,6,Trigun,Trigun,トライガン,8.22,"Action, Adventure, Sci-Fi","Vash the Stampede is the man with a $$60,000,0...",TV,26.0,"Apr 1, 1998 to Sep 30, 1998",...,Madhouse,Manga,24 min per ep,PG-13 - Teens 13 or older,328.0,246,15035,356739.0,727252,https://cdn.myanimelist.net/images/anime/7/203...
3,7,Witch Hunter Robin,Witch Hunter Robin,Witch Hunter ROBIN (ウイッチハンターロビン),7.25,"Action, Drama, Mystery, Supernatural",Robin Sena is a powerful craft user drafted in...,TV,26.0,"Jul 3, 2002 to Dec 25, 2002",...,Sunrise,Original,25 min per ep,PG-13 - Teens 13 or older,2764.0,1795,613,42829.0,111931,https://cdn.myanimelist.net/images/anime/10/19...
4,8,Bouken Ou Beet,Beet the Vandel Buster,冒険王ビィト,6.94,"Adventure, Fantasy, Supernatural",It is the dark century and the people are suff...,TV,52.0,"Sep 30, 2004 to Sep 29, 2005",...,Toei Animation,Manga,23 min per ep,PG - Children,4240.0,5126,14,6413.0,15001,https://cdn.myanimelist.net/images/anime/7/215...


### 2a. Drop labels that contain word values / unneeded features:

In [5]:
# Discard descriptive / textual columns and popularity metadata that are not
# used as features. Note: this rebinds item_train, so re-running the cell
# without reloading the CSV raises a KeyError (the columns are already gone).
item_train = item_train.drop(
    columns=[
        # alternate titles and free text
        'English name', 'Other name', 'Synopsis',
        # release / production metadata
        'Type', 'Aired', 'Premiered', 'Status',
        'Producers', 'Licensors', 'Studios', 'Source',
        'Duration', 'Rating',
        # popularity statistics and media
        'Rank', 'Popularity', 'Favorites', 'Members', 'Image URL',
    ]
)
item_train.head()

Unnamed: 0,anime_id,Name,Score,Genres,Episodes,Scored By
0,1,Cowboy Bebop,8.75,"Action, Award Winning, Sci-Fi",26.0,914193.0
1,5,Cowboy Bebop: Tengoku no Tobira,8.38,"Action, Sci-Fi",1.0,206248.0
2,6,Trigun,8.22,"Action, Adventure, Sci-Fi",26.0,356739.0
3,7,Witch Hunter Robin,7.25,"Action, Drama, Mystery, Supernatural",26.0,42829.0
4,8,Bouken Ou Beet,6.94,"Adventure, Fantasy, Supernatural",52.0,6413.0


### 2b. Extract Unique Genres, Compile into a List of Genres

In [6]:
# Split every ', '-separated genre string, flatten to one genre per row, and
# keep the distinct values (in order of first appearance) as a NumPy array.
genres = item_train['Genres'].str.split(', ').explode().unique()

### 2c. One-Hot Encoding for Genres into the DataFrame

NOTE: The dataset contains some genres that have __sensitive / potentially sensitive content__.  I believe that AI/ML should be used in an __ethical manner__.

Therefore, __remove__ the genres that contain sensitive / possibly sensitive content. 

In [7]:
# One-Hot Encode
# Split each row's genre string once, using the same ', ' separator that was
# used to build `genres`, so membership is tested against whole genre tokens.
# A substring test on the raw string (`genre in "Action, Sci-Fi"`) would also
# match any genre whose name happens to be contained inside another genre's
# name, and it would re-scan the full string for every genre.
genre_tokens = item_train['Genres'].str.split(', ')
for genre in genres:
    # 1 when this anime lists the genre, 0 otherwise
    item_train[genre] = genre_tokens.apply(lambda tokens: 1 if genre in tokens else 0)

# Remove original genre label, drop sensitive content
item_train = item_train.drop(columns=["Genres", "Ecchi", "Hentai", "Erotica"])
item_train.head()

Unnamed: 0,anime_id,Name,Score,Episodes,Scored By,Action,Award Winning,Sci-Fi,Adventure,Drama,...,Comedy,Romance,Slice of Life,Suspense,Gourmet,Avant Garde,Horror,Girls Love,Boys Love,UNKNOWN
0,1,Cowboy Bebop,8.75,26.0,914193.0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5,Cowboy Bebop: Tengoku no Tobira,8.38,1.0,206248.0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,6,Trigun,8.22,26.0,356739.0,1,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
3,7,Witch Hunter Robin,7.25,26.0,42829.0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,8,Bouken Ou Beet,6.94,52.0,6413.0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Keep the genre list in sync with the DataFrame: the same three sensitive
# genres whose one-hot columns were dropped are removed from `genres` too.
genres = genres[np.isin(genres, ["Ecchi", "Hentai", "Erotica"], invert=True)]
print(genres)


['Action' 'Award Winning' 'Sci-Fi' 'Adventure' 'Drama' 'Mystery'
 'Supernatural' 'Fantasy' 'Sports' 'Comedy' 'Romance' 'Slice of Life'
 'Suspense' 'Gourmet' 'Avant Garde' 'Horror' 'Girls Love' 'Boys Love'
 'UNKNOWN']


In [9]:
# Name was kept only so the earlier previews were readable; it is free text and
# should not be fed to the neural network, so it is removed here.
# Note: item_train is rebound, so re-running this cell without re-creating the
# frame raises a KeyError because the column no longer exists.
item_train = item_train.drop(columns=["Name"])

### 2d. Replace all 'UNKNOWN' values in `Scored By` and `Episodes` Columns
In addition, convert numeric values that were read in as strings into numeric types so they can be used in computations.

In [10]:
# Build one boolean mask per column marking the entries that contain the
# 'UNKNOWN' placeholder. The .str accessor requires the columns to still hold
# strings, so this cell must run before the numeric conversion further down.
contains_unknown_scoredby, contains_unknown_episodes, contains_unknown_score = (
    item_train[column_name].str.contains('UNKNOWN')
    for column_name in ('Scored By', 'Episodes', 'Score')
)

for unknown_mask in (
    contains_unknown_scoredby,
    contains_unknown_episodes,
    contains_unknown_score,
):
    print(unknown_mask)

0        False
1        False
2        False
3        False
4        False
         ...  
24900     True
24901     True
24902     True
24903     True
24904     True
Name: Scored By, Length: 24905, dtype: bool
0        False
1        False
2        False
3        False
4        False
         ...  
24900    False
24901    False
24902    False
24903    False
24904    False
Name: Episodes, Length: 24905, dtype: bool
0        False
1        False
2        False
3        False
4        False
         ...  
24900     True
24901     True
24902     True
24903     True
24904     True
Name: Score, Length: 24905, dtype: bool


In [11]:
# Substitute 0 for the placeholder strings anywhere in the item DataFrame.
# NOTE(review): 'NaN' here is the literal string, so the second replace only
# affects cells holding that text; genuinely missing values (np.nan) are not
# touched by it — confirm that is the intent.
item_train = item_train.replace('UNKNOWN', 0).replace('NaN', 0)

In [12]:
# Diagnostic: list the non-null 'Scored By' entries that are still Python
# strings. Non-string entries are mapped to None and then dropped.
string_values = (
    item_train['Scored By']
    .dropna()
    .apply(lambda entry: entry if isinstance(entry, str) else None)
    .dropna()
)

print(string_values)

0        914193.0
1        206248.0
2        356739.0
3         42829.0
4          6413.0
           ...   
24590       129.0
24635       152.0
24729       114.0
24831       320.0
24856       194.0
Name: Scored By, Length: 15692, dtype: object


In [13]:
# Convert 'Scored By', 'Episodes' and 'Score' to numeric dtypes; with
# errors='coerce' any value that cannot be parsed becomes NaN instead of raising.
for numeric_column in ('Scored By', 'Episodes', 'Score'):
    item_train[numeric_column] = pd.to_numeric(item_train[numeric_column], errors='coerce')

# Repeat the string diagnostic: after conversion no string entries should remain.
string_values = (
    item_train['Scored By']
    .dropna()
    .apply(lambda entry: entry if isinstance(entry, str) else None)
    .dropna()
)

# Per-column count of missing values, followed by any leftover string entries.
print(item_train.isnull().sum())
print(string_values)

anime_id         0
Score            0
Episodes         0
Scored By        0
Action           0
Award Winning    0
Sci-Fi           0
Adventure        0
Drama            0
Mystery          0
Supernatural     0
Fantasy          0
Sports           0
Comedy           0
Romance          0
Slice of Life    0
Suspense         0
Gourmet          0
Avant Garde      0
Horror           0
Girls Love       0
Boys Love        0
UNKNOWN          0
dtype: int64
Series([], Name: Scored By, dtype: object)


## 3. Repeat Step 2's Process for user_train

NOTE: I separated these two loading processes because user_train data is significantly larger than anime_data

In [14]:
# Load the per-user rating records from a path relative to the notebook's
# working directory; the CSV must be present under ./data/ for this cell to run.
user_train = pd.read_csv("./data/users-score-2023.csv")
# Preview the first rows to see which columns are available.
user_train.head()

Unnamed: 0,user_id,Username,anime_id,Anime Title,rating
0,1,Xinil,21,One Piece,9
1,1,Xinil,48,.hack//Sign,7
2,1,Xinil,320,A Kite,5
3,1,Xinil,49,Aa! Megami-sama!,8
4,1,Xinil,304,Aa! Megami-sama! Movie,8


Once again, drop unnecessary features / qualitative descriptions from the dataset for user_train:

In [15]:
# Remove the two free-text columns; the numeric ids and the rating are kept.
# Note: user_train is rebound, so re-running without reloading raises a KeyError.
user_train = user_train.drop(["Username", "Anime Title"], axis="columns")
user_train.head()

Unnamed: 0,user_id,anime_id,rating
0,1,21,9
1,1,48,7
2,1,320,5
3,1,49,8
4,1,304,8


We'll also load a y_train (true values) from the user's rating to train the neural network with. This allows us to compute losses (MSE).

In [16]:
# Target values: a new frame derived from user_train with the two id columns
# removed (user_train itself is left unchanged by drop).
y_train = user_train.drop(columns=["user_id", "anime_id"])
y_train.head()

Unnamed: 0,rating
0,9
1,7
2,5
3,8
4,8


In [17]:
# Swap any 'UNKNOWN' placeholder for 0 in both the user records and the target
# frame so later numeric operations do not fail on strings.
user_train, y_train = (
    frame.replace('UNKNOWN', 0) for frame in (user_train, y_train)
)

## 4. Comparison of One-Hot Encoded Genres vs. User Preference Rating (Weighted Average)

To compare the two vectors, we compute a weighted average of genre scores for each user, which gives a more accurate estimate of that user's genre preferences, and compare it against each anime's one-hot encoded genres. This lets us predict how well each entry of the user's preference vector aligns with the genres present in the anime.

### 4a. Preparation of Dataset:

#### Step 1: Apply Bayesian Average to item_train (Animes) and then Merge Datasets on anime_id
Implementing a Bayesian average of the user ratings can help reflect ratings more accurately in genre preferences.

What does a Bayesian average compute?

- Bayesian average is good for computing a kind of weighted average for ranking data based on a confidence/reliability factor
- For example, if an originally unrated item immediately gets a positive rating of 5 stars, you don't want its value to shoot up, because a single rating is not reliable evidence

In [18]:
# Step 1: Apply Bayesian Average
# Bayesian_Rating is not defined in this notebook; presumably it is provided by
# the `from framework import *` star import. The previews in later cells show a
# 'Bayesian Rating' column on item_train after this call.
# NOTE(review): item_train is rebound to the helper's result, so re-running this
# cell feeds the already-augmented frame back in — confirm the helper tolerates that.
item_train = Bayesian_Rating(item_train)

In [19]:
# Step 1: Join every user rating record with the features of the anime it
# refers to. The default join type is inner, so rating rows whose anime_id is
# absent from item_train (and vice versa) are not carried into merged_df.
merged_df = user_train.merge(item_train, on="anime_id")
merged_df.head()

Unnamed: 0,user_id,anime_id,rating,Score,Episodes,Scored By,Action,Award Winning,Sci-Fi,Adventure,...,Romance,Slice of Life,Suspense,Gourmet,Avant Garde,Horror,Girls Love,Boys Love,UNKNOWN,Bayesian Rating
0,1,21,9,8.69,0.0,1226493.0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,8.619186
1,1,48,7,6.95,26.0,83009.0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,6.40701
2,1,320,5,6.54,2.0,33411.0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,5.630107
3,1,49,8,7.29,5.0,26400.0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,5.926462
4,1,304,8,7.54,1.0,22479.0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,5.93307


#### Step 2: Calculate Weighted Genre Scores for Each User

In [20]:
# Iterate over the genre column names held in `genres`.
# Multiply each one-hot genre flag by that anime's 'Bayesian Rating', so a genre
# column holds the rating where the anime has the genre and 0 elsewhere.
# Note: this updates merged_df in place, so re-running the cell multiplies again.
bayesian_weight = merged_df['Bayesian Rating']
for genre_column in genres:
    merged_df[genre_column] = merged_df[genre_column] * bayesian_weight

merged_df.head()

Unnamed: 0,user_id,anime_id,rating,Score,Episodes,Scored By,Action,Award Winning,Sci-Fi,Adventure,...,Romance,Slice of Life,Suspense,Gourmet,Avant Garde,Horror,Girls Love,Boys Love,UNKNOWN,Bayesian Rating
0,1,21,9,8.69,0.0,1226493.0,8.619186,0.0,0.0,8.619186,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.619186
1,1,48,7,6.95,26.0,83009.0,0.0,0.0,0.0,6.40701,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.40701
2,1,320,5,6.54,2.0,33411.0,5.630107,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.630107
3,1,49,8,7.29,5.0,26400.0,0.0,0.0,0.0,0.0,...,5.926462,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.926462
4,1,304,8,7.54,1.0,22479.0,0.0,0.0,0.0,0.0,...,5.93307,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.93307


#### Step 3: Apply Weighted Average with Bayesian Ratings

In [21]:
# weighted_average is not defined in this notebook; presumably it comes from the
# `framework` star import. Its result is merged on 'user_id' in the next cell,
# so it is expected to carry a 'user_id' column plus the per-genre scores.
user_genre_scores = weighted_average(merged_df, genres) # Get the user_genre_scores

#### Step 4: Append Average Genre Scores to User Dataset

In [22]:
# Reduce user_train to one row per distinct user_id, attach that user's inferred
# genre scores, and renumber the rows 0..n-1.
user_train = (
    user_train[['user_id']]
    .drop_duplicates()
    .merge(user_genre_scores, on='user_id')
    .reset_index(drop=True)
)

user_train.head()

Unnamed: 0,user_id,Action,Award Winning,Sci-Fi,Adventure,Drama,Mystery,Supernatural,Fantasy,Sports,Comedy,Romance,Slice of Life,Suspense,Gourmet,Avant Garde,Horror,Girls Love,Boys Love,UNKNOWN
0,1,3.038131,1.120823,1.772458,1.79811,2.405662,0.631593,0.95851,1.508028,0.486404,2.372774,2.373327,0.344604,0.407779,0.021713,0.180737,0.402367,0.010326,0.0,0.0
1,4,3.397572,0.323196,1.26266,1.10241,1.554629,1.011652,1.566381,1.8556,0.339982,1.858365,1.803661,0.202683,0.234407,0.06422,0.0,0.418753,0.0,0.077735,0.075333
2,9,3.809706,0.716667,1.913819,1.841969,2.416142,0.223892,1.167979,1.656381,0.219102,3.057448,2.747099,0.05523,0.164041,0.0,0.164041,0.537301,0.078627,0.0,0.0
3,20,4.535397,1.651167,1.528931,3.426816,1.953919,0.833883,1.068402,3.076587,0.491127,1.501018,1.027209,0.219422,0.739293,0.054257,0.075937,0.409882,0.0,0.0,0.0
4,23,5.129297,0.44347,1.917877,1.836391,1.338258,0.754057,0.864227,2.408736,0.665953,1.670143,0.900626,0.055477,0.308702,0.033708,0.035319,0.339052,0.01706,0.0,0.02792


#### Step 5: Normalization of `item_train`, `user_train`, and `y_train` Values
The item_train and user_train will be scaled using a self-implemented z-score normalization.

The y_train will be scaled using a self-implemented MinMaxScaler.

In [23]:
# zscore_normalization is not defined in this notebook (presumably from the
# `framework` star import). Judging by the preview that follows, the genre
# columns named in `genres` are standardized while anime_id, Score, Episodes,
# 'Scored By' and 'Bayesian Rating' keep their raw values.
# NOTE(review): item_train is rebound, so re-running normalizes the already
# normalized frame — confirm that is harmless.
item_train = zscore_normalization(item_train, genres)
item_train.head()

Unnamed: 0,anime_id,Score,Episodes,Scored By,Action,Award Winning,Sci-Fi,Adventure,Drama,Mystery,...,Romance,Slice of Life,Suspense,Gourmet,Avant Garde,Horror,Girls Love,Boys Love,UNKNOWN,Bayesian Rating
0,1,8.75,26.0,914193.0,2.065267,10.116336,2.656551,-0.427089,-0.358477,-0.187634,...,-0.300526,-0.275336,-0.099057,-0.076791,-0.182646,-0.148025,-0.067512,-0.082657,-0.496735,8.654271
1,5,8.38,1.0,206248.0,2.065267,-0.09885,2.656551,-0.427089,-0.358477,-0.187634,...,-0.300526,-0.275336,-0.099057,-0.076791,-0.182646,-0.148025,-0.067512,-0.082657,-0.496735,8.014284
2,6,8.22,26.0,356739.0,2.065267,-0.09885,2.656551,2.341431,-0.358477,-0.187634,...,-0.300526,-0.275336,-0.099057,-0.076791,-0.182646,-0.148025,-0.067512,-0.082657,-0.496735,8.00885
3,7,7.25,26.0,42829.0,2.065267,-0.09885,-0.376428,-0.427089,2.789576,5.329519,...,-0.300526,-0.275336,-0.099057,-0.076791,-0.182646,-0.148025,-0.067512,-0.082657,-0.496735,6.261687
4,8,6.94,52.0,6413.0,-0.484199,-0.09885,-0.376428,2.341431,-0.358477,-0.187634,...,-0.300526,-0.275336,-0.099057,-0.076791,-0.182646,-0.148025,-0.067512,-0.082657,-0.496735,4.760508


In [24]:
# Apply the same helper to the per-user genre scores. Judging by the preview
# that follows, the genre columns are standardized and user_id is left as-is.
# NOTE(review): user_train is rebound, so re-running normalizes the already
# normalized frame — confirm that is harmless.
user_train = zscore_normalization(user_train, genres)
user_train.head()

Unnamed: 0,user_id,Action,Award Winning,Sci-Fi,Adventure,Drama,Mystery,Supernatural,Fantasy,Sports,Comedy,Romance,Slice of Life,Suspense,Gourmet,Avant Garde,Horror,Girls Love,Boys Love,UNKNOWN
0,1,-0.286397,0.14785,0.209152,-0.184651,0.106592,-0.213934,-0.451416,-0.465044,0.783547,-0.029393,0.228471,0.316931,-0.142948,-0.030002,0.120626,-0.030104,-0.148803,-0.206314,-0.155178
1,4,-0.103242,-0.51962,-0.156034,-0.56867,-0.443017,0.159665,-0.011477,-0.268783,0.448202,-0.361654,-0.131782,-0.000187,-0.331234,0.273886,-0.315177,-0.007609,-0.196655,-0.004519,1.023837
2,9,0.106764,-0.190356,0.310414,-0.160441,0.11336,-0.614705,-0.299815,-0.381275,0.171356,0.412844,0.464843,-0.329662,-0.407652,-0.185236,0.080368,0.155136,0.167707,-0.206314,-0.155178
3,20,0.476545,0.591652,0.034705,0.714377,-0.18515,-0.015082,-0.371882,0.420666,0.794364,-0.592467,-0.622806,0.037218,0.217083,0.202658,-0.132074,-0.019787,-0.196655,-0.206314,-0.155178
4,23,0.779171,-0.418972,0.313321,-0.16352,-0.582751,-0.093551,-0.519652,0.043554,1.194761,-0.483229,-0.702856,-0.329112,-0.250547,0.055747,-0.230014,-0.117024,-0.1176,-0.206314,0.281782


In [25]:
# Min-Max Scaler
# Rescale the rating targets with the framework's MinMaxScaler, passing -1 and 1
# as bounds (presumably the lower/upper limits of the output range).
y_train_arr = MinMaxScaler(y_train['rating'].to_numpy(), -1, 1)
print(y_train_arr)

[ 0.77777778  0.33333333 -0.11111111 ...  0.77777778  1.
  1.        ]


Convert `item_train` and `user_train` into np arrays:

In [26]:
# Convert the whole item frame to a 2-D NumPy array. Every column is included,
# so anime_id and the raw Score / Episodes / 'Scored By' values sit alongside
# the standardized genre columns.
item_train_arr = item_train.to_numpy()
print(item_train_arr)

[[ 1.00000000e+00  8.75000000e+00  2.60000000e+01 ... -8.26568054e-02
  -4.96735438e-01  8.65427074e+00]
 [ 5.00000000e+00  8.38000000e+00  1.00000000e+00 ... -8.26568054e-02
  -4.96735438e-01  8.01428407e+00]
 [ 6.00000000e+00  8.22000000e+00  2.60000000e+01 ... -8.26568054e-02
  -4.96735438e-01  8.00884970e+00]
 ...
 [ 5.57330000e+04  0.00000000e+00  1.60000000e+01 ... -8.26568054e-02
  -4.96735438e-01  4.02043445e+00]
 [ 5.57340000e+04  0.00000000e+00  1.00000000e+00 ... -8.26568054e-02
   2.01314407e+00  4.02043445e+00]
 [ 5.57350000e+04  0.00000000e+00  1.00000000e+00 ... -8.26568054e-02
   2.01314407e+00  4.02043445e+00]]


In [27]:
# Convert the whole user frame to a 2-D NumPy array. Every column is included,
# so user_id is the first column ahead of the standardized genre scores.
user_train_arr = user_train.to_numpy()
print(user_train_arr)

[[ 1.00000000e+00 -2.86397450e-01  1.47850053e-01 ... -1.48802979e-01
  -2.06313537e-01 -1.55178035e-01]
 [ 4.00000000e+00 -1.03241673e-01 -5.19620240e-01 ... -1.96654595e-01
  -4.51902404e-03  1.02383703e+00]
 [ 9.00000000e+00  1.06764136e-01 -1.90356043e-01 ...  1.67707065e-01
  -2.06313537e-01 -1.55178035e-01]
 ...
 [ 1.29108500e+06 -9.31552473e-01  1.04514815e+00 ... -1.96654595e-01
  -2.06313537e-01 -1.55178035e-01]
 [ 1.29108700e+06 -5.69106625e-01  2.43298816e-02 ...  5.81730961e-01
  -2.06313537e-01 -1.55178035e-01]
 [ 1.29109700e+06 -6.57008705e-01 -7.90076730e-01 ... -1.96654595e-01
  -2.06313537e-01 -1.55178035e-01]]


#### Step 6: Splitting into Train and Test Sets
Implemented using a self-made train_test_split function:

In [28]:
# Split each array with train_test_split, which is not defined in this notebook
# (presumably from the `framework` star import); its return order is not
# visible here.
# NOTE(review): the recorded output of this cell reports 4981 training rows and
# 19924 test rows for the items, i.e. a 20/80 split even though train_size=0.80
# is requested — confirm whether the helper returns (test, train) or interprets
# train_size differently.
# NOTE(review): the three arrays are split independently. item rows are one per
# anime, user rows are one per distinct user_id, and y rows are one per rating,
# so the resulting train sets are not row-aligned with one another — verify this
# matches what the downstream network expects.
# NOTE(review): the *_train_arr names are rebound, so re-running this cell
# splits the already-reduced training arrays again.
item_train_arr, item_test_arr = train_test_split(item_train_arr, train_size=0.80, random_state=1, shuffle=True)
user_train_arr, user_test_arr = train_test_split(user_train_arr, train_size=0.80, random_state=1, shuffle=True)
y_train_arr, y_test_arr       = train_test_split(y_train_arr,    train_size=0.80, random_state=1, shuffle=True)
print(f"Item training data shape: {item_train_arr.shape}")
print(f"Item test data shape: {item_test_arr.shape}")

Item training data shape: (4981, 24)
Item test data shape: (19924, 24)


## 5. Building a Deep Learning Neural Network to Compute User and Item Vectors

Here, two neural networks based on the user and the item will be used to compute the properties of the user and the item. Then, by taking the dot product of these two vectors, a comparison/prediction can be made based on these two values.

The Neural Network implemented will be self made