## 1. Importing Required External & Internal Libraries

In [21]:
# Imports
import pandas as pd
import numpy as np
from numpy import genfromtxt
import csv
from collections import defaultdict

In [22]:
import sys
# Add the sibling `framework` directory to the module search path so it can be imported below.
sys.path.append('../framework')
# NOTE(review): the wildcard import hides which names come from `framework`; later cells call
# zscore_normalization and weighted_average, which are presumably provided here — confirm.
from framework import *

## 2. Preparing the Dataset in a Pandas DataFrame

In [23]:
# Read the anime metadata CSV into a DataFrame and preview its first rows.
item_train = pd.read_csv('./data/anime-dataset-2023.csv', sep=',')
item_train.head(5)

Unnamed: 0,anime_id,Name,English name,Other name,Score,Genres,Synopsis,Type,Episodes,Aired,...,Studios,Source,Duration,Rating,Rank,Popularity,Favorites,Scored By,Members,Image URL
0,1,Cowboy Bebop,Cowboy Bebop,カウボーイビバップ,8.75,"Action, Award Winning, Sci-Fi","Crime is timeless. By the year 2071, humanity ...",TV,26.0,"Apr 3, 1998 to Apr 24, 1999",...,Sunrise,Original,24 min per ep,R - 17+ (violence & profanity),41.0,43,78525,914193.0,1771505,https://cdn.myanimelist.net/images/anime/4/196...
1,5,Cowboy Bebop: Tengoku no Tobira,Cowboy Bebop: The Movie,カウボーイビバップ 天国の扉,8.38,"Action, Sci-Fi","Another day, another bounty—such is the life o...",Movie,1.0,"Sep 1, 2001",...,Bones,Original,1 hr 55 min,R - 17+ (violence & profanity),189.0,602,1448,206248.0,360978,https://cdn.myanimelist.net/images/anime/1439/...
2,6,Trigun,Trigun,トライガン,8.22,"Action, Adventure, Sci-Fi","Vash the Stampede is the man with a $$60,000,0...",TV,26.0,"Apr 1, 1998 to Sep 30, 1998",...,Madhouse,Manga,24 min per ep,PG-13 - Teens 13 or older,328.0,246,15035,356739.0,727252,https://cdn.myanimelist.net/images/anime/7/203...
3,7,Witch Hunter Robin,Witch Hunter Robin,Witch Hunter ROBIN (ウイッチハンターロビン),7.25,"Action, Drama, Mystery, Supernatural",Robin Sena is a powerful craft user drafted in...,TV,26.0,"Jul 3, 2002 to Dec 25, 2002",...,Sunrise,Original,25 min per ep,PG-13 - Teens 13 or older,2764.0,1795,613,42829.0,111931,https://cdn.myanimelist.net/images/anime/10/19...
4,8,Bouken Ou Beet,Beet the Vandel Buster,冒険王ビィト,6.94,"Adventure, Fantasy, Supernatural",It is the dark century and the people are suff...,TV,52.0,"Sep 30, 2004 to Sep 29, 2005",...,Toei Animation,Manga,23 min per ep,PG - Children,4240.0,5126,14,6413.0,15001,https://cdn.myanimelist.net/images/anime/7/215...


### 2a. Drop columns that contain text values / unneeded features:

In [24]:
# Discard the descriptive and popularity columns that are not used as item features;
# the preview below shows what remains.
item_train = item_train.drop(
    columns=[
        'English name',
        'Other name',
        'Synopsis',
        'Type',
        'Aired',
        'Premiered',
        'Status',
        'Producers',
        'Licensors',
        'Studios',
        'Source',
        'Duration',
        'Rating',
        'Rank',
        'Popularity',
        'Favorites',
        'Members',
        'Image URL',
    ]
)
item_train.head(5)

Unnamed: 0,anime_id,Name,Score,Genres,Episodes,Scored By
0,1,Cowboy Bebop,8.75,"Action, Award Winning, Sci-Fi",26.0,914193.0
1,5,Cowboy Bebop: Tengoku no Tobira,8.38,"Action, Sci-Fi",1.0,206248.0
2,6,Trigun,8.22,"Action, Adventure, Sci-Fi",26.0,356739.0
3,7,Witch Hunter Robin,7.25,"Action, Drama, Mystery, Supernatural",26.0,42829.0
4,8,Bouken Ou Beet,6.94,"Adventure, Fantasy, Supernatural",52.0,6413.0


### 2b. Extract Unique Genres, Compile into a List of Genres

In [25]:
# Split every "Genres" string on ', ', flatten to one genre name per entry,
# and keep the distinct names (in order of first appearance) as an array.
genres = (
    item_train['Genres']
    .str.split(', ')
    .explode()
    .unique()
)

### 2c. One-Hot Encoding for Genres into the DataFrame

NOTE: The dataset contains some genres with __sensitive or potentially sensitive content__. I believe that AI/ML should be used in an __ethical manner__.

Therefore, we __remove__ the genres that contain sensitive or potentially sensitive content.

In [26]:
# One-hot encode: add one 0/1 indicator column per genre, in the same order as `genres`.
# str.get_dummies matches whole genre names between the ', ' separators, so a genre whose
# name is contained inside another genre's name is never flagged by mistake (a plain
# `genre in x` test is a substring check on the raw string). It is also vectorized,
# avoiding a Python-level lambda call per row per genre.
genre_indicators = (
    item_train['Genres']
    .str.get_dummies(sep=', ')
    # get_dummies sorts its columns alphabetically; restore the order of `genres`.
    .reindex(columns=genres, fill_value=0)
)
item_train = pd.concat([item_train, genre_indicators], axis=1)

# Remove original genre label, drop sensitive content
item_train = item_train.drop(columns=["Genres", "Ecchi", "Hentai", "Erotica"])
item_train.head()

Unnamed: 0,anime_id,Name,Score,Episodes,Scored By,Action,Award Winning,Sci-Fi,Adventure,Drama,...,Comedy,Romance,Slice of Life,Suspense,Gourmet,Avant Garde,Horror,Girls Love,Boys Love,UNKNOWN
0,1,Cowboy Bebop,8.75,26.0,914193.0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5,Cowboy Bebop: Tengoku no Tobira,8.38,1.0,206248.0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,6,Trigun,8.22,26.0,356739.0,1,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
3,7,Witch Hunter Robin,7.25,26.0,42829.0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,8,Bouken Ou Beet,6.94,52.0,6413.0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
# Keep only the genre names that are not in the sensitive-content list, then show the result.
genres = genres[np.isin(genres, ["Ecchi", "Hentai", "Erotica"], invert=True)]
print(genres)


['Action' 'Award Winning' 'Sci-Fi' 'Adventure' 'Drama' 'Mystery'
 'Supernatural' 'Fantasy' 'Sports' 'Comedy' 'Romance' 'Slice of Life'
 'Suspense' 'Gourmet' 'Avant Garde' 'Horror' 'Girls Love' 'Boys Love'
 'UNKNOWN']


In [29]:
# The title was kept only to make the earlier previews readable; it is not fed to the NN, so drop it here.
item_train = item_train.drop(columns="Name")

## 3. Repeat Step 2's Process for `user_train`

NOTE: I separated these two loading processes because user_train data is significantly larger than anime_data

In [30]:
# Read the user rating records into a DataFrame and preview its first rows.
user_train = pd.read_csv(filepath_or_buffer="./data/users-score-2023.csv")
user_train.head(5)

Unnamed: 0,user_id,Username,anime_id,Anime Title,rating
0,1,Xinil,21,One Piece,9
1,1,Xinil,48,.hack//Sign,7
2,1,Xinil,320,A Kite,5
3,1,Xinil,49,Aa! Megami-sama!,8
4,1,Xinil,304,Aa! Megami-sama! Movie,8


Once again, drop unnecessary features / qualitative descriptions from the dataset for user_train:

In [31]:
# Remove the two text columns so only user_id, anime_id and rating remain.
user_train = user_train.drop(["Username", "Anime Title"], axis="columns")
user_train.head(5)

Unnamed: 0,user_id,anime_id,rating
0,1,21,9
1,1,48,7
2,1,320,5
3,1,49,8
4,1,304,8


## 4. Comparison of One-Hot Encoded Genres vs. User Preference Rating (Weighted Average)

To compare the two vectors, we compute for each user a rating-weighted average over the one-hot encoded genres of the anime they rated, which gives a more accurate estimate of that user's genre preferences. This allows us to predict how each value of the user's preference vector aligns with the one-hot encoded genres that exist in an anime.

### 4a. Preparation of Dataset:

#### Step 1: Merge Datasets on `anime_id`

In [32]:
# Step 1: attach each rated anime's item features to the rating row that references it.
# Uses the default inner join on anime_id.
merged_df = user_train.merge(item_train, on="anime_id")
merged_df.head(5)

Unnamed: 0,user_id,anime_id,rating,Score,Episodes,Scored By,Action,Award Winning,Sci-Fi,Adventure,...,Comedy,Romance,Slice of Life,Suspense,Gourmet,Avant Garde,Horror,Girls Love,Boys Love,UNKNOWN
0,1,21,9,8.69,UNKNOWN,1226493.0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,1,48,7,6.95,26.0,83009.0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,1,320,5,6.54,2.0,33411.0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,49,8,7.29,5.0,26400.0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
4,1,304,8,7.54,1.0,22479.0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0


#### Step 2: Calculate Weighted Genre Scores for Each User

In [33]:
# Weight every genre indicator by the rating the user gave that anime: each genre column
# becomes `rating` where the anime has the genre and stays 0 where it does not.
#
# The indicator is rebuilt with `.ne(0)` instead of multiplying the column in place, so
# re-running this cell (before the normalization step) gives the same values rather than
# multiplying by the rating a second time.
#
# Columns are processed one at a time so only a single temporary column is alive at once.
for genre_column in genres:
    merged_df[genre_column] = merged_df[genre_column].ne(0) * merged_df['rating']

merged_df.head()

Unnamed: 0,user_id,anime_id,rating,Score,Episodes,Scored By,Action,Award Winning,Sci-Fi,Adventure,...,Comedy,Romance,Slice of Life,Suspense,Gourmet,Avant Garde,Horror,Girls Love,Boys Love,UNKNOWN
0,1,21,9,8.69,UNKNOWN,1226493.0,9,0,0,9,...,0,0,0,0,0,0,0,0,0,0
1,1,48,7,6.95,26.0,83009.0,0,0,0,7,...,0,0,0,0,0,0,0,0,0,0
2,1,320,5,6.54,2.0,33411.0,5,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,49,8,7.29,5.0,26400.0,0,0,0,0,...,8,8,0,0,0,0,0,0,0,0
4,1,304,8,7.54,1.0,22479.0,0,0,0,0,...,8,8,0,0,0,0,0,0,0,0


#### Step 3: Sum and Normalize (Feature Scaling) to Get Average Genre Scores for Each User

The implementation below applies z-score normalization to normalize the weighted scores, helping with scaling between genres:

In [34]:
# zscore_normalization is not defined in this notebook; it is presumably brought in by the
# wildcard `framework` import (the explicit import is kept below for reference).
# from framework import zscore_normalization
# NOTE(review): presumably standardizes each column named in `genres` and returns the
# updated frame — confirm against the framework source.
merged_df = zscore_normalization(merged_df, genres)
merged_df.head()

Unnamed: 0,user_id,anime_id,rating,Score,Episodes,Scored By,Action,Award Winning,Sci-Fi,Adventure,...,Comedy,Romance,Slice of Life,Suspense,Gourmet,Avant Garde,Horror,Girls Love,Boys Love,UNKNOWN
0,1,21,9,8.69,UNKNOWN,1226493.0,1.429729,-0.292585,-0.505362,2.173666,...,-0.761861,-0.611245,-0.218972,-0.228194,-0.078,-0.110509,-0.233722,-0.095307,-0.103351,-0.071762
1,1,48,7,6.95,26.0,83009.0,-0.8468,-0.292585,-0.505362,1.57262,...,-0.761861,-0.611245,-0.218972,-0.228194,-0.078,-0.110509,-0.233722,-0.095307,-0.103351,-0.071762
2,1,320,5,6.54,2.0,33411.0,0.417938,-0.292585,-0.505362,-0.53104,...,-0.761861,-0.611245,-0.218972,-0.228194,-0.078,-0.110509,-0.233722,-0.095307,-0.103351,-0.071762
3,1,49,8,7.29,5.0,26400.0,-0.8468,-0.292585,-0.505362,-0.53104,...,1.349621,1.649109,-0.218972,-0.228194,-0.078,-0.110509,-0.233722,-0.095307,-0.103351,-0.071762
4,1,304,8,7.54,1.0,22479.0,-0.8468,-0.292585,-0.505362,-0.53104,...,1.349621,1.649109,-0.218972,-0.228194,-0.078,-0.110509,-0.233722,-0.095307,-0.103351,-0.071762


#### Step 4: Weighted Average of Features/Ratings

Implementing a weighted average of the user ratings can help reflect ratings more accurately in genre preferences:

In [35]:
# Sanity check: list the columns of the merged frame before computing the per-user averages.
print(merged_df.columns)

Index(['user_id', 'anime_id', 'rating', 'Score', 'Episodes', 'Scored By',
       'Action', 'Award Winning', 'Sci-Fi', 'Adventure', 'Drama', 'Mystery',
       'Supernatural', 'Fantasy', 'Sports', 'Comedy', 'Romance',
       'Slice of Life', 'Suspense', 'Gourmet', 'Avant Garde', 'Horror',
       'Girls Love', 'Boys Love', 'UNKNOWN'],
      dtype='object')


In [36]:
# NOTE(review): weighted_average is not defined in this notebook (presumably from `framework`);
# it is assumed to aggregate the genre columns per user and to return a 'user_id' column,
# since the result is merged on 'user_id' afterwards — confirm against the framework source.
user_genre_scores = weighted_average(merged_df, genres) # Get the user_genre_scores

#### Step 5: Append Average Genre Scores to User Dataset

In [37]:
# Rebuild user_train as a per-user table: take the distinct user_ids from the rating rows
# and join each one with its inferred genre scores.
user_train = (
    user_train[['user_id']]
    .drop_duplicates()
    .merge(user_genre_scores, on='user_id')
)

user_train.head(5)

Unnamed: 0,user_id,Action,Award Winning,Sci-Fi,Adventure,Drama,Mystery,Supernatural,Fantasy,Sports,Comedy,Romance,Slice of Life,Suspense,Gourmet,Avant Garde,Horror,Girls Love,Boys Love,UNKNOWN
0,1,-0.021378,0.221816,0.131792,0.071548,0.161339,-0.077268,-0.128757,-0.155351,0.249422,-0.024359,0.186083,0.014375,-0.018389,-0.023562,0.077137,0.021613,-0.088824,-0.103351,-0.071762
1,4,0.039131,-0.151716,-0.073285,-0.173429,-0.115357,0.069156,0.083315,-0.098244,0.101342,-0.262193,-0.1288,-0.072879,-0.080094,0.055304,-0.110509,0.043455,-0.095307,-0.029656,0.087025
2,9,0.217499,-0.039419,0.16128,0.087401,0.252347,-0.275958,0.06561,-0.025604,-0.001683,0.342347,0.549386,-0.160135,-0.210736,-0.078,-0.075116,0.18158,0.131655,-0.103351,-0.071762
3,20,0.359037,0.485691,0.019793,0.615281,-0.037607,0.0042,-0.093516,0.335923,0.349621,-0.335384,-0.307994,-0.067511,0.201519,0.047413,-0.0044,0.010198,-0.095307,-0.103351,-0.071762
4,23,0.597756,-0.096005,0.165797,0.102159,-0.207201,-0.027355,-0.160734,0.143237,0.344683,-0.287455,-0.344958,-0.201523,-0.054028,-0.014147,-0.109982,0.045201,-0.071276,-0.103351,-0.026659


## 5. Building a Deep Learning Neural Network to Compute User and Item Vectors

Here, two neural networks based on the user and the item will be used to compute the properties of the user and the item. Then, by taking the dot product of these two vectors, a comparison/prediction can be made based on these two values.

The Neural Network implemented will be self made