# ANIME RECOMMENDER SYSTEM - PREPROCESSING - ANIME - STEP 1

In [11]:
# basic library
import collections
import itertools
import pickle
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

warnings.filterwarnings('ignore')

In [4]:
# load data
# Read the cleaned anime table from the project-relative dataset folder.
anime_df = pd.read_csv('dataset/cleaned_dataset/anime.csv')
# Preview the first rows to sanity-check the columns that were loaded.
anime_df.head()

Unnamed: 0,anime_id,title,score,rating_count,ranked,popularity,members,type,studio,synopsis,episode_count,genre,url,img
0,1,Cowboy Bebop,8.75,872410.0,38.0,43,1688684,TV,Sunrise,Crime is timeless. By the year 2071 humanity h...,26.0,"Action, Sci-Fi, Adult Cast, Space",https://myanimelist.net/anime/1/Cowboy_Bebop,https://cdn.myanimelist.net/images/anime/4/196...
1,5,Cowboy Bebop: Tengoku no Tobira,8.38,199142.0,182.0,584,346712,Movie,Bones,Another day another bounty�such is the life of...,1.0,"Action, Sci-Fi, Adult Cast, Space",https://myanimelist.net/anime/5/Cowboy_Bebop__...,https://cdn.myanimelist.net/images/anime/1439/...
2,6,Trigun,8.22,339167.0,323.0,246,685752,TV,Madhouse,Vash the Stampede is the man with a $$60000000...,26.0,"Action, Adventure, Comedy, Drama, Sci-Fi, Adul...",https://myanimelist.net/anime/6/Trigun,https://cdn.myanimelist.net/images/anime/7/203...
3,7,Witch Hunter Robin,7.25,42115.0,2737.0,1724,108510,TV,Sunrise,Witches are individuals with special powers li...,26.0,"Action, Drama, Mystery, Supernatural, Detective",https://myanimelist.net/anime/7/Witch_Hunter_R...,https://cdn.myanimelist.net/images/anime/10/19...
4,8,Bouken Ou Beet,6.95,6308.0,4173.0,4964,14610,TV,Toei Animation,It is the dark century and the people are suff...,52.0,"Adventure, Fantasy, Supernatural, Shounen",https://myanimelist.net/anime/8/Bouken_Ou_Beet,https://cdn.myanimelist.net/images/anime/7/215...


In [5]:
# Print column dtypes and non-null counts for the freshly loaded frame.
anime_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23765 entries, 0 to 23764
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   anime_id       23765 non-null  int64  
 1   title          23765 non-null  object 
 2   score          23765 non-null  float64
 3   rating_count   23765 non-null  float64
 4   ranked         23765 non-null  float64
 5   popularity     23765 non-null  int64  
 6   members        23765 non-null  int64  
 7   type           23765 non-null  object 
 8   studio         23765 non-null  object 
 9   synopsis       23765 non-null  object 
 10  episode_count  23765 non-null  float64
 11  genre          23765 non-null  object 
 12  url            23765 non-null  object 
 13  img            23765 non-null  object 
dtypes: float64(4), int64(3), object(7)
memory usage: 2.5+ MB


## Preprocessing steps

1. scaling (numerical features)
2. encoding (categorical features)
3. word embedding
4. image embedding

### scaling
- score
- rating_count
- ranked
- popularity
- members
- episode_count

In [6]:
# Numerical feature columns that get rescaled in the next cell.
numerical_col = [
    'score',
    'rating_count',
    'ranked',
    'popularity',
    'members',
    'episode_count',
]

In [7]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale every numerical feature into [0, 1].
# MinMaxScaler works per feature, so fitting once on all columns yields the
# same values as fitting column by column, while leaving `scaler` holding the
# min/max of *every* numerical column (not just the last one processed), so it
# can be reused to transform or inverse-transform later.
scaler = MinMaxScaler()
anime_df[numerical_col] = scaler.fit_transform(anime_df[numerical_col])
anime_df.head()

Unnamed: 0,anime_id,title,score,rating_count,ranked,popularity,members,type,studio,synopsis,episode_count,genre,url,img
0,1,Cowboy Bebop,0.960483,0.341453,0.001762,0.001828,0.472131,TV,Sunrise,Crime is timeless. By the year 2071 humanity h...,0.008505,"Action, Sci-Fi, Adult Cast, Space",https://myanimelist.net/anime/1/Cowboy_Bebop,https://cdn.myanimelist.net/images/anime/4/196...
1,5,Cowboy Bebop: Tengoku no Tobira,0.919868,0.077942,0.008438,0.024833,0.096935,Movie,Bones,Another day another bounty�such is the life of...,0.000327,"Action, Sci-Fi, Adult Cast, Space",https://myanimelist.net/anime/5/Cowboy_Bebop__...,https://cdn.myanimelist.net/images/anime/1439/...
2,6,Trigun,0.902305,0.132747,0.014975,0.010461,0.191726,TV,Madhouse,Vash the Stampede is the man with a $$60000000...,0.008505,"Action, Adventure, Comedy, Drama, Sci-Fi, Adul...",https://myanimelist.net/anime/6/Trigun,https://cdn.myanimelist.net/images/anime/7/203...
3,7,Witch Hunter Robin,0.795829,0.016483,0.126895,0.073309,0.030338,TV,Sunrise,Witches are individuals with special powers li...,0.008505,"Action, Drama, Mystery, Supernatural, Detective",https://myanimelist.net/anime/7/Witch_Hunter_R...,https://cdn.myanimelist.net/images/anime/10/19...
4,8,Bouken Ou Beet,0.762898,0.002469,0.193472,0.211081,0.004085,TV,Toei Animation,It is the dark century and the people are suff...,0.01701,"Adventure, Fantasy, Supernatural, Shounen",https://myanimelist.net/anime/8/Bouken_Ou_Beet,https://cdn.myanimelist.net/images/anime/7/215...


### encoding
- type
- studio
- genre

In [8]:
# Single-valued categorical columns that get integer-encoded in the next cell.
categorical_col = [
    'type',
    'studio',
]

In [9]:
# label encode categorical data
from sklearn.preprocessing import LabelEncoder

# Keep each fitted encoder so the integer codes can be mapped back to the
# original labels (encoder.classes_ / encoder.inverse_transform) later on.
label_encoders = {}
for col in categorical_col:
    encoder = LabelEncoder()
    # Replace the string labels with integer codes in place.
    anime_df[col] = encoder.fit_transform(anime_df[col])
    label_encoders[col] = encoder
anime_df.head()

Unnamed: 0,anime_id,title,score,rating_count,ranked,popularity,members,type,studio,synopsis,episode_count,genre,url,img
0,1,Cowboy Bebop,0.960483,0.341453,0.001762,0.001828,0.472131,4,743,Crime is timeless. By the year 2071 humanity h...,0.008505,"Action, Sci-Fi, Adult Cast, Space",https://myanimelist.net/anime/1/Cowboy_Bebop,https://cdn.myanimelist.net/images/anime/4/196...
1,5,Cowboy Bebop: Tengoku no Tobira,0.919868,0.077942,0.008438,0.024833,0.096935,0,103,Another day another bounty�such is the life of...,0.000327,"Action, Sci-Fi, Adult Cast, Space",https://myanimelist.net/anime/5/Cowboy_Bebop__...,https://cdn.myanimelist.net/images/anime/1439/...
2,6,Trigun,0.902305,0.132747,0.014975,0.010461,0.191726,4,399,Vash the Stampede is the man with a $$60000000...,0.008505,"Action, Adventure, Comedy, Drama, Sci-Fi, Adul...",https://myanimelist.net/anime/6/Trigun,https://cdn.myanimelist.net/images/anime/7/203...
3,7,Witch Hunter Robin,0.795829,0.016483,0.126895,0.073309,0.030338,4,743,Witches are individuals with special powers li...,0.008505,"Action, Drama, Mystery, Supernatural, Detective",https://myanimelist.net/anime/7/Witch_Hunter_R...,https://cdn.myanimelist.net/images/anime/10/19...
4,8,Bouken Ou Beet,0.762898,0.002469,0.193472,0.211081,0.004085,4,789,It is the dark century and the people are suff...,0.01701,"Adventure, Fantasy, Supernatural, Shounen",https://myanimelist.net/anime/8/Bouken_Ou_Beet,https://cdn.myanimelist.net/images/anime/7/215...


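#### genre (multi-valued: split the comma-separated string into a list, then create one indicator column per genre)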

In [13]:
# function to split:
def split(col, df=None):
    """Turn a comma-separated string column into a column of lists, in place.

    Parameters
    ----------
    col : str
        Name of the column whose values look like ``"A, B, C"``.
    df : pandas.DataFrame, optional
        Frame to modify. Defaults to the notebook-level ``anime_df``.

    Returns
    -------
    None
        The column is overwritten on the frame itself.

    Notes
    -----
    Only ``str`` values are split; anything else (for example a value that
    was already converted to a list by an earlier run of this cell) is left
    untouched, so the cell can be re-run safely.
    """
    if df is None:
        df = anime_df
    df[col] = df[col].apply(
        lambda value: value.split(', ') if isinstance(value, str) else value
    )

# Columns stored as comma-separated strings that should become lists.
col_to_split_list = ['genre']
for column_name in col_to_split_list:
    split(column_name)

In [14]:
# Flatten the per-title genre lists into one sequence, then reduce it to the
# sorted unique genre names as a plain Python list.
flat = np.unique(
    np.array(list(itertools.chain.from_iterable(anime_df['genre'].values.tolist())))
).tolist()

flat

['Action',
 'Adult Cast',
 'Adventure',
 'Anthropomorphic',
 'Avant Garde',
 'Award Winning',
 'Boys Love',
 'CGDCT',
 'Childcare',
 'Combat Sports',
 'Comedy',
 'Crossdressing',
 'Delinquents',
 'Detective',
 'Drama',
 'Ecchi',
 'Educational',
 'Erotica',
 'Fantasy',
 'Gag Humor',
 'Girls Love',
 'Gore',
 'Gourmet',
 'Harem',
 'Hentai',
 'High Stakes Game',
 'Historical',
 'Horror',
 'Idols (Female)',
 'Idols (Male)',
 'Isekai',
 'Iyashikei',
 'Josei',
 'Kids',
 'Love Polygon',
 'Magical Sex Shift',
 'Mahou Shoujo',
 'Martial Arts',
 'Mecha',
 'Medical',
 'Military',
 'Music',
 'Mystery',
 'Mythology',
 'Organized Crime',
 'Otaku Culture',
 'Parody',
 'Performing Arts',
 'Pets',
 'Psychological',
 'Racing',
 'Reincarnation',
 'Reverse Harem',
 'Romance',
 'Romantic Subtext',
 'Samurai',
 'School',
 'Sci-Fi',
 'Seinen',
 'Shoujo',
 'Shounen',
 'Showbiz',
 'Slice of Life',
 'Space',
 'Sports',
 'Strategy Game',
 'Super Power',
 'Supernatural',
 'Survival',
 'Suspense',
 'Team Sports',
 

In [17]:
# Number of distinct genre names collected in `flat`.
len(flat)

77

In [18]:
# One-hot encode the multi-valued genre column: one 0/1 indicator column per
# genre name in `flat`, set to 1 when that name is in the title's genre list.
# All indicators are built first and attached with a single concat, which
# avoids inserting dozens of columns one at a time into the frame.
genre_flags = pd.DataFrame(
    {
        genre_name: [int(genre_name in genres) for genres in anime_df['genre']]
        for genre_name in flat
    },
    index=anime_df.index,
)
# Drop any indicator columns left over from a previous run so that re-running
# this cell replaces them instead of duplicating them.
anime_df = pd.concat(
    [anime_df.drop(columns=flat, errors='ignore'), genre_flags],
    axis=1,
)

In [19]:
# Preview the frame now that the genre indicator columns have been added.
anime_df.head()

Unnamed: 0,anime_id,title,score,rating_count,ranked,popularity,members,type,studio,synopsis,...,Supernatural,Survival,Suspense,Team Sports,Time Travel,Unknown,Vampire,Video Game,Visual Arts,Workplace
0,1,Cowboy Bebop,0.960483,0.341453,0.001762,0.001828,0.472131,4,743,Crime is timeless. By the year 2071 humanity h...,...,0,0,0,0,0,0,0,0,0,0
1,5,Cowboy Bebop: Tengoku no Tobira,0.919868,0.077942,0.008438,0.024833,0.096935,0,103,Another day another bounty�such is the life of...,...,0,0,0,0,0,0,0,0,0,0
2,6,Trigun,0.902305,0.132747,0.014975,0.010461,0.191726,4,399,Vash the Stampede is the man with a $$60000000...,...,0,0,0,0,0,0,0,0,0,0
3,7,Witch Hunter Robin,0.795829,0.016483,0.126895,0.073309,0.030338,4,743,Witches are individuals with special powers li...,...,1,0,0,0,0,0,0,0,0,0
4,8,Bouken Ou Beet,0.762898,0.002469,0.193472,0.211081,0.004085,4,789,It is the dark century and the people are suff...,...,1,0,0,0,0,0,0,0,0,0


In [20]:
# Remove the 'Unknown' genre indicator (presumably the placeholder for titles
# without genre information - confirm against the cleaning notebook).
# errors='ignore' keeps the cell re-runnable once the column is already gone.
anime_df = anime_df.drop(columns=['Unknown'], errors='ignore')

In [21]:
# Re-check column dtypes and non-null counts after the encoding steps.
anime_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23765 entries, 0 to 23764
Data columns (total 90 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   anime_id           23765 non-null  int64  
 1   title              23765 non-null  object 
 2   score              23765 non-null  float64
 3   rating_count       23765 non-null  float64
 4   ranked             23765 non-null  float64
 5   popularity         23765 non-null  float64
 6   members            23765 non-null  float64
 7   type               23765 non-null  int32  
 8   studio             23765 non-null  int32  
 9   synopsis           23765 non-null  object 
 10  episode_count      23765 non-null  float64
 11  genre              23765 non-null  object 
 12  url                23765 non-null  object 
 13  img                23765 non-null  object 
 14  Action             23765 non-null  int64  
 15  Adult Cast         23765 non-null  int64  
 16  Adventure          237

Word embedding and image embedding will be done in a separate notebook.

In [22]:
# Drop the link columns before exporting this step's output.
# errors='ignore' keeps the cell re-runnable once the columns are already gone.
anime_df = anime_df.drop(columns=['url', 'img'], errors='ignore')

## Export the DataFrame to continue in another notebook

In [25]:
# Write the step-1 result to disk for the follow-up notebook.
# NOTE: 'genre' holds Python lists at this point, so the CSV stores their
# string form (e.g. "['Action', 'Sci-Fi']"); the reader has to parse it back
# (for example with ast.literal_eval) if it needs real lists.
export_path = Path('dataset/processed_dataset/preprocessing/anime_preproc_s1.csv')
# Create the output folder on first run so to_csv does not fail on a missing directory.
export_path.parent.mkdir(parents=True, exist_ok=True)
anime_df.to_csv(export_path, index=False)