In [1]:
import pandas as pd
import numpy as np
import warnings
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer, MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

warnings.filterwarnings('ignore')

#### Using TF-IDF - Content based recommendation


Data Preprocessing

In [2]:
# contains user_id, MAL_ID, rating
top_animes_count = 1000
rating_top_anime = pd.read_csv('dataset/top_anime_unsupervised_use.csv')

usecols = ["MAL_ID", "Name", "Score", "Genres", "Type", "Episodes", "Premiered",
           "Studios", "Source", "Rating", "Members"]

# contains all anime info
anime_data = pd.read_csv('dataset/anime.csv', usecols=usecols)
anime = pd.read_csv('dataset/anime.csv',low_memory=True)

In [3]:
anime_data.reset_index(drop=True, inplace=True)

In [4]:
def process_multilabel(series):
    series = series.split(",")
    if "Unknown" in series:
        series.remove("Unknown")
    return series

In [5]:
anime_data["Genres"] = anime_data["Genres"].map(process_multilabel)
anime_data["Studios"] = anime_data["Studios"].map(process_multilabel)
anime_data["Score"] = anime_data["Score"].replace("Unknown", 0).astype(float)
anime_data["Episodes"] = anime_data["Episodes"].replace("Unknown", 0).astype(int)
anime_data.head()

Unnamed: 0,MAL_ID,Name,Score,Genres,Type,Episodes,Premiered,Studios,Source,Rating,Members
0,1,Cowboy Bebop,8.78,"[Action, Adventure, Comedy, Drama, Sci-Fi,...",TV,26,Spring 1998,[Sunrise],Original,R - 17+ (violence & profanity),1251960
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"[Action, Drama, Mystery, Sci-Fi, Space]",Movie,1,Unknown,[Bones],Original,R - 17+ (violence & profanity),273145
2,6,Trigun,8.24,"[Action, Sci-Fi, Adventure, Comedy, Drama,...",TV,26,Spring 1998,[Madhouse],Manga,PG-13 - Teens 13 or older,558913
3,7,Witch Hunter Robin,7.27,"[Action, Mystery, Police, Supernatural, Dr...",TV,26,Summer 2002,[Sunrise],Original,PG-13 - Teens 13 or older,94683
4,8,Bouken Ou Beet,6.98,"[Adventure, Fantasy, Shounen, Supernatural]",TV,52,Fall 2004,[Toei Animation],Manga,PG - Children,13224


In [6]:
anime_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17562 entries, 0 to 17561
Data columns (total 11 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   MAL_ID     17562 non-null  int64  
 1   Name       17562 non-null  object 
 2   Score      17562 non-null  float64
 3   Genres     17562 non-null  object 
 4   Type       17562 non-null  object 
 5   Episodes   17562 non-null  int64  
 6   Premiered  17562 non-null  object 
 7   Studios    17562 non-null  object 
 8   Source     17562 non-null  object 
 9   Rating     17562 non-null  object 
 10  Members    17562 non-null  int64  
dtypes: float64(1), int64(3), object(7)
memory usage: 1.5+ MB


In [7]:
def to_category(df, column, is_multilabel=False):
    lb = LabelBinarizer()
    if is_multilabel:
        lb = MultiLabelBinarizer()
        
    expandedLabelData = lb.fit_transform(df[column])
    labelClasses = lb.classes_

    category_df = pd.DataFrame(expandedLabelData, columns=labelClasses)
    del df[column]
    return pd.concat([df, category_df], axis=1)

In [8]:
# [['MAL_ID', 'Name', 'Genres', 'Type', 'Producers', 'Source', 'Completed', 'Episodes', 'Source', 'Score']]
anime_metadata = anime_data.copy()

anime_metadata = to_category(anime_metadata, 'Source')
anime_metadata = to_category(anime_metadata, 'Premiered')
anime_metadata = to_category(anime_metadata, 'Studios', is_multilabel=True)
anime_metadata = to_category(anime_metadata, 'Type')
anime_metadata = to_category(anime_metadata, "Rating")
# anime_metadata = to_category(anime_metadata, 'Genres', is_multilabel=True)

Genres = anime_metadata["Genres"]
anime_id = anime_metadata[['MAL_ID', 'Name']]

del anime_metadata["Genres"]
del anime_metadata['MAL_ID']
del anime_metadata['Name']
del anime_metadata['Unknown']


In [9]:
anime_metadata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17562 entries, 0 to 17561
Columns: 1208 entries, Score to Rx - Hentai
dtypes: float64(1), int64(1207)
memory usage: 161.9 MB


In [10]:
anime_metadata[["Score", "Episodes", "Members"]] = MinMaxScaler().fit_transform(anime_metadata[["Score", "Episodes", "Members"]])
anime_metadata = anime_metadata.values

In [11]:
tfv = TfidfVectorizer(min_df=3, max_features=None,
                      strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
                      ngram_range=(1, 3),
                      stop_words = 'english')

# Filling NaNs with empty string
genres_original = anime_data['Genres'].fillna('').astype(str)
genres_vector_tf_idf = tfv.fit_transform(genres_original)

genres_vector_one_hot = to_category(pd.DataFrame(Genres), "Genres", True).values

In [12]:
def get_recommended(vector, query_index, n_neighbors=10):
    model_knn = NearestNeighbors(metric='cosine', n_neighbors=n_neighbors)
    model_knn.fit(csr_matrix(vector))

    distances, indices = model_knn.kneighbors(vector[query_index,:].reshape(1, -1), n_neighbors = n_neighbors)
    result = []
    for i in range(0, len(distances.flatten())):
        index = indices.flatten()[i]
        if index == query_index:
            continue
        result.append(anime_data.iloc[index])
        
    return pd.DataFrame(result)

In [13]:
query_index = anime_id[anime_id.MAL_ID == 5231].index[0]
anime_data.iloc[[query_index]]

Unnamed: 0,MAL_ID,Name,Score,Genres,Type,Episodes,Premiered,Studios,Source,Rating,Members
4034,5231,Inazuma Eleven,7.59,"[Sports, Super Power, Shounen]",TV,127,Fall 2008,[OLM],Game,G - All Ages,138185


#### Content based -> studio

In [14]:
get_recommended(anime_metadata, query_index, 10)

Unnamed: 0,MAL_ID,Name,Score,Genres,Type,Episodes,Premiered,Studios,Source,Rating,Members
6939,13261,Inazuma Eleven Go: Chrono Stone,7.17,"[Sports, Super Power, Shounen]",TV,51,Spring 2012,[OLM],Game,G - All Ages,38045
7775,17917,Danball Senki Wars,7.08,"[Action, Kids, Mecha]",TV,37,Spring 2013,[OLM],Game,G - All Ages,3680
6795,12651,Danball Senki W,7.05,"[Action, Kids, Mecha]",TV,58,Winter 2012,[OLM],Game,G - All Ages,4559
4893,7081,Danball Senki,7.02,"[Action, Kids, Mecha]",TV,44,Spring 2011,[OLM],Game,G - All Ages,7206
6243,10507,Inazuma Eleven Go,6.98,"[Shounen, Sports, Super Power]",TV,47,Spring 2011,[OLM],Game,G - All Ages,54307
14796,38235,Inazuma Eleven: Orion no Kokuin,6.79,[Sports],TV,49,Fall 2018,[OLM],Game,G - All Ages,15200
12038,33733,Inazuma Eleven: Ares no Tenbin,6.68,[Sports],TV,26,Spring 2018,[OLM],Game,G - All Ages,24740
7818,18097,Inazuma Eleven Go: Galaxy,6.65,"[Shounen, Sports, Super Power]",TV,43,Spring 2013,[OLM],Game,G - All Ages,31602
14184,37324,Youkai Watch: Shadow Side,6.58,"[Comedy, Demons, Kids, Supernatural]",TV,49,Spring 2018,[OLM],Game,G - All Ages,1447


#### Content based - Keywords using TD IDF

In [15]:
get_recommended(genres_vector_tf_idf, query_index, 10)

Unnamed: 0,MAL_ID,Name,Score,Genres,Type,Episodes,Premiered,Studios,Source,Rating,Members
9124,24347,Inazuma Eleven: Chou Jigen Dream Match,6.94,"[Sports, Super Power, Shounen]",Movie,1,Unknown,[OLM],Game,G - All Ages,7384
6785,12585,Inazuma Eleven Go Specials,6.69,"[Sports, Super Power, Shounen]",Special,2,Unknown,[OLM],Unknown,G - All Ages,4743
6939,13261,Inazuma Eleven Go: Chrono Stone,7.17,"[Sports, Super Power, Shounen]",TV,51,Spring 2012,[OLM],Game,G - All Ages,38045
6461,10999,Inazuma Eleven Go: Kyuukyoku no Kizuna Gryphon,7.41,"[Sci-Fi, Sports, Super Power, Shounen]",Movie,1,Unknown,[OLM],Game,G - All Ages,11451
12043,33740,Katekyo Hitman Reborn! x ēlDLIVE Special,6.76,"[Super Power, Shounen]",Special,1,Unknown,[Artland],Original,PG-13 - Teens 13 or older,8366
7818,18097,Inazuma Eleven Go: Galaxy,6.65,"[Shounen, Sports, Super Power]",TV,43,Spring 2013,[OLM],Game,G - All Ages,31602
6243,10507,Inazuma Eleven Go,6.98,"[Shounen, Sports, Super Power]",TV,47,Spring 2011,[OLM],Game,G - All Ages,54307
5617,9032,Inazuma Eleven: Saikyou Gundan Ogre Shuurai,7.32,"[Shounen, Sports, Super Power]",Movie,1,Unknown,[OLM],Unknown,G - All Ages,19711
12921,35230,Zannen Onna Kanbu Black General-san,5.83,"[Comedy, Super Power, Shounen]",ONA,10,Unknown,[Oddjob],Manga,PG-13 - Teens 13 or older,1733


#### Content based - Genres

In [16]:
get_recommended(genres_vector_one_hot, query_index, 10)

Unnamed: 0,MAL_ID,Name,Score,Genres,Type,Episodes,Premiered,Studios,Source,Rating,Members
6939,13261,Inazuma Eleven Go: Chrono Stone,7.17,"[Sports, Super Power, Shounen]",TV,51,Spring 2012,[OLM],Game,G - All Ages,38045
9124,24347,Inazuma Eleven: Chou Jigen Dream Match,6.94,"[Sports, Super Power, Shounen]",Movie,1,Unknown,[OLM],Game,G - All Ages,7384
6785,12585,Inazuma Eleven Go Specials,6.69,"[Sports, Super Power, Shounen]",Special,2,Unknown,[OLM],Unknown,G - All Ages,4743
6644,11857,Judo Sanka,0.0,"[Sports, Shounen]",TV,27,Spring 1974,[],Unknown,G - All Ages,368
7810,18061,Tiger Mask (Movie),6.13,"[Sports, Shounen]",Movie,1,Unknown,[],Manga,PG-13 - Teens 13 or older,703
7811,18063,Tiger Mask Fuku Men League Sen,6.12,"[Sports, Shounen]",Movie,1,Unknown,[Toei Animation],Manga,PG-13 - Teens 13 or older,705
7541,16824,Hwang-geum-ui Pal,0.0,"[Sports, Shounen]",Movie,1,Unknown,[],Unknown,Unknown,161
8626,22205,Be Blues! Ao ni Nare,5.46,"[Sports, Shounen]",ONA,1,Unknown,[],Manga,G - All Ages,457
6276,10573,Tennis no Ouji-sama: Another Story II - Ano To...,7.54,"[Sports, Shounen]",OVA,4,Unknown,[Production I.G],Manga,PG-13 - Teens 13 or older,9028


#### Content based - All Aspects

In [17]:
all_data = np.concatenate((anime_metadata, genres_vector_tf_idf.todense(), genres_vector_one_hot), axis=1)
get_recommended(all_data, query_index, 10)

Unnamed: 0,MAL_ID,Name,Score,Genres,Type,Episodes,Premiered,Studios,Source,Rating,Members
6939,13261,Inazuma Eleven Go: Chrono Stone,7.17,"[Sports, Super Power, Shounen]",TV,51,Spring 2012,[OLM],Game,G - All Ages,38045
9124,24347,Inazuma Eleven: Chou Jigen Dream Match,6.94,"[Sports, Super Power, Shounen]",Movie,1,Unknown,[OLM],Game,G - All Ages,7384
6785,12585,Inazuma Eleven Go Specials,6.69,"[Sports, Super Power, Shounen]",Special,2,Unknown,[OLM],Unknown,G - All Ages,4743
14796,38235,Inazuma Eleven: Orion no Kokuin,6.79,[Sports],TV,49,Fall 2018,[OLM],Game,G - All Ages,15200
12038,33733,Inazuma Eleven: Ares no Tenbin,6.68,[Sports],TV,26,Spring 2018,[OLM],Game,G - All Ages,24740
6461,10999,Inazuma Eleven Go: Kyuukyoku no Kizuna Gryphon,7.41,"[Sci-Fi, Sports, Super Power, Shounen]",Movie,1,Unknown,[OLM],Game,G - All Ages,11451
6243,10507,Inazuma Eleven Go,6.98,"[Shounen, Sports, Super Power]",TV,47,Spring 2011,[OLM],Game,G - All Ages,54307
7818,18097,Inazuma Eleven Go: Galaxy,6.65,"[Shounen, Sports, Super Power]",TV,43,Spring 2013,[OLM],Game,G - All Ages,31602
14501,37814,Inazuma Eleven: Reloaded - Soccer no Henkaku,6.99,[Sports],Special,1,Unknown,[OLM],Game,G - All Ages,4106


#### Content based - Top Features

In [18]:
reduced_all_data = PCA(n_components=250).fit_transform(all_data)
get_recommended(reduced_all_data, query_index, 10)

Unnamed: 0,MAL_ID,Name,Score,Genres,Type,Episodes,Premiered,Studios,Source,Rating,Members
6939,13261,Inazuma Eleven Go: Chrono Stone,7.17,"[Sports, Super Power, Shounen]",TV,51,Spring 2012,[OLM],Game,G - All Ages,38045
9124,24347,Inazuma Eleven: Chou Jigen Dream Match,6.94,"[Sports, Super Power, Shounen]",Movie,1,Unknown,[OLM],Game,G - All Ages,7384
6785,12585,Inazuma Eleven Go Specials,6.69,"[Sports, Super Power, Shounen]",Special,2,Unknown,[OLM],Unknown,G - All Ages,4743
14796,38235,Inazuma Eleven: Orion no Kokuin,6.79,[Sports],TV,49,Fall 2018,[OLM],Game,G - All Ages,15200
12038,33733,Inazuma Eleven: Ares no Tenbin,6.68,[Sports],TV,26,Spring 2018,[OLM],Game,G - All Ages,24740
6243,10507,Inazuma Eleven Go,6.98,"[Shounen, Sports, Super Power]",TV,47,Spring 2011,[OLM],Game,G - All Ages,54307
7818,18097,Inazuma Eleven Go: Galaxy,6.65,"[Shounen, Sports, Super Power]",TV,43,Spring 2013,[OLM],Game,G - All Ages,31602
6461,10999,Inazuma Eleven Go: Kyuukyoku no Kizuna Gryphon,7.41,"[Sci-Fi, Sports, Super Power, Shounen]",Movie,1,Unknown,[OLM],Game,G - All Ages,11451
14501,37814,Inazuma Eleven: Reloaded - Soccer no Henkaku,6.99,[Sports],Special,1,Unknown,[OLM],Game,G - All Ages,4106
