# Importing Modules

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import silhouette_score
from sklearn.cluster import DBSCAN
from sklearn.cluster import AgglomerativeClustering
import scipy.cluster.hierarchy as shc

%matplotlib inline

# Data Pre-Processing

In [2]:
anime = pd.read_csv("Data/anime.csv")
anime

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


In [3]:
for i in anime:
    print("{} : {} Null Values".format(i, anime[i].isna().sum().sum()))

anime_id : 0 Null Values
name : 0 Null Values
genre : 62 Null Values
type : 25 Null Values
episodes : 0 Null Values
rating : 230 Null Values
members : 0 Null Values


In [4]:
anime.dtypes

anime_id      int64
name         object
genre        object
type         object
episodes     object
rating      float64
members       int64
dtype: object

In [5]:
# fill null values in 'genre' column with 'Unknown'
anime['genre'].fillna('Unknown', inplace=True)

# fill null values in 'type' column with 'Unknown'
anime['type'].fillna('Unknown', inplace=True)

# replace unknown values in episode column with NaN
anime["episodes"].replace("Unknown", np.nan, inplace=True)

In [6]:
for i in anime:
    print("{} : {} Null Values".format(i, anime[i].isna().sum().sum()))

anime_id : 0 Null Values
name : 0 Null Values
genre : 0 Null Values
type : 0 Null Values
episodes : 340 Null Values
rating : 230 Null Values
members : 0 Null Values


In [7]:
def fill_null_with_genre_mode(df, column_name, group_by_column_name):
    groupby = df.groupby(group_by_column_name)[column_name]
    df[column_name] = groupby.apply(lambda x: x.fillna(method='ffill'))
    return df

In [8]:
# fill null values in 'rating' column with the mode rating value for each genre
anime = fill_null_with_genre_mode(anime, 'rating', 'genre')

In [9]:
for i in anime:
    print("{} : {} Null Values".format(i, anime[i].isna().sum().sum()))

anime_id : 0 Null Values
name : 0 Null Values
genre : 0 Null Values
type : 0 Null Values
episodes : 340 Null Values
rating : 35 Null Values
members : 0 Null Values


In [10]:
anime["rating"].fillna(anime["rating"].median(),inplace = True)

In [11]:
for i in anime:
    print("{} : {} Null Values".format(i, anime[i].isna().sum().sum()))

anime_id : 0 Null Values
name : 0 Null Values
genre : 0 Null Values
type : 0 Null Values
episodes : 340 Null Values
rating : 0 Null Values
members : 0 Null Values


In [12]:
anime["episodes"].fillna(anime["episodes"].median(),inplace = True)

In [13]:
anime

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


In [14]:
pd.get_dummies(anime[["type"]]).head()

Unnamed: 0,type_Movie,type_Music,type_ONA,type_OVA,type_Special,type_TV,type_Unknown
0,1,0,0,0,0,0,0
1,0,0,0,0,0,1,0
2,0,0,0,0,0,1,0
3,0,0,0,0,0,1,0
4,0,0,0,0,0,1,0


In [15]:
# create dummy variables for genre and type columns
genre_dummies = pd.concat([anime["genre"].str.get_dummies(sep=",")])

type_dummies = pd.get_dummies(anime["type"])

# select columns to keep
columns_to_keep = ["rating", "members", "episodes"]

# create anime_features dataframe by concatenating the dummy variables and selected columns
anime_features = pd.concat([genre_dummies, type_dummies, anime[columns_to_keep]], axis=1)

In [16]:
anime["name"] = anime["name"].map(lambda name: re.sub('[^A-Za-z0-9]+', ' ', name))

In [17]:
anime_features

Unnamed: 0,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,Game,Harem,...,Movie,Music,ONA,OVA,Special,TV,Unknown,rating,members,episodes
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,9.37,200630,1
1,1,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,1,0,9.26,793665,64
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,9.25,114262,51
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,9.17,673572,24
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,9.16,151266,51
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12289,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,4.15,211,1
12290,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,4.28,183,1
12291,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,4.88,219,4
12292,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,4.98,175,1


In [18]:
anime_features.dtypes

 Adventure      int64
 Cars           int64
 Comedy         int64
 Dementia       int64
 Demons         int64
               ...   
TV              uint8
Unknown         uint8
rating        float64
members         int64
episodes       object
Length: 93, dtype: object

# KNN   

In [19]:
scaledData = StandardScaler()
anime_features = scaledData.fit_transform(anime_features)

In [20]:
nbrs = NearestNeighbors(n_neighbors=6, algorithm='ball_tree').fit(anime_features)

In [21]:
distances, indices = nbrs.kneighbors(anime_features)

In [22]:
# Calculate silhouette score
silhouette_avg = silhouette_score(anime_features, labels=indices[:, 1])
print("The average silhouette_score is :", silhouette_avg)

The average silhouette_score is : -0.16138299279544413


In [23]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors

def anime_recommendation(anime_name, anime_data, anime_features):
    # Scale the features
    scaled_data = StandardScaler()
    anime_features_scaled = scaled_data.fit_transform(anime_features)

    # Fit the NearestNeighbors model
    nbrs = NearestNeighbors(n_neighbors=6, algorithm='ball_tree').fit(anime_features_scaled)
    
    # Find the distances and indices of the nearest neighbors
    distances, indices = nbrs.kneighbors(anime_features_scaled)

    # Find the index of the input anime in the anime_data DataFrame
    anime_index = anime_data[anime_data['name'] == anime_name].index[0]

    # Get the indices of the 5 closest anime (excluding the input anime)
    top_5_indices = indices[anime_index][1:]

    # Return the top 5 recommended anime names
    return anime_data.iloc[top_5_indices]['name'].values


# Results

In [24]:
anime_name = "Fairy Tail"
recommended_anime = anime_recommendation(anime_name, anime, anime_features)
print(recommended_anime)

['Soul Eater' 'Fairy Tail 2014 ' 'Magi The Labyrinth of Magic'
 'D Gray man' 'Akame ga Kill ']


In [25]:
anime_name = "Bleach"
recommended_anime = anime_recommendation(anime_name, anime, anime_features)
print(recommended_anime)

['Katekyo Hitman Reborn ' 'Naruto' 'Hunter x Hunter 2011 ' 'Kill la Kill'
 'Soul Eater']


In [26]:
anime_name = "Noragami"
recommended_anime = anime_recommendation(anime_name, anime, anime_features)
print(recommended_anime)

['Noragami Aragoto' 'Soul Eater' 'Akame ga Kill ' 'Fate Zero' 'D Gray man']


# End