In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


df = pd.read_csv("anime_with_synopsis.csv")

In [2]:
df.head()

Unnamed: 0,MAL_ID,Name,Score,Genres,sypnopsis
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space","In the year 2071, humanity has colonized sever..."
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space","other day, another bounty—such is the life of ..."
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen","Vash the Stampede is the man with a $$60,000,0..."
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",ches are individuals with special powers like ...
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",It is the dark century and the people are suff...


In [3]:
#returns the number of missing values in the dataset
df.isnull().sum()

MAL_ID       0
Name         0
Score        0
Genres       0
sypnopsis    8
dtype: int64

In [4]:
#dropna will drop all missing values from your original dataset
# dropna()-->>tüm NaN değerleri siler
#Yapılan değişiklikleri kalıcı hale getirmek için inplace=True
#parametresini verdik.
df.dropna(inplace=True)

In [5]:
#method returns a Series with True and False values 
#that describe which rows in the DataFrame are "duplicated" and no
df.duplicated().sum()

0

In [6]:
df["Score"].map(lambda x:np.nan if x=="Unknown" else x)

0        8.78
1        8.39
2        8.24
3        7.27
4        6.98
         ... 
16209     NaN
16210     NaN
16211     NaN
16212     NaN
16213     NaN
Name: Score, Length: 16206, dtype: object

In [7]:
df["Score"] = df["Score"].map(lambda x:np.nan if x=="Unknown" else x)

In [8]:
df["Score"].fillna(df["Score"].median(),inplace = True)

In [9]:
df["Score"] = df["Score"].astype(float)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16206 entries, 0 to 16213
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   MAL_ID     16206 non-null  int64  
 1   Name       16206 non-null  object 
 2   Score      16206 non-null  float64
 3   Genres     16206 non-null  object 
 4   sypnopsis  16206 non-null  object 
dtypes: float64(1), int64(1), object(3)
memory usage: 759.7+ KB


In [11]:
# Top 10 Anime Based on Score
df.sort_values(by='Score', ascending=False).head(10)

Unnamed: 0,MAL_ID,Name,Score,Genres,sypnopsis
3446,5114,Fullmetal Alchemist: Brotherhood,9.19,"Action, Military, Adventure, Comedy, Drama, Ma...","""In order for something to be obtained, someth..."
14647,40028,Shingeki no Kyojin: The Final Season,9.17,"Action, Military, Mystery, Super Power, Drama,...",Gabi Braun and Falco Grice have been training ...
4953,9253,Steins;Gate,9.11,"Thriller, Sci-Fi",The self-proclaimed mad scientist Rintarou Oka...
5660,11061,Hunter x Hunter (2011),9.1,"Action, Adventure, Fantasy, Shounen, Super Power",Hunter x Hunter is set in a world where Hunter...
8879,28977,Gintama°,9.1,"Action, Comedy, Historical, Parody, Samurai, S...","Gintoki, Shinpachi, and Kagura return as the f..."
13720,38524,Shingeki no Kyojin Season 3 Part 2,9.1,"Action, Drama, Fantasy, Military, Mystery, Sho...",Seeking to restore humanity's diminishing hope...
5234,9969,Gintama',9.08,"Action, Sci-Fi, Comedy, Historical, Parody, Sa...","fter a one-year hiatus, Shinpachi Shimura retu..."
723,820,Ginga Eiyuu Densetsu,9.07,"Military, Sci-Fi, Space, Drama",The 150-year-long stalemate between the two in...
6377,15417,Gintama': Enchousen,9.04,"Action, Comedy, Historical, Parody, Samurai, S...","hile Gintoki Sakata was away, the Yorozuya fou..."
8854,28851,Koe no Katachi,9.0,"Drama, School, Shounen","s a wild youth, elementary school student Shou..."


In [12]:
#convert the Genres and sypnopsis which is a string to a list
df['Genres'] = df['Genres'].apply(lambda x:x.split())
df['sypnopsis'] = df['sypnopsis'].apply(lambda x:x.split())

In [13]:
df.head()

Unnamed: 0,MAL_ID,Name,Score,Genres,sypnopsis
0,1,Cowboy Bebop,8.78,"[Action,, Adventure,, Comedy,, Drama,, Sci-Fi,...","[In, the, year, 2071,, humanity, has, colonize..."
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"[Action,, Drama,, Mystery,, Sci-Fi,, Space]","[other, day,, another, bounty—such, is, the, l..."
2,6,Trigun,8.24,"[Action,, Sci-Fi,, Adventure,, Comedy,, Drama,...","[Vash, the, Stampede, is, the, man, with, a, $..."
3,7,Witch Hunter Robin,7.27,"[Action,, Mystery,, Police,, Supernatural,, Dr...","[ches, are, individuals, with, special, powers..."
4,8,Bouken Ou Beet,6.98,"[Adventure,, Fantasy,, Shounen,, Supernatural]","[It, is, the, dark, century, and, the, people,..."


In [14]:
# remove space between two words
df['Genres'] = df['Genres'].apply(lambda x:[i.replace(" ","") for i in x])
df['sypnopsis'] = df['sypnopsis'].apply(lambda x:[i.replace(" ","") for i in x])

In [15]:
df['features'] = df['Genres'] + df['sypnopsis'] 

In [16]:
new_df = df[['Name', 'features']]

In [17]:
# convert list to string
new_df['features'] = new_df['features'].apply(lambda x:" ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['features'] = new_df['features'].apply(lambda x:" ".join(x))


In [18]:
new_df

Unnamed: 0,Name,features
0,Cowboy Bebop,"Action, Adventure, Comedy, Drama, Sci-Fi, Spac..."
1,Cowboy Bebop: Tengoku no Tobira,"Action, Drama, Mystery, Sci-Fi, Space other da..."
2,Trigun,"Action, Sci-Fi, Adventure, Comedy, Drama, Shou..."
3,Witch Hunter Robin,"Action, Mystery, Police, Supernatural, Drama, ..."
4,Bouken Ou Beet,"Adventure, Fantasy, Shounen, Supernatural It i..."
...,...,...
16209,Daomu Biji Zhi Qinling Shen Shu,"Adventure, Mystery, Supernatural No synopsis i..."
16210,Mieruko-chan,"Comedy, Horror, Supernatural ko is a typical h..."
16211,Higurashi no Naku Koro ni Sotsu,"Mystery, Dementia, Horror, Psychological, Supe..."
16212,Yama no Susume: Next Summit,"Adventure, Slice of Life, Comedy New Yama no S..."


In [19]:
new_df['features'].str.lower().replace(r'[^\w\s]','').replace(r'\n',' ', regex=True)

0        action, adventure, comedy, drama, sci-fi, spac...
1        action, drama, mystery, sci-fi, space other da...
2        action, sci-fi, adventure, comedy, drama, shou...
3        action, mystery, police, supernatural, drama, ...
4        adventure, fantasy, shounen, supernatural it i...
                               ...                        
16209    adventure, mystery, supernatural no synopsis i...
16210    comedy, horror, supernatural ko is a typical h...
16211    mystery, dementia, horror, psychological, supe...
16212    adventure, slice of life, comedy new yama no s...
16213    action, fantasy solar calendar year 2020: grot...
Name: features, Length: 16206, dtype: object

In [20]:
new_df['features']=new_df['features'].str.lower().replace(r'[^\w\s]','').replace(r'\n',' ', regex=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['features']=new_df['features'].str.lower().replace(r'[^\w\s]','').replace(r'\n',' ', regex=True)


In [21]:
new_df

Unnamed: 0,Name,features
0,Cowboy Bebop,"action, adventure, comedy, drama, sci-fi, spac..."
1,Cowboy Bebop: Tengoku no Tobira,"action, drama, mystery, sci-fi, space other da..."
2,Trigun,"action, sci-fi, adventure, comedy, drama, shou..."
3,Witch Hunter Robin,"action, mystery, police, supernatural, drama, ..."
4,Bouken Ou Beet,"adventure, fantasy, shounen, supernatural it i..."
...,...,...
16209,Daomu Biji Zhi Qinling Shen Shu,"adventure, mystery, supernatural no synopsis i..."
16210,Mieruko-chan,"comedy, horror, supernatural ko is a typical h..."
16211,Higurashi no Naku Koro ni Sotsu,"mystery, dementia, horror, psychological, supe..."
16212,Yama no Susume: Next Summit,"adventure, slice of life, comedy new yama no s..."


In [22]:
import nltk
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

def tokenization(txt):
    tokens = nltk.word_tokenize(txt)
    stemming = [stemmer.stem(w) for w in tokens]
    return " ".join(stemming)

In [23]:
new_df['features'].apply(lambda x: tokenization(x))

0        action , adventur , comedi , drama , sci-fi , ...
1        action , drama , mysteri , sci-fi , space othe...
2        action , sci-fi , adventur , comedi , drama , ...
3        action , mysteri , polic , supernatur , drama ...
4        adventur , fantasi , shounen , supernatur it i...
                               ...                        
16209    adventur , mysteri , supernatur no synopsi inf...
16210    comedi , horror , supernatur ko is a typic hig...
16211    mysteri , dementia , horror , psycholog , supe...
16212    adventur , slice of life , comedi new yama no ...
16213    action , fantasi solar calendar year 2020 : gr...
Name: features, Length: 16206, dtype: object

In [24]:
new_df['features']=new_df['features'].apply(lambda x: tokenization(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['features']=new_df['features'].apply(lambda x: tokenization(x))


In [25]:
new_df['features']

0        action , adventur , comedi , drama , sci-fi , ...
1        action , drama , mysteri , sci-fi , space othe...
2        action , sci-fi , adventur , comedi , drama , ...
3        action , mysteri , polic , supernatur , drama ...
4        adventur , fantasi , shounen , supernatur it i...
                               ...                        
16209    adventur , mysteri , supernatur no synopsi inf...
16210    comedi , horror , supernatur ko is a typic hig...
16211    mysteri , dementia , horror , psycholog , supe...
16212    adventur , slice of life , comedi new yama no ...
16213    action , fantasi solar calendar year 2020 : gr...
Name: features, Length: 16206, dtype: object

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [28]:
tfidvector = TfidfVectorizer()
matrix = tfidvector.fit_transform(new_df['features'])
similarity = cosine_similarity(matrix)

In [29]:
similarity[0]

array([1.        , 0.30046725, 0.13747636, ..., 0.01000254, 0.03431889,
       0.08755463])

In [30]:
sorted(list(enumerate(similarity[0])),reverse=False,key=lambda x:x[1])

[(2441, 0.0),
 (3003, 0.0),
 (4750, 0.0),
 (5233, 0.0),
 (5622, 0.0),
 (5904, 0.0),
 (6080, 0.0),
 (6120, 0.0),
 (6836, 0.0),
 (7181, 0.0),
 (8853, 0.0),
 (9104, 0.0),
 (9259, 0.0),
 (9403, 0.0),
 (9413, 0.0),
 (9414, 0.0),
 (9415, 0.0),
 (9416, 0.0),
 (9417, 0.0),
 (9418, 0.0),
 (9419, 0.0),
 (9420, 0.0),
 (9421, 0.0),
 (9423, 0.0),
 (9424, 0.0),
 (9425, 0.0),
 (9426, 0.0),
 (9466, 0.0),
 (9507, 0.0),
 (10772, 0.0),
 (11432, 0.0),
 (11653, 0.0),
 (11839, 0.0),
 (11876, 0.0),
 (11913, 0.0),
 (12334, 0.0),
 (12514, 0.0),
 (13203, 0.0),
 (13206, 0.0),
 (13613, 0.0),
 (13857, 0.0),
 (14812, 0.0),
 (15267, 0.0),
 (15401, 0.0),
 (15585, 0.0),
 (7322, 0.00253993032277466),
 (16188, 0.0025472270817845383),
 (12176, 0.002954414679384201),
 (4873, 0.003101069197192312),
 (12814, 0.003202426228349774),
 (15132, 0.0032408739961790737),
 (2876, 0.0033146807259944178),
 (8836, 0.0034806419387510075),
 (9069, 0.003923770780434414),
 (9486, 0.003990441615222566),
 (7492, 0.004019015582351975),
 (9086

In [31]:
distances=sorted(list(enumerate(similarity[0])),reverse=False,key=lambda x:x[1])

In [38]:
def recommendation(anime_df):
    idx = new_df[new_df['Name'] == anime_df].index[0]
    distances = sorted(list(enumerate(similarity[idx])),reverse=True,key=lambda x:x[1])
    
    anime = []
    for m_id in distances[1:21]:
        anime.append(new_df.iloc[m_id[0]].Name)
        
    return anime

In [39]:
recommendation('Cowboy Bebop')

['Cowboy Bebop: Tengoku no Tobira',
 'Ginga Senpuu Braiger',
 'Cowboy Bebop: Yose Atsume Blues',
 'Terra e... (TV)',
 'Odin: Koushi Hansen Starlight',
 'Rokushin Gattai GodMars (1982)',
 'Seihou Bukyou Outlaw Star',
 'Sol Bianca: Taiyou no Fune',
 'Mujin Wakusei Survive',
 'Terra Formars',
 'Sei Juushi Bismarck',
 'Aoki Ryuusei SPT Layzner',
 'Planetes',
 'Uchuu Senkan Yamato (Movie)',
 'Kanata no Astra',
 'YAT Anshin! Uchuu Ryokou',
 'Uchuu Kuubo Blue Noah',
 'Shironeko Project: Zero Chronicle',
 'Yamato Takeru',
 'Phantasy Star Online 2: Episode Oracle']

In [40]:
import pickle
pickle.dump(similarity,open('similarity.pkl','wb'))
pickle.dump(df,open('df.pkl','wb'))