In [1]:
import numpy as np
import pandas as pd
import re

In [2]:
animes = pd.read_csv('anime-dataset-2023.csv')

In [3]:
animes.shape

(24905, 24)

In [4]:
animes.head(1)

Unnamed: 0,anime_id,Name,English name,Other name,Score,Genres,Synopsis,Type,Episodes,Aired,...,Studios,Source,Duration,Rating,Rank,Popularity,Favorites,Scored By,Members,Image URL
0,1,Cowboy Bebop,Cowboy Bebop,カウボーイビバップ,8.75,"Action, Award Winning, Sci-Fi","Crime is timeless. By the year 2071, humanity ...",TV,26.0,"Apr 3, 1998 to Apr 24, 1999",...,Sunrise,Original,24 min per ep,R - 17+ (violence & profanity),41.0,43,78525,914193.0,1771505,https://cdn.myanimelist.net/images/anime/4/196...


In [5]:
animes.isnull().sum()

anime_id        0
Name            0
English name    0
Other name      0
Score           0
Genres          0
Synopsis        0
Type            0
Episodes        0
Aired           0
Premiered       0
Status          0
Producers       0
Licensors       0
Studios         0
Source          0
Duration        0
Rating          0
Rank            0
Popularity      0
Favorites       0
Scored By       0
Members         0
Image URL       0
dtype: int64

In [6]:
animes.duplicated().sum()

0

In [7]:
animes.iloc[0].Genres

'Action, Award Winning, Sci-Fi'

In [8]:
animes['Genres'].value_counts()

Genres
UNKNOWN                                          4929
Comedy                                           2279
Fantasy                                          1341
Hentai                                           1181
Drama                                             624
                                                 ... 
Avant Garde, Fantasy, Romance                       1
Action, Comedy, Romance, Supernatural, Hentai       1
Drama, Horror, Supernatural, Ecchi                  1
Adventure, Comedy, Romance, Sports                  1
Action, Adventure, Fantasy, Horror                  1
Name: count, Length: 1006, dtype: int64

In [9]:
# Drop hentai genres

animes = animes[~animes['Genres'].apply(lambda x: 'Hentai' in x)]
animes = animes[~animes['Genres'].apply(lambda x: 'Boys Love' in x)]
animes = animes[~animes['Genres'].apply(lambda x: 'Girls Love' in x)]

In [10]:
animes['Genres'].value_counts()

Genres
UNKNOWN                                            4929
Comedy                                             2279
Fantasy                                            1341
Drama                                               624
Slice of Life                                       623
                                                   ... 
Action, Adventure, Comedy, Fantasy, Sports            1
Action, Award Winning, Supernatural                   1
Adventure, Ecchi                                      1
Action, Horror, Mystery, Supernatural, Suspense       1
Action, Adventure, Fantasy, Horror                    1
Name: count, Length: 843, dtype: int64

In [11]:
animes.columns

Index(['anime_id', 'Name', 'English name', 'Other name', 'Score', 'Genres',
       'Synopsis', 'Type', 'Episodes', 'Aired', 'Premiered', 'Status',
       'Producers', 'Licensors', 'Studios', 'Source', 'Duration', 'Rating',
       'Rank', 'Popularity', 'Favorites', 'Scored By', 'Members', 'Image URL'],
      dtype='object')

In [12]:
animes = animes[['anime_id', 'Name','English name', 'Genres', 'Synopsis', 'Studios', 'Source', 'Image URL']]

In [13]:
animes.head(2)

Unnamed: 0,anime_id,Name,English name,Genres,Synopsis,Studios,Source,Image URL
0,1,Cowboy Bebop,Cowboy Bebop,"Action, Award Winning, Sci-Fi","Crime is timeless. By the year 2071, humanity ...",Sunrise,Original,https://cdn.myanimelist.net/images/anime/4/196...
1,5,Cowboy Bebop: Tengoku no Tobira,Cowboy Bebop: The Movie,"Action, Sci-Fi","Another day, another bounty—such is the life o...",Bones,Original,https://cdn.myanimelist.net/images/anime/1439/...


In [14]:
animes.iloc[0]['Genres']

'Action, Award Winning, Sci-Fi'

In [15]:
animes.iloc[0]['Synopsis']

"Crime is timeless. By the year 2071, humanity has expanded across the galaxy, filling the surface of other planets with settlements like those on Earth. These new societies are plagued by murder, drug use, and theft, and intergalactic outlaws are hunted by a growing number of tough bounty hunters.\n\nSpike Spiegel and Jet Black pursue criminals throughout space to make a humble living. Beneath his goofy and aloof demeanor, Spike is haunted by the weight of his violent past. Meanwhile, Jet manages his own troubled memories while taking care of Spike and the Bebop, their ship. The duo is joined by the beautiful con artist Faye Valentine, odd child Edward Wong Hau Pepelu Tivrusky IV, and Ein, a bioengineered Welsh Corgi.\n\nWhile developing bonds and working to catch a colorful cast of criminals, the Bebop crew's lives are disrupted by a menace from Spike's past. As a rival's maniacal plot continues to unravel, Spike must choose between life with his newfound family or revenge for his ol

In [16]:
# removing unwanted characters
def format_synopsis(text):
    # Split the text on periods followed by spaces
    sentences = text.split('. ')
    # Add back the period for each sentence
    sentences = [sentence + '.' if not sentence.endswith('.') else sentence for sentence in sentences]
    # Remove double newlines from each sentence
    sentences = [re.sub("\n\n", "", sentence) for sentence in sentences]
    return sentences

def format_column(text_string):
    # Convert to lowercase
    text_string = text_string.lower()
    # Remove any newline characters and excessive whitespace
    text_string = re.sub(r"\s+", " ", text_string).strip()
    # Remove spaces around any commas
    text_string = re.sub(r"\s*,\s*", ",", text_string)
    # Remove some unwanted characters
    text_string = text_string.replace("—", "")
    # Split the text_string into a list by commas
    text_list = text_string.split(',')
    # Remove any remaining spaces from each word
    text_list = [text.replace(" ", "") for text in text_list]

    return text_list

# format to lowercase
def lower_case(text):
    return text.lower()


In [17]:
animes.head(1
           )

Unnamed: 0,anime_id,Name,English name,Genres,Synopsis,Studios,Source,Image URL
0,1,Cowboy Bebop,Cowboy Bebop,"Action, Award Winning, Sci-Fi","Crime is timeless. By the year 2071, humanity ...",Sunrise,Original,https://cdn.myanimelist.net/images/anime/4/196...


In [18]:
# Applying the preprocessing function

# Genres Column
animes['Genres'] = animes['Genres'].apply(format_column)

# Synopsis Column
animes['Synopsis'] = animes['Synopsis'].apply(format_synopsis)


# Studios Column
animes['Studios'] = animes['Studios'].apply(format_column)

# Source Column
animes['Source'] = animes['Source'].apply(format_column)



In [19]:
animes.head(1)

Unnamed: 0,anime_id,Name,English name,Genres,Synopsis,Studios,Source,Image URL
0,1,Cowboy Bebop,Cowboy Bebop,"[action, awardwinning, sci-fi]","[Crime is timeless., By the year 2071, humanit...",[sunrise],[original],https://cdn.myanimelist.net/images/anime/4/196...


In [20]:
animes['tags'] = animes['Synopsis'] + animes['Genres'] + animes['Studios'] + animes['Source']

In [21]:
animes.head(1)

Unnamed: 0,anime_id,Name,English name,Genres,Synopsis,Studios,Source,Image URL,tags
0,1,Cowboy Bebop,Cowboy Bebop,"[action, awardwinning, sci-fi]","[Crime is timeless., By the year 2071, humanit...",[sunrise],[original],https://cdn.myanimelist.net/images/anime/4/196...,"[Crime is timeless., By the year 2071, humanit..."


In [22]:
animes.iloc[0]['tags']

['Crime is timeless.',
 'By the year 2071, humanity has expanded across the galaxy, filling the surface of other planets with settlements like those on Earth.',
 'These new societies are plagued by murder, drug use, and theft, and intergalactic outlaws are hunted by a growing number of tough bounty hunters.Spike Spiegel and Jet Black pursue criminals throughout space to make a humble living.',
 'Beneath his goofy and aloof demeanor, Spike is haunted by the weight of his violent past.',
 'Meanwhile, Jet manages his own troubled memories while taking care of Spike and the Bebop, their ship.',
 "The duo is joined by the beautiful con artist Faye Valentine, odd child Edward Wong Hau Pepelu Tivrusky IV, and Ein, a bioengineered Welsh Corgi.While developing bonds and working to catch a colorful cast of criminals, the Bebop crew's lives are disrupted by a menace from Spike's past.",
 "As a rival's maniacal plot continues to unravel, Spike must choose between life with his newfound family or r

In [23]:

animes_new = animes[['anime_id', 'Name', 'tags', 'Image URL']]


In [24]:
animes_new.head()

Unnamed: 0,anime_id,Name,tags,Image URL
0,1,Cowboy Bebop,"[Crime is timeless., By the year 2071, humanit...",https://cdn.myanimelist.net/images/anime/4/196...
1,5,Cowboy Bebop: Tengoku no Tobira,"[Another day, another bounty—such is the life ...",https://cdn.myanimelist.net/images/anime/1439/...
2,6,Trigun,"[Vash the Stampede is the man with a $$60,000,...",https://cdn.myanimelist.net/images/anime/7/203...
3,7,Witch Hunter Robin,[Robin Sena is a powerful craft user drafted i...,https://cdn.myanimelist.net/images/anime/10/19...
4,8,Bouken Ou Beet,[It is the dark century and the people are suf...,https://cdn.myanimelist.net/images/anime/7/215...


In [25]:
animes_new['tags'] = animes_new['tags'].apply(lambda x: " ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  animes_new['tags'] = animes_new['tags'].apply(lambda x: " ".join(x))


In [26]:
animes_new.head()

Unnamed: 0,anime_id,Name,tags,Image URL
0,1,Cowboy Bebop,"Crime is timeless. By the year 2071, humanity ...",https://cdn.myanimelist.net/images/anime/4/196...
1,5,Cowboy Bebop: Tengoku no Tobira,"Another day, another bounty—such is the life o...",https://cdn.myanimelist.net/images/anime/1439/...
2,6,Trigun,"Vash the Stampede is the man with a $$60,000,0...",https://cdn.myanimelist.net/images/anime/7/203...
3,7,Witch Hunter Robin,Robin Sena is a powerful craft user drafted in...,https://cdn.myanimelist.net/images/anime/10/19...
4,8,Bouken Ou Beet,It is the dark century and the people are suff...,https://cdn.myanimelist.net/images/anime/7/215...


In [27]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.decomposition import TruncatedSVD

# Continue using HashingVectorizer
hv = HashingVectorizer(n_features=10000, stop_words='english')  # Adjust 'n_features' as needed
vectorized_data = hv.fit_transform(animes_new['tags'])

# Apply Truncated SVD for dimensionality reduction
svd = TruncatedSVD(n_components=500)  # Adjust 'n_components' as needed
reduced_vector = svd.fit_transform(vectorized_data)

# 'reduced_vector' is now the lower-dimensional representation of your text data


In [28]:
# vector = cv.fit_transform(animes_new['tags']).toarray()

# # 'vectorized_data' is now a matrix where each row represents one of the text documents
# # and each column represents a word from the vocabulary, with the value being the count
# # of that word in the document.

In [29]:
reduced_vector.shape

(23167, 500)

In [30]:
from sklearn.metrics.pairwise import cosine_similarity

similarity = cosine_similarity(reduced_vector)

In [31]:
similarity

array([[ 1.        ,  0.39742498,  0.17063958, ...,  0.10666214,
         0.05569606,  0.05487827],
       [ 0.39742498,  1.        ,  0.27427679, ...,  0.24500072,
         0.06384271,  0.06535443],
       [ 0.17063958,  0.27427679,  1.        , ...,  0.1789036 ,
        -0.00664885,  0.00168128],
       ...,
       [ 0.10666214,  0.24500072,  0.1789036 , ...,  1.        ,
         0.22807597,  0.23030234],
       [ 0.05569606,  0.06384271, -0.00664885, ...,  0.22807597,
         1.        ,  0.98232214],
       [ 0.05487827,  0.06535443,  0.00168128, ...,  0.23030234,
         0.98232214,  1.        ]])

In [32]:
# to find the location of particular anime (index)
animes_new[animes_new['Name'] == 'Naruto'].index[0]

10

In [33]:
def reccomend(animes):
    index = animes_new[animes_new['Name'] == animes].index[0]
    # Calculating Similarity and Sorting
    distances = sorted(list(enumerate(similarity[index])),reverse=True,key = lambda x: x[1])
    for i in distances[1:6]:
        print(animes_new.iloc[i[0]].Name)

In [34]:
reccomend('Naruto')

Naruto (2023)
Boruto: Naruto Next Generations
Naruto: Shippuuden
Naruto: Honoo no Chuunin Shiken! Naruto vs. Konohamaru!!
Naruto: Shippuuden Movie 5 - Blood Prison


In [35]:
animes_new.head()

Unnamed: 0,anime_id,Name,tags,Image URL
0,1,Cowboy Bebop,"Crime is timeless. By the year 2071, humanity ...",https://cdn.myanimelist.net/images/anime/4/196...
1,5,Cowboy Bebop: Tengoku no Tobira,"Another day, another bounty—such is the life o...",https://cdn.myanimelist.net/images/anime/1439/...
2,6,Trigun,"Vash the Stampede is the man with a $$60,000,0...",https://cdn.myanimelist.net/images/anime/7/203...
3,7,Witch Hunter Robin,Robin Sena is a powerful craft user drafted in...,https://cdn.myanimelist.net/images/anime/10/19...
4,8,Bouken Ou Beet,It is the dark century and the people are suff...,https://cdn.myanimelist.net/images/anime/7/215...


In [36]:
import pickle
import os

current_directory = os.getcwd()

animes_file_path = os.path.join(current_directory, 'animes_new.pkl')
similarity_file_path = os.path.join(current_directory, 'similarity.pkl')

pickle.dump(animes_new, open(animes_file_path, 'wb'))
pickle.dump(similarity, open(similarity_file_path, 'wb'))


In [37]:
import os
os.path.abspath("")

'C:\\Users\\Lenovo\\Anime Recommendation - Content Based'