In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/myanimelist-dataset/user-filtered.csv

/kaggle/input/myanimelist-dataset/users-score-2023.csv

/kaggle/input/myanimelist-dataset/anime-filtered.csv

/kaggle/input/myanimelist-dataset/anime-dataset-2023.csv

/kaggle/input/myanimelist-dataset/final_animedataset.csv

/kaggle/input/myanimelist-dataset/users-details-2023.csv


# Thought process:

## Goal
To make an anime recommendation system based on a synopsis description

### Steps:
1. Read datasets [x]
2. Clean datasets [x]
3. Get needed categories [x]
4. Fill and remove NaN
5. Use nlp to tokenize synopsis
6. Allow a description to be given and recommendations to be sent based on certain keywords

In [2]:
normal_df = pd.read_csv('/kaggle/input/myanimelist-dataset/anime-dataset-2023.csv')
filtered_df = pd.read_csv('/kaggle/input/myanimelist-dataset/anime-filtered.csv')
final_df = pd.read_csv('/kaggle/input/myanimelist-dataset/final_animedataset.csv')

In [3]:
# Get columns
print(normal_df.columns)
print(len(normal_df.columns))
print(filtered_df.columns)
print(len(filtered_df.columns))
print(final_df.columns)
print(len(final_df.columns))

Index(['anime_id', 'Name', 'English name', 'Other name', 'Score', 'Genres',

       'Synopsis', 'Type', 'Episodes', 'Aired', 'Premiered', 'Status',

       'Producers', 'Licensors', 'Studios', 'Source', 'Duration', 'Rating',

       'Rank', 'Popularity', 'Favorites', 'Scored By', 'Members', 'Image URL'],

      dtype='object')

24

Index(['anime_id', 'Name', 'Score', 'Genres', 'English name', 'Japanese name',

       'sypnopsis', 'Type', 'Episodes', 'Aired', 'Premiered', 'Producers',

       'Licensors', 'Studios', 'Source', 'Duration', 'Rating', 'Ranked',

       'Popularity', 'Members', 'Favorites', 'Watching', 'Completed',

       'On-Hold', 'Dropped'],

      dtype='object')

25

Index(['username', 'anime_id', 'my_score', 'user_id', 'gender', 'title',

       'type', 'source', 'score', 'scored_by', 'rank', 'popularity', 'genre'],

      dtype='object')

13


In [4]:
normal_df.shape

(24905, 24)

In [5]:
normal_df.isna().sum()

anime_id        0
Name            0
English name    0
Other name      0
Score           0
Genres          0
Synopsis        0
Type            0
Episodes        0
Aired           0
Premiered       0
Status          0
Producers       0
Licensors       0
Studios         0
Source          0
Duration        0
Rating          0
Rank            0
Popularity      0
Favorites       0
Scored By       0
Members         0
Image URL       0
dtype: int64

In [6]:
normal_df['Synopsis'].head(10)

0    Crime is timeless. By the year 2071, humanity ...
1    Another day, another bounty—such is the life o...
2    Vash the Stampede is the man with a $$60,000,0...
3    Robin Sena is a powerful craft user drafted in...
4    It is the dark century and the people are suff...
5    Shy, reserved, and small-statured, Deimon High...
6    Yuuta Takemoto, a sophomore at an arts college...
7    As the younger brother of Japanese soccer star...
8    Takumi Fujiwara finally joins Ryousuke and Kei...
9    Dr. Kenzou Tenma, an elite neurosurgeon recent...
Name: Synopsis, dtype: object

In [7]:
normal_df['Name']

0                           Cowboy Bebop
1        Cowboy Bebop: Tengoku no Tobira
2                                 Trigun
3                     Witch Hunter Robin
4                         Bouken Ou Beet
                      ...               
24900                        Wu Nao Monu
24901                Bu Xing Si: Yuan Qi
24902                        Di Yi Xulie
24903           Bokura no Saishuu Sensou
24904                     Shijuuku Nichi
Name: Name, Length: 24905, dtype: object

# Data Cleaning

Need to take out some useless information

Columns to take out = Favorites, Image URL, Scored by, Members, Rank, Duration, Producers, Status, Studios, Licensors, Premiered, Aired, 

In [8]:
normal_df.drop(columns=['Favorites', 'Image URL', 'Scored By', 'Members', 'Rank', 'Duration', 'Producers', 'Status', 'Licensors', 'Premiered', 'Aired'], inplace=True)

In [9]:
normal_df['Synopsis'] = normal_df['Synopsis'].replace("No description available for this anime", "Anime")

In [10]:
normal_df

Unnamed: 0,anime_id,Name,English name,Other name,Score,Genres,Synopsis,Type,Episodes,Studios,Source,Rating,Popularity
0,1,Cowboy Bebop,Cowboy Bebop,カウボーイビバップ,8.75,"Action, Award Winning, Sci-Fi","Crime is timeless. By the year 2071, humanity ...",TV,26.0,Sunrise,Original,R - 17+ (violence & profanity),43
1,5,Cowboy Bebop: Tengoku no Tobira,Cowboy Bebop: The Movie,カウボーイビバップ 天国の扉,8.38,"Action, Sci-Fi","Another day, another bounty—such is the life o...",Movie,1.0,Bones,Original,R - 17+ (violence & profanity),602
2,6,Trigun,Trigun,トライガン,8.22,"Action, Adventure, Sci-Fi","Vash the Stampede is the man with a $$60,000,0...",TV,26.0,Madhouse,Manga,PG-13 - Teens 13 or older,246
3,7,Witch Hunter Robin,Witch Hunter Robin,Witch Hunter ROBIN (ウイッチハンターロビン),7.25,"Action, Drama, Mystery, Supernatural",Robin Sena is a powerful craft user drafted in...,TV,26.0,Sunrise,Original,PG-13 - Teens 13 or older,1795
4,8,Bouken Ou Beet,Beet the Vandel Buster,冒険王ビィト,6.94,"Adventure, Fantasy, Supernatural",It is the dark century and the people are suff...,TV,52.0,Toei Animation,Manga,PG - Children,5126
...,...,...,...,...,...,...,...,...,...,...,...,...,...
24900,55731,Wu Nao Monu,UNKNOWN,无脑魔女,UNKNOWN,"Comedy, Fantasy, Slice of Life",No description available for this anime.,ONA,15.0,UNKNOWN,Web manga,PG-13 - Teens 13 or older,24723
24901,55732,Bu Xing Si: Yuan Qi,Blader Soul,捕星司·源起,UNKNOWN,"Action, Adventure, Fantasy",No description available for this anime.,ONA,18.0,UNKNOWN,Web novel,PG-13 - Teens 13 or older,0
24902,55733,Di Yi Xulie,The First Order,第一序列,UNKNOWN,"Action, Adventure, Fantasy, Sci-Fi",No description available for this anime.,ONA,16.0,UNKNOWN,Web novel,PG-13 - Teens 13 or older,0
24903,55734,Bokura no Saishuu Sensou,UNKNOWN,僕らの最終戦争,UNKNOWN,UNKNOWN,A music video for the song Bokura no Saishuu S...,Music,1.0,UNKNOWN,Original,PG-13 - Teens 13 or older,0


# Clean and preprocess the synopsis text (remove punctuation, lowercase, etc.)

## Stemming and Lemmatization

In [11]:
!pip install unidecode
from unidecode import unidecode
from nltk.stem import PorterStemmer
# or from nltk.stem import WordNetLemmatizer
import re
import nltk
from nltk.corpus import stopwords

# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('punkt')

stemmer = PorterStemmer()
# or lemmatizer = WordNetLemmatizer()

def clean_text(text):
    if pd.isna(text):
        return ""
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    # Tokenize
    tokens = nltk.word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    text = unidecode(text) # convert special characters to ASCII
    tokens = [stemmer.stem(token) for token in tokens]
    return ' '.join(tokens)
    
normal_df['Cleaned Synopsis'] = normal_df['Synopsis'].apply(clean_text)

Collecting unidecode

  Downloading Unidecode-1.3.8-py3-none-any.whl.metadata (13 kB)

Downloading Unidecode-1.3.8-py3-none-any.whl (235 kB)

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.5/235.5 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m

[?25hInstalling collected packages: unidecode

Successfully installed unidecode-1.3.8

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...

[nltk_data]   Package stopwords is already up-to-date!

[nltk_data] Downloading package punkt to /usr/share/nltk_data...

[nltk_data]   Package punkt is already up-to-date!


In [12]:
normal_df

Unnamed: 0,anime_id,Name,English name,Other name,Score,Genres,Synopsis,Type,Episodes,Studios,Source,Rating,Popularity,Cleaned Synopsis
0,1,Cowboy Bebop,Cowboy Bebop,カウボーイビバップ,8.75,"Action, Award Winning, Sci-Fi","Crime is timeless. By the year 2071, humanity ...",TV,26.0,Sunrise,Original,R - 17+ (violence & profanity),43,crime timeless year human expand across galaxi...
1,5,Cowboy Bebop: Tengoku no Tobira,Cowboy Bebop: The Movie,カウボーイビバップ 天国の扉,8.38,"Action, Sci-Fi","Another day, another bounty—such is the life o...",Movie,1.0,Bones,Original,R - 17+ (violence & profanity),602,anoth day anoth bountysuch life often unlucki ...
2,6,Trigun,Trigun,トライガン,8.22,"Action, Adventure, Sci-Fi","Vash the Stampede is the man with a $$60,000,0...",TV,26.0,Madhouse,Manga,PG-13 - Teens 13 or older,246,vash stamped man bounti head reason he mercile...
3,7,Witch Hunter Robin,Witch Hunter Robin,Witch Hunter ROBIN (ウイッチハンターロビン),7.25,"Action, Drama, Mystery, Supernatural",Robin Sena is a powerful craft user drafted in...,TV,26.0,Sunrise,Original,PG-13 - Teens 13 or older,1795,robin sena power craft user draft stnja group ...
4,8,Bouken Ou Beet,Beet the Vandel Buster,冒険王ビィト,6.94,"Adventure, Fantasy, Supernatural",It is the dark century and the people are suff...,TV,52.0,Toei Animation,Manga,PG - Children,5126,dark centuri peopl suffer rule devil vandel ab...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24900,55731,Wu Nao Monu,UNKNOWN,无脑魔女,UNKNOWN,"Comedy, Fantasy, Slice of Life",No description available for this anime.,ONA,15.0,UNKNOWN,Web manga,PG-13 - Teens 13 or older,24723,descript avail anim
24901,55732,Bu Xing Si: Yuan Qi,Blader Soul,捕星司·源起,UNKNOWN,"Action, Adventure, Fantasy",No description available for this anime.,ONA,18.0,UNKNOWN,Web novel,PG-13 - Teens 13 or older,0,descript avail anim
24902,55733,Di Yi Xulie,The First Order,第一序列,UNKNOWN,"Action, Adventure, Fantasy, Sci-Fi",No description available for this anime.,ONA,16.0,UNKNOWN,Web novel,PG-13 - Teens 13 or older,0,descript avail anim
24903,55734,Bokura no Saishuu Sensou,UNKNOWN,僕らの最終戦争,UNKNOWN,UNKNOWN,A music video for the song Bokura no Saishuu S...,Music,1.0,UNKNOWN,Original,PG-13 - Teens 13 or older,0,music video song bokura saishuu sensou shannon


# Use 3 Keyword Extraction methods and determine the one with the best accuracy

In [2]:
# TF-IDF

In [3]:
# Text Rank

In [4]:
# RAKE