# MyAnimeList Data

## Data Pre-Processing 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import ast
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from fuzzywuzzy import process
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import *
from statistics import mean



In [220]:
# Relative path of the anime metadata CSV; it is passed to pd.read_csv in the next cell.
data = './datasets/anime_data.csv'

In [221]:
# Load the raw anime metadata. `data` holds the CSV path at this point
# (the name is rebound to a DataFrame further down the notebook).
df = pd.read_csv(data)

In [4]:
# Preview the first five rows of the freshly loaded frame.
df.head(5)

Unnamed: 0,mal_id,aired_from,aired_to,duration,episodes,genres,popularity,premiered,rank,rating,score,scored_by,source,status,studios,synopsis,title,title_english,type
0,1,1998-04-03T00:00:00+00:00,1999-04-24T00:00:00+00:00,24 min per ep,26,"['Action', 'Adventure', 'Comedy', 'Drama', 'Sc...",38,Spring 1998,27.0,R - 17+ (violence & profanity),8.79,544987,Original,Finished Airing,"[{'mal_id': 14, 'name': 'Sunrise'}]","In the year 2071, humanity has colonized sever...",Cowboy Bebop,Cowboy Bebop,TV
1,100,2001-04-04T00:00:00+00:00,2001-06-27T00:00:00+00:00,23 min per ep,13,"['Comedy', 'Drama', 'Fantasy', 'Magic', 'Roman...",2075,Spring 2001,2703.0,PG-13 - Teens 13 or older,7.21,23787,Manga,Finished Airing,"[{'mal_id': 34, 'name': 'Hal Film Maker'}]","Due to her father's remarriage, robust 16-year...",Shin Shirayuki-hime Densetsu Prétear,Prétear: The New Legend of Snow White,TV
2,1000,1978-03-14T00:00:00+00:00,1979-02-13T00:00:00+00:00,25 min per ep,42,"['Action', 'Sci-Fi', 'Adventure', 'Space', 'Dr...",2980,Spring 1978,1008.0,PG-13 - Teens 13 or older,7.71,7059,Manga,Finished Airing,"[{'mal_id': 18, 'name': 'Toei Animation'}]",It is 2977 AD and mankind has become stagnant....,Uchuu Kaizoku Captain Herlock,Space Pirate Captain Harlock,TV
3,10003,2008-01-01T00:00:00+00:00,,2 min per ep,15,"['Comedy', 'Dementia', 'Horror', 'Seinen']",6848,,10146.0,R+ - Mild Nudity,5.05,1181,Original,Finished Airing,[],"In these jokey short films, many of them crude...",Kago Shintarou Anime Sakuhin Shuu,,OVA
4,10005,2007-03-31T00:00:00+00:00,,1 hr 35 min,1,"['Action', 'Adventure', 'Mecha', 'Sci-Fi']",10765,,6121.0,G - All Ages,6.43,228,Unknown,Finished Airing,"[{'mal_id': 455, 'name': 'Palm Studio'}]",This theatrical version based on the manga by ...,Tetsujin 28-gou: Hakuchuu no Zangetsu,,Movie


### Extracting the studio name from the `studios` column

Source: https://stackoverflow.com/questions/71432733/pandas-extracting-a-phrase-in-a-dict-column?noredirect=1#comment126259925_71432733

If the items in the column are plain strings, convert them into actual Python objects first.

In [5]:
# The CSV stores every studio list as its Python repr, so it is read back as text.
# Parse only the cells that are still strings: this keeps the cell re-runnable
# (already-parsed lists are left alone) and lets non-string cells such as NaN
# pass through instead of making ast.literal_eval raise.
df['studios'] = df['studios'].apply(
    lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
)

Use the `.str` accessor to index into the list/dict stored in each cell of the column, then combine `pipe` and `where` to fall back to the original value wherever `.str` returns NaN.

In [6]:
# Name of the first studio in each parsed list; cells without a first studio give NaN.
first_studio_name = df['studios'].str[0].str['name']
# Keep the extracted name where there is one, otherwise keep the original cell value.
df['studios'] = first_studio_name.where(first_studio_name.notna(), df['studios'])
df.head(5)

Unnamed: 0,mal_id,aired_from,aired_to,duration,episodes,genres,popularity,premiered,rank,rating,score,scored_by,source,status,studios,synopsis,title,title_english,type
0,1,1998-04-03T00:00:00+00:00,1999-04-24T00:00:00+00:00,24 min per ep,26,"['Action', 'Adventure', 'Comedy', 'Drama', 'Sc...",38,Spring 1998,27.0,R - 17+ (violence & profanity),8.79,544987,Original,Finished Airing,Sunrise,"In the year 2071, humanity has colonized sever...",Cowboy Bebop,Cowboy Bebop,TV
1,100,2001-04-04T00:00:00+00:00,2001-06-27T00:00:00+00:00,23 min per ep,13,"['Comedy', 'Drama', 'Fantasy', 'Magic', 'Roman...",2075,Spring 2001,2703.0,PG-13 - Teens 13 or older,7.21,23787,Manga,Finished Airing,Hal Film Maker,"Due to her father's remarriage, robust 16-year...",Shin Shirayuki-hime Densetsu Prétear,Prétear: The New Legend of Snow White,TV
2,1000,1978-03-14T00:00:00+00:00,1979-02-13T00:00:00+00:00,25 min per ep,42,"['Action', 'Sci-Fi', 'Adventure', 'Space', 'Dr...",2980,Spring 1978,1008.0,PG-13 - Teens 13 or older,7.71,7059,Manga,Finished Airing,Toei Animation,It is 2977 AD and mankind has become stagnant....,Uchuu Kaizoku Captain Herlock,Space Pirate Captain Harlock,TV
3,10003,2008-01-01T00:00:00+00:00,,2 min per ep,15,"['Comedy', 'Dementia', 'Horror', 'Seinen']",6848,,10146.0,R+ - Mild Nudity,5.05,1181,Original,Finished Airing,[],"In these jokey short films, many of them crude...",Kago Shintarou Anime Sakuhin Shuu,,OVA
4,10005,2007-03-31T00:00:00+00:00,,1 hr 35 min,1,"['Action', 'Adventure', 'Mecha', 'Sci-Fi']",10765,,6121.0,G - All Ages,6.43,228,Unknown,Finished Airing,Palm Studio,This theatrical version based on the manga by ...,Tetsujin 28-gou: Hakuchuu no Zangetsu,,Movie


### Expand each genre list into individual rows

In [7]:
# Inspect the raw genres column before parsing it.
df['genres'].head(5)

0    ['Action', 'Adventure', 'Comedy', 'Drama', 'Sc...
1    ['Comedy', 'Drama', 'Fantasy', 'Magic', 'Roman...
2    ['Action', 'Sci-Fi', 'Adventure', 'Space', 'Dr...
3           ['Comedy', 'Dementia', 'Horror', 'Seinen']
4           ['Action', 'Adventure', 'Mecha', 'Sci-Fi']
Name: genres, dtype: object

**Convert the values in the genres column to actual list, because it might just look like a list but actually be a string.**

In [8]:
# Turn the textual list representation into real Python lists.
# Only string cells are parsed, so re-running the cell (values already lists)
# or meeting a non-string cell such as NaN does not make ast.literal_eval raise.
df['genres'] = df['genres'].apply(
    lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
)

**Implementing .explode() for genres column**

In [9]:
# One row per (anime, genre) pair, renumbered 0..n-1.
# Note that this rebinds `data`, which held the CSV path until now.
data = df.explode('genres', ignore_index=True)

In [10]:
# Check the exploded frame: each genre of an anime now has its own row.
data.head(5)

Unnamed: 0,mal_id,aired_from,aired_to,duration,episodes,genres,popularity,premiered,rank,rating,score,scored_by,source,status,studios,synopsis,title,title_english,type
0,1,1998-04-03T00:00:00+00:00,1999-04-24T00:00:00+00:00,24 min per ep,26,Action,38,Spring 1998,27.0,R - 17+ (violence & profanity),8.79,544987,Original,Finished Airing,Sunrise,"In the year 2071, humanity has colonized sever...",Cowboy Bebop,Cowboy Bebop,TV
1,1,1998-04-03T00:00:00+00:00,1999-04-24T00:00:00+00:00,24 min per ep,26,Adventure,38,Spring 1998,27.0,R - 17+ (violence & profanity),8.79,544987,Original,Finished Airing,Sunrise,"In the year 2071, humanity has colonized sever...",Cowboy Bebop,Cowboy Bebop,TV
2,1,1998-04-03T00:00:00+00:00,1999-04-24T00:00:00+00:00,24 min per ep,26,Comedy,38,Spring 1998,27.0,R - 17+ (violence & profanity),8.79,544987,Original,Finished Airing,Sunrise,"In the year 2071, humanity has colonized sever...",Cowboy Bebop,Cowboy Bebop,TV
3,1,1998-04-03T00:00:00+00:00,1999-04-24T00:00:00+00:00,24 min per ep,26,Drama,38,Spring 1998,27.0,R - 17+ (violence & profanity),8.79,544987,Original,Finished Airing,Sunrise,"In the year 2071, humanity has colonized sever...",Cowboy Bebop,Cowboy Bebop,TV
4,1,1998-04-03T00:00:00+00:00,1999-04-24T00:00:00+00:00,24 min per ep,26,Sci-Fi,38,Spring 1998,27.0,R - 17+ (violence & profanity),8.79,544987,Original,Finished Airing,Sunrise,"In the year 2071, humanity has colonized sever...",Cowboy Bebop,Cowboy Bebop,TV


### Data Information + Rows and Columns

In [11]:
# Column dtypes and non-null counts of the exploded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35984 entries, 0 to 35983
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   mal_id         35984 non-null  int64  
 1   aired_from     35977 non-null  object 
 2   aired_to       20657 non-null  object 
 3   duration       35984 non-null  object 
 4   episodes       35984 non-null  int64  
 5   genres         35969 non-null  object 
 6   popularity     35984 non-null  int64  
 7   premiered      13621 non-null  object 
 8   rank           33954 non-null  float64
 9   rating         35984 non-null  object 
 10  score          35984 non-null  float64
 11  scored_by      35984 non-null  int64  
 12  source         35984 non-null  object 
 13  status         35984 non-null  object 
 14  studios        35984 non-null  object 
 15  synopsis       35465 non-null  object 
 16  title          35984 non-null  object 
 17  title_english  19120 non-null  object 
 18  type  

In [12]:
# (rows, columns) of the exploded frame.
data.shape

(35984, 19)

### Looking for missing value within the dataset

In [13]:
# Number of missing values per column.
data.isnull().sum()

mal_id               0
aired_from           7
aired_to         15327
duration             0
episodes             0
genres              15
popularity           0
premiered        22363
rank              2030
rating               0
score                0
scored_by            0
source               0
status               0
studios              0
synopsis           519
title                0
title_english    16864
type                 0
dtype: int64

### Extracting season and year from the premiered column to create two new columns

In [14]:
# Split the "<season> <year>" text on whitespace into two separate columns.
premiered_parts = data['premiered'].str.split(expand=True)
data[['premiered_season', 'premiered_year']] = premiered_parts

In [15]:
# Confirm the two new premiered_* columns were added.
data.head(5)

Unnamed: 0,mal_id,aired_from,aired_to,duration,episodes,genres,popularity,premiered,rank,rating,...,scored_by,source,status,studios,synopsis,title,title_english,type,premiered_season,premiered_year
0,1,1998-04-03T00:00:00+00:00,1999-04-24T00:00:00+00:00,24 min per ep,26,Action,38,Spring 1998,27.0,R - 17+ (violence & profanity),...,544987,Original,Finished Airing,Sunrise,"In the year 2071, humanity has colonized sever...",Cowboy Bebop,Cowboy Bebop,TV,Spring,1998
1,1,1998-04-03T00:00:00+00:00,1999-04-24T00:00:00+00:00,24 min per ep,26,Adventure,38,Spring 1998,27.0,R - 17+ (violence & profanity),...,544987,Original,Finished Airing,Sunrise,"In the year 2071, humanity has colonized sever...",Cowboy Bebop,Cowboy Bebop,TV,Spring,1998
2,1,1998-04-03T00:00:00+00:00,1999-04-24T00:00:00+00:00,24 min per ep,26,Comedy,38,Spring 1998,27.0,R - 17+ (violence & profanity),...,544987,Original,Finished Airing,Sunrise,"In the year 2071, humanity has colonized sever...",Cowboy Bebop,Cowboy Bebop,TV,Spring,1998
3,1,1998-04-03T00:00:00+00:00,1999-04-24T00:00:00+00:00,24 min per ep,26,Drama,38,Spring 1998,27.0,R - 17+ (violence & profanity),...,544987,Original,Finished Airing,Sunrise,"In the year 2071, humanity has colonized sever...",Cowboy Bebop,Cowboy Bebop,TV,Spring,1998
4,1,1998-04-03T00:00:00+00:00,1999-04-24T00:00:00+00:00,24 min per ep,26,Sci-Fi,38,Spring 1998,27.0,R - 17+ (violence & profanity),...,544987,Original,Finished Airing,Sunrise,"In the year 2071, humanity has colonized sever...",Cowboy Bebop,Cowboy Bebop,TV,Spring,1998


In [16]:
# List the column names available before dropping any of them.
data.columns

Index(['mal_id', 'aired_from', 'aired_to', 'duration', 'episodes', 'genres',
       'popularity', 'premiered', 'rank', 'rating', 'score', 'scored_by',
       'source', 'status', 'studios', 'synopsis', 'title', 'title_english',
       'type', 'premiered_season', 'premiered_year'],
      dtype='object')

### Dropping Columns

In [17]:
# Remove the listed columns from `data`.
# Reassigning (instead of inplace=True) and ignoring columns that are already
# gone means re-running this cell does not raise KeyError.
data = data.drop(
    columns=['mal_id', 'aired_from', 'aired_to', 'synopsis', 'status'],
    errors='ignore',
)

In [18]:
# Preview the frame after the column drop.
data.head(5)

Unnamed: 0,duration,episodes,genres,popularity,premiered,rank,rating,score,scored_by,source,studios,title,title_english,type,premiered_season,premiered_year
0,24 min per ep,26,Action,38,Spring 1998,27.0,R - 17+ (violence & profanity),8.79,544987,Original,Sunrise,Cowboy Bebop,Cowboy Bebop,TV,Spring,1998
1,24 min per ep,26,Adventure,38,Spring 1998,27.0,R - 17+ (violence & profanity),8.79,544987,Original,Sunrise,Cowboy Bebop,Cowboy Bebop,TV,Spring,1998
2,24 min per ep,26,Comedy,38,Spring 1998,27.0,R - 17+ (violence & profanity),8.79,544987,Original,Sunrise,Cowboy Bebop,Cowboy Bebop,TV,Spring,1998
3,24 min per ep,26,Drama,38,Spring 1998,27.0,R - 17+ (violence & profanity),8.79,544987,Original,Sunrise,Cowboy Bebop,Cowboy Bebop,TV,Spring,1998
4,24 min per ep,26,Sci-Fi,38,Spring 1998,27.0,R - 17+ (violence & profanity),8.79,544987,Original,Sunrise,Cowboy Bebop,Cowboy Bebop,TV,Spring,1998


#### Drop premiered column

In [19]:
# `premiered` has already been split into premiered_season / premiered_year.
# Reassign and ignore a missing column so the cell can be re-run without KeyError.
data = data.drop(columns=['premiered'], errors='ignore')

In [20]:
# Preview the frame without the premiered column.
data.head(5)

Unnamed: 0,duration,episodes,genres,popularity,rank,rating,score,scored_by,source,studios,title,title_english,type,premiered_season,premiered_year
0,24 min per ep,26,Action,38,27.0,R - 17+ (violence & profanity),8.79,544987,Original,Sunrise,Cowboy Bebop,Cowboy Bebop,TV,Spring,1998
1,24 min per ep,26,Adventure,38,27.0,R - 17+ (violence & profanity),8.79,544987,Original,Sunrise,Cowboy Bebop,Cowboy Bebop,TV,Spring,1998
2,24 min per ep,26,Comedy,38,27.0,R - 17+ (violence & profanity),8.79,544987,Original,Sunrise,Cowboy Bebop,Cowboy Bebop,TV,Spring,1998
3,24 min per ep,26,Drama,38,27.0,R - 17+ (violence & profanity),8.79,544987,Original,Sunrise,Cowboy Bebop,Cowboy Bebop,TV,Spring,1998
4,24 min per ep,26,Sci-Fi,38,27.0,R - 17+ (violence & profanity),8.79,544987,Original,Sunrise,Cowboy Bebop,Cowboy Bebop,TV,Spring,1998


#### Drop English title column

In [21]:
# Remove the English title column.
# Reassign and ignore a missing column so the cell can be re-run without KeyError.
data = data.drop(columns=['title_english'], errors='ignore')

In [22]:
# Preview the frame without the title_english column.
data.head(5)

Unnamed: 0,duration,episodes,genres,popularity,rank,rating,score,scored_by,source,studios,title,type,premiered_season,premiered_year
0,24 min per ep,26,Action,38,27.0,R - 17+ (violence & profanity),8.79,544987,Original,Sunrise,Cowboy Bebop,TV,Spring,1998
1,24 min per ep,26,Adventure,38,27.0,R - 17+ (violence & profanity),8.79,544987,Original,Sunrise,Cowboy Bebop,TV,Spring,1998
2,24 min per ep,26,Comedy,38,27.0,R - 17+ (violence & profanity),8.79,544987,Original,Sunrise,Cowboy Bebop,TV,Spring,1998
3,24 min per ep,26,Drama,38,27.0,R - 17+ (violence & profanity),8.79,544987,Original,Sunrise,Cowboy Bebop,TV,Spring,1998
4,24 min per ep,26,Sci-Fi,38,27.0,R - 17+ (violence & profanity),8.79,544987,Original,Sunrise,Cowboy Bebop,TV,Spring,1998


### Fill NaN with the column mode or with an empty string

In [23]:
# rank / premiered_year: replace missing values with the most frequent value.
# Series.mode() already ignores NaN, so no separate dropna() is needed.
data['rank'] = data['rank'].fillna(data['rank'].mode().iloc[0])
data['premiered_year'] = data['premiered_year'].fillna(data['premiered_year'].mode().iloc[0])

# genres / premiered_season: replace missing values with an empty string.
# Assign the result back: fillna(..., inplace=True) on a column selection is
# chained assignment and no longer updates `data` under pandas copy-on-write.
data['genres'] = data['genres'].fillna('')
data['premiered_season'] = data['premiered_season'].fillna('')

# Every column should now report zero missing values.
data.isnull().sum()

duration            0
episodes            0
genres              0
popularity          0
rank                0
rating              0
score               0
scored_by           0
source              0
studios             0
title               0
type                0
premiered_season    0
premiered_year      0
dtype: int64

In [24]:
# Preview the first three rows of the cleaned frame.
data.head(3)

Unnamed: 0,duration,episodes,genres,popularity,rank,rating,score,scored_by,source,studios,title,type,premiered_season,premiered_year
0,24 min per ep,26,Action,38,27.0,R - 17+ (violence & profanity),8.79,544987,Original,Sunrise,Cowboy Bebop,TV,Spring,1998
1,24 min per ep,26,Adventure,38,27.0,R - 17+ (violence & profanity),8.79,544987,Original,Sunrise,Cowboy Bebop,TV,Spring,1998
2,24 min per ep,26,Comedy,38,27.0,R - 17+ (violence & profanity),8.79,544987,Original,Sunrise,Cowboy Bebop,TV,Spring,1998


### Preprocess User Data ###

The following code was used to produce user_score_data.csv, which is derived from user_data.csv. This section is commented out and its result was exported to a CSV file because it takes a while to execute.

In [None]:
'''
    get ratings of anime from each user
'''
# The commented-out code below flattens the `watched` column of user_data.csv
# into one record per (user_id, mal_id, rating) and exports the result.
# NOTE(review): the export writes 'user_score_data' (no extension, current
# directory) while the loading cell reads './datasets/user_score_data.csv';
# the exported file has to be renamed/moved by hand for the notebook to run.
# user_df = pd.read_csv('./datasets/user_data.csv')
# user_df.insert(0, 'user_id', range(1, 1 + len(user_df)))
# user_watched = user_df[['user_id', 'watched']]

# import ast
# user_data = []

# for i in range(len(user_df)):
#     row = user_watched.iloc[i].watched
#     row = row.strip('][').split('}, ')
#     for item in row:
#         row_dict = {}
#         if (item[-1] != "}"):
#             item = item + "}"
#         item_dict = ast.literal_eval(item)
#         row_dict['user_id'] = user_watched.iloc[i].user_id
#         row_dict['mal_id'] = item_dict['mal_id']
#         row_dict['rating'] = item_dict['score']
#         user_data.append(row_dict)

# df_user_data = pd.DataFrame(user_data)
# df_user_data.to_csv('user_score_data')

In [55]:
'''
    get favorited anime data from user data
    favorited anime will be identified with a 1
'''

# The commented-out code below builds one record per (user_id, mal_id) with
# favorited = 1 from the `favorites` column of user_data.csv.
# NOTE(review): as written it only processes the first three users
# (`range(3)`), the inner loop never uses `item` and recomputes `mal_ids`
# from the whole row text on every pass, and the regex collects every
# integer after the first "mal_id': ", which may include numbers that are
# not ids. Confirm that user_favorited_data.csv was produced by a corrected
# version of this code.
# NOTE(review): the export writes 'user_favorited_data' (no extension,
# current directory) while the loading cell reads
# './datasets/user_favorited_data.csv'.
# user_df = pd.read_csv('./datasets/user_data.csv')
# user_df.insert(0, 'user_id', range(1, 1 + len(user_df)))
# user_favorites = user_df[['user_id', 'favorites']]

# import ast
# import re
# user_data = []

# user_df

# for i in range(3):
#     row = user_favorites.iloc[i].favorites
#     row_dict = ast.literal_eval(row)
#     favorites_lst = row_dict['anime']
#     mal_ids = []
#     for item in favorites_lst:
#         before, key, after = row.partition("mal_id': ")
#         mal_ids = re.findall(r'\b\d+\b', after)
#     for mal_id in mal_ids:
#         row_dict = {}
#         row_dict['user_id'] = user_favorites.iloc[i].user_id
#         row_dict['mal_id'] = mal_id
#         row_dict['favorited'] = 1
#         user_data.append(row_dict)

# df_user_favorite_data = pd.DataFrame(user_data)
# df_user_favorite_data.to_csv('user_favorited_data')

'\n    get favorited anime data from user data\n    favorited anime will be identified with a 1\n'

In [61]:
# Per-user anime ratings: one row per (user_id, mal_id, rating).
user_rating_data_df = pd.read_csv(
    './datasets/user_score_data.csv',
    usecols=['user_id', 'mal_id', 'rating'],
    dtype={'user_id': 'int32', 'mal_id': 'int32', 'rating': 'float32'},
)

# Per-user favourites: one row per (user_id, mal_id, favorited).
# The dtype mapping must name 'favorited'; the previous 'rating' key referred to
# a column that is not loaded from this file, so 'favorited' was never typed.
user_favorite_data_df = pd.read_csv(
    './datasets/user_favorited_data.csv',
    usecols=['user_id', 'mal_id', 'favorited'],
    dtype={'user_id': 'int32', 'mal_id': 'int32', 'favorited': 'int32'},
)

# Stack both frames row-wise. Rating rows have no 'favorited' value (set to 0
# below) and favourite rows have no 'rating' value (left as NaN).
user_data = pd.concat([user_rating_data_df, user_favorite_data_df], axis=0)
user_data['favorited'] = user_data['favorited'].fillna(0)

In [59]:
# Id -> title lookup table, read from the same CSV as the main anime frame.
animes_df = pd.read_csv(
    './datasets/anime_data.csv',
    usecols=['mal_id', 'title'],
    dtype={'mal_id': 'int32', 'title': 'string'},
)

### Linear Regression ###

Not every user rates every anime, so the rating data has gaps. To improve predictions, linear regression can be used to estimate the missing ratings from the existing values.

In [36]:
def getOverallUserAvgAnimeRating(user_data_df):
    '''
    Compute the mean rating of every anime across all users.

    Parameters
    ----------
    user_data_df : pandas.DataFrame
        Frame with at least the columns 'mal_id' and 'rating'.

    Returns
    -------
    pandas.DataFrame
        One row per distinct 'mal_id' with the columns 'mal_id' and 'rating'
        (the mean of that anime's ratings).
    '''
    mean_by_anime = user_data_df.groupby('mal_id')['rating'].mean()
    columns = {
        'mal_id': mean_by_anime.index,
        'rating': mean_by_anime.values,
    }
    return pd.DataFrame(columns)

In [142]:
def getTestTrainData(y):
    '''
    Split a frame on whether its 'rating_y' value is missing.

    Parameters
    ----------
    y : pandas.DataFrame
        Frame containing a 'rating_y' column.

    Returns
    -------
    tuple
        (test_data, train_data, y_train, X_train) where
        test_data  - rows of ``y`` whose 'rating_y' is NaN,
        train_data - rows of ``y`` whose 'rating_y' is present,
        y_train    - the 'rating_y' column of train_data,
        X_train    - train_data without the 'rating_y' column.
    '''
    rating_missing = y['rating_y'].isna()

    test_data = y[rating_missing]
    train_data = y[~rating_missing]

    X_train = train_data.drop(columns=['rating_y'])
    y_train = train_data['rating_y']

    return test_data, train_data, y_train, X_train

In [146]:
def fillMissingRatingDataLinReg(y):
    '''
    Fill the missing 'rating_y' values of ``y`` with linear-regression predictions.

    Rows whose 'rating_y' is present are used to fit a LinearRegression in which
    every other column of ``y`` is a feature; the fitted model then predicts
    'rating_y' for the rows where it is missing.

    Parameters
    ----------
    y : pandas.DataFrame
        Frame with a 'mal_id' column (used for the final sort), a 'rating_y'
        column (the target, NaN where unknown) and any further feature columns.

    Returns
    -------
    pandas.DataFrame
        All rows of ``y`` sorted by 'mal_id', with 'rating_y' renamed to
        'rating' and its missing values replaced by the predictions.

    Raises
    ------
    ValueError
        If no row has a 'rating_y' value, i.e. there is nothing to train on.
    '''
    test_data, train_data, y_train, X_train = getTestTrainData(y)

    if train_data.empty:
        raise ValueError("no known 'rating_y' values to fit the regression on")

    if test_data.empty:
        # Nothing is missing: skip fitting, since predict() rejects an empty input.
        filled = train_data
    else:
        lin_model = LinearRegression().fit(X_train, y_train)
        X_test = test_data.drop('rating_y', axis=1)
        # assign() builds a new frame, so no value is written into a slice of
        # ``y`` (which previously triggered SettingWithCopyWarning).
        predicted = test_data.assign(rating_y=lin_model.predict(X_test))
        filled = pd.concat([predicted, train_data], axis=0)

    ordered = filled.sort_values(by=['mal_id'], ascending=True)
    return ordered.rename(columns={'rating_y': 'rating'})

In [147]:
def getComprehensiveUserRating(user_data_df, user_id):
    '''
    Predict the rating one user would give to every anime.

    The user's known ratings are kept; ratings for anime the user has not rated
    are estimated by a linear regression that uses the average rating of each
    anime across all users as a predictor.

    Parameters
    ----------
    user_data_df : pandas.DataFrame
        Frame with at least the columns 'user_id', 'mal_id' and 'rating'.
    user_id : int
        Id of the user whose ratings are completed.

    Returns
    -------
    pandas.DataFrame
        One row per anime found in ``user_data_df`` with the columns 'mal_id'
        and 'rating', sorted by 'mal_id'.
    '''
    # Average rating of every anime over all users.
    avg_df = getOverallUserAvgAnimeRating(user_data_df)

    # Ratings given by the requested user. Only the two needed columns are kept
    # so unrelated columns do not end up as regression features.
    is_requested_user = user_data_df['user_id'] == user_id
    y = user_data_df.loc[is_requested_user, ['mal_id', 'rating']]

    # After the merge 'rating_x' is the average rating and 'rating_y' the
    # user's own rating (NaN where the user has not rated the anime).
    # 'rating_x' is deliberately kept: dropping it here left 'mal_id' as the
    # only feature, so the average never influenced the prediction.
    merged_y = pd.merge(avg_df, y, on='mal_id', how='left')

    # An anime without any rating has a NaN average, which LinearRegression
    # rejects; fall back to the mean of the available averages.
    merged_y['rating_x'] = merged_y['rating_x'].fillna(merged_y['rating_x'].mean())

    # NOTE(review): the fill helper uses every non-target column as a feature,
    # so 'mal_id' is still a predictor alongside the average rating.
    comprehensive_df = fillMissingRatingDataLinReg(merged_y)

    # Return only 'mal_id' and 'rating', as before.
    return comprehensive_df.drop(columns=['rating_x'])

In [149]:
# Example call, left disabled.
# NOTE(review): `user_data_df` is not assigned anywhere in the visible notebook;
# pass one of the loaded user frames if this is re-enabled.
# new = getComprehensiveUserRating(user_data_df, 1)

### K-Nearest Neighbors ###

K-nearest neighbors can be used to generate recommendations based on a specified anime. Using collaborative filtering, k-nearest neighbors searches for other anime that were enjoyed by the users who also enjoyed the specified anime.

In [237]:
pip install fuzzywuzzy

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0
Note: you may need to restart the kernel to use updated packages.


In [231]:
# Anime x user rating matrix: one row per mal_id, one column per user_id,
# 0 where a user has not rated an anime.
# `user_data_df` is never assigned in this notebook, so this cell raised
# NameError after Restart & Run All; the loaded rating frame has exactly the
# three columns the pivot needs.
# NOTE(review): confirm user_rating_data_df is the frame originally intended.
animes_users = user_rating_data_df.pivot(index='mal_id', columns='user_id', values='rating').fillna(0)
# Sparse copy for the nearest-neighbour model. Row k of the matrix belongs to
# the anime whose id is animes_users.index[k].
animes_users_mat = csr_matrix(animes_users.values)

In [233]:
# Brute-force nearest-neighbour search using cosine distance between the rows
# of the anime x user matrix; 20 neighbours are returned by default.
model_knn = NearestNeighbors(
    n_neighbors=20,
    algorithm='brute',
    metric='cosine',
)
model_knn.fit(animes_users_mat)

NearestNeighbors(algorithm='brute', metric='cosine', n_neighbors=20)

In [249]:
def getRecommendations(movie_title, data_matrix, animes_df, model_knn, n_recommendations, anime_ids=None):
    '''
    Print the titles of the anime whose rating vectors are closest to a given anime.

    The title is fuzzy-matched against ``animes_df['title']``; the row of
    ``data_matrix`` belonging to the matched anime is passed to
    ``model_knn.kneighbors`` and the titles of the returned neighbours are printed.

    Parameters
    ----------
    movie_title : str
        (Approximate) title of the anime to find neighbours for.
    data_matrix : sparse matrix
        Matrix the model is fitted on, one row per anime.
    animes_df : pandas.DataFrame
        Frame with a 'title' column, plus a 'mal_id' column when ``anime_ids``
        is given.
    model_knn : object with ``fit`` and ``kneighbors``
        Neighbour model (e.g. sklearn's NearestNeighbors); it is re-fitted on
        ``data_matrix`` on every call.
    n_recommendations : int
        Number of neighbours requested from the model. The query anime itself
        is typically one of them and is not shown as a recommendation.
    anime_ids : array-like, optional
        The 'mal_id' of every row of ``data_matrix``, in row order (for a matrix
        built from a pivot table this is the pivot's index). When given, matrix
        rows are translated to titles through 'mal_id'. When omitted, the old
        behaviour is kept: the index label of ``animes_df`` is used directly as
        the matrix row number, which is only correct if ``animes_df`` is in the
        same order as the rows of ``data_matrix``.

    Returns
    -------
    None
        The recommendations are printed.

    Raises
    ------
    ValueError
        If no title could be matched, or if ``anime_ids`` is given and the
        matched anime has no row in ``data_matrix``.
    '''
    model_knn.fit(data_matrix)

    # For a Series of choices fuzzywuzzy returns (matched title, score, index label).
    best_match = process.extractOne(movie_title, animes_df['title'])
    if best_match is None:
        raise ValueError(f'no title matching {movie_title!r} was found')
    title_label = best_match[2]

    if anime_ids is None:
        # Legacy path: assumes label k of animes_df is row k of data_matrix.
        anime_index = title_label
        distances, indices = model_knn.kneighbors(data_matrix[anime_index], n_neighbors=n_recommendations)
        for i in indices:
            print(animes_df['title'][i].where(i != anime_index))
        return

    row_ids = np.asarray(anime_ids)
    query_id = animes_df.loc[title_label, 'mal_id']

    # Locate the matrix row that really belongs to the matched anime.
    query_rows = np.flatnonzero(row_ids == query_id)
    if query_rows.size == 0:
        raise ValueError(f'{best_match[0]!r} (mal_id {query_id}) has no row in data_matrix')
    query_row = query_rows[0]

    distances, indices = model_knn.kneighbors(data_matrix[query_row], n_neighbors=n_recommendations)

    # kneighbors returns one row of neighbour positions per query row;
    # translate them to mal_ids and drop the query anime itself.
    neighbour_ids = row_ids[indices[0]]
    neighbour_ids = neighbour_ids[neighbour_ids != query_id]

    titles_by_id = animes_df.drop_duplicates('mal_id').set_index('mal_id')['title']
    print(titles_by_id.reindex(neighbour_ids))
    

In [250]:
# Print the five nearest neighbours of the anime whose title best matches 'Bleach'.
getRecommendations(
    movie_title='Bleach',
    data_matrix=animes_users_mat,
    animes_df=animes_df,
    model_knn=model_knn,
    n_recommendations=5,
)

3990                                                 <NA>
6198    Iizuka-senpai x Blazer: Ane Kyun! yori The Ani...
5435                              Kanashimi no Belladonna
3093    New Mobile Report Gundam Wing: Frozen Teardrop...
3295                                       Plastic Little
Name: title, dtype: string
