# MyAnimeList Data

## Data Pre-Processing 

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import ast

In [2]:
# Location of the raw MyAnimeList anime export that the next cell loads with pd.read_csv.
# NOTE(review): this is an absolute, machine-specific path, so the notebook only runs where
# this exact file exists. The name `data` is also rebound to a DataFrame later on (the
# `df.explode('genres')` cell), so re-running the load cell after that point will fail.
data = '/Users/kristineumeh/Desktop/GradSchool/DataMining/AnimeProject/archive/anime_data.csv'

In [3]:
df = pd.read_csv(data)

In [4]:
df.head(5)

Unnamed: 0,mal_id,aired_from,aired_to,duration,episodes,genres,popularity,premiered,rank,rating,score,scored_by,source,status,studios,synopsis,title,title_english,type
0,1,1998-04-03T00:00:00+00:00,1999-04-24T00:00:00+00:00,24 min per ep,26,"['Action', 'Adventure', 'Comedy', 'Drama', 'Sc...",38,Spring 1998,27.0,R - 17+ (violence & profanity),8.79,544987,Original,Finished Airing,"[{'mal_id': 14, 'name': 'Sunrise'}]","In the year 2071, humanity has colonized sever...",Cowboy Bebop,Cowboy Bebop,TV
1,100,2001-04-04T00:00:00+00:00,2001-06-27T00:00:00+00:00,23 min per ep,13,"['Comedy', 'Drama', 'Fantasy', 'Magic', 'Roman...",2075,Spring 2001,2703.0,PG-13 - Teens 13 or older,7.21,23787,Manga,Finished Airing,"[{'mal_id': 34, 'name': 'Hal Film Maker'}]","Due to her father's remarriage, robust 16-year...",Shin Shirayuki-hime Densetsu Prétear,Prétear: The New Legend of Snow White,TV
2,1000,1978-03-14T00:00:00+00:00,1979-02-13T00:00:00+00:00,25 min per ep,42,"['Action', 'Sci-Fi', 'Adventure', 'Space', 'Dr...",2980,Spring 1978,1008.0,PG-13 - Teens 13 or older,7.71,7059,Manga,Finished Airing,"[{'mal_id': 18, 'name': 'Toei Animation'}]",It is 2977 AD and mankind has become stagnant....,Uchuu Kaizoku Captain Herlock,Space Pirate Captain Harlock,TV
3,10003,2008-01-01T00:00:00+00:00,,2 min per ep,15,"['Comedy', 'Dementia', 'Horror', 'Seinen']",6848,,10146.0,R+ - Mild Nudity,5.05,1181,Original,Finished Airing,[],"In these jokey short films, many of them crude...",Kago Shintarou Anime Sakuhin Shuu,,OVA
4,10005,2007-03-31T00:00:00+00:00,,1 hr 35 min,1,"['Action', 'Adventure', 'Mecha', 'Sci-Fi']",10765,,6121.0,G - All Ages,6.43,228,Unknown,Finished Airing,"[{'mal_id': 455, 'name': 'Palm Studio'}]",This theatrical version based on the manga by ...,Tetsujin 28-gou: Hakuchuu no Zangetsu,,Movie


### Extracting the first studio name from the `studios` column

Source: https://stackoverflow.com/questions/71432733/pandas-extracting-a-phrase-in-a-dict-column?noredirect=1#comment126259925_71432733

If the items in the column are stored as plain strings, first convert them into actual Python objects (here, lists of dicts).

In [5]:
# Each cell holds the string repr of a list of studio dicts; parse every cell
# back into the Python object it describes so it can be indexed afterwards.
df['studios'] = df['studios'].map(ast.literal_eval)

Use the `.str` accessor to index into the lists/dicts held in the column (`.str[0]` for the first list item, `.str['name']` for its `name` key), then combine `pipe` and `where` to fall back to the original value wherever `.str` returns NaN (for example, when the studio list is empty).

In [6]:
# Name of the first studio in each list; NaN where there is no first element.
first_studio = df['studios'].str[0].str['name']
# Keep the extracted name where one exists, otherwise leave the original cell
# (e.g. an empty list) in place.
df['studios'] = first_studio.where(first_studio.notna(), df['studios'])
df.head(5)

Unnamed: 0,mal_id,aired_from,aired_to,duration,episodes,genres,popularity,premiered,rank,rating,score,scored_by,source,status,studios,synopsis,title,title_english,type
0,1,1998-04-03T00:00:00+00:00,1999-04-24T00:00:00+00:00,24 min per ep,26,"['Action', 'Adventure', 'Comedy', 'Drama', 'Sc...",38,Spring 1998,27.0,R - 17+ (violence & profanity),8.79,544987,Original,Finished Airing,Sunrise,"In the year 2071, humanity has colonized sever...",Cowboy Bebop,Cowboy Bebop,TV
1,100,2001-04-04T00:00:00+00:00,2001-06-27T00:00:00+00:00,23 min per ep,13,"['Comedy', 'Drama', 'Fantasy', 'Magic', 'Roman...",2075,Spring 2001,2703.0,PG-13 - Teens 13 or older,7.21,23787,Manga,Finished Airing,Hal Film Maker,"Due to her father's remarriage, robust 16-year...",Shin Shirayuki-hime Densetsu Prétear,Prétear: The New Legend of Snow White,TV
2,1000,1978-03-14T00:00:00+00:00,1979-02-13T00:00:00+00:00,25 min per ep,42,"['Action', 'Sci-Fi', 'Adventure', 'Space', 'Dr...",2980,Spring 1978,1008.0,PG-13 - Teens 13 or older,7.71,7059,Manga,Finished Airing,Toei Animation,It is 2977 AD and mankind has become stagnant....,Uchuu Kaizoku Captain Herlock,Space Pirate Captain Harlock,TV
3,10003,2008-01-01T00:00:00+00:00,,2 min per ep,15,"['Comedy', 'Dementia', 'Horror', 'Seinen']",6848,,10146.0,R+ - Mild Nudity,5.05,1181,Original,Finished Airing,[],"In these jokey short films, many of them crude...",Kago Shintarou Anime Sakuhin Shuu,,OVA
4,10005,2007-03-31T00:00:00+00:00,,1 hr 35 min,1,"['Action', 'Adventure', 'Mecha', 'Sci-Fi']",10765,,6121.0,G - All Ages,6.43,228,Unknown,Finished Airing,Palm Studio,This theatrical version based on the manga by ...,Tetsujin 28-gou: Hakuchuu no Zangetsu,,Movie


### Explode the genre list into one row per genre

In [7]:
df['genres'].head(5)

0    ['Action', 'Adventure', 'Comedy', 'Drama', 'Sc...
1    ['Comedy', 'Drama', 'Fantasy', 'Magic', 'Roman...
2    ['Action', 'Sci-Fi', 'Adventure', 'Space', 'Dr...
3           ['Comedy', 'Dementia', 'Horror', 'Seinen']
4           ['Action', 'Adventure', 'Mecha', 'Sci-Fi']
Name: genres, dtype: object

**Convert the values in the genres column to actual list, because it might just look like a list but actually be a string.**

In [8]:
# The genre lists are read from the CSV as strings; parse each one into a real
# Python list so that it can be exploded row-wise.
df['genres'] = df['genres'].map(ast.literal_eval)

**Implementing .explode() for genres column**

In [9]:
# One row per (anime, genre) pair, renumbered from 0. Every other column is repeated
# for each genre of a title, so later per-row statistics count a title once per genre.
# Note that this rebinds `data`, which previously held the CSV path string.
data = df.explode('genres', ignore_index=True)

In [10]:
data.head(5)

Unnamed: 0,mal_id,aired_from,aired_to,duration,episodes,genres,popularity,premiered,rank,rating,score,scored_by,source,status,studios,synopsis,title,title_english,type
0,1,1998-04-03T00:00:00+00:00,1999-04-24T00:00:00+00:00,24 min per ep,26,Action,38,Spring 1998,27.0,R - 17+ (violence & profanity),8.79,544987,Original,Finished Airing,Sunrise,"In the year 2071, humanity has colonized sever...",Cowboy Bebop,Cowboy Bebop,TV
1,1,1998-04-03T00:00:00+00:00,1999-04-24T00:00:00+00:00,24 min per ep,26,Adventure,38,Spring 1998,27.0,R - 17+ (violence & profanity),8.79,544987,Original,Finished Airing,Sunrise,"In the year 2071, humanity has colonized sever...",Cowboy Bebop,Cowboy Bebop,TV
2,1,1998-04-03T00:00:00+00:00,1999-04-24T00:00:00+00:00,24 min per ep,26,Comedy,38,Spring 1998,27.0,R - 17+ (violence & profanity),8.79,544987,Original,Finished Airing,Sunrise,"In the year 2071, humanity has colonized sever...",Cowboy Bebop,Cowboy Bebop,TV
3,1,1998-04-03T00:00:00+00:00,1999-04-24T00:00:00+00:00,24 min per ep,26,Drama,38,Spring 1998,27.0,R - 17+ (violence & profanity),8.79,544987,Original,Finished Airing,Sunrise,"In the year 2071, humanity has colonized sever...",Cowboy Bebop,Cowboy Bebop,TV
4,1,1998-04-03T00:00:00+00:00,1999-04-24T00:00:00+00:00,24 min per ep,26,Sci-Fi,38,Spring 1998,27.0,R - 17+ (violence & profanity),8.79,544987,Original,Finished Airing,Sunrise,"In the year 2071, humanity has colonized sever...",Cowboy Bebop,Cowboy Bebop,TV


### Data Information + Rows and Columns

In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35984 entries, 0 to 35983
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   mal_id         35984 non-null  int64  
 1   aired_from     35977 non-null  object 
 2   aired_to       20657 non-null  object 
 3   duration       35984 non-null  object 
 4   episodes       35984 non-null  int64  
 5   genres         35969 non-null  object 
 6   popularity     35984 non-null  int64  
 7   premiered      13621 non-null  object 
 8   rank           33954 non-null  float64
 9   rating         35984 non-null  object 
 10  score          35984 non-null  float64
 11  scored_by      35984 non-null  int64  
 12  source         35984 non-null  object 
 13  status         35984 non-null  object 
 14  studios        35984 non-null  object 
 15  synopsis       35465 non-null  object 
 16  title          35984 non-null  object 
 17  title_english  19120 non-null  object 
 18  type  

In [12]:
data.shape

(35984, 19)

### Looking for missing value within the dataset

In [13]:
data.isnull().sum()

mal_id               0
aired_from           7
aired_to         15327
duration             0
episodes             0
genres              15
popularity           0
premiered        22363
rank              2030
rating               0
score                0
scored_by            0
source               0
status               0
studios              0
synopsis           519
title                0
title_english    16864
type                 0
dtype: int64

### Extracting Season and Year from the premiered column to create two new columns

In [14]:
# Split e.g. "Spring 1998" on whitespace into separate season / year columns.
# The year stays a string; rows without a premiere value get NaN in both columns.
season_and_year = data['premiered'].str.split(expand=True)
data[['premiered_season', 'premiered_year']] = season_and_year

In [15]:
data.head(5)

Unnamed: 0,mal_id,aired_from,aired_to,duration,episodes,genres,popularity,premiered,rank,rating,...,scored_by,source,status,studios,synopsis,title,title_english,type,premiered_season,premiered_year
0,1,1998-04-03T00:00:00+00:00,1999-04-24T00:00:00+00:00,24 min per ep,26,Action,38,Spring 1998,27.0,R - 17+ (violence & profanity),...,544987,Original,Finished Airing,Sunrise,"In the year 2071, humanity has colonized sever...",Cowboy Bebop,Cowboy Bebop,TV,Spring,1998
1,1,1998-04-03T00:00:00+00:00,1999-04-24T00:00:00+00:00,24 min per ep,26,Adventure,38,Spring 1998,27.0,R - 17+ (violence & profanity),...,544987,Original,Finished Airing,Sunrise,"In the year 2071, humanity has colonized sever...",Cowboy Bebop,Cowboy Bebop,TV,Spring,1998
2,1,1998-04-03T00:00:00+00:00,1999-04-24T00:00:00+00:00,24 min per ep,26,Comedy,38,Spring 1998,27.0,R - 17+ (violence & profanity),...,544987,Original,Finished Airing,Sunrise,"In the year 2071, humanity has colonized sever...",Cowboy Bebop,Cowboy Bebop,TV,Spring,1998
3,1,1998-04-03T00:00:00+00:00,1999-04-24T00:00:00+00:00,24 min per ep,26,Drama,38,Spring 1998,27.0,R - 17+ (violence & profanity),...,544987,Original,Finished Airing,Sunrise,"In the year 2071, humanity has colonized sever...",Cowboy Bebop,Cowboy Bebop,TV,Spring,1998
4,1,1998-04-03T00:00:00+00:00,1999-04-24T00:00:00+00:00,24 min per ep,26,Sci-Fi,38,Spring 1998,27.0,R - 17+ (violence & profanity),...,544987,Original,Finished Airing,Sunrise,"In the year 2071, humanity has colonized sever...",Cowboy Bebop,Cowboy Bebop,TV,Spring,1998


In [16]:
data.columns

Index(['mal_id', 'aired_from', 'aired_to', 'duration', 'episodes', 'genres',
       'popularity', 'premiered', 'rank', 'rating', 'score', 'scored_by',
       'source', 'status', 'studios', 'synopsis', 'title', 'title_english',
       'type', 'premiered_season', 'premiered_year'],
      dtype='object')

### Dropping Columns

In [17]:
# Remove identifier, air-date, synopsis and status columns from the working frame.
data = data.drop(columns=['mal_id', 'aired_from', 'aired_to', 'synopsis', 'status'])

In [18]:
data.head(5)

Unnamed: 0,duration,episodes,genres,popularity,premiered,rank,rating,score,scored_by,source,studios,title,title_english,type,premiered_season,premiered_year
0,24 min per ep,26,Action,38,Spring 1998,27.0,R - 17+ (violence & profanity),8.79,544987,Original,Sunrise,Cowboy Bebop,Cowboy Bebop,TV,Spring,1998
1,24 min per ep,26,Adventure,38,Spring 1998,27.0,R - 17+ (violence & profanity),8.79,544987,Original,Sunrise,Cowboy Bebop,Cowboy Bebop,TV,Spring,1998
2,24 min per ep,26,Comedy,38,Spring 1998,27.0,R - 17+ (violence & profanity),8.79,544987,Original,Sunrise,Cowboy Bebop,Cowboy Bebop,TV,Spring,1998
3,24 min per ep,26,Drama,38,Spring 1998,27.0,R - 17+ (violence & profanity),8.79,544987,Original,Sunrise,Cowboy Bebop,Cowboy Bebop,TV,Spring,1998
4,24 min per ep,26,Sci-Fi,38,Spring 1998,27.0,R - 17+ (violence & profanity),8.79,544987,Original,Sunrise,Cowboy Bebop,Cowboy Bebop,TV,Spring,1998


#### Drop premiered column

In [19]:
# The combined column is redundant now that premiered_season / premiered_year exist.
data = data.drop(columns='premiered')

In [20]:
data.head(5)

Unnamed: 0,duration,episodes,genres,popularity,rank,rating,score,scored_by,source,studios,title,title_english,type,premiered_season,premiered_year
0,24 min per ep,26,Action,38,27.0,R - 17+ (violence & profanity),8.79,544987,Original,Sunrise,Cowboy Bebop,Cowboy Bebop,TV,Spring,1998
1,24 min per ep,26,Adventure,38,27.0,R - 17+ (violence & profanity),8.79,544987,Original,Sunrise,Cowboy Bebop,Cowboy Bebop,TV,Spring,1998
2,24 min per ep,26,Comedy,38,27.0,R - 17+ (violence & profanity),8.79,544987,Original,Sunrise,Cowboy Bebop,Cowboy Bebop,TV,Spring,1998
3,24 min per ep,26,Drama,38,27.0,R - 17+ (violence & profanity),8.79,544987,Original,Sunrise,Cowboy Bebop,Cowboy Bebop,TV,Spring,1998
4,24 min per ep,26,Sci-Fi,38,27.0,R - 17+ (violence & profanity),8.79,544987,Original,Sunrise,Cowboy Bebop,Cowboy Bebop,TV,Spring,1998


#### Drop English title column

In [21]:
# Keep only the primary `title` column; the English title is missing for many rows.
data = data.drop(columns='title_english')

In [22]:
data.head(5)

Unnamed: 0,duration,episodes,genres,popularity,rank,rating,score,scored_by,source,studios,title,type,premiered_season,premiered_year
0,24 min per ep,26,Action,38,27.0,R - 17+ (violence & profanity),8.79,544987,Original,Sunrise,Cowboy Bebop,TV,Spring,1998
1,24 min per ep,26,Adventure,38,27.0,R - 17+ (violence & profanity),8.79,544987,Original,Sunrise,Cowboy Bebop,TV,Spring,1998
2,24 min per ep,26,Comedy,38,27.0,R - 17+ (violence & profanity),8.79,544987,Original,Sunrise,Cowboy Bebop,TV,Spring,1998
3,24 min per ep,26,Drama,38,27.0,R - 17+ (violence & profanity),8.79,544987,Original,Sunrise,Cowboy Bebop,TV,Spring,1998
4,24 min per ep,26,Sci-Fi,38,27.0,R - 17+ (violence & profanity),8.79,544987,Original,Sunrise,Cowboy Bebop,TV,Spring,1998


In [23]:
data.describe()

Unnamed: 0,episodes,popularity,rank,score,scored_by
count,35984.0,35984.0,33954.0,35984.0,35984.0
mean,12.826228,5104.415379,4533.252371,6.758248,33476.79
std,39.255418,3495.112622,2948.21952,0.855211,95459.1
min,1.0,1.0,1.0,1.86,100.0
25%,1.0,2076.0,1987.0,6.22,863.75
50%,3.0,4587.0,4215.0,6.79,3886.0
75%,13.0,7805.25,6856.0,7.35,22434.0
max,1818.0,14659.0,10621.0,9.22,1524129.0


### Fill NaN with the column mode or an empty string

In [24]:
# Impute the remaining gaps so that no column is left with NaN.
#
# rank / premiered_year: fill with the most frequent value. Series.mode() ignores NaN
# by default, so no explicit dropna() is needed.
# NOTE(review): rows are one-per-genre at this point, so the mode is weighted by how many
# genres a title has - confirm this is the intended imputation.
data['rank'] = data['rank'].fillna(data['rank'].mode().iloc[0])
data['premiered_year'] = data['premiered_year'].fillna(data['premiered_year'].mode().iloc[0])

# genres / premiered_season: fill with an empty string. The result is assigned back rather
# than calling fillna(inplace=True) on the column selection: that form is chained
# assignment, which warns on recent pandas and leaves `data` unchanged under Copy-on-Write.
data['genres'] = data['genres'].fillna('')
data['premiered_season'] = data['premiered_season'].fillna('')

# Display the per-column null counts to confirm that nothing is missing any more.
data.isnull().sum()

duration            0
episodes            0
genres              0
popularity          0
rank                0
rating              0
score               0
scored_by           0
source              0
studios             0
title               0
type                0
premiered_season    0
premiered_year      0
dtype: int64

# Apriori algorithm

In [25]:
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from mlxtend.preprocessing import TransactionEncoder

In [26]:
# One-off preprocessing, kept commented out for reference: flattens each user's `watched`
# list into one (user_id, mal_id, rating) row per entry and saves the result.
# NOTE(review): this writes 'user_score_data' (no extension), whereas the cell that loads
# the scores reads '.../cs6220-sp22-main/datasets/user_score_data.csv' - confirm the file
# was renamed/moved by hand before relying on a fresh run.
# user_df = pd.read_csv('/Users/kristineumeh/Desktop/GradSchool/DataMining/AnimeProject/archive/user_data.csv')
# user_df.insert(0, 'user_id', range(1, 1 + len(user_df)))
# user_watched = user_df[['user_id', 'watched']]

# import ast
# user_data = []

# for i in range(len(user_df)):
#     row = user_watched.iloc[i].watched
#     row = row.strip('][').split('}, ')
#     for item in row:
#         row_dict = {}
#         if (item[-1] != "}"):
#             item = item + "}"
#         item_dict = ast.literal_eval(item)
#         row_dict['user_id'] = user_watched.iloc[i].user_id
#         row_dict['mal_id'] = item_dict['mal_id']
#         row_dict['rating'] = item_dict['score']
#         user_data.append(row_dict)

# df_user_data = pd.DataFrame(user_data)
# df_user_data.to_csv('user_score_data')

In [27]:
# One-off preprocessing, kept commented out for reference: builds one
# (user_id, mal_id, favorited=1) row per id found in a user's `favorites` entry.
# NOTE(review): the `for item in favorites_lst` loop never uses `item`; every pass re-scans
# the whole raw `row` string, and the regex collects every integer that appears after the
# first "mal_id': " - so numbers that are not mal_ids may be captured. Verify before reuse.
# user_df = pd.read_csv('/Users/kristineumeh/Desktop/GradSchool/DataMining/AnimeProject/archive/user_data.csv')
# user_df.insert(0, 'user_id', range(1, 1 + len(user_df)))
# user_favorites = user_df[['user_id', 'favorites']]

# import ast
# import re
# user_data = []

# for i in range(len(user_df)):
#     row = user_favorites.iloc[i].favorites
#     row_dict = ast.literal_eval(row)
#     favorites_lst = row_dict['anime']
#     mal_ids = []
#     for item in favorites_lst:
#         before, key, after = row.partition("mal_id': ")
#         mal_ids = re.findall(r'\b\d+\b', after)
#     for mal_id in mal_ids:
#         row_dict = {}
#         row_dict['user_id'] = user_favorites.iloc[i].user_id
#         row_dict['mal_id'] = mal_id
#         row_dict['favorited'] = 1
#         user_data.append(row_dict)

# df_user_favorite_data = pd.DataFrame(user_data)
# df_user_favorite_data.to_csv('user_favorited_data')

In [28]:
# Per-user scores: only the three columns needed, with compact dtypes.
user_data_df = pd.read_csv(
    '/Users/kristineumeh/Desktop/GradSchool/DataMining/AnimeProject/cs6220-sp22-main/datasets/user_score_data.csv',
    dtype={'user_id': 'int32', 'mal_id': 'int32', 'rating': 'float32'},
    usecols=['user_id', 'mal_id', 'rating'],
)
# Anime lookup table: id -> title.
animes_df = pd.read_csv(
    '/Users/kristineumeh/Desktop/GradSchool/DataMining/AnimeProject/cs6220-sp22-main/datasets/anime_data.csv',
    dtype={'mal_id': 'int32', 'title': 'string'},
    usecols=['mal_id', 'title'],
)

In [29]:
user_data_df.head(5)

Unnamed: 0,user_id,mal_id,rating
0,1,29978,6.0
1,1,2467,10.0
2,1,28789,6.0
3,1,34881,6.0
4,1,101,10.0


In [30]:
animes_df.head(5)

Unnamed: 0,mal_id,title
0,1,Cowboy Bebop
1,100,Shin Shirayuki-hime Densetsu Prétear
2,1000,Uchuu Kaizoku Captain Herlock
3,10003,Kago Shintarou Anime Sakuhin Shuu
4,10005,Tetsujin 28-gou: Hakuchuu no Zangetsu


In [31]:
# Make sure the mal_id column of the anime dataframe is int32 so it matches the dtype of
# user_data_df['mal_id'] for the merge below. The read_csv call that builds animes_df
# already requests int32 for mal_id, so this cast is only a safeguard.
animes_df = animes_df.astype({'mal_id': 'int32'})

You can merge the two dataframes on the common column mal_id to obtain the records of user_data_df combined with the corresponding anime title from animes_df.

In [32]:
# Attach the anime title to every score row. merge() defaults to an inner join, so score
# rows whose mal_id has no match in animes_df are dropped. This rebinds `df`, which held
# the raw anime frame earlier in the notebook.
df = user_data_df.merge(animes_df[['mal_id', 'title']], on='mal_id')
df.tail(200)

Unnamed: 0,user_id,mal_id,rating,title
931551,1823,8956,3.0,Denkou Chou Tokkyuu Hikarian
931552,2193,8956,7.0,Denkou Chou Tokkyuu Hikarian
931553,1473,3825,7.0,Dokaben
931554,1604,3825,7.0,Dokaben
931555,1755,3825,6.0,Dokaben
...,...,...,...,...
931746,2092,37896,7.0,Ling Yu 6th Season
931747,2193,37896,5.0,Ling Yu 6th Season
931748,2092,42044,6.0,Minegishi-san wa Ootsu-kun ni Tabesasetai
931749,2092,41528,6.0,Xing Chen Bian: Yu Li Cang Hai


Ensure there are no duplicate records for any given combination of user_id and title

In [33]:
# pivot() below requires each (user_id, title) pair to be unique; keep the first
# occurrence of any repeated pair.
df = df.drop_duplicates(subset=['user_id', 'title'])

The apriori model needs data in a format where the user_id forms the index, the columns are the anime titles, and the values are 1 or 0 depending on whether that user has watched the anime in the corresponding column. The resulting table is like a watchlist for each user_id, with 1 in the columns of the anime that the user has watched and 0 otherwise.

In [34]:
# Wide user x title matrix of ratings; pairs with no score row come out as NaN.
df_pivot = df.pivot(index='user_id', columns='title', values='rating')
# Treat "no score row" as 0.
# NOTE(review): a stored rating of 0 becomes indistinguishable from "not present" - confirm
# that is acceptable for the watched/not-watched encoding that follows.
df_pivot = df_pivot.fillna(0)

In [35]:
df_pivot.head(5)

title,"""0""","""Bungaku Shoujo"" Kyou no Oyatsu: Hatsukoi","""Bungaku Shoujo"" Memoire","""Bungaku Shoujo"" Movie","""Calpis"" Hakkou Monogatari","""Eiji""","""Eiyuu"" Kaitai","""Kiss Dekiru Gyoza"" x Mameshiba Movie","""Parade"" de Satie","""R100"" x Mameshiba Original Manners",...,s.CRY.ed Alteration I: Tao,s.CRY.ed Alteration II: Quan,the FLY BanD!,xxxHOLiC,xxxHOLiC Kei,xxxHOLiC Movie: Manatsu no Yoru no Yume,xxxHOLiC Rou,xxxHOLiC Shunmuki,ēlDLIVE,◯
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,9.0,9.0,7.0,9.0,9.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0


You need to convert the ratings to 0 or 1 and also convert all float values to int.

In [36]:
# Cast the float ratings to 32-bit integers.
df_pivot = df_pivot.astype(np.int32)

In [37]:
def encode_ratings(x):
    """Encode a rating as a watched flag.

    Parameters
    ----------
    x : int or float
        A single rating value.

    Returns
    -------
    int
        1 if ``x`` is strictly positive, otherwise 0. Zero, negative values and NaN all
        map to 0, so the function always returns an int (the previous two-branch version
        fell through and returned None for values between 0 and 1 and for NaN).
    """
    # Every comparison with NaN is False, so NaN lands in the 0 branch.
    return 1 if x > 0 else 0

# Binarise the whole matrix in one vectorised comparison: 1 where the rating is positive,
# 0 otherwise. For the integer ratings held in df_pivot this gives the same 0/1 result as
# applying encode_ratings element-wise, without DataFrame.applymap (deprecated in recent
# pandas) and without one Python call per cell.
df_pivot = df_pivot.gt(0).astype('int64')

In [38]:
df_pivot.head()

title,"""0""","""Bungaku Shoujo"" Kyou no Oyatsu: Hatsukoi","""Bungaku Shoujo"" Memoire","""Bungaku Shoujo"" Movie","""Calpis"" Hakkou Monogatari","""Eiji""","""Eiyuu"" Kaitai","""Kiss Dekiru Gyoza"" x Mameshiba Movie","""Parade"" de Satie","""R100"" x Mameshiba Original Manners",...,s.CRY.ed Alteration I: Tao,s.CRY.ed Alteration II: Quan,the FLY BanD!,xxxHOLiC,xxxHOLiC Kei,xxxHOLiC Movie: Manatsu no Yoru no Yume,xxxHOLiC Rou,xxxHOLiC Shunmuki,ēlDLIVE,◯
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,1,1,1,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [41]:
# Mine itemsets of titles that co-occur for at least half of all users
# (min_support=0.5); itemsets are labelled with column names rather than indices.
frequent_items = apriori(
    df_pivot,
    use_colnames=True,
    min_support=0.5,
)

In [43]:
frequent_items.head()

Unnamed: 0,support,itemsets
0,0.50615,(Akame ga Kill!)
1,0.609112,(Angel Beats!)
2,0.524374,(Ano Hi Mita Hana no Namae wo Bokutachi wa Mad...
3,0.529385,(Another)
4,0.627335,(Boku dake ga Inai Machi)


In [44]:
# Derive association rules from the frequent itemsets, keeping those with lift >= 1.
rules = association_rules(
    frequent_items,
    min_threshold=1,
    metric="lift",
)

In [45]:
rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Angel Beats!),(Boku no Hero Academia),0.609112,0.744419,0.50205,0.824233,1.107217,0.048616,1.454092
1,(Boku no Hero Academia),(Angel Beats!),0.744419,0.609112,0.50205,0.674419,1.107217,0.048616,1.200586
2,(Shingeki no Kyojin),(Angel Beats!),0.783144,0.609112,0.523462,0.668412,1.097355,0.046441,1.178837
3,(Angel Beats!),(Shingeki no Kyojin),0.609112,0.783144,0.523462,0.859387,1.097355,0.046441,1.542219
4,(Boku dake ga Inai Machi),(Boku no Hero Academia),0.627335,0.744419,0.535763,0.854031,1.147244,0.068763,1.75092


Let's sort the rules by lift in descending order, so that the anime most strongly associated with what the user has already watched is recommended first.

In [46]:
# Strongest associations first.
result_df = rules.sort_values('lift', ascending=False)
result_df.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
216,(Shingeki no Kyojin Season 3),(Shingeki no Kyojin Season 3 Part 2),0.553986,0.518451,0.501139,0.904605,1.744823,0.213924,5.047962
217,(Shingeki no Kyojin Season 3 Part 2),(Shingeki no Kyojin Season 3),0.518451,0.553986,0.501139,0.966608,1.744823,0.213924,13.356936
81,(Code Geass: Hangyaku no Lelouch R2),(Code Geass: Hangyaku no Lelouch),0.517084,0.566743,0.511162,0.988546,1.74426,0.218108,37.826704
80,(Code Geass: Hangyaku no Lelouch),(Code Geass: Hangyaku no Lelouch R2),0.566743,0.517084,0.511162,0.901929,1.74426,0.218108,4.924157
147,(Kono Subarashii Sekai ni Shukufuku wo!),(Kono Subarashii Sekai ni Shukufuku wo! 2),0.609567,0.523007,0.515718,0.846039,1.617644,0.19691,3.09814


In [53]:
# Keep only rules whose antecedent is exactly the single title
# 'Code Geass: Hangyaku no Lelouch' (antecedents are stored as frozensets).
is_seed_rule = result_df['antecedents'].apply(
    lambda antecedent: antecedent == frozenset(['Code Geass: Hangyaku no Lelouch'])
)
recomm_df = result_df[is_seed_rule]

In [54]:
# Keep only rules with lift strictly greater than 1.
recomm_df = recomm_df.loc[recomm_df['lift'].gt(1)]

In [55]:
recomm_df.head(5)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
80,(Code Geass: Hangyaku no Lelouch),(Code Geass: Hangyaku no Lelouch R2),0.566743,0.517084,0.511162,0.901929,1.74426,0.218108,4.924157
83,(Code Geass: Hangyaku no Lelouch),(Shingeki no Kyojin),0.566743,0.783144,0.509795,0.899518,1.148599,0.065954,2.158156


In [56]:
anime_rec = recomm_df['consequents'].values

# Flatten the consequent sets into one list of titles, keeping the first occurrence of
# each title so the lift-sorted order of the rules is preserved.
anime_rec_list = list(dict.fromkeys(title for rec in anime_rec for title in rec))

The top anime that the user is most likely to watch can be obtained by taking the first element of the recommendation list.

In [69]:
anime_rec_list[:1]

['Code Geass: Hangyaku no Lelouch R2']