# MyAnimeList Data

## Data Pre-Processing 

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import json
import ast

In [2]:
data = '/Users/soumengchea/Documents/MyAnimeList/anime_data.csv'

In [3]:
df = pd.read_csv(data)

In [4]:
df.head(5)

Unnamed: 0,mal_id,aired_from,aired_to,duration,episodes,genres,popularity,premiered,rank,rating,score,scored_by,source,status,studios,synopsis,title,title_english,type
0,1,1998-04-03T00:00:00+00:00,1999-04-24T00:00:00+00:00,24 min per ep,26,"['Action', 'Adventure', 'Comedy', 'Drama', 'Sc...",38,Spring 1998,27.0,R - 17+ (violence & profanity),8.79,544987,Original,Finished Airing,"[{'mal_id': 14, 'name': 'Sunrise'}]","In the year 2071, humanity has colonized sever...",Cowboy Bebop,Cowboy Bebop,TV
1,100,2001-04-04T00:00:00+00:00,2001-06-27T00:00:00+00:00,23 min per ep,13,"['Comedy', 'Drama', 'Fantasy', 'Magic', 'Roman...",2075,Spring 2001,2703.0,PG-13 - Teens 13 or older,7.21,23787,Manga,Finished Airing,"[{'mal_id': 34, 'name': 'Hal Film Maker'}]","Due to her father's remarriage, robust 16-year...",Shin Shirayuki-hime Densetsu Prétear,Prétear: The New Legend of Snow White,TV
2,1000,1978-03-14T00:00:00+00:00,1979-02-13T00:00:00+00:00,25 min per ep,42,"['Action', 'Sci-Fi', 'Adventure', 'Space', 'Dr...",2980,Spring 1978,1008.0,PG-13 - Teens 13 or older,7.71,7059,Manga,Finished Airing,"[{'mal_id': 18, 'name': 'Toei Animation'}]",It is 2977 AD and mankind has become stagnant....,Uchuu Kaizoku Captain Herlock,Space Pirate Captain Harlock,TV
3,10003,2008-01-01T00:00:00+00:00,,2 min per ep,15,"['Comedy', 'Dementia', 'Horror', 'Seinen']",6848,,10146.0,R+ - Mild Nudity,5.05,1181,Original,Finished Airing,[],"In these jokey short films, many of them crude...",Kago Shintarou Anime Sakuhin Shuu,,OVA
4,10005,2007-03-31T00:00:00+00:00,,1 hr 35 min,1,"['Action', 'Adventure', 'Mecha', 'Sci-Fi']",10765,,6121.0,G - All Ages,6.43,228,Unknown,Finished Airing,"[{'mal_id': 455, 'name': 'Palm Studio'}]",This theatrical version based on the manga by ...,Tetsujin 28-gou: Hakuchuu no Zangetsu,,Movie


### Extracting studio sequences into a new columns

Source: https://stackoverflow.com/questions/71432733/pandas-extracting-a-phrase-in-a-dict-column?noredirect=1#comment126259925_71432733

In case of the items in the column is just string, convert the column into actual object

In [5]:
df['studios'] = df['studios'].apply(ast.literal_eval)

Implementing .str to access indexes/keys from the lists/dicts of items in a column, and use a combination of pipe and where to fallback to the original values where the result from .str to returns NaN

In [6]:
df['studios'] = df['studios'].str[0].str['name'].pipe(lambda x: x.where(x.notna(), df['studios']))
df.head(5)

Unnamed: 0,mal_id,aired_from,aired_to,duration,episodes,genres,popularity,premiered,rank,rating,score,scored_by,source,status,studios,synopsis,title,title_english,type
0,1,1998-04-03T00:00:00+00:00,1999-04-24T00:00:00+00:00,24 min per ep,26,"['Action', 'Adventure', 'Comedy', 'Drama', 'Sc...",38,Spring 1998,27.0,R - 17+ (violence & profanity),8.79,544987,Original,Finished Airing,Sunrise,"In the year 2071, humanity has colonized sever...",Cowboy Bebop,Cowboy Bebop,TV
1,100,2001-04-04T00:00:00+00:00,2001-06-27T00:00:00+00:00,23 min per ep,13,"['Comedy', 'Drama', 'Fantasy', 'Magic', 'Roman...",2075,Spring 2001,2703.0,PG-13 - Teens 13 or older,7.21,23787,Manga,Finished Airing,Hal Film Maker,"Due to her father's remarriage, robust 16-year...",Shin Shirayuki-hime Densetsu Prétear,Prétear: The New Legend of Snow White,TV
2,1000,1978-03-14T00:00:00+00:00,1979-02-13T00:00:00+00:00,25 min per ep,42,"['Action', 'Sci-Fi', 'Adventure', 'Space', 'Dr...",2980,Spring 1978,1008.0,PG-13 - Teens 13 or older,7.71,7059,Manga,Finished Airing,Toei Animation,It is 2977 AD and mankind has become stagnant....,Uchuu Kaizoku Captain Herlock,Space Pirate Captain Harlock,TV
3,10003,2008-01-01T00:00:00+00:00,,2 min per ep,15,"['Comedy', 'Dementia', 'Horror', 'Seinen']",6848,,10146.0,R+ - Mild Nudity,5.05,1181,Original,Finished Airing,[],"In these jokey short films, many of them crude...",Kago Shintarou Anime Sakuhin Shuu,,OVA
4,10005,2007-03-31T00:00:00+00:00,,1 hr 35 min,1,"['Action', 'Adventure', 'Mecha', 'Sci-Fi']",10765,,6121.0,G - All Ages,6.43,228,Unknown,Finished Airing,Palm Studio,This theatrical version based on the manga by ...,Tetsujin 28-gou: Hakuchuu no Zangetsu,,Movie


In [7]:
df['studios'].head(10)

0                   Sunrise
1            Hal Film Maker
2            Toei Animation
3                        []
4               Palm Studio
5    Telecom Animation Film
6                    Lerche
7                 WAO World
8                    Gallop
9                 Ascension
Name: studios, dtype: object

### Looking for missing value within the dataset and fill with 0

In [8]:
df.isnull().sum()

mal_id              0
aired_from          5
aired_to         5368
duration            0
episodes            0
genres              0
popularity          0
premiered        7880
rank             1282
rating              0
score               0
scored_by           0
source              0
status              0
studios             0
synopsis          237
title               0
title_english    5941
type                0
dtype: int64

In [9]:
df['aired_from'] = df['aired_from'].fillna(df['aired_from'].dropna().mode().values[0])
df['aired_to'] = df['aired_to'].fillna(df['aired_to'].dropna().mode().values[0])
df['premiered'] = df['premiered'].fillna(df['premiered'].dropna().mode().values[0])
df['rank'] = df['rank'].fillna(df['rank'].dropna().mode().values[0])
df['synopsis'] = df['synopsis'].fillna(df['synopsis'].dropna().mode().values[0])
df['title_english'] = df['title_english'].fillna(df['title_english'].dropna().mode().values[0])

In [10]:
df.isnull().sum()

mal_id           0
aired_from       0
aired_to         0
duration         0
episodes         0
genres           0
popularity       0
premiered        0
rank             0
rating           0
score            0
scored_by        0
source           0
status           0
studios          0
synopsis         0
title            0
title_english    0
type             0
dtype: int64

### Dropping Columns

In [11]:
df.drop(['mal_id', 'aired_from', 'aired_to', 'synopsis', 'status'], axis=1, inplace=True)

In [12]:
df.head(5)

Unnamed: 0,duration,episodes,genres,popularity,premiered,rank,rating,score,scored_by,source,studios,title,title_english,type
0,24 min per ep,26,"['Action', 'Adventure', 'Comedy', 'Drama', 'Sc...",38,Spring 1998,27.0,R - 17+ (violence & profanity),8.79,544987,Original,Sunrise,Cowboy Bebop,Cowboy Bebop,TV
1,23 min per ep,13,"['Comedy', 'Drama', 'Fantasy', 'Magic', 'Roman...",2075,Spring 2001,2703.0,PG-13 - Teens 13 or older,7.21,23787,Manga,Hal Film Maker,Shin Shirayuki-hime Densetsu Prétear,Prétear: The New Legend of Snow White,TV
2,25 min per ep,42,"['Action', 'Sci-Fi', 'Adventure', 'Space', 'Dr...",2980,Spring 1978,1008.0,PG-13 - Teens 13 or older,7.71,7059,Manga,Toei Animation,Uchuu Kaizoku Captain Herlock,Space Pirate Captain Harlock,TV
3,2 min per ep,15,"['Comedy', 'Dementia', 'Horror', 'Seinen']",6848,Fall 2016,10146.0,R+ - Mild Nudity,5.05,1181,Original,[],Kago Shintarou Anime Sakuhin Shuu,Cyborg 009,OVA
4,1 hr 35 min,1,"['Action', 'Adventure', 'Mecha', 'Sci-Fi']",10765,Fall 2016,6121.0,G - All Ages,6.43,228,Unknown,Palm Studio,Tetsujin 28-gou: Hakuchuu no Zangetsu,Cyborg 009,Movie


### Pulling information of the dataset

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11335 entries, 0 to 11334
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   duration       11335 non-null  object 
 1   episodes       11335 non-null  int64  
 2   genres         11335 non-null  object 
 3   popularity     11335 non-null  int64  
 4   premiered      11335 non-null  object 
 5   rank           11335 non-null  float64
 6   rating         11335 non-null  object 
 7   score          11335 non-null  float64
 8   scored_by      11335 non-null  int64  
 9   source         11335 non-null  object 
 10  studios        11335 non-null  object 
 11  title          11335 non-null  object 
 12  title_english  11335 non-null  object 
 13  type           11335 non-null  object 
dtypes: float64(2), int64(3), object(9)
memory usage: 1.2+ MB


In [14]:
df.shape

(11335, 14)

### Extract genre list into an individual row

In [15]:
df['genres'].head(5)

0    ['Action', 'Adventure', 'Comedy', 'Drama', 'Sc...
1    ['Comedy', 'Drama', 'Fantasy', 'Magic', 'Roman...
2    ['Action', 'Sci-Fi', 'Adventure', 'Space', 'Dr...
3           ['Comedy', 'Dementia', 'Horror', 'Seinen']
4           ['Action', 'Adventure', 'Mecha', 'Sci-Fi']
Name: genres, dtype: object

**Convert the values in the genres column to actual list, because it might just look like a list but actually be a string.**

In [16]:
df['genres'] = df['genres'].apply(ast.literal_eval)

**Implementing .explode() for genres column**

In [17]:
df.explode('genres').reset_index(drop=True)

Unnamed: 0,duration,episodes,genres,popularity,premiered,rank,rating,score,scored_by,source,studios,title,title_english,type
0,24 min per ep,26,Action,38,Spring 1998,27.0,R - 17+ (violence & profanity),8.79,544987,Original,Sunrise,Cowboy Bebop,Cowboy Bebop,TV
1,24 min per ep,26,Adventure,38,Spring 1998,27.0,R - 17+ (violence & profanity),8.79,544987,Original,Sunrise,Cowboy Bebop,Cowboy Bebop,TV
2,24 min per ep,26,Comedy,38,Spring 1998,27.0,R - 17+ (violence & profanity),8.79,544987,Original,Sunrise,Cowboy Bebop,Cowboy Bebop,TV
3,24 min per ep,26,Drama,38,Spring 1998,27.0,R - 17+ (violence & profanity),8.79,544987,Original,Sunrise,Cowboy Bebop,Cowboy Bebop,TV
4,24 min per ep,26,Sci-Fi,38,Spring 1998,27.0,R - 17+ (violence & profanity),8.79,544987,Original,Sunrise,Cowboy Bebop,Cowboy Bebop,TV
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35979,30 min,1,Action,2436,Fall 2016,3141.0,PG-13 - Teens 13 or older,7.10,21174,Manga,Toei Animation,One Piece 3D: Mugiwara Chase,Cyborg 009,Movie
35980,30 min,1,Adventure,2436,Fall 2016,3141.0,PG-13 - Teens 13 or older,7.10,21174,Manga,Toei Animation,One Piece 3D: Mugiwara Chase,Cyborg 009,Movie
35981,30 min,1,Comedy,2436,Fall 2016,3141.0,PG-13 - Teens 13 or older,7.10,21174,Manga,Toei Animation,One Piece 3D: Mugiwara Chase,Cyborg 009,Movie
35982,30 min,1,Fantasy,2436,Fall 2016,3141.0,PG-13 - Teens 13 or older,7.10,21174,Manga,Toei Animation,One Piece 3D: Mugiwara Chase,Cyborg 009,Movie


In [18]:
df.shape

(11335, 14)

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11335 entries, 0 to 11334
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   duration       11335 non-null  object 
 1   episodes       11335 non-null  int64  
 2   genres         11335 non-null  object 
 3   popularity     11335 non-null  int64  
 4   premiered      11335 non-null  object 
 5   rank           11335 non-null  float64
 6   rating         11335 non-null  object 
 7   score          11335 non-null  float64
 8   scored_by      11335 non-null  int64  
 9   source         11335 non-null  object 
 10  studios        11335 non-null  object 
 11  title          11335 non-null  object 
 12  title_english  11335 non-null  object 
 13  type           11335 non-null  object 
dtypes: float64(2), int64(3), object(9)
memory usage: 1.2+ MB


In [20]:
df.isnull().sum()

duration         0
episodes         0
genres           0
popularity       0
premiered        0
rank             0
rating           0
score            0
scored_by        0
source           0
studios          0
title            0
title_english    0
type             0
dtype: int64

### Extracting Season and Year from primier column to create two new columns

In [21]:
df[['premiered_season', 'premiered_year']] = df['premiered'].str.split(expand = True)

In [22]:
df.head(10)

Unnamed: 0,duration,episodes,genres,popularity,premiered,rank,rating,score,scored_by,source,studios,title,title_english,type,premiered_season,premiered_year
0,24 min per ep,26,"[Action, Adventure, Comedy, Drama, Sci-Fi, Space]",38,Spring 1998,27.0,R - 17+ (violence & profanity),8.79,544987,Original,Sunrise,Cowboy Bebop,Cowboy Bebop,TV,Spring,1998
1,23 min per ep,13,"[Comedy, Drama, Fantasy, Magic, Romance, Shouj...",2075,Spring 2001,2703.0,PG-13 - Teens 13 or older,7.21,23787,Manga,Hal Film Maker,Shin Shirayuki-hime Densetsu Prétear,Prétear: The New Legend of Snow White,TV,Spring,2001
2,25 min per ep,42,"[Action, Sci-Fi, Adventure, Space, Drama, Seinen]",2980,Spring 1978,1008.0,PG-13 - Teens 13 or older,7.71,7059,Manga,Toei Animation,Uchuu Kaizoku Captain Herlock,Space Pirate Captain Harlock,TV,Spring,1978
3,2 min per ep,15,"[Comedy, Dementia, Horror, Seinen]",6848,Fall 2016,10146.0,R+ - Mild Nudity,5.05,1181,Original,[],Kago Shintarou Anime Sakuhin Shuu,Cyborg 009,OVA,Fall,2016
4,1 hr 35 min,1,"[Action, Adventure, Mecha, Sci-Fi]",10765,Fall 2016,6121.0,G - All Ages,6.43,228,Unknown,Palm Studio,Tetsujin 28-gou: Hakuchuu no Zangetsu,Cyborg 009,Movie,Fall,2016
5,23 min,1,"[Adventure, Drama, Shounen]",8931,Fall 2016,4970.0,PG-13 - Teens 13 or older,6.67,776,Original,Telecom Animation Film,Tide-Line Blue: Kyoudai,Cyborg 009,Special,Fall,2016
6,14 min per ep,12,"[Comedy, Parody, Supernatural]",949,Fall 2016,655.0,PG-13 - Teens 13 or older,7.9,66932,Manga,Lerche,Carnival Phantasm,Cyborg 009,OVA,Fall,2016
7,25 min per ep,13,"[Drama, Historical]",6231,Spring 2011,3851.0,PG - Children,6.93,1122,Original,WAO World,Shouwa Monogatari,Cyborg 009,TV,Spring,2011
8,24 min per ep,73,"[Action, Fantasy, Game, Shounen]",2167,Spring 2011,6482.0,PG-13 - Teens 13 or older,6.35,23161,Manga,Gallop,Yu☆Gi☆Oh! Zexal,Yu-Gi-Oh! Zexal,TV,Spring,2011
9,24 min,1,"[Comedy, Martial Arts]",5948,Fall 2016,5319.0,G - All Ages,6.6,3147,Unknown,Ascension,Kizuna Ichigeki,Cyborg 009,Movie,Fall,2016


In [23]:
df.isnull().sum()

duration            0
episodes            0
genres              0
popularity          0
premiered           0
rank                0
rating              0
score               0
scored_by           0
source              0
studios             0
title               0
title_english       0
type                0
premiered_season    0
premiered_year      0
dtype: int64

### Drop Premier column

In [24]:
df.drop(['premiered'], axis=1, inplace=True)

In [25]:
df.head(5)

Unnamed: 0,duration,episodes,genres,popularity,rank,rating,score,scored_by,source,studios,title,title_english,type,premiered_season,premiered_year
0,24 min per ep,26,"[Action, Adventure, Comedy, Drama, Sci-Fi, Space]",38,27.0,R - 17+ (violence & profanity),8.79,544987,Original,Sunrise,Cowboy Bebop,Cowboy Bebop,TV,Spring,1998
1,23 min per ep,13,"[Comedy, Drama, Fantasy, Magic, Romance, Shouj...",2075,2703.0,PG-13 - Teens 13 or older,7.21,23787,Manga,Hal Film Maker,Shin Shirayuki-hime Densetsu Prétear,Prétear: The New Legend of Snow White,TV,Spring,2001
2,25 min per ep,42,"[Action, Sci-Fi, Adventure, Space, Drama, Seinen]",2980,1008.0,PG-13 - Teens 13 or older,7.71,7059,Manga,Toei Animation,Uchuu Kaizoku Captain Herlock,Space Pirate Captain Harlock,TV,Spring,1978
3,2 min per ep,15,"[Comedy, Dementia, Horror, Seinen]",6848,10146.0,R+ - Mild Nudity,5.05,1181,Original,[],Kago Shintarou Anime Sakuhin Shuu,Cyborg 009,OVA,Fall,2016
4,1 hr 35 min,1,"[Action, Adventure, Mecha, Sci-Fi]",10765,6121.0,G - All Ages,6.43,228,Unknown,Palm Studio,Tetsujin 28-gou: Hakuchuu no Zangetsu,Cyborg 009,Movie,Fall,2016
