In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb

## Read csv
## Extract the columns that we identified as useful features
## Seperate data into two sets, one for TV series and another for Movies (from now on we focus on TV)

In [2]:
raw_data = pd.read_csv(r"Datasets/MAL Anime Top 10000 Details.csv.zip")

new_data = pd.DataFrame(raw_data[["Anime Title","Type","Episodes","Aired","Producers","Studios",
                                  "Source","Genres","Score","Members","Favorites"]])

TV_data = new_data.loc[new_data["Type"]=="TV"]
Movie_data = new_data.loc[new_data["Type"]=="Movie"]

## Create an array of studio names

In [3]:
TV_Studios_array=TV_data["Studios"].unique()

In [4]:
source = TV_data['Source']
genres = TV_data['Genres']
score = TV_data['Score']
members = pd.DataFrame(TV_data['Members'])
fav = TV_data['Favorites']
studios = TV_data['Studios']

## Create a new column to represent Favorites as a percentage of Members, to use as another response variable

In [5]:
TV_data.insert(loc=11, column='Fav%', value=round(TV_data['Favorites']*100/TV_data['Members'], 3))

## Group sources based on their similarities as the raw data had too many types of sources
## Convert to integer for ease of access

### 0 - All manga + Picture book

### 1 - Novel, Light Novel, Book

### 2 - All game + Visual novel

### 3 - Other + Unknown

### 4 - Music + Radio

### 5 - Original

In [6]:
source.unique()

array(['Manga', 'Visual novel', 'Light novel', 'Original', 'Web manga',
       'Novel', '4-koma manga', 'Card game', 'Book', 'Game', 'Other',
       'Unknown', 'Music', 'Picture book', 'Digital manga', 'Radio'],
      dtype=object)

In [7]:
source_list = source.tolist() #list of sources based on index in Tv_data
source_list_unique = ["Manga", "Novel", "Game", "Other", "Audio", "Original"] #compresed source_reference list

In [8]:
a = [0] * len(TV_data)
for i in range(len(source_list)):
    if source_list[i] in ["Manga", "Web manga", "4-koma manga", "Digital manga", "Picture book"]:
        a[i] = source_list_unique.index("Manga")
    elif source_list[i] in ["Visual novel", "Light novel", "Book"]:
        a[i] = source_list_unique.index("Novel")
    elif source_list[i] in ["Card game", "Game", "Visual novel"]:
        a[i] = source_list_unique.index("Game")
    elif source_list[i] in ["Other", "Unknown"]:
        a[i] = source_list_unique.index("Other")
    elif source_list[i] in ["Radio", "Music"]:
        a[i] = source_list_unique.index("Audio")
    elif source_list[i] == "Original":
        a[i] = source_list_unique.index("Original")
TV_data["Source_reference"] = a

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  TV_data["Source_reference"] = a


In [9]:
studio_reference_unique = [0] * 335
index = 0
for i in range(len(TV_data)):
    temp = studios.iloc[i]
    if(isinstance(temp,str)):
        temp = temp.split(",")
        for j in range(len(temp)):
            if(temp[j].strip() not in studio_reference_unique):
                studio_reference_unique[index] = temp[j].strip()
                index += 1

## The "Genres" column in the raw data was a single string of comma seperated genres in order of significance
## Thus we extracted the first two genres and created new columns
## Convert to integer for ease of access

In [10]:
genres_list_unique = [0] * 100
index = 0;
for i in range(len(genres)):
    temp_list = genres.iloc[i].split(",")
    for element in temp_list:
        if(element.strip() not in genres_list_unique):
            genres_list_unique[index] = element.strip()
            index += 1
genres_list_unique = [i for i in genres_list_unique if i != 0] 
genres_list_unique.insert(0, "Null") #genres_list_unique contains all possible genre, first index is NULL(no genre)

In [11]:
Genres = pd.DataFrame(genres_list_unique)

In [15]:
Genres = Genres.drop([31, 38, 35, 39, 40])

In [16]:
Genres

Unnamed: 0,0
0,Null
1,Action
2,Military
3,Adventure
4,Comedy
5,Drama
6,Magic
7,Fantasy
8,Shounen
9,Thriller


In [17]:
Genres = Genres.drop([34, 37])

In [18]:
Genres

Unnamed: 0,0
0,Null
1,Action
2,Military
3,Adventure
4,Comedy
5,Drama
6,Magic
7,Fantasy
8,Shounen
9,Thriller


In [12]:
a = [0] * len(TV_data)
b = [0] * len(TV_data)
for i in range(len(TV_data)):
    word = TV_data["Genres"].iloc[i].split(",")
    a[i] = genres_list_unique.index(word[0].strip())
    if(len(word)>1):
        b[i] = genres_list_unique.index(word[1].strip())

In [13]:
TV_data["genre_reference_1"] = a
TV_data["genre_reference_2"] = b

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  TV_data["genre_reference_1"] = a
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  TV_data["genre_reference_2"] = b


## Convert Aired date into Seasons in integer form for ease of access

In [14]:
c = [0] * len(TV_data)
for i  in range(len(TV_data)):
    temp = TV_data["Aired"].iloc[i][:3]
    if temp == 'Jan' or temp == 'Feb' or temp == 'Mar': #Winter
        c[i] = 1
    elif temp == 'Apr' or temp == 'May' or temp == 'Jun': #Spring
        c[i] = 2
    elif temp == 'Jul' or temp == 'Aug' or temp == 'Sep': # Summer
        c[i] = 3
    else:                                                #Autumn/Fall(americans)
        c[i] = 4
TV_data["Season"] = c

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  TV_data["Season"] = c


In [15]:
import difflib

## Read new csv containing manga ratings

In [16]:
manga_raw = pd.read_csv(r"Datasets\manga.csv.zip")

## Use SequenceMatcher to match mangas to their corresponding anime seasons and insert the respective manga ratings in the correct row in our main dataset

In [17]:
TV_data_manga = TV_data.loc[TV_data["Source_reference"]==0]
a = [0] * len(TV_data_manga) 
for i in range(len(TV_data_manga)): 
    anime_title = TV_data_manga["Anime Title"].iloc[i] 
    for j in range(len(manga_raw)): 
        manga_title = manga_raw["title"].iloc[j] 
        if(difflib.SequenceMatcher(None, anime_title,manga_title).ratio() > 0.5): 
            a[i] = manga_raw["score"].iloc[j] 
            break 
    if i%100 ==0:
        print("%d. %f"%(i, a[i]))

0. 9.040000
100. 8.250000
200. 7.500000
300. 6.600000
400. 7.790000
500. 7.340000
600. 6.900000
700. 7.600000
800. 7.870000
900. 6.780000
1000. 7.170000
1100. 7.280000
1200. 7.200000
1300. 7.060000
1400. 7.290000
1500. 0.000000
1600. 7.440000


## Create a new dataset which only contains anime which have manga as their source material

In [18]:
TV_data_manga = TV_data.loc[TV_data["Source_reference"]==0]
TV_data_manga["Manga_score"]=a

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  TV_data_manga["Manga_score"]=a


## Remove animes that do not have a manga rating

In [19]:
TV_data_manga=TV_data_manga.loc[TV_data_manga["Manga_score"]>0]

## Create a new column to represent Score as a classification variable
### Score < 6.5 -> Low
### 6.5 <= Score <= 8.5 -> Medium
### Score > 8.5 -> High

In [20]:
TV_data_manga.insert(loc = 1, column = "Score_reference", value = "Medium")

In [21]:
for i in range(len(TV_data_manga)):
    if TV_data_manga["Score"].iloc[i] >8.5:
        TV_data_manga["Score_reference"].iloc[i] = "High"
    if TV_data_manga["Score"].iloc[i] <6.5:
        TV_data_manga["Score_reference"].iloc[i] = "Low" #SCORE REFERENCE IS FROM ANIME SCORE

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  TV_data_manga["Score_reference"].iloc[i] = "High"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  TV_data_manga["Score_reference"].iloc[i] = "Low" #SCORE REFERENCE IS FROM ANIME SCORE


## Create a new column to represent Score as another classification variable 

In [22]:
TV_data_manga.insert(loc = 2, column = "Score_reference2", value = "Bad")

In [23]:
for i in range(len(TV_data_manga)):
    if TV_data_manga["Score"].iloc[i] >7.5:
        TV_data_manga["Score_reference2"].iloc[i] = "Good"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  TV_data_manga["Score_reference2"].iloc[i] = "Good"


In [24]:
TV_data.insert(loc = 2, column = "Score_reference2", value = "Bad")

In [25]:
for i in range(len(TV_data)):
    if TV_data["Score"].iloc[i] >7.5:
        TV_data["Score_reference2"].iloc[i] = "Good"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  TV_data["Score_reference2"].iloc[i] = "Good"


## Create a new column for age of each show

In [26]:
Aired = TV_data_manga["Aired"]
a = []
for i in range(0,1522):
    if i==824:
        year = int(Aired.iloc[824][0:4])
    else:
        year = int(Aired.iloc[i].split(",")[1].strip()[0:4])
    age = 2020 - year
    a.append(age)

In [27]:
TV_data_manga["Age"] = a

In [28]:
TV_data_manga 

Unnamed: 0,Anime Title,Score_reference,Score_reference2,Type,Episodes,Aired,Producers,Studios,Source,Genres,Score,Members,Favorites,Fav%,Source_reference,genre_reference_1,genre_reference_2,Season,Manga_score,Age
0,Fullmetal Alchemist: Brotherhood,High,Good,TV,64,"Apr 5, 2009 to Jul 4, 2010","Aniplex, Square Enix, Mainichi Broadcasting Sy...",Bones,Manga,"Action, Military, Adventure, Comedy, Drama, Ma...",9.22,2025613,167812,8.285,0,1,2,2,9.04,11
2,Gintama°,High,Good,TV,51,"Apr 8, 2015 to Mar 30, 2016","TV Tokyo, Aniplex, Dentsu",Bandai Namco Pictures,Manga,"Action, Comedy, Historical, Parody, Samurai, S...",9.11,363879,10353,2.845,0,1,4,2,8.62,5
3,Hunter x Hunter (2011),High,Good,TV,148,"Oct 2, 2011 to Sep 24, 2014","VAP, Nippon Television Network, Shueisha",Madhouse,Manga,"Action, Adventure, Fantasy, Shounen, Super Power",9.11,1459320,130492,8.942,0,1,3,4,8.72,9
5,Gintama',High,Good,TV,51,"Apr 4, 2011 to Mar 26, 2012","TV Tokyo, Aniplex, Dentsu, Trinity Sound, Mira...",Sunrise,Manga,"Action, Sci-Fi, Comedy, Historical, Parody, Sa...",9.08,337198,6013,1.783,0,1,10,2,8.62,9
6,Shingeki no Kyojin Season 3 Part 2,High,Good,TV,10,"Apr 29, 2019 to Jul 1, 2019","Production I.G, Dentsu, Mainichi Broadcasting ...",Wit Studio,Manga,"Action, Military, Mystery, Super Power, Drama,...",9.07,815370,28479,3.493,0,1,2,2,6.78,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9841,Makeruna!! Aku no Gundan!,Low,Bad,TV,12,"Apr 5, 2017 to Jun 21, 2017",Bushiroad,Tatsunoko Production,Manga,"Comedy, Shounen",5.33,1628,0,0.000,0,4,8,2,6.60,3
9902,Nobunaga-sensei no Osanazuma,Low,Bad,TV,12,"Apr 6, 2019 to Jun 22, 2019",,Seven,Manga,"Comedy, Ecchi, Harem, Romance, School",5.29,52359,78,0.149,0,4,35,2,7.71,1
9978,Barom One,Low,Bad,TV,13,"Dec 7, 2002 to Mar 22, 2003",AT-X,E&G Films,Manga,"Action, Sci-Fi, Supernatural, Shounen",5.25,1576,1,0.063,0,1,10,4,8.30,18
9985,Omae wa Mada Gunma wo Shiranai,Low,Bad,TV,12,"Apr 2, 2018 to Jun 18, 2018",TOHO animation,Asahi Production,Web manga,"Comedy, School",5.25,11760,2,0.017,0,4,24,2,7.05,2


In [36]:
for i in range(len(TV_data_manga)):
    if TV_data_manga["Age"].iloc[i] > 5:
        TV_data_manga.loc[i, "Score"] = TV_data_manga["Fav%"].iloc[i]
    else:
        TV_data_manga.loc[i, "Score"] = TV_data_manga["Manga_score"].iloc[i]


In [37]:
TV_data_manga

Unnamed: 0,Anime Title,Score_reference,Score_reference2,Type,Episodes,Aired,Producers,Studios,Source,Genres,Score,Members,Favorites,Fav%,Source_reference,genre_reference_1,genre_reference_2,Season,Manga_score,Age
0,Fullmetal Alchemist: Brotherhood,High,Good,TV,64,"Apr 5, 2009 to Jul 4, 2010","Aniplex, Square Enix, Mainichi Broadcasting Sy...",Bones,Manga,"Action, Military, Adventure, Comedy, Drama, Ma...",8.285,2025613.0,167812.0,8.285,0.0,1.0,2.0,2.0,9.04,11.0
2,Gintama°,High,Good,TV,51,"Apr 8, 2015 to Mar 30, 2016","TV Tokyo, Aniplex, Dentsu",Bandai Namco Pictures,Manga,"Action, Comedy, Historical, Parody, Samurai, S...",8.942,363879.0,10353.0,2.845,0.0,1.0,4.0,2.0,8.62,5.0
3,Hunter x Hunter (2011),High,Good,TV,148,"Oct 2, 2011 to Sep 24, 2014","VAP, Nippon Television Network, Shueisha",Madhouse,Manga,"Action, Adventure, Fantasy, Shounen, Super Power",1.783,1459320.0,130492.0,8.942,0.0,1.0,3.0,4.0,8.72,9.0
5,Gintama',High,Good,TV,51,"Apr 4, 2011 to Mar 26, 2012","TV Tokyo, Aniplex, Dentsu, Trinity Sound, Mira...",Sunrise,Manga,"Action, Sci-Fi, Comedy, Historical, Parody, Sa...",1.124,337198.0,6013.0,1.783,0.0,1.0,10.0,2.0,8.62,9.0
6,Shingeki no Kyojin Season 3 Part 2,High,Good,TV,10,"Apr 29, 2019 to Jul 1, 2019","Production I.G, Dentsu, Mainichi Broadcasting ...",Wit Studio,Manga,"Action, Military, Mystery, Super Power, Drama,...",6.970,815370.0,28479.0,3.493,0.0,1.0,2.0,2.0,6.78,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2582,,,,,,,,,,,,,,,,,,,,
2583,,,,,,,,,,,,,,,,,,,,
2584,,,,,,,,,,,,,,,,,,,,
2586,,,,,,,,,,,,,,,,,,,,


In [56]:
cleaned_data = pd.DataFrame(TV_data_manga[["Anime Title", "genre_reference_1", "genre_reference_2", "Score"]])

In [57]:
cleaned_data.to_csv("Datasets\cleaned_data.csv")

In [20]:
Genres.to_csv("Datasets\Genres")