In [280]:
import pandas as pd
import pandas_profiling as pf
import plotly.express as px
import numpy as np

In [41]:
anime_df = pd.read_csv('../app/data/raw_data/animes.csv')
profiles_df = pd.read_csv('../app/data/raw_data/profiles.csv')
reviews_df = pd.read_csv('../app/data/raw_data/reviews.csv')

In [3]:
anime_df.head(3)

Unnamed: 0,uid,title,synopsis,genre,aired,episodes,members,popularity,ranked,score,img_url,link
0,28891,Haikyuu!! Second Season,Following their participation at the Inter-Hig...,"['Comedy', 'Sports', 'Drama', 'School', 'Shoun...","Oct 4, 2015 to Mar 27, 2016",25.0,489888,141,25.0,8.82,https://cdn.myanimelist.net/images/anime/9/766...,https://myanimelist.net/anime/28891/Haikyuu_Se...
1,23273,Shigatsu wa Kimi no Uso,Music accompanies the path of the human metron...,"['Drama', 'Music', 'Romance', 'School', 'Shoun...","Oct 10, 2014 to Mar 20, 2015",22.0,995473,28,24.0,8.83,https://cdn.myanimelist.net/images/anime/3/671...,https://myanimelist.net/anime/23273/Shigatsu_w...
2,34599,Made in Abyss,The Abyss—a gaping chasm stretching down into ...,"['Sci-Fi', 'Adventure', 'Mystery', 'Drama', 'F...","Jul 7, 2017 to Sep 29, 2017",13.0,581663,98,23.0,8.83,https://cdn.myanimelist.net/images/anime/6/867...,https://myanimelist.net/anime/34599/Made_in_Abyss


In [4]:
profiles_df.head(3)

Unnamed: 0,profile,gender,birthday,favorites_anime,link
0,DesolatePsyche,Male,"Oct 2, 1994","['33352', '25013', '5530', '33674', '1482', '2...",https://myanimelist.net/profile/DesolatePsyche
1,baekbeans,Female,"Nov 10, 2000","['11061', '31964', '853', '20583', '918', '925...",https://myanimelist.net/profile/baekbeans
2,skrn,,,"['918', '2904', '11741', '17074', '23273', '32...",https://myanimelist.net/profile/skrn


In [5]:
reviews_df.head(3)

Unnamed: 0,uid,profile,anime_uid,text,score,scores,link
0,255938,DesolatePsyche,34096,\n \n \n \n ...,8,"{'Overall': '8', 'Story': '8', 'Animation': '8...",https://myanimelist.net/reviews.php?id=255938
1,259117,baekbeans,34599,\n \n \n \n ...,10,"{'Overall': '10', 'Story': '10', 'Animation': ...",https://myanimelist.net/reviews.php?id=259117
2,253664,skrn,28891,\n \n \n \n ...,7,"{'Overall': '7', 'Story': '7', 'Animation': '9...",https://myanimelist.net/reviews.php?id=253664


# data cleaning

In [263]:
anime_cleaned = anime_df.drop_duplicates()
profiles_df.drop_duplicates(inplace=True)
reviews_df.drop_duplicates(inplace=True)

In [264]:
anime_cleaned = anime_cleaned[anime_cleaned.genre!='[]']
anime_cleaned = anime_cleaned[~anime_cleaned.score.isnull()]
profiles_df = profiles_df.drop_duplicates(subset=['profile'])


# Pie chart

In [228]:
# casting to list of strings
anime_cleaned['genre'] = anime_cleaned['genre'].str.strip('[]').str.split(',')
# stripping spaces from the words, sorting the lists and casting back to string
anime_cleaned['genre'] = anime_cleaned.genre.apply(lambda x: str(sorted([y.strip(' ') for y in x])))

anime_grouped = anime_cleaned.groupby('genre').uid.count().sort_values(ascending=False).reset_index()
anime_grouped.columns = ['genre','anime_count']
anime_grouped.head(3)

Unnamed: 0,genre,anime_count
0,"[""'Hentai'""]",946
1,"[""'Music'""]",695
2,"[""'Comedy'""]",623


In [229]:
fig = px.pie(anime_grouped[(anime_grouped.anime_count>50)&(anime_grouped.genre!="[]")], values='anime_count', names='genre')
fig.show()

In [230]:
anime_grouped.to_csv('../app/data/genre_pie_chart.csv')

# Score vs Reviews

In [337]:
reviewed_anime = anime_cleaned.join(reviews_df.set_index('anime_uid'), on='uid', rsuffix='_review')

In [338]:
reviews_grouped = reviewed_anime.groupby(pd.cut(reviewed_anime['score'],np.arange(0, 11, 1))).uid_review.count().reset_index(name='review_count')
reviews_grouped['score'] = reviews_grouped.score.astype(str)

In [341]:
fig = px.bar(reviews_grouped,x='score',y='review_count', title='Reviews Score distribution', labels={'score':'score_range'})
fig.show()

In [340]:
reviews_grouped.to_csv('../app/data/score_distribution_chart.csv')

In [304]:
reviewed_anime['genre'] = reviewed_anime['genre'].str.strip('[]').str.split(',')
# stripping spaces from the words, sorting the lists and casting back to string
reviewed_anime['genre'] = reviewed_anime.genre.apply(lambda x: str(sorted([y.strip(' ') for y in x])))

reviews_grouped = reviewed_anime.groupby('genre').uid_review.count().sort_values(ascending=False).reset_index()
reviews_grouped.columns = ['genre','review_count']


In [332]:
rev_test = reviewed_anime.groupby('uid').uid_review.count().sort_values(ascending=False).reset_index()


In [336]:
fig = px.box(rev_test,x='uid_review')
fig.show()

In [317]:
reviews_grouped.describe()

Unnamed: 0,review_count
count,3836.0
mean,34.370959
std,94.153888
min,0.0
25%,1.0
50%,6.0
75%,26.0
max,1923.0


In [319]:
fig = px.histogram(reviews_grouped[reviews_grouped.review_count>10], x='genre', y='review_count' )
fig.show()

In [329]:
fig = px.pie(reviews_grouped[(reviews_grouped.review_count>1000)&(reviews_grouped.genre!="[]")],values='review_count', names='genre', width=900)
fig.show()

# TO-DO: work on the web app and add the two graphs already created.