In [1]:
import pandas as pd

In [2]:
anime_reviews_df = pd.read_csv('data/anime_reviews/reviews.csv')

In [3]:
anime_reviews_df.head()

Unnamed: 0,uid,profile,anime_uid,text,score,scores,link
0,255938,DesolatePsyche,34096,\n \n \n \n ...,8,"{'Overall': '8', 'Story': '8', 'Animation': '8...",https://myanimelist.net/reviews.php?id=255938
1,259117,baekbeans,34599,\n \n \n \n ...,10,"{'Overall': '10', 'Story': '10', 'Animation': ...",https://myanimelist.net/reviews.php?id=259117
2,253664,skrn,28891,\n \n \n \n ...,7,"{'Overall': '7', 'Story': '7', 'Animation': '9...",https://myanimelist.net/reviews.php?id=253664
3,8254,edgewalker00,2904,\n \n \n \n ...,9,"{'Overall': '9', 'Story': '9', 'Animation': '9...",https://myanimelist.net/reviews.php?id=8254
4,291149,aManOfCulture99,4181,\n \n \n \n ...,10,"{'Overall': '10', 'Story': '10', 'Animation': ...",https://myanimelist.net/reviews.php?id=291149


In [4]:
anime_df = pd.read_csv('data/anime_reviews/animes.csv')
anime_df.head(2)

Unnamed: 0,uid,title,synopsis,genre,aired,episodes,members,popularity,ranked,score,img_url,link
0,28891,Haikyuu!! Second Season,Following their participation at the Inter-Hig...,"['Comedy', 'Sports', 'Drama', 'School', 'Shoun...","Oct 4, 2015 to Mar 27, 2016",25.0,489888,141,25.0,8.82,https://cdn.myanimelist.net/images/anime/9/766...,https://myanimelist.net/anime/28891/Haikyuu_Se...
1,23273,Shigatsu wa Kimi no Uso,Music accompanies the path of the human metron...,"['Drama', 'Music', 'Romance', 'School', 'Shoun...","Oct 10, 2014 to Mar 20, 2015",22.0,995473,28,24.0,8.83,https://cdn.myanimelist.net/images/anime/3/671...,https://myanimelist.net/anime/23273/Shigatsu_w...


# Anime reviews sentiment dataset

In [5]:
def clean_review(text):
    to_stitch= []
    sentences = text.split('\n\n')[1].strip().split('\r\n')
    for s in sentences:
        if len(s)>20:
            to_stitch.append(s)
    clean_text = ' '.join(to_stitch)
    return clean_text

In [37]:
anime_reviews_df['text_clean'] = anime_reviews_df.text.apply(clean_review)

In [38]:
anime_reviews_df.head(2)

Unnamed: 0,uid,profile,anime_uid,text,score,scores,link,text_clean
0,255938,DesolatePsyche,34096,\n \n \n \n ...,8,"{'Overall': '8', 'Story': '8', 'Animation': '8...",https://myanimelist.net/reviews.php?id=255938,"First things first. My ""reviews"" system is exp..."
1,259117,baekbeans,34599,\n \n \n \n ...,10,"{'Overall': '10', 'Story': '10', 'Animation': ...",https://myanimelist.net/reviews.php?id=259117,Let me start off by saying that Made in Abyss ...


In [43]:
low_score = anime_reviews_df[anime_reviews_df.score<5].sample(12000)
medium_score = anime_reviews_df[(anime_reviews_df.score>=5)&(anime_reviews_df.score<8)].sample(12000)
high_score = anime_reviews_df[anime_reviews_df.score>=8].sample(12000)

In [44]:
low_score['sentiment'] = 0
medium_score['sentiment'] = 1
high_score['sentiment'] = 2

In [46]:
low_score = low_score[['uid','text_clean','sentiment']]
medium_score = medium_score[['uid','text_clean','sentiment']]
high_score = high_score[['uid','text_clean','sentiment']]

In [47]:
low_score_test = low_score.sample(2000)
low_score_train = low_score.drop(low_score_test.index)
medium_score_test = medium_score.sample(2000)
medium_score_train = medium_score.drop(medium_score_test.index)
high_score_test = high_score.sample(2000)
high_score_train = high_score.drop(high_score_test.index)

In [48]:
sentiment_train_dataset = pd.concat([low_score_train,medium_score_train, high_score_train])
sentiment_test_dataset = pd.concat([low_score_test, medium_score_test, high_score_test])

In [49]:
sentiment_test_dataset.to_csv('data/anime_reviews/sentiment_test.csv', index=False)
sentiment_train_dataset.to_csv('data/anime_reviews/sentiment_train.csv', index=False)

# Anime Genre classification dataset

In [21]:
anime_cleaned = anime_df.drop_duplicates()
anime_cleaned = anime_cleaned.dropna(subset=['synopsis'])
anime_cleaned = anime_cleaned[anime_cleaned.genre!='[]']
anime_cleaned = anime_cleaned[~anime_cleaned.score.isnull()]

In [22]:
# casting to list of strings
anime_cleaned['genre'] = anime_cleaned['genre'].str.strip('[]').str.split(',')
# stripping spaces from the words, sorting the lists and casting back to string
anime_cleaned['genre'] = anime_cleaned.genre.apply(lambda x: str(sorted([y.strip(' ') for y in x])))

In [23]:
anime_grouped = anime_cleaned.groupby('genre').uid.count().sort_values(ascending=False).reset_index()
anime_grouped.columns = ['genre','anime_count']

In [24]:
anime_joined = anime_cleaned.join(anime_grouped.set_index('genre'), on='genre')

In [25]:
anime_joined.head(2)

Unnamed: 0,uid,title,synopsis,genre,aired,episodes,members,popularity,ranked,score,img_url,link,anime_count
0,28891,Haikyuu!! Second Season,Following their participation at the Inter-Hig...,"[""'Comedy'"", ""'Drama'"", ""'School'"", ""'Shounen'...","Oct 4, 2015 to Mar 27, 2016",25.0,489888,141,25.0,8.82,https://cdn.myanimelist.net/images/anime/9/766...,https://myanimelist.net/anime/28891/Haikyuu_Se...,11
1,23273,Shigatsu wa Kimi no Uso,Music accompanies the path of the human metron...,"[""'Drama'"", ""'Music'"", ""'Romance'"", ""'School'""...","Oct 10, 2014 to Mar 20, 2015",22.0,995473,28,24.0,8.83,https://cdn.myanimelist.net/images/anime/3/671...,https://myanimelist.net/anime/23273/Shigatsu_w...,2


In [41]:
genre_list = anime_joined[anime_joined.anime_count>70].genre.unique()

In [42]:
genre_map = dict([(g,v) for g,v in zip(genre_list,range(len(genre_list)))])
genre_map

{'["\'Comedy\'", "\'Slice of Life\'"]': 0,
 '["\'Comedy\'"]': 1,
 '["\'Action\'", "\'Mecha\'", "\'Sci-Fi\'"]': 2,
 '["\'Music\'"]': 3,
 '["\'Adventure\'", "\'Fantasy\'"]': 4,
 '["\'Drama\'", "\'Historical\'"]': 5,
 '["\'Drama\'"]': 6,
 '["\'Fantasy\'"]': 7,
 '["\'Comedy\'", "\'Parody\'"]': 8,
 '["\'Adventure\'"]': 9,
 '["\'Kids\'"]': 10,
 '["\'Dementia\'"]': 11,
 '["\'Comedy\'", "\'Kids\'"]': 12,
 '["\'Fantasy\'", "\'Kids\'"]': 13,
 '["\'Drama\'", "\'Kids\'"]': 14,
 '["\'Adventure\'", "\'Kids\'"]': 15,
 '["\'Kids\'", "\'Music\'"]': 16,
 '["\'Slice of Life\'"]': 17,
 '["\'Historical\'"]': 18,
 '["\'Hentai\'"]': 19,
 '["\'Dementia\'", "\'Music\'"]': 20}

In [43]:
anime_joined['genre_encoded'] = anime_joined.genre.map(genre_map)

In [44]:
anime_joined[anime_joined.anime_count>80].head(2)

Unnamed: 0,uid,title,synopsis,genre,aired,episodes,members,popularity,ranked,score,img_url,link,anime_count,genre_encoded
35,38145,"Doukyonin wa Hiza, Tokidoki, Atama no Ue.",Subaru Mikazuki is a 23-year-old mystery novel...,"[""'Comedy'"", ""'Slice of Life'""]","Jan 9, 2019 to Mar 27, 2019",12.0,89612,1216,784.0,7.86,https://cdn.myanimelist.net/images/anime/1251/...,https://myanimelist.net/anime/38145/Doukyonin_...,245,0.0
48,38958,Manga de Wakaru! Fate/Grand Order,A short anime based on Manga de Wakaru! Fate/...,"[""'Comedy'""]","Dec 31, 2018",1.0,7545,4889,770.0,7.87,https://cdn.myanimelist.net/images/anime/1830/...,https://myanimelist.net/anime/38958/Manga_de_W...,578,1.0


In [45]:
anime_genre_classification = anime_joined[anime_joined.anime_count>70][['uid','synopsis','genre', 'genre_encoded']]


In [46]:
anime_genre_classification.dropna(subset=['synopsis'], inplace=True)

In [47]:
anime_genre_balanced = anime_genre_classification.groupby('genre_encoded').sample(70)

In [48]:
anime_genre_balanced

Unnamed: 0,uid,synopsis,genre,genre_encoded
11800,33854,The series focuses on the staff of the Keihime...,"[""'Comedy'"", ""'Slice of Life'""]",0.0
9670,10020,The true end arc of Ore no Imouto. These four ...,"[""'Comedy'"", ""'Slice of Life'""]",0.0
12425,38299,A collaboration between Kamiusagi Rope and N...,"[""'Comedy'"", ""'Slice of Life'""]",0.0
13476,36784,The story centers around the cheerful calico T...,"[""'Comedy'"", ""'Slice of Life'""]",0.0
15180,37789,Picture Dramas released with the Tenchi Muyou...,"[""'Comedy'"", ""'Slice of Life'""]",0.0
...,...,...,...,...
7523,10436,"A myriad of shadows resembling humans, animals...","[""'Dementia'"", ""'Music'""]",20.0
8195,36158,Official music video for the international mul...,"[""'Dementia'"", ""'Music'""]",20.0
9201,35935,Official music video for Tamaki Roy's song Ko...,"[""'Dementia'"", ""'Music'""]",20.0
3472,38970,Short animation directed by Masanobu Hiraoka.,"[""'Dementia'"", ""'Music'""]",20.0


In [49]:
anime_genre_test = anime_genre_balanced.sample(frac=0.2)
anime_genre_train = anime_genre_balanced.drop(anime_genre_test.index)

In [50]:
anime_genre_test.to_csv('data/anime_reviews/anime_genre_test.csv', index=False)
anime_genre_train.to_csv('data/anime_reviews/anime_genre_train.csv', index=False)