In [27]:
import pandas as pd
import itertools

In [2]:
# Load the raw review table from CSV (path is relative to the notebook's working directory).
anime_reviews_df = pd.read_csv('data/anime_reviews/reviews.csv')

In [3]:
# Preview the first rows of the raw reviews table.
anime_reviews_df.head()

Unnamed: 0,uid,profile,anime_uid,text,score,scores,link
0,255938,DesolatePsyche,34096,\n \n \n \n ...,8,"{'Overall': '8', 'Story': '8', 'Animation': '8...",https://myanimelist.net/reviews.php?id=255938
1,259117,baekbeans,34599,\n \n \n \n ...,10,"{'Overall': '10', 'Story': '10', 'Animation': ...",https://myanimelist.net/reviews.php?id=259117
2,253664,skrn,28891,\n \n \n \n ...,7,"{'Overall': '7', 'Story': '7', 'Animation': '9...",https://myanimelist.net/reviews.php?id=253664
3,8254,edgewalker00,2904,\n \n \n \n ...,9,"{'Overall': '9', 'Story': '9', 'Animation': '9...",https://myanimelist.net/reviews.php?id=8254
4,291149,aManOfCulture99,4181,\n \n \n \n ...,10,"{'Overall': '10', 'Story': '10', 'Animation': ...",https://myanimelist.net/reviews.php?id=291149


In [4]:
# Load the anime metadata table from CSV and preview its first two rows.
anime_df = pd.read_csv('data/anime_reviews/animes.csv')
anime_df.head(2)

Unnamed: 0,uid,title,synopsis,genre,aired,episodes,members,popularity,ranked,score,img_url,link
0,28891,Haikyuu!! Second Season,Following their participation at the Inter-Hig...,"['Comedy', 'Sports', 'Drama', 'School', 'Shoun...","Oct 4, 2015 to Mar 27, 2016",25.0,489888,141,25.0,8.82,https://cdn.myanimelist.net/images/anime/9/766...,https://myanimelist.net/anime/28891/Haikyuu_Se...
1,23273,Shigatsu wa Kimi no Uso,Music accompanies the path of the human metron...,"['Drama', 'Music', 'Romance', 'School', 'Shoun...","Oct 10, 2014 to Mar 20, 2015",22.0,995473,28,24.0,8.83,https://cdn.myanimelist.net/images/anime/3/671...,https://myanimelist.net/anime/23273/Shigatsu_w...


# Anime reviews sentiment dataset

In [5]:
def clean_review(text, min_length=20):
    r"""Return the body of a raw review with short fragments removed.

    The raw text is split on ``'\n\n'`` and only the second piece is used:
    whatever precedes the first ``'\n\n'`` is discarded, and so is anything
    following the next one. That piece is stripped of surrounding whitespace,
    broken on ``'\r\n'``, and the fragments strictly longer than
    ``min_length`` characters are joined with single spaces.

    Parameters
    ----------
    text : str
        Raw review text.
    min_length : int, default 20
        A fragment is kept only if ``len(fragment) > min_length``.

    Returns
    -------
    str
        The cleaned review text. An empty string is returned when ``text``
        is not a string (for example NaN or None) or contains no ``'\n\n'``,
        so a single malformed row no longer aborts a whole ``.apply``.
    """
    # Missing values arrive as float NaN / None when this is applied to a column.
    if not isinstance(text, str):
        return ''
    segments = text.split('\n\n')
    # Without a separator there is no second segment to extract.
    if len(segments) < 2:
        return ''
    fragments = segments[1].strip().split('\r\n')
    return ' '.join(fragment for fragment in fragments if len(fragment) > min_length)

In [37]:
# Run clean_review over every raw review and store the result in a new column.
anime_reviews_df['text_clean'] = anime_reviews_df['text'].apply(clean_review)

In [38]:
# Check the new text_clean column on the first two rows.
anime_reviews_df.head(2)

Unnamed: 0,uid,profile,anime_uid,text,score,scores,link,text_clean
0,255938,DesolatePsyche,34096,\n \n \n \n ...,8,"{'Overall': '8', 'Story': '8', 'Animation': '8...",https://myanimelist.net/reviews.php?id=255938,"First things first. My ""reviews"" system is exp..."
1,259117,baekbeans,34599,\n \n \n \n ...,10,"{'Overall': '10', 'Story': '10', 'Animation': ...",https://myanimelist.net/reviews.php?id=259117,Let me start off by saying that Made in Abyss ...


In [43]:
# Draw the same number of reviews from each score band so the classes are balanced.
# The draw is seeded: without random_state every run produced a different dataset,
# so the CSVs written later could not be regenerated.
# .sample() raises ValueError if a band holds fewer than N_PER_CLASS rows.
# NOTE(review): reviews are not de-duplicated before sampling; if the scrape contains
# repeated reviews they could later land in both train and test - confirm.
N_PER_CLASS = 12000
SAMPLE_SEED = 42

low_score = anime_reviews_df[anime_reviews_df.score < 5].sample(
    N_PER_CLASS, random_state=SAMPLE_SEED
)
medium_score = anime_reviews_df[
    (anime_reviews_df.score >= 5) & (anime_reviews_df.score < 8)
].sample(N_PER_CLASS, random_state=SAMPLE_SEED)
high_score = anime_reviews_df[anime_reviews_df.score >= 8].sample(
    N_PER_CLASS, random_state=SAMPLE_SEED
)

In [44]:
# Attach the integer class label used as the sentiment target:
# 0 = score < 5, 1 = 5 <= score < 8, 2 = score >= 8.
for sentiment_label, score_band in enumerate((low_score, medium_score, high_score)):
    score_band['sentiment'] = sentiment_label

In [46]:
# Keep only the review id, the cleaned text and the class label.
kept_columns = ['uid', 'text_clean', 'sentiment']
low_score, medium_score, high_score = [
    score_band[kept_columns]
    for score_band in (low_score, medium_score, high_score)
]

In [47]:
# Hold out a fixed number of rows per class as the test set; the rest is training data.
# The draw is seeded so train/test membership is identical on every run (it was
# previously unseeded, so the split changed each time the cell executed).
# drop() removes rows by index label, so this relies on the labels inherited from
# anime_reviews_df being unique.
N_TEST_PER_CLASS = 2000
SPLIT_SEED = 42

low_score_test = low_score.sample(N_TEST_PER_CLASS, random_state=SPLIT_SEED)
low_score_train = low_score.drop(low_score_test.index)
medium_score_test = medium_score.sample(N_TEST_PER_CLASS, random_state=SPLIT_SEED)
medium_score_train = medium_score.drop(medium_score_test.index)
high_score_test = high_score.sample(N_TEST_PER_CLASS, random_state=SPLIT_SEED)
high_score_train = high_score.drop(high_score_test.index)

In [48]:
# Stack the three per-class pieces (low, medium, high) into one frame per split.
train_parts = [low_score_train, medium_score_train, high_score_train]
test_parts = [low_score_test, medium_score_test, high_score_test]

sentiment_train_dataset = pd.concat(train_parts)
sentiment_test_dataset = pd.concat(test_parts)

In [49]:
# Write both splits to disk without the DataFrame index (test first, then train).
for split_frame, split_path in (
    (sentiment_test_dataset, 'data/anime_reviews/sentiment_test.csv'),
    (sentiment_train_dataset, 'data/anime_reviews/sentiment_train.csv'),
):
    split_frame.to_csv(split_path, index=False)

# Anime genre classification dataset

In [5]:
# Filter the anime table in one chain: drop exact duplicate rows, rows without a
# synopsis, rows whose genre string is the empty list '[]', and rows without a score.
anime_cleaned = (
    anime_df
    .drop_duplicates()
    .dropna(subset=['synopsis'])
    .loc[lambda frame: frame.genre != '[]']
    .loc[lambda frame: frame.score.notnull()]
)

In [6]:
# Turn the stringified list in `genre` into a real list: remove the surrounding
# brackets and split on commas.
genre_tokens = anime_cleaned['genre'].str.strip('[]').str.split(',')
# Trim the spaces around each token and sort every list. The result stays a list
# (it is not converted back to a string) and each token still carries the quote
# characters it had inside the original string.
anime_cleaned['genre'] = genre_tokens.apply(
    lambda tokens: sorted(token.strip(' ') for token in tokens)
)

In [14]:
# Flatten the per-anime genre lists into one long list.
# Despite its name, unique_genres still contains duplicates at this point.
genres = anime_cleaned['genre'].tolist()
unique_genres = [genre_name for genre_list in genres for genre_name in genre_list]
    

In [25]:
# Reduce the flat list to its distinct values in sorted order, then show the count.
unique_genres = sorted(set(unique_genres))
len(unique_genres)

43

In [21]:
# Build one 0/1 vector per anime: position i is 1 when unique_genres[i] is among
# that anime's genres. list.index raises ValueError for a genre missing from
# unique_genres.
genre_count = len(unique_genres)
one_hot_encoding = []
for anime_genres in genres:
    active_positions = {unique_genres.index(name) for name in anime_genres}
    one_hot_encoding.append(
        [1 if position in active_positions else 0 for position in range(genre_count)]
    )

In [28]:
# Transpose the row-wise encodings into one list per genre position.
# zip_longest pads any shorter row with None instead of truncating to the shortest;
# rows are expected to be equal length, so normally nothing is padded.
one_hot_genres = [
    list(genre_column)
    for genre_column in itertools.zip_longest(*one_hot_encoding, fillvalue=None)
]


In [35]:
# Add one indicator column per genre, named after the genre with any leading or
# trailing single quotes removed.
for genre_label, indicator_values in zip(unique_genres, one_hot_genres):
    column_name = genre_label.strip("'")
    anime_cleaned[column_name] = indicator_values

In [36]:
# Inspect the first three rows to check the newly added genre indicator columns.
anime_cleaned.head(3)

Unnamed: 0,uid,title,synopsis,genre,aired,episodes,members,popularity,ranked,score,...,Shounen,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Yaoi,Yuri
0,28891,Haikyuu!! Second Season,Following their participation at the Inter-Hig...,"['Comedy', 'Drama', 'School', 'Shounen', 'Spor...","Oct 4, 2015 to Mar 27, 2016",25.0,489888,141,25.0,8.82,...,1,0,0,1,0,0,0,0,0,0
1,23273,Shigatsu wa Kimi no Uso,Music accompanies the path of the human metron...,"['Drama', 'Music', 'Romance', 'School', 'Shoun...","Oct 10, 2014 to Mar 20, 2015",22.0,995473,28,24.0,8.83,...,1,0,0,0,0,0,0,0,0,0
2,34599,Made in Abyss,The Abyss—a gaping chasm stretching down into ...,"['Adventure', 'Drama', 'Fantasy', 'Mystery', '...","Jul 7, 2017 to Sep 29, 2017",13.0,581663,98,23.0,8.83,...,0,0,0,0,0,0,0,0,0,0


In [37]:
# Save the cleaned anime table, including the per-genre indicator columns, without the index.
anime_cleaned.to_csv('data/anime_reviews/anime_genre_multilabel.csv', index=False)