# Finding Similar Anime By Genre

An anime finder that represents each title by a simple binary genre feature and ranks similar titles by their Jaccard similarity score.

In [None]:
import numpy as np
import pandas as pd
import itertools
import collections
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import jaccard_similarity_score

# Preprocessing

During preprocessing we split each anime's comma-separated genre string into a list of genres, so that we can later turn them into features.

In [None]:
# Load the catalogue. A missing genre becomes the literal label 'None', so
# every row can be split into a list of genre names.
animes = pd.read_csv("anime.csv")
animes['genre'] = animes['genre'].fillna('None').map(lambda text: text.split(', '))

# Count how many titles carry each genre.
genre_data = itertools.chain.from_iterable(animes['genre'].tolist())
genre_counter = collections.Counter(genre_data)

# One row per genre with its frequency, most frequent first.
genres = (
    pd.DataFrame.from_dict(genre_counter, orient='index')
    .reset_index()
    .rename(columns={'index': 'genre', 0: 'count'})
    .sort_values('count', ascending=False)
)

print(genres)

# Feature Extraction
Feature extraction is simple: each anime's genres are encoded as a binary vector.

The output shows which genres each show has, as a binary vector indexed by genre id.

In [None]:
# Give each genre a position in the feature vector, numbered in the order
# genre_counter iterates its keys.
genre_map = dict(zip(genre_counter, range(len(genre_counter))))
def extract_feature(genre):
    """Encode a list of genre names as a binary vector.

    Args:
        genre: Iterable of genre names; each must be a key of the
            module-level ``genre_map``.

    Returns:
        An integer numpy array with one entry per genre in ``genre_map``,
        holding 1 at the positions of the given genres and 0 elsewhere.

    Raises:
        KeyError: If a name is not present in ``genre_map``.
    """
    # Resolve every name first so an unknown genre fails before any work.
    positions = [genre_map[name] for name in genre]
    feature = np.zeros(len(genre_map), dtype=int)
    for position in positions:
        # A repeated genre still yields 1, never a count.
        feature[position] = 1
    return feature
    
# Pair every title with the binary encoding of its genre list.
anime_feature = animes[['name', 'genre']].copy()
anime_feature['genre'] = anime_feature['genre'].apply(extract_feature)
print(anime_feature.head(30))

# Testing

In [None]:
def _jaccard(left, right):
    """Return the Jaccard index |A & B| / |A | B| of two binary genre vectors.

    Only genres present in at least one of the two vectors take part, so
    genres that both titles lack do not inflate the score. Two all-zero
    vectors score 0.0.
    """
    left = np.asarray(left, dtype=bool)
    right = np.asarray(right, dtype=bool)
    union = np.count_nonzero(left | right)
    if union == 0:
        return 0.0
    return np.count_nonzero(left & right) / union


# NOTE: sklearn's jaccard_similarity_score is deliberately not used here. It
# was removed in scikit-learn 0.23, and on 1-D binary vectors it was
# equivalent to accuracy_score, i.e. every genre that both titles lack counted
# as agreement. The ranking below uses a true Jaccard index instead.
test_data = anime_feature.take([841, 111, 0, 23])
for label, anime in test_data.iterrows():
    print('Similar anime like {}:'.format(anime['name']))
    target = anime['genre']
    search = anime_feature.drop([label])  # drop current anime
    search['result'] = search['genre'].apply(lambda vec: _jaccard(target, vec))
    # A stable sort keeps tied scores in catalogue order, so the printed
    # top 10 is reproducible from run to run.
    ranked = search.sort_values('result', ascending=False, kind='mergesort')
    search_result = ranked['name'].head(10)
    for res in search_result.values:
        print('\t{}'.format(res))
    print()

In [None]:
# Disabled: runs the same search for every anime in the catalogue (top 5 matches each).
# for row in anime_feature.iterrows():
#     print('Similar anime like {}:'.format(row[1]['name']))
#     search = anime_feature.drop([row[0]])
#     search['result'] = search['genre'].apply(lambda x: jaccard_similarity_score(row[1]['genre'], x))
#     search_result = search.sort_values('result', ascending=False)['name'].head(5)
#     for res in search_result.values:
#         print('\t{}'.format(res))