In [1]:
import pickle
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

**Load dataframe**

In [2]:
%%time
# Deserialize the pre-built pivot table from disk.
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
with open('UserAnimeList_Pivot-3.9.12.pkl', mode='rb') as pivot_file:
    df = pickle.load(pivot_file)

CPU times: user 8.87 s, sys: 747 ms, total: 9.61 s
Wall time: 9.65 s


In [3]:
# Preview the loaded pivot table: one column per username, one row per numeric id
# (pandas truncates the rendered rows and columns).
df

username,-------,----------,-------------,------o,-----aaa,-----noname-----,----Adrenadroid,----XII----,----phoebelyn,---Kuma---,...,zzzero,zzzett,zzzgaarazzz,zzzsss,zzzu,zzzyeknom0,zzzzz-chan,zzzzzzzBen,zzzzzzzzzzzzzz,zzzzzzzzzzzzzzz
1,8.0,,8.0,9.0,,8.0,,,0.0,,...,8.0,,,,10.0,10.0,,0.0,,
5,8.0,,7.0,,,0.0,,,10.0,,...,8.0,,,,8.0,,,,,
6,0.0,,,,,,,10.0,,,...,9.0,,,,9.0,0.0,8.0,3.0,,0.0
7,0.0,,,,,,,,,,...,6.0,,,,,,,,,0.0
8,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37023,,,,,,,,,,,...,,,,,,,,0.0,,
37029,,,,,,,,,,,...,,,,,,,,,,
37126,,,,,,,,,,,...,,,,,,,,,,
37140,,,,,,,,,,,...,,,,,,,,,,


**Convert DataFrame to CSR format**

In [4]:
%%time
# Fill every missing entry with 0 ahead of the sparse-matrix conversion in the next cell.
df = df.fillna(value=0)

CPU times: user 5.31 s, sys: 147 ms, total: 5.45 s
Wall time: 5.44 s


In [5]:
%%time
# Export the sparse-backed frame as a COO matrix, then build the CSR matrix from it.
pivot_coo = df.sparse.to_coo()
sparse_matrix = csr_matrix(pivot_coo)
# Drop the temporary name so the intermediate COO object is not kept alive by the kernel.
del pivot_coo

CPU times: user 16.6 s, sys: 2.87 s, total: 19.5 s
Wall time: 19.5 s


In [6]:
# Sanity-check the matrix dimensions as (rows, columns).
sparse_matrix.shape

(2961, 281568)

**Build KNN model and persist the index and sparse matrix**

In [7]:
# Configure a brute-force nearest-neighbour search using the cosine metric,
# 20 neighbours per query by default, and n_jobs=-1 for parallel execution.
model_knn = NearestNeighbors(
    n_neighbors=20,
    algorithm='brute',
    metric='cosine',
    n_jobs=-1,
)

In [8]:
# Fit the neighbour search on the sparse matrix; each matrix row is one sample.
model_knn.fit(sparse_matrix)

In [9]:
# Persist the pivot table's row labels (presumably so matrix row positions can be
# mapped back to ids by whoever loads the matrix — confirm against the consumer).
with open('movies_index.pkl', mode='wb') as index_file:
    pickle.dump(df.index, index_file, protocol=pickle.HIGHEST_PROTOCOL)

In [25]:
# Persist the CSR matrix itself using the highest available pickle protocol.
with open('user_anime_pivot.pkl', mode='wb') as matrix_file:
    pickle.dump(sparse_matrix, matrix_file, protocol=pickle.HIGHEST_PROTOCOL)

**Save anime metadata to a Parquet file**

In [11]:
# Read the anime metadata table from the zipped CSV, then print its column summary.
anime_df = pd.read_csv(filepath_or_buffer='anime_cleaned.csv.zip')
anime_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6668 entries, 0 to 6667
Data columns (total 33 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   anime_id         6668 non-null   int64  
 1   title            6668 non-null   object 
 2   title_english    3438 non-null   object 
 3   title_japanese   6663 non-null   object 
 4   title_synonyms   4481 non-null   object 
 5   image_url        6666 non-null   object 
 6   type             6668 non-null   object 
 7   source           6668 non-null   object 
 8   episodes         6668 non-null   int64  
 9   status           6668 non-null   object 
 10  airing           6668 non-null   bool   
 11  aired_string     6668 non-null   object 
 12  aired            6668 non-null   object 
 13  duration         6668 non-null   object 
 14  rating           6668 non-null   object 
 15  score            6668 non-null   float64
 16  scored_by        6668 non-null   int64  
 17  rank          

In [12]:
# Keep only the anime whose id appears in the pivot table's index.
# .copy() makes the result an independent frame, so the column assignments in the
# following cells do not trigger pandas' SettingWithCopyWarning.
anime_df = anime_df.loc[anime_df['anime_id'].isin(df.index)].copy()

In [13]:
# Turn each comma-separated genre string into a list of genre names.
# The pattern is a raw string: '\s' inside a plain literal is an invalid escape
# sequence and makes Python emit a warning, even though the value is the same.
# NOTE: not idempotent — this expects the column to still hold strings, so re-run
# from the CSV load cell instead of executing this cell a second time.
anime_df['genre'] = anime_df['genre'].str.split(r',\s+')

In [20]:
# The image CDN host changed as of 2022, so rewrite stored URLs to the new host.
# regex=False: the old host name is matched as literal text, not as a pattern.
anime_df['image_url'] = anime_df['image_url'].str.replace(
    'myanimelist.cdn-dena.com',
    'cdn.myanimelist.net',
    regex=False,
)

In [23]:
# Columns to export, in output order.
export_columns = [
    'anime_id',
    'title',
    'title_english',
    'title_synonyms',
    'image_url',
    'type',
    'source',
    'score',
    'rank',
    'genre',
    'aired_string',
    'studio',
]
# Project the metadata frame down to just those columns.
subset_df = anime_df[export_columns]

In [24]:
# Write the selected metadata columns to a Parquet file.
subset_df.to_parquet('anime_db.parquet')