In [1]:
import numpy as np
import pandas as pd

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
from load_data import load_data
df, _ = load_data()

In [4]:
user_n = 4000
anime_n = 4000
stump_df_idx = (df['rating'] != -1) & (df['user_id'] < user_n) & (df['anime_id'] < anime_n)

In [5]:
df[stump_df_idx]

Unnamed: 0,user_id,anime_id,rating
156,3,20,8
157,3,154,6
158,3,170,9
159,3,199,10
160,3,225,9
...,...,...,...
388281,3998,3503,2
388282,3998,3782,9
388283,3998,3783,9
388377,3999,20,7


In [6]:
df_without_trash = df[stump_df_idx]

In [7]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df_without_trash, test_size=0.2, random_state=42)


  from scipy.sparse import csr_matrix, issparse


In [8]:
from lfm import LatentFactorModel


In [9]:
n_fact = 30
lr = 0.01
reg = 0.1
n_epoch = 50

In [10]:
model = LatentFactorModel(learning_rate=lr, reg=reg, n_factors=n_fact)
model.fit(train_df, user_col_name='user_id', item_col_name='anime_id', rating_col_name='rating', test_df=test_df, n_iters=n_epoch)

Iteration 1/50, RMSE = 6.7711
Iteration 2/50, RMSE = 4.0843
Iteration 3/50, RMSE = 3.0581
Iteration 4/50, RMSE = 2.5983
Iteration 5/50, RMSE = 2.3331
Iteration 6/50, RMSE = 2.1601
Iteration 7/50, RMSE = 2.0382
Iteration 8/50, RMSE = 1.9477
Iteration 9/50, RMSE = 1.8779
Iteration 10/50, RMSE = 1.8227
Iteration 11/50, RMSE = 1.7779
Iteration 12/50, RMSE = 1.7409
Iteration 13/50, RMSE = 1.7100
Iteration 14/50, RMSE = 1.6838
Iteration 15/50, RMSE = 1.6614
Iteration 16/50, RMSE = 1.6420
Iteration 17/50, RMSE = 1.6251
Iteration 18/50, RMSE = 1.6103
Iteration 19/50, RMSE = 1.5972
Iteration 20/50, RMSE = 1.5856
Iteration 21/50, RMSE = 1.5752
Iteration 22/50, RMSE = 1.5659
Iteration 23/50, RMSE = 1.5576
Iteration 24/50, RMSE = 1.5500
Iteration 25/50, RMSE = 1.5432
Iteration 26/50, RMSE = 1.5369
Iteration 27/50, RMSE = 1.5313
Iteration 28/50, RMSE = 1.5261
Iteration 29/50, RMSE = 1.5214
Iteration 30/50, RMSE = 1.5170
Iteration 31/50, RMSE = 1.5131
Iteration 32/50, RMSE = 1.5094
Iteration 33/50, 

In [11]:
model.evaluate(test_df)

1.4768938279392494

In [20]:
from surprise import Dataset, Reader, SVD
from surprise.model_selection import cross_validate

reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(df_without_trash[["user_id", "anime_id", "rating"]], reader)


surprise_model = SVD(
    n_factors=n_fact,
    lr_all=lr,
    reg_all=reg,
    n_epochs=n_epoch,
)
print(
    cross_validate(
        surprise_model,
        data,
        measures=["RMSE", "MAE"],
        cv=5,
        verbose=True,
    )
)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.2345  1.2347  1.2141  1.2399  1.2489  1.2344  0.0114  
MAE (testset)     0.9399  0.9353  0.9271  0.9421  0.9463  0.9381  0.0066  
Fit time          0.56    0.51    0.53    0.49    0.57    0.53    0.03    
Test time         0.12    0.12    0.12    0.25    0.15    0.15    0.05    
{'test_rmse': array([1.234519  , 1.2346701 , 1.21413591, 1.23990085, 1.24890386]), 'test_mae': array([0.93991806, 0.93526866, 0.92711926, 0.94209433, 0.94633719]), 'fit_time': (0.562861442565918, 0.5107367038726807, 0.5273182392120361, 0.4922168254852295, 0.5730063915252686), 'test_time': (0.12100100517272949, 0.12400126457214355, 0.12065768241882324, 0.2535364627838135, 0.14999151229858398)}


In [13]:
from sklearn.model_selection import KFold
def cross_validate_model(df, model_gen, k=5):
    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    rmse_history = []
    it=0
    for train_index, test_index in kf.split(df):
        it+=1
        df_train = df.iloc[train_index]
        df_test = df.iloc[test_index]
        model = model_gen(learning_rate=lr, reg=reg, n_factors=n_fact)
        model.fit(df_train, user_col_name='user_id', item_col_name='anime_id', rating_col_name='rating', n_iters=20, flag_print=False)
        rmse = model.evaluate(df_test)

        rmse_history.append(rmse)
        print(f"Iteration {it}/{k}, RMSE = {rmse:.4f}")

    return np.mean(rmse_history), np.std(rmse_history)

In [14]:
avg_acc_custom, std_acc_custom = cross_validate_model(df_without_trash, LatentFactorModel, k=5)
print(f"Средняя точность (Custom): {avg_acc_custom:.4f} ± {std_acc_custom:.4f}")

Iteration 1/5, RMSE = 1.5688
Iteration 2/5, RMSE = 1.5863
Iteration 3/5, RMSE = 1.5866
Iteration 4/5, RMSE = 1.5591
Iteration 5/5, RMSE = 1.6007
Средняя точность (Custom): 1.5803 ± 0.0146


# Время

In [16]:
n_epoch = 15

In [17]:
%%timeit
model_for_time = LatentFactorModel(learning_rate=lr, reg=reg, n_factors=n_fact)
model_for_time.fit(train_df, user_col_name='user_id', item_col_name='anime_id', rating_col_name='rating', n_iters=n_epoch)

1min 2s ± 7.6 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [21]:
%timeit model.predict_df(test_df)

436 ms ± 109 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [24]:
from surprise.model_selection import train_test_split as train_test_split_surprise
train_data, test_data = train_test_split_surprise(data, test_size=0.2, random_state=42)

In [25]:
%%timeit
surprise_model_for_time = SVD(
    n_factors=n_fact,
    lr_all=lr,
    reg_all=reg,
    n_epochs=n_epoch,
)
surprise_model_for_time.fit(train_data)

545 ms ± 83.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [26]:
%timeit surprise_model.test(test_data)

125 ms ± 9.51 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [27]:
len(df_without_trash['user_id'].unique()), len(df_without_trash['anime_id'].unique())

(3476, 2480)

In [28]:
_, anime = load_data()

In [32]:
filtered_df = anime[anime['name'].str.contains('Berserk', case=False, na=False)]
filtered_df

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
151,33,Berserk,"Action, Adventure, Demons, Drama, Fantasy, Hor...",TV,25,8.4,226430
198,12115,Berserk: Ougon Jidai-hen III - Kourin,"Action, Adventure, Demons, Drama, Fantasy, Hor...",Movie,1,8.33,65594
419,12113,Berserk: Ougon Jidai-hen II - Doldrey Kouryaku,"Action, Adventure, Demons, Drama, Fantasy, Hor...",Movie,1,8.09,66721
657,10218,Berserk: Ougon Jidai-hen I - Haou no Tamago,"Action, Adventure, Demons, Fantasy, Military, ...",Movie,1,7.91,77103
4052,32379,Berserk (2016),"Action, Adventure, Demons, Drama, Fantasy, Hor...",TV,12,6.81,90817
10924,34055,Berserk (2017),"Action, Adventure, Demons, Drama, Fantasy, Hor...",TV,Unknown,,13463


In [39]:
berserk_pred = model.predict_df(df_without_trash[df_without_trash['anime_id'] == 33])
berserk_pred

4859      8.257078
4952      9.041274
6003      7.947333
8832      8.055352
10038     7.397682
            ...   
382180    9.889663
383363    8.095561
383557    8.317863
386598    8.561800
387520    7.759800
Length: 269, dtype: float64

In [40]:
berserk_pred.describe()

count    269.000000
mean       7.955258
std        0.750086
min        4.112557
25%        7.523219
50%        7.975763
75%        8.445199
max        9.889663
dtype: float64

In [35]:
filtered_df = anime[anime['name'].str.contains('Evangelion', case=False, na=False)]
filtered_df

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
90,3784,Evangelion: 2.0 You Can (Not) Advance,"Action, Mecha, Sci-Fi",Movie,1,8.53,182224
130,32,Neon Genesis Evangelion: The End of Evangelion,"Dementia, Drama, Mecha, Psychological, Sci-Fi",Movie,1,8.45,215630
211,30,Neon Genesis Evangelion,"Action, Dementia, Drama, Mecha, Psychological,...",TV,26,8.32,461946
294,2759,Evangelion: 1.0 You Are (Not) Alone,"Action, Mecha, Sci-Fi",Movie,1,8.21,194561
1029,3785,Evangelion: 3.0 You Can (Not) Redo,"Action, Mecha, Sci-Fi",Movie,1,7.71,135318
1538,31,Neon Genesis Evangelion: Death &amp; Rebirth,"Drama, Mecha, Psychological, Sci-Fi",Movie,1,7.51,102093
6628,4130,Petit Eva: Evangelion@School,"Comedy, Parody, School",ONA,24,6.03,15734
7320,31115,Schick x Evangelion,"Comedy, Parody",Special,2,5.56,2021
10976,3786,Evangelion: 3.0+1.0,"Action, Mecha, Sci-Fi",Movie,1,,66600


In [37]:
evangilion_pred = model.predict_df(df_without_trash[df_without_trash['anime_id'] == 30])

In [41]:
evangilion_pred.describe()

count    691.000000
mean       7.987436
std        0.837766
min        2.840964
25%        7.543509
50%        8.040452
75%        8.521086
max        9.914933
dtype: float64