# 1. Import Data

## Load Data

In [1]:
# import numpy and pandas library
import numpy as np
import pandas as pd

First, load the rating data.

In [2]:
# Location of the raw ratings CSV, relative to the notebook's working directory
path_rating = './anime_dataset/rating.csv'

In [3]:
# Read the ratings CSV into a DataFrame
df_rating = pd.read_csv(path_rating)

In [4]:
# Display the loaded ratings (pandas truncates the view for a frame this large)
df_rating

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1
...,...,...,...
7813732,73515,16512,7
7813733,73515,17187,9
7813734,73515,22145,10
7813735,73516,790,9


In [None]:
# TODO: sample the data here (this cell is currently an empty placeholder)

## Check Data Condition

In [5]:
# Column dtypes and memory footprint of the ratings frame
df_rating.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7813737 entries, 0 to 7813736
Data columns (total 3 columns):
 #   Column    Dtype
---  ------    -----
 0   user_id   int64
 1   anime_id  int64
 2   rating    int64
dtypes: int64(3)
memory usage: 178.8 MB


In [6]:
# (number of rows, number of columns)
df_rating.shape

(7813737, 3)

In [9]:
# Number of distinct users in the ratings data
df_rating['user_id'].nunique()

73515

In [10]:
# Number of distinct anime ids in the ratings data
df_rating['anime_id'].nunique()

11200

The `rating` column contains the value -1, which means the user watched the anime but did not assign a rating.

In [11]:
# Frequency of each rating value, most common first (includes the -1 "not rated" marker)
df_rating['rating'].value_counts()

 8     1646019
-1     1476496
 7     1375287
 9     1254096
 10     955715
 6      637775
 5      282806
 4      104291
 3       41453
 2       23150
 1       16649
Name: rating, dtype: int64

# 2. Data Cleansing

## Handling duplicate data


In [12]:
# Count rows whose (user_id, anime_id) pair repeats an earlier row
df_rating.duplicated(subset=['user_id', 'anime_id']).sum()

7

In [13]:
# Keep only the first row of each (user_id, anime_id) pair.
# Note: despite its name, `dropped_data` holds the rows that are KEPT.
dropped_data = df_rating.drop_duplicates(subset=['user_id', 'anime_id'])

In [14]:
# Verify that no duplicated (user_id, anime_id) pairs remain after de-duplication
dropped_data.duplicated(subset=['user_id', 'anime_id']).sum()

0

In [15]:
# Shape after removing duplicates (row count should shrink by the number of duplicates found)
dropped_data.shape

(7813730, 3)

## Handling the -1 rating value

In [16]:
# A rating of -1 marks "watched but not rated": turn it into NaN and drop those rows.
# The replacement is restricted to the `rating` column so that a -1 appearing in
# any other column is never silently converted and dropped.
# Introducing NaN makes the rating column float.
df_clean = (
    dropped_data
    .replace({'rating': {-1: np.nan}})
    .dropna()
)
df_clean

Unnamed: 0,user_id,anime_id,rating
47,1,8074,10.0
81,1,11617,10.0
83,1,11757,10.0
101,1,15451,10.0
153,2,11771,10.0
...,...,...,...
7813732,73515,16512,7.0
7813733,73515,17187,9.0
7813734,73515,22145,10.0
7813735,73516,790,9.0


# 3. Modeling
Modeling using the `surprise` library

## Load Data
This part converts the cleaned data into the input format expected by the surprise library.

In [17]:
# import some modul from surprise library
from surprise import Dataset, Reader

In [18]:
# input rating scale from anime data, the rating scale between 1-10
reader = Reader(rating_scale = (1, 10))
reader

<surprise.reader.Reader at 0x7f8548e638e0>

In [19]:
# create utility data from clean dataframe
utility_data = Dataset.load_from_df(
                    df = df_clean[['user_id', 'anime_id', 'rating']].copy(),
                    reader = reader
                )

utility_data

<surprise.dataset.DatasetAutoFolds at 0x7f854a1df760>

In [20]:
# Peek at the DataFrame held by the surprise dataset
utility_data.df.head()

Unnamed: 0,user_id,anime_id,rating
47,1,8074,10.0
81,1,11617,10.0
83,1,11757,10.0
101,1,15451,10.0
153,2,11771,10.0


In [21]:
# Same rows from the cleaned frame, for comparison with the dataset's copy
df_clean.head()

Unnamed: 0,user_id,anime_id,rating
47,1,8074,10.0
81,1,11617,10.0
83,1,11757,10.0
101,1,15451,10.0
153,2,11771,10.0


## Data Splitting
Split the data into a train set and a test set for modeling.

In [22]:
# Load library for copy object
import copy

In [23]:
# Create data splitting function
def train_test_split(utility_data, test_size, random_state):
    '''Randomly split a surprise dataset into a train part and a test part.

    The input dataset is deep-copied, so ``utility_data`` itself is left
    untouched. The copy's raw ratings are shuffled with a generator that is
    local to this call, so the process-wide NumPy random state is not
    reseeded as a side effect.

    Parameters
    ----------
    utility_data : object
        Dataset exposing ``raw_ratings``, ``build_full_trainset()`` and
        ``construct_testset()`` (e.g. the result of ``Dataset.load_from_df``).
    test_size : float
        Fraction of the ratings to hold out for testing, between 0 and 1.
    random_state : int
        Seed for the shuffle; the same seed yields the same split.

    Returns
    -------
    full_data : object
        The deep copy, with ``raw_ratings`` reduced to the training ratings.
    train_data : object
        Result of ``full_data.build_full_trainset()`` on the training ratings.
    test_data : object
        Result of ``full_data.construct_testset()`` on the held-out ratings.

    Raises
    ------
    ValueError
        If ``test_size`` is not within [0, 1].
    '''
    if not 0 <= test_size <= 1:
        raise ValueError(f'test_size must be between 0 and 1, got {test_size!r}')

    # Work on a copy so the caller's dataset keeps all of its ratings
    full_data = copy.deepcopy(utility_data)

    # A local legacy generator produces the same permutation as
    # np.random.seed(random_state) + np.random.shuffle, without touching
    # the global random state used by the rest of the notebook
    rng = np.random.RandomState(random_state)

    # Shuffle the raw ratings in place
    raw_ratings = full_data.raw_ratings
    rng.shuffle(raw_ratings)

    # Index separating the training ratings from the held-out ratings
    threshold = int((1-test_size) * len(raw_ratings))

    train_raw_ratings = raw_ratings[:threshold]
    test_raw_ratings = raw_ratings[threshold:]

    # The returned dataset only keeps the training ratings, so anything fitted
    # or cross-validated on it never sees the test ratings
    full_data.raw_ratings = train_raw_ratings
    train_data = full_data.build_full_trainset()
    test_data = full_data.construct_testset(test_raw_ratings)

    return full_data, train_data, test_data


In [24]:
# Split the data
full_data, train_data, test_data = train_test_split(utility_data,
                                                    test_size = 0.2,
                                                    random_state = 42)

In [25]:
# Check splitting result
train_data.n_ratings, len(test_data)

(5069787, 1267447)

## Build Model

In [26]:
# Import library for modeling
from surprise import AlgoBase, KNNBaseline, SVD


### Baseline Model

In [27]:
class MeanPred(AlgoBase):
    '''Baseline algorithm: predict the training set's global mean rating
    for every (user, item) pair.'''

    def __init__(self):
        AlgoBase.__init__(self)

    def fit(self, trainset):
        '''Attach the training set to the algorithm.

        Returns the fitted instance so that calls can be chained, e.g.
        ``algo.fit(trainset).test(testset)``.
        '''
        AlgoBase.fit(self, trainset)

        # Surprise expects fit() to return self; without it fit() returns None
        return self

    def estimate(self, u, i):
        '''Return the global mean rating, regardless of user ``u`` and item ``i``.'''
        return self.trainset.global_mean

In [28]:
# Build baseline model
base_model = MeanPred()
base_model

<__main__.MeanPred at 0x7f8548e6f880>

In [29]:
# Import the cross validation module
from surprise.model_selection import cross_validate

In [30]:
# do cross validation
cv_base = cross_validate(algo = base_model,
                             data = full_data,
                             cv = 5,
                             measures = ['rmse'])

In [33]:
import pickle

# Save the baseline cross-validation results to disk as a pickle file.
# Note: the object written is `cv_base` (the cross_validate output),
# not the fitted model, even though the file is named "base_model.pkl".
base_model_pkl_file = "base_model.pkl"  

with open(base_model_pkl_file, 'wb') as file:  
    pickle.dump(cv_base, file)

In [31]:
# Extract cross validation result
cv_base_rmse = cv_base['test_rmse'].mean()
cv_base_rmse

1.5722800383405358

### KNN Model

In [34]:
# import randomize search cross validation module
from surprise.model_selection.search import RandomizedSearchCV

In [35]:
#create parameter dictionary for KNN model
params_knn = {'k':list(np.arange(start=5, stop=50, step=5)),
          'sim_options':{'name':['cosine','pearson_baseline'],'user_based':[True,False]}}

In [36]:
# Randomized hyper-parameter search for KNNBaseline with 5-fold cross-validation.
# random_state fixes which parameter combinations are sampled, so repeated runs
# evaluate the same candidates.
# NOTE(review): with an integer `cv` the fold assignment itself is presumably
# still unseeded; pass a seeded KFold if fully repeatable scores are needed.
cv_knn = RandomizedSearchCV(algo_class=KNNBaseline,
                            param_distributions=params_knn,
                            cv=5,
                            random_state=42)

In [None]:
# Run the randomized search on the training portion of the data.
# NOTE(review): the search space includes user-based similarities over tens of
# thousands of users, which is presumably very expensive in time and memory;
# confirm this is feasible or sample the data first.
cv_knn.fit(data=full_data)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...


### Funk SVD Model

In [None]:
#create parameter dictionary for Funk SVD model
params_svd = {'lr_all' : [1,0.1,0.01,0.001,0.0001], 'n_factors' : [50,100,150,200],
              'reg_all' : [1,0.1,0.01,0.02]
              }

In [None]:
# Randomized hyper-parameter search for SVD with 5-fold cross-validation.
# random_state fixes which parameter combinations are sampled, so repeated runs
# evaluate the same candidates.
# NOTE(review): with an integer `cv` the fold assignment itself is presumably
# still unseeded; pass a seeded KFold if fully repeatable scores are needed.
cv_svd = RandomizedSearchCV(algo_class=SVD,
                            param_distributions=params_svd,
                            cv=5,
                            random_state=42)

In [None]:
# Run the randomized search on the training portion of the data
cv_svd.fit(data=full_data)

### Model Comparison

In [None]:
# Side-by-side summary: cross-validated RMSE and the best configuration per model
df_comp = pd.DataFrame({
    'Model': ['Baseline', 'KNN', 'Funk SVD'],
    'CV RMSE': [
        cv_base_rmse,
        cv_knn.best_score['rmse'],
        cv_svd.best_score['rmse'],
    ],
    'Model Configuration': [
        'N/A',
        str(cv_knn.best_params['rmse']),
        str(cv_svd.best_params['rmse']),
    ],
})

df_comp