# Import Modules and Load Data Files

In [1]:
import numpy as np
import pandas as pd
from surprise import NormalPredictor, BaselineOnly, KNNBaseline, KNNWithMeans, Reader, Dataset, accuracy, dump
from surprise.model_selection import train_test_split

import os
# Print the full path of every file found under the Kaggle input directory
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)


/kaggle/input/book-ratings/ratings.csv
/kaggle/input/book-ratings/books.csv


In [2]:
# Load the book metadata and the user ratings into dataframes
books_csv = '../input/book-ratings/books.csv'
ratings_csv = '../input/book-ratings/ratings.csv'

books = pd.read_csv(books_csv)
ratings = pd.read_csv(ratings_csv)

# Clean Data Sets
We will remove books with duplicate titles and drop books whose titles contain non-ASCII characters, along with the ratings for the removed books

In [3]:
# Count the missing values in each column of the books dataframe
books.isna().sum()

book_id                         0
goodreads_book_id               0
best_book_id                    0
work_id                         0
books_count                     0
isbn                          700
isbn13                        585
authors                         0
original_publication_year      21
original_title                585
title                           0
language_code                1084
average_rating                  0
ratings_count                   0
work_ratings_count              0
work_text_reviews_count         0
ratings_1                       0
ratings_2                       0
ratings_3                       0
ratings_4                       0
ratings_5                       0
image_url                       0
small_image_url                 0
dtype: int64

In [4]:
# Show descriptive statistics (count, mean, std, min, quartiles, max) for the ratings data
ratings.describe()

Unnamed: 0,user_id,book_id,rating
count,5976479.0,5976479.0,5976479.0
mean,26224.46,2006.477,3.919866
std,15413.23,2468.499,0.9910868
min,1.0,1.0,1.0
25%,12813.0,198.0,3.0
50%,25938.0,885.0,4.0
75%,39509.0,2973.0,5.0
max,53424.0,10000.0,5.0


In [5]:
# Keep only the first row for each distinct book title
books = books.drop_duplicates(subset='title', keep='first')

In [6]:
def is_english(s):
    """Return True if ``s`` is a string made up only of ASCII characters.

    Despite the name, this is a character-set check, not language detection:
    any character outside ASCII (including accented Latin letters such as
    "é") makes the result False.

    Parameters
    ----------
    s : str
        The text to check (here, a book title).

    Returns
    -------
    bool
        True for an all-ASCII string (the empty string included); False for
        a string with any non-ASCII character or for a non-string value.
    """
    # Non-string cells (e.g. NaN for a missing title) cannot be ASCII text
    if not isinstance(s, str):
        return False
    try:
        # Encoding straight to ASCII fails on every non-ASCII character,
        # lone surrogates included, with a single exception type.
        s.encode('ascii')
    except UnicodeEncodeError:
        return False
    return True

# Keep only the books whose title passes the is_english check
title_ok = books['title'].apply(is_english)
books = books[title_ok]

# Keep only ratings whose book_id is still present in books; this removes the
# ratings of every book dropped so far (failed title check or duplicate title)
kept_book_ids = books['book_id'].tolist()
ratings = ratings[ratings['book_id'].isin(kept_book_ids)]

# Create Training and Test Data Sets
We will use an 80/20 split for the training and test sets

In [7]:
# Seed for the train/test shuffle so the split is identical on every run
SPLIT_SEED = 42

# Reader describing the rating scale of the ratings data (1 to 5)
reader = Reader(rating_scale=(1, 5))

# Load the user, item and rating columns (in that order) into the Dataset class
book_data = Dataset.load_from_df(ratings[['user_id', 'book_id', 'rating']], reader)

# Split the data into a training and testing set using an 80/20 split.
# A fixed random_state keeps the split, and the RMSE values computed from it,
# reproducible after a kernel restart.
train_set, test_set = train_test_split(book_data, test_size=0.2, random_state=SPLIT_SEED)

# Model Fitting and Parameter Adjustment

In [8]:
# Fit NormalPredictor on the training set and report its RMSE on the test set
norm = NormalPredictor()
norm_pred = norm.fit(train_set).test(test_set)
accuracy.rmse(norm_pred)

RMSE: 1.3230


1.323036569665345

In [9]:
# Fit BaselineOnly with ALS-estimated biases and report its RMSE on the test set
als_bsl_options = {'method': 'als'}
baseline = BaselineOnly(bsl_options=als_bsl_options)
baseline_pred = baseline.fit(train_set).test(test_set)
accuracy.rmse(baseline_pred)

Estimating biases using als...
RMSE: 0.8549


0.8548847840711998

In [10]:
# Item-based KNNBaseline (k=40, pearson_baseline similarity) with ALS baselines
item_sim_options = {'name': 'pearson_baseline', 'user_based': False}
als_bsl_options = {'method': 'als'}
als_model = KNNBaseline(k=40, sim_options=item_sim_options, bsl_options=als_bsl_options)
als_pred = als_model.fit(train_set).test(test_set)
accuracy.rmse(als_pred)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.7970


0.7969910311736281

In [11]:
# Item-based KNNBaseline (k=40, pearson_baseline similarity) with SGD baselines
item_sim_options = {'name': 'pearson_baseline', 'user_based': False}
sgd_bsl_options = {'method': 'sgd'}
sgd_model = KNNBaseline(k=40, sim_options=item_sim_options, bsl_options=sgd_bsl_options)
sgd_pred = sgd_model.fit(train_set).test(test_set)
accuracy.rmse(sgd_pred)

Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.7973


0.7972748523750234

In [12]:
# Same ALS configuration with k=20; this rebinds als_model and als_pred,
# replacing the k=40 model and predictions created earlier
item_sim_options = {'name': 'pearson_baseline', 'user_based': False}
als_bsl_options = {'method': 'als'}
als_model = KNNBaseline(k=20, sim_options=item_sim_options, bsl_options=als_bsl_options)
als_pred = als_model.fit(train_set).test(test_set)
accuracy.rmse(als_pred)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.7959


0.7958647039682978

In [13]:
# Same SGD configuration with k=20; this rebinds sgd_model and sgd_pred,
# replacing the k=40 model and predictions created earlier
item_sim_options = {'name': 'pearson_baseline', 'user_based': False}
sgd_bsl_options = {'method': 'sgd'}
sgd_model = KNNBaseline(k=20, sim_options=item_sim_options, bsl_options=sgd_bsl_options)
sgd_pred = sgd_model.fit(train_set).test(test_set)
accuracy.rmse(sgd_pred)

Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.7963


0.7962780858648439

In [14]:
# Save the model currently bound to als_model so we can reuse it in our application
model_path = '/kaggle/working/item_model'
dump.dump(model_path, algo=als_model)