In [1]:
from pycaret.regression import *
import pandas as pd
import numpy as np
import re

In [2]:
inters = pd.read_csv('~/data/train_ratings.csv')
users = pd.read_csv('~/data/users.csv')
books = pd.read_csv('~/data/books.csv')

In [3]:
users['location_city'] = users['location'].apply(lambda x: re.sub("\s+", '', x.split(',')[0])).replace('n/a', np.NaN).replace('', np.NaN)
users['location_state'] = users['location'].apply(lambda x: re.sub("\s+", '', x.split(',')[1])).replace('n/a', np.NaN).replace('', np.NaN)
users['location_country'] = users['location'].apply(lambda x: re.sub("\s+", '', x.split(',')[2])).replace('n/a', np.NaN).replace('', np.NaN)
users = users.drop(['location'], axis = 1)

In [4]:
books['summary'] = books['summary'].fillna('None')

In [5]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68092 entries, 0 to 68091
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   user_id           68092 non-null  int64  
 1   age               40259 non-null  float64
 2   location_city     67999 non-null  object 
 3   location_state    64920 non-null  object 
 4   location_country  65970 non-null  object 
dtypes: float64(1), int64(1), object(3)
memory usage: 2.6+ MB


In [6]:
train = inters.merge(users, on='user_id', how='left').merge(books.drop(['img_url', 'img_path'], axis=1), on='isbn', how='left')

In [7]:
train = train.drop(['book_title', 'summary'], axis=1)

In [8]:
# indexing

def age_map(x: int) -> int:
    x = int(x)
    if x < 20:
        return 1
    elif x >= 20 and x < 30:
        return 2
    elif x >= 30 and x < 40:
        return 3
    elif x >= 40 and x < 50:
        return 4
    elif x >= 50 and x < 60:
        return 5
    else:
        return 6

user2idx = {id: idx for idx, id in enumerate(users['user_id'])}
loc_city2idx = {v:k for k,v in enumerate(users['location_city'].unique())}
loc_state2idx = {v:k for k,v in enumerate(users['location_state'].unique())}
loc_country2idx = {v:k for k,v in enumerate(users['location_country'].unique())}

isbn2idx = {isbn:idx for idx, isbn in enumerate(books['isbn'])}
category2idx = {v:k for k,v in enumerate(books['category'].unique())}
publisher2idx = {v:k for k,v in enumerate(books['publisher'].unique())}
language2idx = {v:k for k,v in enumerate(books['language'].unique())}
author2idx = {v:k for k,v in enumerate(books['book_author'].unique())}


train['user_id'] = train['user_id'].map(user2idx)
train['location_city'] = train['location_city'].map(loc_city2idx)
train['location_state'] = train['location_state'].map(loc_state2idx)
train['location_country'] = train['location_country'].map(loc_country2idx)
train['age'] = train['age'].fillna(int(train['age'].mean()))
train['age'] = train['age'].apply(age_map)

train['isbn'] = train['isbn'].map(isbn2idx)
train['category'] = train['category'].map(category2idx)
train['publisher'] = train['publisher'].map(publisher2idx)
train['language'] = train['language'].map(language2idx)
train['book_author'] = train['book_author'].map(author2idx)
train['year_of_publication'] = train['year_of_publication'].apply(lambda x: int(x))

In [9]:
train.head()

Unnamed: 0,user_id,isbn,rating,age,location_city,location_state,location_country,book_author,year_of_publication,publisher,language,category
0,0,0,4,3,0,0,0,0,2001,0,0,0
1,3,0,7,3,3,0,0,0,2001,0,0,0
2,7,0,8,3,5,0,0,0,2001,0,0,0
3,9,0,8,3,6,0,0,0,2001,0,0,0
4,10,0,9,3,7,0,0,0,2001,0,0,0


In [10]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 306795 entries, 0 to 306794
Data columns (total 12 columns):
 #   Column               Non-Null Count   Dtype
---  ------               --------------   -----
 0   user_id              306795 non-null  int64
 1   isbn                 306795 non-null  int64
 2   rating               306795 non-null  int64
 3   age                  306795 non-null  int64
 4   location_city        306795 non-null  int64
 5   location_state       306795 non-null  int64
 6   location_country     306795 non-null  int64
 7   book_author          306795 non-null  int64
 8   year_of_publication  306795 non-null  int64
 9   publisher            306795 non-null  int64
 10  language             306795 non-null  int64
 11  category             306795 non-null  int64
dtypes: int64(12)
memory usage: 30.4 MB


In [27]:
reg = setup(data = train,
            target = 'rating',
            # numeric_features = ['age', 'year_of_publication'],
            # categorical_features = ['user_id', 'isbn', 'age', 'location_city', 'location_state', 'location_country', 'category', 'publisher', 'language', 'book_author'],
            categorical_features = ['user_id', 'isbn', 'age', 'location_city', 'location_state', 'location_country', 'category', 'publisher', 'language', 'book_author'],
            
            # imputation_type = 'simple',
            
            # text_features = ['book_title', 'summary'],
            # text_features_method = 'tf-idf',
            
            # group_features = ['location_city', 'location_state', 'location_country'],
            
            # max_encoding_ohe = -1,

            preprocess = False,
            
            # fix_imbalance = ,
            # fix_imbalance_method = ,
            
            # remove_outliers = False,
            # outliers_method = ,
            # outliers_threshold
            
            
            
            train_size = 0.8,
            data_split_stratify = True,
            fold_strategy = 'stratifiedkfold',
            fold_shuffle = True,
            fold = 5,
            use_gpu = False
        
           )

Unnamed: 0,Description,Value
0,Session id,6075
1,Target,rating
2,Target type,Regression
3,Original data shape,"(306795, 12)"
4,Transformed data shape,"(306795, 12)"
5,Transformed train set shape,"(245436, 12)"
6,Transformed test set shape,"(61359, 12)"
7,Numeric features,1
8,Categorical features,10


In [73]:
best = compare_models(sort='RMSE', n_select = 5)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
rf,Random Forest Regressor,1.7378,5.1552,2.2705,0.1293,0.399,0.5254,26.069
xgboost,Extreme Gradient Boosting,1.7427,5.1982,2.28,0.122,0.4041,0.5417,4.684
catboost,CatBoost Regressor,1.7456,5.2033,2.2811,0.1211,0.4045,0.5434,9.687
et,Extra Trees Regressor,1.7631,5.3188,2.3063,0.1016,0.4033,0.5288,9.442
lightgbm,Light Gradient Boosting Machine,1.7726,5.3315,2.309,0.0995,0.4086,0.5526,0.466
gbr,Gradient Boosting Regressor,1.8214,5.5679,2.3596,0.0595,0.4154,0.5664,7.943
lr,Linear Regression,1.88,5.8827,2.4254,0.0064,0.4242,0.5843,0.26
ridge,Ridge Regression,1.88,5.8827,2.4254,0.0064,0.4242,0.5843,0.037
br,Bayesian Ridge,1.8799,5.8827,2.4254,0.0064,0.4242,0.5844,0.05
lar,Least Angle Regression,1.88,5.8828,2.4255,0.0064,0.4242,0.5843,0.035


Processing:   0%|          | 0/91 [00:00<?, ?it/s]

In [None]:
tuned_model_list = []
for model in best:
    tuned_model_list.append(tune_model(model, optimize = 'RMSE', fold=5, n_iter = 100, search_library = 'optuna', choose_better = True))