### Baseline model for predicting reccomended book

In [90]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE


### Import dataset, split into train/val/test

In [91]:
df = pd.read_csv('model_data/model_data_wo_eXtern.csv')

df.head(5)

Unnamed: 0,User-ID,Age,City,Region/State,Country,Age_Missing,ISBN,Book-Rating,Rating_Categ,Book-Title,Book-Author,Year-Of-Publication,Publisher
0,2,18.0,stockton,california,usa,0,195153448,0,Not rated,Classical Mythology,Mark P. O. Morford,2002.0,Oxford University Press
1,7,-1.0,washington,dc,usa,1,34542252,0,Not rated,,,,
2,8,-1.0,timmins,ontario,canada,1,2005018,5,Avarege (4-5),Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada
3,8,-1.0,timmins,ontario,canada,1,60973129,0,Not rated,Decision in Normandy,Carlo D'Este,1991.0,HarperPerennial
4,8,-1.0,timmins,ontario,canada,1,374157065,0,Not rated,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999.0,Farrar Straus Giroux


### Remove not rated

In [92]:
### Unique values and counts from rating_categ
df['Rating_Categ'].value_counts()

Rating_Categ
Not rated           716109
Excellent (8-10)    249887
Good (6-7)          113381
Avarege (4-5)        59878
Bad (1-3)            10525
Name: count, dtype: int64

In [93]:

## Rating counts 
rating_counts = df['Book-Rating'].value_counts()

## Sort by index
rating_counts = pd.DataFrame(rating_counts.sort_index())
## Calcualte percentages also and add as another column
rating_counts['percentage'] = round(rating_counts/rating_counts.sum()*100,1)

## Add cumulative percentage
rating_counts['cumulative_percentage'] = rating_counts['percentage'].cumsum()



rating_counts

Unnamed: 0_level_0,count,percentage,cumulative_percentage
Book-Rating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,716109,62.3,62.3
1,1770,0.2,62.5
2,2759,0.2,62.7
3,5996,0.5,63.2
4,8904,0.8,64.0
5,50974,4.4,68.4
6,36924,3.2,71.6
7,76457,6.6,78.2
8,103736,9.0,87.2
9,67541,5.9,93.1


In [94]:
### Create a new binary category for rating (everything above 8 is 1, otherwise 0)

df['Reccomnend'] = np.where(df['Book-Rating'] >= 8, 1, 0)

In [95]:
## Remove not rated
df = df[df['Rating_Categ'] != 'Not rated']
df['Rating_Categ'].value_counts()

Rating_Categ
Excellent (8-10)    249887
Good (6-7)          113381
Avarege (4-5)        59878
Bad (1-3)            10525
Name: count, dtype: int64

In [96]:
df['Reccomnend'].value_counts()

Reccomnend
1    249887
0    183784
Name: count, dtype: int64

Split into train/test/val

In [97]:
### Split into train, val, test
train_df, temp_df = train_test_split(df, test_size=0.4, random_state=42)  # 80% Train
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)  # 10% Val, 10% Test

del(temp_df)

print(f"Train samples:\t\t {len(train_df)/1000} k")
print(f"Validation samples:\t {len(val_df)/1000} k")
print(f"Test samples:\t\t {len(test_df)/1000} k")

Train samples:		 260.202 k
Validation samples:	 86.734 k
Test samples:		 86.735 k


Select features and targets

In [98]:
df.columns

Index(['User-ID', 'Age', 'City', 'Region/State', 'Country', 'Age_Missing',
       'ISBN', 'Book-Rating', 'Rating_Categ', 'Book-Title', 'Book-Author',
       'Year-Of-Publication', 'Publisher', 'Reccomnend'],
      dtype='object')

In [99]:
#features = ['Age', 'City', 'Region/State', 'Country', 'Age_Missing','Year-Of-Publication']
features = ['Age', 'Age_Missing','Year-Of-Publication']
target_class = 'Reccomnend'
target_reg = 'Book-Rating'

In [100]:
X_train = train_df.loc[:,features]
y_train = train_df.loc[:,target_class]

X_val = val_df.loc[:,features]
y_val = val_df.loc[:,target_class]

X_test = test_df.loc[:,features]
y_test = test_df.loc[:,target_class]

y_train_reg = train_df.loc[:,target_reg]
y_val_reg = val_df.loc[:,target_reg]
y_test_reg = test_df.loc[:,target_reg]


In [101]:
X_train = X_train.fillna(0)
X_val = X_val.fillna(0)
X_test = X_test.fillna(0)

### Collaborative filtering

In [102]:
print(train_df.shape)
train_df.columns

(260202, 14)


Index(['User-ID', 'Age', 'City', 'Region/State', 'Country', 'Age_Missing',
       'ISBN', 'Book-Rating', 'Rating_Categ', 'Book-Title', 'Book-Author',
       'Year-Of-Publication', 'Publisher', 'Reccomnend'],
      dtype='object')

In [103]:
### Unique User-IDs
user_ids = len(train_df['User-ID'].unique())
print(f"Unique User-IDs: {user_ids}")

### Unique ISBNs
ISBNs = len(train_df['ISBN'].unique())
print(f"Unique ISBNs: {ISBNs}")

Unique User-IDs: 57790
Unique ISBNs: 128919


In [104]:
### Filter boooks

In [105]:
# Get the top N most rated books
n_books = 1000

top_n_books = train_df.groupby('ISBN').size().sort_values(ascending=False).head(1000).index

# Filter the DataFrame to include only the top N books
subset_df_books = train_df[train_df['ISBN'].isin(top_n_books)].copy()


print(f"Unique ISBNs: {len(subset_df_books['ISBN'].unique())}")


Unique ISBNs: 1000


Filter user

In [116]:
# Step 1: Count the number of ratings per user
user_counts = train_df.groupby('User-ID').size()

n = 25
# Step 2: Filter users who have at least 50 ratings (or any threshold)
users_with_at_least_n_ratings = user_counts[user_counts >= 25].index

subset_df_book_users = subset_df_books[subset_df_books['User-ID'].isin(users_with_at_least_50_ratings)]

print(f"Unique User-IDs: {len(subset_df_book_users['User-ID'].unique())}")
print(f"Unique ISBNs: {len(subset_df_book_users['ISBN'].unique())}")


Unique User-IDs: 1502
Unique ISBNs: 997


In [117]:
subset_df_book_users.shape

(13867, 14)

In [118]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Assume you have a DataFrame with user-item interactions (ratings)
# Example: user-item ratings matrix where rows are users and columns are items
user_item_matrix = subset_df_book_users.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating')

# Compute the cosine similarity between users
user_similarity = cosine_similarity(user_item_matrix.fillna(0))  # Replace NaNs with 0 for similarity calculation

In [119]:
user_similarity

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]], shape=(1502, 1502))