# Part 1: Deep Learning-Based Recommendation
Read the paper Wide and Deep Learning for Recommender Systems. Download the dataset from https://www.kaggle.com/code/jirakst/book-recommendation/input. Based on the architecture described in the paper, build your own Wide and Deep Recommender system for the Book Review dataset. Your model should learn the features of each user and item, not just the associated ID numbers. Use an 80/20 train-test split and record your model’s prediction accuracy.

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch.nn as nn
import torch.optim as optim
from torch_snippets import *
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tqdm.notebook import tqdm

In [None]:
# Check if CUDA is available
# Use the GPU when one is present; otherwise fall back to the CPU so the
# notebook still runs end-to-end on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

In [None]:
# Load the user table (semicolon-separated, ISO-8859-1 encoded).
users = pd.read_csv(
    '/home/zeke/Desktop/datacentral/BookCrossing/BX-Users.csv',
    sep=';',
    encoding='ISO-8859-1',
)

In [None]:
# Load the book metadata; rows that cannot be parsed are skipped.
# 'Year-Of-Publication' is read as text: the raw column appears to mix numeric
# and non-numeric entries, which otherwise makes pandas infer the dtype chunk
# by chunk and emit a mixed-types warning. The column is converted to numbers
# (with errors coerced) in a later cell, so reading it as str loses nothing.
books = pd.read_csv(
    '/home/zeke/Desktop/datacentral/BookCrossing/BX-Books.csv',
    delimiter=';',
    encoding='ISO-8859-1',
    on_bad_lines='skip',
    dtype={'Year-Of-Publication': str},
)

  books = pd.read_csv('/home/zeke/Desktop/datacentral/BookCrossing/BX-Books.csv', delimiter=';', encoding='ISO-8859-1', on_bad_lines='skip')


In [None]:
# Load the (user, book, rating) table (semicolon-separated, ISO-8859-1 encoded).
ratings = pd.read_csv(
    '/home/zeke/Desktop/datacentral/BookCrossing/BX-Book-Ratings.csv',
    sep=';',
    encoding='ISO-8859-1',
)

In [None]:
# (rows, columns) of the users table.
users.shape

(278858, 3)

In [None]:
# Column names of the users table.
users.columns

Index(['User-ID', 'Location', 'Age'], dtype='object')

In [None]:
# (rows, columns) of the books table.
books.shape

(271360, 8)

In [None]:
# Column names of the books table.
books.columns

Index(['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher',
       'Image-URL-S', 'Image-URL-M', 'Image-URL-L'],
      dtype='object')

In [None]:
# (rows, columns) of the ratings table.
ratings.shape

(1149780, 3)

In [None]:
# Column names of the ratings table.
ratings.columns

Index(['User-ID', 'ISBN', 'Book-Rating'], dtype='object')

In [None]:
# Attach user attributes to every rating; the inner join keeps only ratings
# whose User-ID is present in the users table.
data = ratings.merge(users, on='User-ID', how='inner')

In [None]:
# Attach book metadata; the inner join keeps only ratings whose ISBN is
# present in the books table.
data = data.merge(books, on='ISBN', how='inner')

In [None]:
# Column names of the merged ratings/users/books frame.
data.columns

Index(['User-ID', 'ISBN', 'Book-Rating', 'Location', 'Age', 'Book-Title',
       'Book-Author', 'Year-Of-Publication', 'Publisher', 'Image-URL-S',
       'Image-URL-M', 'Image-URL-L'],
      dtype='object')

In [None]:
# The three cover-image URL columns are not used later in this notebook.
to_drop = ['Image-URL-S', 'Image-URL-M', 'Image-URL-L']

data = data.drop(columns=to_drop)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1031136 entries, 0 to 1031135
Data columns (total 9 columns):
 #   Column               Non-Null Count    Dtype  
---  ------               --------------    -----  
 0   User-ID              1031136 non-null  int64  
 1   ISBN                 1031136 non-null  object 
 2   Book-Rating          1031136 non-null  int64  
 3   Location             1031136 non-null  object 
 4   Age                  753301 non-null   float64
 5   Book-Title           1031136 non-null  object 
 6   Book-Author          1031134 non-null  object 
 7   Year-Of-Publication  1031136 non-null  object 
 8   Publisher            1031134 non-null  object 
dtypes: float64(1), int64(2), object(6)
memory usage: 70.8+ MB


In [None]:
# (rows, columns) of the merged frame after dropping the image URL columns.
data.shape

(1031136, 9)

In [None]:
# Preview the first ten merged rows.
data.head(10)

Unnamed: 0,User-ID,ISBN,Book-Rating,Location,Age,Book-Title,Book-Author,Year-Of-Publication,Publisher
0,276725,034545104X,0,"tyler, texas, usa",,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books
1,276726,0155061224,5,"seattle, washington, usa",,Rites of Passage,Judith Rae,2001,Heinle
2,276727,0446520802,0,"h, new south wales, australia",16.0,The Notebook,Nicholas Sparks,1996,Warner Books
3,276729,052165615X,3,"rijeka, n/a, croatia",16.0,Help!: Level 1,Philip Prowse,1999,Cambridge University Press
4,276729,0521795028,6,"rijeka, n/a, croatia",16.0,The Amsterdam Connection : Level 4 (Cambridge ...,Sue Leather,2001,Cambridge University Press
5,276733,2080674722,0,"paris, n/a, france",37.0,Les Particules Elementaires,Michel Houellebecq,1998,Flammarion
6,276744,038550120X,7,"torrance, california, usa",,A Painted House,JOHN GRISHAM,2001,Doubleday
7,276746,0425115801,0,"fort worth, ,",,Lightning,Dean R. Koontz,1996,Berkley Publishing Group
8,276746,0449006522,0,"fort worth, ,",,Manhattan Hunt Club,JOHN SAUL,2002,Ballantine Books
9,276746,0553561618,0,"fort worth, ,",,Dark Paradise,TAMI HOAG,1994,Bantam


In [None]:
# Number of distinct ISBNs in the merged frame.
n_books = data['ISBN'].nunique()
print('Number of books: ', n_books)

In [None]:
# Number of distinct User-IDs in the merged frame.
n_users = data['User-ID'].nunique()
print('Number of users: ', n_users)

In [None]:
# Percentage of missing values per column, rounded to four decimals.
print('Missing data [%]')
missing_pct = data.isna().sum().div(len(data)).mul(100)
missing_pct.round(4)

User-ID                 0.0000
ISBN                    0.0000
Book-Rating             0.0000
Location                0.0000
Age                    26.9446
Book-Title              0.0000
Book-Author             0.0002
Year-Of-Publication     0.0000
Publisher               0.0002
dtype: float64

In [None]:
# Cast to numeric
# Entries that cannot be parsed as a number become NaN and are then replaced
# by the sentinel year 2099 before the column is cast to int.
year_numeric = pd.to_numeric(data['Year-Of-Publication'], errors='coerce')
data['Year-Of-Publication'] = year_numeric.fillna(2099).astype(int)


In [None]:
# Treat a rating of 0 as "no rating" by turning it into a missing value.
# np.nan (rather than None) keeps the column numeric: replacing with None
# converts it to object dtype, and on older pandas releases
# `replace(0, None)` forward-fills instead of inserting a missing value.
data['Book-Rating'] = data['Book-Rating'].replace(0, np.nan)

In [None]:
# Blank out ages above 90. Series.mask keeps the column as float with NaN,
# whereas np.where(..., None, ...) produced an object-dtype array that every
# later numeric step (median, std, astype) had to cope with.
data['Age'] = data['Age'].mask(data['Age'] > 90)

In [None]:
# Missing author / publisher names are given the placeholder 'Unknown'.
for text_col in ('Book-Author', 'Publisher'):
    data[text_col] = data[text_col].fillna('Unknown')

In [None]:
# Sanity check: count of missing values left in the two text columns.
data[['Book-Author', 'Publisher']].isnull().sum()

Book-Author    0
Publisher      0
dtype: int64

In [None]:
# Impute missing ages with random integers drawn from
# [median - std, median + std).
# * The column is coerced to a numeric dtype first so median/std do not depend
#   on how earlier cells represented missing values (None vs NaN).
# * A seeded generator makes the imputation reproducible across kernel
#   restarts; the unseeded global np.random state gave different ages per run.
# * The bounds are truncated to int explicitly instead of handing floats to
#   the integer sampler.
age_numeric = pd.to_numeric(data["Age"], errors="coerce")
median = age_numeric.median()
std = age_numeric.std()
is_null = int(age_numeric.isnull().sum())
rng = np.random.default_rng(42)
rand_age = rng.integers(int(median - std), int(median + std), size=is_null)
age_slice = age_numeric.copy()
age_slice[age_slice.isnull()] = rand_age
data["Age"] = age_slice.astype(int)

In [None]:
# Sanity check: no missing ages should remain after imputation.
data['Age'].isnull().sum()

0

In [None]:
# Dtypes and non-null counts after cleaning.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1031136 entries, 0 to 1031135
Data columns (total 9 columns):
 #   Column               Non-Null Count    Dtype 
---  ------               --------------    ----- 
 0   User-ID              1031136 non-null  int64 
 1   ISBN                 1031136 non-null  object
 2   Book-Rating          383842 non-null   object
 3   Location             1031136 non-null  object
 4   Age                  1031136 non-null  int64 
 5   Book-Title           1031136 non-null  object
 6   Book-Author          1031136 non-null  object
 7   Year-Of-Publication  1031136 non-null  int64 
 8   Publisher            1031136 non-null  object
dtypes: int64(3), object(6)
memory usage: 70.8+ MB


In [None]:
# Preview the first fifteen cleaned rows.
data.head(15)

Unnamed: 0,User-ID,ISBN,Book-Rating,Location,Age,Book-Title,Book-Author,Year-Of-Publication,Publisher
0,276725,034545104X,,"tyler, texas, usa",46,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books
1,276726,0155061224,5.0,"seattle, washington, usa",34,Rites of Passage,Judith Rae,2001,Heinle
2,276727,0446520802,,"h, new south wales, australia",16,The Notebook,Nicholas Sparks,1996,Warner Books
3,276729,052165615X,3.0,"rijeka, n/a, croatia",16,Help!: Level 1,Philip Prowse,1999,Cambridge University Press
4,276729,0521795028,6.0,"rijeka, n/a, croatia",16,The Amsterdam Connection : Level 4 (Cambridge ...,Sue Leather,2001,Cambridge University Press
5,276733,2080674722,,"paris, n/a, france",37,Les Particules Elementaires,Michel Houellebecq,1998,Flammarion
6,276744,038550120X,7.0,"torrance, california, usa",23,A Painted House,JOHN GRISHAM,2001,Doubleday
7,276746,0425115801,,"fort worth, ,",24,Lightning,Dean R. Koontz,1996,Berkley Publishing Group
8,276746,0449006522,,"fort worth, ,",25,Manhattan Hunt Club,JOHN SAUL,2002,Ballantine Books
9,276746,0553561618,,"fort worth, ,",24,Dark Paradise,TAMI HOAG,1994,Bantam


In [None]:
# Work on an independent copy: pd.DataFrame(data) does not copy a DataFrame
# input by default, so the encoding/scaling cells below could otherwise end up
# modifying `data` as well and make earlier cells non-repeatable.
df = data.copy()

In [None]:
# Fill missing ratings with the median rating.
# The column is made numeric first so fillna does not operate on an
# object-dtype column (which triggers a pandas downcasting warning).
# NOTE(review): imputing every missing rating with the median fabricates the
# training/evaluation label for those rows; consider dropping them instead.
df['Book-Rating'] = pd.to_numeric(df['Book-Rating'])
df['Book-Rating'] = df['Book-Rating'].fillna(df['Book-Rating'].median())
# User-ID / ISBN are label-encoded once, in the next cell. Encoding them here
# as well meant the encoders were refitted on already-encoded integers, so
# their classes_ no longer mapped back to the original IDs.

  df['Book-Rating'] = df['Book-Rating'].fillna(df['Book-Rating'].median())


In [None]:
# Encode categorical variables
# Each ID column gets its own encoder, mapping raw IDs to contiguous integer
# codes starting at 0.
user_encoder, isbn_encoder = LabelEncoder(), LabelEncoder()
df['User-ID'] = user_encoder.fit_transform(df['User-ID'])
df['ISBN'] = isbn_encoder.fit_transform(df['ISBN'])

In [None]:
# Normalize continuous variables
# Both columns are standardised in a single fit so the scaler retains the
# mean/scale of each one; refitting the same scaler per column discarded the
# Age statistics and made the transform impossible to invert or re-apply.
# NOTE(review): the scaler is fitted on the full frame before the train/test
# split, so test-set statistics leak into the features.
continuous_cols = ['Age', 'Year-Of-Publication']
scaler = StandardScaler()
scaled = scaler.fit_transform(df[continuous_cols])
df['Age'] = scaled[:, 0]
df['Year-Of-Publication'] = scaled[:, 1]
df.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,Location,Age,Book-Title,Book-Author,Year-Of-Publication,Publisher
0,91362,45921,8.0,"tyler, texas, usa",0.886300,Flesh Tones: A Novel,M. J. Rose,0.146276,Ballantine Books
1,91363,22731,5.0,"seattle, washington, usa",-0.193852,Rites of Passage,Judith Rae,0.141949,Heinle
2,91364,92659,8.0,"h, new south wales, australia",-1.814080,The Notebook,Nicholas Sparks,0.120313,Warner Books
3,91365,111478,3.0,"rijeka, n/a, croatia",-1.814080,Help!: Level 1,Philip Prowse,0.133295,Cambridge University Press
4,91365,111512,6.0,"rijeka, n/a, croatia",-1.814080,The Amsterdam Connection : Level 4 (Cambridge ...,Sue Leather,0.141949,Cambridge University Press
...,...,...,...,...,...,...,...,...,...
1031131,91358,197689,8.0,"cedar park, texas, usa",0.166199,Edgar Cayce on the Akashic Records: The Book o...,Kevin J. Todeschi,0.128968,A.R.E. Press (Association of Research &amp; Enlig
1031132,91358,224480,9.0,"cedar park, texas, usa",0.526250,Get Clark Smart : The Ultimate Guide for the S...,Clark Howard,0.137622,Longstreet Press
1031133,91359,143500,8.0,"quebec, quebec, canada",-1.634055,Eight Weeks to Optimum Health: A Proven Progra...,Andrew Weil,0.124641,Alfred A. Knopf
1031134,91360,108009,10.0,"mannington, west virginia, usa",0.166199,The Sherbrooke Bride (Bride Trilogy (Paperback)),Catherine Coulter,0.120313,Jove Books


In [None]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.feature_extraction import FeatureHasher
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [None]:
# Integer-encode the location strings.
label_encoder = LabelEncoder()
df['Location'] = label_encoder.fit_transform(df['Location'])

# Split the age column into three equal-width buckets (bucket index 0-2).
df['Age_Bin'] = pd.cut(df['Age'], bins=3, labels=False)

# Crossed feature "<age bucket>_<location code>".
age_bin_str = df['Age_Bin'].astype(str)
location_str = df['Location'].astype(str)
df['Age_Location'] = age_bin_str + '_' + location_str

# Hash the crossed feature into a fixed 10-column representation.
# NOTE(review): `wide_features` is not referenced by the X_wide selection in
# the following cell, so this hashed cross is currently unused by the model.
hasher = FeatureHasher(n_features=10, input_type='string')
hashed_features = hasher.transform(df[['Age_Location']].astype(str).to_numpy())
wide_features = hashed_features.toarray()

In [None]:
# Deep input: the two ID codes (for embeddings) followed by the continuous columns.
deep_columns = ['User-ID', 'ISBN', 'Age', 'Year-Of-Publication']
# Wide input: location code plus the same continuous columns.
wide_columns = ['Location', 'Age', 'Year-Of-Publication']

X_deep = df[deep_columns]
X_wide = df[wide_columns]
# Regression target as float32.
y = df['Book-Rating'].astype(np.float32)

In [None]:
# Train-test split
# 80/20 split with a fixed seed; the three inputs are split with the same
# row assignment.
splits = train_test_split(X_deep, X_wide, y, test_size=0.2, random_state=42)
(X_deep_train, X_deep_test,
 X_wide_train, X_wide_test,
 y_train, y_test) = splits

In [None]:
class BookDataset(Dataset):
    """Map-style dataset yielding (deep features, wide features, label) triples.

    Args:
        features_deep: pandas object whose ``to_numpy()`` gives one row of
            deep-branch features per sample.
        features_wide: pandas object whose ``to_numpy()`` gives one row of
            wide-branch features per sample.
        labels: pandas object whose ``to_numpy()`` gives one target per sample.

    All three inputs are converted to float32 tensors once, at construction
    time, so ``__getitem__`` is a plain index lookup instead of building three
    new tensors for every sample on every epoch.
    """

    def __init__(self, features_deep, features_wide, labels):
        # torch.tensor copies, so the dataset never aliases pandas-owned memory.
        self.features_deep = torch.tensor(features_deep.to_numpy(), dtype=torch.float)
        self.features_wide = torch.tensor(features_wide.to_numpy(), dtype=torch.float)
        self.labels = torch.tensor(labels.to_numpy(), dtype=torch.float)

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the float32 tensors (feature_deep, feature_wide, label) for ``idx``."""
        return self.features_deep[idx], self.features_wide[idx], self.labels[idx]

# Creating datasets
train_dataset = BookDataset(X_deep_train, X_wide_train, y_train)
test_dataset = BookDataset(X_deep_test, X_wide_test, y_test)

# DataLoaders
# A batch size of 4 meant several hundred thousand optimizer steps per epoch on
# this dataset, which made the 20-epoch training loop impractically slow.
# NOTE(review): 1024 is a conventional starting point, not a tuned value.
BATCH_SIZE = 1024
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)



In [None]:
import torch.nn.functional as F

In [None]:
# Model definition
class WideAndDeep(nn.Module):
    """Wide & Deep regressor: an embedding MLP plus a linear model, summed.

    Args:
        num_users: number of rows in the user embedding table.
        num_books: number of rows in the book embedding table.
        num_continuous: number of continuous columns that follow the two ID
            columns in the deep input.
        num_wide_features: number of columns in the wide input.
        embedding_dim: width of each embedding vector (default 8, the value
            that was previously hard-coded).
    """

    def __init__(self, num_users, num_books, num_continuous, num_wide_features,
                 embedding_dim=8):
        super().__init__()
        # Deep Component
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.book_embedding = nn.Embedding(num_books, embedding_dim)
        self.fc1 = nn.Linear(embedding_dim * 2 + num_continuous, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)

        # Wide Component
        self.wide_linear = nn.Linear(num_wide_features, 1)

    def forward(self, x, wide_features):
        """Return one prediction per row, shape ``(batch, 1)``.

        Args:
            x: 2-D tensor; column 0 holds user indices, column 1 book indices
                (both cast to long for the embedding lookup), and the
                remaining columns are the continuous features.
            wide_features: 2-D tensor with ``num_wide_features`` columns.
        """
        # Deep Component Processing
        user_ids = x[:, 0].long()
        book_ids = x[:, 1].long()
        continuous_data = x[:, 2:].float()

        user_embedded = self.user_embedding(user_ids)
        book_embedded = self.book_embedding(book_ids)
        deep_input = torch.cat([user_embedded, book_embedded, continuous_data], dim=1)
        deep_output = F.leaky_relu(self.fc1(deep_input))
        deep_output = F.leaky_relu(self.fc2(deep_output))
        deep_output = self.fc3(deep_output)

        # Wide Component Processing
        wide_output = self.wide_linear(wide_features)

        # Combining Wide and Deep Components: the two scalar outputs are summed.
        return deep_output + wide_output



In [None]:
# Model initialization
# Seed torch so weight initialisation and shuffling are repeatable.
torch.manual_seed(42)
num_users = df['User-ID'].nunique()
num_books = df['ISBN'].nunique()
# Derive the input widths from the data instead of hard-coding them, so the
# model stays in sync if the feature selection changes.
num_wide_features = X_wide_train.shape[1]
num_continuous = X_deep_train.shape[1] - 2  # the first two deep columns are the ID codes
model = WideAndDeep(num_users, num_books, num_continuous, num_wide_features).to(device)


# Training setup
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

epochs = 20
for epoch in tqdm(range(epochs), desc='Epochs'):
    model.train()  # Set model to training mode
    total_loss = 0
    for deep_features, wide_features, labels in train_loader:
        # Move data to the device
        deep_features, wide_features, labels = deep_features.to(device), wide_features.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(deep_features, wide_features)
        # squeeze(-1) drops only the trailing size-1 dimension, so a final batch
        # holding a single sample keeps shape (1,) and still matches `labels`.
        loss = criterion(outputs.squeeze(-1), labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch losses for this epoch, then checkpoint the weights.
    print(f'Epoch {epoch+1}, Loss: {total_loss/len(train_loader)}')
    torch.save(model.state_dict(), f'model_epoch_{epoch+1}.pth')


Epochs:   0%|          | 0/20 [00:00<?, ?it/s]

In [None]:
true_ratings = []
predicted_ratings = []

model.eval()  # Set the model to evaluation mode
with torch.no_grad():  # No gradients needed for evaluation
    for deep_features, wide_features, labels in test_loader:
        # Only the model inputs need to be on the device; labels are just collected.
        deep_features, wide_features = deep_features.to(device), wide_features.to(device)

        # squeeze(-1) keeps a 1-D result even for a batch of one sample; a bare
        # squeeze() would return a 0-d tensor whose .tolist() is a float, and
        # list.extend would then raise TypeError.
        outputs = model(deep_features, wide_features).squeeze(-1)

        true_ratings.extend(labels.tolist())
        predicted_ratings.extend(outputs.cpu().tolist())

# Calculate RMSE
# Computed with numpy directly so the cell does not depend on `sqrt` /
# `mean_squared_error` being in scope (neither is explicitly imported in the
# cells shown here).
errors = np.asarray(predicted_ratings) - np.asarray(true_ratings)
rmse = float(np.sqrt(np.mean(errors ** 2)))
print(f"RMSE: {rmse}")