# Preprocessing

In [82]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [83]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

books_df = pd.read_csv('/content/drive/MyDrive/ice_global/data/Books.csv', encoding="latin-1")
print(books_df.shape)
books_df.head()

(271360, 8)


Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [84]:
ratings_df = pd.read_csv('/content/drive/MyDrive/ice_global/data/Ratings.csv', encoding="latin-1")
print(ratings_df.shape)
ratings_df.head()

(1149780, 3)


Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [85]:
users_df = pd.read_csv('/content/drive/MyDrive/ice_global/data/Users.csv', encoding="latin-1")
print(users_df.shape)
users_df.head()

(278858, 3)


Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


### Age Data

In [86]:
users_df = users_df.dropna(subset=["Age"])
print(users_df.shape)

(168096, 3)


### Book Data

In [87]:
books_df = books_df.dropna(subset=["Book-Title", "Book-Author", "Year-Of-Publication", "Publisher"])
print(books_df.shape)

(271356, 8)


### Rating Data

In [88]:
# Take only users who have rated at least 5 books
user_rating_counts = ratings_df["User-ID"].value_counts()
active_users = user_rating_counts[user_rating_counts >= 5].index
ratings_df = ratings_df[ratings_df["User-ID"].isin(active_users)]

# Take only books that have been rated at least 5 times
book_rating_counts = ratings_df["ISBN"].value_counts()
popular_books = book_rating_counts[book_rating_counts >= 5].index
ratings_df = ratings_df[ratings_df["ISBN"].isin(popular_books)]

# ratings_df = ratings_df[ratings_df["Book-Rating"] > 0]
ratings_df = ratings_df.dropna(subset=["Book-Rating"])

print(ratings_df.shape)

(608766, 3)


### Merge the Data

In [89]:
# Ratings + Books
ratings_books = pd.merge(ratings_df, books_df, on="ISBN", how="inner")

# Ratings + Users
ratings_full = pd.merge(ratings_books, users_df, on="User-ID", how="inner")

# Final tablo
final_df = ratings_full.copy()
print(final_df.shape)
final_df

(437552, 12)


Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L,Location,Age
0,276747,0060517794,9,Little Altars Everywhere,Rebecca Wells,2003,HarperTorch,http://images.amazon.com/images/P/0060517794.0...,http://images.amazon.com/images/P/0060517794.0...,http://images.amazon.com/images/P/0060517794.0...,"iowa city, iowa, usa",25.0
1,276747,0451192001,0,How Stella Got Her Groove Back,Terry McMillan,1997,Signet Book,http://images.amazon.com/images/P/0451192001.0...,http://images.amazon.com/images/P/0451192001.0...,http://images.amazon.com/images/P/0451192001.0...,"iowa city, iowa, usa",25.0
2,276747,0609801279,0,The Law of Love,Laura Esquivel,1997,Three Rivers Press (CA),http://images.amazon.com/images/P/0609801279.0...,http://images.amazon.com/images/P/0609801279.0...,http://images.amazon.com/images/P/0609801279.0...,"iowa city, iowa, usa",25.0
3,276747,0671537458,9,Waiting to Exhale,Terry McMillan,1995,Pocket,http://images.amazon.com/images/P/0671537458.0...,http://images.amazon.com/images/P/0671537458.0...,http://images.amazon.com/images/P/0671537458.0...,"iowa city, iowa, usa",25.0
4,276747,0679776818,8,Birdsong: A Novel of Love and War,Sebastian Faulks,1997,Vintage Books USA,http://images.amazon.com/images/P/0679776818.0...,http://images.amazon.com/images/P/0679776818.0...,http://images.amazon.com/images/P/0679776818.0...,"iowa city, iowa, usa",25.0
...,...,...,...,...,...,...,...,...,...,...,...,...
437547,276690,0440439884,0,Island of the Blue Dolphins,SCOTT O'DELL,1971,Random House Children's Books,http://images.amazon.com/images/P/0440439884.0...,http://images.amazon.com/images/P/0440439884.0...,http://images.amazon.com/images/P/0440439884.0...,"wakeman, ohio, usa",43.0
437548,276690,0590453653,0,"Welcome to Dead House (Goosebumps, No 1)",R. L. Stine,1995,Scholastic,http://images.amazon.com/images/P/0590453653.0...,http://images.amazon.com/images/P/0590453653.0...,http://images.amazon.com/images/P/0590453653.0...,"wakeman, ohio, usa",43.0
437549,276690,0590453688,0,"Say Cheese and Die! (Goosebumps, No 4)",R. L. Stine,1995,Scholastic,http://images.amazon.com/images/P/0590453688.0...,http://images.amazon.com/images/P/0590453688.0...,http://images.amazon.com/images/P/0590453688.0...,"wakeman, ohio, usa",43.0
437550,276690,0590455419,0,The Addams Family,Elizabeth Faucher,1991,Scholastic Paperbacks (Mm),http://images.amazon.com/images/P/0590455419.0...,http://images.amazon.com/images/P/0590455419.0...,http://images.amazon.com/images/P/0590455419.0...,"wakeman, ohio, usa",43.0


# Models

### Neural Collaborative Filtering (NCF)

In [90]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

def build_sampled_implicit_df(df, min_rating=7, neg_ratio=4):
    pos_df = df[df["Book-Rating"] >= min_rating].copy()
    neg_samples = []
    users = pos_df["User-ID"].unique()
    all_books = set(df["ISBN"].unique())
    for user in users:
        pos_books = set(pos_df[pos_df["User-ID"] == user]["ISBN"])
        user_books = set(df[df["User-ID"] == user]["ISBN"])
        neg_candidates = list(all_books - user_books)
        np.random.shuffle(neg_candidates)
        n_neg = min(len(pos_books) * neg_ratio, len(neg_candidates))
        for neg_isbn in neg_candidates[:n_neg]:
            neg_samples.append([user, neg_isbn, 0])
    neg_df = pd.DataFrame(neg_samples, columns=["User-ID", "ISBN", "Book-Rating"])
    sampled_df = pd.concat([pos_df[["User-ID", "ISBN", "Book-Rating"]], neg_df], ignore_index=True)
    return sampled_df

users_df['location_index'] = LabelEncoder().fit_transform(users_df['Location'].astype(str))
users_df['Age'] = users_df['Age'].fillna(users_df['Age'].mean())
users_df['Age_norm'] = (users_df['Age'] - users_df['Age'].min()) / (users_df['Age'].max() - users_df['Age'].min())

raw_train_df, raw_test_df = train_test_split(final_df, test_size=0.1, random_state=42)
raw_test_df = raw_test_df[
    raw_test_df["User-ID"].isin(raw_train_df["User-ID"]) &
    raw_test_df["ISBN"].isin(raw_train_df["ISBN"])
].copy()

sampled_train_df = build_sampled_implicit_df(raw_train_df, min_rating=7, neg_ratio=4)

user_encoder = LabelEncoder()
book_encoder = LabelEncoder()
sampled_train_df["user_index"] = user_encoder.fit_transform(sampled_train_df["User-ID"])
sampled_train_df["book_index"] = book_encoder.fit_transform(sampled_train_df["ISBN"])

raw_test_df = raw_test_df[
    raw_test_df["User-ID"].isin(user_encoder.classes_) &
    raw_test_df["ISBN"].isin(book_encoder.classes_)
].copy()
raw_test_df["user_index"] = user_encoder.transform(raw_test_df["User-ID"])
raw_test_df["book_index"] = book_encoder.transform(raw_test_df["ISBN"])

sampled_train_df = sampled_train_df.merge(users_df[["User-ID", "location_index", "Age_norm"]], on="User-ID", how="left")
raw_test_df = raw_test_df.merge(users_df[["User-ID", "location_index", "Age_norm"]], on="User-ID", how="left")

class BPRDataset(Dataset):
    def __init__(self, df, num_items):
        self.users = df["user_index"].values
        self.pos_items = df["book_index"].values
        self.locations = df["location_index"].values
        self.ages = torch.tensor(df["Age_norm"].values, dtype=torch.float32)
        self.num_items = num_items
        self.user_pos_items = df[df["Book-Rating"] >= 0].groupby("user_index")["book_index"].apply(set).to_dict()
    def __len__(self):
        return len(self.users)
    def __getitem__(self, idx):
        user = self.users[idx]
        pos_item = self.pos_items[idx]
        location = self.locations[idx]
        age = self.ages[idx]
        # Negatif sampling
        while True:
            neg_item = np.random.randint(self.num_items)
            if neg_item not in self.user_pos_items.get(user, set()):
                break
        return user, pos_item, neg_item, location, age

class NCF(nn.Module):
    def __init__(self, num_users, num_items, num_locations, embedding_dim=64):
        super(NCF, self).__init__()
        self.user_embed = nn.Embedding(num_users, embedding_dim)
        self.item_embed = nn.Embedding(num_items, embedding_dim)
        self.loc_embed = nn.Embedding(num_locations, 8)
        self.age_fc = nn.Linear(1, 4)
        self.final_fc = nn.Linear(1 + 8 + 4, 1)  # sim_score + loc_vec + age_vec

    def forward(self, user, item, location, age):
        user_vec = self.user_embed(user)
        item_vec = self.item_embed(item)
        sim_score = (user_vec * item_vec).sum(dim=1, keepdim=True)
        loc_vec = self.loc_embed(location)
        age_vec = self.age_fc(age.unsqueeze(1))
        x = torch.cat([sim_score, loc_vec, age_vec], dim=1)
        out = self.final_fc(x)
        return out.squeeze()

def bpr_loss(pos_scores, neg_scores):
    return -torch.log(torch.sigmoid(pos_scores - neg_scores) + 1e-8).mean()

def train_bpr_epoch(model, loader, optimizer, device):
    model.train()
    total_loss = 0
    for users, pos_items, neg_items, locations, ages in loader:
        users = users.to(device)
        pos_items = pos_items.to(device)
        neg_items = neg_items.to(device)
        locations = locations.to(device)
        ages = ages.to(device)
        optimizer.zero_grad()
        pos_scores = model(users, pos_items, locations, ages)
        neg_scores = model(users, neg_items, locations, ages)
        loss = bpr_loss(pos_scores, neg_scores)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

num_users = sampled_train_df["user_index"].nunique()
num_books = sampled_train_df["book_index"].nunique()
num_locations = users_df["location_index"].nunique()

train_dataset = BPRDataset(sampled_train_df, num_books)
train_loader = DataLoader(train_dataset, batch_size=1024, shuffle=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = NCF(num_users, num_books, num_locations).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(100):
    train_loss = train_bpr_epoch(model, train_loader, optimizer, device)
    print(f"Epoch {epoch+1} - BPR Loss: {train_loss:.4f}")

Epoch 1 - BPR Loss: 0.8259
Epoch 2 - BPR Loss: 0.6931
Epoch 3 - BPR Loss: 0.6930
Epoch 4 - BPR Loss: 0.6930
Epoch 5 - BPR Loss: 0.6930
Epoch 6 - BPR Loss: 0.6929
Epoch 7 - BPR Loss: 0.6926
Epoch 8 - BPR Loss: 0.6920
Epoch 9 - BPR Loss: 0.6908
Epoch 10 - BPR Loss: 0.6882
Epoch 11 - BPR Loss: 0.6831
Epoch 12 - BPR Loss: 0.6757
Epoch 13 - BPR Loss: 0.6662
Epoch 14 - BPR Loss: 0.6553
Epoch 15 - BPR Loss: 0.6429
Epoch 16 - BPR Loss: 0.6303
Epoch 17 - BPR Loss: 0.6182
Epoch 18 - BPR Loss: 0.6041
Epoch 19 - BPR Loss: 0.5905
Epoch 20 - BPR Loss: 0.5771
Epoch 21 - BPR Loss: 0.5628
Epoch 22 - BPR Loss: 0.5475
Epoch 23 - BPR Loss: 0.5341
Epoch 24 - BPR Loss: 0.5198
Epoch 25 - BPR Loss: 0.5065
Epoch 26 - BPR Loss: 0.4923
Epoch 27 - BPR Loss: 0.4791
Epoch 28 - BPR Loss: 0.4657
Epoch 29 - BPR Loss: 0.4527
Epoch 30 - BPR Loss: 0.4396
Epoch 31 - BPR Loss: 0.4266
Epoch 32 - BPR Loss: 0.4142
Epoch 33 - BPR Loss: 0.4040
Epoch 34 - BPR Loss: 0.3923
Epoch 35 - BPR Loss: 0.3803
Epoch 36 - BPR Loss: 0.3696
E

In [None]:
def recommend_books_ncf(user_id, top_k=10):
    user_id = str(user_id).strip()
    if user_id not in user_encoder.classes_:
        return []
    user_idx = user_encoder.transform([user_id])[0]

    user_row = users_df[users_df["User-ID"] == user_id]
    if user_row.empty:
        return []
    location_idx = user_row["location_index"].values[0]
    age_norm = user_row["Age_norm"].values[0]
    read_books = set() 
    unread_books = [isbn for isbn in book_encoder.classes_ if isbn not in read_books]
    if not unread_books:
        return []
    unread_book_indices = book_encoder.transform(unread_books)
    user_tensor = torch.tensor([user_idx] * len(unread_book_indices), dtype=torch.long)
    book_tensor = torch.tensor(unread_book_indices, dtype=torch.long)
    loc_tensor = torch.tensor([location_idx] * len(unread_book_indices), dtype=torch.long)
    age_tensor = torch.tensor([age_norm] * len(unread_book_indices), dtype=torch.float32)
    model.eval()
    with torch.no_grad():
        scores = model(user_tensor, book_tensor, loc_tensor, age_tensor).cpu().numpy()
    top_indices = scores.argsort()[::-1][:top_k]
    recommended_isbns = [unread_books[i] for i in top_indices]
    return recommended_isbns


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
import random

books_df["ISBN"] = books_df["ISBN"].astype(str).str.strip()
final_df["ISBN"] = final_df["ISBN"].astype(str).str.strip()
final_df["User-ID"] = final_df["User-ID"].astype(str).str.strip()

train_df, test_df = train_test_split(final_df, test_size=0.1, random_state=42)
train_df["User-ID"] = train_df["User-ID"].astype(str).str.strip()
test_df["User-ID"] = test_df["User-ID"].astype(str).str.strip()
train_df["ISBN"] = train_df["ISBN"].astype(str).str.strip()
test_df["ISBN"] = test_df["ISBN"].astype(str).str.strip()

test_df = test_df[
    test_df["User-ID"].isin(train_df["User-ID"]) &
    test_df["ISBN"].isin(train_df["ISBN"])
].copy()

user_encoder = LabelEncoder()
book_encoder = LabelEncoder()
user_encoder.fit(train_df["User-ID"])
book_encoder.fit(train_df["ISBN"])
train_df["user_index"] = user_encoder.transform(train_df["User-ID"])
train_df["book_index"] = book_encoder.transform(train_df["ISBN"])
test_df["user_index"] = user_encoder.transform(test_df["User-ID"])
test_df["book_index"] = book_encoder.transform(test_df["ISBN"])

users_df["User-ID"] = users_df["User-ID"].astype(str).str.strip()
users_df["location_index"] = users_df["Location"].astype("category").cat.codes
users_df["Age"] = pd.to_numeric(users_df["Age"], errors='coerce')
users_df.loc[(users_df["Age"] < 5) | (users_df["Age"] > 100), "Age"] = np.nan
users_df["Age"].fillna(users_df["Age"].median(), inplace=True)
users_df["Age_norm"] = (users_df["Age"] - users_df["Age"].min()) / (users_df["Age"].max() - users_df["Age"].min())

user_counts = test_df["user_index"].value_counts()
active_users = user_counts[user_counts >= 10].index
book_counts = test_df["book_index"].value_counts()
popular_books = book_counts[book_counts >= 10].index

filtered_test_df = test_df[
    test_df["user_index"].isin(active_users) &
    test_df["book_index"].isin(popular_books)
].copy()

encoder_user_set = set(user_encoder.classes_)
encoder_book_set = set(book_encoder.classes_)

filtered_test_df = filtered_test_df[
    filtered_test_df["User-ID"].isin(encoder_user_set) &
    filtered_test_df["ISBN"].isin(encoder_book_set)
].copy()

sample_users = filtered_test_df["User-ID"].drop_duplicates().astype(str).str.strip().tolist()

In [93]:
def coverage(recommend_func, user_list, total_books, top_k=10):
    recommended_books = set()
    for user_id in user_list:
        recs = recommend_func(user_id, top_k=top_k)
        recommended_books.update(recs)
    return len(recommended_books) / total_books

coverage_score = coverage(recommend_books_ncf, sample_users, num_books, top_k=10)
print(f"Coverage@10: {coverage_score:.3f}")


Coverage@10: 0.280


In [94]:
def diversity(recommend_func, user_list, top_k=10):
    all_books = []
    for user_id in user_list:
        recs = set(recommend_func(user_id, top_k=top_k))
        all_books.append(recs)

    from itertools import combinations
    jaccard_sims = []
    for a, b in combinations(all_books, 2):
        intersection = len(a & b)
        union = len(a | b)
        if union > 0:
            jaccard = intersection / union
            jaccard_sims.append(jaccard)
    mean_jaccard = np.mean(jaccard_sims)
    diversity_score = 1 - mean_jaccard
    return diversity_score

diversity_score = diversity(recommend_books_ncf, sample_users[:10000], top_k=10)  # 10000
print(f"Diversity@10: {diversity_score:.3f}")


Diversity@10: 0.999


In [95]:
def personalization(recommend_func, user_list, top_k=10):
    all_recs = [set(recommend_func(uid, top_k=top_k)) for uid in user_list]
    from itertools import combinations
    similarities = []
    for i, j in combinations(range(len(all_recs)), 2):
        overlap = len(all_recs[i].intersection(all_recs[j]))
        similarities.append(overlap / top_k)
    mean_similarity = np.mean(similarities)
    personalization_score = 1 - mean_similarity
    return personalization_score

personalization_score = personalization(recommend_books_ncf, sample_users[:100], top_k=10)
print(f"Personalization@10: {personalization_score:.3f}")


Personalization@10: 0.998


In [96]:
def novelty(recommend_func, user_list, pop_counts, top_k=10):
    total = 0
    count = 0
    for user_id in user_list:
        recs = recommend_func(user_id, top_k=top_k)
        for isbn in recs:
            pop = pop_counts.get(isbn, 0)
            total += np.log2(1 + pop)
            count += 1
    novelty_score = 1 - (total / count) / np.log2(1 + pop_counts.max())
    return novelty_score

popularity_counts = sampled_train_df["ISBN"].value_counts()
novelty_score = novelty(recommend_books_ncf, sample_users[:100], popularity_counts, top_k=10)
print(f"Novelty@10: {novelty_score:.3f}")

Novelty@10: 0.453


In [97]:
torch.save(model.state_dict(), "/content/drive/MyDrive/ice_global/notebook/model.pth")

In [104]:
import pickle
with open('user_encoder.pkl', 'wb') as f:
    pickle.dump(user_encoder, f)
with open('book_encoder.pkl', 'wb') as f:
    pickle.dump(book_encoder, f)

users_df.to_csv("/content/drive/MyDrive/ice_global/data/users_df_for_dashboard.csv", index=False)
books_df.to_csv("/content/drive/MyDrive/ice_global/data/books_df_for_dashboard.csv", index=False)