In [3]:
import pandas as pd

In [4]:
# Load the raw book catalogue for a first look.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk; the chunked default appears to be what triggers
# the mixed-type DtypeWarning printed under this cell.
df_book = pd.read_csv('Books.csv', low_memory=False)
df_book.head()

  df_book = pd.read_csv('Books.csv')


Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [5]:
# Number of distinct publishers: value_counts() yields one entry per unique
# non-null value, so the length of its result is the distinct count.
df_book['Publisher'].value_counts().shape

(16807,)

In [6]:
# Load the raw ratings table and preview its first rows.
df_rating = pd.read_csv('Ratings.csv')
df_rating.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [7]:
# Load the raw users table and preview its first rows.
df_user = pd.read_csv('Users.csv')
df_user.head()

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [8]:
# Count the users whose Age is missing.
df_user['Age'].isna().sum()

np.int64(110762)

In [9]:
# (n_rows, n_columns) of the users table.
df_user.shape

(278858, 3)

In [10]:
# Distribution of how many comma-separated fields each Location string has.
# NOTE(review): x.split would raise on a missing (NaN) Location — presumably
# there are none in this file; confirm before reusing on other data.
df_user['Location'].apply(lambda x: len(x.split(','))).value_counts()

Location
3    277348
4      1417
5        72
6        11
7         4
9         2
8         2
2         1
1         1
Name: count, dtype: int64

In [13]:
# --- 0) Imports & config
import os, re, math, gc, random
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

RANDOM_SEED = 42
# Seed both global RNGs so every shuffle in the notebook is repeatable.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

DATA_DIR = "."                  # folder holding Books.csv, Users.csv, Ratings.csv
OUT_DIR  = "./processed"        # destination for the cleaned artifacts
os.makedirs(OUT_DIR, exist_ok=True)

# --- 1) Load raw CSVs (everything is read as text so mixed-type columns load cleanly)
books   = pd.read_csv(f"{DATA_DIR}/Books.csv", dtype=str, encoding="latin-1")
ratings = pd.read_csv(f"{DATA_DIR}/Ratings.csv", dtype=str, encoding="latin-1")
users   = pd.read_csv(f"{DATA_DIR}/Users.csv", dtype=str, encoding="latin-1")

# Standardize column names: trim surrounding whitespace, spaces become hyphens
for _frame in (books, ratings, users):
    _frame.columns = [name.strip().replace(" ", "-") for name in _frame.columns]

# --- 2) Light cleaning helpers
def to_int_safe(x):
    """Parse ``x`` as an integer, returning ``np.nan`` when that is impossible.

    The value is stringified, stripped and routed through ``float`` first so
    that inputs such as ``"7.0"`` or ``" 12 "`` are accepted (fractions are
    truncated toward zero).

    Returns:
        int on success; ``np.nan`` for non-numeric text, NaN and infinities.
    """
    try:
        return int(float(str(x).strip()))
    # Only conversion failures are expected here; a bare ``except`` would also
    # swallow KeyboardInterrupt/SystemExit during a long ``.apply``.
    except (ValueError, TypeError, OverflowError):
        return np.nan

def clamp_year(y):
    """Return the publication year as an int when it lies in 1450..2025, else NaN."""
    year = to_int_safe(y)
    if pd.isna(year):
        return np.nan
    # keep only plausible publication years (Book-Crossing has oddities)
    if year < 1450 or year > 2025:
        return np.nan
    return year

def clean_age(a):
    """Return the age as an int when it lies in 5..95, else NaN."""
    age = to_int_safe(a)
    if pd.isna(age):
        return np.nan
    if age < 5 or age > 95:
        return np.nan
    return age

def split_location(loc):
    """Split a "city, state, country" string into a City/State/Country Series.

    Fields are stripped and lower-cased. Two irregular shapes are handled:

    * More than three comma-separated fields: the last field is taken as the
      country and the one before it as the state; everything in front is
      re-joined into the city. Reading positions 0/1/2 would put a
      non-country field into ``Country`` for these rows.
    * Blank fields (e.g. ``"springfield, , usa"``) become NaN instead of ``""``
      so they are not later counted as a real city/state/country value.

    Fewer than three fields are padded with NaN on the right; a missing input
    yields three NaNs.
    """
    if pd.isna(loc):
        return pd.Series({"City": np.nan, "State": np.nan, "Country": np.nan})

    parts = [p.strip().lower() for p in str(loc).split(",")]
    if len(parts) > 3:
        # Anchor state/country at the end; extra commas belong to the city part.
        leading = [p for p in parts[:-2] if p]
        parts = [", ".join(leading), parts[-2], parts[-1]]
    parts += [""] * (3 - len(parts))

    city, state, country = (p if p else np.nan for p in parts[:3])
    return pd.Series({"City": city, "State": state, "Country": country})

In [None]:

# --- 3) Clean Users
users["Age"] = users["Age"].apply(clean_age)
# Guarded so the cell can be re-run: after the first run "Location" has been
# replaced by City/State/Country and indexing it again would raise KeyError.
if "Location" in users.columns:
    loc_split = users["Location"].apply(split_location)
    users = pd.concat([users.drop(columns=["Location"]), loc_split], axis=1)

# Age buckets (categorical)
# right=False makes every bin [low, high), which is what the labels describe:
# "<18" = [0, 18), "18-24" = [18, 25), ..., "70+" = [70, 120).
# With pandas' default right-closed bins, an 18-year-old would be labelled
# "<18" and a 25-year-old "18-24".
bins = [0, 18, 25, 35, 50, 70, 120]
labels = ["<18","18-24","25-34","35-49","50-69","70+"]
users["AgeBucket"] = pd.cut(users["Age"], bins=bins, labels=labels, right=False)
users["AgeBucket"] = users["AgeBucket"].astype("category")
users["User-ID"] = users["User-ID"].astype(int)

# --- 4) Clean Books
# Out-of-range or unparsable publication years become NaN.
books["Year-Of-Publication"] = books["Year-Of-Publication"].apply(clamp_year)

# Normalize the free-text columns that are present: no NaN, no stray whitespace
_text_cols = [col for col in ("Book-Title", "Book-Author", "Publisher") if col in books.columns]
for _col in _text_cols:
    books[_col] = books[_col].fillna("").astype(str).str.strip()

# One row per ISBN; the first occurrence wins
books = books.drop_duplicates(subset=["ISBN"], keep="first")


In [20]:
# Inspect the column names of the cleaned books table.
books.columns

Index(['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher',
       'Image-URL-S', 'Image-URL-M', 'Image-URL-L'],
      dtype='object')

In [26]:
# --- 5) Clean Ratings
# Parse the numeric columns; unparsable values become NaN
for _col in ("Book-Rating", "User-ID"):
    ratings[_col] = ratings[_col].apply(to_int_safe)

# Discard rows lacking a user id, an ISBN or a rating, then make User-ID integral
ratings = (
    ratings
    .dropna(subset=["User-ID", "ISBN", "Book-Rating"])
    .assign(**{"User-ID": lambda frame: frame["User-ID"].astype(int)})
)

# Inner joins keep only ratings whose ISBN and user exist in the cleaned metadata
df = (
    ratings
    .merge(books[["ISBN"]], on="ISBN", how="inner")
    .merge(users[["User-ID"]], on="User-ID", how="inner")
)

In [29]:
# --- 6) Activity filtering (stabilizes training)
# Single pass: the thresholds are evaluated on the counts before filtering.
min_user_interactions = 5
min_item_interactions = 5

user_counts = df["User-ID"].value_counts()
item_counts = df["ISBN"].value_counts()

keep_users = set(user_counts.index[(user_counts >= min_user_interactions).to_numpy()])
keep_items = set(item_counts.index[(item_counts >= min_item_interactions).to_numpy()])

_is_active_user = df["User-ID"].isin(keep_users)
_is_popular_item = df["ISBN"].isin(keep_items)
df = df.loc[_is_active_user & _is_popular_item].copy()

In [None]:
# --- 7) Build explicit & implicit views
# explicit: the rating as given, in [0..10] (many zeros in Book-Crossing = "no explicit rating")
explicit = df.copy()

# implicit: y = 1 when the rating is >= 5 (tunable threshold), otherwise 0
implicit = df.assign(y=(df["Book-Rating"].fillna(0) >= 5).astype(int))

# --- 8) Create ID <-> index maps (contiguous indices for modeling)
_ordered_users = sorted(explicit["User-ID"].unique())
_ordered_isbns = sorted(explicit["ISBN"].unique())

uid2ix = dict(zip(_ordered_users, range(len(_ordered_users))))
ix2uid = dict(enumerate(_ordered_users))
isbn2ix = dict(zip(_ordered_isbns, range(len(_ordered_isbns))))
ix2isbn = dict(enumerate(_ordered_isbns))

def map_ids(_df):
    """Return a copy of ``_df`` with ``uix``/``iix`` index columns added.

    ``uix`` is looked up from ``User-ID`` via ``uid2ix`` and ``iix`` from
    ``ISBN`` via ``isbn2ix``; rows whose id is missing from either map are
    dropped. The input frame is not modified.
    """
    with_indices = _df.assign(
        uix=_df["User-ID"].map(uid2ix),
        iix=_df["ISBN"].map(isbn2ix),
    )
    return with_indices.dropna(subset=["uix", "iix"])

# Attach the contiguous uix/iix indices to both views.
explicit_mapped = map_ids(explicit)
implicit_mapped = map_ids(implicit)


In [38]:
# --- 9) Title text → TF-IDF → SVD(=dense 64-D) as lightweight item content features
#     (This helps content/hybrid models + cold-start for items)
# Titles are laid out in sorted-ISBN order, i.e. the same order that defines iix.
_ordered_isbns = sorted(isbn2ix.keys())
title_series = (
    books.set_index("ISBN")["Book-Title"]
    .reindex(_ordered_isbns)
    .fillna("")
    .astype(str)
)

tfidf = TfidfVectorizer(max_features=20000, ngram_range=(1,2), min_df=2)
X_tfidf = tfidf.fit_transform(title_series.tolist())

svd = TruncatedSVD(n_components=64, random_state=RANDOM_SEED)
item_title_emb = svd.fit_transform(X_tfidf)  # shape: [num_items, 64]

# Wrap the embedding in a DataFrame and attach the ISBN / iix keys
_svd_cols = [f"title_svd_{k}" for k in range(item_title_emb.shape[1])]
item_content = pd.DataFrame(item_title_emb, columns=_svd_cols)
item_content["ISBN"] = _ordered_isbns
item_content["iix"]  = item_content["ISBN"].map(isbn2ix)

# --- 10) Simple user demographics features (one-hots for top countries + age buckets)
# Restrict to users that survived filtering and attach their contiguous index.
_is_modeled_user = users["User-ID"].isin(uid2ix.keys())
users_small = users.loc[_is_modeled_user].copy()
users_small["uix"] = users_small["User-ID"].map(uid2ix)

In [44]:
# Top K countries to one-hot; rest = "other"
K = 15
top_countries = users_small["Country"].value_counts().head(K).index.tolist()

_country_flags = {
    f"country__{c}": (users_small["Country"] == c).astype(int)
    for c in top_countries
}
# Everything outside the top K (including a missing country) is flagged as "other"
_country_flags["country__other"] = (~users_small["Country"].isin(top_countries)).astype(int)

# Age bucket one-hots (a missing bucket leaves every age flag at 0)
_age_flags = {
    f"age__{b}": (users_small["AgeBucket"] == b).astype(int)
    for b in users_small["AgeBucket"].cat.categories
}

users_small = users_small.assign(**_country_flags, **_age_flags)

_feature_cols = [c for c in users_small.columns if c.startswith(("country__", "age__"))]
user_features = users_small[["uix"] + _feature_cols].copy()

In [43]:
# (n_rows, n_columns) of the item feature table: the SVD columns plus ISBN and iix.
item_content.shape

(40906, 66)

In [51]:


# --- 11) Train/Val/Test split (leave-one-out per user for ranking)
def leave_one_out_split(df_in, rating_col="Book-Rating", seed=None):
    """Hold out one random interaction per user for validation and one for test.

    The rows are shuffled once; within that shuffled order each user's first
    row goes to validation and the second (if the user has one) to test.
    Everything else is training data. Note that a user with one or two
    interactions therefore ends up with no training rows.

    Args:
        df_in: one row per user-item interaction, with a ``User-ID`` column.
            Index labels are assumed to be unique, because the held-out rows
            are removed from ``df_in`` by label.
        rating_col: accepted for interface compatibility; not used.
        seed: seed for the shuffle. ``None`` falls back to the notebook-wide
            ``RANDOM_SEED``, so existing calls produce the same split as before.

    Returns:
        ``(train, val, test)`` DataFrames. ``train`` keeps the row order of
        ``df_in``; ``val`` and ``test`` are in shuffled order.
    """
    if seed is None:
        seed = RANDOM_SEED

    shuffled = df_in.sample(frac=1.0, random_state=seed)
    # Position of each row among its user's rows in shuffled order (0, 1, 2, ...).
    # Vectorised replacement for walking every row with iterrows().
    pick_order = (
        shuffled.groupby("User-ID", sort=False, dropna=False).cumcount().to_numpy()
    )

    val = shuffled[pick_order == 0]
    test = shuffled[pick_order == 1]
    train = df_in.drop(index=shuffled.index[pick_order < 2])
    return train, val, test

# The split is seeded, and both views carry the same rows and index,
# so each view is divided at the same positions.
train_exp, val_exp, test_exp = leave_one_out_split(explicit_mapped)
train_imp, val_imp, test_imp = leave_one_out_split(implicit_mapped)

# --- 12) Save artifacts for next steps
_parquet_outputs = (
    (books, f"{OUT_DIR}/books_clean.parquet"),
    (users, f"{OUT_DIR}/users_clean.parquet"),
    (explicit_mapped, f"{OUT_DIR}/ratings_explicit.parquet"),
    (implicit_mapped, f"{OUT_DIR}/ratings_implicit.parquet"),
    (item_content, f"{OUT_DIR}/item_title_svd64.parquet"),
    (user_features, f"{OUT_DIR}/user_demo_features.parquet"),
)
for _frame, _path in _parquet_outputs:
    _frame.to_parquet(_path, index=False, engine="fastparquet")

# The raw-id -> index maps are stored as JSON
_json_outputs = (
    (uid2ix, f"{OUT_DIR}/uid2ix.json"),
    (isbn2ix, f"{OUT_DIR}/isbn2ix.json"),
)
for _mapping, _path in _json_outputs:
    pd.Series(_mapping).to_json(_path)

print("✅ Data prepared and saved to", OUT_DIR)
print(
    "Shapes:",
    "\n  explicit:", explicit_mapped.shape,
    "\n  implicit:", implicit_mapped.shape,
    "\n  item_content:", item_content.shape,
    "\n  user_features:", user_features.shape,
)


✅ Data prepared and saved to ./processed
Shapes: 
  explicit: (596753, 5) 
  implicit: (596753, 6) 
  item_content: (40906, 66) 
  user_features: (20131, 23)


In [67]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
import numpy as np
import heapq

# --- Build User-Item sparse matrix (unchanged)
# The dimensions come from the ID maps, so every mapped user and item gets a
# row/column even when it has no interaction in the training split.
n_users = len(uid2ix)
n_items = len(isbn2ix)

# Coordinate triplets (row = user index, col = item index, value = implicit label)
# taken from the implicit training split only.
rows = train_imp["uix"].to_numpy()
cols = train_imp["iix"].to_numpy()
vals = train_imp["y"].to_numpy()

# NOTE(review): interactions with y == 0 are passed in as explicit entries, so
# a row's `.indices` presumably lists every training interaction of the user,
# not only the positive ones — confirm this is the intended meaning of "seen".
ui_matrix = csr_matrix((vals, (rows, cols)), shape=(n_users, n_items))

from sklearn.neighbors import NearestNeighbors
import numpy as np, heapq

# Fit once (items = columns of UI matrix)
# The matrix is transposed so that each sample handed to the index is one
# item's column of user interactions; neighbours are found by brute-force
# cosine distance.
knn_items = NearestNeighbors(
    n_neighbors=50,       # tune K
    metric="cosine",
    algorithm="brute",
    n_jobs=-1
).fit(ui_matrix.T)

def recommend_items_for_user(user_id, N=10, K=50):
    """
    On-the-fly item-item recommendations for a user.

    For every item the user has a stored interaction with, the K nearest items
    (cosine) are looked up and their similarities are summed per candidate.

    Args:
        user_id: raw user id (a key of ``uid2ix``).
        N: maximum number of recommendations to return.
        K: neighbours fetched per item; capped at the number of fitted items.

    Returns:
        Up to N ISBNs, best first (ties broken by lower item index). Only
        candidates with a positive accumulated score are returned, so the list
        can be shorter than N and is ``[]`` for unknown users or users without
        training interactions. Items the user already has are never returned.
    """
    if user_id not in uid2ix or N <= 0 or K <= 0:
        return []

    uix = uid2ix[user_id]
    user_items = ui_matrix[uix].indices  # items this user interacted with
    if len(user_items) == 0:
        return []

    # kneighbors raises when asked for more neighbours than fitted samples.
    k_eff = min(K, knn_items.n_samples_fit_)
    dist, idx = knn_items.kneighbors(ui_matrix.T[user_items], n_neighbors=k_eff, return_distance=True)
    sim = 1.0 - dist   # cosine similarity

    # Sum the similarities per candidate item. All returned neighbours are
    # counted: the query item is not guaranteed to come back first (ties,
    # all-zero columns), so blindly dropping position 0 could discard a real
    # neighbour. The query items are removed by the seen-filter below instead.
    scores = np.bincount(idx.ravel(), weights=sim.ravel(), minlength=n_items)

    # filter items already seen
    scores[user_items] = -np.inf

    # Stable sort on the negated scores = descending score, lower index first
    # on ties (the same order heapq.nlargest produced).
    ranked = np.argsort(-scores, kind="stable")[:N]
    # Zero-score candidates carry no evidence; returning them would pad the
    # list with arbitrary items.
    return [ix2isbn[int(i)] for i in ranked if scores[i] > 0]


from sklearn.neighbors import NearestNeighbors
import numpy as np, heapq

# Fit once (users = rows of UI matrix)
# Each sample is one user's row of item interactions; neighbours are found by
# brute-force cosine distance.
knn_users = NearestNeighbors(
    n_neighbors=50,     # tune K
    metric="cosine",
    algorithm="brute",
    n_jobs=-1
).fit(ui_matrix)

def recommend_by_similar_users(user_id, N=10, K=50):
    """
    User-user collaborative filtering.

    Finds the K nearest users (cosine) to the target user and sums their
    interaction values per item, weighted by similarity. Everything stays
    sparse; no dense user-item matrix is built.

    Args:
        user_id: raw user id (a key of ``uid2ix``).
        N: maximum number of recommendations to return.
        K: neighbours to fetch (the target user itself may be among them and
            is discarded); capped at the number of fitted users.

    Returns:
        Up to N ISBNs, best first (ties broken by lower item index). Only
        candidates with a positive accumulated score are returned, so the list
        can be shorter than N. Unknown users and users without training
        interactions get ``[]``, matching ``recommend_items_for_user``.
    """
    if user_id not in uid2ix or N <= 0 or K <= 0:
        return []

    uix = uid2ix[user_id]
    target_row = ui_matrix[uix]
    # Without any interaction there is nothing to compare on; the neighbours
    # would be arbitrary.
    if target_row.nnz == 0:
        return []

    # kneighbors raises when asked for more neighbours than fitted samples.
    k_eff = min(K, knn_users.n_samples_fit_)
    dist, idx = knn_users.kneighbors(target_row, n_neighbors=k_eff, return_distance=True)
    idx = idx.ravel()
    sim = (1.0 - dist.ravel()).astype(np.float32)

    # drop self if present
    mask = idx != uix
    nbr_idxs = idx[mask]
    nbr_sims = sim[mask]

    # Accumulate scores sparsely: scores[j] += w * interaction(nbr, j)
    scores = np.zeros(n_items, dtype=np.float32)
    for nbr, w in zip(nbr_idxs, nbr_sims):
        row = ui_matrix[nbr]           # sparse row
        if row.nnz:
            scores[row.indices] += w * row.data

    # filter items already seen by target user
    scores[target_row.indices] = -np.inf

    # Stable sort on the negated scores = descending score, lower index first
    # on ties (the same order heapq.nlargest produced).
    ranked = np.argsort(-scores, kind="stable")[:N]
    # Zero-score candidates carry no evidence; returning them would pad the
    # list with arbitrary items.
    return [ix2isbn[int(i)] for i in ranked if scores[i] > 0]


# Use the first user in the mapping as a smoke-test subject
some_user_id = next(iter(uid2ix))

# ISBN -> title lookup for readable output
dict_book = books.set_index('ISBN')['Book-Title'].to_dict()

recommended_item_itembased = recommend_items_for_user(some_user_id, N=5, K=50)
recommended_item_userbased  = recommend_by_similar_users(some_user_id, N=5, K=50)

print("Item-Item recs:", recommended_item_itembased)
print("User-User recs:", recommended_item_userbased)
# Titles of the user-based recommendations (displayed as the cell result)
[dict_book[isbn] for isbn in recommended_item_userbased]

Item-Item recs: ['0373791313', '0373791356', '0515131105', '0609607235', '0671676369']
User-User recs: ['0060953691', '0375759778', '0394575202', '0684848783', '0020811853']


['La Cucina: A Novel of Rapture',
 'Prague : A Novel',
 'The Power of One',
 'Tis : A Memoir',
 'POSTCARDS']

In [None]:
import numpy as np
from collections import defaultdict

def precision_at_k(pred, gt, k):
    """Precision@k: distinct relevant items among the first k predictions, divided by k.

    The denominator is always ``k``, even when fewer than k predictions are
    supplied. Returns 0.0 when ``k`` is 0 or there are no predictions.
    """
    if k == 0:
        return 0.0
    head = pred[:k]
    if len(head) == 0:
        return 0.0
    hits = set(head).intersection(gt)
    return len(hits) / float(k)

def recall_at_k(pred, gt, k):
    """Recall@k: share of the distinct relevant items found in the first k predictions.

    The ground truth is de-duplicated first, so a relevant item listed twice
    in ``gt`` does not inflate the denominator. Returns 0.0 when there is no
    ground truth.
    """
    relevant = set(gt)
    if not relevant:
        return 0.0
    hits = relevant.intersection(pred[:k])
    return len(hits) / float(len(relevant))

def ndcg_at_k(pred, gt, k):
    """NDCG@k with binary relevance (1 if the predicted item is in ``gt``).

    DCG is taken over the first k predictions. The ideal DCG is that of a
    ranking that places ``min(len(set(gt)), k)`` relevant items at the top.
    Deriving the ideal from the predicted gains instead would score any list
    with a single hit at rank 1 as perfect, no matter how many relevant items
    were missed.

    Returns 0.0 when there are no predictions or no ground truth.
    """
    relevant = set(gt)
    pred_k = pred[:k]
    if len(pred_k) == 0 or not relevant:
        return 0.0

    dcg = 0.0
    credited = set()  # a relevant item repeated in pred earns its gain only once
    for rank, item in enumerate(pred_k):
        if item in relevant and item not in credited:
            credited.add(item)
            dcg += 1.0 / np.log2(rank + 2)

    n_ideal = min(len(relevant), k)
    idcg = sum(1.0 / np.log2(rank + 2) for rank in range(n_ideal))
    return float(dcg / idcg) if idcg > 0 else 0.0

# Ground truth per user from the TEST split (implicit: any test interaction is "relevant"),
# keyed by user index and holding item indices
user_test_gt = defaultdict(list)
for _uix, _iix in zip(test_imp["uix"], test_imp["iix"]):
    user_test_gt[int(_uix)].append(int(_iix))

def eval_recommender(fn_recommend, k=10, max_users=None):
    """Average Precision@k, Recall@k and NDCG@k of a recommender over the test users.

    Args:
        fn_recommend: callable taking a raw user id and returning a ranked
            list of ISBNs.
        k: cutoff applied to the returned list.
        max_users: if given, only the first ``max_users`` users of
            ``user_test_gt`` are evaluated.

    Returns:
        dict with the three mean metrics and ``Users_evaluated``.
    """
    candidate_uixs = list(user_test_gt.keys())
    if max_users is not None:
        candidate_uixs = candidate_uixs[:max_users]

    precisions, recalls, ndcgs = [], [], []
    for uix in candidate_uixs:
        raw_uid = ix2uid[uix]  # the recommenders expect the raw user id
        relevant_iix = user_test_gt[uix]
        if not relevant_iix:
            continue
        relevant_isbns = [ix2isbn[i] for i in relevant_iix]

        ranked = fn_recommend(raw_uid)[:k]

        precisions.append(precision_at_k(ranked, relevant_isbns, k))
        recalls.append(recall_at_k(ranked, relevant_isbns, k))
        ndcgs.append(ndcg_at_k(ranked, relevant_isbns, k))

    def _average(values):
        # mean of the per-user scores; 0.0 when nobody was evaluated
        return float(np.mean(values)) if values else 0.0

    return {
        f"Precision@{k}": _average(precisions),
        f"Recall@{k}": _average(recalls),
        f"NDCG@{k}": _average(ndcgs),
        "Users_evaluated": len(precisions)
    }

# Fix N and K once so both recommenders expose the one-argument interface
# that eval_recommender expects
def rec_item(uid):
    return recommend_items_for_user(uid, N=50, K=50)

def rec_user(uid):
    return recommend_by_similar_users(uid, N=50, K=50)

# Run evaluation (e.g., k=10)
metrics_item = eval_recommender(rec_item, k=10)
metrics_user = eval_recommender(rec_user, k=10)

print("Item-Item metrics:", metrics_item)
print("User-User metrics:", metrics_user)


In [None]:
# NOTE(review): this definition is a verbatim copy of eval_recommender from the
# previous cell and silently replaces it when run; consider deleting this cell.
def eval_recommender(fn_recommend, k=10, max_users=None):
    """Average Precision@k, Recall@k and NDCG@k of a recommender over the test users.

    fn_recommend should take a user_id and return a ranked list of ISBNs.
    Only the first ``max_users`` users of ``user_test_gt`` are evaluated when
    ``max_users`` is given. Returns a dict with the three mean metrics and
    ``Users_evaluated``.
    """
    users = list(user_test_gt.keys())
    if max_users is not None:
        users = users[:max_users]

    # per-user precision, recall and NDCG values
    p, r, n = [], [], []
    for uix in users:
        uid = ix2uid[uix]  # back to raw user id
        gt_iix = user_test_gt[uix]
        if not gt_iix:
            continue
        gt_isbn = [ix2isbn[i] for i in gt_iix]

        preds_isbn = fn_recommend(uid)  # just call the provided function
        preds_isbn = preds_isbn[:k]

        p.append(precision_at_k(preds_isbn, gt_isbn, k))
        r.append(recall_at_k(preds_isbn, gt_isbn, k))
        n.append(ndcg_at_k(preds_isbn, gt_isbn, k))

    # means over the evaluated users; 0.0 when nobody was evaluated
    return {
        f"Precision@{k}": float(np.mean(p)) if p else 0.0,
        f"Recall@{k}": float(np.mean(r)) if r else 0.0,
        f"NDCG@{k}": float(np.mean(n)) if n else 0.0,
        "Users_evaluated": len(p)
    }

# NOTE(review): this repeats the evaluation run of the previous cell line for
# line; consider deleting it together with the duplicated function above.
# Wrap recommenders with chosen N and K once
rec_item = lambda uid: recommend_items_for_user(uid, N=50, K=50)
rec_user = lambda uid: recommend_by_similar_users(uid, N=50, K=50)

# Run evaluation (e.g., k=10)
metrics_item = eval_recommender(rec_item, k=10)
metrics_user = eval_recommender(rec_user, k=10)

print("Item-Item metrics:", metrics_item)
print("User-User metrics:", metrics_user)
