In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix
from sklearn.preprocessing import LabelEncoder
from implicit.als import AlternatingLeastSquares
import random

In [2]:
train = pd.read_csv(
    "/Users/Zhuanz/fashion-recommendation-system/Fashion recommendation/data/cleaned/train.csv",
    dtype={"customer_id": "string", "article_id": "int32"}
)

valid = pd.read_csv(
    "/Users/Zhuanz/fashion-recommendation-system/Fashion recommendation/data/cleaned/valid.csv",
    dtype={"customer_id": "string", "article_id": "int32"}
)

print(train.shape, valid.shape)

(24627945, 7) (6131468, 7)


In [3]:
from sklearn.preprocessing import LabelEncoder

user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

train["user_idx"] = user_encoder.fit_transform(train["customer_id"])
train["item_idx"] = item_encoder.fit_transform(train["article_id"])


In [4]:
from scipy.sparse import coo_matrix
import numpy as np

num_users = train["user_idx"].nunique()
num_items = train["item_idx"].nunique()

print("Users:", num_users)
print("Items:", num_items)


Users: 880152
Items: 81656


In [5]:
from scipy.sparse import coo_matrix
import numpy as np

interaction_matrix = coo_matrix(
    (
        np.ones(len(train)),
        (train["item_idx"], train["user_idx"])
    ),
    shape=(num_items, num_users)
).tocsr()

print("Matrix shape:", interaction_matrix.shape)


Matrix shape: (81656, 880152)


In [6]:
ALPHA = 40
confidence_matrix = interaction_matrix * ALPHA

In [7]:
# sanity check
print("Original matrix shape:", confidence_matrix.shape)

matrix_for_training = confidence_matrix.T

print("Matrix used for training:", matrix_for_training.shape)

model = AlternatingLeastSquares(
    factors=64,
    regularization=0.01,
    iterations=10,
    random_state=42
)

model.fit(matrix_for_training)

print("User factors:", model.user_factors.shape)
print("Item factors:", model.item_factors.shape)


Original matrix shape: (81656, 880152)
Matrix used for training: (880152, 81656)




  0%|          | 0/10 [00:00<?, ?it/s]

User factors: (880152, 64)
Item factors: (81656, 64)


In [8]:
def recommend_als(user_id, top_k=12):

    if user_id not in user_encoder.classes_:
        return []

    user_idx = user_encoder.transform([user_id])[0]

    recommended, scores = model.recommend(
        user_idx,
        confidence_matrix.T,   # IMPORTANT
        N=top_k,
        filter_already_liked_items=True
    )

    return item_encoder.classes_[recommended].tolist()


In [9]:
def recall_at_k(recs, ground_truth, k=12):
    if len(ground_truth) == 0:
        return 0
    hits = len(set(recs[:k]) & set(ground_truth))
    return hits / len(ground_truth)


In [10]:
# create a dictionary: user_id -> set of items in validation
valid_user_items = (
    valid.groupby("customer_id")["article_id"]
    .apply(set)
    .to_dict()
)

# quick check
print("Number of users in validation:", len(valid_user_items))
sample_user = list(valid_user_items.keys())[0]
print("Items for first user:", valid_user_items[sample_user])


Number of users in validation: 561584
Items for first user: {568601043}


In [12]:
recalls = []

start = time.time()

for user in sample_users:
    user_idx = user_encoder.transform([user])[0]
    user_items_row = confidence_matrix.T[user_idx].tocsr()
    
    recommended, scores = model.recommend(
        user_idx,
        user_items_row,
        N=12,
        filter_already_liked_items=True
    )
    
    recs = item_encoder.classes_[recommended]
    recalls.append(
        recall_at_k(recs, valid_user_items[user])
    )

print("Recall@12:", np.mean(recalls))
print("Time taken:", round(time.time() - start, 2), "seconds")


Recall@12: 0.0071808784725193395
Time taken: 164.67 seconds


In [13]:
model.save("als_model.npz")


In [14]:
import pickle

with open("user_encoder.pkl", "wb") as f:
    pickle.dump(user_encoder, f)

with open("item_encoder.pkl", "wb") as f:
    pickle.dump(item_encoder, f)

In [15]:
import numpy as np

np.save("item_embeddings.npy", model.item_factors)
np.save("user_embeddings.npy", model.user_factors)


##### We implemented implicit-feedback Alternating Least Squares using an item-user interaction matrix with confidence weighting. The model learns 64-dimensional latent embeddings for both users and items and significantly improves Recall@12 over baseline recommenders.