In [1]:
import numpy as np
import pandas as pd
import scipy.sparse as sp
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import random

In [2]:
articles = pd.read_csv(
    "/kaggle/input/h-and-m-personalized-fashion-recommendations/articles.csv"
)

print("Articles:", articles.shape)
articles.head()

Articles: (105542, 25)


Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,Off White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
3,110065001,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,9,Black,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
4,110065002,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,10,White,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."


In [3]:
articles["text"] = (
    articles["product_type_name"].fillna("") + " " +
    articles["product_group_name"].fillna("") + " " +
    articles["department_name"].fillna("")
)

tfidf = TfidfVectorizer(
    max_features=800,
    stop_words="english"
)

item_tfidf = tfidf.fit_transform(articles["text"])

article_ids = articles["article_id"].values

article_id_to_index = {
    aid: idx for idx, aid in enumerate(article_ids)
}

index_to_article_id = {
    idx: aid for aid, idx in article_id_to_index.items()
}

item_tfidf.shape


(105542, 256)

In [6]:
def get_similar_items(article_id, top_k=10):
    if article_id not in article_id_to_index:
        return []

    idx = article_id_to_index[article_id]

    item_vec = item_tfidf[idx]
    sims = cosine_similarity(item_vec, item_tfidf).flatten()

    top_indices = np.argsort(sims)[::-1][1 : top_k + 1]

    return [index_to_article_id[i] for i in top_indices]

In [7]:

test_item = 706016001
similar_items = get_similar_items(test_item, 10)
similar_items

[np.int64(539723039),
 np.int64(909059002),
 np.int64(707082002),
 np.int64(542119001),
 np.int64(542143001),
 np.int64(543745001),
 np.int64(539723001),
 np.int64(539723002),
 np.int64(539723003),
 np.int64(539723004)]

In [8]:
articles[
    articles["article_id"].isin(similar_items)
][[
    "article_id",
    "product_type_name",
    "colour_group_name",
    "index_group_name"
]]

Unnamed: 0,article_id,product_type_name,colour_group_name,index_group_name
10285,539723001,Trousers,Black,Divided
10286,539723002,Trousers,Greenish Khaki,Divided
10287,539723003,Trousers,Dark Grey,Divided
10288,539723004,Trousers,Light Blue,Divided
10301,539723039,Trousers,Dark Blue,Divided
10613,542119001,Trousers,Blue,Divided
10620,542143001,Trousers,Black,Divided
10864,543745001,Trousers,Black,Divided
54258,707082002,Trousers,Greenish Khaki,Divided
103052,909059002,Trousers,Blue,Divided


In [9]:
import pandas as pd

train = pd.read_csv(
    "/kaggle/input/hm-cleaned/data/train.csv",
    dtype={
        "customer_id": "string",
        "article_id": "int32"
    }
)

valid = pd.read_csv(
    "/kaggle/input/hm-cleaned/data/valid.csv",
    dtype={
        "customer_id": "string",
        "article_id": "int32"
    }
)

In [10]:
user_history = (
    train
    .groupby("customer_id")["article_id"]
    .apply(list)
)

In [11]:
assert "train" in globals(), "Train data not loaded"

In [12]:
valid_user_items = (
    valid
    .groupby("customer_id")["article_id"]
    .apply(set)
    .to_dict()
)

In [13]:
import random

users = list(valid_user_items.keys())
random.shuffle(users)

EVAL_USERS = users[:5000]  # start small


In [14]:
def recall_at_k(recommended_items, ground_truth_items, k):
    if len(ground_truth_items) == 0:
        return 0.0
    hits = len(set(recommended_items[:k]) & ground_truth_items)
    return hits / len(ground_truth_items)


In [15]:
def build_user_vector_limited(article_ids, max_items=20):
    vectors = []

    for aid in article_ids[-max_items:]:
        if aid in article_id_to_index:
            vectors.append(item_tfidf[article_id_to_index[aid]])

    if not vectors:
        return None

    user_vec = sp.vstack(vectors).mean(axis=0)
    user_vec = np.asarray(user_vec)

    if user_vec.ndim == 1:
        user_vec = user_vec.reshape(1, -1)

    return user_vec

In [17]:
import scipy.sparse as sp
u = next(iter(user_history.keys()))
v = build_user_vector_limited(user_history[u])

type(v), v.shape

(numpy.ndarray, (1, 256))

In [18]:
def recommend_for_user(user_id, top_k=12):
    if user_id not in user_history:
        return []

    user_vec = build_user_vector_limited(user_history[user_id])
    if user_vec is None:
        return []

    sims = cosine_similarity(user_vec, item_tfidf).ravel()
    ranked_indices = np.argsort(sims)[::-1]

    seen = set(user_history[user_id])
    recs = []

    for idx in ranked_indices:
        aid = index_to_article_id[idx]
        if aid in seen:
            continue
        recs.append(aid)
        if len(recs) == top_k:
            break

    return recs

In [19]:
def recall_at_k(recommended, ground_truth, k):
    if not ground_truth:
        return 0.0
    return len(set(recommended[:k]) & ground_truth) / len(ground_truth)


In [20]:
EVAL_USERS = random.sample(list(valid_user_items.keys()), 1000)

TOP_K = 12
recalls = []

for i, user in enumerate(EVAL_USERS):
    if user not in user_history:
        continue

    recs = recommend_for_user(user, TOP_K)
    if not recs:
        continue

    gt = valid_user_items.get(user, set())
    if not gt:
        continue

    recalls.append(recall_at_k(recs, gt, TOP_K))

    if i % 100 == 0:
        print(f"Processed {i} users")

np.mean(recalls)


Processed 100 users
Processed 200 users
Processed 300 users
Processed 400 users
Processed 500 users
Processed 600 users
Processed 700 users
Processed 800 users
Processed 900 users


np.float64(0.0012675761852490722)

In [21]:
import pickle

with open("tfidf_vectorizer.pkl", "wb") as f:
    pickle.dump(tfidf, f)

In [22]:
sp.save_npz("item_tfidf.npz", item_tfidf)


In [24]:
article_id_to_index_json = {
    str(k): int(v)
    for k, v in article_id_to_index.items()
}

index_to_article_id_json = {
    str(k): int(v)
    for k, v in index_to_article_id.items()
}

import json

with open("article_id_to_index.json", "w") as f:
    json.dump(article_id_to_index_json, f)

with open("index_to_article_id.json", "w") as f:
    json.dump(index_to_article_id_json, f)
