In [1]:
! pip install cornac

Collecting cornac
  Downloading cornac-2.3.5-cp312-cp312-manylinux1_x86_64.whl.metadata (51 kB)
[?25l     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/51.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m51.4/51.4 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
Collecting powerlaw (from cornac)
  Downloading powerlaw-2.0.0-py3-none-any.whl.metadata (9.9 kB)
Downloading cornac-2.3.5-cp312-cp312-manylinux1_x86_64.whl (29.6 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m29.6/29.6 MB[0m [31m29.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading powerlaw-2.0.0-py3-none-any.whl (191 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚

In [2]:
# src/dataset_textual.py

import random
import re
from collections import defaultdict
from dataclasses import dataclass
from typing import Dict, List, Tuple

from cornac.datasets import amazon_clothing


UserId = str
ItemId = str
Rating = float

Triplet = Tuple[UserId, ItemId, Rating]


@dataclass
class TextualDataset:
    """Contient tout ce qu'il faut pour les phases suivantes."""
    train: List[Triplet]
    test: List[Triplet]
    item_texts: Dict[ItemId, str]
    users: List[UserId]
    items: List[ItemId]


# ---------- Nettoyage texte ----------

def clean_text(text: str) -> str:
    """Nettoyage tr√®s simple : lowercase + caract√®res alphanum√©riques uniquement."""
    if text is None:
        return ""
    text = text.lower()
    # garder lettres/chiffres, remplacer le reste par des espaces
    text = re.sub(r"[^a-z0-9]+", " ", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()


# ---------- Chargement brut depuis Cornac ----------

def load_raw_amazon_clothing() -> Tuple[List[Triplet], Dict[ItemId, str]]:
    """
    Charge :
      - feedback : (user, item, rating)
      - textes des items : description produit

    Retourne :
      - feedback filtr√© pour ne garder que les items avec texte
      - dict item_id -> texte nettoy√©
    """
    # (user, item, rating)
    feedback: List[Triplet] = amazon_clothing.load_feedback()

    # texts: List[str], ids: List[item_id]
    texts, item_ids = amazon_clothing.load_text()

    item_texts_raw = {iid: txt for txt, iid in zip(texts, item_ids)}
    item_texts_clean = {iid: clean_text(txt) for iid, txt in item_texts_raw.items()}

    # filtrer les feedbacks pour ne garder que les items ayant un texte
    filtered_feedback: List[Triplet] = [
        (u, i, float(r))
        for (u, i, r) in feedback
        if i in item_texts_clean
    ]

    return filtered_feedback, item_texts_clean


# ---------- Split leave-one-out par utilisateur ----------

def leave_one_out_split(
    feedback: List[Triplet],
    seed: int = 42,
) -> Tuple[List[Triplet], List[Triplet]]:
    """
    Split leave-one-out :
      - pour chaque utilisateur avec >=2 interactions,
        on met 1 interaction en test, le reste en train
      - pour les utilisateurs avec 1 seule interaction,
        on la laisse dans le train uniquement (sinon on ne peut pas apprendre)
    """
    by_user: Dict[UserId, List[Triplet]] = defaultdict(list)
    for u, i, r in feedback:
        by_user[u].append((u, i, r))

    random.seed(seed)

    train: List[Triplet] = []
    test: List[Triplet] = []

    for u, user_trips in by_user.items():
        if len(user_trips) == 1:
            # trop peu d'infos pour faire un test, tout va dans le train
            train.extend(user_trips)
            continue

        # on choisit 1 interaction au hasard comme test
        test_idx = random.randrange(len(user_trips))
        for idx, trip in enumerate(user_trips):
            if idx == test_idx:
                test.append(trip)
            else:
                train.append(trip)

    return train, test


# ---------- Pipeline complet Phase 1 ----------

def load_textual_dataset(seed: int = 42) -> TextualDataset:
    """
    Pipeline Phase 1 :
      1) charger feedback + textes items
      2) nettoyer textes (d√©j√† fait dans load_raw_amazon_clothing)
      3) split leave-one-out par utilisateur
      4) retourner un objet structur√© utilisable partout
    """
    feedback, item_texts = load_raw_amazon_clothing()
    train, test = leave_one_out_split(feedback, seed=seed)

    users = sorted({u for u, _, _ in feedback})
    items = sorted(item_texts.keys())

    return TextualDataset(
        train=train,
        test=test,
        item_texts=item_texts,
        users=users,
        items=items,
    )


if __name__ == "__main__":
    ds = load_textual_dataset()

    print(f"#users: {len(ds.users)}")
    print(f"#items: {len(ds.items)}")
    print(f"#train interactions: {len(ds.train)}")
    print(f"#test interactions: {len(ds.test)}")

    # aper√ßu d'un item texte
    some_item = next(iter(ds.item_texts.keys()))
    print(f"Exemple item_id: {some_item}")
    print(f"Texte associ√©: {ds.item_texts[some_item][:200]}...")


Data from https://static.preferred.ai/cornac/datasets/amazon_clothing/rating.zip
will be cached into /root/.cornac/amazon_clothing/rating.txt


0.00B [00:00, ?B/s]

Unzipping ...
File cached!
Data from https://static.preferred.ai/cornac/datasets/amazon_clothing/text.zip
will be cached into /root/.cornac/amazon_clothing/text.txt


0.00B [00:00, ?B/s]

Unzipping ...
File cached!
#users: 5377
#items: 3393
#train interactions: 9260
#test interactions: 4429
Exemple item_id: 0000031887
Texte associ√©: this adorable basic ballerina tutu is perfect for dance recitals fairy princes dress up costume play and much comes individually packaged use for a tinkerbell dress up accessory and watch her flutter ...


In [3]:
from dataset_textual import load_textual_dataset

ds = load_textual_dataset(seed=42)

print(len(ds.train), len(ds.test))
print(list(ds.train[:5]))
print(ds.item_texts[list(ds.item_texts.keys())[0]][:200])


ModuleNotFoundError: No module named 'dataset_textual'

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


----

In [5]:
!pip install cornac sentence-transformers numpy pandas scikit-learn torch

import os

# üìÅ Cr√©ation arborescence projet
REPO = "/content/reco-amazon-embeddings"

os.makedirs(f"{REPO}/src", exist_ok=True)
os.makedirs(f"{REPO}/data", exist_ok=True)
os.makedirs(f"{REPO}/results", exist_ok=True)
os.makedirs(f"{REPO}/notebooks", exist_ok=True)

print("‚úî Structure projet cr√©√©e ! ‚Üí", REPO)


‚úî Structure projet cr√©√©e ! ‚Üí /content/reco-amazon-embeddings


In [6]:
code = """
import random
import re
from collections import defaultdict
from dataclasses import dataclass
from typing import Dict, List, Tuple
from cornac.datasets import amazon_clothing

UserId = str
ItemId = str
Rating = float
Triplet = Tuple[UserId, ItemId, Rating]

@dataclass
class TextualDataset:
    train: List[Triplet]
    test: List[Triplet]
    item_texts: Dict[ItemId, str]
    users: List[UserId]
    items: List[ItemId]

def clean_text(t: str) -> str:
    if not t:
        return ""
    t = t.lower()
    t = re.sub(r"[^a-z0-9]+"," ",t)
    return re.sub(r"\\s+"," ",t).strip()

def load_raw():
    fb = amazon_clothing.load_feedback()
    texts, ids = amazon_clothing.load_text()
    item_texts = {iid: clean_text(txt) for txt,iid in zip(texts,ids)}
    fb = [(u,i,float(r)) for u,i,r in fb if i in item_texts]
    return fb, item_texts

def leave_one_out(fb,seed=42):
    random.seed(seed)
    by_user = defaultdict(list)
    for u,i,r in fb:
        by_user[u].append((u,i,r))

    train,test=[],[]
    for u,trips in by_user.items():
        if len(trips)==1:
            train.extend(trips)
            continue
        idx=random.randrange(len(trips))
        for j,t in enumerate(trips):
            (test if j==idx else train).append(t)
    return train,test

def load_textual_dataset(seed=42):
    fb,item_texts=load_raw()
    train,test=leave_one_out(fb,seed)
    users=sorted({u for u,_,_ in fb})
    items=sorted(item_texts.keys())
    return TextualDataset(train=train,test=test,item_texts=item_texts,users=users,items=items)
"""

with open("/content/reco-amazon-embeddings/src/dataset_textual.py","w") as f:
    f.write(code)

print("‚úî Phase 1 ‚ñ∂ dataset_textual.py cr√©√©")


‚úî Phase 1 ‚ñ∂ dataset_textual.py cr√©√©


In [7]:
import sys
sys.path.append("/content/reco-amazon-embeddings/src")

from dataset_textual import load_textual_dataset

ds = load_textual_dataset()
print("\nPHASE 1 VALID√âE ‚úî")
print("Train size:", len(ds.train))
print("Test size :", len(ds.test))
print("Items    :", len(ds.items))
print("Users    :", len(ds.users))



PHASE 1 VALID√âE ‚úî
Train size: 9260
Test size : 4429
Items    : 3393
Users    : 5377


In [8]:
code = """
from sentence_transformers import SentenceTransformer
import numpy as np
import os

from dataset_textual import load_textual_dataset


def generate_item_embeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    save_path="/content/reco-amazon-embeddings/data/item_embeddings.npy",
    batch_size=64,
):
    ds = load_textual_dataset()

    print(f"üß† Chargement du mod√®le BERT : {model_name}")
    model = SentenceTransformer(model_name)

    item_ids = list(ds.items)
    texts = [ds.item_texts[i] for i in item_ids]

    print("üìå Items √† encoder :", len(item_ids))

    embeddings = model.encode(
        texts,
        batch_size=batch_size,
        show_progress_bar=True,
        convert_to_numpy=True
    )

    np.save(save_path, {"item_ids": item_ids, "embeddings": embeddings})
    print(f"‚úî Embeddings sauvegard√©s ‚Üí {save_path}")
    print("üìê Vecteurs shape :", embeddings.shape)

    return embeddings, item_ids


if __name__ == "__main__":
    generate_item_embeddings()
"""

with open("/content/reco-amazon-embeddings/src/embed_items.py","w") as f:
    f.write(code)

print("‚úî Phase 2 ‚ñ∂ embed_items.py cr√©√©")


‚úî Phase 2 ‚ñ∂ embed_items.py cr√©√©


In [9]:
import sys
sys.path.append("/content/reco-amazon-embeddings/src")

from embed_items import generate_item_embeddings

embeddings, item_ids = generate_item_embeddings()

üß† Chargement du mod√®le BERT : sentence-transformers/all-MiniLM-L6-v2


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

üìå Items √† encoder : 3393


Batches:   0%|          | 0/54 [00:00<?, ?it/s]

‚úî Embeddings sauvegard√©s ‚Üí /content/reco-amazon-embeddings/data/item_embeddings.npy
üìê Vecteurs shape : (3393, 384)


---
cette partie est √† part dans la modelisation == petit test pour voir les statistques de la base


In [10]:
import sys
import numpy as np
from collections import defaultdict

# acc√®s aux modules
sys.path.append("/content/reco-amazon-embeddings/src")

from dataset_textual import load_textual_dataset

# Chargement du dataset phase 1
ds = load_textual_dataset()

# =============================
#  üìä Comptage items par user
# =============================
user_counts = defaultdict(int)

for (u,i,r) in ds.train:   # uniquement train -> normal pour profiling utilisateur
    user_counts[u] += 1

counts = np.array(list(user_counts.values()))

# =============================
#  üìå Statistiques descriptives
# =============================
print("üìä STATISTIQUES SUR LE NOMBRE D‚ÄôITEMS PAR UTILISATEUR")
print("-----------------------------------------------------")
print(f"Nombre total d‚Äôutilisateurs    : {len(counts)}")
print(f"Min items consomm√©s            : {counts.min()}")
print(f"Q1 (25%)                       : {np.percentile(counts,25)}")
print(f"M√©diane                        : {np.percentile(counts,50)}")
print(f"Q3 (75%)                       : {np.percentile(counts,75)}")
print(f"Max                            : {counts.max()}")
print(f"Moyenne                        : {counts.mean():.2f}")
print(f"√âcart-type                     : {counts.std():.2f}")


üìä STATISTIQUES SUR LE NOMBRE D‚ÄôITEMS PAR UTILISATEUR
-----------------------------------------------------
Nombre total d‚Äôutilisateurs    : 5377
Min items consomm√©s            : 1
Q1 (25%)                       : 1.0
M√©diane                        : 1.0
Q3 (75%)                       : 2.0
Max                            : 31
Moyenne                        : 1.72
√âcart-type                     : 1.11


cette partie est √† part dans la modelisation == petit test pour voir les statistques de la base

---

In [11]:
code = """
import numpy as np
import os
from dataset_textual import load_textual_dataset
from sentence_transformers import SentenceTransformer


def build_user_embeddings_concat(
    item_embedding_path="/content/reco-amazon-embeddings/data/item_embeddings.npy",
    save_path="/content/reco-amazon-embeddings/data/user_embeddings_concat_2.npy",
    X=2,
    model_name="sentence-transformers/all-MiniLM-L6-v2"
):
    ds = load_textual_dataset()

    print("üì• Chargement des embeddings items...")
    data = np.load(item_embedding_path, allow_pickle=True).item()
    item_ids = data["item_ids"]
    item_vecs = data["embeddings"]

    # cr√©ation dictionnaire item -> index
    index = { item_ids[i]: i for i in range(len(item_ids)) }

    print("üß† Chargement du mod√®le BERT pour encoding utilisateur...")
    model = SentenceTransformer(model_name)

    user_embeddings = {}

    print(f"üîß G√©n√©ration embeddings user avec strat√©gie X={X}")
    for user in ds.users:

        # items consomm√©s par user (train uniquement)
        consumed = [i for (u,i,r) in ds.train if u == user]

        if len(consumed) == 0:
            user_embeddings[user] = np.zeros(item_vecs.shape[1])
            continue

        # tri pour simuler dernier enregistrement (pas d'horodatage => ordre brut)
        consumed = consumed[-X:] if len(consumed) >= X else consumed

        # concat texte des items
        text_concat = " ".join([ ds.item_texts[i] for i in consumed ])

        # embedding unique
        embedding = model.encode(text_concat, convert_to_numpy=True)
        user_embeddings[user] = embedding

    os.makedirs("/content/reco-amazon-embeddings/data", exist_ok=True)
    np.save(save_path, user_embeddings)

    print("‚úî user_embeddings g√©n√©r√© et enregistr√© !")
    print(f"üìÅ fichier ‚Üí {save_path}")
    print(f"üìê Dimensions ‚Üí {len(user_embeddings)} utilisateurs √ó {len(next(iter(user_embeddings.values())))} features")


if __name__ == "__main__":
    build_user_embeddings_concat()
"""

with open("/content/reco-amazon-embeddings/src/embed_users_concat.py","w") as f:
    f.write(code)

print("üìÑ Phase 3 ‚ñ∂ embed_users_concat.py cr√©√© avec strat√©gie X=2 !")


üìÑ Phase 3 ‚ñ∂ embed_users_concat.py cr√©√© avec strat√©gie X=2 !


In [12]:
import sys
sys.path.append("/content/reco-amazon-embeddings/src")

from embed_users_concat import build_user_embeddings_concat
user_emb = build_user_embeddings_concat(X=2)


üì• Chargement des embeddings items...
üß† Chargement du mod√®le BERT pour encoding utilisateur...
üîß G√©n√©ration embeddings user avec strat√©gie X=2
‚úî user_embeddings g√©n√©r√© et enregistr√© !
üìÅ fichier ‚Üí /content/reco-amazon-embeddings/data/user_embeddings_concat_2.npy
üìê Dimensions ‚Üí 5377 utilisateurs √ó 384 features


In [13]:
code = """
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from dataset_textual import load_textual_dataset


def recommend_top_k(
    user_emb_path="/content/reco-amazon-embeddings/data/user_embeddings_concat_2.npy",
    item_emb_path="/content/reco-amazon-embeddings/data/item_embeddings.npy",
    K=10
):
    ds = load_textual_dataset()

    # Chargement embeddings
    user_emb = np.load(user_emb_path, allow_pickle=True).item()   # dict user -> vector
    item_data = np.load(item_emb_path, allow_pickle=True).item()
    item_ids  = item_data["item_ids"]
    item_vecs = item_data["embeddings"]

    item_matrix = np.vstack(item_vecs)            # (nb_items, 384)
    item_index  = { item_ids[i]: i for i in range(len(item_ids)) }

    recommendations = {}

    print(f"üîé G√©n√©ration des recommandations Top-{K} pour {len(ds.users)} utilisateurs...")

    for user in ds.users:

        # vecteur utilisateur
        u_vec = user_emb[user].reshape(1,-1)

        # similarit√© user ‚Üî tous les items
        scores = cosine_similarity(u_vec, item_matrix)[0]

        # items d√©j√† vus ‚Üí exclusion
        seen = { i for (u,i,r) in ds.train if u == user }
        candidate_scores = [(item_ids[i], scores[i]) for i in range(len(item_ids)) if item_ids[i] not in seen]

        # tri Top-K
        top_k = sorted(candidate_scores, key=lambda x: x[1], reverse=True)[:K]
        recommendations[user] = top_k

    print("‚úî Recommandations g√©n√©r√©es avec succ√®s !")
    return recommendations


if __name__ == "__main__":
    recs = recommend_top_k(K=10)
"""
with open("/content/reco-amazon-embeddings/src/recommender.py","w") as f:
    f.write(code)

print("üìÑ Phase 4 ‚ñ∂ recommender.py cr√©√©")


üìÑ Phase 4 ‚ñ∂ recommender.py cr√©√©


In [14]:
import sys
sys.path.append("/content/reco-amazon-embeddings/src")

from recommender import recommend_top_k

recs = recommend_top_k(K=10)

# afficher recommandations d‚Äôun utilisateur au hasard
some_user = list(recs.keys())[0]
print("\nüßç Utilisateur :", some_user)
print("Top-10 recommand√©s :")
for item, score in recs[some_user]:
    print("  ‚Ä¢", item, " | score:", round(score,4))


üîé G√©n√©ration des recommandations Top-10 pour 5377 utilisateurs...
‚úî Recommandations g√©n√©r√©es avec succ√®s !

üßç Utilisateur : A034597326Z83X79S50FI
Top-10 recommand√©s :
  ‚Ä¢ B00DFUUUAW  | score: 0.6138
  ‚Ä¢ B0064O9U0W  | score: 0.5433
  ‚Ä¢ B007QUKSN6  | score: 0.5367
  ‚Ä¢ B007QUITZU  | score: 0.5286
  ‚Ä¢ B00546MXYQ  | score: 0.5223
  ‚Ä¢ B001D3FSWS  | score: 0.5105
  ‚Ä¢ B007FI7IMS  | score: 0.4812
  ‚Ä¢ B005HSGB5S  | score: 0.4547
  ‚Ä¢ B00DI2LDIK  | score: 0.4484
  ‚Ä¢ B009R2G4RK  | score: 0.4353


In [15]:
code = """
import cornac
from cornac.datasets import amazon_clothing
from dataset_textual import load_textual_dataset
import json


def run_mf_baseline(k=50, output_path="/content/reco-amazon-embeddings/results/mf_results.json"):
    print("üì• Chargement dataset pour MF...")
    ds = load_textual_dataset()

    # Cornac requiert feedback sous forme (user,item,rating)
    train = ds.train
    test  = ds.test

    # Config MF baseline
    mf = cornac.models.MF(k=k, max_iter=50, learning_rate=0.005, lambda_reg=0.02, verbose=True)

    # M√©triques obligatoires
    metrics = [
        cornac.metrics.Recall(k=10),
        cornac.metrics.NDCG(k=10),
    ]

    # Exp√©rimentation Cornac
    print("üöÄ Entra√Ænement MF baseline...")
    exp = cornac.Experiment(
        models=[mf],
        eval_method=cornac.eval_methods.RatioSplit(
            data=train + test,
            test_size=0.2,  # on recalcule split propre MF
            rating_threshold=0.0,
            exclude_unknowns=True,
        ),
        metrics=metrics
    )

    exp.run()

    results = {
        "Recall@10": float(exp.result_dict["MF"]["Recall@10"]),
        "NDCG@10": float(exp.result_dict["MF"]["NDCG@10"]),
    }

    with open(output_path,"w") as f:
        json.dump(results,f,indent=2)

    print(f"‚úî R√©sultats MF sauvegard√©s ‚Üí {output_path}")
    print(results)

    return results


if __name__ == "__main__":
    run_mf_baseline()
"""
with open("/content/reco-amazon-embeddings/src/baseline_mf.py","w") as f:
    f.write(code)

print("üìÑ Phase 5 ‚ñ∂ baseline_mf.py cr√©√©")


üìÑ Phase 5 ‚ñ∂ baseline_mf.py cr√©√©


In [16]:
import sys
sys.path.append("/content/reco-amazon-embeddings/src")

from baseline_mf import run_mf_baseline

baseline_results = run_mf_baseline()


üì• Chargement dataset pour MF...
üöÄ Entra√Ænement MF baseline...


  0%|          | 0/50 [00:00<?, ?it/s]

Optimization finished!

TEST:
...
   | NDCG@10 | Recall@10 | Train (s) | Test (s)
-- + ------- + --------- + --------- + --------
MF |  0.0034 |    0.0066 |    0.1786 |   1.3691



AttributeError: 'Experiment' object has no attribute 'result_dict'

In [17]:
import cornac
from dataset_textual import load_textual_dataset
import json


def run_mf_baseline(k=50, output_path="/content/reco-amazon-embeddings/results/mf_results.json"):

    print("üì• Chargement dataset pour MF baseline...")
    ds = load_textual_dataset()
    train = ds.train
    test = ds.test

    # MF classique
    mf = cornac.models.MF(k=k, max_iter=50, learning_rate=0.005, lambda_reg=0.02, verbose=True)

    # Evaluation Leave-One-Out correcte
    eval_method = cornac.eval_methods.BaseMethod.from_splits(
        train_data=train,
        test_data=test,
        rating_threshold=0.0,
        exclude_unknowns=True,
        verbose=True
    )

    print("üöÄ Training MF ...")
    exp = cornac.Experiment(
        models=[mf],
        eval_method=eval_method,
        metrics=[cornac.metrics.Recall(10), cornac.metrics.NDCG(10)]
    )

    exp.run()

    # ‚¨á On r√©cup√®re r√©sultats correctement
    results = {
        "Recall@10": float(exp.results[0].metric_avg_results["Recall@10"]),
        "NDCG@10": float(exp.results[0].metric_avg_results["NDCG@10"])
    }

    with open(output_path,"w") as f:
        json.dump(results,f,indent=2)

    print(f"‚úî MF baseline sauvegard√©e dans {output_path}")
    print(results)
    return results


if __name__ == "__main__":
    run_mf_baseline()


üì• Chargement dataset pour MF baseline...
rating_threshold = 0.0
exclude_unknowns = True
---
Training data:
Number of users = 5377
Number of items = 3212
Number of ratings = 9260
Max rating = 5.0
Min rating = 1.0
Global mean = 4.4
---
Test data:
Number of users = 5377
Number of items = 3212
Number of ratings = 4037
Number of unknown users = 0
Number of unknown items = 0
---
Total users = 5377
Total items = 3212
üöÄ Training MF ...

[MF] Training started!


  0%|          | 0/50 [00:00<?, ?it/s]

Optimization finished!

[MF] Evaluation started!


Ranking:   0%|          | 0/4037 [00:00<?, ?it/s]


TEST:
...
   | NDCG@10 | Recall@10 | Train (s) | Test (s)
-- + ------- + --------- + --------- + --------
MF |  0.0046 |    0.0092 |    0.1833 |   2.2558



AttributeError: 'Experiment' object has no attribute 'results'

In [18]:
import cornac
from dataset_textual import load_textual_dataset
import json


def run_mf_baseline(k=50, output_path="/content/reco-amazon-embeddings/results/mf_results.json"):

    print("üì• Chargement dataset pour MF baseline...")
    ds = load_textual_dataset()
    train = ds.train
    test = ds.test

    # Model MF
    mf = cornac.models.MF(k=k, max_iter=50, learning_rate=0.005, lambda_reg=0.02, verbose=True)

    # M√©thode d'√©valuation identique √† ton mod√®le
    eval_method = cornac.eval_methods.BaseMethod.from_splits(
        train_data=train,
        test_data=test,
        rating_threshold=0.0,
        exclude_unknowns=True,
        verbose=True
    )

    print("üöÄ Training MF ...")
    exp = cornac.Experiment(
        models=[mf],
        eval_method=eval_method,
        metrics=[cornac.metrics.Recall(10), cornac.metrics.NDCG(10)]
    )

    exp.run()

    # üìå Extraction correcte des m√©triques (solution d√©finitive)
    recall10 = exp.models[0].metric_results["Recall@10"].mean()
    ndcg10   = exp.models[0].metric_results["NDCG@10"].mean()

    results = {"Recall@10": float(recall10), "NDCG@10":float(ndcg10)}

    # Sauvegarde JSON
    with open(output_path,"w") as f:
        json.dump(results,f,indent=2)

    print(f"\n‚úî R√©sultats MF baseline enregistr√©s dans ‚Üí {output_path}")
    print(results)

    return results


if __name__ == "__main__":
    run_mf_baseline()


üì• Chargement dataset pour MF baseline...
rating_threshold = 0.0
exclude_unknowns = True
---
Training data:
Number of users = 5377
Number of items = 3212
Number of ratings = 9260
Max rating = 5.0
Min rating = 1.0
Global mean = 4.4
---
Test data:
Number of users = 5377
Number of items = 3212
Number of ratings = 4037
Number of unknown users = 0
Number of unknown items = 0
---
Total users = 5377
Total items = 3212
üöÄ Training MF ...

[MF] Training started!


  0%|          | 0/50 [00:00<?, ?it/s]

Optimization finished!

[MF] Evaluation started!


Ranking:   0%|          | 0/4037 [00:00<?, ?it/s]


TEST:
...
   | NDCG@10 | Recall@10 | Train (s) | Test (s)
-- + ------- + --------- + --------- + --------
MF |  0.0046 |    0.0092 |    0.1620 |   2.2315



AttributeError: 'MF' object has no attribute 'metric_results'

In [19]:
# ================================
# FINAL baseline_mf.py ‚Äî Stable
# ================================

import cornac
from dataset_textual import load_textual_dataset
import json


def run_mf_baseline(output_path="/content/reco-amazon-embeddings/results/mf_results.json"):

    print("\nüì• Chargement dataset...")
    ds = load_textual_dataset()

    train = ds.train
    test  = ds.test

    # ======================
    # MODEL MF BASELINE
    # ======================
    mf = cornac.models.MF(
        k=50, learning_rate=0.005,
        lambda_reg=0.02, max_iter=50, verbose=True
    )

    eval_method = cornac.eval_methods.BaseMethod.from_splits(
        train_data=train,
        test_data=test,
        rating_threshold=0.0,
        exclude_unknowns=True,
        verbose=True
    )

    print("\nüöÄ Training + Evaluation MF...\n")
    exp = cornac.Experiment(
        models=[mf],
        eval_method=eval_method,
        metrics=[cornac.metrics.Recall(10), cornac.metrics.NDCG(10)]
    )

    exp.run()

    # =====================================================
    # üî• EXTRACTION FIABLE Des Scores (via metrics globales)
    # =====================================================

    results = {
        "Recall@10": float(exp._runner.all_results[0]["Recall@10"]),
        "NDCG@10":  float(exp._runner.all_results[0]["NDCG@10"])
    }

    with open(output_path, "w") as f:
        json.dump(results, f, indent=2)

    print("\nüìÅ Sauvegard√© dans :", output_path)
    print("\nüìä R√©sultats MF =", results)

    return results


if __name__ == "__main__":
    run_mf_baseline()



üì• Chargement dataset...
rating_threshold = 0.0
exclude_unknowns = True
---
Training data:
Number of users = 5377
Number of items = 3212
Number of ratings = 9260
Max rating = 5.0
Min rating = 1.0
Global mean = 4.4
---
Test data:
Number of users = 5377
Number of items = 3212
Number of ratings = 4037
Number of unknown users = 0
Number of unknown items = 0
---
Total users = 5377
Total items = 3212

üöÄ Training + Evaluation MF...


[MF] Training started!


  0%|          | 0/50 [00:00<?, ?it/s]

Optimization finished!

[MF] Evaluation started!


Ranking:   0%|          | 0/4037 [00:00<?, ?it/s]


TEST:
...
   | NDCG@10 | Recall@10 | Train (s) | Test (s)
-- + ------- + --------- + --------- + --------
MF |  0.0046 |    0.0092 |    0.1547 |   2.2405



AttributeError: 'Experiment' object has no attribute '_runner'

In [20]:
# src/baseline_mf.py

import cornac
from dataset_textual import load_textual_dataset


def run_mf_baseline():
    print("\nüì• Chargement dataset pour MF baseline...")
    ds = load_textual_dataset()
    train = ds.train
    test = ds.test

    mf = cornac.models.MF(
        k=50,
        learning_rate=0.005,
        lambda_reg=0.02,
        max_iter=50,
        verbose=True,
    )

    eval_method = cornac.eval_methods.BaseMethod.from_splits(
        train_data=train,
        test_data=test,
        rating_threshold=0.0,
        exclude_unknowns=True,
        verbose=True,
    )

    print("\nüöÄ Entra√Ænement + √©valuation MF...\n")
    exp = cornac.Experiment(
        models=[mf],
        eval_method=eval_method,
        metrics=[cornac.metrics.Recall(10), cornac.metrics.NDCG(10)],
    )

    exp.run()
    print("\n‚úÖ Baseline MF entra√Æn√©e et √©valu√©e (voir tableau au-dessus).")


if __name__ == "__main__":
    run_mf_baseline()



üì• Chargement dataset pour MF baseline...
rating_threshold = 0.0
exclude_unknowns = True
---
Training data:
Number of users = 5377
Number of items = 3212
Number of ratings = 9260
Max rating = 5.0
Min rating = 1.0
Global mean = 4.4
---
Test data:
Number of users = 5377
Number of items = 3212
Number of ratings = 4037
Number of unknown users = 0
Number of unknown items = 0
---
Total users = 5377
Total items = 3212

üöÄ Entra√Ænement + √©valuation MF...


[MF] Training started!


  0%|          | 0/50 [00:00<?, ?it/s]

Optimization finished!

[MF] Evaluation started!


Ranking:   0%|          | 0/4037 [00:00<?, ?it/s]


TEST:
...
   | NDCG@10 | Recall@10 | Train (s) | Test (s)
-- + ------- + --------- + --------- + --------
MF |  0.0046 |    0.0092 |    0.1516 |   2.2451


‚úÖ Baseline MF entra√Æn√©e et √©valu√©e (voir tableau au-dessus).


In [21]:
code = """
import json
import os

def save_mf_results(
    recall_10=0.0092,
    ndcg_10=0.0046,
    path="/content/reco-amazon-embeddings/results/mf_results.json",
):
    os.makedirs(os.path.dirname(path), exist_ok=True)
    results = {
        "Recall@10": float(recall_10),
        "NDCG@10": float(ndcg_10),
    }
    with open(path, "w") as f:
        json.dump(results, f, indent=2)
    print("‚úî R√©sultats MF sauvegard√©s ‚Üí", path)
    print(results)
    return results


if __name__ == "__main__":
    save_mf_results()
"""
with open("/content/reco-amazon-embeddings/src/save_mf_results.py","w") as f:
    f.write(code)

print("‚úÖ Fichier save_mf_results.py cr√©√©.")


‚úÖ Fichier save_mf_results.py cr√©√©.


In [22]:
import sys
sys.path.append("/content/reco-amazon-embeddings/src")

from save_mf_results import save_mf_results
mf_results = save_mf_results()
mf_results


‚úî R√©sultats MF sauvegard√©s ‚Üí /content/reco-amazon-embeddings/results/mf_results.json
{'Recall@10': 0.0092, 'NDCG@10': 0.0046}


{'Recall@10': 0.0092, 'NDCG@10': 0.0046}

In [23]:
code = """
import numpy as np
from sklearn.metrics import ndcg_score
from dataset_textual import load_textual_dataset
from sklearn.metrics.pairwise import cosine_similarity
import json


def evaluate_embeddings(
    user_emb_path="/content/reco-amazon-embeddings/data/user_embeddings_concat_2.npy",
    item_emb_path="/content/reco-amazon-embeddings/data/item_embeddings.npy",
    save_path="/content/reco-amazon-embeddings/results/embeddings_results.json",
    K=10
):
    ds = load_textual_dataset()

    # Chargement embeddings
    user_embeddings = np.load(user_emb_path, allow_pickle=True).item()
    item_data = np.load(item_emb_path, allow_pickle=True).item()

    item_ids  = item_data["item_ids"]
    item_vecs = item_data["embeddings"]

    item_index = {item_ids[i]: i for i in range(len(item_ids))}
    item_matrix = np.vstack(item_vecs)

    Recall_total = 0
    NDCG_total   = 0
    user_count   = 0

    print("üìä √âvaluation mod√®le embeddings...")

    for user in ds.users:
        if user not in user_embeddings:
            continue

        u_vec = user_embeddings[user].reshape(1,-1)
        scores = cosine_similarity(u_vec, item_matrix)[0]

        # items consomm√©s = ground truth
        gt_items = {i for (u,i,r) in ds.test if u == user}
        if len(gt_items)==0:
            continue

        # exclude train items
        train_items = {i for (u,i,r) in ds.train if u == user}
        candidate_scores = [(item_ids[i], scores[i]) for i in range(len(item_ids)) if item_ids[i] not in train_items]

        # ranking
        ranked = sorted(candidate_scores, key=lambda x: x[1], reverse=True)[:K]
        recommended = [item for item,score in ranked]

        # recall@10
        hits = len([i for i in recommended if i in gt_items])
        recall = hits / len(gt_items)

        # NDCG@10
        y_true = [[1 if item in gt_items else 0 for item,score in ranked]]
        y_score= [[score for item,score in ranked]]
        ndcg   = ndcg_score(y_true,y_score)

        Recall_total += recall
        NDCG_total   += ndcg
        user_count   += 1

    results = {
        "Recall@10": float(Recall_total/user_count),
        "NDCG@10":   float(NDCG_total/user_count)
    }

    with open(save_path,"w") as f:
        json.dump(results,f,indent=2)

    print("\nüìÅ R√©sultats embeddings sauvegard√©s ‚Üí", save_path)
    print(results)
    return results


if __name__ == "__main__":
    evaluate_embeddings()
"""
with open("/content/reco-amazon-embeddings/src/evaluate_embeddings.py","w") as f:
    f.write(code)

print("üìÑ Phase 6 ‚ñ∂ evaluate_embeddings.py cr√©√©")


üìÑ Phase 6 ‚ñ∂ evaluate_embeddings.py cr√©√©


In [24]:
from evaluate_embeddings import evaluate_embeddings

emb_results = evaluate_embeddings(K=10)
emb_results

SyntaxError: unterminated string literal (detected at line 74) (evaluate_embeddings.py, line 74)

In [25]:
# =========================================================
# Phase 6 - Evaluation du mod√®le embeddings
# =========================================================

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import ndcg_score
import json
from dataset_textual import load_textual_dataset


def evaluate_embeddings(
    user_emb_path="/content/reco-amazon-embeddings/data/user_embeddings_concat_2.npy",
    item_emb_path="/content/reco-amazon-embeddings/data/item_embeddings.npy",
    save_path="/content/reco-amazon-embeddings/results/embeddings_results.json",
    K=10
):
    ds = load_textual_dataset()

    # Chargement embeddings user + item
    user_embeddings = np.load(user_emb_path, allow_pickle=True).item()
    item_data = np.load(item_emb_path, allow_pickle=True).item()

    item_ids  = item_data["item_ids"]
    item_vecs = item_data["embeddings"]
    item_index = {item_ids[i]:i for i in range(len(item_ids))}

    item_matrix = np.vstack(item_vecs)

    Recall_total = 0
    NDCG_total   = 0
    user_count   = 0

    print("üìä Evaluating embeddings-based recommendation model...")

    for user in ds.users:

        if user not in user_embeddings:
            continue

        # vecteur utilisateur
        u_vec = user_embeddings[user].reshape(1,-1)

        # scores item
        scores = cosine_similarity(u_vec, item_matrix)[0]

        # ground truth = items du TEST
        gt = {i for (u,i,r) in ds.test if u == user}
        if len(gt) == 0:
            continue

        # filtrer items d√©j√† vus en train
        train_items = {i for (u,i,r) in ds.train if u == user}
        candidates = [
            (item_ids[i], scores[i])
            for i in range(len(item_ids))
            if item_ids[i] not in train_items
        ]

        ranked = sorted(candidates, key=lambda x: x[1], reverse=True)[:K]
        recommended_items = [i for i,s in ranked]

        # recall@10
        hits = len([i for i in recommended_items if i in gt])
        recall = hits / len(gt)

        # ndcg@10
        relevance = [[1 if item in gt else 0 for item,s in ranked]]
        scores_ndcg= [[s for item,s in ranked]]
        ndcg = ndcg_score(relevance, scores_ndcg)

        Recall_total += recall
        NDCG_total   += ndcg
        user_count   += 1

    results = {
        "Recall@10": float(Recall_total/user_count),
        "NDCG@10":   float(NDCG_total/user_count)
    }

    # sauvegarde JSON
    with open(save_path,"w") as f:
        json.dump(results,f,indent=2)

    print("\nüìÅ R√©sultats sauvegard√©s :", save_path)
    print("üìà Performance embeddings =",results)

    return results



if __name__ == "__main__":
    evaluate_embeddings()


üìä Evaluating embeddings-based recommendation model...

üìÅ R√©sultats sauvegard√©s : /content/reco-amazon-embeddings/results/embeddings_results.json
üìà Performance embeddings = {'Recall@10': 0.13772860690900882, 'NDCG@10': 0.08320426522753581}


In [26]:
from evaluate_embeddings import evaluate_embeddings
emb_results = evaluate_embeddings()
emb_results


SyntaxError: unterminated string literal (detected at line 74) (evaluate_embeddings.py, line 74)

In [29]:
import os
from google.colab import files

# Define the path to the project directory
REPO = "/content/reco-amazon-embeddings"
ARCHIVE_NAME = f"{REPO}.zip"

# Create a zip archive of the project directory
print(f"Compressing directory '{REPO}' into '{ARCHIVE_NAME}'...")
!zip -r {ARCHIVE_NAME} {REPO}

# Download the created zip file
print(f"Downloading '{ARCHIVE_NAME}'...")
files.download(ARCHIVE_NAME)


Compressing directory '/content/reco-amazon-embeddings' into '/content/reco-amazon-embeddings.zip'...
updating: content/reco-amazon-embeddings/ (stored 0%)
updating: content/reco-amazon-embeddings/data/ (stored 0%)
updating: content/reco-amazon-embeddings/data/user_embeddings_concat_2.npy (deflated 10%)
updating: content/reco-amazon-embeddings/data/item_embeddings.npy (deflated 12%)
updating: content/reco-amazon-embeddings/results/ (stored 0%)
updating: content/reco-amazon-embeddings/results/embeddings_results.json (deflated 8%)
updating: content/reco-amazon-embeddings/results/mf_results.json (deflated 20%)
updating: content/reco-amazon-embeddings/src/ (stored 0%)
updating: content/reco-amazon-embeddings/src/evaluate_embeddings.py (deflated 61%)
updating: content/reco-amazon-embeddings/src/recommender.py (deflated 54%)
updating: content/reco-amazon-embeddings/src/__pycache__/ (stored 0%)
updating: content/reco-amazon-embeddings/src/__pycache__/baseline_mf.cpython-312.pyc (deflated 38%)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [30]:
import os
from google.colab import files

# Define the path to the project directory
REPO = "/content/sample_data"
ARCHIVE_NAME = f"{REPO}.zip"

# Create a zip archive of the project directory
print(f"Compressing directory '{REPO}' into '{ARCHIVE_NAME}'...")
!zip -r {ARCHIVE_NAME} {REPO}

# Download the created zip file
print(f"Downloading '{ARCHIVE_NAME}'...")
files.download(ARCHIVE_NAME)


Compressing directory '/content/sample_data' into '/content/sample_data.zip'...
  adding: content/sample_data/ (stored 0%)
  adding: content/sample_data/README.md (deflated 39%)
  adding: content/sample_data/anscombe.json (deflated 83%)
  adding: content/sample_data/california_housing_train.csv (deflated 79%)
  adding: content/sample_data/california_housing_test.csv (deflated 76%)
  adding: content/sample_data/mnist_train_small.csv (deflated 88%)
  adding: content/sample_data/mnist_test.csv (deflated 88%)
Downloading '/content/sample_data.zip'...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>