**Chargement et exploration de dataset**

In [1]:
import pandas as pd
import os

# chemin vers le dossier MovieLens 100k
path = "/kaggle/input/movielens-100k-dataset/ml-100k"

# Ratings : liens utilisateur ↔ film
ratings = pd.read_csv(os.path.join(path, "u.data"),
                      sep="\t",
                      names=["user_id", "item_id", "rating", "timestamp"])

# Films : informations sur les films
movies = pd.read_csv(os.path.join(path, "u.item"),
                     sep="|",
                     encoding="latin-1",
                     names=["movie_id", "title", "release_date", "video_release_date", "IMDb_URL"] +
                           [f"genre_{i}" for i in range(19)])

# Utilisateurs : informations sur les utilisateurs
users = pd.read_csv(os.path.join(path, "u.user"),
                    sep="|",
                    names=["user_id", "age", "gender", "occupation", "zip_code"])

# Genres des films
genres = pd.read_csv(os.path.join(path, "u.genre"),
                     sep="|",
                     names=["genre", "genre_id"],
                     encoding="latin-1")

# Supprimer les lignes vides éventuelles dans genres
genres = genres.dropna()

# Vérification 
print("Ratings :", ratings.shape)
print("Movies :", movies.shape)
print("Users :", users.shape)
print("Genres :", genres.shape)

# Aperçu 
print("\nExemple ratings :")
display(ratings.head())

print("\nExemple movies :")
display(movies.head())

print("\nExemple users :")
display(users.head())

print("\nExemple genres :")
display(genres)

Ratings : (100000, 4)
Movies : (1682, 24)
Users : (943, 5)
Genres : (19, 2)

Exemple ratings :


Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596



Exemple movies :


  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,movie_id,title,release_date,video_release_date,IMDb_URL,genre_0,genre_1,genre_2,genre_3,genre_4,...,genre_9,genre_10,genre_11,genre_12,genre_13,genre_14,genre_15,genre_16,genre_17,genre_18
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0



Exemple users :


Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213



Exemple genres :


Unnamed: 0,genre,genre_id
0,unknown,0
1,Action,1
2,Adventure,2
3,Animation,3
4,Children's,4
5,Comedy,5
6,Crime,6
7,Documentary,7
8,Drama,8
9,Fantasy,9


In [2]:
# Distribution des notes
rating_counts = ratings['rating'].value_counts().sort_index()
print("Distribution des notes :")
print(rating_counts)

# Nombre de films notés par utilisateur
user_ratings_count = ratings.groupby('user_id')['item_id'].count()
print("\n Nombre de films notés par utilisateur :")
print(user_ratings_count.describe())

# Nombre d'utilisateurs par film
movie_ratings_count = ratings.groupby('item_id')['user_id'].count()
print("\n Nombre d'utilisateurs par film :")
print(movie_ratings_count.describe())

Distribution des notes :
rating
1     6110
2    11370
3    27145
4    34174
5    21201
Name: count, dtype: int64

 Nombre de films notés par utilisateur :
count    943.000000
mean     106.044539
std      100.931743
min       20.000000
25%       33.000000
50%       65.000000
75%      148.000000
max      737.000000
Name: item_id, dtype: float64

 Nombre d'utilisateurs par film :
count    1682.000000
mean       59.453032
std        80.383846
min         1.000000
25%         6.000000
50%        27.000000
75%        80.000000
max       583.000000
Name: user_id, dtype: float64


**Prétraitement**

In [8]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

## Nettoyage et normalisation
ratings.drop('timestamp', axis=1, inplace=True, errors='ignore')  # supprimer timestamp
ratings['rating'] = (ratings['rating'] - 1.0) / 4.0  # normaliser [0-1]

# Renommer item_id → movie_id pour la fusion
ratings.rename(columns={'item_id': 'movie_id'}, inplace=True)

# Fusionner avec les titres des films
data = pd.merge(ratings, movies[['movie_id', 'title']], on='movie_id')

# Supprimer doublons (user_id, movie)
data = data.drop_duplicates(subset=['user_id', 'title'])

# Vérification rapide
print("Valeurs nulles :\n", data.isnull().sum())
print("Nombre de films uniques :", data['title'].nunique())
print("Distribution des notes normalisées :")
print(data['rating'].value_counts())

## Mapping des IDs utilisateurs / films
user_ids_unique = data['user_id'].unique()
user2idx = {uid: idx for idx, uid in enumerate(user_ids_unique)}
data['user_idx'] = data['user_id'].map(user2idx)

movie_ids_unique = data['movie_id'].unique()
movie2idx = {mid: idx for idx, mid in enumerate(movie_ids_unique)}
data['movie_idx'] = data['movie_id'].map(movie2idx)

## Création de la matrice utilisateur-film
num_users = len(user_ids_unique)
num_movies = len(movie_ids_unique)
R = np.zeros((num_users, num_movies))

for row in data.itertuples():
    R[row.user_idx, row.movie_idx] = row.rating

print(f"Matrice utilisateur-film créée : {R.shape}")

## Split train/test
train_val_data, test_data = train_test_split(data, test_size=0.15, random_state=42)
train_data, val_data = train_test_split(train_val_data, test_size=0.1765, random_state=42)  

print(f"Nombre d'interactions - Train : {len(train_data)}, Val : {len(val_data)}, Test : {len(test_data)}")

Valeurs nulles :
 user_id     0
movie_id    0
rating      0
title       0
dtype: int64
Nombre de films uniques : 1664
Distribution des notes normalisées :
rating
-0.265625    34057
-0.281250    27075
-0.250000    21138
-0.296875    11337
-0.312500     6086
Name: count, dtype: int64
Matrice utilisateur-film créée : (943, 1681)
Nombre d'interactions - Train : 69782, Val : 14957, Test : 14954


**Construction du graphe**
**Entrainement et évaluation**

In [11]:
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split

# ----- Split train/val/test 70/15/15 -----
train_val_data, test_data = train_test_split(data, test_size=0.15, random_state=42)
train_data, val_data = train_test_split(train_val_data, test_size=0.1765, random_state=42)  # 0.1765*0.85≈0.15

print(f"Nombre d'interactions - Train : {len(train_data)}, Val : {len(val_data)}, Test : {len(test_data)}")

# ----- Tensors pour PyTorch -----
train_user = torch.tensor(train_data['user_idx'].values, dtype=torch.long)
train_movie = torch.tensor(train_data['movie_idx'].values, dtype=torch.long)
train_rating = torch.tensor(train_data['rating'].values, dtype=torch.float)

val_user = torch.tensor(val_data['user_idx'].values, dtype=torch.long)
val_movie = torch.tensor(val_data['movie_idx'].values, dtype=torch.long)
val_rating = torch.tensor(val_data['rating'].values, dtype=torch.float)

test_user = torch.tensor(test_data['user_idx'].values, dtype=torch.long)
test_movie = torch.tensor(test_data['movie_idx'].values, dtype=torch.long)
test_rating = torch.tensor(test_data['rating'].values, dtype=torch.float)

# ----- Définir le modèle MF -----
class MF(nn.Module):
    def __init__(self, num_users, num_items, embedding_dim):
        super(MF, self).__init__()
        self.user_embeddings = nn.Embedding(num_users, embedding_dim)
        self.item_embeddings = nn.Embedding(num_items, embedding_dim)
        self.user_bias = nn.Embedding(num_users, 1)
        self.item_bias = nn.Embedding(num_items, 1)
        self.reset_parameters()
    
    def reset_parameters(self):
        nn.init.normal_(self.user_embeddings.weight, std=0.01)
        nn.init.normal_(self.item_embeddings.weight, std=0.01)
        nn.init.normal_(self.user_bias.weight, std=0.01)
        nn.init.normal_(self.item_bias.weight, std=0.01)
    
    def forward(self, user_idx, item_idx):
        user_embed = self.user_embeddings(user_idx)
        item_embed = self.item_embeddings(item_idx)
        user_bias = self.user_bias(user_idx).squeeze()
        item_bias = self.item_bias(item_idx).squeeze()
        return (user_embed * item_embed).sum(dim=1) + user_bias + item_bias

# ----- Initialiser le modèle, optimizer et loss -----
num_users = len(user_ids_unique)
num_movies = len(movie_ids_unique)
embedding_dim = 20

model = MF(num_users, num_movies, embedding_dim)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
loss_fn = nn.MSELoss()

# ----- Entraînement avec early stopping -----
epochs = 30
patience = 3
best_val_rmse = float('inf')
counter = 0

for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    pred = model(train_user, train_movie)
    loss = loss_fn(pred, train_rating)
    loss.backward()
    optimizer.step()
    
    # Validation
    model.eval()
    with torch.no_grad():
        val_pred = model(val_user, val_movie)
        val_rmse = torch.sqrt(loss_fn(val_pred, val_rating)).item()
    
    print(f"Epoch {epoch+1}/{epochs} - Train Loss: {loss.item():.4f} - Val RMSE: {val_rmse:.4f}")
    
    # Early stopping
    if val_rmse < best_val_rmse:
        best_val_rmse = val_rmse
        counter = 0
        best_model_state = model.state_dict()
    else:
        counter += 1
        if counter >= patience:
            print(f"Early stopping activé à l'epoch {epoch+1}")
            break

# Charger le meilleur modèle
model.load_state_dict(best_model_state)

# ----- Évaluation finale sur test set -----
model.eval()
with torch.no_grad():
    test_pred = model(test_user, test_movie)
    test_rmse = torch.sqrt(loss_fn(test_pred, test_rating)).item()

print(f"\nTest RMSE final: {test_rmse:.4f}")

Nombre d'interactions - Train : 69782, Val : 14957, Test : 14954
Epoch 1/30 - Train Loss: 0.0747 - Val RMSE: 0.2534
Epoch 2/30 - Train Loss: 0.0641 - Val RMSE: 0.2328
Epoch 3/30 - Train Loss: 0.0539 - Val RMSE: 0.2110
Epoch 4/30 - Train Loss: 0.0441 - Val RMSE: 0.1874
Epoch 5/30 - Train Loss: 0.0346 - Val RMSE: 0.1618
Epoch 6/30 - Train Loss: 0.0256 - Val RMSE: 0.1340
Epoch 7/30 - Train Loss: 0.0173 - Val RMSE: 0.1040
Epoch 8/30 - Train Loss: 0.0102 - Val RMSE: 0.0724
Epoch 9/30 - Train Loss: 0.0047 - Val RMSE: 0.0426
Epoch 10/30 - Train Loss: 0.0015 - Val RMSE: 0.0307
Epoch 11/30 - Train Loss: 0.0008 - Val RMSE: 0.0493
Epoch 12/30 - Train Loss: 0.0026 - Val RMSE: 0.0698
Epoch 13/30 - Train Loss: 0.0053 - Val RMSE: 0.0811
Early stopping activé à l'epoch 13

Test RMSE final: 0.0817


**Génération de recommandations**

In [12]:
import torch

def recommend_movies(user_id, model, data, user2idx, movie2idx, movies, top_k=10):
    model.eval()
    
    # Récupérer l'indice interne de l'utilisateur
    user_idx = torch.tensor([user2idx[user_id]], dtype=torch.long)
    
    # Créer un tensor avec tous les films
    all_movie_idx = torch.tensor(list(range(len(movie2idx))), dtype=torch.long)
    
    # Prédire les notes pour tous les films
    with torch.no_grad():
        preds = model(user_idx.repeat(len(all_movie_idx)), all_movie_idx)
    
    # Transformer en numpy et trier les indices par score décroissant
    top_indices = preds.numpy().argsort()[::-1][:top_k]
    
    # Récupérer les movie_ids correspondants
    idx2movie = {v: k for k, v in movie2idx.items()}
    top_movie_ids = [idx2movie[idx] for idx in top_indices]
    
    # Récupérer les titres
    recommended_titles = movies[movies['movie_id'].isin(top_movie_ids)]['title'].tolist()
    
    return recommended_titles

# Exemple : recommandations pour l'utilisateur 5
top_movies = recommend_movies(user_id=5, model=model, data=data,
                              user2idx=user2idx, movie2idx=movie2idx,
                              movies=movies, top_k=10)
print("Top 10 films recommandés pour l'utilisateur 5 :")
for i, title in enumerate(top_movies, 1):
    print(f"{i}. {title}")


Top 10 films recommandés pour l'utilisateur 5 :
1. Pushing Hands (1992)
2. Other Voices, Other Rooms (1997)
3. Hedd Wyn (1992)
4. JLG/JLG - autoportrait de décembre (1994)
5. To Cross the Rubicon (1991)
6. Man from Down Under, The (1943)
7. Brothers in Trouble (1995)
8. Sudden Manhattan (1996)
9. Nothing Personal (1995)
10. Ripe (1996)


**générer les recommandations finales filtrées**

In [13]:
import torch

def recommend_new_movies(user_id, model, data, user2idx, movie2idx, movies, top_k=10):
    model.eval()
    
    # Indice interne de l'utilisateur
    user_idx = torch.tensor([user2idx[user_id]], dtype=torch.long)
    
    # Tous les films
    all_movie_idx = torch.tensor(list(range(len(movie2idx))), dtype=torch.long)
    
    # Films déjà vus par l'utilisateur
    seen_movie_ids = data[data['user_id'] == user_id]['movie_id'].tolist()
    seen_movie_idx = [movie2idx[mid] for mid in seen_movie_ids]
    
    # Masquer les films déjà vus
    mask = torch.ones(len(all_movie_idx), dtype=torch.bool)
    mask[seen_movie_idx] = False
    candidate_movies_idx = all_movie_idx[mask]
    
    # Prédire les notes pour les films non vus
    with torch.no_grad():
        preds = model(user_idx.repeat(len(candidate_movies_idx)), candidate_movies_idx)
    
    # Trier par score décroissant
    top_indices = preds.numpy().argsort()[::-1][:top_k]
    
    # Récupérer les movie_ids correspondants
    idx2movie = {v: k for k, v in movie2idx.items()}
    top_movie_ids = [idx2movie[candidate_movies_idx[idx].item()] for idx in top_indices]
    
    # Récupérer les titres
    recommended_titles = movies[movies['movie_id'].isin(top_movie_ids)]['title'].tolist()
    
    return recommended_titles

# Exemple : recommandations pour l'utilisateur 5 (nouveaux films)
top_new_movies = recommend_new_movies(user_id=5, model=model, data=data,
                                      user2idx=user2idx, movie2idx=movie2idx,
                                      movies=movies, top_k=10)
print("Top 10 films recommandés (non vus) pour l'utilisateur 5 :")
for i, title in enumerate(top_new_movies, 1):
    print(f"{i}. {title}")

Top 10 films recommandés (non vus) pour l'utilisateur 5 :
1. Pushing Hands (1992)
2. Other Voices, Other Rooms (1997)
3. Hedd Wyn (1992)
4. JLG/JLG - autoportrait de décembre (1994)
5. To Cross the Rubicon (1991)
6. Man from Down Under, The (1943)
7. Brothers in Trouble (1995)
8. Sudden Manhattan (1996)
9. Nothing Personal (1995)
10. Ripe (1996)
