**Chargement et exploration des données**

In [1]:
import numpy as np 
import pandas as pd 

# les notes des utilisateurs depuis u.data
ratings = pd.read_csv("/kaggle/input/movielens-100k-dataset/ml-100k/u.data")
ratings = pd.read_csv("/kaggle/input/movielens-100k-dataset/ml-100k/u.data", sep="\t", names=["user_id", "movie_id", "rating", "timestamp"])
print(ratings)

# les métadonnées des films depuis u.item 
movies = pd.read_csv("/kaggle/input/movielens-100k-dataset/ml-100k/u.item", sep="|", encoding="latin-1", 
                     names=["movie_id", "title", "release_date", "video_release_date", "IMDb_URL", "unknown", "Action", "Adventure", "Animation", "Children's", "Comedy", "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"])
print(movies)

# les informations sur les utilisateurs
users = pd.read_csv("/kaggle/input/movielens-100k-dataset/ml-100k/u.user", sep="|", names=["user_id", "age", "gender", "occupation", "zip_code"])
print(users)

       user_id  movie_id  rating  timestamp
0          196       242       3  881250949
1          186       302       3  891717742
2           22       377       1  878887116
3          244        51       2  880606923
4          166       346       1  886397596
...        ...       ...     ...        ...
99995      880       476       3  880175444
99996      716       204       5  879795543
99997      276      1090       1  874795795
99998       13       225       2  882399156
99999       12       203       3  879959583

[100000 rows x 4 columns]
      movie_id                                      title release_date  \
0            1                           Toy Story (1995)  01-Jan-1995   
1            2                           GoldenEye (1995)  01-Jan-1995   
2            3                          Four Rooms (1995)  01-Jan-1995   
3            4                          Get Shorty (1995)  01-Jan-1995   
4            5                             Copycat (1995)  01-Jan-1995   
.

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


**Préparation des données**

In [2]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split

# Nettoyage
ratings.drop('timestamp', axis=1, inplace=True)

# Normalisation des notes dans [0,1]
ratings['rating'] = (ratings['rating'] - 1.0) / 4.0

# Fusion pour garder les titres (pas obligatoire mais utile pour l'interprétation)
data = pd.merge(ratings, movies[['movie_id', 'title']], on='movie_id')

# Réindexation des IDs pour embeddings
unique_user_ids = data['user_id'].unique()
unique_movie_ids = data['movie_id'].unique()

user_id_map = {id_: idx for idx, id_ in enumerate(unique_user_ids)}
movie_id_map = {id_: idx for idx, id_ in enumerate(unique_movie_ids)}

data['user_id'] = data['user_id'].map(user_id_map)
data['movie_id'] = data['movie_id'].map(movie_id_map)

# Extraction finale des données
user_ids = data['user_id'].values
movie_ids = data['movie_id'].values
ratings = data['rating'].values

# Split Train / Test
train_user, test_user, train_movie, test_movie, train_rating, test_rating = train_test_split(
    user_ids, movie_ids, ratings, test_size=0.2, random_state=42)

print("Nombre d'utilisateurs:", len(unique_user_ids))
print("Nombre de films:", len(unique_movie_ids))
print("Train size:", len(train_rating))
print("Test size:", len(test_rating))


Nombre d'utilisateurs: 943
Nombre de films: 1682
Train size: 80000
Test size: 20000


**Construction du graphe biparti**

In [3]:
pip install torch-geometric

Collecting torch-geometric
  Downloading torch_geometric-2.7.0-py3-none-any.whl.metadata (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.7/63.7 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.7.0-py3-none-any.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: torch-geometric
Successfully installed torch-geometric-2.7.0
Note: you may need to restart the kernel to use updated packages.


In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

num_users = len(unique_user_ids)
num_movies = len(unique_movie_ids)
num_nodes = num_users + num_movies

# Décalage des ID films
train_movie_offset = train_movie + num_users
test_movie_offset = test_movie + num_users

# Arêtes (train uniquement)
edge_user = torch.tensor(train_user, dtype=torch.long)
edge_movie = torch.tensor(train_movie_offset, dtype=torch.long)

edge_index = torch.stack([
    torch.cat([edge_user, edge_movie]),
    torch.cat([edge_movie, edge_user])
], dim=0)

# Features initiales aléatoires
x = torch.randn(num_nodes, 64)

data_graph = Data(x=x, edge_index=edge_index)

**Modèle GraphSAGE**

In [5]:
class GraphSAGERecommender(nn.Module):
    def __init__(self, in_channels, hidden_channels):
        super().__init__()
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, hidden_channels)
        self.predict = nn.Sequential(
            nn.Linear(hidden_channels * 2, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
            nn.Sigmoid()
        )

    def forward(self, data, user_ids, movie_ids):
        h = self.conv1(data.x, data.edge_index).relu()
        h = self.conv2(h, data.edge_index)
        movie_ids = movie_ids + num_users  # décalage
        user_emb = h[user_ids]
        movie_emb = h[movie_ids]
        concat = torch.cat([user_emb, movie_emb], dim=1)
        return self.predict(concat).squeeze()

model = GraphSAGERecommender(64, 64)
optimizer = optim.Adam(model.parameters(), lr=0.001)
loss_fn = nn.MSELoss()

**Entrainement**

In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import mean_squared_error, mean_absolute_error

# ---------------- Device ----------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# ---------------- Convertir les données en tenseurs ----------------
train_user_t = torch.tensor(train_user, dtype=torch.long).to(device)
train_movie_t = torch.tensor(train_movie, dtype=torch.long).to(device)
train_rating_t = torch.tensor(train_rating, dtype=torch.float32).to(device)

test_user_t = torch.tensor(test_user, dtype=torch.long).to(device)
test_movie_t = torch.tensor(test_movie, dtype=torch.long).to(device)
test_rating_t = torch.tensor(test_rating, dtype=torch.float32).to(device)

# ---------------- Envoyer le graphe et le modèle sur le device ----------------
data_graph = data_graph.to(device)
model = model.to(device)

# ---------------- Entraînement ----------------
epochs = 10
optimizer = optim.Adam(model.parameters(), lr=0.001)
loss_fn = nn.MSELoss()

for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    
    # Prédiction pour le train
    preds = model(data_graph, train_user_t, train_movie_t)
    loss = loss_fn(preds, train_rating_t)
    
    # Backprop
    loss.backward()
    optimizer.step()
    
    # Évaluation sur le test
    model.eval()
    with torch.no_grad():
        test_preds = model(data_graph, test_user_t, test_movie_t)
        rmse = mean_squared_error(test_rating_t.cpu(), test_preds.cpu(), squared=False)
        mae = mean_absolute_error(test_rating_t.cpu(), test_preds.cpu())
    
    print(f"Epoch {epoch+1} | Train Loss: {loss.item():.4f} | RMSE: {rmse:.4f} | MAE: {mae:.4f}")

Using device: cuda
Epoch 1 | Train Loss: 0.1171 | RMSE: 0.3295 | MAE: 0.2753
Epoch 2 | Train Loss: 0.1095 | RMSE: 0.3197 | MAE: 0.2627
Epoch 3 | Train Loss: 0.1031 | RMSE: 0.3112 | MAE: 0.2519
Epoch 4 | Train Loss: 0.0976 | RMSE: 0.3037 | MAE: 0.2470
Epoch 5 | Train Loss: 0.0930 | RMSE: 0.2970 | MAE: 0.2448
Epoch 6 | Train Loss: 0.0889 | RMSE: 0.2910 | MAE: 0.2428
Epoch 7 | Train Loss: 0.0852 | RMSE: 0.2859 | MAE: 0.2405
Epoch 8 | Train Loss: 0.0822 | RMSE: 0.2821 | MAE: 0.2379
Epoch 9 | Train Loss: 0.0800 | RMSE: 0.2805 | MAE: 0.2352
Epoch 10 | Train Loss: 0.0790 | RMSE: 0.2811 | MAE: 0.2326


**Visualisation**

In [11]:
# ---------------- Device ----------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Envoyer modèle et données sur device
model = model.to(device)
data_graph = data_graph.to(device)
train_user_t = torch.tensor(train_user, dtype=torch.long).to(device)
train_movie_t = torch.tensor(train_movie, dtype=torch.long).to(device)
train_rating_t = torch.tensor(train_rating, dtype=torch.float32).to(device)

test_user_t = torch.tensor(test_user, dtype=torch.long).to(device)
test_movie_t = torch.tensor(test_movie, dtype=torch.long).to(device)
test_rating_t = torch.tensor(test_rating, dtype=torch.float32).to(device)

# ---------------- Évaluation globale sur le test set ----------------
model.eval()
with torch.no_grad():
    test_preds = model(data_graph, test_user_t, test_movie_t)
    rmse = mean_squared_error(test_rating_t.cpu(), test_preds.cpu(), squared=False)
    mae = mean_absolute_error(test_rating_t.cpu(), test_preds.cpu())
print(f"Test RMSE: {rmse:.4f} | Test MAE: {mae:.4f}")

# ---------------- Générer recommandations pour un utilisateur ----------------
def recommend_movies(user_id, top_k=5):
    model.eval()
    with torch.no_grad():
        # Tous les films possibles
        all_movies = torch.arange(num_movies, dtype=torch.long).to(device)
        user_ids_tensor = torch.tensor([user_id] * num_movies, dtype=torch.long).to(device)
        
        # Prédiction des notes pour cet utilisateur
        preds = model(data_graph, user_ids_tensor, all_movies)
        
        # Top-k films
        top_indices = torch.topk(preds, top_k).indices.cpu().numpy()
        recommended_movie_ids = [unique_movie_ids[i] for i in top_indices]
        
        # Titres des films
        recommended_titles = movies[movies['movie_id'].isin(recommended_movie_ids)]['title'].values
        return recommended_titles

# Exemple : recommandations pour l'utilisateur 0
user_id_example = 0
recommended = recommend_movies(user_id_example, top_k=5)
print(f"Top 5 recommandations pour l'utilisateur {user_id_example}:")
for i, title in enumerate(recommended, 1):
    print(f"{i}. {title}")


Using device: cuda
Test RMSE: 0.2811 | Test MAE: 0.2326
Top 5 recommandations pour l'utilisateur 0:
1. Twilight (1998)
2. Jupiter's Wife (1994)
3. I Don't Want to Talk About It (De eso no se habla) (1993)
4. Leading Man, The (1996)
5. Tokyo Fist (1995)
