# MovieLens Cold-Start Recommendation System
## Hybrid GCN with BERT Embeddings and Self-Supervised Learning

This notebook implements a Graph Convolutional Network (GCN) for movie recommendations that handles cold-start items using semantic embeddings and contrastive learning.

---
## Block 1: Setup & Data Download
Download the MovieLens-100K dataset and extract it to the current directory.

In [None]:
import os
import requests
import zipfile
import io

def download_movielens():
    """Download and unpack MovieLens-100K into ``./ml-100k`` if it is absent.

    The archive is fetched into memory and extracted into the current
    directory.  Nothing is downloaded when ``./ml-100k`` already exists.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the server does not respond in time.
    """
    url = "https://files.grouplens.org/datasets/movielens/ml-100k.zip"
    if os.path.exists("./ml-100k"):
        print("Dataset already exists.")
        return

    print("Downloading MovieLens-100K dataset...")
    # A timeout prevents the cell from hanging forever on a stalled connection.
    r = requests.get(url, timeout=60)
    # Fail loudly on 4xx/5xx instead of handing an error page to ZipFile.
    r.raise_for_status()
    with zipfile.ZipFile(io.BytesIO(r.content)) as z:
        z.extractall("./")
    print("Done.")


download_movielens()

---
## Block 2: Install Required Libraries


In [None]:
# Install the third-party packages used by the cells below.
# NOTE(review): a bare `pip install` is not Python syntax; it presumably relies
# on the notebook front-end treating it as a magic/shell command — confirm this
# cell is only ever executed inside a notebook.
pip install torch-geometric sentence-transformers pandas scikit-learn

---
## Block 3: Import Libraries
Import all necessary libraries for PyTorch, PyTorch Geometric, BERT embeddings, and data processing.

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
from torch_geometric.nn.conv import MessagePassing
from torch_geometric.utils import degree, dropout_adj
from torch_geometric.data import Data
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

# Pick the compute device: use CUDA when it is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

---
## Block 4: Configuration
Define all hyperparameters and configuration settings for the model.

In [None]:
class Config:
    """Paths and hyperparameters shared by the cells of this notebook."""

    # --- Data ---
    DATA_PATH = './ml-100k'
    SPLIT_RATIO = 0.8      # 80% warm items / 20% cold items

    # --- Model ---
    BERT_DIM = 384         # output width of all-MiniLM-L6-v2
    EMBED_DIM = 64         # latent space dimension
    N_LAYERS = 3           # number of GCN layers
    DROPOUT = 0.1

    # --- Optimisation ---
    LR = 0.001
    WEIGHT_DECAY = 0.0001
    EPOCHS = 50
    BATCH_SIZE = 2048

    # --- Self-supervised objective ---
    SSL_REG = 0.1          # weight of the contrastive loss
    SSL_TEMP = 0.2         # InfoNCE temperature


cfg = Config()

---
## Block 5: Dataset Class - Cold Start Split
Load the MovieLens data, split items into warm (train) and cold (test) sets by the timestamp of their first interaction to simulate a cold-start scenario, and generate BERT embeddings from each item's title.

In [None]:
class MovieLensColdStartDataset:
    """MovieLens-100K interactions with an item-level warm/cold split.

    Items are ordered by the timestamp of their first interaction.  The
    earliest ``split_ratio`` fraction are "warm" (their interactions build the
    training graph); the remaining items are "cold" and contribute no edges.
    Every item title is encoded with a sentence-transformer so that all items
    have content features.

    Attributes set during construction:
        interactions: ratings frame with added ``user_idx`` / ``item_idx``.
        items: item metadata frame with ``item_id`` and ``title``.
        num_users, num_items: distinct users / items seen in ``interactions``.
        test_items: array of cold ``item_idx`` values.
        train_mask: boolean Series selecting interactions on warm items.
        bert_feats: title embeddings; row ``k`` belongs to ``item_idx == k``.
    """

    def __init__(self, root_path, split_ratio=0.8):
        """Load the raw files under ``root_path`` and run all preprocessing.

        Args:
            root_path: directory containing ``u.data`` and ``u.item``.
            split_ratio: fraction of items (by first appearance) kept as warm.
        """
        self.root = root_path
        self.split_ratio = split_ratio

        # Load Interactions
        self.interactions = pd.read_csv(
            f"{root_path}/u.data",
            sep='\t',
            names=['user_id', 'item_id', 'rating', 'timestamp']
        )

        # Load Item Metadata (only the title column is used as BERT input)
        self.items = pd.read_csv(
            f"{root_path}/u.item",
            sep='|',
            encoding='latin-1',
            header=None,
            usecols=[0, 1],
            names=['item_id', 'title']
        )

        self._process_ids()
        self._time_split()
        self._generate_bert_embeddings()

    def _process_ids(self):
        """Map raw user/item ids to contiguous indices and record the counts."""
        self.user_enc = LabelEncoder()
        self.item_enc = LabelEncoder()

        self.interactions['user_idx'] = self.user_enc.fit_transform(self.interactions['user_id'])
        self.interactions['item_idx'] = self.item_enc.fit_transform(self.interactions['item_id'])

        self.num_users = len(self.user_enc.classes_)
        self.num_items = len(self.item_enc.classes_)

    def _time_split(self):
        """Split items into warm/cold by first-interaction time.

        Sets ``self.test_items`` (cold item indices) and ``self.train_mask``
        (interactions whose item is warm).
        """
        # Sort items by their first appearance timestamp
        item_start_time = self.interactions.groupby('item_idx')['timestamp'].min().reset_index()
        item_start_time = item_start_time.sort_values('timestamp')

        split_idx = int(len(item_start_time) * self.split_ratio)

        # Define Warm vs Cold items
        train_items = item_start_time.iloc[:split_idx]['item_idx'].values
        self.test_items = item_start_time.iloc[split_idx:]['item_idx'].values

        # Masks
        self.train_mask = self.interactions['item_idx'].isin(train_items)

        print(f"Total Items: {self.num_items}")
        print(f"Warm Items (Train Graph): {len(train_items)}")
        print(f"Cold Items (Zero-Shot Test): {len(self.test_items)}")

    def _aligned_titles(self):
        """Return item titles ordered by ``item_idx`` (one entry per index).

        The metadata file and the interaction log need not list the same
        items, so titles are looked up by raw ``item_id`` rather than by row
        position.  Items without metadata get an empty title.
        """
        # Raw item_id for every item_idx, in item_idx order.
        id_by_idx = (
            self.interactions[['item_idx', 'item_id']]
            .drop_duplicates('item_idx')
            .sort_values('item_idx')['item_id']
        )
        title_by_id = (
            self.items.drop_duplicates('item_id').set_index('item_id')['title']
        )
        # reindex() yields NaN for ids absent from the metadata; the text
        # encoder needs strings, so substitute empty text.
        return title_by_id.reindex(id_by_idx.values).fillna('').tolist()

    def _generate_bert_embeddings(self):
        """Encode every item's title; stores the result in ``self.bert_feats``."""
        titles = self._aligned_titles()

        print("Encoding item text with BERT (this may take a moment)...")
        bert = SentenceTransformer('all-MiniLM-L6-v2')
        self.bert_feats = bert.encode(
            titles,
            convert_to_tensor=True,
            device=device
        )

    def get_graph_data(self):
        """Return the bidirectional user-item edge index of the training graph.

        Item nodes are offset by ``num_users`` so users and items share one
        node index space.  Each interaction contributes a user->item and an
        item->user edge.  The tensor is moved to the global ``device``.
        """
        # Only TRAIN interactions build the graph
        train_df = self.interactions[self.train_mask]

        u_tensor = torch.tensor(train_df['user_idx'].values)
        i_tensor = torch.tensor(train_df['item_idx'].values) + self.num_users

        edge_index = torch.stack([
            torch.cat([u_tensor, i_tensor]),
            torch.cat([i_tensor, u_tensor])
        ], dim=0)

        return edge_index.to(device)

---
## Block 6: LightGCN Convolution Layer
Implementation of the LightGCN message passing layer with symmetric normalization.

In [None]:
class LightGCNConv(MessagePassing):
    """Weight-free message-passing layer with symmetric degree normalisation.

    Each message is the neighbour feature scaled by the product of the
    inverse square-root degrees of the edge's two endpoints; messages are
    summed at the receiving node.
    """

    def __init__(self):
        super().__init__(aggr='add')

    def forward(self, x, edge_index):
        """Propagate ``x`` once along ``edge_index`` and return the result."""
        src, dst = edge_index
        # Degrees are counted from the second row of edge_index.
        node_degree = degree(dst, x.size(0), dtype=x.dtype)
        inv_sqrt = node_degree.pow(-0.5)
        # Zero-degree nodes give inf; neutralise them so they contribute nothing.
        inv_sqrt = inv_sqrt.masked_fill(inv_sqrt == float('inf'), 0)
        edge_weight = inv_sqrt[src] * inv_sqrt[dst]
        return self.propagate(edge_index, x=x, norm=edge_weight)

    def message(self, x_j, norm):
        """Scale each neighbour feature row by its per-edge coefficient."""
        return x_j * norm.view(-1, 1)

---
## Block 7: Hybrid GCN Model
Main model combining BERT semantic embeddings with graph collaborative filtering. Uses layer-wise propagation and mean pooling across layers.

In [None]:
class HybridGCN(nn.Module):
    """Graph recommender whose item nodes start from projected text features.

    Users get a learned embedding table; items get a linear+ReLU projection
    of their text features.  Both are propagated through ``n_layers``
    ``LightGCNConv`` layers, and the final representation is the mean over
    the input embedding and every layer's output.
    """

    def __init__(self, num_users, num_items, bert_dim, embedding_dim, n_layers, dropout):
        super().__init__()
        self.num_users = num_users
        self.num_items = num_items
        self.dropout = dropout
        self.n_layers = n_layers

        # Maps text features into the shared latent space.
        projection = [nn.Linear(bert_dim, embedding_dim), nn.ReLU()]
        self.bert_proj = nn.Sequential(*projection)

        # Users have no content features, so they are learned from scratch.
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        nn.init.normal_(self.user_embedding.weight, std=0.1)

        # One propagation layer per hop.
        self.convs = nn.ModuleList([LightGCNConv() for _ in range(n_layers)])

    def forward(self, edge_index, bert_features):
        """Return ``(user_embeddings, item_embeddings)`` after propagation.

        Args:
            edge_index: graph connectivity over the joint user+item node set,
                with users occupying the first ``num_users`` node indices.
            bert_features: one feature row per item.
        """
        projected_items = self.bert_proj(bert_features)
        node_emb = torch.cat([self.user_embedding.weight, projected_items], dim=0)

        per_layer = [node_emb]
        for layer in self.convs:
            node_emb = layer(node_emb, edge_index)
            # Dropout is a no-op outside training mode.
            node_emb = F.dropout(node_emb, p=self.dropout, training=self.training)
            per_layer.append(node_emb)

        # Average the representations from all depths.
        pooled = torch.stack(per_layer, dim=1).mean(dim=1)

        users_emb, items_emb = torch.split(pooled, [self.num_users, self.num_items])
        return users_emb, items_emb

---
## Block 8: Loss Functions
BPR (Bayesian Personalized Ranking) loss for recommendation and InfoNCE contrastive loss for self-supervised learning.

In [None]:
def bpr_loss(users, pos_items, neg_items):
    """Bayesian Personalized Ranking loss over a batch of triples.

    Scores are row-wise dot products between the user embedding and the
    positive / negative item embedding.  The loss is the mean negative
    log-sigmoid of the positive-minus-negative score margin.
    """
    score_pos = (users * pos_items).sum(dim=1)
    score_neg = (users * neg_items).sum(dim=1)
    margin = score_pos - score_neg
    return -F.logsigmoid(margin).mean()

def info_nce_loss(view1, view2, temp):
    """InfoNCE contrastive loss between two aligned batches of embeddings.

    Row ``k`` of ``view1`` and row ``k`` of ``view2`` form the positive pair;
    every row of ``view2`` serves as a candidate for each row of ``view1``.
    Similarities are cosine similarities divided by ``temp``.

    Args:
        view1: 2-D tensor of embeddings, one row per sample.
        view2: 2-D tensor with the same shape as ``view1``.
        temp: temperature; smaller values sharpen the distribution.

    Returns:
        Scalar tensor holding the mean loss over the batch.
    """
    view1 = F.normalize(view1, dim=1)
    view2 = F.normalize(view2, dim=1)
    pos_logits = (view1 * view2).sum(dim=1) / temp
    all_logits = torch.matmul(view1, view2.transpose(0, 1)) / temp
    # -log(exp(pos) / sum(exp(all))) == logsumexp(all) - pos.  Working in log
    # space avoids the float overflow (inf / inf -> NaN) that explicit exp()
    # produces at small temperatures.
    return (torch.logsumexp(all_logits, dim=1) - pos_logits).mean()

---
## Block 9: Training Loop
One training step per epoch (a single sampled batch), combining BPR loss with self-supervised graph learning (SGL) that uses edge-dropout augmentation.

In [None]:
def train_epoch(model, dataset, optimizer):
    """Run one optimisation step on a sampled batch and return its loss.

    The objective is BPR on observed training interactions plus a weighted
    InfoNCE term between user embeddings from two edge-dropped graph views.

    Args:
        model: network called as ``model(edge_index, bert_feats)`` and
            returning ``(user_embeddings, item_embeddings)``.
        dataset: provides ``get_graph_data()``, ``bert_feats``,
            ``interactions`` and ``train_mask``.
        optimizer: optimizer over ``model``'s parameters.

    Returns:
        The total loss of this step as a Python float.

    Raises:
        ValueError: if there are no training interactions to sample from.
    """
    model.train()
    edge_index = dataset.get_graph_data()
    bert_feats = dataset.bert_feats

    # Observed (user, item) pairs that belong to the training graph.
    train_df = dataset.interactions[dataset.train_mask]
    if train_df.empty:
        raise ValueError("No training interactions available to sample from.")
    obs_users = torch.as_tensor(train_df['user_idx'].values, dtype=torch.long, device=device)
    obs_items = torch.as_tensor(train_df['item_idx'].values, dtype=torch.long, device=device)

    # 1. Forward
    users_emb, items_emb = model(edge_index, bert_feats)

    # 2. BPR Loss
    # Positives are real interactions: drawing the "positive" item at random
    # would give the ranking loss no signal about user preferences.
    picks = torch.randint(0, obs_users.numel(), (cfg.BATCH_SIZE,), device=device)
    batch_users = obs_users[picks]
    batch_pos = obs_items[picks]
    # Negatives come from warm items only, so cold items stay unseen during
    # training.  A sampled negative may occasionally be a true positive for
    # that user; this is not filtered out.
    warm_items = torch.unique(obs_items)
    neg_picks = torch.randint(0, warm_items.numel(), (cfg.BATCH_SIZE,), device=device)
    batch_neg = warm_items[neg_picks]

    loss_bpr = bpr_loss(users_emb[batch_users], items_emb[batch_pos], items_emb[batch_neg])

    # 3. SGL Loss: two independently edge-dropped views of the same graph.
    edge_index_1, _ = dropout_adj(edge_index, p=0.1)
    edge_index_2, _ = dropout_adj(edge_index, p=0.1)

    user_view_1, _ = model(edge_index_1, bert_feats)
    user_view_2, _ = model(edge_index_2, bert_feats)

    # De-duplicate users so a user repeated in the batch is not treated as
    # its own negative in the contrastive denominator.
    ssl_users = torch.unique(batch_users)
    loss_sgl = info_nce_loss(user_view_1[ssl_users], user_view_2[ssl_users], cfg.SSL_TEMP)

    total_loss = loss_bpr + cfg.SSL_REG * loss_sgl

    optimizer.zero_grad()
    total_loss.backward()
    optimizer.step()

    return total_loss.item()

---
## Block 10: Initialize Dataset
Load and preprocess the MovieLens dataset with cold-start split.

In [None]:
print("Initializing Dataset...")
# Pass the configured ratio explicitly; otherwise editing Config.SPLIT_RATIO
# has no effect because the constructor falls back to its own default.
dataset = MovieLensColdStartDataset(cfg.DATA_PATH, split_ratio=cfg.SPLIT_RATIO)

---
## Block 11: Initialize Model and Optimizer
Create the Hybrid GCN model and Adam optimizer.

In [None]:
print("Initializing Model...")
# Build the network from the dataset sizes and configured hyperparameters,
# then move it to the selected device.
model = HybridGCN(
    num_users=dataset.num_users,
    num_items=dataset.num_items,
    bert_dim=cfg.BERT_DIM,
    embedding_dim=cfg.EMBED_DIM,
    n_layers=cfg.N_LAYERS,
    dropout=cfg.DROPOUT,
)
model = model.to(device)

# Adam with weight decay over all model parameters.
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=cfg.LR,
    weight_decay=cfg.WEIGHT_DECAY,
)

---
## Block 12: Training Execution
Run the training loop for the specified number of epochs.

In [None]:
print("Starting Training...")
for epoch in range(1, cfg.EPOCHS + 1):
    loss = train_epoch(model, dataset, optimizer)
    # Report only on the first epoch and on every fifth epoch.
    if epoch != 1 and epoch % 5 != 0:
        continue
    print(f"Epoch {epoch}/{cfg.EPOCHS} | Loss: {loss:.4f}")