<a href="https://colab.research.google.com/github/lamyse1/Recommender-Systems/blob/main/RS_Final_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# *RS Graded Project by Lamyse Ammar*




# **Part 1: Recommendation System Using GCNN [40%]**

# Step 1: Load and preprocess the dataset

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

def preprocess_rec_sys_data(path="Rec_sys_data.xlsx"):
    df = pd.read_excel(path)
    return df

df = preprocess_rec_sys_data()


In [9]:
 #Read the first 5 rows
 df.head()

Unnamed: 0,InvoiceNo,StockCode,Quantity,InvoiceDate,DeliveryDate,Discount%,ShipMode,ShippingCost,CustomerID
0,536365,84029E,6,2010-12-01 08:26:00,2010-12-02 08:26:00,0.2,ExpressAir,30.12,17850
1,536365,71053,6,2010-12-01 08:26:00,2010-12-02 08:26:00,0.21,ExpressAir,30.12,17850
2,536365,21730,6,2010-12-01 08:26:00,2010-12-03 08:26:00,0.56,Regular Air,15.22,17850
3,536365,84406B,8,2010-12-01 08:26:00,2010-12-03 08:26:00,0.3,Regular Air,15.22,17850
4,536365,22752,2,2010-12-01 08:26:00,2010-12-04 08:26:00,0.57,Delivery Truck,5.81,17850


In [10]:
#Confirm dtypes
print("\nData types:")
print(df.dtypes)


Data types:
InvoiceNo                int64
StockCode               object
Quantity                 int64
InvoiceDate     datetime64[ns]
DeliveryDate    datetime64[ns]
Discount%              float64
ShipMode                object
ShippingCost           float64
CustomerID               int64
dtype: object


In [11]:
# Check for missing values
missing_values = df.isnull().sum()
print("\Missing Values ")
for col, missing in missing_values.items():
    pct = missing / len(df) * 100
    print(f"{col}: {missing} missing values ({pct:}%)")



\Missing Values 
InvoiceNo: 0 missing values (0.0%)
StockCode: 0 missing values (0.0%)
Quantity: 0 missing values (0.0%)
InvoiceDate: 0 missing values (0.0%)
DeliveryDate: 0 missing values (0.0%)
Discount%: 0 missing values (0.0%)
ShipMode: 0 missing values (0.0%)
ShippingCost: 0 missing values (0.0%)
CustomerID: 0 missing values (0.0%)


In [12]:
# I will Keep only the core columns for my rec sys
df = df[['CustomerID', 'StockCode', 'Quantity', 'DeliveryDate']]

# Checking
print("Keeping only the required columns are:", df.columns.tolist())
display(df.head())


Keeping only the required columns are: ['CustomerID', 'StockCode', 'Quantity', 'DeliveryDate']


Unnamed: 0,CustomerID,StockCode,Quantity,DeliveryDate
0,17850,84029E,6,2010-12-02 08:26:00
1,17850,71053,6,2010-12-02 08:26:00
2,17850,21730,6,2010-12-03 08:26:00
3,17850,84406B,8,2010-12-03 08:26:00
4,17850,22752,2,2010-12-04 08:26:00


# Step 2 : Graph Construction

In [13]:
# Make the DataFrame into a customer×item interaction matrix
interaction_matrix = df.pivot_table(
    index='CustomerID',
    columns='StockCode',
    values='Quantity',
    aggfunc='sum',
    fill_value=0
)

# convert index and column to strings for consistency
interaction_matrix.index  = interaction_matrix.index.astype(str)
interaction_matrix.columns = interaction_matrix.columns.astype(str)

import networkx as nx
B = nx.Graph()

# add customer nodes
B.add_nodes_from(interaction_matrix.index, bipartite=0, node_type='customer')
# add item nodes
B.add_nodes_from(interaction_matrix.columns, bipartite=1, node_type='item')

# create weighted edges for each purchase
for cust in interaction_matrix.index:
    purchased = interaction_matrix.loc[cust]
    purchased = purchased[purchased > 0]
    for item, qty in purchased.items():
        B.add_edge(cust, item, weight=int(qty))

# check the resulting graph
print(f"Graph has {B.number_of_nodes()} nodes and {B.number_of_edges()} edges")



Graph has 7161 nodes and 192758 edges


# Step 3 : Modeling with GCNN

In [14]:
!pip install torch-geometric


Collecting torch-geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m61.4/63.1 kB[0m [31m2.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch-geometric
Successfully installed torch-geometric-2.6.1


In [15]:
# build edge_index and edge_weight tensors
import torch
from torch_geometric.data import Data
import networkx as nx

nodes       = list(B.nodes())
node_to_idx = {n: i for i, n in enumerate(nodes)}

edges, weights = [], []
for u, v, attrs in B.edges(data=True):
    ui, vi = node_to_idx[u], node_to_idx[v]
    edges   += [[ui, vi], [vi, ui]]
    weights += [attrs['weight'], attrs['weight']]

edge_index  = torch.tensor(edges, dtype=torch.long).t().contiguous()
edge_weight = torch.tensor(weights, dtype=torch.float)

# construct node feature matrix x (using node degree )
deg = dict(B.degree())
x   = torch.tensor([[deg[nodes[i]]] for i in range(len(nodes))], dtype=torch.float)

# assemble into a PyTorch-Geometric Data object
data = Data(x=x, edge_index=edge_index, edge_weight=edge_weight)


# define a 2-layer GCNN + MLP edge predictor
from torch_geometric.nn import GraphConv

class GCNNRecommender(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = GraphConv(in_channels, hidden_channels)
        self.conv2 = GraphConv(hidden_channels, out_channels)
        self.mlp   = torch.nn.Sequential(
            torch.nn.Linear(2 * out_channels, hidden_channels),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_channels, 1)
        )

    def forward(self, data):
        x, ei, ew = data.x, data.edge_index, data.edge_weight
        x = torch.relu(self.conv1(x, ei, edge_weight=ew))
        x = self.conv2(x, ei, edge_weight=ew)
        return x



    def predict_edge(self, embeddings, u_idx, v_idx):
        u_emb = embeddings[u_idx]
        v_emb = embeddings[v_idx]
        h     = torch.cat([u_emb, v_emb], dim=1)
        raw   = self.mlp(h).squeeze()
        return torch.sigmoid(raw)


# instantiate model with feature-size → hidden → embedding dims
model = GCNNRecommender(
    in_channels    = data.num_node_features,
    hidden_channels= 64,
    out_channels   = 32
)

# now data & model are ready for the training loop
print(model)


GCNNRecommender(
  (conv1): GraphConv(1, 64)
  (conv2): GraphConv(64, 32)
  (mlp): Sequential(
    (0): Linear(in_features=64, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=1, bias=True)
  )
)


# Step 4 :Training the Model

In [19]:
import torch
from torch.optim import Adam
from torch.nn import MSELoss
import torch.nn.utils as utils
import networkx as nx
from torch_geometric.data import Data

# split into training and validation sets based on DeliveryDate percentile
cutoff   = df.DeliveryDate.quantile(0.8)
train_df = df[df.DeliveryDate <= cutoff].reset_index(drop=True)
val_df   = df[df.DeliveryDate  > cutoff].reset_index(drop=True)

# build a graph containing only training edges for GCNN message passing
B_train = nx.Graph()
B_train.add_nodes_from(B.nodes(data=True))
for _, row in train_df.iterrows():
    u, v   = str(int(row.CustomerID)), str(row.StockCode)
    weight = int(row.Quantity)
    B_train.add_edge(u, v, weight=weight)


In [20]:
# convert the train-only graph into PyG Data format
nodes        = list(B_train.nodes())
node_to_idx  = {n: i for i, n in enumerate(nodes)}
edges, weights = [], []
for u, v, attrs in B_train.edges(data=True):
    ui, vi    = node_to_idx[u], node_to_idx[v]
    edges    += [[ui, vi], [vi, ui]]
    weights  += [attrs['weight'], attrs['weight']]

edge_index  = torch.tensor(edges, dtype=torch.long).t().contiguous()
edge_weight = torch.tensor(weights, dtype=torch.float)
deg         = dict(B_train.degree())
x           = torch.tensor([[deg[nodes[i]]] for i in range(len(nodes))], dtype=torch.float)
x           = x / x.max()

data = Data(x=x, edge_index=edge_index, edge_weight=edge_weight)


In [21]:
# helper to build index and target tensors from dataframe rows
def build_edge_data(df_edges):
    u_idx, v_idx, y = [], [], []
    for _, row in df_edges.iterrows():
        u = str(int(row.CustomerID))
        v = str(row.StockCode)
        if u not in node_to_idx or v not in node_to_idx:
            continue
        u_idx.append(node_to_idx[u])
        v_idx.append(node_to_idx[v])
        y.append(row.Quantity)
    return (
        torch.tensor(u_idx, dtype=torch.long),
        torch.tensor(v_idx, dtype=torch.long),
        torch.tensor(y,     dtype=torch.float),
    )

train_u, train_v, train_y = build_edge_data(train_df)
val_u,   val_v,   val_y   = build_edge_data(val_df)



In [22]:
# scale target quantities to [0,1] using the maximum from training
y_max    = train_y.max()
train_y  = train_y / y_max
val_y    = val_y   / y_max

# set up the optimizer and MSE loss
optimizer = Adam(model.parameters(), lr=1e-5, weight_decay=1e-6)
criterion = MSELoss()


In [23]:
# training loop reporting training and validation MSE
num_epochs = 30
for epoch in range(1, num_epochs+1):
    model.train()
    optimizer.zero_grad()
    embeds = model(data)
    preds  = model.predict_edge(embeds, train_u, train_v)
    loss   = criterion(preds, train_y)
    loss.backward()
    utils.clip_grad_norm_(model.parameters(), max_norm=0.5)
    optimizer.step()

    model.eval()
    with torch.no_grad():
        val_preds = model.predict_edge(model(data), val_u, val_v)
        val_loss  = criterion(val_preds, val_y)

    print(f"Epoch {epoch:02d} — train MSE: {loss.item():.6f}, val MSE: {val_loss.item():.6f}")


Epoch 01 — train MSE: 0.930402, val MSE: 0.884090
Epoch 02 — train MSE: 0.929120, val MSE: 0.882687
Epoch 03 — train MSE: 0.927420, val MSE: 0.881769
Epoch 04 — train MSE: 0.926036, val MSE: 0.880502
Epoch 05 — train MSE: 0.924396, val MSE: 0.879601
Epoch 06 — train MSE: 0.923023, val MSE: 0.878412
Epoch 07 — train MSE: 0.921714, val MSE: 0.877072
Epoch 08 — train MSE: 0.920070, val MSE: 0.876093
Epoch 09 — train MSE: 0.918757, val MSE: 0.875109
Epoch 10 — train MSE: 0.917291, val MSE: 0.874227
Epoch 11 — train MSE: 0.916093, val MSE: 0.873125
Epoch 12 — train MSE: 0.914772, val MSE: 0.872141
Epoch 13 — train MSE: 0.913280, val MSE: 0.871230
Epoch 14 — train MSE: 0.912024, val MSE: 0.870239
Epoch 15 — train MSE: 0.910708, val MSE: 0.869389
Epoch 16 — train MSE: 0.909284, val MSE: 0.868003
Epoch 17 — train MSE: 0.907897, val MSE: 0.867060
Epoch 18 — train MSE: 0.906712, val MSE: 0.865918
Epoch 19 — train MSE: 0.905335, val MSE: 0.864737
Epoch 20 — train MSE: 0.903767, val MSE: 0.863576


# Part 2: Recommendation System Evaluation and Comparison Using GCNN and NeuMF Models