In [1]:
# Import required modules
import random
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn import model_selection, metrics, preprocessing

import copy
import torch
from torch import nn, optim, Tensor
from torch_sparse import SparseTensor, matmul
from torch_geometric.utils import degree, structured_negative_sampling
from torch_geometric.data import download_url, extract_zip
from torch_geometric.nn.conv import MessagePassing
from torch_geometric.typing import Adj


In [3]:
# Download the dataset
import os
from torch_geometric.data import download_url, extract_zip

# Define dataset URL and file paths
url = "https://files.grouplens.org/datasets/movielens/ml-latest-small.zip"
dataset_path = "ml-latest-small.zip"
extract_path = "."

# Download and extract dataset
if not os.path.exists(dataset_path):
    download_url(url, extract_path)
extract_zip(dataset_path, extract_path)

# Define file paths
movie_path = "./ml-latest-small/movies.csv"
rating_path = "./ml-latest-small/ratings.csv"
user_path = "./ml-latest-small/users.csv"

# Load dataset
import pandas as pd

rating_df = pd.read_csv(rating_path)

# Display first few rows
print(rating_df.head())

# Display unique movie and user counts
print(f"Number of unique movies: {len(rating_df['movieId'].unique())}")
print(f"Number of unique users: {len(rating_df['userId'].unique())}")




   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931
Number of unique movies: 9724
Number of unique users: 610


Extracting ml-latest-small.zip


In [4]:
# Display summary statistics
rating_df.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,100836.0,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557,1205946000.0
std,182.618491,35530.987199,1.042529,216261000.0
min,1.0,1.0,0.5,828124600.0
25%,177.0,1199.0,3.0,1019124000.0
50%,325.0,2991.0,3.5,1186087000.0
75%,477.0,8122.0,4.0,1435994000.0
max,610.0,193609.0,5.0,1537799000.0


In [5]:
lbl_user = preprocessing.LabelEncoder()
lbl_movie = preprocessing.LabelEncoder()

rating_df.userId = lbl_user.fit_transform(rating_df.userId.values)
rating_df.movieId = lbl_movie.fit_transform(rating_df.movieId.values)

In [6]:
print(rating_df.userId.max())
print(rating_df.movieId.max())

609
9723


In [7]:
rating_df.rating.value_counts()

rating
4.0    26818
3.0    20047
5.0    13211
3.5    13136
4.5     8551
2.0     7551
2.5     5550
1.0     2811
1.5     1791
0.5     1370
Name: count, dtype: int64

In [8]:


def load_edge_csv(df, src_index_col, dst_index_col, link_index_col, rating_threshold=3):
    """
    Loads a CSV containing edges between users and items.

    Args:
        df (pd.DataFrame): DataFrame containing user-item interactions.
        src_index_col (str): Column name for users.
        dst_index_col (str): Column name for items (movies).
        link_index_col (str): Column name for user-item interaction (ratings).
        rating_threshold (int, optional): Threshold to determine positive edges. Defaults to 3.

    Returns:
        list: Edge index (2xN matrix) containing the node IDs of N user-item edges.
    """
    
    print("Constructing COO format edge_index from input rating events...")

    # Get user IDs from rating events in order of occurrence
    src = df[src_index_col].tolist()
    
    # Get movie IDs from rating events in order of occurrence
    dst = df[dst_index_col].tolist()

    # Apply rating threshold to filter interactions
    edge_attr = torch.from_numpy(df[link_index_col].values).view(-1, 1).to(torch.long)
    
    edge_index = [[], []]  # COO format edge index (two lists for source and destination nodes)

    for i in range(edge_attr.shape[0]):
        if edge_attr[i] >= rating_threshold:
            edge_index[0].append(src[i])
            edge_index[1].append(dst[i])

    return edge_index


In [12]:

edge_index= load_edge_csv(
rating_df,
src_index_col='userId', dst_index_col='movieId', link_index_col="rating", rating_threshold=3.5,
)
print(f"{len(edge_index)} x {len(edge_index[0])}")

Constructing COO format edge_index from input rating events...
2 x 48580


In [14]:
#Convert to Tensor
edge_index = torch.LongTensor(edge_index)
print(edge_index)
print(edge_index.size())

tensor([[   0,    0,    0,  ...,  609,  609,  609],
        [   0,    2,    5,  ..., 9443, 9444, 9445]])
torch.Size([2, 48580])
