In [1]:
%load_ext lab_black
%load_ext autoreload
%autoreload 2

In [34]:
import pandas as pd
import numpy as np
import dask.dataframe as dd

import networkx as nx
import torch
from torch import nn

import re

In [3]:
meta = pd.read_csv("data/movies_metadata.csv")
# ratings = pd.read_csv("data/ratings.csv")
ratings = pd.read_csv("data/ratings_small.csv")
links = pd.read_csv("data/links.csv", dtype=str)

meta.columns = meta.columns.str.lower()
ratings.columns = ratings.columns.str.lower()
links.columns = links.columns.str.lower()

meta = meta.rename(columns={"id": "tmdbid"})

links.tmdbid = links.tmdbid.dropna().astype(int)
links.movieid = links.movieid.dropna().astype(int)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [4]:
meta.tmdbid = pd.to_numeric(meta.tmdbid, errors="coerce")
meta = meta.dropna(subset=["tmdbid"])
meta = meta.merge(links[["movieid", "tmdbid"]], how="left")

#### map userid and movie_id to index

In [5]:
userid = ratings.userid.sort_values().drop_duplicates().reset_index(drop=True)
movieid = ratings.movieid.sort_values().drop_duplicates().reset_index(drop=True)
movieid.index = movieid.index + len(userid)

In [9]:
nodeid_userid = userid.to_dict()
nodeid_movieid = movieid.to_dict()

userid_nodeid = {v: k for k, v in nodeid_userid.items()}
movieid_nodeid = {v: k for k, v in nodeid_movieid.items()}

### create graphs

In [10]:
%%time
ddata = dd.from_pandas(ratings, npartitions=10)

def create_edge(x): 
    return (userid_nodeid[int(x.userid)], movieid_nodeid[int(x.movieid)], x.rating)

edges = ddata.map_partitions(lambda df: df.apply((lambda row: create_edge(row)), axis=1)).compute() 
edges = edges.tolist()

CPU times: user 1.01 s, sys: 19.9 ms, total: 1.03 s
Wall time: 1.02 s


In [11]:
%%time
G = nx.Graph(directed=False)
G.add_weighted_edges_from(edges)

CPU times: user 78.4 ms, sys: 4.37 ms, total: 82.8 ms
Wall time: 82.2 ms


In [12]:
print("number of nodes:", G.number_of_nodes())
print("number of edges:", G.number_of_edges())
cc = 2 * G.number_of_edges() / G.number_of_nodes()
print("average node degree:", cc)
print("density of network:", nx.density(G))

number of nodes: 9737
number of edges: 100004
average node degree: 20.541029064393552
density of network: 0.002109801670541655


### train a GCN to get embeddings

In [28]:
from utils import *

### get edge list

In [88]:
pos_edge_list = graph_to_edge_list(G)
pos_edge_index = edge_list_to_tensor(pos_edge_list)

print("The pos_edge_index tensor has shape {}".format(pos_edge_index.shape))
print("The pos_edge_index tensor has sum value {}".format(torch.sum(pos_edge_index)))

The pos_edge_index tensor has shape torch.Size([2, 100004])
The pos_edge_index tensor has sum value 370904711


In [89]:
pos_edge_index

tensor([[   0,    0,    0,  ...,  664,  664,  667],
        [ 701, 1504, 1530,  ...,  786, 4383, 5300]])

### sample negative edges

In [90]:
neg_edge_list = sample_negative_edges(G, len(pos_edge_list))
neg_edge_index = edge_list_to_tensor(neg_edge_list)
print("The neg_edge_index tensor has shape {}".format(neg_edge_index.shape))

The neg_edge_index tensor has shape torch.Size([2, 100004])


### create embeddings

In [91]:
# Please do not change / reset the random seed
torch.manual_seed(1)

emb = create_node_emb(num_node=G.number_of_nodes())
ids = torch.LongTensor([0, 3])

# Print the embedding layer
print("Embedding: {}".format(emb))

# An example that gets the embeddings for node 0 and 3
print(emb(ids))

Embedding: Embedding(9737, 16)
tensor([[0.4502, 0.5006, 0.5616, 0.0649, 0.4536, 0.5016, 0.0650, 0.7346, 0.4265,
         0.5439, 0.1151, 0.1710, 0.3025, 0.5056, 0.6294, 0.6973],
        [0.5920, 0.1875, 0.7108, 0.8519, 0.2776, 0.8310, 0.3669, 0.7839, 0.9917,
         0.7574, 0.1853, 0.2864, 0.4711, 0.6724, 0.8209, 0.9868]],
       grad_fn=<EmbeddingBackward0>)


### train embeddings

#### get labels

In [92]:
# Generate the positive and negative labels

pos_label = torch.ones(
    pos_edge_index.shape[1],
)
neg_label = torch.zeros(
    neg_edge_index.shape[1],
)

In [93]:
def get_weight_labels(G, t):
    length = pos_edge_index.size(1)
    res = [G.get_edge_data(*t[:, i].tolist())["weight"] for i in range(length)]
    return res

In [96]:
pos_label = torch.tensor(get_weight_labels(G, pos_edge_index), dtype=torch.float)

In [97]:
from torch.optim import SGD


def accuracy(pred, label):
    # TODO: Implement the accuracy function. This function takes the
    # pred tensor (the resulting tensor after sigmoid) and the label
    # tensor (torch.LongTensor). Predicted value greater than 0.5 will
    # be classified as label 1. Else it will be classified as label 0.
    # The returned accuracy should be rounded to 4 decimal places.
    # For example, accuracy 0.82956 will be rounded to 0.8296.

    accu = 0.0

    ############# Your code here ############
    pred_integer = (pred > 0.5).type(torch.LongTensor)
    accu = (label == pred_integer).sum() / torch.ones(label.shape).sum()
    accu = round(float(accu), 4)
    #########################################

    return accu


def train(emb, loss_fn, sigmoid, train_label, train_edge):

    epochs = 15000
    learning_rate = 0.5  # tuned the learning rate here
    # could increase learning rate further to achieve even lower loss
    # but overfitting the training data might not give us good embeddings

    optimizer = SGD(emb.parameters(), lr=learning_rate, momentum=0.9)

    for i in range(epochs):

        ############# Your code here ############

        optimizer.zero_grad()

        # gradient caluclated with all nodes (network is small)
        # otherwise I would have done it in mini batches
        embedded_nodes = emb(train_edge)
        pred = sigmoid(torch.mul(embedded_nodes[0], embedded_nodes[1]).sum(axis=1))
        loss = loss_fn(pred, train_label)

        loss.backward()
        optimizer.step()

        if i % 500 == 0:
            print(
                f"epoch {i}: loss is: {loss}, accuracy is {accuracy(pred, train_label)}"
            )

    #########################################


loss_fn = nn.BCELoss()
sigmoid = nn.Sigmoid()


# Concat positive and negative labels into one tensor
train_label = torch.cat([pos_label, neg_label], dim=0)

# Concat positive and negative edges into one tensor
# Since the network is very small, we do not split the edges into val/test sets
train_edge = torch.cat([pos_edge_index, neg_edge_index], dim=1)

train(emb, loss_fn, sigmoid, train_label, train_edge)

epoch 0: loss is: -3.0599710941314697, accuracy is 0.0166
epoch 500: loss is: -109.2962417602539, accuracy is 0.0166
epoch 1000: loss is: -113.22427368164062, accuracy is 0.0176


KeyboardInterrupt: 