In [47]:
%load_ext autoreload
%autoreload 2

import os
from pathlib import Path
while Path.cwd().name != 'bayesian_beats_cheats':
    os.chdir(Path.cwd().parent)
    
import networkx as nx
import matplotlib.pyplot as plt
from tqdm import tqdm
from imblearn.over_sampling import SMOTE, ADASYN
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score, confusion_matrix
# from tensorboardX import SummaryWriter
from sklearn.manifold import TSNE
import seaborn as sns

######## TORCH ###########
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
import torch_geometric.transforms as T
from torch_geometric.data import Data
from torch.utils.tensorboard import SummaryWriter


######## Jon's code #########
from src import preprocess
from src.visualization import make_confusion_matrix

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [11]:
# default `log_dir` is "runs" - we'll be more specific here
writer = SummaryWriter('runs/gnn_experiment_1')

In [48]:
import time
from datetime import datetime

In [13]:
!tensorboard --logdir=runs

Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.4.1 at http://localhost:6006/ (Press CTRL+C to quit)


## Getting mappings

In [134]:
x = sorted(list(set(df_edge["NodeID1"].tolist() + df_edge["NodeID2"].tolist())))

In [138]:
df_node = df_node.sort_values(by='name').reset_index(drop=True)

In [151]:
id2name = dict(zip(df_node.index, df_node["name"].values))

In [153]:
name2id = dict(zip(df_node["name"].values, df_node.index))

In [49]:
df_node = pd.read_csv('data/imputed_unified_node_data.csv', keep_default_na=False)
df_edge = pd.read_csv('data/uniq_lines_edge_weights.csv') # change to anything else later.

# combine train and val together to train
X_train, X_val, X_test, y_train, y_val, y_test = preprocess.stratified_train_val_test_split(df_node)

In [54]:
usable_cols = ['name', 'year_of_study', 'participation', 'pe_percent',
       'finals_percent', 'midterms_percent', 'afast', 'level_min_max',
       'exp_min_max', 'num_videos',
       'avg_videos_completion', 'batch_1821', 'batch_1935', 'batch_2023',
       'major_-', 'major_Business Analytics', 'major_Chemistry',
       'major_Computational Biology', 'major_Data Science and Analytics',
       'major_Faculty of Arts & Social Sci', 'major_Faculty of Engineering',
       'major_Faculty of Law', 'major_Faculty of Science',
       'major_Life Sciences', 'major_Math/Applied Math',
       'major_NUS Business School', 'major_Pharmacy', 'major_Physics',
       'major_Quantitative Finance', 'major_School of Computing',
       'major_School of Design & Environment', 'major_Statistics',
       'major_Yong Loo Lin School (Medicine)']

In [55]:
X_train_kh = X_train[usable_cols]

In [72]:
num_cheats = sum(y_train > 0)
label_ratio = [len(y_train) - num_cheats, num_cheats]

In [75]:
label_ratio
normWeights = [1 - (x/ sum(label_ratio)) for x in label_ratio]
normWeights

[0.14245810055865926, 0.8575418994413408]

In [None]:
# Consider standardising edge weights DONE
# Consider dropping some cols, DONE
# get train, test, val mask, nodeidmapping, check.
# Whether to standardise one-hot or not

# Consider standardising edge weights.
# Try GraphConv, and other layers, pass edge_weights
# use weighed loss, NLL loss, different dropout shit.
# 

In [None]:
# Get Torch data specs
# Run model for 50 epochs, see how it goes, 

# to try:
# standardise edge weights
# weights or without in loss
# different dropout, different layernorms.
# add lin layers?

## Torch Geometric Data API

In [89]:
X_train_kh.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 716 entries, 696 to 148
Data columns (total 33 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   name                                  716 non-null    object 
 1   year_of_study                         716 non-null    int64  
 2   participation                         716 non-null    float64
 3   pe_percent                            716 non-null    float64
 4   finals_percent                        716 non-null    float64
 5   midterms_percent                      716 non-null    float64
 6   afast                                 716 non-null    int64  
 7   level_min_max                         716 non-null    float64
 8   exp_min_max                           716 non-null    float64
 9   num_videos                            716 non-null    int64  
 10  avg_videos_completion                 716 non-null    float64
 11  batch_1821       

In [79]:
data_features = torch.tensor(X_train_kh.values)

TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, complex64, complex128, int64, int32, int16, int8, uint8, and bool.

## SAGE and GCNNs

### Customized Layer

In [None]:
import torch_geometric.nn as pyg_nn
import torch_geometric.utils as pyg_utils

In [None]:
class CustomConv(pyg_nn.MessagePassing):
    def __init__(self, in_channels, out_channels):
        super(CustomConv, self).__init__(aggr='add')  # "Add" aggregation.
        self.lin = nn.Linear(in_channels, out_channels)
        self.lin_self = nn.Linear(in_channels, out_channels)

    def forward(self, x, edge_index):
        # x has shape [N, in_channels] # number of nodes, 
        # edge_index has shape [2, E] # every edge 2 nodes, number of edges

        # Add self-loops to the adjacency matrix.
        # add_self_loops(edge_index, num_nodes = x.size(0))
        edge_index, _ = pyg_utils.remove_self_loops(edge_index)

        # Transform node feature matrix.
        self_x = self.lin_self(x)
        #x = self.lin(x)
        
        # return self.propagate(edge_index, size=(x.size(0), x.size(0)), x=x)
        return self_x + self.propagate(edge_index, size=(x.size(0), x.size(0)), x=self.lin(x))

    def message(self, x_i, x_j, edge_index, size):
        # Compute messages
        # x_j has shape [E, out_channels]

        row, col = edge_index
        deg = pyg_utils.degree(row, size[0], dtype=x_j.dtype)
        deg_inv_sqrt = deg.pow(-0.5)
        norm = deg_inv_sqrt[row] * deg_inv_sqrt[col]

        return x_j

    def update(self, aggr_out):
        # aggr_out has shape [N, out_channels]
        # GRAPHSAGE, normalize after message passing.
        return aggr_out

In [None]:
# GCNNs
from torch_geometric.nn import GCNConv

class GCN(torch.nn.Module):
    def __init__(self, hidden_channels):
        super(GCN, self).__init__()
        torch.manual_seed(12345)
        self.conv1 = GCNConv(dataset.num_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, dataset.num_classes)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.conv2(x, edge_index)
        return x

model = GCN(hidden_channels=16)
print(model)