# Introduction
This implements the iterative classification algorithm as described in slide 37 of http://web.stanford.edu/class/cs224w/slides/05-message.pdf  
It classifies a node based on its features as well as labels of neighbours

## Definitions
$v$: Node  
$Y_v$: Labels of node $v$  
$f_v$: feature vector of node $v$  
$z_v$: summary of labels of $v$'s neighbours (a vector)  
$\phi_1(f_v)$: predict node label based on node feature vector $f_v$  
$\phi_2(f_v, z_v)$: predict node label based on node feature vector $f_v$ and the summary $z_v$ of the labels of $v$'s neighbours

## Phase 1: Train a Classifier based on node attributes only
The classifier can be a linear classifier, a neural network classifier, etc. It is trained on the training set to predict the label of each node.

$\phi_1(f_v)$ : to predict $Y_v$ based on $f_v$  
$\phi_2(f_v, z_v)$ to predict $Y_v$ based on $f_v$ and summary $z_v$ of labels of $v$'s neighbours  
For vector $z_v$ of neighbourhood labels, let

- $I$ = incoming neighbour label info vector  
  $I_0$ = 1 if at least one of the incoming nodes is labelled 0.  
  $I_1$ = 1 if at least one of the incoming nodes is labelled 1.
- $O$ = outgoing neighbour label info vector  
  $O_0$ = 1 if at least one of the outgoing nodes is labelled 0.  
  $O_1$ = 1 if at least one of the outgoing nodes is labelled 1.

## Phase 2: Iterate till Convergence

On the test set, set the labels based on the classifier in Phase 1, then repeat the steps below until the labels converge.

## Step 1: Train Classifier

On a different training set, train two classifiers:

- node attribute vector only: $\phi_1$
- node attribute and link vectors: $\phi_2$

## Step 2: Apply Classifier to test set

On test set, use trained node feature vector classifier $\phi_1$ to set $Y_v$

## Step 3.1: Update relational vectors z

Update $z_v$ for all nodes on test set

## 3.2: Update Label

Reclassify all nodes with $\phi_2$

## Iterate

Continue until convergence

- update $z_v$
- update $Y_v = \phi_2(f_v, z_v)$

In [1]:
import pandas as pd
import networkx as nx
from collective.constants import get_summary_zv
from collective.Iterative import IterativeClassification

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

import sys
sys.path.insert(1, '../src')
import preprocess

In [79]:
# Load the raw node table and the edge list.
# keep_default_na=False stops pandas from turning strings such as "NA" or "" into NaN
# when reading the node table.
df_node = pd.read_csv('../data/unified_node_data.csv', keep_default_na=False)
df_edge = pd.read_csv('../data/max_edge_weights.csv')
# Project preprocessing helpers (module `preprocess`, imported after '../src' is put on sys.path).
# NOTE(review): presumably format -> keep nodes that occur in the edge list -> impute
# missing values; confirm against the preprocess module.
df_formatted = preprocess.nodes1(df_node)
df_clean = preprocess.nodes_filter(df_formatted, df_edge)
df_impute = preprocess.impute(df_clean)
X_train, X_val, X_test, y_train, y_val, y_test = preprocess.stratified_train_val_test_split(df_impute)
# Merge the validation split into the test split (features and targets alike).
X_test = pd.concat([X_val, X_test])
y_test = pd.concat([y_val, y_test])

## Further cleaning
Note that `confessed_assignments` and `num_confessed_assignments` must be dropped, as both indicate whether the student cheated; keeping them as features would leak the label.

In [80]:
def _attach_binary_label(features, targets):
    """Join ``features`` with ``targets`` and derive a binary ``label`` column.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature columns for one split.
    targets : pandas.DataFrame or pandas.Series
        Target column(s) for the same rows; after the column-wise join the frame
        must contain ``num_confessed_assignments`` and ``confessed_assignments``.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``label`` set to 1 where ``num_confessed_assignments``
        is greater than 0 and 0 otherwise, and with both confession columns
        removed so they cannot be used as features.
    """
    combined = pd.concat([features, targets], axis=1)
    # Vectorised equivalent of a per-row ``1 if x > 0 else 0``.
    combined['label'] = (combined['num_confessed_assignments'] > 0).astype(int)
    return combined.drop(columns=['num_confessed_assignments', 'confessed_assignments'])


# Apply the identical transformation to both splits.
X_train = _attach_binary_label(X_train, y_train)
X_test = _attach_binary_label(X_test, y_test)

In [81]:
# Start from an empty undirected graph and add one node per training row.
# Each node is keyed by the row's `name`; every other column becomes a node attribute.
network_graph = nx.Graph()
network_graph.add_nodes_from(
    (record['name'], record.drop(['name']).to_dict())
    for _, record in X_train.iterrows()
)

In [82]:
# NodeID1 of the first edge row whose NodeID2 is 'jessica_mack'.
df_edge.loc[df_edge['NodeID2'] == 'jessica_mack', 'NodeID1'].iloc[0]

'jessica_torres'

In [83]:
# Add one weighted, undirected edge per row of df_edge whose two endpoints are
# both already nodes of the graph; rows touching an unknown node are skipped.
# A single pass over the edge list is enough because the graph is undirected:
# there is no need to look each node up on both the NodeID1 and the NodeID2 side.
# NOTE(review): if the same pair of nodes occurs in more than one row, the last
# such row in df_edge decides the stored weight — confirm pairs are unique.
for source, target, weight in zip(
    df_edge['NodeID1'], df_edge['NodeID2'], df_edge['edge_weights']
):
    if source in network_graph and target in network_graph:
        network_graph.add_edge(source, target, edge_weight=weight)

## Creating a networkx graph using the edgelist and setting the edge weights

In [24]:
# Build an undirected graph whose nodes and edges come straight from the edge list.
# This rebinds `network_graph`, replacing whatever graph that name referred to before.
network_graph = nx.from_pandas_edgelist(df_edge, source="NodeID1", target="NodeID2")

In [25]:
# Index the edge list once: (NodeID1, NodeID2) -> every weight recorded for that
# ordered pair. This avoids rescanning the whole frame for each graph edge.
weights_by_pair = {}
for source, target, weight in zip(
    df_edge['NodeID1'], df_edge['NodeID2'], df_edge['edge_weights']
):
    weights_by_pair.setdefault((source, target), []).append(weight)

# Copy the weight onto each graph edge. The graph is undirected, so an edge may
# be reported in either orientation relative to its row in df_edge.
for edge in network_graph.edges:
    forward = weights_by_pair.get((edge[0], edge[1]), [])
    backward = weights_by_pair.get((edge[1], edge[0]), [])
    if len(forward) == 1:
        network_graph[edge[0]][edge[1]]['edge_weight'] = forward[0]
    elif len(backward) == 1:
        network_graph[edge[0]][edge[1]]['edge_weight'] = backward[0]
    else:
        # Neither orientation has exactly one row: report it instead of guessing.
        print("error: " + str(edge))

In [26]:
# Spot-check the weight stored on one edge.
network_graph.edges["joseph_trevino", "aaron_christian"]["edge_weight"]

0.0076399961214356875

# Setting the node attributes of the networkx graph using the node data

In [27]:
# One attribute dict per node name, taken from the training rows
# ({name: {column: value, ...}}), copied onto the matching graph nodes.
nx.set_node_attributes(
    network_graph,
    values=X_train.set_index("name").to_dict(orient="index"),
)

In [28]:
# Spot-check: display the attribute dict stored on one node.
network_graph.nodes["joseph_trevino"]

{'year_of_study': 1,
 'participation': 3.0000000000000004,
 'pe_percent': 0.2333333333333333,
 'finals_percent': 0.4,
 'midterms_percent': 0.12853934181648263,
 'afast': 0,
 'level_min_max': 0.7600000000000001,
 'exp_min_max': 0.6643572276410178,
 't01_exp': 240,
 't02_exp': 260,
 't03_exp': 240,
 't04_exp': 250,
 't05_exp': 225,
 't06_exp': 225,
 't07_exp': 250,
 't08_exp': 250,
 't09_exp': 275,
 't10_exp': 250,
 'num_videos': 1,
 'avg_videos_completion': 0.0,
 'batch_1821': 1,
 'batch_1935': 0,
 'batch_2023': 0,
 'major_-': 0,
 'major_Business Analytics': 0,
 'major_Chemistry': 0,
 'major_Computational Biology': 0,
 'major_Data Science and Analytics': 0,
 'major_Faculty of Arts & Social Sci': 0,
 'major_Faculty of Engineering': 0,
 'major_Faculty of Law': 0,
 'major_Faculty of Science': 0,
 'major_Life Sciences': 0,
 'major_Math/Applied Math': 0,
 'major_NUS Business School': 0,
 'major_Pharmacy': 0,
 'major_Physics': 0,
 'major_Quantitative Finance': 0,
 'major_School of Computing':

# Gets L1_max, L0_max, L1_mean, L0_mean

In [29]:
# Compute the neighbour-label summary features on every node (project helper).
# NOTE(review): the recorded run of this cell raised KeyError: 'label'. The graph is
# built from df_edge but node attributes are only set from X_train, so nodes that are
# absent from X_train presumably carry no 'label' — confirm against get_summary_zv
# and restrict the graph or supply labels for those nodes before calling it.
network_graph = get_summary_zv(network_graph)

KeyError: 'label'

In [None]:
# Flatten the graph into a DataFrame: one row per node, one column per node
# attribute, indexed by node id. Records are collected in a list and the frame
# is built once, rather than growing a DataFrame with pd.concat on every iteration.
node_records = []
for node in network_graph.nodes:
    # Side effect: the node id is also stored on the node under the 'index' attribute.
    network_graph.nodes[node]['index'] = node
    node_records.append(dict(network_graph.nodes[node]))

df = pd.DataFrame(node_records)
# An empty graph yields a frame with no columns, so there is no 'index' to promote.
if 'index' in df.columns:
    df = df.set_index('index')
df.head()

In [None]:
# List the columns of the node-level frame built from the graph attributes.
df.columns

# Model 1: Logistic Regression without L1_max, L0_max, L1_mean, L0_mean

In [None]:
# Fixed seed so that a fresh Restart & Run All reproduces the same split and
# therefore the same reported accuracies.
SPLIT_SEED = 42
# Neighbour-label summary columns; model1 is the attribute-only classifier and
# must not see them.
NEIGHBOUR_SUMMARY_COLUMNS = ['L1_max', 'L0_max', 'L1_mean', 'L0_mean']

print("Doing train-test-split")
train, test = train_test_split(df, test_size=0.2, random_state=SPLIT_SEED)

# model1
print("Training model1")
# Everything model1 may not use as a feature: the summary columns and the target.
model1_excluded = NEIGHBOUR_SUMMARY_COLUMNS + ['label']
train_x_model1 = train.drop(columns=model1_excluded)
train_y_model1 = train['label']
test_x_model1 = test.drop(columns=model1_excluded)
test_y_model1 = test['label']
model1 = LogisticRegression(max_iter=10000)
model1.fit(train_x_model1, train_y_model1)
y_pred1 = model1.predict(test_x_model1)

print(accuracy_score(test_y_model1.to_numpy(), y_pred1))

# Model 2: Logistic Regression with L1_max, L0_max, L1_mean, L0_mean

In [None]:
print("Training model2")
# model2 is trained on every column except the target, so the neighbour summary
# columns are part of its feature set.
target_column = 'label'
train_x_model2 = train.drop(columns=[target_column])
train_y_model2 = train[target_column]
test_x_model2 = test.drop(columns=[target_column])
test_y_model2 = test[target_column]

# fit() returns the estimator itself, so construction and training can be chained.
model2 = LogisticRegression(max_iter=10000).fit(train_x_model2, train_y_model2)
y_pred2 = model2.predict(test_x_model2)

model2_accuracy = accuracy_score(test_y_model2.to_numpy(), y_pred2)
print(model2_accuracy)

# Iterative Classification

In [None]:
print("Iterative classification")
# NOTE(review): presumably model1 (attributes only) bootstraps the labels and model2
# (attributes + neighbour summary) relabels the nodes for at most max_iterations
# rounds — confirm in collective.Iterative.
ic = IterativeClassification(max_iterations=5)
# The result is read in the next cell as a graph whose nodes carry the predicted 'label'.
new_gnx = ic.predict(network_graph, model1, model2)

In [None]:
# Collect the label predicted by iterative classification for every held-out node.
# The frame is built in one go (column 'label', index named 'index') instead of
# concatenating a one-row DataFrame per node.
test_nodes = list(test['label'].index)
new_gnx_pred = pd.DataFrame(
    # Each node's 'label' is array-like; its first element is the predicted class.
    {'label': [new_gnx.nodes[node]['label'][0] for node in test_nodes]},
    index=pd.Index(test_nodes, name='index'),
)
# Score against the same held-out targets used for model2, passing the predictions
# as a 1-D array so y_true and y_pred have matching shapes.
print(accuracy_score(test_y_model2.to_numpy(), new_gnx_pred['label'].to_numpy()))