In [None]:
import numpy as np
import os
import networkx as nx
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from numpy import dot
from numpy.linalg import norm
import random

from collections import Counter
import matplotlib.pyplot as plt


In [None]:
# Unpack the Cora archive; the cells below read the extracted files from ./cora.
!tar -xvf 'cora.tgz'

cora/
cora/README
cora/cora.cites
cora/cora.content


In [None]:
# Raw text lines of the dataset files found under ./cora:
#   all_data  <- lines of every *.content file (parsed into node records further down)
#   all_edges <- lines of every *.cites file   (parsed into edges further down)
all_data = []
all_edges = []

for root, dirs, files in os.walk('./cora'):
    # Sort in place so the traversal order (and therefore the row order of the
    # parsed data) does not depend on the file system.
    dirs.sort()
    for file in sorted(files):
        # Match on the file extension rather than on an arbitrary substring.
        if file.endswith('.content'):
            target = all_data
        elif file.endswith('.cites'):
            target = all_edges
        else:
            continue  # anything else (e.g. the README) is not data
        with open(os.path.join(root, file), 'r') as f:
            # Drop blank lines so the parsers never receive an empty record.
            target.extend(line for line in f.read().splitlines() if line.strip())

In [None]:
# Class names used as the last field of every node record.
categories = ['Reinforcement_Learning', 'Theory', 'Case_Based', 'Genetic_Algorithms', 'Probabilistic_Methods', 'Neural_Networks', 'Rule_Learning']
# Map each class name to an integer id; ids follow alphabetical order of the names.
label_encoder = {cat: idx for idx, cat in enumerate(sorted(categories))}
label_encoder


{'Case_Based': 0,
 'Genetic_Algorithms': 1,
 'Neural_Networks': 2,
 'Probabilistic_Methods': 3,
 'Reinforcement_Learning': 4,
 'Rule_Learning': 5,
 'Theory': 6}

In [None]:
# --- Parse the node records ---------------------------------------------
# Every line is tab-separated: first field = node id, last field = class name,
# everything in between = feature values.
labels = []
nodes = []
X = []
element_to_ind = {}  # node id (as read from the file) -> row position

for row_idx, record in enumerate(all_data):
    fields = record.split('\t')
    node_id = fields[0]
    nodes.append(node_id)
    X.append(fields[1:-1])
    labels.append(label_encoder[fields[-1]])
    element_to_ind[node_id] = row_idx

X = np.array(X, dtype=int)
N, F = X.shape[0], X.shape[1]  # number of nodes, size of node features
print('X shape: ', X.shape)


# --- Parse the edges ----------------------------------------------------
# Every line holds two tab-separated node ids.
edge_list = [
    (endpoints[0], endpoints[1])
    for endpoints in (line.split('\t') for line in all_edges)
]

print('\nNumber of nodes (N): ', N)
print('\nNumber of features (F) of each node: ', F)
print('\nCategories: ', set(labels))

num_classes = len(set(labels))
print('\nNumber of classes: ', num_classes)


X shape:  (2708, 1433)

Number of nodes (N):  2708

Number of features (F) of each node:  1433

Categories:  {0, 1, 2, 3, 4, 5, 6}

Number of classes:  7


In [None]:
# Build the undirected graph on the node ids read from the files, then relabel
# every node to its row position so that graph ids line up with the rows of X.
G = nx.Graph()
G.add_nodes_from(nodes)
G.add_edges_from(edge_list)
G = nx.relabel_nodes(G, element_to_ind)
# nx.info() no longer exists in NetworkX 3.x; build the same one-line summary
# from the graph's own counters so the cell runs on old and new versions alike.
print('Graph info: ', f'Graph with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges')

Graph info:  Graph with 2708 nodes and 5278 edges


In [None]:
# Replace the raw id list with the relabelled node ids, in graph iteration order.
nodes = [node_id for node_id in G.nodes]
print(len(nodes))
# Sanity check: show the neighbourhood of node 0.
[neighbor for neighbor in G.neighbors(0)]

2708


[258, 544, 8, 435, 14]

In [None]:
# One row per node: graph node id, encoded class label and feature vector.
node_records = list(zip(nodes, labels, X))
df = pd.DataFrame(node_records, columns=['node', 'label', 'features'])
print(len(df))
df.head()

2708


Unnamed: 0,node,label,features
0,0,2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,1,5,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..."
2,2,4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,3,4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,4,3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [None]:
# Connected components ordered from largest to smallest.
Gcc = list(nx.connected_components(G))
Gcc.sort(key=len, reverse=True)
# From here on G is restricted to the largest component.
largest_component = Gcc[0]
G = G.subgraph(largest_component)
gcc_nodes = [node_id for node_id in G.nodes]

In [None]:
# Keep only the rows whose node belongs to the largest connected component.
# .copy() makes the result an independent frame, so the column assignment below
# no longer triggers pandas' SettingWithCopyWarning.
# The index is deliberately NOT reset: it keeps the original row positions, and the
# later nx.relabel_nodes(G, df['node']) call reads this column as an
# index -> new-id mapping.
df = df.loc[df['node'].isin(gcc_nodes)].copy()
df['node'] = list(range(len(df)))  # rename nodes to 0 .. len(df)-1
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,node,label,features
0,0,2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,1,5,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..."
2,2,4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,3,4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,4,3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [None]:
# Sanity check: (rows, columns) of df after restricting it to the largest component.
print(df.shape)

(2485, 3)


In [None]:
# Draw the same number of labelled seed nodes from every class. A fixed
# random_state makes the draw (and every result that depends on it) repeatable
# across kernel restarts.
SEEDS_PER_CLASS = 20
SPLIT_SEED = 42
train = df.groupby('label', group_keys=False).sample(n=SEEDS_PER_CLASS, random_state=SPLIT_SEED)

# df's index holds the node ids G currently uses; df['node'] holds the compact
# 0 .. len(df)-1 ids. Pass an explicit dict so relabel_nodes gets a real mapping.
G = nx.relabel_nodes(G, df['node'].to_dict())

In [None]:
def create_transition_matrix(g):
    """Build the row-normalised random-walk transition matrix of a graph.

    Entry ``[i, j]`` is the adjacency weight between the i-th and j-th node
    (in ``g.nodes`` iteration order) divided by the sum of row ``i``, i.e. the
    probability of stepping from node i to node j.

    Parameters
    ----------
    g : networkx graph
        Graph whose adjacency structure defines the walk.

    Returns
    -------
    numpy.ndarray
        Dense ``(n, n)`` float array. Every row with at least one non-zero
        adjacency entry sums to 1; rows of nodes without edges are left as all
        zeros instead of NaN.
    """
    # Work on a dense ndarray: dividing a SciPy sparse container by its row sums
    # behaves differently for sparse *matrices* and sparse *arrays* (the latter
    # return a 1-D sum that would broadcast over columns, not rows).
    adj = nx.to_numpy_array(g, dtype=float)
    # keepdims gives an (n, 1) column so the division is applied row by row.
    row_sums = adj.sum(axis=1, keepdims=True)
    return np.divide(adj, row_sums, out=np.zeros_like(adj), where=row_sums != 0)

In [None]:
def random_walk(g, num_steps, start_node, transition_matrix = None):
  """Simulate a random walk and return the node it ends on.

  Parameters
  ----------
  g : graph object
      Only ``g.number_of_nodes()`` is used here; the graph is also handed to
      ``create_transition_matrix`` when no matrix is supplied.
  num_steps : int
      Number of steps to take. With 0 steps the start node is returned.
  start_node : int
      Row index of the node the walk starts from.
  transition_matrix : array-like, optional
      Row ``v`` must be a probability distribution over the next node.
      Built from ``g`` when omitted.

  Returns
  -------
  int
      Index of the node reached after ``num_steps`` steps.
  """
  if transition_matrix is None:
    transition_matrix = create_transition_matrix(g)

  vcount = g.number_of_nodes()  # loop-invariant, so computed once
  v = int(start_node)
  for _ in range(num_steps):
    # asarray + ravel flattens both a plain ndarray row and a (1, n) np.matrix row.
    pmf = np.asarray(transition_matrix[v]).ravel()
    # Draw a scalar (no size argument) so v stays a plain int; calling int() on a
    # 1-element array is deprecated in recent NumPy releases.
    v = int(np.random.choice(vcount, p=pmf))

  return v

In [None]:
# Seed (training) node ids grouped by class label; these are the teleport targets.
# Iterate over num_classes (derived from the parsed labels) instead of a literal 7.
seeds_dict = {label: list(train.loc[train['label'] == label, 'node']) for label in range(num_classes)}

def random_walk_with_teleportation(g, num_steps, start_node,tp,predicted, transition_matrix = None, seeds = None):
  """Simulate a random walk that may jump back to a seed node of its own class.

  At every step the walk teleports with probability ``tp`` to a uniformly chosen
  node from ``seeds[predicted]``; otherwise it takes one ordinary step according
  to row ``v`` of the transition matrix.

  Parameters
  ----------
  g : graph object
      Only ``g.number_of_nodes()`` is used here; the graph is also handed to
      ``create_transition_matrix`` when no matrix is supplied.
  num_steps : int
      Number of steps to take. With 0 steps the start node is returned.
  start_node : int
      Row index of the node the walk starts from.
  tp : float
      Teleportation probability. 0 never teleports, 1 always teleports.
  predicted : hashable
      Class label whose seed nodes are the teleport targets.
  transition_matrix : array-like, optional
      Row ``v`` must be a probability distribution over the next node.
      Built from ``g`` when omitted.
  seeds : mapping, optional
      Class label -> sequence of seed node indices. Defaults to the
      notebook-level ``seeds_dict`` so existing calls keep working.

  Returns
  -------
  int
      Index of the node reached after ``num_steps`` steps.
  """
  if transition_matrix is None:
    transition_matrix = create_transition_matrix(g)
  if seeds is None:
    seeds = seeds_dict

  vcount = g.number_of_nodes()  # loop-invariant, so computed once
  v = int(start_node)
  for _ in range(num_steps):
    # Strict '<' so that tp == 0 can never teleport, even if the draw is exactly 0.0.
    if random.uniform(0, 1) < tp:
      v = int(np.random.choice(seeds[predicted]))
    else:
      # asarray + ravel flattens both a plain ndarray row and a (1, n) np.matrix row.
      pmf = np.asarray(transition_matrix[v]).ravel()
      # Scalar draw keeps v a plain int; int() on a 1-element array is deprecated
      # in recent NumPy releases.
      v = int(np.random.choice(vcount, p=pmf))

  return v


Part a - Teleportation probability = 0

In [None]:
# Random-walk label propagation: NO teleportation, NO feature-based edge weights.
transition_matrix = create_transition_matrix(G)

num_samples = 1000     # walks launched from every seed node
num_walk_steps = 100   # steps per walk

node_count = transition_matrix.shape[0]
# visiting_freq_label[v][c]: number of walks started at a class-c seed that ended on node v
visiting_freq_label = [[0] * 7 for _ in range(node_count)]
# visiting_freq[v]: number of walks that ended on node v, whatever their class
visiting_freq = [0] * node_count

for train_node, predicted in zip(train['node'], train['label']):
    for _ in range(num_samples):
        end_node = random_walk(G, num_walk_steps, train_node, transition_matrix)
        visiting_freq_label[end_node][predicted] += 1
        visiting_freq[end_node] += 1

In [None]:
# Number of nodes on which no walk ended.
count = sum(1 for vf in visiting_freq if vf == 0)
print('unvisited = ', count)
# Predict, for every node, the class whose seeds ended on it most often.
visiting_freq_label = np.asarray(visiting_freq_label)
preds = visiting_freq_label.argmax(axis=1)
print(classification_report(df['label'], preds))
accuracy_score(df['label'], preds)

unvisited =  0
              precision    recall  f1-score   support

           0       0.29      0.49      0.36       285
           1       0.47      0.53      0.50       406
           2       0.52      0.24      0.33       726
           3       0.58      0.51      0.54       379
           4       0.09      0.13      0.11       214
           5       0.09      0.19      0.12       131
           6       0.28      0.24      0.26       344

    accuracy                           0.34      2485
   macro avg       0.33      0.33      0.32      2485
weighted avg       0.40      0.34      0.35      2485



0.344466800804829

Part a - Teleportation probability = 0.1

In [None]:
# Random-walk label propagation WITH teleportation (probability 0.1), no feature-based edge weights.
transition_matrix = create_transition_matrix(G)

num_samples = 1000     # walks launched from every seed node
num_walk_steps = 100   # steps per walk

node_count = transition_matrix.shape[0]
# visiting_freq_label_a2[v][c]: number of walks started at a class-c seed that ended on node v
visiting_freq_label_a2 = [[0] * 7 for _ in range(node_count)]
# visiting_freq_a2[v]: number of walks that ended on node v, whatever their class
visiting_freq_a2 = [0] * node_count

for train_node, predicted in zip(train['node'], train['label']):
    for _ in range(num_samples):
        end_node = random_walk_with_teleportation(G, num_walk_steps, train_node, 0.1, predicted, transition_matrix)
        visiting_freq_label_a2[end_node][predicted] += 1
        visiting_freq_a2[end_node] += 1


In [None]:
# Number of nodes on which no walk of THIS experiment ended.
count_a2 = sum(1 for vf in visiting_freq_a2 if vf == 0)
# Report this experiment's own counter (not the `count` left over from an earlier cell).
print('unvisited = ', count_a2)
# Predict, for every node, the class whose seeds ended on it most often.
# argmax returns index 0 for an all-zero row, so unvisited nodes default to class 0.
visiting_freq_label_a2 = np.asarray(visiting_freq_label_a2)
preds_a2 = np.argmax(visiting_freq_label_a2, axis=1)
print(classification_report(df['label'], preds_a2))
accuracy_score(df['label'], preds_a2)

unvisited =  0
              precision    recall  f1-score   support

           0       0.76      0.64      0.69       285
           1       0.82      0.92      0.87       406
           2       0.81      0.65      0.72       726
           3       0.87      0.75      0.80       379
           4       0.62      0.83      0.71       214
           5       0.43      0.91      0.59       131
           6       0.65      0.61      0.63       344

    accuracy                           0.73      2485
   macro avg       0.71      0.76      0.72      2485
weighted avg       0.76      0.73      0.73      2485



0.7319919517102615

Part a - Teleportation probability = 0.2

In [None]:
# Random-walk label propagation WITH teleportation (probability 0.2), no feature-based edge weights.
transition_matrix = create_transition_matrix(G)

num_samples = 1000     # walks launched from every seed node
num_walk_steps = 100   # steps per walk

node_count = transition_matrix.shape[0]
# visiting_freq_label_a3[v][c]: number of walks started at a class-c seed that ended on node v
visiting_freq_label_a3 = [[0] * 7 for _ in range(node_count)]
# visiting_freq_a3[v]: number of walks that ended on node v, whatever their class
visiting_freq_a3 = [0] * node_count

for train_node, predicted in zip(train['node'], train['label']):
    for _ in range(num_samples):
        end_node = random_walk_with_teleportation(G, num_walk_steps, train_node, 0.2, predicted, transition_matrix)
        visiting_freq_label_a3[end_node][predicted] += 1
        visiting_freq_a3[end_node] += 1

In [None]:
# Number of nodes on which no walk of THIS experiment ended.
count_a3 = sum(1 for vf in visiting_freq_a3 if vf == 0)
# Report this experiment's own counter (not the `count` left over from an earlier cell).
print('unvisited = ', count_a3)
# Predict, for every node, the class whose seeds ended on it most often.
# argmax returns index 0 for an all-zero row, so unvisited nodes default to class 0.
visiting_freq_label_a3 = np.asarray(visiting_freq_label_a3)
preds_a3 = np.argmax(visiting_freq_label_a3, axis=1)
print(classification_report(df['label'], preds_a3))
accuracy_score(df['label'], preds_a3)

unvisited =  0
              precision    recall  f1-score   support

           0       0.60      0.65      0.62       285
           1       0.87      0.91      0.89       406
           2       0.80      0.63      0.71       726
           3       0.88      0.72      0.79       379
           4       0.61      0.81      0.70       214
           5       0.41      0.87      0.56       131
           6       0.67      0.60      0.63       344

    accuracy                           0.72      2485
   macro avg       0.69      0.74      0.70      2485
weighted avg       0.75      0.72      0.72      2485



0.7154929577464789

Part b - TFIDF without teleportation

In [None]:
# Transition matrix whose edge weights depend on feature similarity:
# weight(i, j) = exp(cosine_similarity(features_i, features_j)) for every edge (i, j),
# followed by row normalisation.
# NOTE(review): the section title says TFIDF, but the vectors used here are taken
# from df['features'] as-is; no TF-IDF re-weighting is applied in this cell.
vs = list(G.nodes)
n = len(vs)

# Materialise the feature vectors and their norms once instead of rebuilding
# list(df['features']) for every node pair.
features = np.array(df['features'].tolist(), dtype=float)
feature_norms = norm(features, axis=1)

transition = np.zeros((n, n))
# Visit each edge once rather than testing all n*n node pairs for adjacency.
# Node ids double as row/column indices, exactly as in the pairwise version.
for i, j in G.edges:
  denom = feature_norms[i] * feature_norms[j]
  # An all-zero feature vector has no direction; treat its similarity as 0
  # instead of producing NaN from 0/0.
  cos_sim = (features[i] @ features[j]) / denom if denom > 0 else 0.0
  weight = np.exp(cos_sim)
  # The graph is undirected and cosine similarity is symmetric, so fill both directions.
  transition[i, j] = weight
  transition[j, i] = weight

transition /= np.sum(transition, axis=1, keepdims=True)

In [None]:
# Random-walk label propagation on the similarity-weighted matrix, no teleportation.
transition_matrix = transition

num_samples = 1000     # walks launched from every seed node
num_walk_steps = 100   # steps per walk

node_count = transition_matrix.shape[0]
# visiting_freq_label[v][c]: number of walks started at a class-c seed that ended on node v
visiting_freq_label = [[0] * 7 for _ in range(node_count)]
# visiting_freq[v]: number of walks that ended on node v, whatever their class
visiting_freq = [0] * node_count

for train_node, predicted in zip(train['node'], train['label']):
    for _ in range(num_samples):
        end_node = random_walk(G, num_walk_steps, train_node, transition)
        visiting_freq_label[end_node][predicted] += 1
        visiting_freq[end_node] += 1

# Number of nodes on which no walk ended.
count = sum(1 for vf in visiting_freq if vf == 0)
print('unvisited = ', count)
# Predict, for every node, the class whose seeds ended on it most often.
visiting_freq_label = np.asarray(visiting_freq_label)
preds = visiting_freq_label.argmax(axis=1)
print(classification_report(df['label'], preds))
accuracy_score(df['label'], preds)

unvisited =  0
              precision    recall  f1-score   support

           0       0.33      0.54      0.41       285
           1       0.50      0.59      0.54       406
           2       0.55      0.27      0.36       726
           3       0.64      0.57      0.60       379
           4       0.12      0.17      0.14       214
           5       0.08      0.17      0.11       131
           6       0.29      0.23      0.26       344

    accuracy                           0.38      2485
   macro avg       0.36      0.36      0.35      2485
weighted avg       0.43      0.38      0.39      2485



0.37987927565392354

Part b - TFIDF with teleportation probability = 0.1

In [None]:
# Random-walk label propagation WITH teleportation (probability 0.1) on the
# similarity-weighted transition matrix.
# Use that matrix consistently: it is no longer overwritten with the unweighted
# one from create_transition_matrix(G) right after being assigned.
transition_matrix = transition

num_samples = 1000     # walks launched from every seed node
num_walk_steps = 100   # steps per walk

node_count = transition_matrix.shape[0]
# visiting_freq_label[v][c]: number of walks started at a class-c seed that ended on node v
visiting_freq_label = [[0] * 7 for _ in range(node_count)]
# visiting_freq[v]: number of walks that ended on node v, whatever their class
visiting_freq = [0] * node_count

for train_node, predicted in zip(train['node'], train['label']):
    for _ in range(num_samples):
        end_node = random_walk_with_teleportation(G, num_walk_steps, train_node, 0.1, predicted, transition_matrix)
        visiting_freq_label[end_node][predicted] += 1
        visiting_freq[end_node] += 1

# Number of nodes on which no walk ended.
count = sum(1 for vf in visiting_freq if vf == 0)
print('unvisited = ', count)
# Predict, for every node, the class whose seeds ended on it most often.
# argmax returns index 0 for an all-zero row, so unvisited nodes default to class 0.
visiting_freq_label = np.asarray(visiting_freq_label)
preds = np.argmax(visiting_freq_label, axis=1)
print(classification_report(df['label'], preds))
accuracy_score(df['label'], preds)

unvisited =  9
              precision    recall  f1-score   support

           0       0.69      0.72      0.70       285
           1       0.86      0.90      0.88       406
           2       0.88      0.65      0.75       726
           3       0.79      0.81      0.80       379
           4       0.65      0.83      0.73       214
           5       0.48      0.90      0.62       131
           6       0.60      0.54      0.57       344

    accuracy                           0.74      2485
   macro avg       0.71      0.77      0.72      2485
weighted avg       0.76      0.74      0.74      2485



0.7372233400402415

Part b - TFIDF with teleportation probability = 0.2

In [None]:
# Random-walk label propagation WITH teleportation (probability 0.2) on the
# similarity-weighted transition matrix.
# Use that matrix consistently: it is no longer overwritten with the unweighted
# one from create_transition_matrix(G) right after being assigned.
transition_matrix = transition

num_samples = 1000     # walks launched from every seed node
num_walk_steps = 100   # steps per walk

node_count = transition_matrix.shape[0]
# visiting_freq_label[v][c]: number of walks started at a class-c seed that ended on node v
visiting_freq_label = [[0] * 7 for _ in range(node_count)]
# visiting_freq[v]: number of walks that ended on node v, whatever their class
visiting_freq = [0] * node_count

for train_node, predicted in zip(train['node'], train['label']):
    for _ in range(num_samples):
        end_node = random_walk_with_teleportation(G, num_walk_steps, train_node, 0.2, predicted, transition_matrix)
        visiting_freq_label[end_node][predicted] += 1
        visiting_freq[end_node] += 1

# Number of nodes on which no walk ended.
count = sum(1 for vf in visiting_freq if vf == 0)
print('unvisited = ', count)
# Predict, for every node, the class whose seeds ended on it most often.
# argmax returns index 0 for an all-zero row, so unvisited nodes default to class 0.
visiting_freq_label = np.asarray(visiting_freq_label)
preds = np.argmax(visiting_freq_label, axis=1)
print(classification_report(df['label'], preds))
accuracy_score(df['label'], preds)

unvisited =  48
              precision    recall  f1-score   support

           0       0.58      0.77      0.66       285
           1       0.87      0.90      0.89       406
           2       0.82      0.58      0.68       726
           3       0.79      0.69      0.74       379
           4       0.58      0.83      0.68       214
           5       0.50      0.87      0.64       131
           6       0.69      0.62      0.65       344

    accuracy                           0.71      2485
   macro avg       0.69      0.75      0.71      2485
weighted avg       0.74      0.71      0.72      2485



0.7142857142857143