In [54]:
import torch #The torch package contains data structures for multi-dimensional tensors and mathematical operations over these are defined.
import torchvision #The torchvision package consists of popular datasets, model architectures, and common image transformations for computer vision.
import torch.nn as nn
import numpy as np
import torchvision.transforms as transforms
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.nn import SAGEConv
from torch_geometric.nn import GATConv
import pandas as pd

In [2]:
edge_dataset=pd.read_csv('large_twitch_edges.csv')

In [3]:
features_dataset=pd.read_csv('large_twitch_features.csv')

In [4]:
edge_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6797557 entries, 0 to 6797556
Data columns (total 2 columns):
 #   Column        Dtype
---  ------        -----
 0   numeric_id_1  int64
 1   numeric_id_2  int64
dtypes: int64(2)
memory usage: 103.7 MB


In [5]:
edge_dataset.head()

Unnamed: 0,numeric_id_1,numeric_id_2
0,98343,141493
1,98343,58736
2,98343,140703
3,98343,151401
4,98343,157118


In [6]:
features_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 168114 entries, 0 to 168113
Data columns (total 9 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   views         168114 non-null  int64 
 1   mature        168114 non-null  int64 
 2   life_time     168114 non-null  int64 
 3   created_at    168114 non-null  object
 4   updated_at    168114 non-null  object
 5   numeric_id    168114 non-null  int64 
 6   dead_account  168114 non-null  int64 
 7   language      168114 non-null  object
 8   affiliate     168114 non-null  int64 
dtypes: int64(6), object(3)
memory usage: 11.5+ MB


In [7]:
features_dataset.head()

Unnamed: 0,views,mature,life_time,created_at,updated_at,numeric_id,dead_account,language,affiliate
0,7879,1,969,2016-02-16,2018-10-12,0,0,EN,1
1,500,0,2699,2011-05-19,2018-10-08,1,0,EN,0
2,382502,1,3149,2010-02-27,2018-10-12,2,0,EN,1
3,386,0,1344,2015-01-26,2018-10-01,3,0,EN,0
4,2486,0,1784,2013-11-22,2018-10-11,4,0,EN,0


In [8]:
features_dataset.head()

Unnamed: 0,views,mature,life_time,created_at,updated_at,numeric_id,dead_account,language,affiliate
0,7879,1,969,2016-02-16,2018-10-12,0,0,EN,1
1,500,0,2699,2011-05-19,2018-10-08,1,0,EN,0
2,382502,1,3149,2010-02-27,2018-10-12,2,0,EN,1
3,386,0,1344,2015-01-26,2018-10-01,3,0,EN,0
4,2486,0,1784,2013-11-22,2018-10-11,4,0,EN,0


In [9]:
edge_dataset.describe()

Unnamed: 0,numeric_id_1,numeric_id_2
count,6797557.0,6797557.0
mean,83828.01,84015.23
std,48205.13,48527.19
min,0.0,0.0
25%,42217.0,42045.0
50%,83546.0,83851.0
75%,125642.0,125957.0
max,168112.0,168113.0


**Converting Dataset to Pytorch Geometric Data**

In [10]:
features_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 168114 entries, 0 to 168113
Data columns (total 9 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   views         168114 non-null  int64 
 1   mature        168114 non-null  int64 
 2   life_time     168114 non-null  int64 
 3   created_at    168114 non-null  object
 4   updated_at    168114 non-null  object
 5   numeric_id    168114 non-null  int64 
 6   dead_account  168114 non-null  int64 
 7   language      168114 non-null  object
 8   affiliate     168114 non-null  int64 
dtypes: int64(6), object(3)
memory usage: 11.5+ MB


In [11]:
features_dataset['language'].unique()

array(['EN', 'FR', 'KO', 'JA', 'RU', 'PL', 'DE', 'ES', 'IT', 'PT',
       'OTHER', 'TR', 'ZH', 'SV', 'NL', 'TH', 'CS', 'DA', 'HU', 'FI',
       'NO'], dtype=object)

In [12]:
languages = ['EN', 'FR', 'KO', 'JA', 'RU', 'PL', 'DE', 'ES', 'IT', 'PT',
       'OTHER', 'TR', 'ZH', 'SV', 'NL', 'TH', 'CS', 'DA', 'HU', 'FI',
       'NO']

from sklearn.preprocessing import LabelEncoder

def encode_df(dataframe):
    le = LabelEncoder()
    features_dataset['language'] = le.fit_transform(features_dataset['language'])
    return dataframe

#encode the dataframe
features_dataset = encode_df(features_dataset)
features_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 168114 entries, 0 to 168113
Data columns (total 9 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   views         168114 non-null  int64 
 1   mature        168114 non-null  int64 
 2   life_time     168114 non-null  int64 
 3   created_at    168114 non-null  object
 4   updated_at    168114 non-null  object
 5   numeric_id    168114 non-null  int64 
 6   dead_account  168114 non-null  int64 
 7   language      168114 non-null  int32 
 8   affiliate     168114 non-null  int64 
dtypes: int32(1), int64(6), object(2)
memory usage: 10.9+ MB


In [13]:
node_features = features_dataset[["numeric_id"]]

In [14]:
# node_features['created_at'] = pd.to_datetime(node_features['created_at']).astype('int64')/ 10**9
# node_features['updated_at'] = pd.to_datetime(node_features['updated_at']).astype('int64')/ 10**9
node_features.head()

Unnamed: 0,numeric_id
0,0
1,1
2,2
3,3
4,4


In [15]:
node_features = node_features.astype('float32')
node_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 168114 entries, 0 to 168113
Data columns (total 1 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   numeric_id  168114 non-null  float32
dtypes: float32(1)
memory usage: 656.8 KB


In [16]:
x =  torch.from_numpy(node_features.to_numpy())
x.shape # [num_nodes x num_features]

torch.Size([168114, 1])

In [17]:
x.dtype

torch.float32

In [18]:
features_dataset[["dead_account","language","affiliate"]] = features_dataset[["dead_account","language","affiliate"]].astype('int64')
features_dataset[["dead_account","language","affiliate"]].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 168114 entries, 0 to 168113
Data columns (total 3 columns):
 #   Column        Non-Null Count   Dtype
---  ------        --------------   -----
 0   dead_account  168114 non-null  int64
 1   language      168114 non-null  int64
 2   affiliate     168114 non-null  int64
dtypes: int64(3)
memory usage: 3.8 MB


In [19]:
# Select node features
labels = features_dataset[["dead_account"]]
labels.head()

# labels = labels.astype(float)

# Convert to numpy
y =  torch.from_numpy(labels.to_numpy())
y.shape # [num_nodes, 1] --> node regression

torch.Size([168114, 1])

In [20]:
y = y.reshape(-1,)

In [21]:
y.shape

torch.Size([168114])

In [22]:
labels.dtypes

dead_account    int64
dtype: object

In [23]:
edge_dataset = edge_dataset.astype('int64')

In [24]:
edge_dataset.dtypes

numeric_id_1    int64
numeric_id_2    int64
dtype: object

In [25]:
edge_dataset = edge_dataset.sort_values(by=['numeric_id_1'])

In [26]:
edge_index = edge_dataset.transpose()

In [27]:
edge_index.dtypes.unique()

array([dtype('int64')], dtype=object)

In [28]:
all_edges =  torch.from_numpy(edge_index.to_numpy()) # [2, num_edges]
print(all_edges.shape)

torch.Size([2, 6797557])


In [29]:
all_edges

tensor([[     0,      0,      0,  ..., 168112, 168112, 168112],
        [ 10464,  59443, 151601,  ...,  77866,  95086,  12740]])

In [30]:
len(features_dataset)

168114

In [31]:
train_arr = np.array([True for i in range(round(len(features_dataset)*0.85))])

In [32]:
train_arr = np.append(train_arr,np.array([False for i in range(int(len(features_dataset)*0.15))]))

In [33]:
train_arr.shape

(168114,)

In [34]:
test_arr = np.array([False for i in range(round(len(features_dataset)*0.55))])
test_arr = np.append(test_arr,np.array([True for i in range(int(len(features_dataset)*0.45))]))

In [35]:
test_arr.shape

(168114,)

In [36]:
val_arr = np.array([False for i in range(round(len(features_dataset)*0.35))])
val_arr = np.append(val_arr,np.array([True for i in range(round(len(features_dataset)*0.35))]))
val_arr = np.append(val_arr,np.array([False for i in range(int(len(features_dataset)*0.30))]))

In [37]:
val_arr.shape

(168114,)

In [38]:
train_mask = torch.from_numpy(train_arr)
test_mask = torch.from_numpy(test_arr)
val_mask = torch.from_numpy(val_arr)

In [39]:
from torch_geometric.data import Data
data = Data(x=x, edge_index=all_edges, y=y)

In [40]:
data

Data(x=[168114, 1], edge_index=[2, 6797557], y=[168114])

In [41]:
data.num_classes = 2
data.train_mask = train_mask
data.test_mask = test_mask
data.val_mask = val_mask

In [42]:
data

Data(x=[168114, 1], edge_index=[2, 6797557], y=[168114], num_classes=2, train_mask=[168114], test_mask=[168114], val_mask=[168114])

In [43]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:1024"

In [44]:
# import torch_geometric
# from torch_geometric.utils.convert import to_networkx
# import networkx as nx
# import matplotlib.pyplot as plt

# plt.figure(figsize=(10, 10))
# twitch_gamers = torch_geometric.data.Data(x=data.x[:500], edge_index=data.edge_index[:500])
# # g = torch_geometric.utils.to_networkx(twitch_gamers, to_undirected=True)
# twitchgraph = to_networkx(twitch_gamers)
# node_labels = data.y[list(twitchgraph.nodes)].numpy()
# nx.draw(g, cmap=plt.get_cmap('Set1'),node_color = node_labels,node_size=75,linewidths=6)

Live Account / Dead Account Classification

In [45]:
print(data)
print("number of graphs:\t\t",len(data))
print("number of classes:\t\t",data.num_classes)
print("number of classes:\t\t",np.unique(data.y))
print("number of node features:\t",data.num_node_features)
print("number of edge features:\t",data.num_edge_features)
print("X shape: ", data.x.shape)
print("Edge shape: ", data.edge_index.shape)
print("Y shape: ", data.y.shape)

Data(x=[168114, 1], edge_index=[2, 6797557], y=[168114], num_classes=2, train_mask=[168114], test_mask=[168114], val_mask=[168114])
number of graphs:		 7
number of classes:		 2
number of classes:		 [0 1]
number of node features:	 1
number of edge features:	 0
X shape:  torch.Size([168114, 1])
Edge shape:  torch.Size([2, 6797557])
Y shape:  torch.Size([168114])


In [46]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class GCN(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = GCNConv(data.num_node_features, 16)
        self.conv2 = GCNConv(16, data.num_classes)   
    
    def forward(self, data):
        # x: Node feature matrix 
        # edge_index: Graph connectivity matrix        
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        
        return F.log_softmax(x, dim=1)

model = GCN().to(device)
data = data.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
print("Graph Convolutional Network (GCN):")
GCN()

Graph Convolutional Network (GCN):


GCN(
  (conv1): GCNConv(1, 16)
  (conv2): GCNConv(16, 2)
)

In [47]:
# useful function for computing accuracy
def compute_accuracy(pred_y, y):
    return (pred_y == y).sum()

In [48]:
# train the model
model.train()
losses = []
accuracies = []
epoch_stable = []
for epoch in range(200):
    optimizer.zero_grad()
    out = model(data)
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    correct = compute_accuracy(out.argmax(dim=1)[data.train_mask], data.y[data.train_mask])
    acc = int(correct) / int(data.train_mask.sum())
    losses.append(loss.item())
    accuracies.append(acc*100)
    loss.backward()
    optimizer.step()
    if (epoch+1) % 10 == 0:
        print('Epoch: {}, Loss: {:.4f}, Training Acc: {:.4f}'.format(epoch+1, loss.item(), acc))

Epoch: 10, Loss: 294.2742, Training Acc: 0.9561
Epoch: 20, Loss: 154.2151, Training Acc: 0.9515
Epoch: 30, Loss: 73.0423, Training Acc: 0.9361
Epoch: 40, Loss: 22.4700, Training Acc: 0.9579
Epoch: 50, Loss: 9.0745, Training Acc: 0.9572
Epoch: 60, Loss: 5.3297, Training Acc: 0.9651
Epoch: 70, Loss: 43.7609, Training Acc: 0.9695
Epoch: 80, Loss: 29.0569, Training Acc: 0.9695
Epoch: 90, Loss: 12.0303, Training Acc: 0.9695
Epoch: 100, Loss: 43.3903, Training Acc: 0.9695
Epoch: 110, Loss: 91.1746, Training Acc: 0.9695
Epoch: 120, Loss: 39.9148, Training Acc: 0.9680
Epoch: 130, Loss: 12.3479, Training Acc: 0.8967
Epoch: 140, Loss: 3.7884, Training Acc: 0.9574
Epoch: 150, Loss: 0.3986, Training Acc: 0.9695
Epoch: 160, Loss: 0.4519, Training Acc: 0.9695
Epoch: 170, Loss: 0.3472, Training Acc: 0.9695
Epoch: 180, Loss: 0.2765, Training Acc: 0.9695
Epoch: 190, Loss: 0.2331, Training Acc: 0.9695
Epoch: 200, Loss: 0.2063, Training Acc: 0.9695


In [49]:
model.eval()
pred = model(data).argmax(dim=1)
correct = compute_accuracy(pred[data.test_mask], data.y[data.test_mask])
acc = int(correct) / int(data.test_mask.sum())
print(f'Accuracy: {acc:.4f}')

Accuracy: 0.9694


In [50]:
torch.cuda.empty_cache()

In [62]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class SAGECN(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = SAGEConv(data.num_node_features, 16)
        self.conv2 = SAGEConv(16, data.num_classes)   
    
    def forward(self, data):
        # x: Node feature matrix 
        # edge_index: Graph connectivity matrix        
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        
        return F.log_softmax(x, dim=1)

model = SAGECN().to(device)
data = data.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
print("SAGE Graph Convolutional Network (GCN):")
SAGECN()

SAGE Graph Convolutional Network (GCN):


SAGECN(
  (conv1): SAGEConv(1, 16, aggr=mean)
  (conv2): SAGEConv(16, 2, aggr=mean)
)

In [63]:
# train the model
model.train()
losses = []
accuracies = []
epoch_stable = []
for epoch in range(200):
    optimizer.zero_grad()
    out = model(data)
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    correct = compute_accuracy(out.argmax(dim=1)[data.train_mask], data.y[data.train_mask])
    acc = int(correct) / int(data.train_mask.sum())
    losses.append(loss.item())
    accuracies.append(acc*100)
    loss.backward()
    optimizer.step()
    if (epoch+1) % 10 == 0:
        print('Epoch: {}, Loss: {:.4f}, Training Acc: {:.4f}'.format(epoch+1, loss.item(), acc))

Epoch: 10, Loss: 1330.0488, Training Acc: 0.9433
Epoch: 20, Loss: 331.9142, Training Acc: 0.9086
Epoch: 30, Loss: 137.7535, Training Acc: 0.9354
Epoch: 40, Loss: 51.6706, Training Acc: 0.9099
Epoch: 50, Loss: 334.5307, Training Acc: 0.9694
Epoch: 60, Loss: 168.1728, Training Acc: 0.9685
Epoch: 70, Loss: 229.1092, Training Acc: 0.9693
Epoch: 80, Loss: 85.1866, Training Acc: 0.9605
Epoch: 90, Loss: 49.0361, Training Acc: 0.9691
Epoch: 100, Loss: 88.9890, Training Acc: 0.9669
Epoch: 110, Loss: 26.7843, Training Acc: 0.9689
Epoch: 120, Loss: 39.7027, Training Acc: 0.9463
Epoch: 130, Loss: 48.2361, Training Acc: 0.8451
Epoch: 140, Loss: 33.4130, Training Acc: 0.6131
Epoch: 150, Loss: 29.0949, Training Acc: 0.9525
Epoch: 160, Loss: 26.7401, Training Acc: 0.9687
Epoch: 170, Loss: 53.0635, Training Acc: 0.9682
Epoch: 180, Loss: 36.8511, Training Acc: 0.9662
Epoch: 190, Loss: 10.6977, Training Acc: 0.9654
Epoch: 200, Loss: 1.8517, Training Acc: 0.9609


In [64]:
model.eval()
pred = model(data).argmax(dim=1)
correct = compute_accuracy(pred[data.test_mask], data.y[data.test_mask])
acc = int(correct) / int(data.test_mask.sum())
print(f'Accuracy: {acc:.4f}')

Accuracy: 0.9694


In [70]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class GATCN(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = GATConv(data.num_node_features, 16)
        self.conv2 = GATConv(16, data.num_classes)   
    
    def forward(self, data):
        # x: Node feature matrix 
        # edge_index: Graph connectivity matrix        
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        
        return F.log_softmax(x, dim=1)

model = GATCN().to(device)
data = data.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
print("Graph Attention Network (GCN):")
GATCN()

Graph Attention Network (GCN):


GATCN(
  (conv1): GATConv(1, 16, heads=1)
  (conv2): GATConv(16, 2, heads=1)
)

In [71]:
# train the model
model.train()
losses = []
accuracies = []
epoch_stable = []
for epoch in range(200):
    optimizer.zero_grad()
    out = model(data)
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    correct = compute_accuracy(out.argmax(dim=1)[data.train_mask], data.y[data.train_mask])
    acc = int(correct) / int(data.train_mask.sum())
    losses.append(loss.item())
    accuracies.append(acc*100)
    loss.backward()
    optimizer.step()
    if (epoch+1) % 10 == 0:
        print('Epoch: {}, Loss: {:.4f}, Training Acc: {:.4f}'.format(epoch+1, loss.item(), acc))

Epoch: 10, Loss: 2900.1421, Training Acc: 0.6705
Epoch: 20, Loss: 545.3464, Training Acc: 0.9397
Epoch: 30, Loss: 581.8926, Training Acc: 0.9533
Epoch: 40, Loss: 561.6147, Training Acc: 0.9579
Epoch: 50, Loss: 461.3360, Training Acc: 0.9448
Epoch: 60, Loss: 426.4019, Training Acc: 0.9590
Epoch: 70, Loss: 363.1548, Training Acc: 0.9576
Epoch: 80, Loss: 359.2078, Training Acc: 0.9507
Epoch: 90, Loss: 240.9288, Training Acc: 0.9465
Epoch: 100, Loss: 223.1491, Training Acc: 0.9591
Epoch: 110, Loss: 141.5525, Training Acc: 0.9561
Epoch: 120, Loss: 90.5152, Training Acc: 0.9329
Epoch: 130, Loss: 119.1024, Training Acc: 0.9075
Epoch: 140, Loss: 79.0528, Training Acc: 0.9450
Epoch: 150, Loss: 61.8010, Training Acc: 0.9624
Epoch: 160, Loss: 54.9426, Training Acc: 0.8502
Epoch: 170, Loss: 64.1227, Training Acc: 0.9162
Epoch: 180, Loss: 55.0258, Training Acc: 0.9190
Epoch: 190, Loss: 42.0996, Training Acc: 0.9649
Epoch: 200, Loss: 42.3579, Training Acc: 0.9665


In [72]:
model.eval()
pred = model(data).argmax(dim=1)
correct = compute_accuracy(pred[data.test_mask], data.y[data.test_mask])
acc = int(correct) / int(data.test_mask.sum())
print(f'Accuracy: {acc:.4f}')

Accuracy: 0.9694


Language Classification

In [73]:
# Select node features
labels = features_dataset[["language"]]
labels.head()

# labels = labels.astype(float)

# Convert to numpy
y =  torch.from_numpy(labels.to_numpy())
y.shape # [num_nodes, 1] --> node regression

torch.Size([168114, 1])

In [74]:
y = y.reshape(-1,)

In [75]:
y.shape

torch.Size([168114])

In [76]:
labels.dtypes

language    int64
dtype: object

In [77]:
from torch_geometric.data import Data
data = Data(x=x, edge_index=all_edges, y=y)

In [78]:
data

Data(x=[168114, 1], edge_index=[2, 6797557], y=[168114])

In [79]:
data.num_classes = 21
data.train_mask = train_mask
data.test_mask = test_mask
data.val_mask = val_mask

In [80]:
data

Data(x=[168114, 1], edge_index=[2, 6797557], y=[168114], num_classes=21, train_mask=[168114], test_mask=[168114], val_mask=[168114])

In [81]:
print(data)
print("number of graphs:\t\t",len(data))
print("number of classes:\t\t",data.num_classes)
print("number of classes:\t\t",np.unique(data.y))
print("number of node features:\t",data.num_node_features)
print("number of edge features:\t",data.num_edge_features)
print("X shape: ", data.x.shape)
print("Edge shape: ", data.edge_index.shape)
print("Y shape: ", data.y.shape)

Data(x=[168114, 1], edge_index=[2, 6797557], y=[168114], num_classes=21, train_mask=[168114], test_mask=[168114], val_mask=[168114])
number of graphs:		 7
number of classes:		 21
number of classes:		 [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20]
number of node features:	 1
number of edge features:	 0
X shape:  torch.Size([168114, 1])
Edge shape:  torch.Size([2, 6797557])
Y shape:  torch.Size([168114])


In [82]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class GCN(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = GCNConv(data.num_node_features, 16)
        self.conv2 = GCNConv(16, data.num_classes)   
    
    def forward(self, data):
        # x: Node feature matrix 
        # edge_index: Graph connectivity matrix        
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        
        return F.log_softmax(x, dim=1)

model = GCN().to(device)
data = data.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
print("Graph Convolutional Network (GCN):")
GCN()

Graph Convolutional Network (GCN):


GCN(
  (conv1): GCNConv(1, 16)
  (conv2): GCNConv(16, 21)
)

In [83]:
# train the model
model.train()
losses = []
accuracies = []
epoch_stable = []
for epoch in range(200):
    optimizer.zero_grad()
    out = model(data)
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    correct = compute_accuracy(out.argmax(dim=1)[data.train_mask], data.y[data.train_mask])
    acc = int(correct) / int(data.train_mask.sum())
    losses.append(loss.item()/1000)
    accuracies.append(acc*10000)
    loss.backward()
    optimizer.step()
    if (epoch+1) % 10 == 0:
        print('Epoch: {}, Loss: {:.4f}, Training Acc: {:.4f}'.format(epoch+1, loss.item(), acc))

Epoch: 10, Loss: 4594.3657, Training Acc: 0.3958
Epoch: 20, Loss: 5635.3379, Training Acc: 0.6658
Epoch: 30, Loss: 4284.4199, Training Acc: 0.6081
Epoch: 40, Loss: 2993.6726, Training Acc: 0.5124
Epoch: 50, Loss: 2222.1672, Training Acc: 0.6032
Epoch: 60, Loss: 1744.5785, Training Acc: 0.5641
Epoch: 70, Loss: 1473.7809, Training Acc: 0.5700
Epoch: 80, Loss: 1249.5635, Training Acc: 0.6008
Epoch: 90, Loss: 1074.3857, Training Acc: 0.5603
Epoch: 100, Loss: 896.3304, Training Acc: 0.5740
Epoch: 110, Loss: 726.3765, Training Acc: 0.5270
Epoch: 120, Loss: 664.1389, Training Acc: 0.4609
Epoch: 130, Loss: 476.9018, Training Acc: 0.4726
Epoch: 140, Loss: 358.4817, Training Acc: 0.5684
Epoch: 150, Loss: 310.2226, Training Acc: 0.5152
Epoch: 160, Loss: 247.2195, Training Acc: 0.5100
Epoch: 170, Loss: 209.1277, Training Acc: 0.5120
Epoch: 180, Loss: 190.0162, Training Acc: 0.5159
Epoch: 190, Loss: 143.0040, Training Acc: 0.5713
Epoch: 200, Loss: 120.5919, Training Acc: 0.5587


In [84]:
model.eval()
pred = model(data).argmax(dim=1)
correct = compute_accuracy(pred[data.test_mask], data.y[data.test_mask])
acc = int(correct) / int(data.test_mask.sum())
print(f'Accuracy: {acc:.4f}')

Accuracy: 0.7402


In [85]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class SAGECN(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = SAGEConv(data.num_node_features, 16)
        self.conv2 = SAGEConv(16, data.num_classes)   
    
    def forward(self, data):
        # x: Node feature matrix 
        # edge_index: Graph connectivity matrix        
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        
        return F.log_softmax(x, dim=1)

model = SAGECN().to(device)
data = data.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
print("Graph Convolutional Network (GCN):")
SAGECN()

Graph Convolutional Network (GCN):


SAGECN(
  (conv1): SAGEConv(1, 16, aggr=mean)
  (conv2): SAGEConv(16, 21, aggr=mean)
)

In [86]:
# train the model
model.train()
losses = []
accuracies = []
epoch_stable = []
for epoch in range(200):
    optimizer.zero_grad()
    out = model(data)
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    correct = compute_accuracy(out.argmax(dim=1)[data.train_mask], data.y[data.train_mask])
    acc = int(correct) / int(data.train_mask.sum())
    losses.append(loss.item()/1000)
    accuracies.append(acc*10000)
    loss.backward()
    optimizer.step()
    if (epoch+1) % 10 == 0:
        print('Epoch: {}, Loss: {:.4f}, Training Acc: {:.4f}'.format(epoch+1, loss.item(), acc))

Epoch: 10, Loss: 25871.5000, Training Acc: 0.6902
Epoch: 20, Loss: 15540.1338, Training Acc: 0.5593
Epoch: 30, Loss: 8264.8047, Training Acc: 0.5615
Epoch: 40, Loss: 5522.2358, Training Acc: 0.5516
Epoch: 50, Loss: 3644.7698, Training Acc: 0.6092
Epoch: 60, Loss: 2442.5212, Training Acc: 0.6034
Epoch: 70, Loss: 1945.5668, Training Acc: 0.5928
Epoch: 80, Loss: 1697.8372, Training Acc: 0.4186
Epoch: 90, Loss: 1205.2484, Training Acc: 0.3269
Epoch: 100, Loss: 1182.0908, Training Acc: 0.6003
Epoch: 110, Loss: 1315.2778, Training Acc: 0.4142
Epoch: 120, Loss: 1574.3534, Training Acc: 0.5333
Epoch: 130, Loss: 1551.9508, Training Acc: 0.5612
Epoch: 140, Loss: 867.1945, Training Acc: 0.5866
Epoch: 150, Loss: 982.3398, Training Acc: 0.7107
Epoch: 160, Loss: 1010.3718, Training Acc: 0.1535
Epoch: 170, Loss: 978.2234, Training Acc: 0.6308
Epoch: 180, Loss: 663.0765, Training Acc: 0.6911
Epoch: 190, Loss: 402.5407, Training Acc: 0.4388
Epoch: 200, Loss: 608.8264, Training Acc: 0.4223


In [87]:
model.eval()
pred = model(data).argmax(dim=1)
correct = compute_accuracy(pred[data.test_mask], data.y[data.test_mask])
acc = int(correct) / int(data.test_mask.sum())
print(f'Accuracy: {acc:.4f}')

Accuracy: 0.7385


In [88]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class GATCN(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = GATConv(data.num_node_features, 16)
        self.conv2 = GATConv(16, data.num_classes)   
    
    def forward(self, data):
        # x: Node feature matrix 
        # edge_index: Graph connectivity matrix        
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        
        return F.log_softmax(x, dim=1)

model = GATCN().to(device)
data = data.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
print("Graph Convolutional Network (GCN):")
GATCN()

Graph Convolutional Network (GCN):


GATCN(
  (conv1): GATConv(1, 16, heads=1)
  (conv2): GATConv(16, 21, heads=1)
)

In [89]:
# train the model
model.train()
losses = []
accuracies = []
epoch_stable = []
for epoch in range(200):
    optimizer.zero_grad()
    out = model(data)
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    correct = compute_accuracy(out.argmax(dim=1)[data.train_mask], data.y[data.train_mask])
    acc = int(correct) / int(data.train_mask.sum())
    losses.append(loss.item()/1000)
    accuracies.append(acc*10000)
    loss.backward()
    optimizer.step()
    if (epoch+1) % 10 == 0:
        print('Epoch: {}, Loss: {:.4f}, Training Acc: {:.4f}'.format(epoch+1, loss.item(), acc))

Epoch: 10, Loss: 4037.9187, Training Acc: 0.5856
Epoch: 20, Loss: 71.2553, Training Acc: 0.6591
Epoch: 30, Loss: 17.5411, Training Acc: 0.3262
Epoch: 40, Loss: 17.1531, Training Acc: 0.5977
Epoch: 50, Loss: 15.3569, Training Acc: 0.6930
Epoch: 60, Loss: 13.6613, Training Acc: 0.6865
Epoch: 70, Loss: 12.1782, Training Acc: 0.7050
Epoch: 80, Loss: 12.3782, Training Acc: 0.7179
Epoch: 90, Loss: 11.6111, Training Acc: 0.7139
Epoch: 100, Loss: 11.6038, Training Acc: 0.7209
Epoch: 110, Loss: 10.9582, Training Acc: 0.7165
Epoch: 120, Loss: 9.8024, Training Acc: 0.7252
Epoch: 130, Loss: 10.2173, Training Acc: 0.7240
Epoch: 140, Loss: 9.8679, Training Acc: 0.7246
Epoch: 150, Loss: 9.4816, Training Acc: 0.7257
Epoch: 160, Loss: 8.3578, Training Acc: 0.7190
Epoch: 170, Loss: 7.6288, Training Acc: 0.7269
Epoch: 180, Loss: 8.0253, Training Acc: 0.7264
Epoch: 190, Loss: 7.3051, Training Acc: 0.7239
Epoch: 200, Loss: 6.4689, Training Acc: 0.7254


In [90]:
model.eval()
pred = model(data).argmax(dim=1)
correct = compute_accuracy(pred[data.test_mask], data.y[data.test_mask])
acc = int(correct) / int(data.test_mask.sum())
print(f'Accuracy: {acc:.4f}')

Accuracy: 0.7402


In [91]:
torch.cuda.empty_cache()

Affiliation Status Identification

In [92]:
# Select node features
labels = features_dataset[["affiliate"]]
labels.head()

# labels = labels.astype(float)

# Convert to numpy
y =  torch.from_numpy(labels.to_numpy())
y.shape # [num_nodes, 1] --> node regression

torch.Size([168114, 1])

In [93]:
y = y.reshape(-1,)

In [94]:
y.shape

torch.Size([168114])

In [95]:
from torch_geometric.data import Data
data = Data(x=x, edge_index=all_edges, y=y)

In [96]:
data

Data(x=[168114, 1], edge_index=[2, 6797557], y=[168114])

In [97]:
data.num_classes = 2
data.train_mask = train_mask
data.test_mask = test_mask
data.val_mask = val_mask

In [98]:
data

Data(x=[168114, 1], edge_index=[2, 6797557], y=[168114], num_classes=2, train_mask=[168114], test_mask=[168114], val_mask=[168114])

In [99]:
print(data)
print("number of graphs:\t\t",len(data))
print("number of classes:\t\t",data.num_classes)
print("number of classes:\t\t",np.unique(data.y))
print("number of node features:\t",data.num_node_features)
print("number of edge features:\t",data.num_edge_features)
print("X shape: ", data.x.shape)
print("Edge shape: ", data.edge_index.shape)
print("Y shape: ", data.y.shape)

Data(x=[168114, 1], edge_index=[2, 6797557], y=[168114], num_classes=2, train_mask=[168114], test_mask=[168114], val_mask=[168114])
number of graphs:		 7
number of classes:		 2
number of classes:		 [0 1]
number of node features:	 1
number of edge features:	 0
X shape:  torch.Size([168114, 1])
Edge shape:  torch.Size([2, 6797557])
Y shape:  torch.Size([168114])


In [100]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class GCN(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = GCNConv(data.num_node_features, 16)
        self.conv2 = GCNConv(16, data.num_classes)   
    
    def forward(self, data):
        # x: Node feature matrix 
        # edge_index: Graph connectivity matrix        
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        
        return F.log_softmax(x, dim=1)

model = GCN().to(device)
data = data.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
print("Graph Convolutional Network (GCN):")
GCN()

Graph Convolutional Network (GCN):


GCN(
  (conv1): GCNConv(1, 16)
  (conv2): GCNConv(16, 2)
)

In [102]:
# train the model
model.train()
losses = []
accuracies = []
epoch_stable = []
for epoch in range(200):
    optimizer.zero_grad()
    out = model(data)
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    correct = compute_accuracy(out.argmax(dim=1)[data.train_mask], data.y[data.train_mask])
    acc = int(correct) / int(data.train_mask.sum())
    losses.append(loss.item()/1000)
    accuracies.append(acc*10000)
    loss.backward()
    optimizer.step()
    if (epoch+1) % 10 == 0:
        print('Epoch: {}, Loss: {:.4f}, Training Acc: {:.4f}'.format(epoch+1, loss.item(), acc))

Epoch: 10, Loss: 330.0388, Training Acc: 0.5137
Epoch: 20, Loss: 28.8396, Training Acc: 0.4954
Epoch: 30, Loss: 226.8124, Training Acc: 0.4853
Epoch: 40, Loss: 46.1765, Training Acc: 0.5032
Epoch: 50, Loss: 306.5750, Training Acc: 0.5138
Epoch: 60, Loss: 573.1097, Training Acc: 0.5147
Epoch: 70, Loss: 227.3119, Training Acc: 0.5138
Epoch: 80, Loss: 140.9728, Training Acc: 0.5122
Epoch: 90, Loss: 19.9780, Training Acc: 0.4880
Epoch: 100, Loss: 106.3802, Training Acc: 0.5145
Epoch: 110, Loss: 182.5476, Training Acc: 0.5147
Epoch: 120, Loss: 13.9099, Training Acc: 0.4905
Epoch: 130, Loss: 146.4393, Training Acc: 0.4856
Epoch: 140, Loss: 319.4242, Training Acc: 0.5148
Epoch: 150, Loss: 35.3138, Training Acc: 0.5114
Epoch: 160, Loss: 105.6124, Training Acc: 0.5148
Epoch: 170, Loss: 121.6744, Training Acc: 0.4885
Epoch: 180, Loss: 116.3296, Training Acc: 0.4908
Epoch: 190, Loss: 54.2342, Training Acc: 0.4873
Epoch: 200, Loss: 168.8339, Training Acc: 0.5148


In [103]:
model.eval()
pred = model(data).argmax(dim=1)
correct = compute_accuracy(pred[data.test_mask], data.y[data.test_mask])
acc = int(correct) / int(data.test_mask.sum())
print(f'Accuracy: {acc:.4f}')

Accuracy: 0.5142


In [104]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class SAGECN(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = SAGEConv(data.num_node_features, 16)
        self.conv2 = SAGEConv(16, data.num_classes)   
    
    def forward(self, data):
        # x: Node feature matrix 
        # edge_index: Graph connectivity matrix        
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        
        return F.log_softmax(x, dim=1)

model = SAGECN().to(device)
data = data.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
print("Graph Convolutional Network (GCN):")
SAGECN()

Graph Convolutional Network (GCN):


SAGECN(
  (conv1): SAGEConv(1, 16, aggr=mean)
  (conv2): SAGEConv(16, 2, aggr=mean)
)

In [105]:
# train the model
model.train()
losses = []
accuracies = []
epoch_stable = []
for epoch in range(200):
    optimizer.zero_grad()
    out = model(data)
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    correct = compute_accuracy(out.argmax(dim=1)[data.train_mask], data.y[data.train_mask])
    acc = int(correct) / int(data.train_mask.sum())
    losses.append(loss.item()/1000)
    accuracies.append(acc*10000)
    loss.backward()
    optimizer.step()
    if (epoch+1) % 10 == 0:
        print('Epoch: {}, Loss: {:.4f}, Training Acc: {:.4f}'.format(epoch+1, loss.item(), acc))

Epoch: 10, Loss: 22711.7266, Training Acc: 0.4947
Epoch: 20, Loss: 11811.5264, Training Acc: 0.5034
Epoch: 30, Loss: 5153.5674, Training Acc: 0.5005
Epoch: 40, Loss: 4185.5376, Training Acc: 0.5110
Epoch: 50, Loss: 2896.5632, Training Acc: 0.5041
Epoch: 60, Loss: 2141.3992, Training Acc: 0.5036
Epoch: 70, Loss: 1570.2141, Training Acc: 0.5031
Epoch: 80, Loss: 1082.0494, Training Acc: 0.5187
Epoch: 90, Loss: 894.1318, Training Acc: 0.5184
Epoch: 100, Loss: 464.9858, Training Acc: 0.5146
Epoch: 110, Loss: 325.1484, Training Acc: 0.4965
Epoch: 120, Loss: 1093.5674, Training Acc: 0.5107
Epoch: 130, Loss: 1681.1971, Training Acc: 0.4859
Epoch: 140, Loss: 2128.2485, Training Acc: 0.5147
Epoch: 150, Loss: 379.3544, Training Acc: 0.5052
Epoch: 160, Loss: 733.7258, Training Acc: 0.4870
Epoch: 170, Loss: 538.6768, Training Acc: 0.4891
Epoch: 180, Loss: 577.8620, Training Acc: 0.5121
Epoch: 190, Loss: 268.3410, Training Acc: 0.4968
Epoch: 200, Loss: 211.8831, Training Acc: 0.4975


In [106]:
model.eval()
pred = model(data).argmax(dim=1)
correct = compute_accuracy(pred[data.test_mask], data.y[data.test_mask])
acc = int(correct) / int(data.test_mask.sum())
print(f'Accuracy: {acc:.4f}')

Accuracy: 0.5267


In [107]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class GATCN(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = GATConv(data.num_node_features, 16)
        self.conv2 = GATConv(16, data.num_classes)   
    
    def forward(self, data):
        # x: Node feature matrix 
        # edge_index: Graph connectivity matrix        
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        
        return F.log_softmax(x, dim=1)

model = GATCN().to(device)
data = data.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
print("Graph Convolutional Network (GCN):")
GATCN()

Graph Convolutional Network (GCN):


GATCN(
  (conv1): GATConv(1, 16, heads=1)
  (conv2): GATConv(16, 2, heads=1)
)

In [108]:
# train the model
model.train()
losses = []
accuracies = []
epoch_stable = []
for epoch in range(200):
    optimizer.zero_grad()
    out = model(data)
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    correct = compute_accuracy(out.argmax(dim=1)[data.train_mask], data.y[data.train_mask])
    acc = int(correct) / int(data.train_mask.sum())
    losses.append(loss.item()/1000)
    accuracies.append(acc*10000)
    loss.backward()
    optimizer.step()
    if (epoch+1) % 10 == 0:
        print('Epoch: {}, Loss: {:.4f}, Training Acc: {:.4f}'.format(epoch+1, loss.item(), acc))

Epoch: 10, Loss: 993.9878, Training Acc: 0.5531
Epoch: 20, Loss: 758.9821, Training Acc: 0.5521
Epoch: 30, Loss: 407.7310, Training Acc: 0.5506
Epoch: 40, Loss: 441.2668, Training Acc: 0.5429
Epoch: 50, Loss: 409.8798, Training Acc: 0.5441
Epoch: 60, Loss: 373.9543, Training Acc: 0.5469
Epoch: 70, Loss: 335.2215, Training Acc: 0.5381
Epoch: 80, Loss: 355.1660, Training Acc: 0.5419
Epoch: 90, Loss: 803.8299, Training Acc: 0.5349
Epoch: 100, Loss: 283.0875, Training Acc: 0.5351
Epoch: 110, Loss: 304.4244, Training Acc: 0.5373
Epoch: 120, Loss: 408.3054, Training Acc: 0.5392
Epoch: 130, Loss: 270.1129, Training Acc: 0.5335
Epoch: 140, Loss: 318.0359, Training Acc: 0.5400
Epoch: 150, Loss: 335.5360, Training Acc: 0.5444
Epoch: 160, Loss: 281.6624, Training Acc: 0.5428
Epoch: 170, Loss: 288.7307, Training Acc: 0.5404
Epoch: 180, Loss: 246.6561, Training Acc: 0.5324
Epoch: 190, Loss: 233.7589, Training Acc: 0.5446
Epoch: 200, Loss: 223.6935, Training Acc: 0.5387


In [109]:
model.eval()
pred = model(data).argmax(dim=1)
correct = compute_accuracy(pred[data.test_mask], data.y[data.test_mask])
acc = int(correct) / int(data.test_mask.sum())
print(f'Accuracy: {acc:.4f}')

Accuracy: 0.5151
