In [None]:
!pip install  dgl

Collecting dgl
  Downloading dgl-1.1.2-cp310-cp310-manylinux1_x86_64.whl (6.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.0/6.0 MB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dgl
Successfully installed dgl-1.1.2


In [None]:
!pip install torch



In [None]:
!pip install networkx



In [None]:
import dgl
import torch
import networkx as nx

DGL backend not selected or invalid.  Assuming PyTorch for now.


Setting the default backend to "pytorch". You can change it in the ~/.dgl/config.json file or export the DGLBACKEND environment variable.  Valid options are: pytorch, mxnet, tensorflow (all lowercase)


In [None]:
# Assumes presence of graphml file (can be generated from data scraping code folder)
G = nx.read_graphml('mega_bip_top100000.graphml')

In [None]:
G = G.to_directed()

In [None]:
# Since dgl doesn't like booleans
for node in G.nodes(data=True):
    if node[1]['node_type'] == 'user':
        node[1]['node_type'] = 0
    elif node[1]['node_type'] == 'subreddit':
        node[1]['node_type'] = 1
    if node[1]['is_sus'] == 'False':
        node[1]['is_sus'] = 0
    elif node[1]['is_sus'] == 'True':
        node[1]['is_sus'] = 1
    if node[1]['is_suspended'] == 1:
        node[1]['is_sus'] = 1

In [218]:
# Attributes explained in
g = dgl.from_networkx(G, node_attrs = ['node_type', 'is_sus', 'avg_compound', 'avg_pos',
                                    'avg_neu', 'avg_neg',
                                    'ratio_misc_sr', 'ratio_political_sr', 'ratio_popular_sr',
                                    'num_misc_sr'], edge_attrs=['weight'])
# dgl needs self loop to be undirected
g = dgl.add_self_loop(g)

In [219]:
# mapping between networkx and dgl to get list of users predicted as suspicious
nx_to_dgl_mapping = {node_id: idx for idx, node_id in enumerate(sorted(G.nodes))}

node_mapping = {idx: node_id for node_id, idx in nx_to_dgl_mapping.items()}


In [220]:
import numpy as np
feature_list = []
for feature_name in g.ndata:
    if feature_name != 'node_type':
      feature = g.ndata[feature_name]
      feature_list.append(feature)
    # Check if the feature is a scalar (dimension equals 1)


g.ndata['feat'] = torch.stack(feature_list, dim=1)

# Set the label attribute (assuming 'label' is a node attribute)
g.ndata['label'] = g.ndata['is_sus'].long()


In [221]:
g.ndata['is_sus'].unique(return_counts=True)

(tensor([0, 1]), tensor([64928,  2002]))

In [222]:
import os

os.environ["DGLBACKEND"] = "pytorch"
import dgl
import dgl.function as fn
import torch as th
import torch.nn as nn
import torch.nn.functional as F
from dgl import DGLGraph
from sklearn.model_selection import train_test_split
import torch.optim as optim
from dgl.nn import GraphConv

In [223]:
in_dim = g.ndata['feat'].shape[1]
hidden_dim = 16
out_dim = 2
dropout_rate = 0.3
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.layer1 = GraphConv(in_dim, hidden_dim)
        self.layer2 = GraphConv(hidden_dim, hidden_dim // 2)
        self.layer3 = GraphConv(hidden_dim // 2, out_dim)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, g, features):
        x = F.relu(self.layer1(g, features))
        x = F.relu(self.layer2(g, x))
        x = self.layer3(g, x)
        x = self.dropout(x)
        return x


net = Net()
print(net)

labels = g.ndata['label']
features = g.ndata['feat']
node_types = g.ndata['node_type']

# Split the nodes into train and test sets
train_idx, test_idx = train_test_split(range(len(labels)), test_size=0.25, random_state=5)


# Create Boolean masks for train and test nodes
train_mask = torch.zeros(len(g.nodes()), dtype=torch.bool)
test_mask = torch.zeros(len(g.nodes()), dtype=torch.bool)
train_mask[train_idx] = 1
test_mask[test_idx] = 1

# Probability of moving non-suspicious user from training to test set (not needed in M3)
p = 0  # Adjust the probability as needed

# Resample train_mask based on the class and probability
train_mask = np.logical_and(train_mask, (labels != 0) | (np.random.rand(len(train_mask)) > p)).bool()
test_mask = ~train_mask
train_mask = np.logical_and(train_mask, node_types != 1).bool()
test_mask = np.logical_and(test_mask, node_types != 1).bool()

# Assign the masks to the graph
g.ndata['train_mask'] = train_mask
g.ndata['test_mask'] = test_mask



Net(
  (layer1): GraphConv(in=9, out=16, normalization=both, activation=None)
  (layer2): GraphConv(in=16, out=8, normalization=both, activation=None)
  (layer3): GraphConv(in=8, out=2, normalization=both, activation=None)
  (dropout): Dropout(p=0.3, inplace=False)
)


In [224]:
print(labels[train_mask].unique(return_counts=True))
print(labels[test_mask].unique(return_counts=True))

(tensor([0, 1]), tensor([48105,  1483]))
(tensor([0, 1]), tensor([16009,   519]))


In [225]:
def evaluate(model, g, features, labels, mask):
    model.eval()
    with th.no_grad():
        logits = model(g, features)
        logits = logits[mask]
        labels = labels[mask]
        _, indices = th.max(logits, dim=1)
        correct = th.sum(indices == labels)
        return correct.item() * 1.0 / len(labels)

In [226]:
import time

import numpy as np

# Add edges between each node and itself to preserve old node representations
g.add_edges(g.nodes(), g.nodes())
optimizer = th.optim.Adam(net.parameters(), lr=1e-2)
dur = []
for epoch in range(100):
    if epoch >= 3:
        t0 = time.time()
    net.train()
    logits = net(g, features)
    logp = F.log_softmax(logits, 1)
    loss = F.cross_entropy(logp[train_mask], labels[train_mask], weight = torch.Tensor([1.0, 2.0]))

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if epoch >= 3:
        dur.append(time.time() - t0)
    acc = evaluate(net, g, features, labels, test_mask)
    print(
        "Epoch {:05d} | Loss {:.4f} | Test Acc {:.4f} | Time(s) {:.4f}".format(
            epoch, loss.item(), acc, np.mean(dur)
        )
    )

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Epoch 00000 | Loss 0.5396 | Test Acc 0.9686 | Time(s) nan
Epoch 00001 | Loss 0.4757 | Test Acc 0.9686 | Time(s) nan
Epoch 00002 | Loss 0.4255 | Test Acc 0.9686 | Time(s) nan
Epoch 00003 | Loss 0.3864 | Test Acc 0.9686 | Time(s) 0.0918
Epoch 00004 | Loss 0.3604 | Test Acc 0.9686 | Time(s) 0.1271
Epoch 00005 | Loss 0.3483 | Test Acc 0.9686 | Time(s) 0.1254
Epoch 00006 | Loss 0.3355 | Test Acc 0.9686 | Time(s) 0.1309
Epoch 00007 | Loss 0.3305 | Test Acc 0.9686 | Time(s) 0.1313
Epoch 00008 | Loss 0.3257 | Test Acc 0.9686 | Time(s) 0.1244
Epoch 00009 | Loss 0.3283 | Test Acc 0.9686 | Time(s) 0.1208
Epoch 00010 | Loss 0.3307 | Test Acc 0.9686 | Time(s) 0.1189
Epoch 00011 | Loss 0.3263 | Test Acc 0.9686 | Time(s) 0.1185
Epoch 00012 | Loss 0.3259 | Test Acc 0.9686 | Time(s) 0.1164
Epoch 00013 | Loss 0.3299 | Test Acc 0.9686 | Time(s) 0.1147
Epoch 00014 | Loss 0.3214 | Test Acc 0.9686 | Time(s) 0.1149
Epoch 00015 | Loss 0.3257 | Test Acc 0.9686 | Time(s) 0.1137
Epoch 00016 | Loss 0.3232 | Test 

In [227]:
from sklearn.metrics import precision_score, recall_score
net.eval()
with torch.no_grad():
    logits = net(g, features)
    predicted_labels = logits.argmax(dim=1)
test_labels = labels[test_mask]

In [228]:
predicted_labels[test_mask].unique(return_counts=True)

(tensor([0, 1]), tensor([15950,   578]))

In [229]:
test_labels.unique(return_counts=True)

(tensor([0, 1]), tensor([16009,   519]))

In [238]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

confusion = classification_report(labels[test_mask], predicted_labels[test_mask])
print(confusion)

              precision    recall  f1-score   support

           0       1.00      0.99      0.99     16009
           1       0.79      0.88      0.84       519

    accuracy                           0.99     16528
   macro avg       0.90      0.94      0.92     16528
weighted avg       0.99      0.99      0.99     16528



In [235]:
indices = torch.nonzero((predicted_labels == 1) & (labels == 0) & (node_types == 0)).squeeze()

print("Indices where predicted label is 1 but ground truth label is 0:", indices.tolist())

print(len(indices.tolist()))

Indices where predicted label is 1 but ground truth label is 0: [124, 554, 619, 701, 1724, 1887, 1926, 2139, 2176, 2189, 2470, 2498, 3026, 3037, 3276, 3293, 3296, 3297, 3323, 3846, 3883, 3895, 4152, 4214, 4292, 4324, 4412, 4438, 4757, 5134, 5328, 5629, 5738, 5828, 5885, 6078, 6180, 6249, 6998, 7150, 7188, 7196, 7279, 7368, 7598, 7732, 7809, 8259, 8486, 8737, 9027, 9241, 9461, 9717, 9773, 9810, 9914, 9939, 10042, 10248, 10343, 10359, 10397, 10648, 10715, 10846, 10896, 10928, 11084, 11115, 11205, 11288, 11293, 11355, 11525, 11724, 11758, 11797, 12400, 12570, 12734, 12757, 12774, 12888, 13013, 13195, 13378, 13539, 13767, 13828, 13837, 14092, 14170, 14886, 14976, 15022, 15282, 15433, 15497, 15612, 15641, 15722, 15731, 16238, 16277, 16391, 16505, 16527, 16646, 16673, 16740, 16975, 17026, 17046, 17167, 17173, 17699, 17729, 17758, 18081, 18103, 18488, 18543, 18629, 18634, 18915, 18993, 19311, 19433, 19594, 20237, 20899, 21169, 21494, 21898, 21909, 22132, 22287, 22478, 22609, 23032, 23130, 231

In [232]:
for idx in indices.tolist():
  print(node_mapping[idx])
  print(g.nodes[idx])


-kOdAbAr-
NodeSpace(data={'node_type': tensor([0]), 'is_sus': tensor([0]), 'avg_compound': tensor([0.0464]), 'avg_pos': tensor([0.1623]), 'avg_neu': tensor([0.7527]), 'avg_neg': tensor([0.0851]), 'ratio_misc_sr': tensor([0.8333]), 'ratio_political_sr': tensor([0.]), 'ratio_popular_sr': tensor([0.1667]), 'num_misc_sr': tensor([5]), 'feat': tensor([[0.0000, 0.0464, 0.1623, 0.7527, 0.0851, 0.8333, 0.0000, 0.1667, 5.0000]]), 'label': tensor([0]), 'train_mask': tensor([True]), 'test_mask': tensor([False])})
305-til-i-786
NodeSpace(data={'node_type': tensor([0]), 'is_sus': tensor([0]), 'avg_compound': tensor([0.1974]), 'avg_pos': tensor([0.2017]), 'avg_neu': tensor([0.7614]), 'avg_neg': tensor([0.0369]), 'ratio_misc_sr': tensor([1.]), 'ratio_political_sr': tensor([0.]), 'ratio_popular_sr': tensor([0.]), 'num_misc_sr': tensor([3]), 'feat': tensor([[0.0000, 0.1974, 0.2017, 0.7614, 0.0369, 1.0000, 0.0000, 0.0000, 3.0000]]), 'label': tensor([0]), 'train_mask': tensor([True]), 'test_mask': tensor

In [233]:
from sklearn.metrics import roc_auc_score
from sklearn import metrics
roc_auc_score(labels[test_mask], predicted_labels[test_mask])


0.9384798724128082