In [5]:
import networkx 
import gzip
import ujson as json
import itertools
# import treelib
import numpy as np
#import matplotlib.pyplot as plt
from torch_geometric.utils.convert import from_networkx
#import torch_geometric.utils.convert
from torch_geometric.loader import DataLoader
import pandas as pd

# Creating the reply graph

In [6]:

def get_tree_user_edges(tree, tweets):
    """
    Input:
        - tree:
            recursive tree structure
            {tweet: "tweet_id", replies: [ .... ]}
    Output:
        - list of replier, poster user id pairs
          (poster <- replier)
    """

    parent_tweet_id = tree["tweet"]
    parent_user_id = tweets[parent_tweet_id]["user_id"]

    edges_from_children = []
    downstream_edges = []

    for reply in tree["replies"]:
        reply_tweet_id = reply["tweet"]
        reply_user_id = tweets[reply_tweet_id]["user_id"]

        edges_from_children.append((reply_user_id, parent_user_id))

        # recursively get the edges of the child
        downstream_edges += get_tree_user_edges(reply, tweets)

    return edges_from_children + downstream_edges

In [7]:

def tree_to_nx_user_graph(conversation, directed=True, remove_root=False):
    """
    Input:
        - conversation tree:
        {
            "tweets": metadata for each tweet in the tree
            "reply_tree": recursive tree structure
        }
    Output:
        - networkx graph
    """

    tree = conversation["reply_tree"]
    tweets = conversation["tweets"]
    
    root_tweet_id = tree["tweet"]
    root_user_id = tweets[root_tweet_id]["user_id"]
    
    # nodes: unique user ids
    nodes = {tweet["user_id"] for tweet in tweets.values()}
    
    # edges: list of (replier, poster) pairs
    edges = get_tree_user_edges(tree, tweets)

    edges = [(u, v) for u, v in edges if u != v]
        
    if remove_root:
        nodes = {u_id for u_id in nodes if u_id != root_user_id}
        edges = [(u, v) for u, v in edges 
                 if u != root_user_id and v != root_user_id]
    
    # create networkx graph
    G = networkx.Graph()
    
    if directed:
        G = networkx.DiGraph()
        
    G.add_nodes_from(nodes)
    G.add_edges_from(edges)

    return G

In [137]:
import os
cwd="D:\\Study Material\\Sem6\\nam\\proj\\jsons_no_embs\\final"
data_arr=[]
for filename in os.listdir(cwd):
   with open(os.path.join(cwd, filename), 'r') as f: # open in readonly mode
      # do your stuff
      conversation=json.load(f)
      d=dict()
      G = tree_to_nx_user_graph(conversation, directed=True, remove_root=True)
      for tids in conversation['tweets']:
          d[conversation['tweets'][tids]['user_id']]={"time":conversation['tweets'][tids]['time'],"tox-score":conversation['tweets'][tids]["tox-score"],"alignment-score":conversation['tweets'][tids]['alignment-score']}
      networkx.set_node_attributes(G, d)
      g_data=from_networkx(G,group_node_attrs=["time","tox-score","alignment-score"])
      g_data.y=conversation['ftox']
      print(conversation['ftox'])
      data_arr.append(g_data)
      
print("Done")

0
0
0
0
0
0
1
1
0
1
0
1
0
0
1
0
0
0
0
0
0
1
0
0
0
0
1
0
0
0
0
0
1
0
0
0
0
0
1
0
0
1
1
0
0
0
0
0
0
0
1
0
0
0
0
1
1
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
1
1
0
0
0
1
1
1
1
0
0
0
0
1
0
0
0
0
0
0
0
1
1
1
1
0
1
0
1
1
0
1
0
0
1
1
0
0
1
0
1
0
0
0
0
0
1
0
0
0
1
0
0
1
1
0
0
0
0
0
0
0
1
0
0
1
1
0
0
0
0
1
1
1
0
1
1
0
0
1
1
0
0
0
0
1
0
0
1
0
0
0
0
0
0
1
1
0
0
1
1
0
0
1
1
1
1
0
0
0
0
0
0
0
0
1
0
0
0
1
0
1
0
1
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
1
1
0
0
1
0
0
0
0
0
1
1
0
0
0
1
0
0
0
1
1
1
0
1
1
0
1
1
1
0
0
1
1
1
0
0
0
1
0
1
0
1
1
0
1
0
0
0
1
1
1
0
0
0
1
0
0
0
1
0
1
1
0
0
0
0
1
0
1
1
1
0
0
0
0
0
1
0
0
0
1
1
0
1
1
1
0
1
0
1
0
0
0
0
0
1
1
0
0
1
0
1
1
1
0
1
0
0
1
1
1
1
0
1
0
0
0
1
1
0
0
0
1
0
0
0
0
0
1
0
0
0
1
0
0
1
1
1
1
0
1
0
0
1
0
0
1
1
0
0
1
0
0
0
1
0
0
1
0
0
0
0
0
0
1
1
1
1
0
1
0
1
1
0
0
0
0
1
0
0
0
0
0
0
0
1
1
1
0
0
0
1
0
0
1
0
0
1
0
0
0
1
0
1
0
0
0
0
0
0
0
1
1
1
0
0
0
0
0
0
0
0
1
1
1
1
1
0
0
0
0
1
0
1
0
0
0
1
0
1
0
1
0
0
1
0
0
0
1
0
1
1
0
1
0
0
0
1
0
0
1
0
0
0
0
0
0
1
0
0
1
0
0
1
0
0
1
1
0
0
0
1
0


In [138]:
dataset=data_arr
data=dataset[0]

In [139]:
print()
print(data)
print('=========*================*==============*============*=========')

# Summary statistics
print(f'Number of nodes: {data.num_nodes}')
print(f'Number of edges: {data.num_edges}')
print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')
print(f'Has isolated nodes: {data.has_isolated_nodes()}')
print(f'Has self-loops: {data.has_self_loops()}')
print(f'Is undirected: {data.is_undirected()}')


Data(edge_index=[2, 723], x=[4607, 3], y=0)
Number of nodes: 4607
Number of edges: 723
Average node degree: 0.16
Has isolated nodes: True
Has self-loops: False
Is undirected: False


In [140]:
import torch
from random import shuffle
torch.manual_seed(12345)
shuffle(dataset)

train_dataset = dataset[:500]
test_dataset = dataset[500:]

print(f'Number of training graphs: {len(train_dataset)}')
print(f'Number of test graphs: {len(test_dataset)}')

Number of training graphs: 500
Number of test graphs: 401


In [141]:
from torch_geometric.loader import DataLoader

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

for step, data in enumerate(train_loader):
    print(f'Step {step + 1}:')
    print('=======')
    print(f'Number of graphs in the current batch: {data.num_graphs}')
    print(data)
    print()

Step 1:
Number of graphs in the current batch: 64
DataBatch(edge_index=[2, 60441], x=[180329, 3], y=[64], batch=[180329], ptr=[65])

Step 2:
Number of graphs in the current batch: 64
DataBatch(edge_index=[2, 51022], x=[162924, 3], y=[64], batch=[162924], ptr=[65])

Step 3:
Number of graphs in the current batch: 64
DataBatch(edge_index=[2, 46340], x=[167330, 3], y=[64], batch=[167330], ptr=[65])

Step 4:
Number of graphs in the current batch: 64
DataBatch(edge_index=[2, 70142], x=[191826, 3], y=[64], batch=[191826], ptr=[65])

Step 5:
Number of graphs in the current batch: 64
DataBatch(edge_index=[2, 60547], x=[185543, 3], y=[64], batch=[185543], ptr=[65])

Step 6:
Number of graphs in the current batch: 64
DataBatch(edge_index=[2, 46256], x=[169673, 3], y=[64], batch=[169673], ptr=[65])

Step 7:
Number of graphs in the current batch: 64
DataBatch(edge_index=[2, 69206], x=[226076, 3], y=[64], batch=[226076], ptr=[65])

Step 8:
Number of graphs in the current batch: 52
DataBatch(edge_inde

# Implementing GCN

In [149]:
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GCNConv,GNNExplainer
from torch_geometric.nn import global_mean_pool


class GCN(torch.nn.Module):
    def __init__(self, hidden_channels):
        super(GCN, self).__init__()
        torch.manual_seed(456)
        self.conv1 = GCNConv(3, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        #self.conv3 = GCNConv(hidden_channels, hidden_channels)
        self.lin = Linear(hidden_channels, 2)

    def forward(self, x, edge_index, batch):
        # 1. Obtain node embeddings 
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = self.conv2(x, edge_index)
        #x = x.relu()
        #x = self.conv3(x, edge_index)

        # 2. Readout layer
        x = global_mean_pool(x, batch)  # [batch_size, hidden_channels]

        # 3. Apply a final classifier
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.lin(x)
        
        return x

model = GCN(hidden_channels=64)
print(model)

GCN(
  (conv1): GCNConv(3, 64)
  (conv2): GCNConv(64, 64)
  (lin): Linear(in_features=64, out_features=2, bias=True)
)


In [151]:
model = GCN(hidden_channels=64)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()

def train():
    model.train()

    for data in train_loader:  # Iterate in batches over the training dataset.
         out = model(data.x, data.edge_index, data.batch)  # Perform a single forward pass.
         loss = criterion(out, data.y)  # Compute the loss.
         loss.backward()  # Derive gradients.
         optimizer.step()  # Update parameters based on gradients.
         optimizer.zero_grad()  # Clear gradients.

def test(loader):
     model.eval()

     correct = 0
     for data in loader:  # Iterate in batches over the training/test dataset.
         out = model(data.x, data.edge_index, data.batch)  
         pred = out.argmax(dim=1)  # Use the class with highest probability.
         correct += int((pred == data.y).sum())  # Check against ground-truth labels.
     return correct / len(loader.dataset)  # Derive ratio of correct predictions.


for epoch in range(1, 101):
    train()
    train_acc = test(train_loader)
    test_acc = test(test_loader)
    print(f'Epoch: {epoch:03d}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}')

Epoch: 001, Train Acc: 0.6280, Test Acc: 0.6359
Epoch: 002, Train Acc: 0.6280, Test Acc: 0.6359
Epoch: 003, Train Acc: 0.3720, Test Acc: 0.3641
Epoch: 004, Train Acc: 0.6280, Test Acc: 0.6359
Epoch: 005, Train Acc: 0.3720, Test Acc: 0.3641
Epoch: 006, Train Acc: 0.6280, Test Acc: 0.6359
Epoch: 007, Train Acc: 0.3720, Test Acc: 0.3641
Epoch: 008, Train Acc: 0.6280, Test Acc: 0.6359
Epoch: 009, Train Acc: 0.6280, Test Acc: 0.6359
Epoch: 010, Train Acc: 0.6280, Test Acc: 0.6359
Epoch: 011, Train Acc: 0.6280, Test Acc: 0.6359
Epoch: 012, Train Acc: 0.6280, Test Acc: 0.6359
Epoch: 013, Train Acc: 0.6280, Test Acc: 0.6359
Epoch: 014, Train Acc: 0.6280, Test Acc: 0.6359


KeyboardInterrupt: 

# Implementing GCN

In [152]:
from torch_geometric.nn import GATConv

class GAT(torch.nn.Module):
    def __init__(self):
        super(GAT, self).__init__()
        self.hid = 8
        self.in_head = 8
        self.out_head = 1
        
        
        self.conv1 = GATConv(3, self.hid, heads=self.in_head, dropout=0.4)
        self.conv2 = GATConv(self.hid*self.in_head, 2, concat=False,
                             heads=self.out_head, dropout=0.4)
        #self.lin = Linear(hidden_channels, 2)



    def forward(self, x, edge_index, batch):
        # 1. Obtain node embeddings 
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = self.conv2(x, edge_index)

        # 2. Readout layer
        x = global_mean_pool(x, batch)  # [batch_size, hidden_channels]

        # 3. Apply a final classifier
        x = F.dropout(x, p=0.5, training=self.training)
        #x = self.lin(x)
        
        return x
        #return F.log_softmax(x, dim=1)
    
    
    
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = "cpu"

model = GAT()

In [154]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()

def train():
    model.train()

    for data in train_loader:  # Iterate in batches over the training dataset.
         out = model(data.x, data.edge_index, data.batch)  # Perform a single forward pass.
         loss = criterion(out, data.y)  # Compute the loss.
         loss.backward()  # Derive gradients.
         optimizer.step()  # Update parameters based on gradients.
         optimizer.zero_grad()  # Clear gradients.

def test(loader):
     model.eval()

     correct = 0
     for data in loader:  # Iterate in batches over the training/test dataset.
         out = model(data.x, data.edge_index, data.batch)  
         
         pred = out.argmax(dim=1)  # Use the class with highest probability.
         
         correct += int((pred == data.y).sum())  # Check against ground-truth labels.
     return correct / len(loader.dataset)  # Derive ratio of correct predictions.


for epoch in range(1, 101):
    train()
    train_acc = test(train_loader)
    test_acc = test(test_loader)
    print(f'Epoch: {epoch:03d}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}')

Epoch: 001, Train Acc: 0.6280, Test Acc: 0.6359
Epoch: 002, Train Acc: 0.6280, Test Acc: 0.6359
Epoch: 003, Train Acc: 0.5960, Test Acc: 0.5935
Epoch: 004, Train Acc: 0.6280, Test Acc: 0.6359
Epoch: 005, Train Acc: 0.6280, Test Acc: 0.6359


KeyboardInterrupt: 

# Implementing GNN Explainer

In [155]:
x, edge_index = dataset[225].x, dataset[225].edge_index
explainer = GNNExplainer(model, epochs=100)
node_feat_mask, edge_mask = explainer.explain_graph(x, edge_index)

Explain graph: 100%|█████████████████████████████████████████████████████████████████| 100/100 [00:02<00:00, 42.18it/s]


In [168]:
print(node_feat_mask)
print(edge_mask)


tensor([0.7544, 0.2887, 0.2677])
tensor([0.2605, 0.2647, 0.2712, 0.2675, 0.3251, 0.2575, 0.2637, 0.2617, 0.2710,
        0.7611, 0.7453, 0.2685, 0.2628, 0.7360, 0.2804, 0.7497, 0.2512, 0.7395,
        0.2671, 0.2626, 0.2699, 0.2659, 0.2757, 0.2709, 0.5371, 0.2717, 0.2682,
        0.2697, 0.7439, 0.7417, 0.2662, 0.2740, 0.2715, 0.2681, 0.2648, 0.2675,
        0.2604, 0.2632, 0.2672, 0.2606, 0.7592, 0.2794, 0.2767, 0.2695, 0.7262,
        0.2687, 0.7544, 0.2593, 0.2712, 0.2733, 0.2768, 0.2746, 0.2606, 0.2638,
        0.2830, 0.2634, 0.2639, 0.7430, 0.4270, 0.2585, 0.6399, 0.2672, 0.2647,
        0.2324, 0.2738, 0.2668, 0.2666, 0.2578, 0.7329, 0.2639, 0.6970, 0.2716,
        0.2766, 0.2662, 0.7094, 0.2626, 0.2681, 0.2630, 0.7371, 0.2750, 0.2640,
        0.2636, 0.2710, 0.2704, 0.2657, 0.4366, 0.2639, 0.2710, 0.7415, 0.2742,
        0.2769, 0.2760, 0.2730, 0.2684, 0.2530, 0.2864, 0.2675, 0.2563, 0.2691,
        0.2638, 0.2588, 0.2661, 0.7351, 0.2739, 0.7300, 0.2668, 0.3594, 0.3552,
       