In [4]:
import torch as th
import numpy as np
import scipy.sparse as spp
import torch.nn as nn
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.metrics import roc_auc_score
import dgl.nn as dglnn
import torch
import torch.nn.functional as F



Using backend: pytorch


In [5]:
import networkx as nx


In [6]:
import dgl


In [7]:
df = pd.read_csv('../data/soc-redditHyperlinks-body.tsv',sep='\t')

In [8]:
labels = df['SOURCE_SUBREDDIT'].unique().tolist()
labels.extend(df['TARGET_SUBREDDIT'].unique().tolist())


# Build MultiGraph

In [9]:
G = nx.MultiDiGraph()

In [10]:
G.add_nodes_from(labels)

In [11]:
G.add_edges_from(list(zip(df['SOURCE_SUBREDDIT'].values.tolist(),
                          df['TARGET_SUBREDDIT'].values.tolist())));

In [12]:
props = df.pop('PROPERTIES')


In [13]:

vals = props.str.split(',').values
data = list()
for val in tqdm(vals):
    data.append(list(map(float,val)))

HBox(children=(FloatProgress(value=0.0, max=286561.0), HTML(value='')))




In [14]:
data_df = pd.DataFrame(data)

In [15]:
for col in data_df.columns:
    nx.set_edge_attributes(G,data[col],str(col))

In [16]:
data_df['source'] = df['SOURCE_SUBREDDIT']
data_df['target'] = df['TARGET_SUBREDDIT']

# Awkward squish of variables

In [None]:
data_unique = list()
for lab in tqdm(pd.Series(labels).unique()):
    cut = data_df[(data_df['source'] == lab) | (data_df['target'] == lab)]
    data_unique.append(cut.mean().values)

HBox(children=(FloatProgress(value=0.0, max=35776.0), HTML(value='')))

In [None]:
data_mat = np.array(data_unique)

In [None]:
grouped_source = data_df.groupby('source').mean()
grouped_taret = data_df.groupby('target').mean()

In [None]:
grouped = grouped_source.reset_index()

In [None]:
conv_dict = dict(zip(labels, pd.get_dummies(labels).values.argmax(axis=1)))

In [None]:
targets = list(map(conv_dict.get,df['TARGET_SUBREDDIT']))
source = list(map(conv_dict.get,df['SOURCE_SUBREDDIT']))

In [None]:
data_mat = np.zeros((len(data_unique),len(data_unique[0])))
for i in range(len(data_unique)):
    data_mat[i,:] = data_unique[i][:86]

In [None]:
g.srcdata['features'] = th.from_numpy(data_mat).float()
g.edata['labels'] = th.from_numpy(pd.get_dummies(df['LINK_SENTIMENT']).values.argmax(axis=1))

# Most of this comes from the dgl documentation

In [44]:
sampler = dgl.dataloading.MultiLayerFullNeighborSampler(2)


In [45]:
dataloader = dgl.dataloading.EdgeDataLoader(
    g, th.arange(G.number_of_edges()), sampler,
    batch_size=1024,
    shuffle=True,
    drop_last=False)

In [47]:
class StochasticTwoLayerGCN(nn.Module):
    def __init__(self, in_features, hidden_features, out_features):
        super().__init__()
        self.conv1 = dglnn.GraphConv(in_features, hidden_features,allow_zero_in_degree=True)
        
        self.conv2 = dglnn.GraphConv(hidden_features, out_features,allow_zero_in_degree=True)

    def forward(self, blocks, x):
        x = F.relu(self.conv1(blocks[0], x))
        x = F.relu(self.conv2(blocks[1], x))
        return x

In [48]:
class ScorePredictor(nn.Module):
    def __init__(self, num_classes, in_features):
        super().__init__()
        self.W = nn.Softmax(dim=1)
    def apply_edges(self, edges):
        data = torch.cat([edges.src['x'], edges.dst['x']])
        return {'score': self.W(edges.src['x']),
                'logits': edges.src['x']}

    def forward(self, edge_subgraph, x):
        with edge_subgraph.local_scope():
            edge_subgraph.ndata['x'] = x
            edge_subgraph.apply_edges(self.apply_edges)
            return edge_subgraph.edata['score'],edge_subgraph.edata['logits']

In [49]:
class Model(nn.Module):
    def __init__(self, in_features, hidden_features, out_features, num_classes):
        super().__init__()
        self.gcn = StochasticTwoLayerGCN(
            in_features, hidden_features, out_features)
        self.predictor = ScorePredictor(num_classes, out_features)

    def forward(self, edge_subgraph, blocks, x):
        x = self.gcn(blocks, x)
        return self.predictor(edge_subgraph, x)

In [51]:
nSamples = pd.Series(g.edata['labels']).value_counts().values
normedWeights = [1 - (x / sum(nSamples)) for x in nSamples]
normedWeights = torch.FloatTensor(normedWeights).to('cuda')
loss = nn.CrossEntropyLoss(weight=normedWeights)



In [54]:
model = Model(in_features, 150, out_features, num_classes)
model = model.float()
model = model.cuda()
opt = th.optim.Adam(model.parameters())


for i in range(100):
    batch_roc = []
    for input_nodes, edge_subgraph, blocks in tqdm(dataloader):
        blocks = [b.to(th.device('cuda')) for b in blocks]
        edge_subgraph = edge_subgraph.to(th.device('cuda'))
        input_features = blocks[0].srcdata['features']
        edge_labels = edge_subgraph.edata['labels']
        edge_predictions,logits = model(edge_subgraph, blocks, input_features)
        out = loss(logits,edge_labels)
        #loss = compute_loss(edge_labels, edge_predictions)
        batch_roc.append(roc_auc_score(edge_labels.cpu().numpy(), edge_predictions.argmax(axis=1).cpu().numpy()))
        opt.zero_grad()
        out.backward()
        opt.step()
    print(sum(batch_roc)/len(batch_roc))

HBox(children=(FloatProgress(value=0.0, max=280.0), HTML(value='')))


0.5000080804866859


HBox(children=(FloatProgress(value=0.0, max=280.0), HTML(value='')))


0.5


HBox(children=(FloatProgress(value=0.0, max=280.0), HTML(value='')))




KeyboardInterrupt: 

In [25]:
g = None

In [26]:
g = dgl.DGLGraph()
g = dgl.from_networkx(nx_graph=G)



In [27]:
g.srcdata['features'] = th.from_numpy(data_mat).float()
g.edata['labels'] = th.from_numpy(pd.get_dummies(df['LINK_SENTIMENT']).values.argmax(axis=1))

In [28]:
class SAGE(nn.Module):
    def __init__(self, in_feats, hid_feats, out_feats):
        super().__init__()
        self.conv1 = dglnn.SAGEConv(
            in_feats=in_feats, out_feats=hid_feats, aggregator_type='mean')
        self.conv2 = dglnn.SAGEConv(
            in_feats=hid_feats, out_feats=out_feats, aggregator_type='mean')

    def forward(self, graph, inputs):
        # inputs are features of nodes
        h = self.conv1(graph, inputs)
        h = F.relu(h)
        h = self.conv2(graph, h)
        return h

In [29]:
import dgl.function as fn
class DotProductPredictor(nn.Module):
    def forward(self, graph, h):
        # h contains the node representations computed from the GNN defined
        # in the node classification section (Section 5.1).
        with graph.local_scope():
            graph.ndata['h'] = h
            graph.apply_edges(fn.u_dot_v('h', 'h', 'score'))
            return graph.edata['score']

In [30]:
class MLPPredictor(nn.Module):
    def __init__(self, in_features, out_classes):
        super().__init__()
        self.W = nn.Linear(in_features * 2, out_classes)

    def apply_edges(self, edges):
        h_u = edges.src['h']
        h_v = edges.dst['h']
        score = self.W(torch.cat([h_u, h_v], 1))
        return {'score': score}

    def forward(self, graph, h):
        # h contains the node representations computed from the GNN defined
        # in the node classification section (Section 5.1).
        with graph.local_scope():
            graph.ndata['h'] = h
            graph.apply_edges(self.apply_edges)
            return graph.edata['score']

In [31]:
class Model(nn.Module):
    def __init__(self, in_features, hidden_features, out_features):
        super().__init__()
        self.sage = SAGE(in_features, hidden_features, out_features)
        self.pred = DotProductPredictor()
    def forward(self, g, x):
        h = self.sage(g, x)
        return self.pred(g, h)

In [34]:
in_features = len(data[0])
hidden_features = 750
out_features= 2
num_classes = 2

In [32]:
node_features = g.ndata['features']
edge_label = g.edata['labels']

In [35]:
model = Model(in_features, 150, out_features)

In [36]:
edge_pred_graph = g

In [37]:
opt = torch.optim.Adam(model.parameters())


In [38]:
node_features = edge_pred_graph.ndata['features']
edge_label = edge_pred_graph.edata['labels']


In [39]:
for epoch in range(10):
    pred = model(edge_pred_graph, node_features)
    loss = ((pred - edge_label) ** 2).mean()
    opt.zero_grad()
    loss.backward()
    opt.step()
    print(loss.item())

RuntimeError: [enforce fail at CPUAllocator.cpp:64] . DefaultCPUAllocator: can't allocate memory: you tried to allocate 328468826884 bytes. Error code 12 (Cannot allocate memory)