In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import networkx as nx
import matplotlib.pyplot as plt
from matplotlib import pylab
import pickle
import numpy as np
import time
import itertools
from tqdm.notebook import tqdm
from multiprocessing import Pool
import matplotlib.cm as cm
import scipy
import community as community_louvain
import seaborn as sns
import pickle
import os
from collections import defaultdict
from sklearn.model_selection import train_test_split
import scipy.sparse as sp
from sklearn.metrics import accuracy_score,classification_report


# Class Definitions
adapted from https://github.com/tkipf/ica

In [29]:
def get_class(kls):
    parts = kls.split('.')
    module = ".".join(parts[:-1])
    md = __import__(module)
    for comp in parts[1:]:
        md = getattr(md, comp)
    return md


class Classifier(object):
    def __init__(self, scikit_classifier_name, **classifier_args):
        classifer_class = get_class(scikit_classifier_name)
        self.clf = classifer_class(**classifier_args)

    def fit(self, graph, train_indices):
        raise NotImplementedError

    def predict(self, graph, test_indices, conditional_node_to_label_map=None):
        raise NotImplementedError


class LocalClassifier(Classifier):
    def fit(self, graph, train_indices):

        feature_list = []
        label_list = []
        g = graph
        n = g.node_list
        training_nodes = [n[i] for i in train_indices]

        for nodes in training_nodes:
            feature_list.append(nodes.feature_vector)
            label_list.append(nodes.label)

        feature_list = sp.vstack(feature_list)
        feature_list = sp.csr_matrix(feature_list, dtype=np.float64)

        self.clf.fit(feature_list, label_list)
        return

    def predict(self, graph, test_indices, conditional_node_to_label_map=None):

        feature_list = []
        g = graph
        n = g.node_list
        testing_nodes = [n[i] for i in test_indices]

        for nodes in testing_nodes:
            feature_list.append(nodes.feature_vector)

        feature_list = sp.vstack(feature_list)
        feature_list = sp.csr_matrix(feature_list, dtype=np.float64)

        y = self.clf.predict(feature_list)
        return y


class RelationalClassifier(Classifier):
    def __init__(self, scikit_classifier_name, aggregator, **classifier_args):
        super(RelationalClassifier, self).__init__(scikit_classifier_name, **classifier_args)
        self.aggregator = aggregator

    def fit(self, graph, train_indices, local_classifier, bootstrap):
        conditional_map = {}

        if bootstrap:
            predictclf = local_classifier.predict(graph, range(len(graph.node_list)))
            conditional_map = self.cond_mp_upd(graph, conditional_map, predictclf, range(len(graph.node_list)))

        for i in train_indices:
            conditional_map[graph.node_list[i]] = graph.node_list[i].label
        features = []
        aggregates = []
        labels = []
        for i in train_indices:
            features.append(graph.node_list[i].feature_vector)
            labels.append(graph.node_list[i].label)
            aggregates.append(sp.csr_matrix(self.aggregator.aggregate(graph,
                                                                      graph.node_list[i],
                                                                      conditional_map), dtype=np.float64))
        features = sp.vstack(features)
        features = sp.csr_matrix(features, dtype=np.float64)
        aggregates = sp.vstack(aggregates)
        features = sp.hstack([features, aggregates])

        self.clf.fit(features, labels)

    def predict(self, graph, test_indices, conditional_map=None):
        features = []
        aggregates = []

        for i in test_indices:
            features.append(graph.node_list[i].feature_vector)
            aggregates.append(sp.csr_matrix(self.aggregator.aggregate(graph,
                                                                      graph.node_list[i],
                                                                      conditional_map), dtype=np.float64))
        features = sp.vstack(features)
        features = sp.csr_matrix(features, dtype=np.float64)
        aggregates = sp.vstack(aggregates)
        features = sp.hstack([features, aggregates])

        return self.clf.predict(features)

    def cond_mp_upd(self, graph, conditional_map, pred, indices):
        for x in range(len(pred)):
            conditional_map[graph.node_list[indices[x]]] = pred[x]
        return conditional_map

class Graph(object):

    def __init__(self):
        '''
        Create an empty graph
        '''
        self.node_list = []
        self.edge_list = []        
    
    def add_node(self, n):
        self.node_list.append(n)        
    
    def add_edge(self, e):
        abstract()
    
    def get_neighbors(self, n):
        abstract()

class Node(object):
    def __init__(self, node_id, feature_vector = None, label = None):
        self.node_id = node_id
        self.feature_vector = feature_vector
        self.label = label


class Edge(object):
    def __init__(self, from_node, to_node, feature_vector = None, label = None):
        self.from_node = from_node
        self.to_node = to_node
        self.feature_vector = feature_vector
        self.label = label


class DirectedGraph(Graph):
    
    def __init__(self):
        super(DirectedGraph, self).__init__()
        self.out_neighbors = defaultdict(set)
        self.in_neighbors = defaultdict(set)
        self.str_class=[]

    def add_edge(self, e):
        self.edge_list.append(e)
        self.out_neighbors[e.from_node].add(e.to_node)
        self.in_neighbors[e.to_node].add(e.from_node)
    
    def get_out_neighbors(self, n):
        return self.out_neighbors[n]
    
    def get_in_neighbors(self, n):
        return self.in_neighbors[n]
    
    def get_neighbors(self, n):
        return self.out_neighbors[n].union(self.in_neighbors[n])

class UndirectedGraph(Graph):
    
    def __init__(self):
        super(UndirectedGraph, self).__init__()
        self.neighbors = defaultdict(set)        
    
    def add_edge(self, e):
        self.neighbors[e.from_node].add(e.to_node)
        self.neighbors[e.to_node].add(e.from_node)
    
    def get_neighbors(self, n):
        return self.neighbors[n]

        
        super(RelationalClassifier, self).__init__(scikit_classifier_name, **classifier_args)
        self.aggregator = aggregator

    def fit(self, graph, train_indices, local_classifier, bootstrap):
        conditional_map = {}

        if bootstrap:
            predictclf = local_classifier.predict(graph, range(len(graph.node_list)))
            conditional_map = self.cond_mp_upd(graph, conditional_map, predictclf, range(len(graph.node_list)))

        for i in train_indices:
            conditional_map[graph.node_list[i]] = graph.node_list[i].label
        features = []
        aggregates = []
        labels = []
        for i in train_indices:
            features.append(graph.node_list[i].feature_vector)
            labels.append(graph.node_list[i].label)
            aggregates.append(sp.csr_matrix(self.aggregator.aggregate(graph,
                                                                      graph.node_list[i],
                                                                      conditional_map), dtype=np.float64))
        features = sp.vstack(features)
        features = sp.csr_matrix(features, dtype=np.float64)
        aggregates = sp.vstack(aggregates)
        features = sp.hstack([features, aggregates])

        self.clf.fit(features, labels)

    def predict(self, graph, test_indices, conditional_map=None):
        features = []
        aggregates = []

        for i in test_indices:
            features.append(graph.node_list[i].feature_vector)
            aggregates.append(sp.csr_matrix(self.aggregator.aggregate(graph,
                                                                      graph.node_list[i],
                                                                      conditional_map), dtype=np.float64))
        features = sp.vstack(features)
        features = sp.csr_matrix(features, dtype=np.float64)
        aggregates = sp.vstack(aggregates)
        features = sp.hstack([features, aggregates])

        return self.clf.predict(features)

    def cond_mp_upd(self, graph, conditional_map, pred, indices):
        for x in range(len(pred)):
            conditional_map[graph.node_list[indices[x]]] = pred[x]
        return conditional_map


class ICA(Classifier):
    def __init__(self, local_classifier, relational_classifier, bootstrap, max_iteration=10):
        self.local_classifier = local_classifier
        self.relational_classifier = relational_classifier
        self.bootstrap = bootstrap
        self.max_iteration = max_iteration

    def fit(self, graph, train_indices):
        self.local_classifier.fit(graph, train_indices)
        self.relational_classifier.fit(graph, train_indices, self.local_classifier, self.bootstrap)

    def predict(self, graph, eval_indices, test_indices, conditional_node_to_label_map=None):
        predictclf = self.local_classifier.predict(graph, eval_indices)
        conditional_node_to_label_map = self.cond_mp_upd(graph,
                                                         conditional_node_to_label_map,
                                                         predictclf, eval_indices)

        relation_predict = []
        temp = []
        for iter in range(self.max_iteration):
            for x in eval_indices:
                temp.append(x)
                rltn_pred = list(self.relational_classifier.predict(graph, temp, conditional_node_to_label_map))
                conditional_node_to_label_map = self.cond_mp_upd(graph, conditional_node_to_label_map, rltn_pred, temp)
                temp.remove(x)
        for ti in test_indices:
            relation_predict.append(conditional_node_to_label_map[graph.node_list[ti]])
        return relation_predict
    def cond_mp_upd(self, graph, conditional_map, pred, indices):
        for x in range(len(pred)):
            conditional_map[graph.node_list[indices[x]]] = pred[x]
        return conditional_map


In [30]:
class Aggregator(object):
    def __init__(self, domain_labels):
        self.domain_labels = domain_labels  # The list of labels in the domain

    def aggregate(self, graph, node, conditional_node_to_label_map):
        raise NotImplementedError

class Count(Aggregator):
    def aggregate(self, graph, node, conditional_node_to_label_map):
        neighbor_undirected = []

        for x in self.domain_labels:
            neighbor_undirected.append(0.0)
        for i in graph.get_neighbors(node):
            if i in conditional_node_to_label_map.keys():
                index = self.domain_labels.index(conditional_node_to_label_map[i])
                neighbor_undirected[index] += 1.0
        return neighbor_undirected


## Graph building

Since we want to use an algortihm for node classification the processing needs to done a bit differently than originally intended. Most of the meta data in attriubte `PROPERTIES` relates to a post, but in our case posts are not the nodes but the edge between nodes (subreddits). This requires us to aggregate the sentiment from all Posts of a subreddit to a singe node classification by averaging `PROPERTIES` and `LINK_SENTIMENT`

In [31]:
!wget http://snap.stanford.edu/data/soc-redditHyperlinks-body.tsv

--2020-12-09 09:45:38--  http://snap.stanford.edu/data/soc-redditHyperlinks-body.tsv
Resolving snap.stanford.edu (snap.stanford.edu)... 171.64.75.80
Connecting to snap.stanford.edu (snap.stanford.edu)|171.64.75.80|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 318931394 (304M) [text/tab-separated-values]
Saving to: ‘soc-redditHyperlinks-body.tsv.1’


2020-12-09 09:45:59 (14.4 MB/s) - ‘soc-redditHyperlinks-body.tsv.1’ saved [318931394/318931394]



In [32]:
df = pd.read_csv('soc-redditHyperlinks-body.tsv',sep='\t')

In [33]:
G = nx.Graph()
G.add_nodes_from(df['SOURCE_SUBREDDIT'].unique().tolist())


The meta attributes of a post need to be formatted differently and are then added to the original DataFrame

In [34]:
props = df.pop('PROPERTIES')
vals = props.str.split(',').values
data = list()
for val in tqdm(vals):
    data.append(list(map(float,val)))

HBox(children=(FloatProgress(value=0.0, max=286561.0), HTML(value='')))




In [35]:
data_df = pd.DataFrame(data)

In [36]:
full_df_props = pd.concat((df,data_df),axis=1)

Some subreddits never mention another subreddit, to make the aggregation simple we restrict the graph to contain only edges from posts that ever linked to another subreddit

In [37]:
mask = full_df_props['TARGET_SUBREDDIT'].isin(full_df_props['SOURCE_SUBREDDIT'])


Since a single subreddit can mention the same target subreddit mutliple times we need to aggregate the sentiment

In [51]:
weights = full_df_props[mask].groupby(['SOURCE_SUBREDDIT','TARGET_SUBREDDIT'])['LINK_SENTIMENT'].mean().reset_index().values


[['07scape' 'osrstranscripts' 1.0]
 ['0magick' 'occult' 1.0]
 ['1000wordstories' 'writing' 1.0]
 ...
 ['zyramains' 'threshmains' 1.0]
 ['zyzz' 'fitness' 1.0]
 ['zzseries' 'toontown' 1.0]]


Build up new Graph

In [39]:
G.add_weighted_edges_from(weights)

In [40]:
comps = list(nx.connected_components(G))
connected_subgrpah = G.subgraph(comps[0])
nx.is_connected(connected_subgrpah)

True

Since some nodes are not connected we use the biggest network (accounting for the majority of nodes) and filter our dataset further so that only features of nodes in a complete subgraph are included

In [41]:
mask_connected_comp = full_df_props['SOURCE_SUBREDDIT'].isin(list(connected_subgrpah.nodes))
feats_grouped_source = full_df_props[mask_connected_comp].groupby('SOURCE_SUBREDDIT')[data_df.columns].mean()

Create the labels for the individual nodes by aggregating all outwards edges to a mean sentiment score of the node. An intuition would maybe be how toxic a certain subreddit is. Afterwards to make this to a classification problem we group the Sentiment into two categories. The negative response (avg Sentiment between -1 and 0.9 and purely 1). We have to group the negative sentiment quite broad due to the enormous class divide.

In [42]:
labels = pd.get_dummies(
    pd.cut(full_df_props[mask_connected_comp].groupby('SOURCE_SUBREDDIT')["LINK_SENTIMENT"].mean(),[-1.1,0.9,1]))\
.values

Create attributes required by original code from github link

In [43]:
feats = feats_grouped_source.values
features = scipy.sparse.csr_matrix(feats).tolil()

In [44]:
adj = nx.adjacency_matrix(connected_subgrpah)

In [45]:
graph, domain_labels = build_graph(adj, features, labels)

HBox(children=(FloatProgress(value=0.0, max=26492.0), HTML(value='')))




In [46]:
all_idx = list(range(len(feats_grouped_source)))

Create train,test and validation split

In [47]:
X_train, X_test, y_train, y_test = train_test_split(all_idx, labels, test_size=0.2, random_state=1,stratify=labels)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, random_state=1, stratify=y_train)

In [48]:
idx_train = X_train

idx_val = X_val

idx_test = X_test

In [49]:
eval_idx = np.setdiff1d(range(adj.shape[0]), idx_train)


Do classification

In [50]:
for ep in range(1):
    print('Iteration: ',ep)
    np.random.shuffle(eval_idx)
    y_true = [graph.node_list[t].label for t in idx_test]
    local_clf = LocalClassifier('sklearn.linear_model.LogisticRegression')
    agg = Count(domain_labels)
    relational_clf = RelationalClassifier('sklearn.linear_model.LogisticRegression', agg)
    ica = ICA(local_clf, relational_clf, True, max_iteration=10)
    print('Training....')
    ica.fit(graph, idx_train)
    print('Training done!')
    conditional_node_to_label_map = create_map(graph, idx_train)
    print('Prediction...')
    ica_predict = ica.predict(graph, eval_idx, idx_test, conditional_node_to_label_map)
    print('Prediction Done!')
    print(classification_report(ica_predict,y_true))

Iteration:  0
Training....


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Training done!
Prediction...
Prediction Done!
              precision    recall  f1-score   support

          c0       0.00      0.00      0.00         1
          c1       1.00      0.87      0.93      5298

    accuracy                           0.87      5299
   macro avg       0.50      0.44      0.47      5299
weighted avg       1.00      0.87      0.93      5299



# ToDo

- Currently accuracy is mainly driven by majority class, need to either undersample majority or oversample minority as a probable solution. Possible weighting inverse to occurences might help as well but need to be tested

- Overall performance of IC is abysmal need to parallelize almost everything. For some reason prediction seems to be way slower than training