In [1]:
import numpy as np
import pandas as pd
import networkx as nx
# Allow up to 40 columns to be shown when a DataFrame is displayed.
pd.set_option("display.max_columns", 40)

In [2]:
# Input files, given relative to the notebook's working directory.
csv_file = './data/users_neighborhood_anon.csv'   # per-user feature table
graph_file = './data/users_clean.graphml'         # graph in GraphML format

In [3]:
from networkx.readwrite.graphml import read_graphml
# Load the graph from its GraphML file.
graph = nx.read_graphml(graph_file)

# Parse just the first five rows of the CSV: only the column names are needed here.
csv_columns = pd.read_csv(csv_file, nrows=5).columns

In [4]:
# Display the column index read from the CSV header.
csv_columns

Index(['user_id', 'hate', 'hate_neigh', 'normal_neigh', 'statuses_count',
       'followers_count', 'followees_count', 'favorites_count', 'listed_count',
       'betweenness',
       ...
       'c_feminine_empath', 'c_medieval_empath', 'c_journalism_empath',
       'c_farming_empath', 'c_plant_empath', 'c_shopping_empath',
       'c_ship_empath', 'c_religion_empath', 'c_tourism_empath',
       'c_power_empath'],
      dtype='object', length=1039)

In [5]:
# Keep only the characteristic features: skip every column whose name contains
# "glove", "empath" or "c_" (those feature groups are left out for now).
chfts = [
    column
    for column in csv_columns
    if not any(marker in column for marker in ("glove", "empath", "c_"))
]
chfts

['user_id',
 'hate',
 'hate_neigh',
 'normal_neigh',
 'statuses_count',
 'followers_count',
 'followees_count',
 'favorites_count',
 'listed_count',
 'betweenness',
 'eigenvector',
 'in_degree',
 'out_degree',
 'sentiment',
 'subjectivity',
 'number hashtags',
 'hashtags',
 'tweet number',
 'retweet number',
 'quote number',
 'status length',
 'number urls',
 'baddies',
 'mentions',
 'is_50',
 'is_63',
 'is_50_2',
 'is_63_2',
 'time_diff',
 'time_diff_median',
 'created_at']

In [None]:
# Stream the CSV in chunks of 100000 rows. The file has 1039 columns but only the
# characteristic ones in `chfts` are kept, so `usecols` lets the parser skip the
# remaining columns instead of parsing them and discarding them afterwards.
reader = pd.read_csv(
    csv_file,
    chunksize=100000,
    header=0,            # the first line is a header; it is replaced by `names`
    names=csv_columns,
    usecols=chfts,
)
# Re-select with `chfts` so the column order is pinned explicitly.
chunks = [chunk[chfts] for chunk in reader]
chdf = pd.concat(chunks, axis=0)
chdf.head()

In [None]:
# Unique user ids: overall, and for the rows whose `hate` column is
# 'normal' / 'hateful' respectively.
all_users = chdf['user_id'].unique()
normal_users = chdf.loc[chdf['hate'] == 'normal', 'user_id'].unique()
hateful_users = chdf.loc[chdf['hate'] == 'hateful', 'user_id'].unique()

## Clustering Coefficients

In [None]:
# Clustering coefficient of every node in the graph (a mapping keyed by node).
clustering_coefficients = nx.clustering(graph)

In [None]:
# Coefficients for every node of the graph, and for the labelled user groups.
# The mapping is looked up with str(user id).
all_coefs = [*clustering_coefficients.values()]
normal_coefs = [clustering_coefficients[str(uid)] for uid in normal_users]
hateful_coefs = [clustering_coefficients[str(uid)] for uid in hateful_users]

print("Clustering coefficient for the whole graph: mean = {} , std = {}".format(
    np.mean(all_coefs), np.std(all_coefs)))
print("Clustering coefficient for normal users: mean = {} , std = {}".format(
    np.mean(normal_coefs), np.std(normal_coefs)))
print("Clustering coefficient for hateful users: mean = {} , std = {}".format(
    np.mean(hateful_coefs), np.std(hateful_coefs)))

## Degree Distribution

In [None]:
# In-degree of each user, looked up by str(user id), for all users and per group.
degree_view = graph.in_degree
all_degrees = [degree_view[str(uid)] for uid in all_users]
normal_degrees = [degree_view[str(uid)] for uid in normal_users]
hateful_degrees = [degree_view[str(uid)] for uid in hateful_users]

print("For all users average in degree = {} and std = {}".format(
    np.mean(all_degrees), np.std(all_degrees)))
print("For normal users average in degree = {} and std = {}".format(
    np.mean(normal_degrees), np.std(normal_degrees)))
print("For hateful users average in degree = {} and std = {}".format(
    np.mean(hateful_degrees), np.std(hateful_degrees)))

In [None]:
# Out-degree of each user, looked up by str(user id), for all users and per group.
# Note: this rebinds `degree_view` and the `*_degrees` lists.
degree_view = graph.out_degree
all_degrees = [degree_view[str(uid)] for uid in all_users]
normal_degrees = [degree_view[str(uid)] for uid in normal_users]
hateful_degrees = [degree_view[str(uid)] for uid in hateful_users]

print("For all users average out degree = {} and std = {}".format(
    np.mean(all_degrees), np.std(all_degrees)))
print("For normal users average out degree = {} and std = {}".format(
    np.mean(normal_degrees), np.std(normal_degrees)))
print("For hateful users average out degree = {} and std = {}".format(
    np.mean(hateful_degrees), np.std(hateful_degrees)))

## Bidirectional Ratio

In [None]:
# Display the graph's nodes (other cells in this notebook look nodes up by str(user id)).
graph.nodes

In [None]:

def reciprocity_iter(G, nodes):
    """Count, for each node, the neighbours linked to it in both directions.

    Args:
        G: graph-like object exposing ``predecessors(node)`` and
            ``successors(node)``.
        nodes: iterable of nodes to evaluate; it is consumed exactly once.

    Returns:
        A list with one integer per entry of ``nodes``, in iteration order: the
        number of nodes that are both a predecessor and a successor of that
        node. A node with no predecessors and no successors yields 0.

    NOTE(review): the result is a raw count of mutual neighbours, not a value
    normalised by the number of neighbours -- confirm this is intended.
    """
    # The intersection of two empty sets is empty, so nodes without any
    # neighbours contribute 0 without needing a separate branch.
    return [
        len(set(G.predecessors(node)) & set(G.successors(node)))
        for node in nodes
    ]
            
# `reciprocity_iter` returns one value per node in input order, so pair each value
# with its node id to allow lookup by user id below. Nodes are addressed by
# str(user id). (The previous call targeted `reciprocity`, a name this notebook
# never defines, and indexed the result by node id.)
# NOTE(review): the helper yields the number of mutual neighbours, not a
# normalised ratio -- confirm this is the intended "biratio".
all_user_ids = [str(x) for x in all_users]
biratio = dict(zip(all_user_ids, reciprocity_iter(graph, all_user_ids)))
all_biratio = [biratio[str(x)] for x in all_users]
normal_biratio = [biratio[str(x)] for x in normal_users]
hateful_biratio = [biratio[str(x)] for x in hateful_users]

print("For all users average biratio = {} ".format(np.mean(all_biratio)))
print("For normal users average biratio = {} ".format(np.mean(normal_biratio)))
print("For hateful users average biratio = {} ".format(np.mean(hateful_biratio)))