In [1]:
import numpy as np
import pandas as pd
import networkx as nx
# Let pandas render up to 40 columns before truncating wide frames.
pd.options.display.max_columns = 40

In [2]:
# Path (relative to the notebook's working directory) of the GraphML file
# that is loaded as `graph`.
graph_file = "./data/users_clean.graphml"
# Path of the comma-separated per-user feature table; its header and rows
# are read with pandas in later cells.
csv_file = "./data/users_neighborhood_anon.csv"

In [3]:
# Load the graph through the `nx` alias imported in the first cell, so no
# extra import is needed in the middle of the notebook.
graph = nx.read_graphml(graph_file)

# Read only the header row (nrows=0) to obtain the list of feature names
# without parsing any data rows.
csv_columns = pd.read_csv(csv_file, nrows=0).columns

In [4]:
# Display the column index read from the CSV header as a sanity check.
csv_columns

Index(['user_id', 'hate', 'hate_neigh', 'normal_neigh', 'statuses_count',
       'followers_count', 'followees_count', 'favorites_count', 'listed_count',
       'betweenness',
       ...
       'c_feminine_empath', 'c_medieval_empath', 'c_journalism_empath',
       'c_farming_empath', 'c_plant_empath', 'c_shopping_empath',
       'c_ship_empath', 'c_religion_empath', 'c_tourism_empath',
       'c_power_empath'],
      dtype='object', length=1039)

In [5]:
# Keep only the characteristic features: skip every column whose name
# contains "glove", "empath" or "c_" (dropped for now).
# NOTE: "c_" is matched as a substring anywhere in the name, not only as a prefix.
excluded_markers = ("glove", "empath", "c_")
chfts = [
    col
    for col in csv_columns
    if not any(marker in col for marker in excluded_markers)
]
chfts

['user_id',
 'hate',
 'hate_neigh',
 'normal_neigh',
 'statuses_count',
 'followers_count',
 'followees_count',
 'favorites_count',
 'listed_count',
 'betweenness',
 'eigenvector',
 'in_degree',
 'out_degree',
 'sentiment',
 'subjectivity',
 'number hashtags',
 'hashtags',
 'tweet number',
 'retweet number',
 'quote number',
 'status length',
 'number urls',
 'baddies',
 'mentions',
 'is_50',
 'is_63',
 'is_50_2',
 'is_63_2',
 'time_diff',
 'time_diff_median',
 'created_at']

In [6]:
# Stream the CSV in chunks of 100000 rows, parsing only the characteristic
# columns. `usecols` makes the parser skip the remaining columns instead of
# materialising all of them per chunk and discarding most afterwards.
reader = pd.read_csv(
    csv_file,
    header=0,
    names=csv_columns,
    usecols=chfts,
    chunksize=100000,
)
chunks = list(reader)
# Re-select with `chfts` so the column order is exactly the order of that list.
chdf = pd.concat(chunks, axis=0)[chfts]
chdf.head()

Unnamed: 0,user_id,hate,hate_neigh,normal_neigh,statuses_count,followers_count,followees_count,favorites_count,listed_count,betweenness,eigenvector,in_degree,out_degree,sentiment,subjectivity,number hashtags,hashtags,tweet number,retweet number,quote number,status length,number urls,baddies,mentions,is_50,is_63,is_50_2,is_63_2,time_diff,time_diff_median,created_at
0,0,normal,True,True,101767,3504,3673,81635,53,100467.895084,7.413433e-09,0.000139,0.00012,0.035132,0.431656,16.0,鷺沢文香生誕祭2017 鷺沢文香生誕祭 pixiv アイドル idol グラビア RT希望 ...,121.0,79.0,5.0,75.565,82.0,18.0,159.0,False,False,False,False,356.020101,74.0,1241845000.0
1,1,other,False,False,2352,19609,309,61,197,0.0,4.929853e-32,1e-05,3e-05,0.088142,0.418649,40.0,nzfirst nzfirst nzfirst hadenough nzfirst hade...,199.0,0.0,0.0,101.713568,20.0,10.0,6.0,False,False,False,False,17519.116162,45.0,1312262000.0
2,2,other,False,False,1044,2371,2246,561,16,4897.117853,6.876258e-12,5e-05,3e-05,0.117861,0.455828,328.0,Firefighter mentalhealth PTSD Wellness Fire me...,113.0,87.0,0.0,128.13,219.0,16.0,158.0,False,False,False,False,46417.758794,2010.0,1445293000.0
3,3,other,False,False,167172,3004,298,3242,53,9.864754,1.380359e-30,4e-05,2e-05,0.261688,0.558544,127.0,OneMoreChance OneMoreChance Siwon SuperJunior ...,69.0,131.0,3.0,85.76,149.0,10.0,26.0,False,False,False,False,656.889447,72.0,1286949000.0
4,4,other,False,False,1998,17643,19355,485,239,0.0,4.929853e-32,1e-05,2e-05,0.121533,0.435334,1710.0,Ohio photooftheday nature photography birds mi...,101.0,99.0,0.0,152.175,198.0,35.0,7.0,False,False,False,False,55991.356784,48197.0,1408504000.0


In [7]:
# Unique user ids overall, and for the rows whose `hate` column is
# 'normal' or 'hateful' respectively.
all_users = chdf['user_id'].unique()
normal_users = chdf.loc[chdf['hate'] == 'normal', 'user_id'].unique()
hateful_users = chdf.loc[chdf['hate'] == 'hateful', 'user_id'].unique()

## Clustering Coefficients

In [8]:
# Per-node clustering coefficient for the graph; the result is a mapping
# keyed by node id.
clustering_coefficients = nx.clustering(graph)

In [9]:
# Coefficients of every node in the graph, then of the labelled user groups
# (graph node ids are the string form of the CSV user ids).
all_coefs = list(clustering_coefficients.values())
normal_coefs, hateful_coefs = (
    [clustering_coefficients[str(uid)] for uid in group]
    for group in (normal_users, hateful_users)
)

# Report mean and standard deviation for each group.
for message, coefs in (
    ("Clustering coefficient for the whole graph: mean = {} , std = {}", all_coefs),
    ("Clustering coefficient for normal users: mean = {} , std = {}", normal_coefs),
    ("Clustering coefficient for hateful users: mean = {} , std = {}", hateful_coefs),
):
    print(message.format(np.mean(coefs), np.std(coefs)))

Clustering coefficient for the whole graph: mean = 0.05665756186782108 , std = 0.09189662387862936
Clustering coefficient for normal users: mean = 0.05323155415618325 , std = 0.09330204104980355
Clustering coefficient for hateful users: mean = 0.07092943943828832 , std = 0.057711775285799786


## Degree Distribution

In [10]:
# In-degree of each user, looked up by the string form of the user id.
degree_view = graph.in_degree
all_degrees, normal_degrees, hateful_degrees = (
    [degree_view[str(uid)] for uid in group]
    for group in (all_users, normal_users, hateful_users)
)

# Report mean and standard deviation for each group.
for message, degrees in (
    ("For all users average in degree = {} and std = {}", all_degrees),
    ("For normal users average in degree = {} and std = {}", normal_degrees),
    ("For hateful users average in degree = {} and std = {}", hateful_degrees),
):
    print(message.format(np.mean(degrees), np.std(degrees)))

For all users average in degree = 22.777996931843084 and std = 23.047098581157965
For normal users average in degree = 25.350576010842556 and std = 24.3193306030999
For hateful users average in degree = 30.455882352941178 and std = 22.866899931052032


In [11]:
# Out-degree of each user, looked up by the string form of the user id.
# NOTE: this rebinds `degree_view` and the three `*_degrees` lists from the
# in-degree cell.
degree_view = graph.out_degree
all_degrees, normal_degrees, hateful_degrees = (
    [degree_view[str(uid)] for uid in group]
    for group in (all_users, normal_users, hateful_users)
)

# Report mean and standard deviation for each group.
for message, degrees in (
    ("For all users average out degree = {} and std = {}", all_degrees),
    ("For normal users average out degree = {} and std = {}", normal_degrees),
    ("For hateful users average out degree = {} and std = {}", hateful_degrees),
):
    print(message.format(np.mean(degrees), np.std(degrees)))

For all users average out degree = 22.777996931843084 and std = 69.74444204250517
For normal users average out degree = 12.234922069121302 and std = 41.89660404556361
For hateful users average out degree = 16.74448529411765 and std = 33.08810329834783


## Bidirectional Ratio

In [17]:

def reciprocity(G, nodes):
    """Count the bidirectional neighbours of each node.

    A bidirectional neighbour of ``node`` is a node that is both a
    predecessor and a successor of it, i.e. the two are linked in both
    directions.

    Note that, despite the name, the value returned per node is a raw
    count, not a ratio normalised by the node's degree.

    Parameters
    ----------
    G : graph object exposing ``predecessors(node)`` and ``successors(node)``
    nodes : iterable of node ids (may be a one-shot iterator)

    Returns
    -------
    dict
        Maps each node id to its number of bidirectional neighbours.
        A node with no predecessors or no successors maps to 0.
    """
    # An empty predecessor or successor set already yields an empty
    # intersection, so isolated nodes need no special case.
    return {
        node: len(set(G.predecessors(node)) & set(G.successors(node)))
        for node in nodes
    }
            
# Per-user result of `reciprocity`, keyed by the string form of the user id.
biratio = reciprocity(graph, (str(uid) for uid in all_users))
all_biratio, normal_biratio, hateful_biratio = (
    [biratio[str(uid)] for uid in group]
    for group in (all_users, normal_users, hateful_users)
)

# Report the mean for each group.
for message, values in (
    ("For all users average bidirectional neighbours = {} ", all_biratio),
    ("For normal users average bidirectional neighbours = {} ", normal_biratio),
    ("For hateful users average bidirectional neighbours = {} ", hateful_biratio),
):
    print(message.format(np.mean(values)))

For all users average bidirectional neighbours = 2.8252146713685176 
For normal users average bidirectional neighbours = 2.200587305172803 
For hateful users average bidirectional neighbours = 2.693014705882353 
