In [11]:
import pandas as pd
import numpy as np
import json
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
import community 
from networkx.algorithms.community import greedy_modularity_communities
from networkx.algorithms.community import k_clique_communities

  warn('geopandas not available. Some functionality will be disabled.')
  from .sqlite import head_to_sql, start_sql


#### Function to read json files for datasets

In [16]:
def load_json(fn):
    """Load newline-delimited JSON (one JSON document per line).

    Parameters
    ----------
    fn : str, bytes, path-like, or iterable of those
        A single file path, or an iterable of file paths. When several
        paths are given their records are concatenated in the given order.

    Returns
    -------
    list
        The decoded JSON value of every non-blank line, in file order.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    OSError
        If a file cannot be opened.
    """
    # Treat any single path (str, bytes, or an object implementing
    # __fspath__ such as pathlib.Path) as a one-element list, so that both
    # call styles share the same reading loop.
    if isinstance(fn, (str, bytes)) or hasattr(fn, '__fspath__'):
        paths = [fn]
    else:
        paths = fn

    json_data = []
    for path in paths:
        with open(path, 'rb') as f:
            for line in f:
                # Blank lines (e.g. a trailing newline) carry no record.
                if line.strip():
                    json_data.append(json.loads(line))

    return json_data

#### Function to add missing days to dataframe

In [66]:
def add_missing_data_git(df, entity='Vendor', value_col=None):
    """Expand per-entity daily counts to a gap-free daily calendar.

    For every distinct value of ``entity`` the count series is reindexed
    onto every calendar day between the earliest and latest ``dailyTime``
    found in ``df`` (over all entities); days without a record get 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'dailyTime'`` column (datetime-like), the
        ``entity`` column and the count column. ``df`` is not modified.
    entity : str, default 'Vendor'
        Name of the column identifying the entity (user, cluster, ...).
    value_col : str, optional
        Name of the count column. When omitted, ``'PushEvent'`` is used if
        present (the historical behaviour); otherwise the single column
        that is neither ``entity`` nor ``'dailyTime'`` is used.

    Returns
    -------
    pandas.DataFrame
        Indexed by day, with columns ``[value_col, entity]``; one row per
        (entity, day) pair, entities stacked one after the other.

    Raises
    ------
    KeyError
        If ``value_col`` is omitted and cannot be inferred unambiguously.
    """
    if value_col is None:
        if 'PushEvent' in df.columns:
            value_col = 'PushEvent'
        else:
            candidates = [c for c in df.columns if c not in (entity, 'dailyTime')]
            if len(candidates) != 1:
                raise KeyError(
                    "Cannot infer the count column; pass value_col explicitly "
                    "(candidates: {})".format(candidates)
                )
            value_col = candidates[0]

    # Nothing to expand: return an empty frame with the output layout.
    if df.empty:
        return pd.DataFrame(columns=[value_col, entity], index=pd.DatetimeIndex([]))

    # The calendar is shared by all entities so every entity ends up with
    # the same number of rows.
    idx = pd.date_range(df['dailyTime'].min(), df['dailyTime'].max())

    # Work on an indexed copy so the caller's frame keeps its columns.
    by_day = df.set_index('dailyTime')

    frames = []
    for user, group in by_day.groupby(entity):
        filled = group[value_col].reindex(idx, fill_value=0).to_frame()
        filled[entity] = user
        frames.append(filled)

    return pd.concat(frames)
    

#### Generate node lists and edge lists

In [67]:
def generate_input_files(G, nodes, file_nodes, file_edges):
    """Write an integer-encoded node list and edge list for graph ``G``.

    Every edge of ``G`` is written with distance 1 (any existing ``weight``
    attribute is overwritten). Each node in ``nodes`` that has no self-loop
    in ``G`` additionally gets a self-loop row with distance 0, so every
    node appears in the edge list at least once.

    Parameters
    ----------
    G : networkx graph
        Graph whose edges are exported via ``nx.to_pandas_edgelist``.
    nodes : sequence
        Nodes to encode; a node's position in this sequence becomes its
        (string) id. Every endpoint of every edge of ``G`` must be present.
    file_nodes : str
        Output path for the comma-separated list of encoded node ids.
    file_edges : str
        Output path for the CSV edge list (columns ``from, to, distance``).

    Returns
    -------
    (pandas.DataFrame, dict)
        The encoded edge list and the ``node -> id`` mapping.

    Raises
    ------
    KeyError
        If an edge endpoint is not contained in ``nodes``.
    """
    df = nx.to_pandas_edgelist(G)
    df['weight'] = 1

    # Nodes that already carry a self-loop in G.
    self_nodes = set(df.loc[df['source'] == df['target'], 'source'])

    # Zero-weight self-loops for the remaining nodes. Iterating over
    # `nodes` (de-duplicated, order preserved) instead of a set difference
    # keeps the row order deterministic between runs.
    entries = [
        {'source': node, 'target': node, 'weight': 0}
        for node in dict.fromkeys(nodes)
        if node not in self_nodes
    ]

    if entries:
        # DataFrame.append was removed in pandas 2.0; concat is the
        # supported equivalent.
        df = pd.concat([df, pd.DataFrame(entries)], ignore_index=True)

    # Encode nodes as the string form of their position in `nodes`.
    lc_encoding = {node: str(position) for position, node in enumerate(nodes)}
    # Plain dict lookup (not Series.map) so that an unknown node raises
    # KeyError instead of silently becoming NaN.
    df['source'] = df['source'].apply(lambda x: lc_encoding[x])
    df['target'] = df['target'].apply(lambda x: lc_encoding[x])

    df = df.rename(columns={'source': 'from', 'target': 'to', 'weight': 'distance'})
    df['distance'] = df['distance'].astype(float)

    # Write the id list and the edge list.
    with open(file_nodes, 'w') as f:
        f.write(",".join(lc_encoding.values()))
    df.to_csv(file_edges, index=False)
    print('Files written succesfully')

    return df, lc_encoding

#### Load Datasets

In [17]:
# Path (or list of paths) to the newline-delimited JSON tweet dump.
path_data = ''
# The path variable defined above is `path_data`; the previous reference
# to `path_to_data` raised NameError on a fresh kernel.
json_file = load_json(path_data)
df_data = pd.DataFrame(json_file)

# Path to the White Helmets CSV export.
path_wh = ''
wh_df = pd.read_csv(path_wh)

#### Use df_data to get correct parent mappings for retweets

In [None]:
# Align the tweet-id column name with the White Helmets dataset.
df_data = df_data.rename(columns={'tweet_id_h': 'nodeID'})

# Twitter records only, one row per nodeID (the last occurrence wins).
wh_df = (
    wh_df
    .loc[lambda frame: frame['platform'] == 'twitter']
    .drop_duplicates('nodeID', keep='last')
    .reset_index(drop=True)
)

# Retweets only, joined with df_data to recover the retweet chain. The
# parentID / rootID columns present before the join are dropped and replaced
# by the ids coming from df_data.
wh_df_rt = (
    wh_df
    .loc[lambda frame: frame['actionType'] == 'retweet']
    .merge(df_data, on='nodeID')
    .drop(columns=[
        'parentID',
        'rootID',
        'tweet_postdate',
        'tweeter_UTC_Offset',
        'tweeter_followers',
    ])
    .rename(columns={
        'retweeted_from_tweet_id_h': 'parentID',
        'source_tweet_id_h': 'rootID',
    })
)

#### Get correct parentUserID for parentID field

In [28]:
# parentID -> parentUserID lookup: the author of each tweet, keyed by tweet id.
# NOTE(review): this cell used to read `wh_df_new`, which is not defined
# anywhere in the notebook (NameError on a fresh kernel). `wh_df` (Twitter
# rows, de-duplicated on nodeID) is assumed to be the intended frame —
# confirm.
# rename() without inplace avoids SettingWithCopyWarning on the column slice.
wh_mapping = wh_df[['nodeID', 'nodeUserID']].rename(
    columns={'nodeID': 'parentID', 'nodeUserID': 'parentUserID'}
)

#### Include parent user ID for the retweet dataframe
# Inner join: retweets whose parent tweet is not in wh_mapping are dropped.
wh_df_rt = pd.merge(wh_mapping, wh_df_rt, on='parentID')
# .copy() so that the column assignment below acts on an independent frame.
wh_df_rt = wh_df_rt[['nodeID', 'nodeUserID', 'parentID', 'parentUserID', 'nodeTime']].copy()

### nodeTime to datetime object
wh_df_rt['nodeTime'] = pd.to_datetime(wh_df_rt['nodeTime'])

# Build retweet network: one row per (retweeter, original author) pair,
# weighted by the number of retweets.
rt_graph = wh_df_rt.groupby(['nodeUserID', 'parentUserID']).size().reset_index(name='weight')

### WH-Daily

#### Build directed graph for Retweet Network

In [194]:
# Directed retweet graph: edges run from the original author (parentUserID)
# to the retweeting user (nodeUserID) and carry the retweet count as weight.
g_nx = nx.from_pandas_edgelist(
    rt_graph,
    source='parentUserID',
    target='nodeUserID',
    edge_attr=['weight'],
    create_using=nx.DiGraph(),
)
print('Nodes:',g_nx.number_of_nodes(), 'Edges:', g_nx.number_of_edges())

# Strongly connected components, largest first.
lc_strong = list(nx.strongly_connected_components(g_nx))
lc_strong.sort(key=len, reverse=True)

# Subgraph induced by the largest strongly connected component.
rt_lc = g_nx.subgraph(lc_strong[0])

# Size of the largest strongly connected component.
print('Nodes:', rt_lc.number_of_nodes(), 'Edges:', rt_lc.number_of_edges())

# Users belonging to the largest strongly connected component.
rt_nodes = list(rt_lc)

#### Get daily number of activities, and fill inactive days with 0

In [199]:
# Retweets made by users of the largest strongly connected component,
# tagged with the calendar day (YYYY-MM-DD string) on which they happened.
rt_df = (
    wh_df_rt[wh_df_rt['nodeUserID'].isin(rt_nodes)]
    .reset_index(drop=True)
    .assign(dailyTime=lambda frame: frame['nodeTime'].dt.strftime('%Y-%m-%d'))
)

# Number of retweets per user and day; the day string is parsed back into
# a datetime so that a date range can be built from it.
rt_df_daily_acts = (
    rt_df
    .groupby(['nodeUserID', 'dailyTime'])
    .size()
    .reset_index(name='daily_acts')
    .assign(dailyTime=lambda frame: pd.to_datetime(frame['dailyTime']))
)

# Insert the days on which a user was inactive.
rt_new_df = add_missing_data_git(rt_df_daily_acts, entity='nodeUserID')

#### Compute monthly activities for each node

In [44]:
# Move the day index into a 'nodeTime' column and derive the month
# (YYYY-MM string) of every row.
rt_new_df = (
    rt_new_df
    .reset_index(level=0)
    .rename(columns={'index': 'nodeTime'})
    .assign(monthTime=lambda frame: frame['nodeTime'].dt.strftime('%Y-%m'))
)

# Total number of activities per user and month.
rt_month_acts = (
    rt_new_df
    .groupby(['nodeUserID', 'monthTime'])['daily_acts']
    .sum()
    .reset_index(name='monthly_acts')
)

# Average monthly number of activities per user.
rt_avg_monthly_acts = (
    rt_month_acts
    .groupby('nodeUserID')['monthly_acts']
    .mean()
    .reset_index(name='avg_acts')
)
print(rt_avg_monthly_acts.describe())

#### Compute avg. number of daily activities per user

In [46]:
# Mean of daily_acts per user over every row of rt_new_df (one row per user
# and day, produced by add_missing_data_git above).
rt_avg_daily_acts = (
    rt_new_df
    .groupby('nodeUserID')['daily_acts']
    .mean()
    .reset_index(name='avg_acts')
)
print(rt_avg_daily_acts.describe())

#### Generating timestep samples

In [None]:
# Output locations for the encoded edge list and node list.
edgelist_path = ''
nodelist_path = ''
# generate_input_files expects a networkx graph as its first argument (it
# calls nx.to_pandas_edgelist on it). The activity DataFrame rt_new_df was
# passed here before; the graph whose nodes are rt_nodes is rt_lc.
edgelist, encoding = generate_input_files(rt_lc, rt_nodes, nodelist_path, edgelist_path)

In [None]:
# Output location of the pickled day x user activity matrix.
filename = ''
# rt_new_df has no 'label' column at this point (that column is only
# created for the cluster-level analysis), so pivoting on it raised
# KeyError. The per-user identifier of this frame is 'nodeUserID'.
# NOTE(review): if the columns were meant to be the integer ids returned by
# generate_input_files, map nodeUserID through `encoding` first — confirm.
rt_day_acts_formatted = rt_new_df.pivot_table(values='daily_acts', index='nodeTime', columns='nodeUserID', aggfunc='first')
rt_day_acts_formatted.to_pickle(filename)

## WH-Clustering 

#### Transform digraph to undirected

In [48]:
# Undirected copy of the largest strongly connected component; this is the
# graph handed to the Louvain step below.
# NOTE(review): when both (u, v) and (v, u) exist, networkx merges their
# attributes into one edge rather than summing the weights — confirm this is
# acceptable for the weighted community detection.
rt_lc_un = rt_lc.to_undirected()

In [52]:
# Number of nodes in the undirected graph.
rt_lc_un.number_of_nodes()

6376

#### Run the Louvain algorithm on the connected graph to detect communities

In [60]:
### compute the best partition using Louvain
partition = community.best_partition(rt_lc_un)

In [61]:
# Tally the number of users assigned to each Louvain community.
community_un = {}
for cluster_id in partition.values():
    if cluster_id in community_un:
        community_un[cluster_id] += 1
    else:
        community_un[cluster_id] = 1

In [64]:
# Display the community-id -> member-count mapping built above.
community_un

{0: 961,
 1: 2296,
 2: 1345,
 3: 787,
 4: 165,
 5: 459,
 6: 106,
 7: 38,
 8: 193,
 9: 7,
 10: 4,
 11: 9,
 12: 2,
 13: 2,
 14: 2}

#### Assign Users to corresponding clusters found with Louvain

In [106]:
# Community of the retweeting user; users outside the partition get NaN.
wh_df_rt = wh_df_rt.assign(label=wh_df_rt['nodeUserID'].map(partition))

# Retweets whose retweeter and original author both belong to the largest
# strongly connected component, annotated with the community of each side.
rt_cluster = (
    wh_df_rt
    .loc[lambda frame: frame['nodeUserID'].isin(rt_nodes)
         & frame['parentUserID'].isin(rt_nodes)]
    .reset_index(drop=True)
    .assign(
        label_source=lambda frame: frame['nodeUserID'].map(partition),
        label_parent=lambda frame: frame['parentUserID'].map(partition),
    )
)

# Community-level edge list: number of retweets between each ordered pair
# of communities.
rt_cluster_graph = (
    rt_cluster
    .groupby(['label_source', 'label_parent'])
    .size()
    .reset_index(name='weight')
)

#### Build directed graph for clusters

In [228]:
# Directed community graph: edges run from the community of the original
# author (label_parent) to the community of the retweeter (label_source).
g_nx_cluster = nx.from_pandas_edgelist(
    rt_cluster_graph,
    source='label_parent',
    target='label_source',
    edge_attr=['weight'],
    create_using=nx.DiGraph(),
)

In [230]:
# All community ids that appear in the community graph.
rt_nodes_cluster = list(g_nx_cluster)
print('Nodes:', g_nx_cluster.number_of_nodes(), 'Edges: ', g_nx_cluster.number_of_edges())

# Retweets whose retweeter belongs to one of those communities, with an
# integer community id and the calendar day (YYYY-MM-DD string).
rt_df_cluster = (
    wh_df_rt[wh_df_rt['label'].isin(rt_nodes_cluster)]
    .reset_index(drop=True)
    .assign(label=lambda frame: frame['label'].astype(int))
    .assign(dailyTime=lambda frame: frame['nodeTime'].dt.strftime('%Y-%m-%d'))
)

# Number of retweets per community and day, with the day parsed back into
# a datetime.
rt_df_cluster_acts = (
    rt_df_cluster
    .groupby(['label', 'dailyTime'])
    .size()
    .reset_index(name='daily_acts')
    .assign(dailyTime=lambda frame: pd.to_datetime(frame['dailyTime']))
)

# Average daily activity per community. This is computed before the missing
# days are inserted, so only days with at least one activity contribute.
rt_avg_daily_acts_cluster = (
    rt_df_cluster_acts
    .groupby('label')['daily_acts']
    .mean()
    .reset_index(name='avg_acts')
)

# Insert the days on which a community was inactive, then move the day
# index into a 'nodeTime' column.
rt_new_df_cluster = (
    add_missing_data_git(rt_df_cluster_acts, entity='label')
    .reset_index(level=0)
    .rename(columns={'index': 'nodeTime'})
)

#### Generating timestep samples

In [None]:
# Output locations for the encoded community edge list and node list.
edgelist_path = ''
nodelist_path = ''
# generate_input_files expects a networkx graph as its first argument (it
# calls nx.to_pandas_edgelist on it). The activity DataFrame
# rt_new_df_cluster was passed here before; the graph whose nodes are
# rt_nodes_cluster is g_nx_cluster.
edgelist, encoding = generate_input_files(g_nx_cluster, rt_nodes_cluster, nodelist_path, edgelist_path)

In [261]:
# Output location of the pickled day x community activity matrix.
filename = ''
# One row per day, one column per community id, cells hold daily_acts.
rt_day_acts_formatted = pd.pivot_table(
    rt_new_df_cluster,
    values='daily_acts',
    index='nodeTime',
    columns='label',
    aggfunc='first',
)
rt_day_acts_formatted.to_pickle(filename)