In [None]:
import pandas as pd
import numpy as np

In [2]:
import json

In [3]:
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
from networkx.algorithms.community import greedy_modularity_communities
from networkx.algorithms.community import k_clique_communities

In [63]:
import collections

### Load Retweet Chain

In [6]:
def load_json(fn):

    json_data = []
    
    if type(fn) == str:
        with open(fn,'rb') as f:
            for line in f:
                json_data.append(json.loads(line))
    else:
        for fn0 in fn:
            with open(fn0,'rb') as f:
                for line in f:
                    json_data.append(json.loads(line))

    return(json_data)

In [7]:
path_to_chain = ''
chain_json = load_json(path_to_chain)
df_chain = pd.DataFrame(chain_json)

In [8]:
df_chain.rename(columns={'tweet_id_h':'nodeID'}, inplace=True)

### Load White-Helmets data

In [None]:
wh_df = pd.read_csv('')

#### Map records with correct parentID for retweets

In [11]:
#Twitter data
wh_df = wh_df.loc[wh_df['platform'] == 'twitter'].reset_index(drop=True)

In [12]:
#Drop dups
wh_df_filt = wh_df.drop_duplicates('nodeID', keep='last').reset_index(drop=True)

In [13]:
#### Get only action types retweets
wh_df_rt = wh_df_filt.loc[wh_df_filt['actionType'] == 'retweet'].reset_index(drop=True)

In [14]:
### Get correct retweet chain
wh_df_rt = pd.merge(wh_df_rt, df_chain, on='nodeID')

In [15]:
wh_df_rt = wh_df_rt.drop(columns=['parentID', 'rootID','tweet_postdate', 'tweeter_UTC_Offset', 'tweeter_followers'])

In [16]:
wh_df_rt.rename(columns={'retweeted_from_tweet_id_h':'parentID', 'source_tweet_id_h':'rootID'}, inplace=True)

In [17]:
final_wh_df_rt = wh_df.loc[wh_df['actionType'] != 'retweet'].reset_index(drop=True)

In [None]:
wh_df_new = pd.concat([final_wh_df_rt, wh_df_rt], ignore_index=True)

#### Get correct parentUserID for parentID field

In [19]:
#ParentUserID - ParentID mapping
wh_mapping = wh_df_new[['nodeID', 'nodeUserID']]

In [None]:
wh_mapping.rename(columns={'nodeID':'parentID', 'nodeUserID':'parentUserID'}, inplace=True)

In [21]:
#### Include parent user ID for the retweet dataframe
wh_df_rt = pd.merge(wh_mapping, wh_df_rt, on='parentID')
wh_df_rt = wh_df_rt[['nodeID', 'nodeUserID', 'parentID', 'parentUserID', 'nodeTime']]

In [22]:
### nodeTime to datetime object
wh_df_rt['nodeTime'] = pd.to_datetime(wh_df_rt['nodeTime'])

In [30]:
#Build retweet network
rt_graph = wh_df_rt.groupby(['nodeUserID', 'parentUserID']).size().reset_index(name='weight')

#### Build directed graph for retweets

In [77]:
#Create digraph for retweet diffusion
g_nx = nx.from_pandas_edgelist(rt_graph,'parentUserID', 'nodeUserID', ['weight'], create_using=nx.DiGraph())

In [78]:
#Get largest strongly connected component
lc_strong = sorted(nx.strongly_connected_components(g_nx), key=len, reverse=True)

In [79]:
#Get Subgraph of the largest connected component.
rt_lc = g_nx.subgraph(lc_strong[0])

In [80]:
#Number of nodes and edges in largest connected component
rt_lc.number_of_nodes(), rt_lc.number_of_edges()

(6376, 96316)

In [81]:
#Get list of nodes in largest connected component
rt_nodes = list(rt_lc.nodes())

#### Get daily number of activities, and fill inactive days with 0

In [82]:
#Keep only activities for nodes in largest connected component
rt_df = wh_df_rt.loc[wh_df_rt['nodeUserID'].isin(rt_nodes)].reset_index(drop=True)

In [83]:
#Create a dailyTime field
rt_df['dailyTime'] = rt_df['nodeTime'].dt.strftime('%Y-%m-%d')

#Compute daily activities of each node
rt_df_daily_acts = rt_df.groupby(['nodeUserID','dailyTime']).size().reset_index(name='daily_acts')

#dailyTime to datetime object
rt_df_daily_acts['dailyTime'] = pd.to_datetime(rt_df_daily_acts['dailyTime'])

In [84]:
max_date =  max(rt_df_daily_acts['dailyTime'])
min_date = min(rt_df_daily_acts['dailyTime'])

idx = pd.date_range(min_date, max_date)
rt_df_daily_acts.set_index('dailyTime', inplace=True)

df_concat = []

for user, group in rt_df_daily_acts.groupby('nodeUserID'):
    s = group['daily_acts']
    
    s = s.reindex(idx, fill_value=0)
   
    df = pd.DataFrame(s)
    df['nodeUserID'] = user
    
    df_concat.append(df)
    
rt_new_df = pd.concat(df_concat)

In [85]:
rt_avg_df = rt_new_df.groupby('nodeUserID')['daily_acts'].mean().reset_index(name='avg_daily')

In [89]:
rt_avg_df.describe()

Unnamed: 0,avg_daily
count,6376.0
mean,0.106005
std,0.283105
min,0.002532
25%,0.007595
50%,0.022785
75%,0.081013
max,5.303797


### Filter out users with an avg. number of activities less than the global avg.

In [91]:
rt_nodes = rt_avg_df.loc[rt_avg_df['avg_daily'] > 0.106005]

In [92]:
rt_nodes_filter = list(rt_nodes['nodeUserID'])

In [93]:
#Keep only activities for nodes in largest connected component
rt_df_filter = wh_df_rt.loc[wh_df_rt['nodeUserID'].isin(rt_nodes_filter)].reset_index(drop=True)

In [95]:
#Build retweet network
rt_graph_filter = rt_df_filter.groupby(['nodeUserID', 'parentUserID']).size().reset_index(name='weight')

In [96]:
#Create digraph for retweet diffusion
g_nx_filter = nx.from_pandas_edgelist(rt_graph_filter,'parentUserID', 'nodeUserID', ['weight'], create_using=nx.DiGraph())

In [97]:
#Get largest strongly connected component
lc_strong_filter = sorted(nx.strongly_connected_components(g_nx_filter), key=len, reverse=True)

In [98]:
#Get Subgraph of the largest connected component.
rt_lc_filter = g_nx_filter.subgraph(lc_strong_filter[0])

In [102]:
#Number of nodes and edges in largest connected component
rt_lc_filter.number_of_nodes(), rt_lc_filter.number_of_edges()

(1247, 33655)

In [101]:
rt_lc_filter_nodes = list(rt_lc_filter.nodes())

### Get Only activities for users in filtered strongly connected component

In [104]:
rt_new_df_filter = rt_new_df.loc[rt_new_df['nodeUserID'].isin(rt_lc_filter_nodes)]

#### Formatting to match DCRNN Input

In [None]:
#Set dailyTime as a column in the df
rt_new_df_filter.reset_index(level=0, inplace=True)
rt_new_df_filter.rename(columns={'index':'nodeTime'}, inplace=True)

In [109]:
#### Build output files for DCRNN

#Turn largest connected component subgraph into a dataframe
rt_lc_df = nx.to_pandas_edgelist(rt_lc_filter)
rt_lc_df['weight'] = 1
#Find nodes with self-loops
self_nodes = rt_lc_df.loc[rt_lc_df['source'] == rt_lc_df['target']]
self_nodes = list(self_nodes['source'])

nodes = list(set(rt_lc_filter_nodes) - set(self_nodes))

#insert self-loops with weight 0 for records without self loops
entries = [{'source':node, 'target':node, 'weight':0} for node in nodes]
if len(entries) > 0:
    rt_lc_df= rt_lc_df.append(entries)

#encode the largest connected component into integers
rt_lc_encoding = {}
for _i, node in enumerate(rt_lc_filter_nodes):
    rt_lc_encoding[node] = str(_i)
rt_lc_df['source'] = rt_lc_df['source'].apply(lambda x: rt_lc_encoding[x])
rt_lc_df['target'] = rt_lc_df['target'].apply(lambda x: rt_lc_encoding[x])

#rename columns
rt_lc_df.rename(columns={'source':'from', 'target':'to', 'weight':'distance'}, inplace=True)
rt_lc_df['distance']= rt_lc_df['distance'].astype(float)
#write id list to file
with open('', 'w') as f:
    f.write(",".join(list(rt_lc_encoding.values())))
rt_lc_df.to_csv('', index=False)

- Features file

In [None]:
rt_new_df_filter['label'] = rt_new_df_filter['nodeUserID'].map(rt_lc_encoding)

In [113]:
rt_day_acts_formatted = rt_new_df_filter.pivot_table(values='daily_acts', index='nodeTime', columns='label', aggfunc='first')
rt_day_acts_formatted.to_pickle('')

In [143]:
rt_filter_avg.describe()

Unnamed: 0,avg_acts
count,1247.0
mean,0.423467
std,0.52066
min,0.106329
25%,0.159494
50%,0.258228
75%,0.473418
max,5.303797
