# 2021-10-26 Timeline stats

Beginnings of statistical analysis based on the timeline stats of users, NOT the clustering word data yet. Also should I get the follows?

In [35]:
import os
import pprint
import tqdm
import json
import glob
import jsonlines
import pandas as pd
import pickle
import numpy as np
import matplotlib.pyplot as plt
import sys
from collections import Counter
sys.path.insert(0, '../src/d07_visualisation/')
import datetime
import h5py
from typing import NamedTuple
import re
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [2]:
#load in search hashtags
with open('../references/search_hashtags.txt', 'r') as f:
    search_hashtags = f.readlines()
    search_hashtags = [i.replace('\n', '') for i in search_hashtags]
    search_hashtags = [i.replace('#', '') for i in search_hashtags]
    search_hashtags = [i.lower() for i in search_hashtags]

## Parameters

In [34]:
group_num = 1

## Load in data and basic preprocessing

In [3]:
interactions_hdf5_file = '/home/hubert/DPhil_Studies/2021-04_Study_A_Diffusion/data/03_processed/interactions.hdf5'

# print(h5py.File(interactions_hdf5_file, 'r').keys())

df = pd.read_hdf(interactions_hdf5_file,'interactions_group_1')

In [4]:
def convert_in_reply_to(x):
    if x:
        return x[0]
    else:
        return None

df['in_reply_to'] = df['in_reply_to'].apply(convert_in_reply_to)

In [5]:
# remove self replies

df = df[df['author_id']!=df['in_reply_to']]

In [6]:
# generate counts of internal mentions/quotes/replies

df['internal'] = df['in_reply_to'].isin(df['author_id'])
# df['internal_mentions'] = df['mentions'].apply(lambda x: any(df['author_id'].isin(x)))

# df['internal_replies'] = df['replies'].apply(lambda x: any(df['tweet_id'].isin(x)))
# df['internal_quotes'] = df['quotes'].apply(lambda x: any(df['tweet_id'].isin(x)))

In [7]:
internal = df['internal'].sum()
print(f'Internal replies (not self replies): {internal}')
print(f'Total df length: {len(df)}')


Internal replies (not self replies): 275164
Total df length: 7939920


In [8]:
user_interaction_counts = df[['author_id','tweet_id']].groupby('author_id').count()

user_interaction_counts.describe()

Unnamed: 0,tweet_id
count,4216.0
mean,1883.282732
std,2837.788446
min,1.0
25%,148.75
50%,763.0
75%,2498.25
max,26390.0


### Multiple participation in protest networks
Another preprocessing/data description step: How many how many users participate in more than one protest network in this time period **in their interactions**?

In [9]:
for hashtag in search_hashtags:
    df['vocab:#'+hashtag] = df['contains_hashtags'].apply(lambda x: any(hashtag.lower() == item.lower() for item in x))

#columns with vocab: in them
vocab_colnames = [i for i in list(df.columns) if 'vocab:#' in i]

df['network_participation'] = df[vocab_colnames].sum(axis=1)
df['multiple_participation'] = df['network_participation']>1

df_participate = df[['author_id','multiple_participation']].groupby('author_id').sum()

print(df_participate.describe())

above_one = (df_participate >= 1).sum()
print(f'{above_one} users participate in more than one protest network in interactions')

       multiple_participation
count             4216.000000
mean                 3.486243
std                 21.048806
min                  0.000000
25%                  0.000000
50%                  0.000000
75%                  1.000000
max                855.000000
multiple_participation    1302
dtype: int64 users participate in more than one protest network in interactions


In [10]:
df

Unnamed: 0,tweet_id,author_id,created_at,in_reply_to,mentions,quotes,replies,contains_hashtags,internal,vocab:#metoo,...,vocab:#noustoutes,vocab:#stilleforopptak,vocab:#nårdansenstopper,vocab:#nårmusikkenstilner,vocab:#memyös,vocab:#timesup,vocab:#niere,vocab:#jotambe,network_participation,multiple_participation
0,1000884424190365697,1005675566,2018-05-27,979806883472175105,[],[],[1000857122198966278],[],False,False,...,False,False,False,False,False,False,False,False,0,False
1,1000884337531916289,1005675566,2018-05-27,25073877,[],[],[1000837182297464832],[],False,False,...,False,False,False,False,False,False,False,False,0,False
2,1000866554291130368,1005675566,2018-05-27,2467791,[2467791],[],[1000862340450078720],[],True,False,...,False,False,False,False,False,False,False,False,0,False
3,1000767799898370048,1005675566,2018-05-27,18277655,[],[],[1000371463705235456],[],False,False,...,False,False,False,False,False,False,False,False,0,False
4,1000668370457579520,1005675566,2018-05-27,872909646,[16873455],[],[1000378615580774408],[],False,False,...,False,False,False,False,False,False,False,False,0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8581033,950212082313650176,99784623,2018-01-08,,[],[950209985891446784],[],"[TIMESUP, TheFutureIsFemale]",False,False,...,False,False,False,False,False,True,False,False,1,False
8581034,950205269983793155,99784623,2018-01-08,,[],[950205133568307200],[],[TIMESUP],False,False,...,False,False,False,False,False,True,False,False,1,False
8581035,999801367886692352,99784623,2018-05-24,,[17525171],[],[],[MeToo],False,True,...,False,False,False,False,False,False,False,False,1,False
8581036,998961892600229888,99784623,2018-05-22,,[24216951],[],[],"[13ReasonsWhy, MeToo]",False,True,...,False,False,False,False,False,False,False,False,1,False


In [11]:
# Generate simple counter of users either mentioned or replied to inside or outside the current author list

users_actually_interacted_with = Counter()

for row in df.itertuples():
    try:
        for mentioned in row.mentions:
            users_actually_interacted_with[mentioned] += 1
        users_actually_interacted_with[row.in_reply_to] += 1
    except:
        print(row)
        break

Potential sampling again from:

In [12]:
users_actually_interacted_with.most_common()[:10]

[(None, 2010118),
 ('25073877', 125172),
 ('1917731', 95810),
 ('1367531', 91961),
 ('759251', 69922),
 ('14247236', 62094),
 ('32871086', 56299),
 ('49698134', 54523),
 ('2836421', 45869),
 ('807095', 31060)]

To match on activity, let's collect their base timelines and see how much they tweeted

To determine the output variables from this continuous time scope we need the peaks again.

In [13]:
def unit_conv(val):
    return datetime.datetime.strptime('2017-10-16', '%Y-%m-%d') + datetime.timedelta(days=int(val))

#obtain peak times again
with h5py.File('/home/hubert/DPhil_Studies/2021-04_Study_A_Diffusion/data/02_intermediate/FAS_peak_analysis.hdf5', 'r') as f:
    FAS_peaks = f['peak_detections']

    most_prominent_peaks = {}
    for name, h5obj in FAS_peaks.items():
        if len(h5obj['prominences']) == 0:
            continue
        max_prominence = np.argmax(h5obj['prominences'])
        most_prominent_peaks[name] = unit_conv(h5obj['peak_locations'][max_prominence])

In [14]:
# let's just consider 私も and tôicũngvậy
most_prominent_peaks['私も']

datetime.datetime(2017, 11, 30, 0, 0)

Start creating the df needed for stats. Each user needs to have
* normal activity before and after
* hashtagged activity before and after
* number of interactions with (each) protest network
* protest networks participated in (organise by pairs?)
* before and after peaks of each network participation


In [15]:
# maybe start with splitting by hashtag
# first split by multiple participation. These are the interactions we're most interested in

multiple_ints_only = df[df['multiple_participation']]

In [16]:
# do by user. first get list of all participating networks for each user

# for author_id in df['author_id'].unique():

df['author_total_hashtags'] = df['contains_hashtags'].apply(lambda x: [i for i in x if i.lower() in search_hashtags])

In [17]:
# get peaks for each hashtag into the pandas
for ht in search_hashtags:
    if ht != 'وأناكمان':
        with_sym_ht = f'#{ht}'
        df[f'peak_{ht}'] = df['created_at'].apply(lambda x: x > most_prominent_peaks[ht].date())

In [88]:
df.columns

Index(['tweet_id', 'author_id', 'created_at', 'in_reply_to', 'mentions',
       'quotes', 'replies', 'contains_hashtags', 'internal', 'vocab:#metoo',
       'vocab:#balancetonporc', 'vocab:#moiaussi', 'vocab:#نه_یعنی_نه',
       'vocab:#米兔', 'vocab:#我也是', 'vocab:#وأناكمان', 'vocab:#gamani',
       'vocab:#tôicũngvậy', 'vocab:#私も', 'vocab:#watashimo', 'vocab:#나도',
       'vocab:#나도당했다', 'vocab:#גםאנחנו', 'vocab:#ятоже', 'vocab:#ricebunny',
       'vocab:#enazeda', 'vocab:#anakaman', 'vocab:#yotambien',
       'vocab:#sendeanlat', 'vocab:#kutoo', 'vocab:#withyou', 'vocab:#wetoo',
       'vocab:#cuentalo', 'vocab:#quellavoltache', 'vocab:#niunamenos',
       'vocab:#woyeshi', 'vocab:#myharveyweinstein', 'vocab:#noustoutes',
       'vocab:#stilleforopptak', 'vocab:#nårdansenstopper',
       'vocab:#nårmusikkenstilner', 'vocab:#memyös', 'vocab:#timesup',
       'vocab:#niere', 'vocab:#jotambe', 'network_participation',
       'multiple_participation', 'author_total_hashtags', 'peak_metoo',

## Vocabulary Input: Python Clustering Eval Results

Now for vocabulary input. First load in the relevant files

In [None]:
# load bsc model, features names, csr, and user ('document') ids

bsc_model_list = glob.glob(os.path.join('/home/hubert/DPhil_Studies/2021-04_Study_A_Diffusion/data/05_model_output/01_group','*.obj'))

selected_model = 300

selected_model = [i for i in bsc_model_list if int(re.split('[_.]',i)[-6])==selected_model][0]
print(selected_model)

with open(selected_model, 'rb') as f:
    model = pickle.load(f)

# open mapping also get relevent features names and user ids

feature_mapping_file = '/home/hubert/DPhil_Studies/2021-04_Study_A_Diffusion/data/02_intermediate/01_group/mapping_ngram_23.obj'
with open(feature_mapping_file, 'rb') as f:
    feature_names = pickle.load(f)

# also load in csr
csr_file = '/home/hubert/DPhil_Studies/2021-04_Study_A_Diffusion/data/02_intermediate/01_group/user_count_mat_ngram_23.obj'
with open(csr_file,'rb') as f:
    csr = pickle.load(f)

# get also the userlist that was used to generate this for 'document' names

user_doc_ids = sorted(glob.glob(os.path.join('/home/hubert/DPhil_Studies/2021-04_Study_A_Diffusion/data/01_raw','timeline*.jsonl')))
user_doc_ids = [re.split('[_.]',i)[-2] for i in user_doc_ids]

In [None]:
# function to extract top phrases from cluster
def extract_cluster_words(feature_names, user_ids, csr, model, cluster_num):
    row_ind, col_ind = model.get_indices(cluster_num)

    csr_cluster = csr[row_ind][:,col_ind]
    feature_csr_sums = csr_cluster.sum(axis=0)
    top_features = np.argsort(feature_csr_sums)
    top_features = np.asarray(top_features).flatten()
    return [feature_names[i] for i in top_features[-1:-20:-1]]

In [None]:
cluster_words_dict = {}

for i in range(300):

    cluster_words_dict[i] = extract_cluster_words(
        feature_names,
        user_doc_ids,
        csr,
        model,
        i
    )

    if len(cluster_words_dict[i]) < 19:
        cluster_words_dict[i] = cluster_words_dict[i] + [None]*(19-len(cluster_words_dict[i]))

In [None]:
df_cluster_words = pd.DataFrame.from_dict(cluster_words_dict)

In [None]:
# select a cluster of words
selected_cluster = 0

## Finally, a statistical model

In [None]:
# before Vietnamese peak
df[~df['peak_tôicũngvậy'] & df['vocab:#tôicũngvậy']]
# after Vietnamese peak
df[df['peak_tôicũngvậy'] & df['vocab:#tôicũngvậy']]
# before Japanese peak:
df[~df['peak_私も'] & df['vocab:#私も']]
# after Japanese peak:
df[df['peak_私も'] & df['vocab:#私も']]

In [56]:
# FAS analysis file
FAS_peak_analysis_file = '/home/hubert/DPhil_Studies/2021-04_Study_A_Diffusion/data/02_intermediate/FAS_peak_analysis.hdf5'

class daterange(NamedTuple):
    start: str
    end: str

def daterange_from_group_num(group_num):
    with h5py.File(FAS_peak_analysis_file, 'r') as f:
        x = f['segments']['selected_ranges'][0]
        res = daterange(
            start = x[0].decode(),
            end = x[1].decode()
        )
    return res

# insert activity of users
def date_to_array_index(date, daterange):
    return (date - datetime.datetime.strptime(daterange.start, '%Y-%m-%d').date()).days

def insert_activity_of_user(row, daterange=None, activity_file = None, activity='hashtagged', hashtag=None, group_num = 1, ht_row_mapping = None):

    if activity == 'hashtagged':
        if row[f'peak_{hashtag}']:
            after = date_to_array_index(row['created_at'], daterange)
            # get ht row index
            assert hashtag in ht_row_mapping
            ht_index = ht_row_mapping.index(hashtag)
            result = activity[f'group_{group_num}'][row['author_id']][activity][ht_index,after:]
        else:
            after = date_to_array_index(row['created_at'], daterange)
            # get ht row index
            assert hashtag in ht_row_mapping
            ht_index = ht_row_mapping.index(hashtag)
            result = activity[f'group_{group_num}'][row['author_id']][activity][ht_index,after:]
    else:
        if row[f'peak_{hashtag}']:
            after = date_to_array_index(row['created_at'], daterange)
            # print(f'group_{group_num}', row['author_id'], after)
            result = activity_file[f'group_{group_num}'][row['author_id']][activity][after:]
        else:
            up_to = date_to_array_index(row['created_at'], daterange)
            # print(f'group_{group_num}', row['author_id'], up_to)
            result = activity_file[f'group_{group_num}'][row['author_id']][activity][:up_to]

    if result:
        return np.sum(result)/len(result)
    else:
        return 0

In [84]:
activity_file = '/home/hubert/DPhil_Studies/2021-04_Study_A_Diffusion/data/03_processed/activity_counts.hdf5'

with h5py.File(activity_file, 'r') as f:
    # print(f.keys())
    x = f['group_1']['1005675566']['hashtagged'].attrs['feature_order']
    ht_row_mapping = x.split(';')

In [58]:
group_daterange = daterange_from_group_num(group_num)

with h5py.File(activity_file, 'r') as f:
    for ht in search_hashtags:
        if ht != 'وأناكمان':
            print(ht)
            df[f'activity_{ht}_normal'] = df.apply(
                insert_activity_of_user,
                daterange = group_daterange,
                activity_file = f,
                hashtag = ht,
                activity='normal',
                axis=1
            )
# ht_row_mapping = activity[f'group_{group_num}'][row['author_id']]['feature_order']
# ht_row_mapping = ht_row_mapping.split(':;')

metoo


KeyboardInterrupt: 

In [60]:
# save progress
save_file = '/home/hubert/DPhil_Studies/2021-04_Study_A_Diffusion/data/06_reporting/combined_analysis.hdf5'

df.to_hdf(save_file, 'group_1')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block2_values] [items->Index(['tweet_id', 'author_id', 'created_at', 'in_reply_to', 'mentions',
       'quotes', 'replies', 'contains_hashtags', 'author_total_hashtags'],
      dtype='object')]

  pytables.to_hdf(


In [75]:
ht_a = 'metoo'
ht_b = 'balancetonporc'
filtered =  df[~df[f'peak_{ht}'] & df[f'vocab:#{ht_a}'] & df[f'vocab:#{ht_b}']]

In [79]:
num_interacts = filtered.groupby(['author_id']).agg(int_pre_peak = pd.NamedAgg(column="tweet_id", aggfunc="count"))
print(type(num_interacts))

<class 'pandas.core.frame.DataFrame'>


In [None]:
# print(post peak activity)
post_filtered = df[df[f'peak_{ht}'] & df[f'vocab:#{ht_a}'] & df[f'vocab:#{ht_b}']]
post_filtered.sort_values('created_at', ascending=False).groupby(['author_id']).agg(post_peak_target = pd.NamedAgg(column=f"activity_{ht_b}_normal", aggfunc= lambda x: x.iloc[-1]))

In [77]:
filtered.sort_values('created_at', ascending=False).groupby('author_id').agg(act_pre_peak = pd.NamedAgg(column=f'activity_{ht_b}_normal', aggfunc= lambda x: x.iloc[-1]))

Unnamed: 0_level_0,act_pre_peak
author_id,Unnamed: 1_level_1
1126372568,2
134939868,2
148746108,2
1491181038,2
1514035297,3
161451768,2
16447769,2
1691477732,3
1699266222,2
169949875,2


In [159]:
# do by hashtag. group by users and then whether it was before peak
class stats_df_tuple(NamedTuple):
    ht_target: str
    data: pd.DataFrame

def retrieve_activity_user(row, daterange=None, activity_file = None, activity='hashtagged', hashtag=None, group_num = 1, ht_row_mapping = None):

    if activity == 'hashtagged':
        if row[f'peak_{hashtag}']:
            after = date_to_array_index(row['created_at'], daterange)
            # get ht row index
            assert hashtag in ht_row_mapping
            ht_index = ht_row_mapping.index(hashtag)
            # print(f'group_{group_num}', row['author_id'], after)
            result = activity_file[f'group_{group_num}'][row['author_id']][activity][ht_index,after:]
        else:
            up_to = date_to_array_index(row['created_at'], daterange)
            # get ht row index
            assert hashtag in ht_row_mapping
            ht_index = ht_row_mapping.index(hashtag)
            # print(f'group_{group_num}', row['author_id'], ht_index, up_to)
            result = activity_file[f'group_{group_num}'][row['author_id']][activity][ht_index,:up_to]
    else:
        if row[f'peak_{hashtag}']:
            after = date_to_array_index(row['created_at'], daterange)
            # print(f'group_{group_num}', row['author_id'], after)
            result = activity_file[f'group_{group_num}'][row['author_id']][activity][after:]
        else:
            up_to = date_to_array_index(row['created_at'], daterange)
            # print(f'group_{group_num}', row['author_id'], up_to)
            result = activity_file[f'group_{group_num}'][row['author_id']][activity][:up_to]

    if len(result)>0:
        s = np.sum(result)
        # print(s)
        return s
    else:
        return 0


res = []
with h5py.File(activity_file, 'r') as f:
    for ht_target in search_hashtags[:1]:
        # print(ht_target)
        # take only the rows for before the peak. Group by 
        filtered =  df[~df[f'peak_{ht_target}'] & df[f'vocab:#{ht_target}']]
        num_interacts = filtered.groupby(['author_id']).agg(int_pre_peak = pd.NamedAgg(column="tweet_id", aggfunc="count"))
        act_pre_peak = filtered.sort_values('created_at', ascending=False).groupby('author_id').agg(lambda x: x.iloc[0]).reset_index()
        if act_pre_peak.empty:
            num_interacts['act_pre_peak'] = 0
        else:
            act_pre_peak['act_pre_peak'] = act_pre_peak.apply(
                retrieve_activity_user,
                daterange=group_daterange,
                activity_file=f,
                hashtag=ht_target,
                group_num=1,
                ht_row_mapping = ht_row_mapping,
                axis=1
            )

            num_interacts = num_interacts.join(act_pre_peak[['author_id','act_pre_peak']].set_index('author_id'))

        act_post_peak = df[df[f'peak_{ht_target}'] & df[f'vocab:#{ht_target}']]
        act_post_peak = act_post_peak.sort_values('created_at', ascending=False).groupby('author_id').agg(lambda x: x.iloc[0]).reset_index()

        if act_post_peak.empty:
            num_interacts['act_post_peak'] = 0
        else:
            act_post_peak['act_post_peak'] = act_post_peak.apply(
                retrieve_activity_user,
                daterange=group_daterange,
                activity_file=f,
                hashtag=ht_target,
                group_num=1,
                ht_row_mapping = ht_row_mapping,
                axis=1
            )
            temp = act_post_peak[['author_id','act_post_peak']].set_index('author_id')

            num_interacts = num_interacts.join(temp)
        num_interacts = num_interacts.reset_index().fillna(0)
        num_interacts['ht'] = ht_target
        res.append(stats_df_tuple(ht_target=ht_target, data=num_interacts))

In [162]:
temp.isnull().sum()

act_post_peak    0
dtype: int64

In [149]:
stats_df = pd.concat([i.data for i in res], axis=0)

In [150]:
stats_df

Unnamed: 0,author_id,int_pre_peak,act_pre_peak,act_post_peak,ht
0,10118982,1,9.0,1.0,metoo
1,1013557603,38,77.0,,metoo
2,1023568159,33,43.0,8.0,metoo
3,102496469,2,0.0,,metoo
4,1025741330,12,23.0,1.0,metoo
...,...,...,...,...,...
483,96825619,2,6.0,4.0,timesup
484,97423151,1,0.0,,timesup
485,988406744,2,9.0,,timesup
486,989210214,2,17.0,2.0,timesup


In [157]:
res[0].data

Unnamed: 0,author_id,int_pre_peak,act_pre_peak,act_post_peak,ht
0,10118982,1,9.0,1.0,metoo
1,1013557603,38,77.0,,metoo
2,1023568159,33,43.0,8.0,metoo
3,102496469,2,0.0,,metoo
4,1025741330,12,23.0,1.0,metoo
...,...,...,...,...,...
2002,98606948,8,76.0,1.0,metoo
2003,988406744,9,253.0,3.0,metoo
2004,989210214,2,17.0,,metoo
2005,995054521,100,146.0,,metoo


In [None]:
stats_results = smf.ols('activity_私も_hashtagged ~ activity_私も_normal',
    data=df[df['vocab:#私も']]
)

Matching users on activity: