In [3]:
import numpy as np
import networkx as nx
import datetime
import tqdm
from typing import NamedTuple
import h5py

import time
import random
import pandas as pd
import pickle
from collections import Counter, defaultdict
import os
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import bernoulli, binom
sns.set('talk')
sns.set_style('ticks')

# Load the search hashtags: one per line, stripped of the newline and the
# leading '#', and lower-cased.
# The list contains non-ASCII hashtags, so decode explicitly as UTF-8 instead of
# relying on the platform's default encoding.
with open('../references/search_hashtags.txt', 'r', encoding='utf-8') as f:
    search_hashtags = [
        line.replace('\n', '').replace('#', '').lower()
        for line in f.readlines()
    ]

# This hashtag is excluded from the working list (the reason is not recorded
# here). list.remove raises ValueError if the entry is absent.
search_hashtags.remove('وأناكمان')

In [11]:
# Scan index 24 along the first axis of 'batch_result' and report each of the
# first 12000 entries along the second axis whose values sum to more than zero.
with h5py.File('/home/hubert/DPhil_Studies/2021-04_Study_A_Diffusion/data/06_reporting/ABM_output_group_2_batch_2.hdf5', 'r') as f:
    batch_result = f['batch_result']
    for i in tqdm.tqdm(range(12000)):
        x = batch_result[24, i, :].sum()
        if x > 0:
            print(f'found. {x}')

hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi


# 0. Import real data and setup

In [2]:
################################################################################
# Set Parameters
################################################################################
group_num = 3
hashtag_split = True
ngram_range = '34'
min_user = 100

# Cache controls for the processed ABM dataframe.
overwrite = False
read_in = True

################################################################################
# Set relevant file paths
################################################################################

# All paths hang off the same study directory; build them from one base.
_study_dir = '/home/hubert/DPhil_Studies/2021-04_Study_A_Diffusion'
_data_dir = f'{_study_dir}/data'
_reporting_dir = f'{_data_dir}/06_reporting'

plot_save_path = f'{_study_dir}/results/0{group_num}_group/'

follows_dir = f'{_data_dir}/01_raw/0{group_num}_group/'

activity_file = f'{_data_dir}/03_processed/activity_counts.hdf5'

peak_analysis_file = f'{_data_dir}/02_intermediate/FAS_peak_analysis.hdf5'

# abm df save path.
abm_processed_df_savepath = f'{_reporting_dir}/ABM_processed_df_group_{group_num}.obj'

graph_savepath = f'{_reporting_dir}/ABM_graph_group_{group_num}.obj'

################################################################################
# Define useful functions
################################################################################

def unit_conv(val):
    """Convert a day offset into a datetime, counting from 2017-10-16.

    `val` is passed through int(), so fractional offsets are truncated.
    """
    origin = datetime.datetime(2017, 10, 16)
    offset = datetime.timedelta(days=int(val))
    return origin + offset

def reverse_unit_conv(date):
    """Return the number of whole days between 2017-10-16 and `date`.

    `date` is a 'YYYY-MM-DD' string; dates before 2017-10-16 give a negative
    result.
    """
    parsed = datetime.datetime.strptime(date, '%Y-%m-%d')
    elapsed = parsed - datetime.datetime(2017, 10, 16)
    return elapsed.days

class daterange(NamedTuple):
    """Start and end of a date range, both held as strings.

    Elsewhere in this file both fields are parsed with the '%Y-%m-%d' format.
    """
    # Start date string of the range.
    start: str
    # End date string of the range.
    end: str

def date_to_array_index(date, daterange):
    """Return how many whole days `date` lies after `daterange.start`.

    Both `date` and `daterange.start` are 'YYYY-MM-DD' strings. The result is
    negative when `date` precedes the start of the range.
    """
    date_format = '%Y-%m-%d'
    target = datetime.datetime.strptime(date, date_format)
    origin = datetime.datetime.strptime(daterange.start, date_format)
    return (target - origin).days

def group_peaks_and_daterange(peak_analysis_file, group_num):
    """Load a group's date range and each hashtag's most prominent peak inside it.

    Args:
        peak_analysis_file: path to the HDF5 file holding 'peak_detections' and
            'segments/selected_ranges'.
        group_num: 1-based group number; row `group_num - 1` of
            'selected_ranges' is read.

    Returns:
        A tuple `(most_prominent_peaks, group_date_range, daterange_length)`:
        a dict mapping each entry name under 'peak_detections' to the datetime
        of its highest-prominence peak strictly inside the range (entries with
        no such peak are omitted), the `daterange` of the group, and the length
        of that range in days.
    """
    date_format = '%Y-%m-%d'

    with h5py.File(peak_analysis_file, 'r') as f:
        FAS_peaks = f['peak_detections']
        selected_range = f['segments']['selected_ranges'][int(group_num)-1]
        group_date_range = daterange(
            start = selected_range[0].decode(),
            end = selected_range[1].decode()
        )

        # Parse the range bounds once; they are the same for every hashtag.
        range_start = datetime.datetime.strptime(group_date_range.start, date_format)
        range_end = datetime.datetime.strptime(group_date_range.end, date_format)

        most_prominent_peaks = {}
        for name, h5obj in FAS_peaks.items():

            # (position, location) of every peak strictly inside the range.
            peaks_in_range = [
                (position, location)
                for position, location in enumerate(h5obj['peak_locations'])
                if range_start < unit_conv(location) < range_end
            ]
            positions_in_range = {position for position, _ in peaks_in_range}

            # Prominences of those same peaks, in the same order.
            prominences = [
                prominence
                for position, prominence in enumerate(h5obj['prominences'])
                if position in positions_in_range
            ]
            if not prominences:
                continue

            strongest = np.argmax(prominences)
            most_prominent_peaks[name] = unit_conv(peaks_in_range[strongest][1])

    daterange_length = (range_end - range_start).days

    return most_prominent_peaks, group_date_range, daterange_length

################################################################################
# Read in peaks
################################################################################

most_prominent_peaks, group_date_range, daterange_length = group_peaks_and_daterange(peak_analysis_file, group_num)


################################################################################
# Determine if ABM df has already been processed.
################################################################################

# Rebuild the processed dataframe when asked to overwrite or when no cached copy
# exists; otherwise load the cached copy if read_in is set.
abm_df_is_cached = os.path.isfile(abm_processed_df_savepath)

if overwrite or not abm_df_is_cached:

    print('overwriting or writing for the first time')

    # read df raw for ABM
    stats_df_save_dir = '/home/hubert/DPhil_Studies/2021-04_Study_A_Diffusion/data/06_reporting/'
    df_filename = os.path.join(stats_df_save_dir, f'ABM_raw_df_group_{group_num}.obj')
    stats_filename = os.path.join(stats_df_save_dir, f'ABM_stats_df_group_{group_num}.obj')

    # Fail here with a clear message rather than with a NameError further down
    # when one of the input pickles is missing.
    for required_path in (df_filename, stats_filename):
        if not os.path.isfile(required_path):
            raise FileNotFoundError(f'Required ABM input not found: {required_path}')

    # NOTE: pickle.load executes arbitrary code; only load files you produced.
    print('reading in df')
    with open(df_filename, 'rb') as f:
        df = pickle.load(f)
    print('reading in stats_df')
    with open(stats_filename, 'rb') as f:
        stats_df = pickle.load(f)

    print('N.B. users are not included in stats df because in creating the activity counts users were split into before and after peak interactions')

    print(f'Length of df: {len(df)}')
    unique_author_stats_df_count = len(stats_df['author_id'].unique())
    print(f'Number of unique authors in stats df: {unique_author_stats_df_count}')
    unique_author_df_count = len(df['author_id'].unique())
    print(f'Number of unique authors in df: {unique_author_df_count}')

    # generate ht column: the last 'vocab:#...' column (other than #timesup)
    # flagged 1 on the row; rows with no flagged column default to 'metoo'.
    df_colnames = df.columns
    vocab_colnames = [i for i in df_colnames if i.startswith('vocab')][::-1]
    def process_row_ht(row):
        for col in vocab_colnames:
            if col == 'vocab:#timesup':
                continue
            if row[col] == 1:
                return col.split('#')[-1]
        return None

    df['ht'] = df.apply(process_row_ht, axis=1)
    df['ht'] = df['ht'].fillna('metoo')

    df = df.merge(stats_df, on=['author_id','ht'], how='right')

    # incorporate primary ht
    with open('/home/hubert/DPhil_Studies/2021-04_Study_A_Diffusion/data/03_processed/primary_ht_global.obj', 'rb') as f:
        user_order, res = pickle.load(f)

    # Position of each user's first occurrence in user_order, so each lookup is
    # a dict access instead of a linear scan.
    user_position = {}
    for position, known_user in enumerate(user_order):
        user_position.setdefault(known_user, position)

    unknown_count = 0
    def process_primary_res(author_id):
        """Return the hashtag with the largest value in the user's row of res.

        Users absent from user_order are counted and default to 'metoo'.
        """
        global unknown_count
        position = user_position.get(author_id)
        if position is None:
            unknown_count += 1
            return 'metoo'
        return search_hashtags[np.argmax(res[position,:])]


    df['primary_ht'] = df['author_id'].map(df.groupby('author_id').apply(lambda x: process_primary_res(x.name)))
    print(f'Number of unknown primary hashtags for users: {unknown_count}')

    # Replace missing interaction lists with empty lists. Assign the whole
    # column: the chained form df[col].loc[mask] = ... may write to a temporary
    # copy and leave df unchanged.
    def _empty_if_missing(users):
        if isinstance(users, (list, tuple, np.ndarray)):
            return users
        return [] if pd.isnull(users) else users

    df['interacted_users'] = df['interacted_users'].apply(_empty_if_missing)

    with open(abm_processed_df_savepath, 'wb') as f:
        pickle.dump(df, f)


elif read_in:
    print('reading in')
    with open(abm_processed_df_savepath, 'rb') as f:
        df = pickle.load(f)

else:
    # Without this, df would be undefined and the cell would fail below with a
    # NameError.
    raise RuntimeError('Processed ABM df exists but read_in is False; set read_in or overwrite to True.')

################################################################################
# Generate the per-author lookup frame (first row per author), used when
# creating agents
################################################################################

temp_df = df.groupby('author_id').first()

reading in


In [3]:
# archive:

# df[['text','lang']]

#### Language detection for individual tweets - possibly something to pass to ARC.
# from langdetect import detect
# import langdetect

# def proper_detect(text):
#     try:
#         return detect(text)
#     except:
#         return None

# df['text'].apply(proper_detect)

# 1. Make the user network

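The user network used in this notebook is built by `generate_network` below as a directed graph from observed interactions and collected follow lists. For reference, a synthetic alternative is to grow a graph using the Barabási-Albert preferential attachment model.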

A graph of `n` nodes is grown by attaching new nodes each with `m` edges that are preferentially attached to existing nodes with high degree.

https://networkx.org/documentation/networkx-1.9.1/reference/generated/networkx.generators.random_graphs.barabasi_albert_graph.html

In [4]:
df.head().columns

Index(['tweet_id', 'author_id', 'tweet_lang', 'text', 'likes', 'created_at',
       'in_reply_to', 'mentions', 'quotes', 'replies', 'contains_hashtags',
       'quoted_user_id', 'internal', 'vocab:#metoo', 'vocab:#balancetonporc',
       'vocab:#moiaussi', 'vocab:#نه_یعنی_نه', 'vocab:#米兔', 'vocab:#我也是',
       'vocab:#gamani', 'vocab:#tôicũngvậy', 'vocab:#私も', 'vocab:#watashimo',
       'vocab:#나도', 'vocab:#나도당했다', 'vocab:#גםאנחנו', 'vocab:#ятоже',
       'vocab:#ricebunny', 'vocab:#enazeda', 'vocab:#anakaman',
       'vocab:#yotambien', 'vocab:#sendeanlat', 'vocab:#kutoo',
       'vocab:#withyou', 'vocab:#wetoo', 'vocab:#cuentalo',
       'vocab:#quellavoltache', 'vocab:#niunamenos', 'vocab:#woyeshi',
       'vocab:#myharveyweinstein', 'vocab:#noustoutes',
       'vocab:#stilleforopptak', 'vocab:#nårdansenstopper',
       'vocab:#nårmusikkenstilner', 'vocab:#memyös', 'vocab:#timesup',
       'vocab:#niere', 'vocab:#jotambe', 'author_total_hashtags', 'peak_kutoo',
       'peak_metoo', 

In [1]:
def generate_network(df, follows_dir = None):
    """Build the directed user graph from interactions and follow lists.

    Each unique `author_id` in `df` becomes a node carrying the per-author mode
    of several columns (gender, age, int_pre_peak, act_pre_peak,
    norm_act_pre_peak, org, lang, primary_ht). For every author, an edge is
    added to each user found in the author's `interacted_users` lists or in the
    file `following_<author_id>.txt` under `follows_dir`, restricted to users
    that are themselves authors in `df`. Each edge carries `ht`, the modal
    hashtag of the target user.

    Args:
        df: dataframe with an `author_id` column, an `interacted_users` column
            and the attribute columns listed above.
        follows_dir: directory containing the `following_<author_id>.txt`
            files. Required despite the `None` default.

    Returns:
        The populated `networkx.DiGraph`.

    Raises:
        AssertionError: if `follows_dir` is missing or not a directory.
    """

    # construct graph
    G = nx.DiGraph()

    # add nodes to graph
    unique_users = df['author_id'].unique()

    # Per-author mode of each attribute. When several values tie for the mode
    # the aggregated cell holds all of them rather than a single value.
    filtered_ht = df.groupby('author_id')[['ht','gender','age','int_pre_peak','act_pre_peak','norm_act_pre_peak','org','lang', 'primary_ht']].agg(pd.Series.mode)

    # sanity check: followers data is collected
    assert follows_dir is not None and os.path.isdir(follows_dir), f'follows_dir is not a directory: {follows_dir}'

    # Group once up front instead of filtering the whole frame for every user.
    interactions_by_user = df.groupby('author_id')['interacted_users']

    successful_follows=0
    # extract users whom a user follows and following
    for user_id in tqdm.tqdm(unique_users):

        # add user to graph as a node with attributes.
        user_attributes = filtered_ht.loc[user_id]
        G.add_node(
            user_id,
            gender       = user_attributes['gender'],
            age          = user_attributes['age'],
            int_pre_peak = user_attributes['int_pre_peak'],
            act_pre_peak = user_attributes['act_pre_peak'],
            norm_act_pre_peak = user_attributes['norm_act_pre_peak'],
            org               = user_attributes['org'],
            lang              = user_attributes['lang'],
            primary_ht        = user_attributes['primary_ht']
        )

        follows_filepath = os.path.join(follows_dir, f'following_{user_id}.txt')

        # Summing the per-row lists concatenates them.
        try:
            total_edges_to_add = interactions_by_user.get_group(user_id).sum()
        except Exception:
            # Best effort: report the user and carry on with no interaction
            # edges, rather than reusing the previous user's edges.
            print(user_id)
            total_edges_to_add = np.array([])
        # An empty sum yields the integer 0 instead of a list.
        if isinstance(total_edges_to_add, int):
            total_edges_to_add = np.array([])
        total_edges_to_add = np.intersect1d(total_edges_to_add, unique_users)

        if os.path.isfile(follows_filepath):
            try:
                # NOTE(review): read_table treats the first line as a header by
                # default, so a headerless file would lose its first id --
                # confirm against the file format.
                edges_to = pd.read_table(follows_filepath).values.flatten().astype(str)
                successful_follows+=1
                new_total_edges_to_add = np.union1d(total_edges_to_add, edges_to)
                new_total_edges_to_add = np.intersect1d(new_total_edges_to_add, unique_users)
                length_diff = len(new_total_edges_to_add) - len(total_edges_to_add)
                assert length_diff >= 0
                total_edges_to_add = new_total_edges_to_add
            except pd.errors.EmptyDataError:
                # An empty follows file contributes no edges.
                pass

        for interacted_user in total_edges_to_add:
            ht = filtered_ht.loc[interacted_user]['ht']
            G.add_edge(user_id, interacted_user, ht=ht)

    print(f'Total successfully read follows: {successful_follows}')

    return G


# Build and cache the graph when no pickled copy exists; otherwise reuse the
# pickled one.
# NOTE: pickle.load executes arbitrary code; only load files you produced.
if not os.path.isfile(graph_savepath):
    print('generating network')
    G = generate_network(df, follows_dir)
    with open(graph_savepath, 'wb') as f:
        pickle.dump(G,f)
else:
    print('loading in')
    with open(graph_savepath, 'rb') as f:
        G = pickle.load(f)

NameError: name 'graph_savepath' is not defined

In [6]:
# df[df['author_id']=='16996244']['interacted_users']

In [7]:
# positions = nx.spring_layout(G)
# nx.draw(G, node_size=25, pos=positions)

In [8]:
# degrees        = dict(G.degree()).values()
# sorted_degrees = sorted(degrees)[::-1]

# fig, ax = plt.subplots(figsize=(20,5))

# ax.bar(x=range(len(sorted_degrees)), height=sorted_degrees)

# plt.xlabel('Nodes')
# plt.ylabel('Node degree')
# plt.xlim(-5,len(degrees)+5)

# plt.show()

## Get statistics from data

# 2. Make the agents

In [9]:
class Agent(object):
    """A single user in the agent-based model.

    An agent records which hashtags it supports (`supporting_metoo_dict`, one
    integer count per search hashtag), how often it has interacted with each
    other agent (`interaction_counter`), and the history of its support
    (`support_tracker`, one row per hashtag and one column per recorded step).
    """

    # initialise internal variables
    def __init__(self, ID, df, search_hashtags):
        """Create an agent.

        Args:
            ID: identifier of the agent; must be a label in `df`'s index.
            df: dataframe indexed by agent ID with a 'primary_ht' column.
            search_hashtags: hashtags the agent can support; their order fixes
                the row order of the tracker arrays.
        """
        self.ID = ID
        self.supporting_metoo = False    # initial assumption.
        self.supporting_metoo_dict = {i:0 for i in search_hashtags}
        self.interacts_with   = []       # I wrote this to represent a symmetric interaction, rather than following.
        self.forget_all_interactions()
        self.primary_ht = df.loc[ID,'primary_ht']

        # One row per tracked hashtag. Sized from the hashtags actually given
        # rather than a fixed 35, so update_tracker's hstack always lines up.
        n_hashtags = len(self.supporting_metoo_dict)
        self.support_tracker = np.zeros(shape=(n_hashtags,1))
        self.activity_tracker = np.zeros(shape=(n_hashtags,1))
        self.individual_propensity = np.zeros(shape=(n_hashtags,1))

        self.experimentation_count = 0
        # Defined up front so maybe_join (model 5) can read it even before the
        # first call to interact.
        self.experimentation_success = False

    def update_tracker(self):
        """Append the current per-hashtag support as a new tracker column."""
        support_update_array = np.array(list(self.supporting_metoo_dict.values())).reshape(-1,1)
        self.support_tracker = np.hstack((self.support_tracker, support_update_array))

    def simulate(self, search_hashtag_propensity):
        """Draw a Bernoulli sample for every cell of the support tracker.

        For hashtag propensity p and tracked support s, the success
        probability is (1 - p) ** s. The probabilities are stored in
        `self.probability_matrix` and the draws in `self.simulated`.

        Args:
            search_hashtag_propensity: dict mapping every hashtag in
                `supporting_metoo_dict` to its propensity p.
        """
        keys = list(self.supporting_metoo_dict.keys())
        propensities = np.array([search_hashtag_propensity[i] for i in keys])

        self.probability_matrix = np.power(1-propensities.reshape(-1,1),self.support_tracker)

        self.simulated = np.random.binomial(1, self.probability_matrix)

    def interact(self, other, experimentation_success_chance):
        """Record one interaction with `other` and draw an experimentation trial.

        This step is asymmetric: 'self' keeps track, but 'other' does not.

        Args:
            other: the agent interacted with.
            experimentation_success_chance: probability that the trial
                succeeds.
        """
        # interaction_counter is a defaultdict(int), so unseen IDs start at 0.
        self.interaction_counter[other.ID] += 1

        # For later models implementing likes
        experimentation_trial = np.random.uniform()
        self.experimentation_success = bool(experimentation_trial <= experimentation_success_chance)
        self.experimentation_count += self.experimentation_success

    # How do they behave?
    # How do agents change?
    def maybe_join(self, other, filtered_ht_dict, interact_threshold = 1, model_num = None, verbose=False):
        """Possibly adopt support from `other`, according to the chosen model.

        All thresholds are strict (`>`).

        Args:
            other: the agent that may influence this one.
            filtered_ht_dict: accepted for interface compatibility; not used.
            interact_threshold: interaction count that must be exceeded.
            model_num: which model variant to apply (1-6). `None` does
                nothing.
            verbose: print a message when the agent is influenced.
        """

        # Model num. This is just for ease of reproducibility.

        if model_num == 1:

            # Model 1: simplest model. A non-supporter adopts support after
            # enough interactions with a supporter.
            if self.supporting_metoo == False and \
                other.supporting_metoo == True and \
                self.interaction_counter[other.ID] > interact_threshold:

                self.supporting_metoo = True

                if verbose:
                    print(f'Agent {self.ID} has spoken a lot to Agent {other.ID} and now supports {other.primary_ht}')

        elif model_num == 2:

            # Model 2:
            # still have singular 'metoo' supporting
            # ADD different primary language requirement.
            if  self.supporting_metoo == False and \
                other.supporting_metoo == True and \
                other.primary_ht != self.primary_ht and \
                self.interaction_counter[other.ID] > interact_threshold:

                self.supporting_metoo = True

                if verbose:
                    print(f'Agent {self.ID} has spoken a lot to Agent {other.ID} and now supports {other.primary_ht}')

        elif model_num == 3:

            # Model 3:
            # still have singular 'metoo' supporting
            # and different language requirement.
            # ADD minimum reciprocal interaction
            if  self.supporting_metoo == False and \
                other.supporting_metoo == True and \
                other.primary_ht != self.primary_ht and \
                self.interaction_counter[other.ID] > interact_threshold and \
                other.interaction_counter[self.ID] > interact_threshold:

                self.supporting_metoo = True

                if verbose:
                    print(f'Agent {self.ID} has spoken a lot to Agent {other.ID} and now supports {other.primary_ht}')

        elif model_num == 4:

            # Model 4:
            # ADD metoodict
            # and different language requirement.
            # and minimum reciprocal interaction.
            #
            # Now there is the possibility of those who are 'supporting metoo' already in one language to be influenced to support another language.
            #
            # This model only allows for one user to influence another on their primary hashtag.
            if  (self.interaction_counter[other.ID] > interact_threshold) and \
                (other.primary_ht != self.primary_ht) and \
                (self.supporting_metoo_dict[other.primary_ht] == 0) and \
                (other.interaction_counter[self.ID] > interact_threshold):

                self.supporting_metoo_dict[other.primary_ht] += 1

                if verbose:
                    print(f'Agent {self.ID} has spoken a lot to Agent {other.ID} and now supports {other.primary_ht}')

        elif model_num == 5:

            # Model 5:
            # use metoodict
            # and different language requirement.
            # and minimum reciprocal interaction.
            #
            # ADD experimentation success: a successful trial substitutes for
            # the reciprocal-interaction requirement.
            #
            # This model only allows for one user to influence another on their primary hashtag.
            if  (self.interaction_counter[other.ID] > interact_threshold) and \
                (other.primary_ht != self.primary_ht) and \
                (self.supporting_metoo_dict[other.primary_ht] == 0) and \
                (other.interaction_counter[self.ID] > interact_threshold or self.experimentation_success):

                self.supporting_metoo_dict[other.primary_ht] += 1

                if verbose:
                    print(f'Agent {self.ID} has spoken a lot to Agent {other.ID} and now supports {other.primary_ht}')

        elif model_num == 6:

            # Model 6:
            # use metoodict
            # and different language requirement.
            # and minimum reciprocal interaction.
            #
            # ADD ability to influence other users within your community too.
            if  (self.interaction_counter[other.ID] > interact_threshold) and \
                (other.primary_ht != self.primary_ht) and \
                (self.supporting_metoo_dict[other.primary_ht] == 0) and \
                (other.interaction_counter[self.ID] > interact_threshold):

                self.supporting_metoo_dict[other.primary_ht] += 1

                if verbose:
                    print(f'Agent {self.ID} has spoken a lot to Agent {other.ID} and now supports {other.primary_ht}')

            elif (self.interaction_counter[other.ID] > interact_threshold) and \
                 (other.primary_ht == self.primary_ht) and \
                 (other.interaction_counter[self.ID] > interact_threshold):

                # Hashtags the other agent supports, to be sampled in
                # proportion to that agent's support counts.
                samplelist = [k for k,v in other.supporting_metoo_dict.items() if v>0]

                if samplelist:
                    # select a random value within
                    sampled_ht_for_influence = random.choices(
                        samplelist,
                        [v for _,v in other.supporting_metoo_dict.items() if v>0],
                        k=1
                    )

                    self.supporting_metoo_dict[sampled_ht_for_influence[0]] += 1

                if verbose:
                    print(f'Agent {self.ID} has influenced someone of their own primary ht community.')

        elif model_num is None:
            pass

    def forget_all_interactions(self):
        """Reset the per-agent interaction counts to an empty defaultdict(int)."""
        self.interaction_counter = defaultdict(int)

    def forget_support_metoo(self):
        """Clear all support while keeping the agent's own set of hashtags.

        The keys are taken from the existing dict rather than from a
        module-level hashtag list, so the reset always matches the hashtags
        the agent was created with. The tracker arrays are not touched.
        """
        self.supporting_metoo = False
        self.supporting_metoo_dict = {i:0 for i in self.supporting_metoo_dict}

In [10]:
# Assemble network of agents
# agents = { ID: Agent(ID, df) for ID in df['author_id'].unique() }
# this is extremely slow
# %load_ext line_profiler

# temp_groupby_for_agent_creation
# temp_df = df.groupby('author_id').first()

def produce_agents():
    """Create one Agent per unique author in `df`, keyed by author id.

    Reads the module-level `df`, `temp_df` and `search_hashtags`.
    """
    author_ids = list(df['author_id'].unique())
    return {
        user: Agent(user, temp_df, search_hashtags)
        for user in tqdm.tqdm(author_ids)
    }

# %lprun -f Agent.__init__ produce_agents()

agents_overwrite = True
agents_read_in = True
agents_savepath = f'/home/hubert/DPhil_Studies/2021-04_Study_A_Diffusion/data/06_reporting/ABM_agents_group_{group_num}.obj'

# Rebuild (and re-pickle) the agents when asked to overwrite or when no pickle
# exists yet; otherwise load the pickle if reading in is enabled.
_agents_are_cached = os.path.isfile(agents_savepath)

if agents_overwrite or not _agents_are_cached:
    if _agents_are_cached:
        print('File exists and overwriting')
    else:
        print('producing agents for the first time')
    agents = produce_agents()
    with open(agents_savepath, 'wb') as f:
        pickle.dump(agents, f)
elif agents_read_in:
    print('reading in')
    with open(agents_savepath, 'rb') as f:
        agents = pickle.load(f)

print('creating agents complete')

File exists and overwriting


100%|██████████| 12120/12120 [00:00<00:00, 53721.42it/s]


creating agents complete


In [11]:
for i, e in enumerate(list(agents[list(agents.keys())[0]].supporting_metoo_dict.keys())):
    assert e == search_hashtags[i]

In [12]:
# Reset every agent, including its neighbour list, so that re-running this cell
# (or starting from previously wired agents) does not add each neighbour twice.
for user_id, agent in agents.items():
    agent.forget_all_interactions()
    agent.forget_support_metoo()
    agent.interacts_with = []

# Each directed edge (i, j) in the graph makes j a neighbour of i.
for edge in G.edges():
    i,j = edge
    agents[i].interacts_with.append(j)

In [13]:
print( 'For instance, agent 12 interacts with', agents[list(agents.keys())[11]].interacts_with )

For instance, agent 12 interacts with ['2188044039', '266215308', '3023497329', '473782165', '4897361049', '80820758', '815500888634785792', '89784381', '932933255258615809', '947601090']


# 2.5: Intermediate checks with likes std

In [14]:
most_prominent_peaks

{'metoo': datetime.datetime(2018, 12, 12, 0, 0),
 'moiaussi': datetime.datetime(2018, 12, 5, 0, 0),
 'niunamenos': datetime.datetime(2018, 11, 25, 0, 0),
 'noustoutes': datetime.datetime(2018, 11, 24, 0, 0),
 'wetoo': datetime.datetime(2018, 12, 12, 0, 0),
 'ятоже': datetime.datetime(2018, 11, 29, 0, 0),
 '米兔': datetime.datetime(2018, 11, 26, 0, 0)}

In [15]:
# check rt

# df['likes_deviation'] = df.groupby('author_id')['likes'].transform(lambda x: abs(x - x.mean()) / x.std())
# df['likes_std_2'] = df['likes_deviation'] > 2

likes_df_temp = df.sort_values(by='created_at', ascending=True).groupby('author_id')['likes_std_2'].sum()

# likes_df_temp_moiaussi = 


In [None]:
likes_df_temp

In [None]:
# intermediate: check correlation between likes std and activity tout court:

# goal: correlation between activity in x days after first likes std above certain threshold

# For every agent, sum the activity on each peaked hashtag from the day of that
# hashtag's most prominent peak to the end of the group's date range.
act_val = {}

group_range_end = datetime.datetime.strptime(group_date_range.end, '%Y-%m-%d')

# Open the activity file once instead of once per user.
with h5py.File(activity_file, 'r') as f:
    group_activity = f[f'group_{group_num}']

    # Iterate over the agents' user ids; df.itertuples() yields whole rows and
    # cannot be unpacked into two names.
    for user_id in agents:
        # obtain user activity
        user_dataset = group_activity[user_id]['hashtagged']
        activity = user_dataset[:]
        feature_order = user_dataset.attrs['feature_order'].split(';')

        act_val[user_id] = {}
        for hashtag_in_period in most_prominent_peaks:
            hashtag_in_period_index = feature_order.index(hashtag_in_period)

            # number of days between the hashtag's detected peak and the end of the date range.
            peak_index_index = (group_range_end - most_prominent_peaks[hashtag_in_period]).days

            act_val[user_id][hashtag_in_period_index]= np.sum(activity[hashtag_in_period_index,-peak_index_index-1:])

act_val = pd.DataFrame.from_dict(act_val, orient='index').reset_index()
act_val.columns = ['user_id'] + list(most_prominent_peaks.keys())

In [None]:
act_val['sum'] = act_val.iloc[:,1:].sum(axis=1)

In [None]:
act_val_combined = act_val.merge(likes_df_temp, left_on='user_id', right_index=True)

In [None]:
act_val_combined['likes_std_2'].astype('float')

In [None]:
act_val_combined = act_val_combined[act_val_combined['sum']<1000]

In [None]:
act_val_combined['likes_std_2'].astype('float').corr(act_val_combined['sum'])

In [None]:
np.log(act_val_combined['likes_std_2'].astype('float')+1)

In [None]:
import matplotlib.pyplot as plt

plt.scatter(np.log10(act_val_combined['likes_std_2'].astype('float')+1), np.log10(act_val_combined['sum']+1))

# 3. Run the model

First model:

In [None]:
#model parameters:

# Parameters for one model run.
# N.B. the thresholds are compared with > (strictly greater), not >=.
params = {
    # interaction count that must be exceeded before an agent can be influenced (passed to Agent.maybe_join)
    'interact_threshold': 1,
    # pre-peak activity sum that must be exceeded for an agent to start as a supporter (used by reset_abm)
    'initial_activity_threshold': 2,
    # number of days subtracted from each hashtag's peak offset when sizing the initial window (used by reset_abm)
    'peak_delta_init': 7,
    # which branch of Agent.maybe_join to apply
    'model_num': 5,
    # an interaction happens when a uniform draw is <= interact_prob * interact_prob_multiplier ** (prior interaction count)
    'interact_prob': 0.7,
    'interact_prob_multiplier': 1.1,
    # success chance of the experimentation trial drawn in Agent.interact
    'experimentation_chance': 0.1,
}


search_hashtag_propensity = {
    'metoo': 0.1,
    'balancetonporc': 0.1,
    'moiaussi': 0.1,
    'نه_یعنی_نه': 0.1,
    '米兔': 0.1,
    '我也是': 0.1,
    'gamani': 0.1,
    'tôicũngvậy': 0.1,
    '私も': 0.1,
    'watashimo': 0.1,
    '나도': 0.1,
    '나도당했다': 0.1,
    'גםאנחנו': 0.1,
    'ятоже': 0.1,
    'ricebunny': 0.1,
    'enazeda': 0.1,
    'anakaman': 0.1,
    'yotambien': 0.1,
    'sendeanlat': 0.1,
    'kutoo': 0.1,
    'withyou': 0.1,
    'wetoo': 0.1,
    'cuentalo': 0.1,
    'quellavoltache': 0.1,
    'niunamenos': 0.1,
    'woyeshi': 0.1,
    'myharveyweinstein': 0.1,
    'noustoutes': 0.1,
    'stilleforopptak': 0.1,
    'nårdansenstopper': 0.1,
    'nårmusikkenstilner': 0.1,
    'memyös': 0.1,
    'timesup': 0.1,
    'niere': 0.1,
    'jotambe': 0.1
}

In [12]:
with h5py.File(activity_file, 'r') as f:
    activity = f[f'group_{group_num}']
    print(activity)


<HDF5 group "/group_2" (12586 members)>


In [None]:
def reset_abm(agents, initial_activity_threshold, search_hashtag_propensity, peak_delta_init = 10):
    """Reset all agents and seed the initial supporters from pre-peak activity.

    Every agent forgets its interactions and support and receives
    `search_hashtag_propensity` as `propensity_params`. Then, for every hashtag
    with a peak in the current period, an agent whose summed activity on that
    hashtag in the initial window exceeds `initial_activity_threshold` is
    marked as supporting it.

    Reads the module-level `activity_file`, `group_num`,
    `most_prominent_peaks` and `group_date_range`.

    Args:
        agents: dict mapping user id to Agent.
        initial_activity_threshold: activity sum that must be exceeded (`>`).
        search_hashtag_propensity: dict of per-hashtag propensities.
        peak_delta_init: days before each hashtag's peak at which the initial
            window ends.

    Returns:
        A dataframe with a 'user_id' column and one column per peaked hashtag
        holding the summed activity in the initial window.
    """

    # First, reset everyone's memory of their interactions
    # NOTE(review): the agents' tracker arrays are not reset here, so repeated
    # runs keep appending to them -- confirm whether that is intended.
    for user_id, agent in agents.items():
        agent.forget_all_interactions()
        agent.forget_support_metoo()
        agent.propensity_params = search_hashtag_propensity

    # The initial window of each hashtag runs from the start of the date range
    # up to `peak_delta_init` days before its peak (at least one day long). It
    # is the same for every agent, so compute it once.
    range_start = datetime.datetime.strptime(group_date_range.start, '%Y-%m-%d')
    window_end = {}
    for hashtag_in_period, peak_date in most_prominent_peaks.items():
        offset_index = (peak_date - range_start).days
        offset_index -= peak_delta_init
        window_end[hashtag_in_period] = max(0,offset_index)+1

    # Second step: collect every agent's activity in the initial window of each
    # peaked hashtag. The file is opened once rather than once per agent.
    pre_val = {}
    with h5py.File(activity_file, 'r') as f:
        group_activity = f[f'group_{group_num}']

        for user_id in agents:
            # obtain user activity
            user_dataset = group_activity[user_id]['hashtagged']
            activity = user_dataset[:]
            feature_order = user_dataset.attrs['feature_order'].split(';')

            pre_val[user_id] = {}

            # obtain values for the hashtags that have peaks in this time period
            for hashtag_in_period, offset_index in window_end.items():
                hashtag_in_period_index = feature_order.index(hashtag_in_period)
                pre_val[user_id][hashtag_in_period_index]= np.sum(activity[hashtag_in_period_index,:offset_index])

    pre_val = pd.DataFrame.from_dict(pre_val, orient='index').reset_index()
    pre_val.columns = ['user_id'] + list(most_prominent_peaks.keys())

    # Activate everyone above the activity threshold, per hashtag.
    init_count = set()
    for _, row in pre_val.iterrows():
        for hashtag_in_period in most_prominent_peaks:
            if row[hashtag_in_period] > initial_activity_threshold:
                init_count.add(row['user_id'])
                agent_in_question = agents[row['user_id']]
                agent_in_question.supporting_metoo=True
                # Support is tracked as an integer count elsewhere, so seed it
                # with 1 rather than the boolean True.
                agent_in_question.supporting_metoo_dict[hashtag_in_period] = 1

    print(f'Total initally set to support: {len(init_count)}')

    return pre_val


In [85]:
pre_val = reset_abm(agents,
    initial_activity_threshold=params['initial_activity_threshold'],
    search_hashtag_propensity=search_hashtag_propensity,
    peak_delta_init=params['peak_delta_init']
)

Total initally set to support: 337


In [86]:
pre_val.iloc[:,1:].sum(axis=0)

metoo         4862.0
moiaussi        69.0
niunamenos      42.0
noustoutes     177.0
wetoo          287.0
ятоже            0.0
米兔               0.0
dtype: float64

In [87]:
def run_model(
        agents,
        params,
        verbose = False
    ):
    """Run the daily interaction loop of the ABM over all agents.

    For each of the `daterange_length` days (module-level value), every agent
    draws one random partner from its `interacts_with` list, possibly interacts
    with that partner, and then has `update_tracker()` called once.

    Args:
        agents: mapping of agent ID -> agent object. Mutated in place.
        params: dict read for 'interact_prob', 'interact_prob_multiplier',
            'experimentation_chance', 'interact_threshold' and 'model_num'.
        verbose: when True, print a line per day and pass the flag on to
            `maybe_join`.

    Returns:
        The same `agents` mapping that was passed in.

    Note:
        Also reads the module-level `filtered_ht_dict`, which must be defined
        before this function is called.
    """
    for day in tqdm.tqdm(range(daterange_length)):
        if verbose:
            print(f'Starting interactions on day {day+1}')

        for agent in agents.values():
            try:
                # Draw the partner for today's potential interaction.
                partner = agents[np.random.choice(agent.interacts_with)]

                # The chance of interacting is the base probability scaled by the
                # multiplier raised to the number of interactions already
                # recorded with this partner.
                draw = np.random.uniform()
                chance = (params['interact_prob'])*params['interact_prob_multiplier']**(agent.interaction_counter[partner.ID])

                if draw <= chance:
                    agent.interact(partner, params['experimentation_chance'])

                    # After interacting, let the agent decide whether to join in.
                    agent.maybe_join(
                        partner,
                        filtered_ht_dict,
                        params['interact_threshold'],
                        model_num = params['model_num'],
                        verbose=verbose)

                agent.update_tracker()

            except ValueError:
                # NOTE(review): np.random.choice raises ValueError for an empty
                # `interacts_with`, which is presumably the case this handler is
                # for; it also catches any ValueError raised by interact /
                # maybe_join. Either way the tracker is still advanced.
                agent.update_tracker()

    return agents


# Most frequent hashtag per author. When several hashtags tie, Series.mode yields
# an array; as an approximation only its first entry is kept.
filtered_ht_dict = {
    author_id: (modal_ht[0] if isinstance(modal_ht, np.ndarray) else modal_ht)
    for author_id, modal_ht in df.groupby('author_id')['ht'].agg(pd.Series.mode).to_dict().items()
}

# Interaction phase: run_model returns the same agents mapping it was given.
modelled_agents = run_model(agents, params, verbose=False)

# Simulation phase: every agent is simulated with the hashtag propensities.
print('Simulating...')
for agent in modelled_agents.values():
    agent.simulate(search_hashtag_propensity)

100%|██████████| 42/42 [00:29<00:00,  1.41it/s]


Simulating...


# 3.5 Checking simulation part

In [27]:
# IDs of the agents whose support_tracker has a positive overall sum.
nonzero = [
    agent_id
    for agent_id, agent in modelled_agents.items()
    if agent.support_tracker.sum() > 0
]
counter = len(nonzero)

print(counter)

4516


In [28]:
# Peek at the first agent ID collected in `nonzero`.
nonzero[0]

'1000030383969316865'

In [30]:
# Row-wise totals (sum over axis 1) of one agent's support_tracker.
# The ID is hard-coded and matches the `nonzero[0]` value displayed in the previous cell.
modelled_agents['1000030383969316865'].support_tracker.sum(axis=1)

array([ 0., 16.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0., 35.,  0.,  0.,  0.,  0.,  0.,  0.,  0.])

# 4. Display Results

In [None]:
def print_abm_results(agents_dict, model_num=None):
    """Tabulate every agent's per-hashtag support values and print the totals.

    Args:
        agents_dict: mapping of user ID -> agent. Each agent must expose a
            `supporting_metoo_dict` attribute (a dict keyed by hashtag).
        model_num: unused; kept so existing callers that pass it keep working.

    Returns:
        pandas.DataFrame with one row per agent: an 'index' column holding the
        user ID, followed by one column per key found in the agents' dicts.
    """
    support_by_user = {
        user_id: agent.supporting_metoo_dict
        for user_id, agent in agents_dict.items()
    }

    output_df = pd.DataFrame.from_dict(support_by_user, orient='index').reset_index()

    # Column 0 is the user ID; the remaining columns are summed over agents and
    # only the hashtags with a positive total are printed.
    num_supporting = output_df.iloc[:,1:].sum(axis=0)
    num_supporting = num_supporting[num_supporting>0]

    print(num_supporting)
    return output_df

In [None]:
# Per-agent support table for the modelled agents; the call also prints the per-hashtag totals.
res = print_abm_results(modelled_agents)

In [None]:
# compare with actual activity

# TODO: decide how "active at the end" is defined -- a fixed cut-off (above 10?)
# or some point along the activity distribution.

# The end of the group's date range is identical for every user and hashtag,
# so it is parsed once instead of inside the loops.
group_end_date = datetime.datetime.strptime(group_date_range.end, '%Y-%m-%d')

act_val_by_user = {}

# Open the activity file once for all agents rather than re-opening it per agent.
with h5py.File(activity_file, 'r') as f:
    group_activity = f[f'group_{group_num}']

    for user_id in agents:
        hashtagged = group_activity[user_id]['hashtagged']
        activity = hashtagged[:]
        feature_order = hashtagged.attrs['feature_order'].split(';')

        user_counts = {}
        for hashtag_in_period, peak_date in most_prominent_peaks.items():
            hashtag_in_period_index = feature_order.index(hashtag_in_period)

            # Number of days between the hashtag's detected peak and the end of
            # the date range; the trailing slice below is sized from it.
            days_from_peak_to_end = (group_end_date - peak_date).days

            # Key by hashtag name so the resulting columns are labelled directly
            # instead of being renamed by position afterwards.
            user_counts[hashtag_in_period] = np.sum(
                activity[hashtag_in_period_index, -days_from_peak_to_end-1:]
            )

        act_val_by_user[user_id] = user_counts

# One row per user: 'user_id' followed by one column per peaked hashtag.
act_val = (
    pd.DataFrame.from_dict(act_val_by_user, orient='index')
    .reset_index()
    .rename(columns={'index': 'user_id'})
)


In [None]:
# Per hashtag column: number of users whose act_val entry is above zero (column 0 is user_id).
(act_val.iloc[:,1:]>0).sum(axis=0)

# 5. Visualise

In [None]:
# Positive 'kutoo' values in res, smallest first.
res.loc[res['kutoo'] > 0, 'kutoo'].sort_values()

In [None]:
# Positive 'metoo' values in res, largest first.
res.loc[res['metoo'] > 0, 'metoo'].sort_values(ascending=False)

In [None]:
# Positive 'kutoo' values in res, largest first.
res.loc[res['kutoo'] > 0, 'kutoo'].sort_values(ascending=False)

In [None]:
# Positive 'niunamenos' values in res, largest first.
res.loc[res['niunamenos'] > 0, 'niunamenos'].sort_values(ascending=False)

In [None]:
# Row 19 of one hard-coded agent's support_tracker.
# NOTE(review): 19 is presumably a hashtag's position in search_hashtags
# (the next cell looks up 'kutoo') -- confirm before relying on it.
agents['92263198'].support_tracker[19,:]

In [None]:
# Position of 'kutoo' in the search_hashtags list loaded at the top of the notebook.
search_hashtags.index('kutoo')

In [None]:
# Display the per-agent support table returned by print_abm_results.
res

In [None]:
# For every hashtag with a detected peak, compare the agents with a positive
# value in the ABM output (res) against the users whose observed activity in
# act_val is above 3, and report how many user IDs appear in both.
for ht in most_prominent_peaks:
    abm_res = res.loc[res[ht] > 0, ['index', ht]]
    act_res = act_val.loc[act_val[ht] > 3, ['user_id', ht]]

    overlap = set(abm_res['index']).intersection(act_res['user_id'])

    print(f'For {ht}: overlap = {len(overlap)}, total = {len(act_res)}')

# 6. Graph Attributes

* To help answer questions about whether or not particular users were key brokers of knowledge. cf. Sandra's work?