# 17 NoVax Communities Interactions - Save Data 
We want to analyse the interactions between users in novax communities of different countries, and compare them with the interactions between other users in the same countries.

Here, we collect the data about number of users and number of interactions between users in novax communities and users in rest of the networks for each pair of countries.

We will analyse the probability that two random users in novax communities are connected with respect to two random users in other communities, that is:

$A_{ij}$ = $\frac{\delta^{A}_{ij}}{\delta^{P}_{ij}}$, where

$\delta^{A}_{ij} = \frac{\text{#edges antivax from i to j}}{\text{#pairs of antivax users between i and j}}$

$\delta^{P}_{ij} = \frac{\text{#edges provax from i to j}}{\text{#pairs of provax users between i and j}}$

$\delta^{A}_{ij}$ is the probability that two random novax users from countries i and j are connected.

For each period, we create a dataframe that for each pair of countries having a novax communities store information about number ofedges, number of users, density of retweets (=probability of connection) for users in novax communities and users in other communities.

In [11]:
import pandas as pd
import numpy as np
from glob import glob
from functools import reduce
from itertools import permutations

In [8]:
#data on edges and users are stored in this folder
folder = "/data/public/jlenti/multilang-vax/EuropeAmerica_RTCO"

#list of novax communities
#i transform it into a dataframe with columns (id, country, period, community)
#novax_com = pd.read_csv("/home/jlenti/Files/novax_communities_thres066_0402.csv", index_col = 0).apply(lambda x: pd.Series([x[0]] + x[0].split("_"), index = ["com", "country", "period", "community"]), axis = 1)
novax_com = pd.read_csv("/home/jlenti/Files/novax_communities_9tot_antivax_0902.csv", index_col = 0).apply(lambda x: pd.Series([x[0]] + x[0].split("_"), index = ["com", "country", "period", "community"]), axis = 1)

#list of all countries (size ordered)
countries = ["US", "BR", "AR", "GB", "ES", "MX", "FR", "CA", "TR", "VE", "AU", "CO", "IT", "CL", "DE",
             "PT", "IE", "PY", "EC", "RU", "UY", "NZ", "PL", "NL", "PE", "CU", "PA", "GR"]
#named periods
periods = ["period" + str(u + 1) for u in range(4)]
periods_names = ["pre-COVID", "pre-vax", "vax development", "vax rollout"]
periods_abbr = ["PC", "PV", "VD", "VR"]

In [10]:
cross_countries_interaction = {}
for p in [str(u + 1) for u in range(4)]:
    print("period", p)
    #list of countries with a novax community in the period
    novax_com_countries = list(set(novax_com.query("period == @p")["country"]))
    #cross-country retweets in the period, filtering countries with a novax community 
    cross_RT = pd.read_csv(sorted(glob("/".join([folder, "period" + str(p), "*cross*RT*"])))[0]) \
    .query("(country in @novax_com_countries)&(country_RT in @novax_com_countries)")
    #since I want to measure the density of retweets, I need to know the number of links (retweets) and
    #the number of possible links (all the pairs of nodes)
    #I create an empty list that will be filled one pair of country at a time
    novax_edges, provax_edges, novax_users, provax_users = [], [], [], []
    #for each country I store the list of novax communities in that country/period
    novax_coms_dict = {c: novax_com.query("period == @p").query("country == @c")["community"].tolist() 
                       for c in novax_com_countries}
    #read the dataframes with community assignement, and give "novax" label to users in a novax community
    #for each country with a novax community in that period
    users_dict = {c: pd.read_csv(sorted(glob("/".join([folder, "period" + str(p), c + "*_RT_com*"])))[0]) \
                  .assign(novax = lambda x: x.community.isin([int(u) for u in novax_coms_dict[c]]))
                 for c in novax_com_countries}
    #extract from previous dictionary the list of users in novax community and list of users in other
    #community for each country, and create two new dictionaries
    novax_users_dict = {c: users_dict[c].query("novax")["user"].tolist() for c in novax_com_countries}
    provax_users_dict = {c: users_dict[c].query("~novax")["user"].tolist() for c in novax_com_countries}
    #for each pair of countries, count the number of retweets between users in novax communities and
    #between users in rest of networks, and count the number of potential links between users
    #in novax communities and in users in other communities
    #find all the pairs of countries with permutations()
    for c1, c2 in permutations(novax_com_countries, 2):
        #extract lists of users in novax communities and in other commuinties in the two countries
        novax_u1, novax_u2, provax_u1, provax_u2 = novax_users_dict[c1], novax_users_dict[c2], provax_users_dict[c1], provax_users_dict[c2]
        #count the number of links between users in novax communities, and store them in list
        novax_edges.append([c1, c2, len(cross_RT.query("(user in @novax_u1)&(user_RT in @novax_u2)"))])
        #count the number of links between users in other communities, and store them in list
        provax_edges.append([c1, c2, len(cross_RT.query("(user in @provax_u1)&(user_RT in @provax_u2)"))])
        #count number of users in novax commuinities in the two countries, and store them in list
        novax_users.append([c1, c2, len(novax_u1), len(novax_u2)])
        #count number of users in other commuinities in the two countries, and store them in list
        provax_users.append([c1, c2, len(provax_u1), len(provax_u2)])
        
    cross_countries_interaction["period" + p] = {"novax_edges": novax_edges, "provax_edges": provax_edges, 
                                                 "novax_users": novax_users, "provax_users": provax_users}


period 1
period 2
period 3
period 4


In [16]:
#with reduce() merge two dataframes
novax_countries_retweets_full = {period: reduce(lambda x, y: pd.merge(x, y), 
                                                #merge dataframes with informations on edges and nodes of the pairs of countries
                                                #on the columns ["country", "country_RT"]
                                                [pd.DataFrame(cross_countries_interaction[period]["novax_users"], 
                                                              columns = ["country", "country_RT", "novax_users", "novax_users_RT"])
                                                 .assign(novax_pairs = lambda x: x["novax_users"] * x["novax_users_RT"]), #compute the number of possible pairs of users
                                                 pd.DataFrame(cross_countries_interaction[period]["novax_edges"], 
                                                              columns = ["country", "country_RT", "novax_retweets"]),
                                                 pd.DataFrame(cross_countries_interaction[period]["provax_users"], 
                                                              columns = ["country", "country_RT", "provax_users", "provax_users_RT"])
                                                 .assign(provax_pairs = lambda x: x["provax_users"] * x["provax_users_RT"]), #compute the number of possible pairs of users
                                                 pd.DataFrame(cross_countries_interaction[period]["provax_edges"],
                                                              columns = ["country", "country_RT", "provax_retweets"])
                                                ])
                                 #the density is the ratio between the pairs of users connected and the possible pairs of users,
                                 #that is the probability that two random users are connected
                                 #we compute the density (probability) of retweets both for users in novax communities
                                 #and users in other commuinties
                                 .assign(novax_density = lambda x: x["novax_retweets"] / x["novax_pairs"], 
                                           provax_density = lambda x: x["provax_retweets"] / x["provax_pairs"]
                                          )
                                 #then we compute the density ratio, that is the ratio between the density of retweets
                                 #of users in novax commuinties and te density of retweets in other communities
                                 #this tell us the probability that two random users in novax communities are connected
                                 #wrt the two random users in other communities
                                 .assign(density_ratio = lambda x: x["novax_density"] / x["provax_density"])
                            for period in periods
                           }

In [20]:
novax_countries_retweets_full["period1"]

Unnamed: 0,country,country_RT,novax_users,novax_users_RT,novax_pairs,novax_retweets,provax_users,provax_users_RT,provax_pairs,provax_retweets,novax_density,provax_density,density_ratio
0,IE,AU,243,152,36936,5,2574,4087,10519938,83,0.000135,7.889780e-06,17.157548
1,IE,IT,243,2769,672867,0,2574,2612,6723288,1,0.000000,1.487367e-07,0.000000
2,IE,NZ,243,135,32805,2,2574,1632,4200768,13,0.000061,3.094672e-06,19.700412
3,IE,CA,243,373,90639,13,2574,5854,15068196,162,0.000143,1.075112e-05,13.340573
4,IE,GB,243,1442,350406,35,2574,21828,56185272,499,0.000100,8.881331e-06,11.246528
...,...,...,...,...,...,...,...,...,...,...,...,...,...
85,DE,CA,434,373,161882,3,5017,5854,29369518,53,0.000019,1.804592e-06,10.269366
86,DE,GB,434,1442,625828,1,5017,21828,109511076,145,0.000002,1.324067e-06,1.206799
87,DE,CU,434,78,33852,0,5017,1620,8127540,0,0.000000,0.000000e+00,
88,DE,US,434,6513,2826642,344,5017,7870,39483790,537,0.000122,1.360052e-05,8.948128


In [None]:
#save dataframes

#for period in novax_countries_retweets_full:
#    novax_countries_retweets_full.to_csv("/home/jlenti/Files/novax_provax_countries_retweets_{0}.csv".format(period), index_col = 0)