# 11 Networks Stats
In this notebook, we extract different statistics on the networks we have built, for each country, each period.
They can be topological properties of the networks, or related to behaviors of the users (such as sharing activity), or account status.

In [4]:
from sklearn.metrics.cluster import normalized_mutual_info_score as nmi
from RWC_score import rwc
import pandas as pd
from glob import glob
import numpy as np

In [6]:
#data on edges and users are stored in this folder
folder = "/data/public/jlenti/multilang-vax/EuropeAmerica_RTCO"

#list of novax communities
#i transform it into a dataframe with columns (id, country, period, community)
#novax_com = pd.read_csv("/home/jlenti/Files/novax_communities_thres066_0402.csv", index_col = 0).apply(lambda x: pd.Series([x[0]] + x[0].split("_"), index = ["com", "country", "period", "community"]), axis = 1)
novax_com = pd.read_csv("/home/jlenti/Files/novax_communities_9tot_antivax_0902.csv", index_col = 0).apply(lambda x: pd.Series([x[0]] + x[0].split("_"), index = ["com", "country", "period", "community"]), axis = 1)
novax_com_l = novax_com.drop("com", axis = 1).groupby(["country", "period"]) \
.apply(lambda x: x["community"].tolist()).reset_index().rename(columns = {0: "communities"}).set_index(["country", "period"])

#list of neutral domains (url shorteners, twitter.com, facebook.com)
neutrals = pd.read_csv("/home/jlenti/Files/neutral_domains_1309.txt")["0"].tolist()
#list of low-credible domains
blacklist = pd.read_csv("/home/jlenti/Files/merged_blacklist_1309.txt")["0"].tolist()
blacklist.extend(pd.read_csv( '/home/jlenti/Files/lemonde_blacklist_2709.txt')["domain"].tolist())
blacklist.extend(pd.read_csv('/home/jlenti/Files/greek_blacklist_1009.txt')["0"].tolist())
#domains associated to youtube
youtube_domains = ["youtube.com", "youtu.be"]

#list of all countries (size ordered)
countries = ["US", "BR", "AR", "GB", "ES", "MX", "FR", "CA", "TR", "VE", "AU", "CO", "IT", "CL", "DE",
             "PT", "IE", "PY", "EC", "RU", "UY", "NZ", "PL", "NL", "PE", "CU", "PA", "GR"]
#sorted by language
lang_sort = ["US", "IE", "GB", "CA", "NZ", "AU", "FR", "IT", "PL", "NL", "DE", "RU", "TR", 
             "BR", "PT", "GR", "AR", "ES", "MX","VE", "CO", "CL",
             "PY", "EC", "UY", "PE", "CU", "PA"]
#countries speaking english or italian, the ones with a list of low-credible domains
LC_countries = ["IT", "US", "GB", "AU", "NZ", "IE"]
#countries with a novax community, sorted by language
lang_sort_novax = ["US", "GB", "IE", "AU", "NZ", "CA", "DE", "FR", "GR", "IT",
                   "NL", "PL", "RU",  "BR", "PA", "CU", "ES", "UY"]
#named periods
periods = ["period" + str(u + 1) for u in range(4)]
#dataframe with the account status of the users (found, not found, or suspended)
account_status = pd.concat([pd.read_csv(file) 
                            for file in sorted(glob("/".join([folder, "*", "*status*"])))]) \
.groupby("user_screen_name").tail(1).rename(columns = {"user_screen_name": "user"}).set_index("user")

#count of the labels from first round of labeling for each community
label_count_1 = pd.read_csv("/home/jlenti/Files/communities_labels_round1.csv") \
.assign(country = lambda x: [u[:2] for u in x["com_id"]], 
        period = lambda x: ["period" + u[3] for u in x["com_id"]], 
        community = lambda x: [int(u[-1]) for u in x["com_id"]],
        antivax_prop = lambda x: x["antivax"] / (x["antivax"] + x["provax"]))

In [7]:
#easy way to get data from the folder, just giving as input country, period, and object needed, 
#getting the file names with glob
def read_data(country, period, obj, layer):
    files = sorted(glob("/".join([folder, period, country + "*" + layer + "_" + obj + "*"])))[0]
    return pd.read_csv(files)

### Example

In [8]:
country, period = "IT", "period1"
p = period[-1]

In [13]:
#get data about users, edges
RT_com = read_data(country, period, "com", "RT")
CO_com = read_data(country, period, "com", "CO")
#RT edgelist
RT_edges = read_data(country, period, "ed", "RT")
#user-url dataframe
urls = read_data(country, period, "urls", "")        

In [14]:
#list of users in the RT network
users = RT_com["user"].tolist()
tot_users = len(users)

In [15]:
#average number of retweets per user
activity = RT_edges.groupby("user").sum().mean()["weight"]
#number of (weighted) edges over number of pairs of users. It is the probability 
#that two random users are connected 
density = len(RT_edges.query("(user in @users)&(user_RT in @users)").groupby(["user", "user_RT"]).count()) / \
(len(users) * (len(users) - 1))
        

In [16]:
#average number of urls shared per user
avg_urls = len(urls.query("user in @users")) / len(users)
#average number of youtube videos shared per user
avg_youtube = len(urls.query("(domain in @youtube_domains)&(user in @users)")) / len(users)
#proportion of domains with a low-credible domain
#exclude neutral domains, assign domains to LC blacklist, count domains LC and not LC, divide by total not neutral domains, get only LC = True
LC_prop = (urls.query("domain not in @neutrals").assign(LC = lambda x: x["domain"].isin(blacklist)) \
           .groupby("LC").count()["urls"] / len(urls.query("domain not in @neutrals")))[True]        

In [17]:
#proportion of tweets annotated as 'antivax' at first round of labeling
tot_labels_network = label_count_1.query("(country == @country)&(period == @period)") \
[["antivax", "other", "provax"]].sum()
prop_antivax_labels = (tot_labels_network / tot_labels_network.sum())["antivax"]

In [18]:
#proportion of accounts with status 'not found' or 'suspended'
notfound_users = 1 - account_status.loc[users]["status"].value_counts() \
.apply(lambda x: x / len(account_status.loc[users]))["found"]
#proportion of accounts with status 'supended'
susp_users = account_status.loc[users]["status"].value_counts() \
.apply(lambda x: x / len(account_status.loc[users]))["suspended"]

In [21]:
#compute normalized mutual information between RT and CO layers (-> echo chambers effect)
layers_coms = RT_com.merge(CO_com, on = "user")
nmi_RTCO = nmi(layers_coms["community_x"], layers_coms["community_y"])

In [23]:
print("Is there a novax community in this country/period?", (country, p) in [(u[1]["country"], u[1]["period"]) for u in novax_com.iterrows()])

Is there a novax community in this country/period? True


So, I can compare some stats for users in novax communities and rest of users.

In [27]:
#coms are the novax communities 
coms = [int(u) for u in novax_com_l.loc[(country, p), "communities"]]
RT_com["novax"] = RT_com["community"].isin(coms)
#select novax users and non-novax users
novax_users = RT_com.query("novax").user.tolist()
provax_users = RT_com.query("~novax").user.tolist()

tot_novax, tot_provax = len(novax_users), len(provax_users)
#proportion of antivax users in the network
prop_antivax = len(novax_users) / (len(novax_users) + len(provax_users))            

In [28]:
#average retweets per user
novax_activity = RT_edges.query("user in @novax_users").groupby("user").sum().mean()["weight"]
provax_activity = RT_edges.query("user in @provax_users").groupby("user").sum().mean()["weight"]
#probability that two random (both novax/provax) users are connected
novax_density = len(RT_edges.query("(user in @novax_users)&(user_RT in @novax_users)") \
                    .groupby(["user", "user_RT"]).count()) / \
(len(novax_users) * (len(novax_users) - 1))
provax_density = len(RT_edges.query("(user in @provax_users)&(user_RT in @provax_users)") \
                     .groupby(["user", "user_RT"]).count()) / \
(len(provax_users) * (len(provax_users) - 1))

In [29]:
#keep only users in RT network, novax or provax
urls_novax = urls.merge(RT_com)
#proportion of urls shared by pro/novax user
novax_urls = urls_novax.groupby("novax").count()["user"][True] / len(novax_users)
provax_urls = urls_novax.groupby("novax").count()["user"][False] / len(provax_users)
#proportion of youtube shared by pro/novax user
youtube_domains = ["youtube.com", "youtu.be"]
provax_youtube = urls_novax.query("domain in @youtube_domains").groupby("novax").count()["user"][False] / len(provax_users)
novax_youtube = urls_novax.query("domain in @youtube_domains").groupby("novax").count()["user"][True] / len(novax_users)
#proportion of low-credible domains shared by pro/novax users (if some LC domains have been shared)
LC_prop = urls_novax.query("domain not in @neutrals").assign(LC = lambda x: x["domain"].isin(blacklist)) \
.groupby(["novax", "LC"]).count()["domain"].unstack().apply(lambda x: x / x.sum(), axis = 1)
if True in LC_prop.columns:
    novax_LC_prop = LC_prop.loc[True, True]
    provax_LC_prop = LC_prop.loc[False, True]
else:
    novax_LC_prop, provax_LC_prop = 0, 0.00000000000000001

In [30]:
#proportion of not 'found' accounts (counted as 1 - 'found')
notfound_novax = 1 - account_status.loc[novax_users]["status"].value_counts().apply(lambda x: x / len(account_status.loc[novax_users]))["found"]
notfound_provax = 1 - account_status.loc[provax_users]["status"].value_counts().apply(lambda x: x / len(account_status.loc[provax_users]))["found"]
#proportion of 'suspended accounts'
susp_novax = account_status.loc[novax_users]["status"].value_counts().apply(lambda x: x / len(account_status.loc[novax_users]))["suspended"]
susp_provax = account_status.loc[provax_users]["status"].value_counts().apply(lambda x: x / len(account_status.loc[provax_users]))["suspended"]


In [31]:
#compute rwc novax vs provax
rwc_novax = rwc(RT_edges, RT_com)

In [35]:
#i also define all the ratio betweeen novax stats and rest of network stats
activity_ratio, density_ratio, urls_ratio, youtube_ratio, \
LC_prop_ratio,susp_ratio, notfound_ratio = novax_activity / provax_activity, \
novax_density / provax_density, novax_urls / provax_urls, novax_youtube / provax_youtube, \
novax_LC_prop / provax_LC_prop, susp_novax / susp_provax, notfound_novax / notfound_provax


In [39]:
pd.Series([country, period, tot_users, activity, density, avg_urls, avg_youtube, prop_antivax_labels, nmi_RTCO,
           LC_prop, susp_users, notfound_users, coms, tot_novax, tot_provax, prop_antivax, novax_activity, provax_activity,
           novax_density, provax_density, novax_urls, provax_urls, novax_youtube, provax_youtube, 
           novax_LC_prop, provax_LC_prop, susp_novax, susp_provax, 
           notfound_novax, notfound_provax, activity_ratio, density_ratio, urls_ratio, youtube_ratio,
           LC_prop_ratio, susp_ratio, notfound_ratio, rwc_novax],
          index = ["country", "period", "tot_users", "activity", "density", "avg_urls",
                   "avg_youtube", "prop_antivax_labels", "nmi_RTCO",
                   "LC_prop", "susp_users", "notfound_users", "coms", "tot_novax", 
                   "tot_provax", "prop_antivax", "novax_activity", "provax_activity",
                   "novax_density", "provax_density", "novax_urls", "provax_urls",
                   "novax_youtube", "provax_youtube",
                   "novax_LC_prop", "provax_LC_prop", 
                   "susp_novax", "susp_provax", "notfound_novax", "notfound_provax",
                   "activity_ratio", "density_ratio", "urls_ratio", "youtube_ratio",
                   "LC_prop_ratio", "susp_ratio", "notfound_ratio", "rwc_novax"]
         )

country                                                               IT
period                                                           period1
tot_users                                                           5381
activity                                                        5.876541
density                                                         0.000394
avg_urls                                                        1.131202
avg_youtube                                                     0.136406
prop_antivax_labels                                                0.375
nmi_RTCO                                                        0.357016
LC_prop                LC        False      True
novax               ...
susp_users                                                      0.035495
notfound_users                                                  0.134547
coms                                                                 [1]
tot_novax                                          

### All countries,  periods
I create a dataframe info_df with such information for all countries and periods.

Networks without a novax communities will have None entries for all the features comparing novax-rest of network.

In [43]:
#create a list that will be transformed in a pandas DataFrame
info = []

for country in countries:
    print(country)
    for period in periods:
        print(period)
        #all features are set nan, so that if i skip them i will have a nan entry
        tot_users, activity, density, avg_urls, avg_youtube, nmi_RTCO, prop_antivax_labels, \
        LC_prop, susp_users, notfound_users, coms, tot_novax, tot_provax, prop_antivax, novax_activity, provax_activity, \
        novax_density, provax_density, novax_urls, provax_urls, novax_youtube, provax_youtube, \
        novax_LC_prop, provax_LC_prop, rwc_novax, susp_novax, susp_provax, notfound_novax, notfound_provax, \
        activity_ratio, density_ratio, urls_ratio, youtube_ratio, LC_prop_ratio, susp_ratio, notfound_ratio = [np.nan] * 36
                
        p = period[-1]
        #get data about users, edges
        RT_com = read_data(country, period, "com", "RT")
        CO_com = read_data(country, period, "com", "CO")
        #list of users in the RT network
        users = RT_com["user"].tolist()
        tot_users = len(users)
        #RT edgelist
        RT_edges = read_data(country, period, "ed", "RT")
        #user-url dataframe
        urls = read_data(country, period, "urls", "")
        
        #average number of retweets per user
        activity = RT_edges.groupby("user").sum().mean()["weight"]
        #number of (weighted) edges over number of pairs of users. It is the probability that two random users are connected 
        density = len(RT_edges.query("(user in @users)&(user_RT in @users)").groupby(["user", "user_RT"]).count()) / (len(users) * (len(users) - 1))
        
        #average number of urls shared per user
        avg_urls = len(urls.query("user in @users")) / len(users)
        #average number of youtube videos shared per user
        avg_youtube = len(urls.query("(domain in @youtube_domains)&(user in @users)")) / len(users)
        #proportion of domains with a low-credible domain
        #exclude neutral domains, assign domains to LC blacklist, count domains LC and not LC, divide by total not neutral domains, get only LC = True
        LC_prop = (urls.query("domain not in @neutrals").assign(LC = lambda x: x["domain"].isin(blacklist)).groupby("LC").count()["urls"] / len(urls.query("domain not in @neutrals")))[True]
        #proportion of tweets annotated as 'antivax' at first round of labeling
        tot_labels_network = label_count_1.query("(country == @country)&(period == @period)")[["antivax", "other", "provax"]].sum()
        prop_antivax_labels = (tot_labels_network / tot_labels_network.sum())["antivax"]
        #proportion of accounts with status 'not found' or 'suspended'
        notfound_users = 1 - account_status.loc[users]["status"].value_counts() \
        .apply(lambda x: x / len(account_status.loc[users]))["found"]
        #proportion of accounts with status 'supended'
        susp_users = account_status.loc[users]["status"].value_counts() \
        .apply(lambda x: x / len(account_status.loc[users]))["suspended"]
        
        #compute normalized mutual information between RT and CO layers (-> echo chambers effect)
        layers_coms = RT_com.merge(CO_com, on = "user")
        nmi_RTCO = nmi(layers_coms["community_x"], layers_coms["community_y"])
        
        #now i focus on networks with a novax community
        if (country, p) in [(u[1]["country"], u[1]["period"]) for u in novax_com.iterrows()]:
            #coms are the novax communities 
            coms = [int(u) for u in novax_com_l.loc[(country, p), "communities"]]
            RT_com["novax"] = RT_com["community"].isin(coms)
            #select novax users and non-novax users
            novax_users = RT_com.query("novax").user.tolist()
            provax_users = RT_com.query("~novax").user.tolist()
            
            tot_novax, tot_provax = len(novax_users), len(provax_users)
            #proportion of antivax users in the network
            prop_antivax = len(novax_users) / (len(novax_users) + len(provax_users))
            #average retweets per user
            novax_activity = RT_edges.query("user in @novax_users").groupby("user").sum().mean()["weight"]
            provax_activity = RT_edges.query("user in @provax_users").groupby("user").sum().mean()["weight"]
            #probability that two random (both novax/provax) users are connected
            novax_density = len(RT_edges.query("(user in @novax_users)&(user_RT in @novax_users)").groupby(["user", "user_RT"]).count()) / \
            (len(novax_users) * (len(novax_users) - 1))
            provax_density = len(RT_edges.query("(user in @provax_users)&(user_RT in @provax_users)").groupby(["user", "user_RT"]).count()) / \
            (len(provax_users) * (len(provax_users) - 1))

            #keep only users in RT network, novax or provax
            urls_novax = urls.merge(RT_com)
            #proportion of urls shared by pro/novax user
            novax_urls = urls_novax.groupby("novax").count()["user"][True] / len(novax_users)
            provax_urls = urls_novax.groupby("novax").count()["user"][False] / len(provax_users)
            #proportion of youtube shared by pro/novax user
            youtube_domains = ["youtube.com", "youtu.be"]
            provax_youtube = urls_novax.query("domain in @youtube_domains").groupby("novax").count()["user"][False] / len(provax_users)
            novax_youtube = urls_novax.query("domain in @youtube_domains").groupby("novax").count()["user"][True] / len(novax_users)
            #proportion of low-credible domains shared by pro/novax users
            LC_prop = urls_novax.query("domain not in @neutrals").assign(LC = lambda x: x["domain"].isin(blacklist)).groupby(["novax", "LC"]).count()["domain"].unstack().apply(lambda x: x / x.sum(), axis = 1)
            if True in LC_prop.columns:
                novax_LC_prop = LC_prop.loc[True, True]
                provax_LC_prop = LC_prop.loc[False, True]
            else:
                novax_LC_prop, provax_LC_prop = 0, 0.00000000000000001
            #proportion of not 'found' accounts (counted as 1 - 'found')
            notfound_novax = 1 - account_status.loc[novax_users]["status"].value_counts().apply(lambda x: x / len(account_status.loc[novax_users]))["found"]
            notfound_provax = 1 - account_status.loc[provax_users]["status"].value_counts().apply(lambda x: x / len(account_status.loc[provax_users]))["found"]
            #proportion of 'suspended accounts'
            susp_novax = account_status.loc[novax_users]["status"].value_counts().apply(lambda x: x / len(account_status.loc[novax_users]))["suspended"]
            susp_provax = account_status.loc[provax_users]["status"].value_counts().apply(lambda x: x / len(account_status.loc[provax_users]))["suspended"]

            activity_ratio, density_ratio, urls_ratio, youtube_ratio, \
            LC_prop_ratio,susp_ratio, notfound_ratio = novax_activity / provax_activity, \
            novax_density / provax_density, novax_urls / provax_urls, novax_youtube / provax_youtube, \
            novax_LC_prop / provax_LC_prop, susp_novax / susp_provax, notfound_novax / notfound_provax
            
            #compute rwc novax vs provax
            rwc_novax = rwc(RT_edges, RT_com)

        

        info.append([country, period, tot_users, activity, density, avg_urls, avg_youtube, prop_antivax_labels, nmi_RTCO,
                     LC_prop, susp_users, notfound_users, coms, tot_novax, tot_provax, prop_antivax, novax_activity, provax_activity, 
                     novax_density, provax_density, novax_urls, provax_urls, novax_youtube, provax_youtube, 
                     novax_LC_prop, provax_LC_prop, susp_novax, susp_provax, 
                     notfound_novax, notfound_provax, activity_ratio, density_ratio, urls_ratio, youtube_ratio,
                     LC_prop_ratio, susp_ratio, notfound_ratio, rwc_novax
                    ])





US
period1
period2
period3
period4
BR
period1
period2
period3
period4
AR
period1
period2
period3
period4
GB
period1
period2
period3
period4
ES
period1
period2
period3
period4
MX
period1
period2
period3
period4
FR
period1
period2
period3
period4
CA
period1
period2
period3
period4
TR
period1
period2
period3
period4
VE
period1
period2
period3
period4
AU
period1
period2
period3
period4
CO
period1
period2
period3
period4
IT
period1
period2
period3
period4
CL
period1
period2
period3
period4
DE
period1
period2
period3
period4
PT
period1
period2
period3
period4
IE
period1
period2
period3
period4
PY
period1
period2
period3
period4
EC
period1
period2
period3
period4
RU
period1
period2
period3
period4
UY
period1
period2
period3
period4
NZ
period1
period2
period3
period4
PL
period1
period2
period3
period4
NL
period1
period2
period3
period4
PE
period1
period2
period3
period4
CU
period1
period2
period3
period4
PA
period1
period2
period3
period4
GR
period1
period2
period3
period4


In [47]:
info_df = pd.DataFrame(info, columns = ["country", "period", "tot_users", "activity", "density", "avg_urls", "avg_youtube", "prop_antivax_labels", "nmi_RTCO",
                                        "LC_prop", "susp_users", "notfound_users", "coms", "tot_novax", "tot_provax", "prop_antivax", "novax_activity", "provax_activity",
                                        "novax_density", "provax_density", "novax_urls", "provax_urls", "novax_youtube", "provax_youtube", 
                                        "novax_LC_prop", "provax_LC_prop", "susp_novax", "susp_provax", "notfound_novax", "notfound_provax",
                                        "activity_ratio", "density_ratio", "urls_ratio", "youtube_ratio", "LC_prop_ratio", "susp_ratio", "notfound_ratio", "rwc_novax"]
                      ).set_index(["country", "period"])

In [48]:
info_df

Unnamed: 0_level_0,Unnamed: 1_level_0,tot_users,activity,density,avg_urls,avg_youtube,prop_antivax_labels,nmi_RTCO,LC_prop,susp_users,notfound_users,...,notfound_novax,notfound_provax,activity_ratio,density_ratio,urls_ratio,youtube_ratio,LC_prop_ratio,susp_ratio,notfound_ratio,rwc_novax
country,period,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
US,period1,14383,10.579823,0.000131,3.513175,0.257874,0.325000,0.789012,LC False True novax ...,0.183480,0.299242,...,0.520805,0.115883,3.368754,2.087119,3.873466,40.289606,48.633205,9.145032,4.494223,0.923934
US,period2,97061,8.386016,0.000025,2.760748,0.140716,0.450000,0.641293,LC False True novax ...,0.157015,0.271716,...,0.466105,0.100326,1.268846,1.273936,1.343769,9.883458,20.097412,7.414329,4.645921,0.775686
US,period3,202545,11.753085,0.000015,3.603224,0.098980,0.250000,0.671800,LC False True novax ...,0.121390,0.231924,...,0.397854,0.081093,1.342454,1.200058,0.995429,4.002849,23.044754,9.072278,4.906122,0.776760
US,period4,247992,12.564771,0.000013,4.760081,0.059837,0.175000,0.581895,0.074198,0.037094,0.092467,...,,,,,,,,,,
BR,period1,69726,1.360515,0.000018,0.076428,0.000459,0.075000,0.138043,0.003638,0.033546,0.313699,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PA,period4,12949,8.070465,0.000353,1.289598,0.012279,0.050000,0.061987,0.018386,0.005638,0.072515,...,,,,,,,,,,
GR,period1,782,2.424904,0.001732,0.166240,0.000000,0.100000,1.000000,0.037829,0.021739,0.130435,...,,,,,,,,,,
GR,period2,3321,3.125489,0.000651,2.470942,0.009335,0.090000,0.168216,LC False True novax ...,0.015357,0.104185,...,0.169312,0.100255,1.206847,11.499367,2.000702,40.507937,5.553514,3.551020,1.688808,0.795935
GR,period3,6374,6.496545,0.000596,5.874804,0.038751,0.233333,0.217185,LC False True novax ...,0.011767,0.094446,...,0.145089,0.090618,0.930260,6.601134,1.080590,10.620764,3.076008,4.810065,1.601116,0.693948
