# 10 RWC - NoVax vs Rest of Networks
In this notebook we measure the polarization of the retweets networks having a no-vax community with the Random Walk Controversy Score (RWC).

Given a partition X, Y, it measures how separated the two sides are, by comparing the probability that a random walk that ended in a popular node of one side started on that same side with the probability that it started on the opposite side.

$RWC = P_{XX}P_{YY} - P_{XY}P_{YX}$, 

where $P_{XY}$ = P(random walk started in Y|random walk ended in a popular node in X).

So, to measure it, we need to extract the k most popular nodes of each side, simulate random walks ending in these nodes, and compare the probability that they started on the same side with the probability that they started on the opposite side.

We can measure it only in those networks that have a no-vax community.

In [17]:
import pandas as pd
import networkx as nx
from glob import glob

In [12]:
#data on edges and users are stored in this folder
folder = "/data/public/jlenti/multilang-vax/EuropeAmerica_RTCO"

#list of novax communities
#the first column after the index holds ids that split on "_" into (country, period, community)
#(an earlier version of the list is in /home/jlenti/Files/novax_communities_thres066_0402.csv)
#i transform it into a dataframe with columns (com, country, period, community)
#.iloc[0] takes the first column by position: x[0] on a label-indexed row relies on a
#positional fallback that recent pandas versions deprecate
novax_com = pd.read_csv("/home/jlenti/Files/novax_communities_9tot_antivax_0902.csv", index_col = 0) \
.apply(lambda x: pd.Series([x.iloc[0]] + x.iloc[0].split("_"),
                           index = ["com", "country", "period", "community"]), axis = 1)

#one row per (country, period), holding the list of its novax community numbers (as integers)
#period stays a string here, because it comes out of str.split
novax_com_l = novax_com.assign(community = lambda x: x["community"].astype(int)) \
.groupby(["country", "period"])["community"].agg(list).to_frame("communities")


#list of all countries (size ordered)
countries = ["US", "BR", "AR", "GB", "ES", "MX", "FR", "CA", "TR", "VE", "AU", "CO", "IT", "CL", "DE",
             "PT", "IE", "PY", "EC", "RU", "UY", "NZ", "PL", "NL", "PE", "CU", "PA", "GR"]

#named periods: period1, ..., period4
periods = ["period" + str(u) for u in range(1, 5)]


In [18]:
#easy way to get data from the folder, just giving as input country, period, and object needed,
#getting the file names with glob
def read_data(country, period, obj, layer):
    """Read one csv of the dataset into a dataframe.

    The file is looked up with the glob pattern
    <folder>/<period>/<country>*<layer>_<obj>*, where folder is the
    notebook-level data folder. If several files match, the first one in
    alphabetical order is read.

    country, period, obj, layer: strings used to build the pattern
    (e.g. "IT", "period1", "com", "RT").

    Returns the content of the file as a pandas DataFrame.
    Raises IndexError if no file matches the pattern.
    """
    pattern = "/".join([folder, period, country + "*" + layer + "_" + obj + "*"])
    matches = sorted(glob(pattern))
    if not matches:
        #same exception type that indexing the empty list would raise, but the message
        #now tells which pattern found nothing
        raise IndexError("no file matches the pattern " + repr(pattern))
    return pd.read_csv(matches[0])

### Example

In [24]:
#k is the number of most popular nodes I select from each side
#(it is the argument of head() once the users are sorted by in-degree)
k = 100

In [56]:
country, period = "IT", "period1"
#the table of novax communities is indexed by the last character of the period name
p = period[-1]
#list of novax communities in the country, period selected
#(row key given as an explicit tuple, column picked in the same lookup)
novax_coms = novax_com_l.loc[(country, p), "communities"]
novax_coms

[1]

In [57]:
#read the two tables of the RT layer for the selected country and period:
#"com" (users with their community) and "ed" (edgelist of the RT network)
coms, edges = (read_data(country, period, obj, "RT") for obj in ("com", "ed"))

In [58]:
#instead of the community number, I need the attribute "novax", telling whether the user
#is in a novax community
coms = coms.assign(novax = coms["community"].isin(novax_coms))
coms.head()

Unnamed: 0,user,community,novax
0,000Salvatore,1,True
1,CriticaScient,1,True
2,DavideFalchieri,1,True
3,FmMosca,1,True
4,GavinoSanna1967,1,True


In [59]:
#create the weighted directed networkx graph (DiGraph):
#one edge user -> user_RT per row of the edgelist, carrying the attribute "weight"
#on this graph i can perform all the random walks
G = nx.from_pandas_edgelist(
    edges,
    source = "user", target = "user_RT",
    edge_attr = "weight",
    create_using = nx.DiGraph,
)

In [60]:
#in-degree dataframe (one row per node of G), for selecting the most popular nodes in each side
in_deg = pd.DataFrame(list(G.in_degree()), columns = ["user", "in_degree"])

In [61]:
#we call users1 the ones in novax community, and users2 the ones in other side
is_novax = coms["novax"]
users_com1 = coms.loc[is_novax, "user"]
users_com2 = coms.loc[~is_novax, "user"]

In [62]:
#for each user we get information on its indegree and on its membership to a novax community
#(the merge keeps only the users found in both tables)
#then we sort nodes by indegree, once for both sides, and extract the first k ones of each side
ranked = coms.merge(in_deg).sort_values("in_degree", ascending = False)
top1 = ranked[ranked["novax"]].head(k)["user"]
top2 = ranked[~ranked["novax"]].head(k)["user"]

In [63]:
#we will define random walks having uniform probability to restart in one node of side X
#for this, we define these uniform distributions, assigning probability 1 / N_X to the N_X nodes
#of side X, and ~0 (1e-12) to the nodes of the other side
N1, N2 = len(users_com1), len(users_com2)
uni_users1 = {**{u: 1 / N1 for u in users_com1}, **{u: 1e-12 for u in users_com2}}
uni_users2 = {**{u: 1 / N2 for u in users_com2}, **{u: 1e-12 for u in users_com1}}

In [64]:
#i compute the personalized page rank, with restart on the nodes of one side
#it simulates random walks (until convergence): at each step the walk follows an out-edge
#with probability alpha = 0.85 and, with probability 1 - alpha = 0.15, restarts at a node
#of side X drawn from the uniform distribution defined above
#the same distribution is used for the jump out of nodes without out-edges (dangling)
#in this way we have the probability of ending at each node, having started at side X
pagerank_args = dict(alpha = 0.85, max_iter = 100000)

pr1 = nx.pagerank(G, personalization = uni_users1, dangling = uni_users1, **pagerank_args)
pr2 = nx.pagerank(G, personalization = uni_users2, dangling = uni_users2, **pagerank_args)
pr1

{'000Salvatore': 9.402002761919295e-05,
 'CriticaScient': 0.001634102728491285,
 'DavideFalchieri': 0.0027620003841711174,
 'FmMosca': 0.0003316328218032935,
 'GavinoSanna1967': 0.006109954512192893,
 'luigivanti': 0.0035538771479295783,
 'moschettopres': 0.0026683873112264183,
 '00MrFrost00': 2.603414564775453e-13,
 'RobertoBurioni': 0.0020332083414692365,
 '0Dawnstar0': 2.603414564775453e-13,
 '0x800a': 2.603414564775453e-13,
 'repubblica': 1.0165124130026396e-05,
 '1000_best': 9.402002761919295e-05,
 'massimo4951': 0.07929637249202601,
 '1211andreas': 9.402002761919295e-05,
 'Useppe00': 0.0009359771917490753,
 '123stellas': 9.402002761919295e-05,
 'Fedele233Fedele': 0.0002610311057669181,
 '14FranCesc': 2.603414564775453e-13,
 'tiziana_botti': 0.0005216680927598599,
 '1603ina': 9.402002761919295e-05,
 'sabrika65': 0.013483939307411395,
 '17_elamanu': 9.402002761919295e-05,
 'sangesnicola': 0.006717483607187567,
 '1950Elda': 9.402002761919295e-05,
 'valteroma': 0.00027868165605814745

In [65]:
#create a dataframe with users and page ranks (one per starting side), filter users at the top
#and measure the total probability of ending the RW in one of these nodes
#compute this, for RW starting at side 1 or 2, and ending at side 1 or 2
#rw_YX is the probability of ending at side X having started at side Y
pr1_df = pd.DataFrame({"user": pr1.keys(), "pr": pr1.values()})
pr2_df = pd.DataFrame({"user": pr2.keys(), "pr": pr2.values()})

rw_11 = pr1_df.merge(top1).sum()["pr"]
rw_12 = pr1_df.merge(top2).sum()["pr"]
rw_21 = pr2_df.merge(top1).sum()["pr"]
rw_22 = pr2_df.merge(top2).sum()["pr"]

In [66]:
#Bayes' rule turns "where the walk ends given where it started" (the rw_ab computed before)
#into "where the walk started given where it ended" (p_ab):
#p_ab = P(started a|ended b) = P(ended b|started a) * P(started a) / P(ended b)
#P(ended b|started a) = rw_ab
#P(started a) = #users in a / #users
#P(ended b) = P(ended b|started 1) * P(started 1) + P(ended b|started 2) * P(started 2)


#proportion of nodes per side = P(started a)
prop1, prop2 = len(users_com1) / len(coms), len(users_com2) / len(coms)

#P(ended b): normalising constant shared by p_1b and p_2b
end1 = rw_11 * prop1 + rw_21 * prop2
end2 = rw_12 * prop1 + rw_22 * prop2

#P(started a|ended b) = p_ab
#every numerator is weighted with the proportion of the side where the walk STARTED,
#so that p_1b + p_2b = 1 (p12 and p21 previously used the proportion of the other side;
#their product, and hence the RWC, is the same either way)
p11 = rw_11 * prop1 / end1
p12 = rw_12 * prop1 / end2
p21 = rw_21 * prop2 / end1
p22 = rw_22 * prop2 / end2

#RWC = P_XX * P_YY - P_XY * P_YX
rwc = p11 * p22 - p12 * p21

In [67]:
#display the RWC score of the example network
rwc

0.927006410657293

### Function RWC score
Starting from the steps above, we can define a function that takes as input the edgelist and the assignment of nodes to sides, and returns the RWC score.

In [68]:

#function for computing the RWC in a RT network with sides novax = True/False
#give as input an edgelist with columns user, user_RT, weight, and a dataframe with columns user, community, novax (T/F),
#it computes the RW, selecting the k most popular users per side
def rwc(edges, com, k = 100, alpha = 0.85):
    """Random Walk Controversy (RWC) score between the novax side and the rest.

    edges: dataframe with columns "user", "user_RT", "weight"; each row becomes the
        directed edge user -> user_RT of the graph.
    com: dataframe with a column "user" and a boolean column "novax" telling the side
        of each user (side 1: novax, side 2: the rest).
    k: number of most popular (highest in-degree) users selected on each side.
    alpha: damping factor passed to nx.pagerank, i.e. the probability of following
        an edge at each step; the walk restarts with probability 1 - alpha.

    Returns p11 * p22 - p12 * p21, where pAB is the probability that a walk started
    in side A given that it ended in one of the top k users of side B.
    """
    G = nx.from_pandas_edgelist(edges, source = "user", target = "user_RT",
                                edge_attr = "weight", create_using = nx.DiGraph)

    #in-degree dataframe
    in_deg = pd.DataFrame(G.in_degree, columns = ["user", "in_degree"])

    #most popular nodes per side (users of com that are nodes of G, highest in-degree first)
    #we measure the probability to end in one of the top nodes of the community
    ranked = com.merge(in_deg).sort_values("in_degree", ascending = False)
    top1 = ranked.query("novax").head(k)["user"]
    top2 = ranked.query("~novax").head(k)["user"]

    users_com1 = com.query("novax")["user"]
    users_com2 = com.query("~novax")["user"]
    len_users_com1 = len(users_com1)
    len_users_com2 = len(users_com2)

    #define uniform distribution on the nodes of one side (~0 on the other side)
    uni_users1 = {u: 1 / len_users_com1 for u in users_com1}
    uni_users1.update({u: 1e-12 for u in users_com2})
    uni_users2 = {u: 1 / len_users_com2 for u in users_com2}
    uni_users2.update({u: 1e-12 for u in users_com1})

    #personalized page rank. At each step the RW follows an edge with probability alpha and,
    #with probability 1 - alpha, restarts from a node drawn from personalization
    #(restart with a random node of that side); the same distribution is used for the jump
    #out of the nodes without out-edges (dangling)
    pr1 = pd.Series(nx.pagerank(G, alpha = alpha,
                                personalization = uni_users1,
                                dangling = uni_users1, max_iter = 100000), dtype = float)

    pr2 = pd.Series(nx.pagerank(G, alpha = alpha,
                                personalization = uni_users2,
                                dangling = uni_users2, max_iter = 100000), dtype = float)

    #sum of the PR of the side top nodes. It is the probability that a personalized RW will end in a top
    #node of that community
    #rw_AB: the RW restarts in side A and ends in a top node of side B
    rw_11 = pr1.reindex(top1).sum()
    rw_12 = pr1.reindex(top2).sum()
    rw_21 = pr2.reindex(top1).sum()
    rw_22 = pr2.reindex(top2).sum()

    #proportion of nodes per side = probability to start in that side
    prop1, prop2 = len_users_com1 / len(com), len_users_com2 / len(com)

    #probability to end in a top node of side 1 / side 2, wherever the RW started
    end1 = rw_11 * prop1 + rw_21 * prop2
    end2 = rw_12 * prop1 + rw_22 * prop2

    #conditional probability (Bayes)
    #pAB: probability to have started in side A given that it ended in side B
    #every numerator uses the proportion of the side where the RW STARTED, so that
    #p11 + p21 = 1 and p12 + p22 = 1 (p12 and p21 used to take the proportion of the other
    #side; their product, and so the score, is the same either way)
    p11 = rw_11 * prop1 / end1
    p12 = rw_12 * prop1 / end2
    p21 = rw_21 * prop2 / end1
    p22 = rw_22 * prop2 / end2

    #RWC definition
    return p11 * p22 - p12 * p21
    

In [69]:
#apply the function to the edgelist and the users table of the example network loaded above
rwc(edges, coms)

0.927006410657293