# Networks Build - CO edgelists
In this notebook we create the cosharing weighted edgelists for each country in each period and save them in the folder_EU_AM.

- The CO (co-sharing) network is an undirected network whose nodes are the users; an edge with weight w connects two users that have shared w common URLs.
- For each selected country and each period, we create a dataframe with columns urls, country, lang, domain, users, where each URL is associated with the list of the users that have shared it.
- Save it in folder_EU_AM in the format /.../period1/IT_it_period1_adj_CO_edges.csv.gz
- From these, create the edgelists: for each row of the dataframe, every combination of two users that have shared the same URL becomes a row of the edgelist. Then add the weight column by counting the occurrences of each pair of users.
- Save edgelist in folder_EU_AM, in the format /.../period1/IT_it_period1_CO_edges.csv.gz

In [77]:
import pandas as pd
from glob import glob
from itertools import combinations, product
from urllib.parse import urlparse

In [3]:
#months (used as prefixes of the raw file names) belonging to each period of the analysis
periods = dict(
    period1 = ["201910", "201911", "201912"],
    period2 = ["202007", "202008", "202009"],
    period3 = ["202010", "202011", "202012"],
    period4 = ["202101", "202102", "202103"],
)

#root folder of the input data
folder_DATA = "/data/public/jlenti/multilang-vax/DATA_clean_url"
#root folder where the networks are saved
folder_EU_AM = "/data/public/jlenti/multilang-vax/EuropeAmerica_RTCO"

#list of the domains that are filtered out of the url data, read from the column "0" of the file
neutrals = pd.read_csv(
    "/home/jlenti/Files/neutral_domains_1309.txt"
).loc[:, "0"].tolist()

In [80]:
#dataframe with the selected (country, lang) pairs, one pair per row
#the first column of the file is used as index
selected_pairs = pd.read_csv(
    "/home/jlenti/Files/country_langs_selected_2104.csv",
    index_col = 0,
)
selected_pairs.head()

Unnamed: 0,country,lang
0,US,en
1,BR,pt
2,AR,es
3,GB,en
4,ES,es


### Example - Italy period 1

In [7]:
#country, language and period used in the worked example
country, lang, period = "IT", "it", "period1"

In [60]:
#read data from folder_DATA, selecting only columns with user, country, lang, urls
#the files are looked up under folder_DATA: the name `folder` used before is not defined in the cells above,
#so the cell failed with a NameError on a fresh kernel
url_columns = ["user_screen_name", "user_country_code", "urls", "lang"]

#all the files of the selected language, for each month of the period
period_files = [file
                for month in periods[period]
                for file in sorted(glob("/".join([folder_DATA, lang, month + "*"])))]
if not period_files:
    #without this check pd.concat fails with a generic "No objects to concatenate"
    raise FileNotFoundError("no files for lang {0} in {1} under {2}".format(lang, period, folder_DATA))

#NOTE(review): quoting = False is equal to 0, that is csv.QUOTE_MINIMAL; if the aim was to switch off
#the handling of quotes the value should be csv.QUOTE_NONE (3) - confirm before changing it
url_data = pd.concat([pd.read_csv(file, lineterminator = "\n",
                                  sep = "\t", low_memory = False,
                                  quoting = False, escapechar = None)\
                      [url_columns]
                      .dropna() #drop rows with a missing value in any of the four columns (e.g. no urls)
                      for file in period_files])


In [61]:
#some tweets contain more than one url, separated by a " ", so they have to be split
#show the urls of the first row labelled 882 as an example
url_data.loc[[882], "urls"].iloc[0]

'http://ilpedante.org/post/esclusioni-scolastiche-e-vaccinazioni-uno-pseudopaper https://twitter.com/Agenzia_Italia/status/1180035408412979200'

In [62]:
#turn the space-separated string of urls of each tweet into a list of urls
url_data["urls"] = url_data["urls"].map(lambda joined_urls: joined_urls.split(" "))
#one row per url: the other columns of the tweet are repeated
url_data = url_data.explode("urls")

In [64]:
#the domain is the network location returned by urlparse()
url_data["domain"] = url_data["urls"].map(lambda url: urlparse(url).netloc)
#strip the leading "www." so that www.domain.com and domain.com are the same domain
url_data["domain"] = url_data["domain"].map(
    lambda netloc: netloc[4:] if netloc.startswith("www.") else netloc
)
#drop the rows whose domain is in the list of neutral domains (urls shorteners, mainstream social networks)
url_data = url_data[~url_data["domain"].isin(neutrals)]

In [67]:
#keep only the rows of the users located in the selected country
url_data = url_data[url_data["user_country_code"] == country]

In [85]:
#group all the users that have shared the same URL (and have the same domain)
#creating the column "users", where we have the array of the distinct users that shared the same url
#the grouped column is selected before aggregating: applying a function to the whole groups (grouping
#columns included) is deprecated in recent pandas, and it left the result under the implicit label 0
hyper_url_data = url_data.groupby(["urls", "domain"])["user_screen_name"].unique() \
.reset_index(name = "users") \
.assign(lang = lang, country = country)[["urls", "country", "lang", "domain", "users"]]
#save them
#hyper_url_data.to_csv(folder_EU_AM + "/{0}/{1}_{2}_{0}_adj_CO_edges.csv.gz".format(period, country, lang), index = False)
hyper_url_data.head()

Unnamed: 0,urls,country,lang,domain,users
0,http://A.Ge,IT,it,A.Ge,[academorosario]
1,http://ANSA.it,IT,it,ANSA.it,"[OdontotecnicaF, giuliano4573, minarompa, cbat..."
2,http://C.Li.Va,IT,it,C.Li.Va,[Clutcher]
3,http://C.VET,IT,it,C.VET,[Sissona]
4,http://CalabriaMagnifica.it,IT,it,CalabriaMagnifica.it,[Calab_Magnifica]


In [86]:
#a URL shared by a single user does not create any link,
#so only the URLs shared by at least two users are kept for the edgelist
shared_by_many = hyper_url_data["users"].map(len) > 1
CO_url_data = hyper_url_data[shared_by_many]

In [113]:
#list of rows of the future edgelist: one row for each pair of users that shared the same url
#combinations(users, 2) returns all the pairs of users from the users that shared the url;
#each pair is sorted because the links are undirected, so a pair is always written in the same order
cosharing_users = [
    [*sorted(user_pair), shared_url, url_country, url_lang, url_domain]
    for shared_url, url_country, url_lang, url_domain, url_users
    in CO_url_data.itertuples(index = False, name = None)
    for user_pair in combinations(url_users, 2)
]

In [114]:
#one row per (pair of users, shared url)
CO_edgelist = pd.DataFrame(
    cosharing_users,
    columns = ["user1", "user2", "url", "country", "lang", "domain"],
)
#in the sorted pairs capital letters always come before lowercase ones
CO_edgelist.head()

Unnamed: 0,user1,user2,url,country,lang,domain
0,OdontotecnicaF,giuliano4573,http://ANSA.it,IT,it,ANSA.it
1,OdontotecnicaF,minarompa,http://ANSA.it,IT,it,ANSA.it
2,OdontotecnicaF,cbatcaselli,http://ANSA.it,IT,it,ANSA.it
3,OdontotecnicaF,lillydessi,http://ANSA.it,IT,it,ANSA.it
4,OdontotecnicaF,carloerbaa,http://ANSA.it,IT,it,ANSA.it


In [125]:
#the weight of a link is the number of urls shared by both users, i.e. the number of rows of the pair
weighted_CO_edgelist = (
    CO_edgelist
    .groupby(["user1", "user2"])["url"]
    .count()
    .reset_index(name = "weight")
)
#weighted_CO_edgelist.to_csv(folder_EU_AM + "/{0}/{1}_{2}_{0}_CO_edges.csv.gz".format(period, country, lang), index = False)
weighted_CO_edgelist.head()

Unnamed: 0,user1,user2,weight
0,000Salvatore,AndreaLisi15,1
1,000Salvatore,IacobellisT,1
2,000Salvatore,MPaperoga,1
3,000Salvatore,alpardu,1
4,000Salvatore,angy_cocco,1


## All countries - Periods

In [126]:
#build the cosharing edgelist of every selected (country, lang) pair in every period
#NOTE: the two to_csv calls are commented out, so nothing is written to folder_EU_AM until they are restored
for period in periods:
    for _, (country, lang) in selected_pairs.iterrows():
        #all the files of the selected language, for each month of the period
        #the files are looked up under folder_DATA: the name `folder` used before is not defined
        #in the cells above, so the loop failed with a NameError on a fresh kernel
        period_files = [file
                        for month in periods[period]
                        for file in sorted(glob("/".join([folder_DATA, lang, month + "*"])))]
        if not period_files:
            #pd.concat raises on an empty list: report the missing data and go on with the other pairs
            print("no files found for lang {0} in {1}: skipping {2}".format(lang, period, country))
            continue

        #read data from folder_DATA, selecting only columns with user, country, lang, urls
        url_data = pd.concat([pd.read_csv(file, lineterminator = "\n",
                                  sep = "\t", low_memory = False,
                                  quoting = False, escapechar = None)\
                      [["user_screen_name", "user_country_code", "urls", "lang"]]
                      .dropna() #drop rows with a missing value in any of the four columns (e.g. no urls)
                      for file in period_files])
        #split urls separated by a space
        url_data["urls"] = url_data["urls"].map(lambda joined_urls: joined_urls.split(" "))
        url_data = url_data.explode("urls") #create one row per urls

        #extract the domain from the urls with urlparse()
        url_data["domain"] = [urlparse(u).netloc for u in url_data["urls"]]
        #if the domain has the form www.domain.com keep only domain.com
        url_data["domain"] = url_data["domain"].apply(lambda x: x[4:] if x[:4] == "www." else x)
        #filter out domain pointing to urls shorteners or mainstream social networks
        url_data = url_data.query("domain not in @neutrals")
        #create the dataframe only for one country
        url_data = url_data.query("user_country_code == @country")

        #group all the users that have shared the same URL (and have the same domain)
        #creating the column "users", where we have the array of the distinct users that shared the same url
        #the grouped column is selected before aggregating: applying a function to the whole groups
        #(grouping columns included) is deprecated in recent pandas
        hyper_url_data = url_data.groupby(["urls", "domain"])["user_screen_name"].unique() \
        .reset_index(name = "users") \
        .assign(lang = lang, country = country)[["urls", "country", "lang", "domain", "users"]]
        #save them
        #hyper_url_data.to_csv(folder_EU_AM + "/{0}/{1}_{2}_{0}_adj_CO_edges.csv.gz".format(period, country, lang), index = False)

        #for creating the edgelist filter only the URLs that have been shared by more than one user
        #that are the only URL that cause the creation of the links
        CO_url_data = hyper_url_data[hyper_url_data["users"].map(len) > 1]

        #create a list that will be transformed in a dataframe with all the combinations of users
        #that shared the same urls
        cosharing_users = []
        for l_urls, l_country, l_lang, l_dom, l_usr in CO_url_data.itertuples(index = False, name = None):
            #combinations(l, k) return all the combination of size k from the list l
            #so we can have the pairs of users that shared the same url from the list of users that shared the same url
            for pair in combinations(l_usr, 2):
                pair = sorted(pair) #since the links are undirected I want all of them in the same order, alphabetical
                cosharing_users.append([pair[0], pair[1], l_urls, l_country, l_lang, l_dom])

        CO_edgelist = pd.DataFrame(cosharing_users, columns = ["user1", "user2", "url", "country", "lang", "domain"])

        #count rows and give weight to each link
        weighted_CO_edgelist = CO_edgelist.groupby(["user1", "user2"])["url"].count().reset_index(name = "weight")
        #weighted_CO_edgelist.to_csv(folder_EU_AM + "/{0}/{1}_{2}_{0}_CO_edges.csv.gz".format(period, country, lang), index = False)
        