# Networks Build - RT edgelists
In this notebook we create the weighted retweet (RT) edgelists for each country in each period and save them in the folder `folder_EU_AM`.

- The RT network is a directed network where nodes are users and an edge is a directed link that represents one user retweeting another. The weight of the edge is the number of retweets from one user to the other.
- For each selected country and each period, we create an edgelist, keeping only the tweets written in the selected language.
- All the dataframes are stored in /data/public/jlenti/multilang-vax/EuropeAmerica_RTCO, in the format /.../period1/IT_it_period1_RT_edges.csv.gz


In [4]:
import pandas as pd
from glob import glob
import numpy as np
import networkx as nx

In [27]:
# Year-month strings that make up each analysis period, in chronological order.
periods = dict(
    period1 = ["201910", "201911", "201912"],
    period2 = ["202007", "202008", "202009"],
    period3 = ["202010", "202011", "202012"],
    period4 = ["202101", "202102", "202103"],
)

# Folder named in the notebook introduction as the destination of the edgelists.
folder_EU_AM = "/data/public/jlenti/multilang-vax/EuropeAmerica_RTCO"
# Root folder used to build the glob patterns for the input files.
folder_url = "/data/public/jlenti/multilang-vax/DATA_clean_url"

### Example

In [10]:
# Example: load every 2019 file of one language into a single dataframe with
# columns user, country, user_RT, country_RT, lang.
c, l = "NL", "nl"

example_columns = ["user_screen_name", "user_country_code",
                   "RT_user_screen_name", "RT_user_country_code", "lang"]

example_frames = []
for file in sorted(glob(folder_url + "/{0}/2019*".format(l))):
    frame = pd.read_csv(file, lineterminator = "\n", sep = "\t", low_memory = False,
                        quoting = False, escapechar = None)
    # Keep only the columns of interest and drop rows where any of them is missing.
    example_frames.append(frame[example_columns].dropna())

RT_data = pd.concat(example_frames).rename(columns = {"user_country_code": "country",
                                                      "RT_user_country_code": "country_RT",
                                                      "user_screen_name": "user",
                                                      "RT_user_screen_name": "user_RT"})


In [15]:
# Keep the rows where both users are in country `c` and the language is `l`,
# then count the rows of each (user, user_RT) pair: that count is the edge weight.
same_country = RT_data.query("(country == @c)&(country_RT == @c)&(lang == @l)")
edges = (
    same_country
    .groupby(["user", "user_RT"])["lang"]
    .count()
    .rename("weight")
    .reset_index()
)
edges.head()

Unnamed: 0,user,user_RT,weight
0,020Ruijgrok,shossontwits,1
1,020tho,thiessenmark,1
2,030Binnenstad,hansdamen,2
3,0limpiaElena,VVD,2
4,0limpiaElena,VVDBunnik,4


## All RT Networks

In [12]:
# Country-language pairs selected for the analysis; the first CSV column is the index.
selected_pairs_file = "/home/jlenti/Files/country_langs_selected_2104.csv"
selected_pairs = pd.read_csv(selected_pairs_file, index_col = 0)

In [20]:
def _countries_of(group):
    """Return the `country` values of one language group as a plain list."""
    return group["country"].tolist()

# One row per language; the list of its countries lands in the unnamed column 0.
langs_countries_list = selected_pairs.groupby("lang").apply(_countries_of).reset_index()

In [23]:
# Display the language -> countries table.
langs_countries_list

Unnamed: 0,lang,0
0,de,[DE]
1,el,[GR]
2,en,"[US, GB, CA, AU, IE, NZ]"
3,es,"[AR, ES, MX, VE, CO, CL, PY, EC, UY, PE, CU, PA]"
4,fr,[FR]
5,it,[IT]
6,nl,[NL]
7,pl,[PL]
8,pt,"[BR, PT]"
9,ru,[RU]


In [26]:
# Scratch check: exhaust iterrows() so that `row` is left holding the last
# (index, Series) pair, then display it to see what one iteration item looks like.
for row in langs_countries_list.iterrows():
    a = 0  # placeholder loop body; the value is not used in this cell
row

(10,
 lang      tr
 0       [TR]
 Name: 10, dtype: object)

In [None]:
# Build and save one weighted retweet edgelist per (period, language, country).
#
# For every period and language, all matching monthly files are loaded once;
# then, for each country of that language, rows where both users belong to the
# country are grouped by (user, user_RT) and counted to obtain the edge weight.

# Networks with more distinct users than this keep only edges with weight > 1.
max_nodes = 200000

rt_columns = ["user_screen_name", "user_country_code",
              "RT_user_screen_name", "RT_user_country_code",
              "lang"]

for period in periods:
    print(period)
    # Each row of langs_countries_list unpacks to (lang, list of countries).
    for _, (lang, countries) in langs_countries_list.iterrows():
        print(lang)
        files = [file
                 for month in periods[period]
                 for file in sorted(glob(folder_url + "/{1}/{0}*".format(month, lang)))]
        if not files:
            # pd.concat raises ValueError on an empty list; report and move on.
            print(lang, period, "no files found")
            continue

        # NOTE(review): quoting = False is numerically csv.QUOTE_MINIMAL (0);
        # confirm that csv.QUOTE_NONE was not the intended setting.
        RT_data = pd.concat([
            pd.read_csv(file, lineterminator = "\n",
                        sep = "\t", low_memory = False,
                        quoting = False, escapechar = None)[rt_columns]
            .dropna()
            # A single blank is treated as a missing country code as well.
            .query("(user_country_code != ' ')&(RT_user_country_code != ' ')")
            for file in files
        ]).rename(columns = {"user_country_code": "country",
                             "RT_user_country_code": "country_RT",
                             "user_screen_name": "user",
                             "RT_user_screen_name": "user_RT"})

        for country in countries:
            edges = (
                RT_data.query("(country == @country)&(country_RT == @country)&(lang == @lang)")
                .groupby(["user", "user_RT"])
                .count().rename(columns = {"lang": "weight"})["weight"].reset_index()
            )

            # Number of distinct users appearing at either end of an edge.
            n_nodes = len(edges[["user", "user_RT"]].stack().unique())
            if n_nodes > max_nodes:
                edges = edges.query("weight > 1")
                print(country, lang, period, "weight 1")

            # Presumably the period sub-folder already exists; it is not created here.
            edges.to_csv("/".join([folder_EU_AM, period, "_".join([country, lang, period, "RT", "edges.csv.gz"])]),
                         compression = "gzip", index = False)