# 14 Cross Countries URLs - Save Data
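In this notebook we analyse the retweets that contain URLs and cross a border, i.e. retweets where the retweeting user and the retweeted user are located in different countries.
We extract these retweets, flag the low-credibility domains and study the flow of misinformation between countries.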

In [None]:
import csv
from glob import glob
from urllib.parse import urlparse

import numpy as np
import pandas as pd

In [None]:
#root folder of the tweet files
folder = "/data/public/jlenti/multilang-vax/DATA_clean_url"

#neutral domains (url shorteners, twitter.com, facebook.com); the file stores them in the column "0"
neutrals = pd.read_csv("/home/jlenti/Files/neutral_domains_1309.txt").loc[:, "0"].tolist()

#low-credible domains: three files merged in this order (merged list, Le Monde list, Greek list)
#the Le Monde file stores its domains in the column "domain", the other two in the column "0"
blacklist = [
    domain
    for path, column in [
        ("/home/jlenti/Files/merged_blacklist_1309.txt", "0"),
        ("/home/jlenti/Files/lemonde_blacklist_2709.txt", "domain"),
        ("/home/jlenti/Files/greek_blacklist_1009.txt", "0"),
    ]
    for domain in pd.read_csv(path)[column].tolist()
]

#domains associated to youtube
youtube_domains = [
    "youtube.com",
    "youtu.be",
]

#all the countries of the study (size ordered, as stated by the original note)
countries = [
    "US", "BR", "AR", "GB", "ES", "MX", "FR",
    "CA", "TR", "VE", "AU", "CO", "IT", "CL",
    "DE", "PT", "IE", "PY", "EC", "RU", "UY",
    "NZ", "PL", "NL", "PE", "CU", "PA", "GR",
]

#countries speaking english or italian, the ones with a list of low-credible domains
#NOTE(review): the blacklist above also merges a Le Monde and a Greek list,
#confirm whether FR and GR should be part of this list
LC_countries = [
    "IT",
    "US", "GB", "AU", "NZ", "IE",
]

#the same countries, sorted by language
lang_sort = [
    "US", "IE", "GB", "CA", "NZ", "AU",
    "FR", "IT", "PL", "NL", "DE", "RU", "TR",
    "BR", "PT", "GR",
    "AR", "ES", "MX", "VE", "CO", "CL",
    "PY", "EC", "UY", "PE", "CU", "PA",
]

#named periods: each name maps to its three months, written as YYYYMM
periods = {
    "period1": ["201910", "201911", "201912"],
    "period2": ["202007", "202008", "202009"],
    "period3": ["202010", "202011", "202012"],
    "period4": ["202101", "202102", "202103"],
}

In [None]:
#users with wrong geolocations: the "user" column of the two mismatch files, one after the other
filtered_users = pd.concat(
    [
        pd.read_csv(mismatch_file)
        for mismatch_file in (
            "/data/public/jlenti/multilang-vax/Geolocation_Mismatches/more_countries_users_RT.csv",
            "/data/public/jlenti/multilang-vax/Geolocation_Mismatches/misgeo_popular_user_countries_pairs.csv",
        )
    ]
)["user"].tolist()

In [None]:
#from "/data/public/jlenti/multilang-vax/DATA_clean_url" I query the tweets that are retweets between
#different countries (in our list of countries), and that contain a url
#I create a dictionary associating the period to the corresponding dataframe (with all languages)

#columns kept from every tweet file
RETWEET_COLUMNS = ["user_screen_name", "RT_user_screen_name", "user_country_code",
                   "RT_user_country_code", "urls", "lang"]


def load_cross_border_retweets(file):
    """Read one tab-separated tweet file and keep the cross-country retweets with a url.

    A row is kept when
    - its "urls" value is not missing,
    - both country codes are in `countries` and they differ,
    - neither screen name is in `filtered_users`.

    Parameters
    ----------
    file : str
        Path of the file to read.

    Returns
    -------
    pandas.DataFrame
        The selected rows, restricted to RETWEET_COLUMNS.
    """
    tweets = pd.read_csv(file, sep = "\t", lineterminator = "\n",
                         low_memory = False,
                         #the original call passed quoting = False, which is the integer 0,
                         #i.e. csv.QUOTE_MINIMAL (quoting is NOT disabled); now written explicitly
                         quoting = csv.QUOTE_MINIMAL,
                         escapechar = None)
    return (tweets[RETWEET_COLUMNS]
            #NaN != NaN, so this drops the rows without a url
            .query("urls == urls")
            #keep only retweets between different countries of our list
            .query("(user_country_code in @countries)&(RT_user_country_code in @countries)&(user_country_code != RT_user_country_code)")
            #discard the users with wrong geolocations
            .query("(user_screen_name not in @filtered_users)&(RT_user_screen_name not in @filtered_users)"))


def load_period_retweets(months):
    """Concatenate the cross-country retweets of all the files of the given months.

    Parameters
    ----------
    months : list of str
        Month prefixes of the file names (e.g. "202007").

    Returns
    -------
    pandas.DataFrame
        The rows of every matching file, months in the given order and
        files sorted by path within each month.

    Raises
    ------
    ValueError
        If no file matches any of the months.
    """
    #files in the folder have the format folder/it/20200101-it.....tsv.gz
    #to keep all data from a specific month I select folder/*/month*
    files = [file
             for month in months
             for file in sorted(glob("/".join([folder, "*", month + "*"])))]
    if not files:
        #pd.concat would fail anyway with "No objects to concatenate": say what is missing instead
        raise ValueError("no tweet files found for months {} in {}".format(months, folder))
    return pd.concat([load_cross_border_retweets(file) for file in files])


#each period has 3 months (in dictionary periods)
cross_border_urls = {p: load_period_retweets(periods[p]) for p in periods}

In [None]:
def extract_domain(url):
    """Return the host of `url`, lower-cased and without a leading "www.".

    The host is read from `hostname` instead of `netloc`: `netloc` keeps the
    letter case, the "user:password@" prefix and the ":port" suffix, so
    "http://WWW.Example.com:80/" would give "WWW.Example.com:80" and never
    match a listed domain. A url without a host gives "".
    """
    host = urlparse(url).hostname or ""
    return host[4:] if host.startswith("www.") else host


#host names are case-insensitive: compare lower-cased domains with lower-cased lists
#(non-string entries of the lists could never match a domain, so they are skipped)
blacklist_hosts = {domain.lower() for domain in blacklist if isinstance(domain, str)}
neutral_hosts = {domain.lower() for domain in neutrals if isinstance(domain, str)}

for p in periods:
    period_urls = cross_border_urls[p]
    #I use urlparse from library urllib to extract the domain from all the urls
    #NOTE(review): urlparse is applied to the whole "urls" value, which assumes a
    #single url per row - TODO confirm
    period_urls["domain"] = period_urls["urls"].apply(extract_domain)
    #label low-credible domains with LC True (that are stored in blacklist)
    period_urls["LC"] = period_urls["domain"].isin(blacklist_hosts)
    #label neutral domains with neutral True (that are stored in neutrals. urls shorteners, social networks, or generic domains)
    period_urls["neutral"] = period_urls["domain"].isin(neutral_hosts)

In [None]:
#stack the dataframes of all the periods in a single dataframe,
#the new column "period" stores the name of the period of each row
labelled_frames = []
for period_name in periods:
    labelled_frames.append(cross_border_urls[period_name].assign(period = period_name))
cross_urls_df = pd.concat(labelled_frames)

In [None]:
#saving is disabled: uncomment the next line to write the merged dataframe to csv
#cross_urls_df.to_csv("/home/jlenti/Files/cross_border_retweeted_urls_2104.csv", index = False)