# Sample tweets per community
In this notebook we create a dataframe with 20 sample tweets for each RT community for each country.<br>
In /data/public/jlenti/multilang-vax/EuropeAmerica_RTCO, all the communities found with hierarchical clustering are saved. Each user is associated with their community.<br>
In /data/public/jlenti/multilang-vax/DATA_clean_url all data about tweets are stored.<br>
For each country:
- we extract the list of users of each community
- we extract 40 sample files of the language and period analyzed
- we filter the tweets shared by users in the list
- 20 tweets are extracted and translated into English
- 20 rows will be added to the dataframe, with columns text, translated_text, country, language, community

In [2]:
from glob import glob
import pandas as pd
import numpy as np
import random
import sys
sys.path.insert(0, "/home/jlenti/Codes/Multilayer_RT_CO_ME/")
from deep_translator import GoogleTranslator
import gspread
from oauth2client.service_account import ServiceAccountCredentials

In [4]:
#where the RT networks and the detected communities are stored
folder_com = "/data/public/jlenti/multilang-vax/EuropeAmerica_RTCO"
#where the tweet files are stored
folder_data = "/data/public/jlenti/multilang-vax/DATA_clean_url"
#each analysed period mapped to the list of months it covers
periods = dict(
    period1 = ["201910", "201911", "201912"],
    period2 = ["202007", "202008", "202009"],
    period3 = ["202010", "202011", "202012"],
    period4 = ["202101", "202102", "202103"],
)
#codes of the countries analysed (order is the iteration order used later)
countries = ("US BR AR GB ES MX FR CA TR VE AU CO IT CL DE "
             "PT IE PY EC RU UY NZ PL NL PE CU PA GR").split()

## Example

In [5]:
period, country = "period1", "DE"
#in folder_com we have files such as ../DE_de_period1_RT_communities.csv.gz
#with columns [user, community]
#for each community we extract the list of users
com_files = sorted(glob("/".join([folder_com, period, country + "*RT*com*"])))
if not com_files:
    #fail with an explicit message instead of a bare IndexError on [0]
    raise FileNotFoundError("no RT community file for " + country + " in " + period)
file = com_files[0]
#the language is the second "_"-separated field of the file name (DE_de_period1_...).
#Only the file name is split: splitting the whole path would shift the position
#as soon as a folder name contains a different number of underscores.
lang = file.split("/")[-1].split("_")[1]
coms = pd.read_csv(file)

In [14]:
#users belonging to the chosen community, each listed once
community = 2
in_community = coms["community"] == community
users = coms.loc[in_community, "user"].drop_duplicates().tolist()


In [9]:
#for each month of the period read up to 20 distinct files of the right language
#and keep up to 50 tweets per file from the considered country (without replacement)
sampled = []
for month in periods[period]:
    month_files = glob("/".join([folder_data, lang, month + "*"]))
    #random.sample never picks the same file twice (random.choices could);
    #min(...) keeps it valid when fewer than 20 files exist
    for f in random.sample(month_files, k = min(20, len(month_files))):
        #NOTE(review): quoting = False is the integer 0, i.e. csv.QUOTE_MINIMAL (the default);
        #confirm that csv.QUOTE_NONE was not intended
        file_tweets = pd.read_csv(f, lineterminator = "\n", sep = "\t", low_memory = False,
                                  quoting = False, escapechar = None)
        file_tweets = file_tweets.loc[file_tweets["user_country_code"] == country,
                                      ["user_screen_name", "text"]]
        #a file may hold fewer than 50 tweets from the country: take what is there
        sampled.append(file_tweets.sample(n = min(50, len(file_tweets)), replace = False))
#pd.concat raises a ValueError if no file at all was found for this language/period
df = pd.concat(sampled)

In [10]:
#keep the tweets written by users of the community of interest, first 20 only
by_community_users = df["user_screen_name"].isin(users)
tweets = df.loc[by_community_users, "text"].head(20).tolist()

In [11]:
#translate the tweets into English with Google Translate
#the translator depends only on the language, so it is built once and reused for every tweet
translator = GoogleTranslator(source = lang, target = "en")
trans_tweets = [translator.translate(u) for u in tweets]

In [15]:
#one row per tweet: original text, English translation and the lang/country/community labels
#NOTE(review): the column name "commuinty" is a typo for "community"; left unchanged here
(
    pd.DataFrame([tweets, trans_tweets])
    .T
    .rename(columns = {0: "tweet", 1: "en_tweet"})
    .assign(lang = lang, country = country, commuinty = community)
)

Unnamed: 0,tweet,en_tweet,lang,country,commuinty
0,Die #Masern sind in Deutschland noch immer nic...,#Measles has still not been eliminated in Germ...,de,DE,2
1,💉 Bislang dürfen nur Ärzte ihre Patienten impf...,"💉 So far, only doctors are allowed to vaccinat...",de,DE,2
2,2006 wurde der erste Impfstoff gegen #HPV zuge...,In 2006 the first vaccine against #HPV was app...,de,DE,2
3,"Die #Grippewelle hat noch nicht begonnen, erst...","The #flu wave has not yet started, but there a...",de,DE,2
4,Eine Vermischung anthroposophischer Ideen und ...,"In principle, I consider a mixture of anthropo...",de,DE,2
5,@kimbjoernbecker @FAZ_Politik @hausaerzteverb ...,@kimbjoernbecker @FAZ_Politik @hausaerzteverb ...,de,DE,2
6,#Masernimpfung: Wissenschaftsbasierte &amp; qu...,#Measles Vaccination: Science-Based &amp; Qual...,de,DE,2
7,"Apotheker sollen gegen #Grippe impfen, meint d...","Pharmacists should vaccinate against #flu, say...",de,DE,2
8,#indenMedien Für die Sendung #ZDFZoom zum Them...,#indenMedien @ZDF spoke to psychologist @Corne...,de,DE,2
9,Bislang dürfen nur Ärzte ihre Patienten impfen...,"So far, only doctors are allowed to vaccinate ...",de,DE,2


## All communities
Repeat the same procedure for all communities, all periods, all countries.

In [None]:
#create a dictionary with the periods as keys. Each value is a list of dataframes
#(one per country and community); the lists are concatenated into dataframes later.
sample_tweets = {period: [] for period in periods}
for period in periods:
    for country in countries:
        #file containing community assignment for users in the right period/country
        com_files = sorted(glob("/".join([folder_com, period, country + "*RT*com*"])))
        if not com_files:
            #nothing can be sampled without the community file: report it and move on
            print(country, "no community file in", period)
            continue
        file = com_files[0]
        coms = pd.read_csv(file)
        #get the language to know which tweets files to open: it is the second
        #"_"-separated field of the file name (e.g. DE_de_period1_RT_communities.csv.gz).
        #Only the file name is split, so underscores in the folder path cannot shift it.
        lang = file.split("/")[-1].split("_")[1]
        print(country, lang)

        #create the dataframe extracting some tweets from some random files
        sampled = []
        for month in periods[period]:
            month_files = glob("/".join([folder_data, lang, month + "*"]))
            #up to 40 distinct files per month: random.sample never reads a file twice
            #and min(...) keeps it valid when fewer than 40 files (or none) exist
            for f in random.sample(month_files, k = min(40, len(month_files))):
                #NOTE(review): quoting = False is the integer 0, i.e. csv.QUOTE_MINIMAL (the default);
                #confirm that csv.QUOTE_NONE was not intended
                file_tweets = pd.read_csv(f, lineterminator = "\n", sep = "\t", low_memory = False,
                                          quoting = False, escapechar = None)
                file_tweets = file_tweets.loc[file_tweets["user_country_code"] == country,
                                              ["user_screen_name", "id_str", "RT_id_str", "text"]]
                #up to 200 tweets per file; a file with no tweets from the country
                #contributes an empty frame instead of raising
                sampled.append(file_tweets.sample(n = min(200, len(file_tweets)), replace = False))
        if not sampled:
            print(country, "no tweet files for language", lang, "in", period)
            continue
        #i want only tweets with different texts
        df = pd.concat(sampled).groupby("text").head(1)

        #the translator depends only on the language: build it once per country.
        #English tweets are not translated.
        translator = GoogleTranslator(source = lang, target = "en") if lang != "en" else None

        #extract some tweets in each community of the network
        for community in coms.community.unique():
            users = coms.loc[coms["community"] == community, "user"].drop_duplicates().tolist()
            #exclude too small communities (at most 20 users, or at most 1% of the rows of coms)
            if len(users) <= 20 or len(users) / len(coms) <= 0.01:
                continue
            candidates = df[df["user_screen_name"].isin(users)]
            if candidates.empty:
                #no sampled tweet was written by this community
                continue
            #up to 20 tweets: a community with fewer sampled tweets contributes all of them
            tweets = candidates.sample(n = min(20, len(candidates)), replace = False)
            texts = tweets["text"].tolist()
            #translate all non-English tweets
            if translator is None:
                trans_texts = texts
            else:
                trans_texts = [translator.translate(u) for u in texts]
            ids = tweets["id_str"].tolist()
            #append the new sampled tweets to the dictionary
            sample_tweets[period].append(
                pd.DataFrame({"id": ids, "text": texts, "en_text": trans_texts})
                .assign(lang = lang, country = country, community = community))

In [None]:
#create a unique dataframe per period concatenating all the dataframes of that period.
#Periods for which nothing was sampled are left out, because pd.concat raises
#a ValueError when it is given an empty list.
sample_tweets_df = {period: pd.concat(sample_tweets[period])
                    for period in periods if sample_tweets[period]}

In [None]:
#write one csv per period in /home/jlenti/Files/sample_tweets_2112, without the row index
for per, per_df in sample_tweets_df.items():
    out_path = "/home/jlenti/Files/sample_tweets_2112/sample_tweets_" + per + ".csv"
    per_df.to_csv(out_path, index = False)