In [1]:
import numpy as np
import pandas as pd
import os, sys

In [2]:
input_path = '../data/twitter_data'
output_path = '../data/twitter_filtered'

In [3]:
def score(row):
    rts, reps, likes = map(float, [row['retweets_count'], row['replies_count'], row['likes_count']])
    rts = rts if rts != np.nan else 0
    reps = reps if reps != np.nan else 0
    likes = likes if likes != np.nan else 0
    score = 5 * rts + 4 * reps * 3 * likes
    row['score'] = score
    return row

In [4]:
def covid_related(row):
    related = False
    keywords = ['covid', 'covid19', 'covid-19', 'virus',
                'quarentena', 'quarantine', 'quarantena',
                'corona', 'corona19', 'corona virus',
                'coronavirus', 'coronavírus', 'coronavirus19',
                'coronavírus19', 'positive', 'positivi',
                'positivo', 'positives', 'positivos',
                'cases', 'casos', 'caso', 'coronabond']
    for keyword in keywords:
        c = keyword in row['tweet'].lower()
        related = related or c
    row['related'] = related
    return row

In [5]:
total_related = 0
total_unrelated = 0

In [7]:
maindf = pd.DataFrame()

for filename in os.listdir(input_path):
    if (filename == 'China_chinaorgcn.tsv'):
        df = pd.read_json(f'{input_path}/{filename}')
    else:
        df = pd.read_csv(f'{input_path}/{filename}', sep='\t', engine='python')
    df = df.apply(score, axis=1)
    df = df.apply(covid_related, axis=1)
    
    maindf = pd.concat([maindf, df], ignore_index=True).reset_index(drop=True)
    
    related = len(df[df['related']].reset_index(drop=True))
    total_related += related / len(df)
    
    unrelated = len(df[~df['related']].reset_index(drop=True))
    total_unrelated = unrelated / len(df)

    print(filename, related / len(df), unrelated / len(df))

Italy_virgilio_it.tsv 0.5625 0.4375
UK_BBCNews.tsv 0.2969529085872576 0.7030470914127424
UK_Telegraph.tsv 0.4001850138760407 0.5998149861239593
UK_guardian.tsv 0.2611497248769186 0.7388502751230813
China_ChinaDaily.tsv 0.6172640819312363 0.3827359180687637
Italy_repubblica.tsv 0.12805320435308343 0.8719467956469166
Brazil_Estadao.tsv 0.24898045779213127 0.7510195422078687
US_nytimes.tsv 0.30868993545302414 0.6913100645469759
Brazil_JornalOGlobo.tsv 0.20886958810369785 0.7911304118963022
Brazil_JornaldoBrasil.tsv 0.2087912087912088 0.7912087912087912
Italy_Corriere.tsv 0.17814551169455486 0.8218544883054452
China_PDChina.tsv 0.6241379310344828 0.3758620689655172
Italy_Libero_official.tsv 0.6443747823058168 0.3556252176941832
US_huffpost.tsv 0.27516907285721137 0.7248309271427886
US_cnn.tsv 0.33731060606060603 0.662689393939394
China_chinaorgcn.tsv 0.601145038167939 0.3988549618320611
US_FoxNews.tsv 0.27673406950619056 0.7232659304938095
China_shanghaidaily.tsv 0.8579418344519015 0.14205

In [8]:
nfiles = len(os.listdir(input_path))
print('Avg Related', total_related / nfiles, '; Avg Unrelated', total_unrelated / nfiles)

Avg Related 0.3797434920510331 ; Avg Unrelated 0.03492589088615579


In [9]:
tweets_related = maindf[maindf['related']]
tweets_related.drop('related', axis=1, inplace=True)

Unnamed: 0,id,datestamp,timestamp,username,tweet,replies_count,retweets_count,likes_count,url,country,score,related
0,1243199622933348353,2020-03-26,12:34:22,virgilio_it,📰🎙 #Notizie #Curiosità #SaluteeBenessere 🛀💉\nI...,0,0,0,https://twitter.com/virgilio_it/status/1243199...,Italy,0.0,True
2,1242882313115127808,2020-03-25,15:33:29,virgilio_it,📰🎙 #Notizie #Curiosità #SaluteeBenessere 🛀💉\nI...,0,1,0,https://twitter.com/virgilio_it/status/1242882...,Italy,5.0,True
3,1242843309099364352,2020-03-25,12:58:30,virgilio_it,📰🎙 #Notizie #Curiosità #SaluteeBenessere 🛀💉\n#...,0,0,0,https://twitter.com/virgilio_it/status/1242843...,Italy,0.0,True
5,1242434278476087298,2020-03-24,09:53:09,virgilio_it,📰🎙 #Notizie #Curiosità #SaluteeBenessere 🛀💉\n#...,1,0,0,https://twitter.com/virgilio_it/status/1242434...,Italy,0.0,True
6,1242431161323982851,2020-03-24,09:40:46,virgilio_it,📰🎙 #Notizie #Curiosità #InItalia 🇮🇹 🌍\nUna dis...,0,0,0,https://twitter.com/virgilio_it/status/1242431...,Italy,0.0,True


In [10]:
countries = ['Brazil', 'US', 'UK', 'China', 'Italy']
for country in countries:
    country_df = tweets_related[tweets_related['country'] == country].reset_index(drop=True)
    country_df.to_json(f'{output_path}/{country}.json')