In [None]:
from datetime import datetime, date, timedelta
import pandas as pd
import numpy as np
import random
import string
import json
import re
from collections.abc import Iterable
from sklearn.metrics import mean_squared_error

from matplotlib import pyplot as plt
import plotly.express as px

In [None]:
import TweetsUtils
import importlib
importlib.reload(TweetsUtils)
from TweetsUtils import *

### Constants

In [None]:
# Italian flu-symptom terms; the keyword filter keeps a tweet only if it contains one of them.
# A wider list (brividi, dolori alle ossa, dolori muscolari, malessere generale,
# congiuntivite) was tried and is currently disabled.
KEYWORDS = [
    'febbre',
    'mal di testa',
    'raffreddore',
    'mal di gola',
    'tosse',
]
# Extra terms used for the COVID-related columns of the exported time series.
COVID_KEYWORDS = [
    'gusto',
    'olfatto',
]

In [None]:
# Input files of the flu-tweet dump, all located under `base_path`.
base_path = 'tweets/flu_tweets/'
tweets_filename, users_filename, places_filename = [
    base_path + name
    for name in ('flu_tweets_contents.json', 'flu_tweets_users.json', 'flu_tweets_places.json')
]

### Read files

In [None]:
# Load the tweets and turn their 'datetime' strings into datetime objects in place.
tweets = read_file(tweets_filename)
for tw in tweets:
    tw.update(datetime=datetime.strptime(tw['datetime'], '%Y-%m-%d %H:%M:%S'))
users = read_file(users_filename)
places = read_file(places_filename)

# one summary line per collection
for _items, _label in ((tweets, 'tweets'), (users, 'users'), (places, 'places')):
    print(len(_items), _label)

### Filtering

In [None]:
# Keep only the tweets whose lower-cased text contains at least one flu keyword.
# To accept the COVID terms too, match against KEYWORDS+COVID_KEYWORDS instead. # <-------- CHANGE IF NEEDED
tweets = list(filter(lambda tw: count_keywords(tw['text'].lower(), KEYWORDS) > 0, tweets))
# the number printed is how many tweets are left after the filter
print('removed tweets without keywords:', len(tweets))

In [None]:
# Filter out tweets of users that tweet too much: authors with 50 or more
# flu tweets in the current selection (the threshold is inclusive).
# pd.Series(...).value_counts() replaces the deprecated top-level pd.value_counts().
x = pd.Series(select_fields(tweets, ['author_id'], as_list=True)).value_counts()
x = x[x>=50]
ids = set(x.index)  # set: constant-time membership test in the filter below
tweets = [tw for tw in tweets if tw['author_id'] not in ids]
# the number printed is how many tweets are left after the filter
print('removed "super" tweeters:', len(tweets))

In [None]:
# Filter out tweets that mention very popular accounts (mentioned more than 50 times).
x = select_fields(tweets, ['text'], as_list=True)
x = [get_mentions(text) for text in x]
x = [item.lower() for sublist in x for item in sublist] #flatten
# pd.Series(...).value_counts() replaces the deprecated top-level pd.value_counts()
x = pd.Series(x).value_counts()
x = x[x>50]
x = set(x.index)
# NOTE(review): the mentions were lower-cased above but the raw-case text is passed
# here; confirm that has_mentions compares case-insensitively.
tweets = [tw for tw in tweets if not has_mentions(tw['text'], x)]
# the number printed is how many tweets are left after the filter
print('removed popular mentions:', len(tweets))

In [None]:
# # filter out ambiguous tweets (brividi in combination with at least another keyword)
# tweets_brividi = keyword_in_combination(tweets, 'brividi', KEYWORDS, 2)
# tweets = [tw for tw in tweets if 'brividi' not in tw['text'].lower()]
# tweets = tweets + tweets_brividi
# print('removed ambiguous keywords:', len(tweets))

In [None]:
# Filter out tweets with popular hashtags (seen more than 100 times), except the
# hashtags that are themselves about flu, health, vaccines or covid.
x = select_fields(tweets, ['hashtags'], as_list=True)
x = list(np.array(x, dtype='object').reshape(-1))
# NOTE(review): h[0][0] reads a single hashtag per non-empty entry; the exact layout
# of the 'hashtags' field comes from select_fields and is not visible here.
x = [h[0][0].lower() for h in x if len(h) > 0]
# pd.Series(...).value_counts() replaces the deprecated top-level pd.value_counts()
x = pd.Series(x).value_counts()
x = list(x[x>100].index)

# hashtags that never cause a removal: exact matches ...
exempt_exact = set(KEYWORDS) | {'influenza', 'salute', 'lunedì', 'primavera', 'malditesta',
                                'emicrania', 'benemanonbenissimo'}
# ... and any hashtag containing one of these fragments
exempt_fragments = ('buon', 'febbre', 'vaccin', 'virus', 'covid', 'corona')
x = [h for h in x
     if h not in exempt_exact and not any(fragment in h for fragment in exempt_fragments)]

# NOTE(review): hashtags were lower-cased, the text passed to has_words is not; confirm
# that has_words compares case-insensitively.
tweets = [tw for tw in tweets if not has_words(tw['text'], x)]
# the number printed is how many tweets are left after the filter
print('removed popular hashtags:', len(tweets))

In [None]:
# filter out outliers
def outlier_in_text(text):
    """Tell whether `text` contains any of the hand-picked outlier terms.

    The check is a plain substring test, so a term also matches inside longer
    words (e.g. 'milan' matches 'milano'). All terms are lower case; the caller
    in this notebook passes lower-cased text.

    Returns True on the first match, False when no term occurs.
    """
    blacklist = (
        'alessiamorani', 'alessia morani', 'morani', 'temptationisland', 'benji_mascolo',
        'higuain', 'milan', 'arisa', 'sanremo', 'claudio', 'clario', 'gf16',
    )
    return any(term in text for term in blacklist)

# drop every tweet whose lower-cased text hits the outlier blacklist
tweets = list(filter(lambda tw: not outlier_in_text(tw['text'].lower()), tweets))
# the number printed is how many tweets are left after the filter
print('removed outliers:', len(tweets))

In [None]:
# Keep the whole filtered collection (shallow copy of the list), then narrow
# `tweets` down to the 2017-01-01 .. 2020-01-20 window.
full_tweets = list(tweets)
tweets = filter_list(tweets, 'datetime', datetime(2017,1,1), datetime(2020,1,20))

**statistiche**

In [None]:
# number of tweets (whole collection) containing each keyword, most frequent first
tmp = [[k, sum(k in tw['text'].lower() for tw in full_tweets)] for k in KEYWORDS]
pd.DataFrame(tmp, columns=['keyword', 'count']).sort_values(by='count', ascending=False)

In [None]:
plt.rcParams["figure.figsize"] = (8,5)

# tweets per (year, month) over the whole collection
x = pd.DataFrame(select_fields(full_tweets, ['datetime', 'text']))
x = x.assign(year=x['datetime'].dt.year, month=x['datetime'].dt.month).drop(columns='datetime')
x = x.groupby(['year', 'month']).count().reset_index()

# one line per year; the red channel grows and the blue channel shrinks by
# 50/255 per year, so the colour shifts step by step from blue towards red
for step, year in enumerate([2017,2018,2019,2020,2021,2022]):
    tmp = x[x['year']==year].copy()
    col = (50*step/255, 0.25, (255 - 50*step)/255)
    plt.plot(list(tmp['month']), list(tmp['text']), label=year, color=col)

plt.legend(loc='upper center')
plt.show();

In [None]:
# size of the working `tweets` list (the date-restricted selection when cells are run in order)
len(tweets)

In [None]:
# Show the text of up to 10 randomly picked tweets. The sample size is capped at
# the number of tweets available because random.sample raises ValueError when
# asked for more items than the list holds.
# NOTE(review): no random seed is set, so the sample differs on every run.
select_fields(random.sample(tweets, min(10, len(tweets))), ['text'])

### Tweets quantity

In [None]:
def linechart(s, rolling_mean=None, normalize=False):
    """Draw `s` with plt.plot, optionally smoothed and rescaled.

    s: pandas Series or DataFrame to draw.
    rolling_mean: window size of a centered rolling mean applied first;
        a falsy value (None, 0) skips the smoothing.
    normalize: when true, min-max scale the (possibly smoothed) data to [0, 1].
    """
    series = s
    if rolling_mean:
        series = series.rolling(window=rolling_mean, center=True).mean()
    if normalize:
        lowest, highest = series.min(), series.max()
        series = (series - lowest) / (highest - lowest)
    plt.plot(series)


def get_tweets_volume(tweets, resampling='1d'):
    """Count the tweets falling into each resampled time bin.

    tweets: iterable of dicts carrying a 'datetime' entry.
    resampling: pandas frequency string giving the bin width.

    Returns a DataFrame indexed by the resampled bins with one column,
    'count'; bins without tweets get 0.
    """
    stamps = pd.to_datetime([tw['datetime'] for tw in tweets])
    # one row per tweet; the value is irrelevant because only rows are counted
    volume = pd.DataFrame({'count': 1}, index=stamps)
    return volume.resample(resampling).count()

In [None]:
plt.rcParams["figure.figsize"] = (9,5)

# Daily volume of the whole collection, then the 2020-01-20 .. 2022-10-20 part
# drawn again on top as a second line.
s = get_tweets_volume(full_tweets, resampling='1d')
plt.plot(s)

covid_from, covid_to = datetime(2020,1,20), datetime(2022,10,20)
tmp = filter_list(full_tweets, 'datetime', covid_from, covid_to)
s = get_tweets_volume(tmp, resampling='1d')
plt.plot(s)

# fixed viewport for the exported figure
plt.xlim(datetime(2017,1,1), datetime(2022,7,1))
plt.ylim(50,840)

plt.tight_layout()
plt.savefig('full_timeseries.svg')
plt.show();

**salvataggio serie storica**

In [None]:
# # filter out tweets without keywords
# tmp = [tw for tw in full_tweets if count_keywords(tw['text'].lower(), ['perdita', 'perso']) > 0]
# tmp = [tw for tw in tmp if count_keywords(tw['text'].lower(), ['gusto', 'olfatto']) > 0]
# print('len:', len(tmp))

In [None]:
# # GIORNALIERA ITALIA
# df = pd.DataFrame()
# for k in KEYWORDS:
#     tmp = [tw for tw in full_tweets if k in tw['text'].lower()]
#     df[k] = get_tweets_volume(tmp, resampling='1d')
# for k in COVID_KEYWORDS:
#     tmp = [tw for tw in full_tweets if count_keywords(tw['text'], ['perdita', 'perso']) > 0]
#     tmp = [tw for tw in tmp if count_keywords(tw['text'], [k]) > 0]
#     df[k] = get_tweets_volume(tmp, resampling='1d')
# df['total'] = df.apply(lambda x: np.sum(x), axis=1)
# df['date'] = df.index
# df = df.fillna(0.0)
# df = df[['date']+KEYWORDS+COVID_KEYWORDS+['total']]
# df.to_csv('./files/twitter_daily_italy.csv', sep=';', index=False)

# # SETTIMANALE ITALIA
# df = pd.DataFrame()
# for k in KEYWORDS:
#     tmp = [tw for tw in full_tweets if k in tw['text'].lower()]
#     df[k] = get_tweets_volume(tmp, resampling='1w')
# for k in COVID_KEYWORDS:
#     tmp = [tw for tw in full_tweets if count_keywords(tw['text'], ['perdita', 'perso']) > 0]
#     tmp = [tw for tw in tmp if count_keywords(tw['text'], [k]) > 0]
#     df[k] = get_tweets_volume(tmp, resampling='1w')
# df['total'] = df.apply(lambda x: np.sum(x), axis=1)
# df['date'] = df.index #[day-timedelta(days=6) for day in list(tmp.index)]
# df = df.fillna(0.0)
# df = df[['date']+KEYWORDS+COVID_KEYWORDS+['total']]
# df.to_csv('./files/twitter_weekly_italy.csv', sep=';', index=False)

In [None]:
regione = 'sicilia'

# tweets geolocated in the selected region, as a list of dicts
df = get_tweets_with_location(full_tweets, users, places)
df = df[df['region']==regione]
tmp_tweets = df.to_dict('records')


def _keyword_volume_table(region_tweets, resampling):
    """Build the per-keyword tweet-volume table for one resampling frequency.

    region_tweets: list of tweet dicts with 'text' and 'datetime'.
    resampling: pandas frequency string handed to get_tweets_volume.

    Returns a DataFrame with the columns 'date', one per entry of KEYWORDS and
    COVID_KEYWORDS, and 'total' (row sum of all keyword columns).

    A COVID keyword only counts when the tweet also contains 'perdita' or
    'perso'. The text is lower-cased before matching, as in every other keyword
    test of this notebook. The columns are combined with pd.concat so that the
    table covers the union of all keyword date ranges; assigning the columns
    one by one would silently drop dates outside the first keyword's range.
    """
    columns = {}
    for k in KEYWORDS:
        matching = [tw for tw in region_tweets if k in tw['text'].lower()]
        columns[k] = get_tweets_volume(matching, resampling=resampling)['count']
    for k in COVID_KEYWORDS:
        matching = [tw for tw in region_tweets
                    if count_keywords(tw['text'].lower(), ['perdita', 'perso']) > 0
                    and count_keywords(tw['text'].lower(), [k]) > 0]
        columns[k] = get_tweets_volume(matching, resampling=resampling)['count']

    table = pd.concat(columns, axis=1).sort_index()
    table = table.fillna(0.0)
    table['total'] = table.sum(axis=1)
    table['date'] = table.index
    return table[['date']+KEYWORDS+COVID_KEYWORDS+['total']]


# daily and weekly series for the region, one CSV each
for _prefix, _freq in (('./files/twitter_daily_', '1d'), ('./files/twitter_weekly_', '1w')):
    df = _keyword_volume_table(tmp_tweets, _freq)
    df.to_csv(_prefix+regione+'.csv', sep=';', index=False)

**per analizzare quali tweet portano a picchi improvvisi**

In [None]:
# Daily volume of the whole collection (via the shared get_tweets_volume helper
# instead of a second inline copy of it), ranked to show the 20 busiest days.
s = get_tweets_volume(full_tweets, resampling='1d')
s = s.sort_values(by='count', ascending=False).head(20)
s

In [None]:
# Most frequent words among the tweets of the i-th busiest day
# (`s` holds the busiest days, sorted by volume, from the previous cell).
i = 0
d1 = s.index[i]
d2 = d1 + timedelta(days=1)
x = filter_list(full_tweets, 'datetime', d1, d2)
x = select_fields(x, ['text'], as_list=True)
x = ' '.join(x)
# pd.Series(...).value_counts() replaces the deprecated top-level pd.value_counts()
pd.Series(x.split()).value_counts().head(60)

### Geolocalization

In [None]:
# Location-annotated tweets from the TweetsUtils helper. Later cells read the
# 'type', 'region', 'lat', 'lon', 'name', 'text' and 'datetime' columns of this frame.
df = get_tweets_with_location(full_tweets, users, places)
df

In [None]:
# share of all tweets located through each method
for _label, _kind in (('Geolocalization:', 'geolocalization'), ('User location:', 'user_location')):
    l = int((df['type'] == _kind).sum())
    print(_label, l, '(' + str(np.round(100*l/len(full_tweets), 2)) + '%)')
    if _kind == 'geolocalization':
        print()

In [None]:
plt.rcParams["figure.figsize"] = (7,5)

# tweets per region (Series.value_counts replaces the deprecated pd.value_counts)
x1 = df['region'].value_counts().to_frame('tweets_count')

# population per region; the file is closed deterministically by the with-block
with open('files/italy_regions_population.json', 'r', encoding='utf-8') as fh:
    x2 = json.load(fh)
x2 = pd.DataFrame({'population': list(x2.values())}, index=list(x2.keys()))
# presumably scaled down so both bars are readable on one axis — TODO confirm the factor
x2['population'] /= 500

x = x1.join(x2)

# DataFrame.plot returns a matplotlib Axes, hence the name
ax = x.plot(kind='bar', color=['blue', 'orange'])
plt.tight_layout()
plt.savefig('regions.svg')

In [None]:
# split text for visualization: wrap at 50 characters and use <br> as line break
df['formatted_text'] = df['text'].str.wrap(50).str.replace('\n', '<br>', regex=False)

# re-group date
group_by = 'month'
code = {'week': 'W', 'month': 'M', 'year': 'Y'}
# 'datetime' is overwritten with strings below. Keep the original timestamps in
# a separate column so the cell can be re-run (also with another `group_by`)
# instead of failing on `.dt` the second time.
if 'datetime_raw' not in df.columns:
    df['datetime_raw'] = df['datetime']
# start of the week/month/year each tweet falls in, as a string
df['datetime'] = df['datetime_raw'].dt.to_period(code[group_by]).dt.start_time.astype(str)

# sort by date
df = df.sort_values(by='datetime')

In [None]:
# Animated density map of the located tweets: one animation frame per distinct
# value of the (string) 'datetime' column; hover shows the wrapped text and 'name'.
fig = px.density_mapbox(df, lat=df['lat'], 
                            lon=df['lon'], 
                            radius=10,
                            hover_data={'formatted_text': True, 'name': True, 'lat': False, 'lon': False, 'datetime': False},
                            animation_frame='datetime', 
                            width=600, height=600
                       )
# initial map style, zoom and centre
fig.update_layout(mapbox_style="carto-positron", mapbox_zoom=4.5, mapbox_center={"lat": 42, "lon": 12.5})
# no outer margins
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})

# animation timing of the first button of the first menu (presumably the play
# button, durations presumably in milliseconds — TODO confirm), colour scale and paddings
fig.layout.updatemenus[0].buttons[0].args[1]["frame"]["duration"] = 600
fig.layout.updatemenus[0].buttons[0].args[1]["transition"]["duration"] = 600
fig.layout.coloraxis.showscale = True   
fig.layout.sliders[0].pad.t = 10
fig.layout.updatemenus[0].pad.t= 10

fig.show()

In [None]:
# tweets per (period, region); the unstack/stack round trip fills absent combinations with 0
x = df[['datetime', 'region', 'text']].groupby(['datetime', 'region']).count()
x = x.unstack(fill_value=0).stack().reset_index()
x.columns = [group_by, 'region', 'count']

# tweets per inhabitant; a region missing from the population file raises KeyError
pop = read_file('files/italy_regions_population.json')
x['normalized_count'] = [n / pop[region] for n, region in zip(x['count'], x['region'])]

In [None]:
# NOTE(review): this import belongs with the other imports in the first cell
import seaborn as sns
# histogram (50 bins) of the per-period, per-region normalized tweet counts
sns.displot(x['normalized_count'], bins=50)

In [None]:
### https://github.com/deldersveld/topojson
### then converted from topojson to geojson
italy_regions_geo = read_file('files/italy_regions_borders.geojson')

# Animated choropleth of the population-normalized tweet count per region,
# one frame per value of the `group_by` column
fig = px.choropleth(data_frame=x, 
                    geojson=italy_regions_geo, 
                    locations='region', # name of dataframe column
                    featureidkey='properties.NAME_1',  # path to field in GeoJSON feature object with which to match the values passed in to locations
                    color='normalized_count',
                    color_continuous_scale="ylorbr",
                    animation_frame=group_by,
                    scope="europe",
                    # fixed colour range (instead of the data maximum) shared by all frames
                    range_color=(0,0.00006),#max(x['normalized_count'])),
                    width=1440, height=900
                   )
# hide the base map and zoom to the regions that have data
fig.update_geos(showcountries=False, showcoastlines=False, showland=True, fitbounds="locations")
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0}, dragmode=False)
fig.show()

### Confronti

**twitter**

In [None]:
# tweet volume in 7-day bins over the whole collection
t = get_tweets_volume(full_tweets, resampling='7d')
t = t['count']
# z-score using the population standard deviation (ddof=0, what np.std computes);
# mean and std are evaluated once for the series, not once per element
t = (t - t.mean()) / t.std(ddof=0) #standardize
t

**google**

In [None]:
# google: weekly series read from CSV, indexed by its 'date' column
g = pd.read_csv('./files/google_weekly_italy.csv', sep=';')
g.index = pd.to_datetime(g['date'])
del g['date']
# g = g['2017-01-01':'2022-05-15']
g = g['average']
# z-score using the population standard deviation (ddof=0, what np.std computes);
# mean and std are evaluated once for the series, not once per element
g = (g - g.mean()) / g.std(ddof=0) #standardize
g

**influweb** (https://influenzanet.info/#page/home)

In [None]:
off = pd.read_csv('./files/IT_incidence.csv')
# Keep only the 'ili.ecdc' syndrome rows. Equality is required here: a `>=`
# comparison on strings is lexicographic and would also keep every syndrome
# that sorts after 'ili.ecdc', duplicating weeks in the merge done later.
# .copy() so that the column assignments below act on an independent frame.
off = off[off['syndrome']=='ili.ecdc'].copy()
# 'yw' is split by position: first 4 characters = year, last 2 = week
off['year'] = off['yw'].astype(str).str[:4].astype(int)
off['week'] = off['yw'].astype(str).str[-2:].astype(int)
off = off[off['year']>=2017]
off = off[['year', 'week', 'incidence', 'lower', 'upper', 'count', 'part']].reset_index(drop=True)
off

In [None]:
import itertools
# Full (year, week) grid, so that weeks absent from the official data show up with 0.
# NOTE(review): the grid stops at week 52, so data reported for an ISO week 53
# is dropped by the left merge — confirm this is acceptable.
all_years = list(range(2017,2022+1))
all_weeks = list(range(1,52+1))
combined = [all_years, all_weeks]
df1 = pd.DataFrame(columns = ['year', 'week'], data=list(itertools.product(*combined)))
off = df1.merge(off, how='left', on=['year', 'week']).fillna(0)
# NOTE(review): rows are matched to the google series by position only (first
# len(g) weeks of the grid); this assumes g starts at week 1 of 2017, has no
# gaps and is not longer than the grid.
# NOTE(review): this cell turns `off` from a DataFrame into a Series, so it
# cannot be re-run without re-running the previous cell first.
off = off.iloc[:len(g)]['count']
off.index = g.index
# z-score using the population standard deviation (ddof=0, what np.std computes);
# mean and std are evaluated once for the series, not once per element
off = (off - off.mean()) / off.std(ddof=0) #standardize
off

**flunet**

In [None]:
# # https://www.who.int/tools/flunet
# filename = './files/FluNetInteractiveReport.csv'

# official = pd.read_csv(filename)
# for field in ['Country', 'WHOREGION', 'FLUREGION']:
#     del official[field]

# c = official.columns
# official = official[list(c[:5]) + list(c[-3:])]
# official.columns = ['year', 'week', 'start_date', 'end_date', 'number_specimen', 
#                     'all_positive_viruses', 'all_negative_viruses', 'activity']

# official = official.fillna(0)

# tmp = official[['start_date', 'number_specimen']].copy()
# tmp = tmp.set_index('start_date')
# tmp.index = pd.to_datetime(tmp.index)
# tmp.index = [t+timedelta(-1) for t in tmp.index]

# tmp = tmp['2017-01-01':'2022-05-15']
 
# tmp = (tmp - tmp.min()) / (tmp.max() - tmp.min()) #normalize

# tmp

**plottini**

In [None]:
plt.rcParams["figure.figsize"] = (10,5)

# all three standardized series side by side, then each pair on its own
# (`keys` supplies the column names)
tmp = pd.concat([t,g,off], axis=1, keys=['twitter', 'google', 'official'])
tmp1 = pd.concat([t,off], axis=1, keys=['twitter', 'official'])
tmp2 = pd.concat([g,off], axis=1, keys=['google', 'official'])
tmp3 = pd.concat([t,g], axis=1, keys=['twitter', 'google'])

# disabled alternative for tmp3: the mean of twitter and google as 'avg' next to 'official'

In [None]:
plt.rcParams["figure.figsize"] = (9,5)

# line colour per source; any other column (i.e. 'official') is drawn in green
_line_colors = {'twitter': 'C0', 'google': 'C1', 'avg': 'C3'}

for df in [tmp1, tmp2, tmp3]:
    names = list(df.columns)

    # error of the first column against the official series, when it is present
    if 'official' in names:
        mse = mean_squared_error(df['official'], df.iloc[:,0])
        print(names, '-->', np.round(mse, 3))

    for col in names:
        plt.plot(df[col], color=_line_colors.get(col, 'green'), label=col)

    plt.legend(loc='upper left')
    plt.tight_layout()
    plt.savefig('confronto_'+('_'.join(names))+'.svg')
    plt.show();

In [None]:
def _clip_negative(series, start='2017-07-01', end='2019-06-01'):
    """Return a copy of series[start:end] in which negative values are set to 0."""
    window = series[start:end].copy()
    window[window < 0] = 0
    return window


# the two seasons 2017/18 and 2018/19, keeping only the above-average part of each z-score
t2 = _clip_negative(t)
g2 = _clip_negative(g)
off2 = _clip_negative(off)

for _series in (t2, g2, off2):
    plt.plot(_series)
plt.show();

In [None]:
def _first_positive(series, start, end):
    """Return the first index label in series[start:end] whose value is > 0.

    Returns pd.NaT when the window has no positive value, so the differences
    printed below become NaT instead of the lookup raising IndexError.
    """
    window = series[start:end]
    positive = window[window > 0]
    return positive.index[0] if len(positive) else pd.NaT


# For each season: how much earlier than the official series the twitter and
# google series first turn positive (official date minus source date).
print('     twitter     |      google')
for _start, _end in [('2017-07-01', '2018-06-01'), ('2018-07-01', '2019-06-01')]:
    t_twitter = _first_positive(t2, _start, _end)
    t_google = _first_positive(g2, _start, _end)
    t_official = _first_positive(off2, _start, _end)
    print(t_official - t_twitter, '|', t_official - t_google)