In [None]:
from datetime import datetime, date, timedelta
import pandas as pd
import numpy as np
import random
import json
import re
import os
from collections.abc import Iterable
import requests

from matplotlib import pyplot as plt
import plotly.express as px
import seaborn as sns
import matplotlib.ticker as ticker

In [None]:
import importlib

import TweetsUtils
importlib.reload(TweetsUtils)
from TweetsUtils import *

### Read files

In [None]:
# Language of the tweet dump to load. read_tweets_files maps 'en' to the
# '_en' file suffix and every other value to no suffix.
lang = 'en'

# Editions (years) covered by the analysis.
years = [2018,2019,2021,2022]

# First and last instant of each edition's event week, stored as
# '%Y-%m-%d %H:%M:%S' strings and parsed where they are used.
dates = {2018: ('2018-04-17 00:00:00', '2018-04-22 23:59:59'),
         2019: ('2019-04-08 00:00:00', '2019-04-14 23:59:59'), 
         2021: ('2021-09-04 00:00:00', '2021-09-10 23:59:59'), 
         2022: ('2022-06-06 00:00:00', '2022-06-12 23:59:59')}

In [None]:
def read_tweets(years, lang):
    """Load several editions and concatenate their tweets, users and places.

    Calls read_tweets_files once per entry of ``years`` (in order) and
    returns three lists: all tweets, all users, all places.
    """
    all_tweets = []
    all_users = []
    all_places = []
    for edition in years:
        edition_tweets, edition_users, edition_places = read_tweets_files(edition, lang)
        all_tweets.extend(edition_tweets)
        all_users.extend(edition_users)
        all_places.extend(edition_places)
    return all_tweets, all_users, all_places



def read_tweets_files(year, lang):
    """Read the contents/users/places JSON files of one edition.

    File names are built from the last two digits of ``year`` and an '_en'
    suffix when ``lang == 'en'`` (no suffix otherwise). Each tweet's
    'datetime' string is parsed in place into a ``datetime`` and the tweets
    are then restricted with center_week. Returns (tweets, users, places).
    """
    suffix = '_en' if lang == 'en' else ''
    stem = 'mdw' + str(year)[-2:] + suffix
    folder = 'tweets/' + stem + '_tweets/'

    # same read order as the file kinds listed here
    year_tweets, year_users, year_places = (
        read_file(folder + stem + '_tweets_' + kind + '.json')
        for kind in ('contents', 'users', 'places')
    )

    for tweet in year_tweets:
        tweet['datetime'] = datetime.strptime(tweet['datetime'], '%Y-%m-%d %H:%M:%S')

    return center_week(year_tweets, year), year_users, year_places



def center_week(tmp_tweets, year, delta=60):
    """Keep the tweets posted around one edition's event week.

    Parameters
    ----------
    tmp_tweets : list of tweets whose 'datetime' field is already a datetime.
    year : key into the notebook-level ``dates`` dict.
    delta : number of days kept before the week's first instant and after
        its last instant (default 60, the value previously hard-coded).

    Returns whatever filter_list returns for the 'datetime' field between
    the two bounds.
    """
    fmt = '%Y-%m-%d %H:%M:%S'
    margin = timedelta(days=delta)
    start_date = datetime.strptime(dates[year][0], fmt) - margin
    end_date = datetime.strptime(dates[year][1], fmt) + margin
    return filter_list(tmp_tweets, 'datetime', start_date, end_date)

In [None]:
# Load every edition listed in `years` for the configured language and
# report how many records of each kind were read.
tweets, users, places = read_tweets(years, lang)

print('TOT:', len(tweets), 'tweets')
print('TOT:', len(users), 'users')
print('TOT:', len(places), 'places')

In [None]:
# Replace `tweets` with the merged dump, parse the timestamps in place and
# keep, edition by edition, only the tweets that center_week retains.
tweets = read_file('tweets/total_tweets.json')
for record in tweets:
    record['datetime'] = datetime.strptime(record['datetime'], '%Y-%m-%d %H:%M:%S')
tweets = sum((center_week(tweets, edition) for edition in (2018, 2019, 2021, 2022)), [])
len(tweets)

In [None]:
# Ignore the tweets about the Turin furniture fair ("salone del mobile di
# Torino"): drop every tweet whose text mentions 'torino' but not 'milano'.
tweets = [tw for tw in tweets if not ('torino' in tw['text'].lower() and 'milano' not in tw['text'].lower())]
len(tweets)

In [None]:
# Subset of tweets for which is_original() is truthy.
# NOTE(review): is_original is not defined in this notebook; presumably it
# comes from the TweetsUtils star import — confirm its exact criterion.
original_tweets = [tw for tw in tweets if is_original(tw)]
len(original_tweets)

In [None]:
# Load the tweets annotated with entities, parse the timestamps in place and
# keep, edition by edition, only the tweets that center_week retains.
entities_tweets = read_file('tweets/total_tweets_entities.json')
for record in entities_tweets:
    record['datetime'] = datetime.strptime(record['datetime'], '%Y-%m-%d %H:%M:%S')
entities_tweets = sum((center_week(entities_tweets, edition) for edition in (2018, 2019, 2021, 2022)), [])
len(entities_tweets)

### Tweets quantity

In [None]:
# Yearly volumes for the Italian tweets ('it'), the English tweets ('en') and
# the hard-coded 'fiera' series, each printed with its year-over-year change.
def _yearly_change(values):
    """Return [0, % change vs. the previous value, ...], rounded to one decimal."""
    return [0] + [np.round(100*(values[i]-values[i-1])/values[i-1], 1) for i in range(1, len(values))]


counts = {}

# The loop variable is deliberately not called `lang`: reusing that name
# overwrites the notebook-wide language setting (it would end up as 'fiera'),
# which silently changes what the loading cell reads when it is re-run.
for source in ['it', 'en', 'fiera']:
    print(source)
    
    if source == 'fiera':
        # NOTE(review): presumably the fair's attendance per edition — confirm
        # where these four figures come from.
        tmp2 = [434000, 386000, 60000, 262000]
        tmp = _yearly_change(tmp2)

    else:
        tmp_tweets = read_tweets(years, source)[0]
#         tmp_tweets = [tw for tw in tmp_tweets if is_original(tw)] # only originals
        # drop the tweets that mention 'torino' without mentioning 'milano'
        tmp_tweets = [tw for tw in tmp_tweets if not ('torino' in tw['text'].lower() and 'milano' not in tw['text'].lower())]
        tmp = [[str(year), len(filter_year(tmp_tweets, year))] for year in years]
        tmp = pd.DataFrame(tmp, columns=['year', 'count'])
        tmp2 = list(tmp['count'])
        tmp['change'] = _yearly_change(tmp2)
    
    # one list of yearly totals per source, in the order of `years`
    counts[source] = tmp2
    
    print(tmp)
    print()

In [None]:
# Side-by-side bar charts of the yearly totals for 'it' (left, colour C0) and
# 'fiera' (right, colour C1), saved together as test.svg.
plt.rcParams["figure.figsize"] = (11,4)
sns.set_theme(style="whitegrid")

# long format: one row per (year, type) pair
df = pd.DataFrame(pd.DataFrame(counts, index=years).stack()).reset_index()
df.columns = ['year', 'type', 'count']

plt.subplots_adjust(wspace=0.4)


def _thousands(x, pos):
    # tick label in thousands, e.g. 434000 -> '434 k'
    return '{:,.0f}'.format(x/1000) + ' k'


for i, (_type, col) in enumerate(zip(['it', 'fiera'], ['C0', 'C1']), start=1):
    plt.subplot(1, 2, i)
    tmp = df[df['type']==_type].copy()
    ax = sns.barplot(data=tmp, x='year', y='count', color=col)
    ax.yaxis.set_major_formatter(ticker.FuncFormatter(_thousands))

plt.savefig('test.svg', bbox_inches='tight')
plt.show();
sns.reset_orig()

In [None]:
# One small green bar chart per series, each saved as test<type>.png.
plt.rcParams["figure.figsize"] = (4,3)
sns.set_theme(style="whitegrid")

# long format: one row per (year, type) pair
df = pd.DataFrame(pd.DataFrame(counts, index=years).stack()).reset_index()
df.columns = ['year', 'type', 'count']


def _thousands(x, pos):
    # tick label in thousands, e.g. 434000 -> '434 k'
    return '{:,.0f}'.format(x/1000) + ' k'


for _type in ['it', 'en', 'fiera']:
    tmp = df[df['type']==_type].copy()
    ax = sns.barplot(data=tmp, x='year', y='count', color='green')
    ax.yaxis.set_major_formatter(ticker.FuncFormatter(_thousands))
    plt.savefig('test'+_type+'.png', dpi=300, bbox_inches='tight')
    plt.show();

sns.reset_orig()

In [None]:
def linechart(s, rolling_mean=None, normalize=False):
    """Plot ``s`` with pyplot, optionally smoothed and min-max scaled.

    rolling_mean : window passed to ``s.rolling``; skipped when falsy.
    normalize : when truthy, rescale with (s - min) / (max - min) after the
        optional smoothing.
    """
    series = s.rolling(window=rolling_mean).mean() if rolling_mean else s

    if normalize:
        series = (series - series.min()) / (series.max() - series.min())

    plt.plot(series)
    
    
import matplotlib.dates as mdates
myFmt = mdates.DateFormatter('%b-%d')


# One figure per edition: number of tweets per hour, with the x axis limited
# to 65 days before/after the event week and a fixed 0-450 y axis.
for year in [2018,2019,2021,2022]:
    stamps = pd.to_datetime([tw['datetime'] for tw in filter_year(tweets, year)])
    s = pd.DataFrame({'count': 1}, index=stamps).resample('1h').count()

    fig, ax = plt.subplots(figsize=(12,5))
    ax.plot(s)

    delta = 65
    margin = timedelta(days=delta)
    start = datetime.strptime(dates[year][0], '%Y-%m-%d %H:%M:%S') - margin
    end = datetime.strptime(dates[year][1], '%Y-%m-%d %H:%M:%S') + margin
    ax.set_xlim(start, end)
    ax.set_ylim(0, 450)

    ax.xaxis.set_major_formatter(myFmt)
    plt.show();

In [None]:
# Overlay the tweet volume of the four editions on one axis. Each curve is the
# number of tweets per `freq` bin, plotted against the bin position rather
# than the date so that the editions can be compared.
h1 = 3      # days kept before the first day of each window
h2 = 3      # days kept after the last day of each window
freq='3h'   # resampling bin width

# The figure size has to be set before the first plotting call; setting it
# after the curves are drawn has no effect on the figure that is saved.
plt.rcParams["figure.figsize"] = (12,5)

# (first day, last day) of the window of each edition, before padding
event_windows = {
    '2018': (datetime(2018,4,17), datetime(2018,4,23)),
    '2019': (datetime(2019,4,8), datetime(2019,4,15)),
    '2021': (datetime(2021,9,4), datetime(2021,9,11)),
    '2022': (datetime(2022,6,6), datetime(2022,6,13)),
}

for label, (first_day, last_day) in event_windows.items():
    window_tweets = filter_list(tweets, 'datetime', first_day-timedelta(h1), last_day+timedelta(h2))
    s = pd.DataFrame(index=[tw['datetime'] for tw in window_tweets])
    s['count'] = [1]*len(s)
    s = s.resample(freq).count()
    s = list(s['count'])
    plt.plot(s, label=label)


# Dashed markers at the start of the event (after the h1 padding days) and
# seven days later. The bin count per day is derived from the whole frequency
# string, so multi-digit values such as '12h' are handled too.
# NOTE(review): the 2018 window is one day shorter than the others, and the
# bins start at the first populated bin, so the markers are approximate.
bins_per_day = pd.Timedelta(days=1) / pd.Timedelta(freq)
_start = bins_per_day * h1
_end = bins_per_day * (h1 + 7)
_max = 1100
plt.plot([_start,_start], [0,_max], '--', color='black')
plt.plot([_end,_end], [0,_max], '--', color='black')

plt.legend(loc='best')
plt.savefig('events_series.svg', bbox_inches='tight')
plt.show();

In [None]:
def top_retweets(tweets_list, topn):
    """Return the ``topn`` most referenced tweets among ``tweets_list``.

    Collects the 'id' of the tweets referenced by ``tweets_list``, counts
    them and returns ``(count, texts)``: the reference counts in descending
    order and the cleaned text of each referenced tweet.

    The text is looked up in the notebook-level ``tweets`` list, not in
    ``tweets_list``. An IndexError is raised when that lookup returns nothing.
    NOTE(review): presumably the global lookup is intentional, so that a
    referenced tweet outside the filtered subset can still be found — confirm.
    """
    refs = select_fields(tweets_list, ['referenced_tweets'], as_list=True)
    refs = [e[0] for e in refs if e != []]
    ids = [element['id'] for row in refs for element in row]

    # Series.value_counts replaces the deprecated top-level pd.value_counts
    top = pd.Series(ids).value_counts().head(topn)
    count = list(top)

    texts = []
    for _id in top.index:
        t = select_fields(filter_list(tweets, 'id', _id), ['text'], as_list=True)[0]
        # flatten line breaks into sentence separators and tidy doubled dots
        t = t.replace('\n', '. ').replace('..', '.').replace('. . ', '. ').replace(' . ', '. ')
        texts.append(t)
    return count, texts

In [None]:
# print the most retweeted tweets before and after the MDW
year = 2022

# the 60 days that precede the first instant of the event week
end_date = datetime.strptime(dates[year][0], '%Y-%m-%d %H:%M:%S')
start_date = end_date - timedelta(days=60)
tmp = filter_list(tweets, 'datetime', start_date, end_date)
c1, t1 = top_retweets(tmp, 3)

# the 61 days that follow the last instant of the event week
start_date = datetime.strptime(dates[year][1], '%Y-%m-%d %H:%M:%S')
end_date = start_date + timedelta(days=61)
tmp = filter_list(tweets, 'datetime', start_date, end_date)
c2, t2 = top_retweets(tmp, 3)


for n, t in zip(c1, t1):
    print(n, '|', t, '\n')
print('---\n')
for n, t in zip(c2, t2):
    print(n, '|', t, '\n')

In [None]:
# overall top retweets, one group of three per edition
for year in years:
    c1, t1 = top_retweets(filter_year(tweets, year), 3)

    for n, t in zip(c1, t1):
        print(n, '|', t, '\n')
    print('---\n')

### Users

In [None]:
# Table of the 25 tweet authors with the most followers, with their location,
# the mean valence of their tweets and the number of tweets they wrote.
ids = select_fields(tweets, ['author_id'], as_list=True, unique=True)
x = filter_list(users, 'id', ids, multiple=True)
x = select_fields(x, ['id', 'name', 'followers_count', 'location'])
x = (
    pd.DataFrame(x, columns=['id', 'name', 'followers_count', 'location'])
    .drop_duplicates('id')
    .sort_values(by='followers_count', ascending=False)
    .head(25)
)
# a location stored as a dict is reduced to its 'name'
x['location'] = x['location'].apply(lambda loc: loc['name'] if type(loc) == dict else loc)
x = x.fillna('')


def _mean_valence(author_id):
    # mean of the author's tweet valences, ignoring NaN
    return np.nanmean(select_fields(filter_list(tweets, 'author_id', author_id), ['valence'], as_list=True))


x['avg_valence'] = x['id'].apply(_mean_valence)
# authors without a usable valence get the mean of the other authors
x['avg_valence'] = x['avg_valence'].fillna(np.nanmean(x['avg_valence'])).round(2)
x['count'] = x['id'].apply(lambda _id: sum(1 for tw in tweets if tw['author_id']==_id))

In [None]:
# Horizontal bars of follower counts, coloured by the number of tweets.
fig = px.bar(
    x,
    x="followers_count",
    y="name",
    color="count",
    title="Influencers",
    hover_data={"location": True, "name": False},
    color_continuous_scale='blugrn',
    width=800,
    height=700,
)
# first row of the table at the top of the chart
fig.update_layout(yaxis_autorange="reversed")
fig.show()

In [None]:
# super tweeters: the 25 authors with the most tweets
# (Series.value_counts replaces the deprecated top-level pd.value_counts; the
# list no longer needs to be copied because it is only read.)
x = pd.Series(select_fields(tweets, ['author_id'], as_list=True)).value_counts().head(25)

# one user record per author, in the same order as the counts
tmp = pd.DataFrame([select_fields(filter_list(users, 'id', i), ['id', 'name', 'username', 'followers_count'])[0] 
                  for i in x.index])
tmp['tweets'] = list(x)

fig = px.bar(tmp, x='tweets', y='name', 
             color='tweets', 
             title="Super tweeters", 
             hover_data={"followers_count": True, "name": False, 'username':True, 'tweets':True}, 
             color_continuous_scale='blugrn', 
             width=900, height=700)
# most active author at the top
fig['layout']['yaxis']['autorange'] = "reversed"
fig.show()

In [None]:
# super tweeters per edition: the 7 most active authors of each year, drawn
# on a common 0-600 scale so that the charts are comparable
for year in [2018,2019,2021,2022]:
    filtered_tweets = filter_year(tweets, year)
    # Series.value_counts replaces the deprecated top-level pd.value_counts
    x = pd.Series(select_fields(filtered_tweets, ['author_id'], as_list=True)).value_counts().head(7)

    # one user record per author, in the same order as the counts
    tmp = pd.DataFrame([select_fields(filter_list(users, 'id', i), ['id', 'name', 'username', 'followers_count'])[0] 
                      for i in x.index])
    tmp['tweets'] = list(x)
    
    # shorten long display names to 20 characters plus an ellipsis
    tmp['name'] = tmp['name'].apply(lambda name: name[:20]+'...' if len(name)>20 else name)

    fig = px.bar(tmp, x='tweets', y='name', 
                 color='tweets', 
                 title=str(year),
                 hover_data={"followers_count": True, "name": False, 'username':True, 'tweets':True}, 
                 range_color=(0,600),
                 color_continuous_scale='blugrn', 
                 width=650, height=325)
    
    fig.update_layout(xaxis_range=[0,600])
    fig['layout']['yaxis']['autorange'] = "reversed"
    fig.show()

In [None]:
# most retweeted tweets: reuse top_retweets instead of repeating its counting
# and text clean-up inline
count, texts = top_retweets(tweets, 7)
print('xxxxxxxxxxxxxxxxx',count,'\n')

for t in texts:
    print(t)
    print('---')

### Hashtags

In [None]:
def get_useless_words():
    """Return the topic words to ignore, followed by every ordered pair of them joined together."""
    base_terms = ['milano', 'design', 'week', 'fuori', 'salone', 'mobile', 'mdw', 
          'milanodesignweek', 'salonedelmobile', 'fuorisalone', 
          '18', '19', '21', '22', '2018', '2019', '2021', '2022']
    combined = list(base_terms)
    for first in base_terms:
        for second in base_terms:
            combined.append(first + second)
    return combined


# The ignore list does not depend on the year: build it once, as a set so
# that the membership test is cheap.
useless = set(get_useless_words())

# For each edition, plot the 7 most used hashtags that are not topic words.
for year in years:
    print(year)
    tmp = filter_year(tweets, year)

    hashtags = select_fields(tmp, ['hashtags'], as_list=True)

    # flatten 
    hashtags = [item[0] for sublist in hashtags for item in sublist if item != []] 

    # lower
    hashtags = [h.lower().strip() for h in hashtags]

    # ignore topic words
    hashtags = [h.replace('ù', 'u') for h in hashtags if h not in useless]
    
    # count (Series.value_counts replaces the deprecated pd.value_counts)
    top_hashtags = (
        pd.Series(hashtags)
        .value_counts()
        .head(7)
        .rename_axis('hashtag')
        .reset_index(name='count')
    )

    # left-pad the labels with spaces; the padding shrinks as the tag grows
    top_hashtags['hashtag'] = [''.ljust(int(1.75*(23-len(h)))) + '#' + h for h in top_hashtags['hashtag']]

    fig = px.bar(top_hashtags, x="count", y="hashtag", 
             color="count", 
             hover_data={"hashtag": False},
                 color_continuous_scale='blugrn', 
             width=700, height=300)

    fig['layout']['yaxis']['autorange'] = "reversed"
    fig.update_layout(xaxis_range=[0,500])
    fig.update_coloraxes(showscale=False)
    fig.show()

### Entities

In [None]:
# Build one row per recognised entity and normalise its text and type labels.
entities = select_fields(entities_tweets, ['entities'], as_list=True)
entities = [e for e in entities if len(e) > 0]
entities = [e for sublist in entities for e in sublist] #flatten

df = pd.DataFrame(entities)

# Literal (non-regex) clean-up of the entity text. regex=False is passed
# explicitly because the default of str.replace differs between pandas versions.
text = df['text'].str.title()
for old, new in [('#', ''), ('Via Tortona', 'Tortona'), ('Italy', 'Italia'), ('..', '')]:
    text = text.str.replace(old, new, regex=False)
df['text'] = text

# .copy() so that the assignments below act on an independent frame, not on
# a filtered view (avoids SettingWithCopyWarning)
df = df[df['text']!='Milano'].copy()
# left-pad the labels with spaces; the padding shrinks as the text grows
df['text'] = [''.ljust(int(1.75*(23-len(t)))) + t for t in df['text']]

# Italian display names for the entity types
for old, new in [('LOC', 'Luogo'), ('ORG', 'Organizzazione'), ('PER', 'Persona')]:
    df['type'] = df['type'].str.replace(old, new, regex=False)

In [None]:
# Top 10 entities of each type. The result is kept in `entity_counts` so that
# the `counts` dict used by the yearly bar-plot cells is not overwritten.
for _type in ['Luogo', 'Organizzazione', 'Persona']:
    tmp = df[df['type']==_type]
    # entity text -> type, read by column name rather than by column position
    d = dict(zip(tmp['text'], tmp['type']))
    
    entity_counts = (
        tmp['text']
        .value_counts()
        .head(10)
        .rename_axis('entity')
        .reset_index(name='count')
    )
    entity_counts['type'] = entity_counts['entity'].map(d)

    fig = px.bar(entity_counts, x="count", y='entity', 
             color="type", 
             color_discrete_map={'Luogo': 'lightgreen', 'Organizzazione': 'orange', 'Persona': 'lightblue'}, 
             hover_data={"entity": False},
             width=700, height=350)
    fig['layout']['yaxis']['autorange'] = "reversed"
    fig.update_layout(barmode='stack', yaxis={'categoryorder':'total descending'})
    fig.update_coloraxes(showscale=False)
    fig.update_layout(xaxis_range=[0,2100])
    fig.update_layout(showlegend=False)
    fig.show()

In [None]:
# Top 10 entities of each edition, coloured by entity type. The result is
# kept in `entity_counts` so that the `counts` dict used by the yearly
# bar-plot cells is not overwritten.
for year in years:
    print(year)
    tmp = filter_year(entities_tweets, year)
    entities = select_fields(tmp, ['entities'], as_list=True)
    entities = [e for e in entities if len(e) > 0]
    entities = [e for sublist in entities for e in sublist] #flatten
    
    df = pd.DataFrame(entities)

    # Literal (non-regex) clean-up of the entity text. regex=False is passed
    # explicitly because the default of str.replace differs between pandas
    # versions. (The 'Italy' replacement used to be applied twice.)
    text = df['text'].str.title()
    for old, new in [('#', ''), ('Italy', 'Italia'), ('..', '')]:
        text = text.str.replace(old, new, regex=False)
    df['text'] = text

    # .copy() so that the assignments below act on an independent frame
    df = df[df['text']!='Milano'].copy()
    # left-pad the labels with spaces; the padding shrinks as the text grows
    df['text'] = [''.ljust(int(1.75*(23-len(t)))) + t for t in df['text']]

    # Italian display names for the entity types
    for old, new in [('LOC', 'Luogo'), ('ORG', 'Organizzazione'), ('PER', 'Persona')]:
        df['type'] = df['type'].str.replace(old, new, regex=False)
    
    # entity text -> type (the last occurrence wins), read by column name
    d = dict(zip(df['text'], df['type']))
    entity_counts = (
        df['text']
        .value_counts()
        .head(10)
        .rename_axis('entity')
        .reset_index(name='count')
    )
    entity_counts['type'] = entity_counts['entity'].map(d)
    
    fig = px.bar(entity_counts, x="count", y='entity', 
             color="type", 
             color_discrete_map= {'Luogo': 'lightgreen', 'Organizzazione': 'orange', 'Persona': 'lightblue'}, 
             hover_data={"entity": False},
             width=700, height=350)
    fig['layout']['yaxis']['autorange'] = "reversed"
    fig.update_layout(barmode='stack', yaxis={'categoryorder':'total descending'})
    fig.update_coloraxes(showscale=False)
    fig.update_layout(xaxis_range=[0,1600])
    fig.update_layout(showlegend=False)
    fig.show()

### Sentiment & emotions

In [None]:
# from TweetsSentiment import TweetsSentiment
# ts = TweetsSentiment()

# def change_thresholds(tweets, v_threshold, a_threshold):
#     for tw in tweets:
#         tw['emotion'] = ts.classify_emotion(tw['valence'], tw['arousal'], v_threshold, a_threshold)
#         tw['sentiment'] = ts.classify_sentiment(tw['valence'], v_threshold)
        
# change_thresholds(tweets, 0.35, 0.35)

In [None]:
# Tweet attribute analysed by the cells below.
field = 'sentiment'

In [None]:
# Stacked bars of positive vs. negative tweets per edition, first for all the
# tweets and then for the original ones only; neutral tweets are left out.
sentiment_labels = ['neutral', 'positive', 'negative']

for tmp_tweets in [tweets, original_tweets]:

    # value_counts orders its result by frequency, so the counts are aligned
    # to the labels by name. Attaching the labels positionally mislabels the
    # bars whenever the frequency order differs, and fails when a class is
    # missing from a year.
    per_year = {
        year: pd.Series(select_fields(filter_year(tmp_tweets, year), [field], as_list=True)).value_counts()
        for year in years
    }
    x = pd.DataFrame(per_year, index=sentiment_labels).fillna(0).astype(int).T

    # long format: one row per (year, label) with columns year/variable/value
    x = x.rename_axis('year').reset_index().melt(id_vars='year')

    x = x[x['variable']!='neutral']
    dfs = []
    for year in years:
        tmp = x[x['year']==year].copy()
        # share of each label among that year's non-neutral tweets
        tmp['perc_value'] = (tmp['value'] / np.sum(tmp['value']) * 100).round(2)
        tmp['perc_value'] = (tmp['perc_value']).astype(str)
        tmp['perc_value'] = tmp['perc_value'] + '%'
        dfs.append(tmp)
    x = pd.concat(dfs)
    fig = px.bar(x, x="year", y="value", color="variable", hover_data={'year':False}, text='perc_value', 
                color_discrete_map={'neutral':'lightgray', 'positive':'green', 'negative':'red'})
    fig.show()

In [None]:
# piecharts: distribution of `field` for each edition

for year in [2018,2019,2021, 2022]:
    print(year)
    filtered_tweets = filter_year(tweets, year)
    
    x = select_fields(filtered_tweets, [field], as_list=True)
    # Name the counts column explicitly. Passing the value_counts result to
    # pd.DataFrame(..., columns=[field]) yields an all-NaN column on pandas
    # versions that name that result 'count' (2.0 and later).
    df = pd.Series(x).value_counts().rename(field).to_frame()

    color_dict = {'neutral':'lightgray', 'positive':'green', 'negative':'red'}
    fig = px.pie(df, values=field, names=df.index, color=df.index, hole=0.5, 
                        color_discrete_map=color_dict, width=400, height=400)

    fig.show()

In [None]:
# Display the distinct texts of the 2018 tweets whose sentiment is 'negative'.
tmp = filter_year(tweets, 2018)
tmp = filter_list(tmp, 'sentiment', 'negative')
tmp = select_fields(tmp, ['text'], as_list=True, unique=True)
tmp

In [None]:
# Working subset used by the following cells. It starts as a shallow copy of
# all the tweets; uncomment one of the optional filters below to narrow it.
filtered_tweets = tweets.copy()

# # filter tweets by year
# filtered_tweets = filter_year(filtered_tweets, 2018)

# # filter tweets by entity / hashtag / user
# filtered_tweets = filter_list(filtered_tweets, 'entities', 'Fondazione Prada')

# # filter tweets by keyword in text
# words = ['traffic', 'parcheggi']
# x = select_fields(tweets, ['text'], as_list=True)
# filtered_tweets = [tw for tw in filtered_tweets if count_keywords(tw['text'].lower(), words) > 0]

print('number of tweets:', len(filtered_tweets))

In [None]:
# Donut charts of the emotion and sentiment distributions of filtered_tweets.
# NOTE: the loop reassigns the notebook-level `field`; it ends as 'sentiment'.
for field in ['emotion', 'sentiment']:
    x = select_fields(filtered_tweets, [field], as_list=True)
    # Name the counts column explicitly. Passing the value_counts result to
    # pd.DataFrame(..., columns=[field]) yields an all-NaN column on pandas
    # versions that name that result 'count' (2.0 and later).
    df = pd.Series(x).value_counts().rename(field).to_frame()

    # only the sentiment labels have a fixed colour
    color_dict = {'neutral':'lightgray', 'positive':'green', 'negative':'red'}
    fig = px.pie(df, values=field, names=df.index, color=df.index, hole=0.5, title=field+' analysis',  
                        color_discrete_map=color_dict, width=500, height=400)

    fig.show()

In [None]:
# Print the text of every negative, then every positive, tweet of the subset.
for sentiment in ['negative', 'positive']:
    tmp = select_fields(filter_list(filtered_tweets, 'sentiment', sentiment), ['text'], as_list=True)
    print('\n---', sentiment, '---')
    for x in tmp:
        print('XXXXX', x)

**exploration**

In [None]:
# Scatter plot of valence against arousal, one point per tweet of the subset;
# the tweet text (wrapped at 50 characters) is shown on hover.
df = pd.DataFrame(select_fields(filtered_tweets, ['valence', 'arousal', 'text']))
df = pd.DataFrame({
    # plotly renders '<br>' as a line break in hover labels
    'formatted_text': df['text'].str.wrap(50).apply(lambda wrapped: wrapped.replace('\n', '<br>')),
    'valence': df['valence'].round(3),
    'arousal': df['arousal'].round(3),
})

axis_labels = {
    "valence": "Valence",
    "arousal": "Arousal",
    "formatted_text": "Text",
}
hover_columns = {'formatted_text': True, 'valence': True, 'arousal': True}

fig = px.scatter(
    df,
    x="valence",
    y="arousal",
    color='valence',
    labels=axis_labels,
    hover_data=hover_columns,
    color_continuous_scale='RdYlGn',
    range_color=(-1,1),
    width=900,
    height=700,
)

fig.update_traces(marker={'size': 6}, selector=dict(mode='markers'))

# both axes span the full [-1, 1] range, on a light grey background
fig.update_layout(
    xaxis_range=[-1,1],
    yaxis_range=[-1,1],
    plot_bgcolor='rgba(128,128,128,0.15)',
)

fig.show()

### wordclouds

In [None]:
import string, os
from wordcloud import WordCloud

def wordcloud(freq, size=(10,7), save=False):
    """Show a word cloud built from word frequencies and optionally save it.

    Parameters
    ----------
    freq : word -> frequency mapping, passed to
        ``WordCloud.generate_from_frequencies``.
    size : figure size. It is written to ``plt.rcParams``, so it also applies
        to the figures created afterwards.
    save : ``False``/``None`` (default) writes no file. A string is used as a
        prefix: the image is written to ``save + '_wordcloud.png'``. ``True``
        writes ``'wordcloud.png'`` (it used to raise ``TypeError``).
    """
    plt.rcParams["figure.figsize"] = size

    # An emoji-aware variant (custom `regexp` plus the Symbola font passed to
    # WordCloud) was sketched here earlier but never enabled.
    wc = WordCloud(background_color="white", width=1200, height=800)
    wc.generate_from_frequencies(freq)
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.show()

    if save:
        # `save` doubles as on/off flag and file-name prefix
        filename = 'wordcloud.png' if save is True else str(save) + '_wordcloud.png'
        wc.to_file(filename)


# for emotion in ['joy', 'anticipation', 'trust', 'surprise', 'sadness', 'fear', 'anger', 'neutral']:
#     filtered_tweets = filter_list(tweets, 'emotion', emotion)
#     texts = select_fields(filtered_tweets, ['lemmatized_text'], as_list=True)
    
#     texts = ' '.join(texts).split()

#     texts = [word.lower().strip() for word in texts 
#              if word.lower().strip() not in ignore
#             and not word.startswith('#')
#             and not word.startswith('@')
#             and not word.startswith('.')]
    
#     if len(texts) > 0:
#         print()
#         print(emotion)
#         wordcloud(pd.value_counts(texts))

In [None]:
# One word cloud for the negative tweets and one for the positive tweets.
# NOTE: this reassigns the notebook-level `filtered_tweets`.
for emotion in ['negative', 'positive']:
    filtered_tweets = filter_list(tweets, 'sentiment', emotion)
    texts = select_fields(filtered_tweets, ['preprocessed_text'], as_list=True)
    
    texts = ' '.join(texts).split()

    # keep purely alphabetic tokens; this also drops hashtags, mentions and
    # anything starting with '.', so no separate prefix test is needed
    texts = [word.lower().strip() for word in texts if word.isalpha()]
    
    # mask a swear word before it is displayed
    texts = [t.replace('cazzo', 'c***o') for t in texts]
    
    if len(texts) > 0:
        print()
        print(emotion)
        # Series.value_counts replaces the deprecated top-level pd.value_counts
        wordcloud(pd.Series(texts).value_counts())

In [None]:
# Negative and positive word clouds for each edition.
# NOTE: this reassigns the notebook-level `filtered_tweets`.
for year in years:
    print()
    print(year)
    # the year filter does not depend on the sentiment: compute it once
    tmp_tweets = filter_year(tweets, year)
    for emotion in ['negative', 'positive']:
        filtered_tweets = filter_list(tmp_tweets, 'sentiment', emotion)
        texts = select_fields(filtered_tweets, ['preprocessed_text'], as_list=True)

        texts = ' '.join(texts).split()

        # keep purely alphabetic tokens; this also drops hashtags, mentions
        # and anything starting with '.', so no separate prefix test is needed
        texts = [word.lower().strip() for word in texts if word.isalpha()]
        
        # mask a swear word before it is displayed
        texts = [t.replace('cazzo', 'c***o') for t in texts]

        if len(texts) > 0:
            print()
            print(emotion)
            # Series.value_counts replaces the deprecated pd.value_counts
            wordcloud(pd.Series(texts).value_counts())

In [None]:
# The three most repeated texts per edition and sentiment.
# The sentiment filter does not depend on the year: compute it once.
by_sentiment = {sent: filter_list(tweets, 'sentiment', sent) for sent in ['negative', 'positive']}

for year in years:
    for sent in ['negative', 'positive']:
        x = filter_year(by_sentiment[sent], year)
        x = select_fields(x, ['text'], as_list=True)
        # Series.value_counts replaces the deprecated top-level pd.value_counts
        x = pd.Series(x).value_counts().head(3)
        print('\n---', year, sent)
        print(x)

### filter by sentiment & emotion

In [None]:
# Attribute and value used by the next cell to select tweets.
field = 'sentiment'
value = 'positive'

In [None]:
# Table of the selected tweets with the author's name and follower count,
# sorted by valence.
# Each tweet dict is copied: a list .copy() is shallow, so adding keys below
# would otherwise also modify the dicts held by `tweets`.
filtered_tweets = [dict(tw) for tw in filter_list(tweets, field, value)]

# look each author up once instead of scanning `users` for every tweet
user_cache = {}
for tw in filtered_tweets:
    author_id = tw['author_id']
    if author_id not in user_cache:
        user_cache[author_id] = filter_list(users, 'id', author_id)[0]
    user = user_cache[author_id]
    tw['user_name'] = user['name']
    tw['followers_count'] = user['followers_count']
df = select_fields(filtered_tweets, ['user_name', 'followers_count', 'text', 'valence', 'arousal'])
df = pd.DataFrame(df)
df = df.sort_values(by='valence').reset_index(drop=True)[['text', 'user_name', 'followers_count', 'valence', 'arousal']]
df

In [None]:
# Full text of the row at position 1 of `df`.
df.iloc[1]['text']

### locations

In [None]:
# Located tweets, with 'Italia' rewritten as 'Italy' in the country names and
# the frame indexed by its 'datetime' column.
df = get_tweets_with_location(tweets, users, places)
df['country'] = df['country'].str.replace('Italia', 'Italy')
df.index = df['datetime']

# number of rows for each location type
for caption, kind in [('Geolocalization:', 'geolocalization'), ('User location:  ', 'user_location')]:
    print(caption, len(df[df['type']==kind]))

df

In [None]:
# The 20 most frequent countries. The columns are named explicitly because
# the name of the value_counts result depends on the pandas version
# ('country' before 2.0, 'count' from 2.0), which made x['country'] fail.
x = (
    df['country']
    .value_counts()
    .head(20)
    .rename_axis('country')
    .reset_index(name='count')
)

fig = px.bar(x, x='count', y='country', 
             color='count', 
             color_continuous_scale='blugrn', 
             width=700, height=600)
# most frequent country at the top
fig['layout']['yaxis']['autorange'] = "reversed"
fig.show()

In [None]:
# World map with one fuchsia marker per located tweet.
fig = px.scatter_mapbox(
    df,
    lat="lat",
    lon="lon",
    hover_name="country",
    color_discrete_sequence=["fuchsia"],
    zoom=1,
    height=600,
)
# OpenStreetMap tiles, no margin around the map
fig.update_layout(
    mapbox_style="open-street-map",
    margin={"r":0,"t":0,"l":0,"b":0},
)
fig.update_traces(marker=dict(size=5))
fig.show()

In [None]:
# The 20 most repeated tweet texts (Series.value_counts replaces the
# deprecated top-level pd.value_counts).
tmp = [tw['text'] for tw in tweets]
pd.Series(tmp).value_counts().head(20)