# Identifying coincidences in Tokyo Olympics 2020 Tweets

***Student:*** Lais Isabelle ALVES DOS SANTOS

### Importing Libraries

In [267]:
import concurrent.futures as cf         # Use multiple threads
import functools
import importlib

import nltk                             # Natural Language Toolkit
import numpy as np
import pandas as pd
from geopy.geocoders import Nominatim   # Geolocalization
from IPython.display import clear_output
from nltk.sentiment import SentimentIntensityAnalyzer

import functions as F

# Single Nominatim geocoder client shared by every location lookup in this notebook
geolocator = Nominatim(
    timeout=10,
    user_agent="coincidences",
)

In [257]:
# Re-import the local `functions` module (imported as F) so that edits made to
# functions.py take effect without restarting the kernel. Only needs to be run
# after the .py file has been changed.
importlib.reload(F)

<module 'functions' from 'd:\\Familia\\Lais\\Intercâmbio\\Télécom\\2A\\Creneaux D\\IA225\\mini_project\\ai225-mini-project-coincidences\\functions.py'>

### Load data

In [262]:
# Read the tweet dataset from the CSV file located one directory above the notebook
tokyo_tweets_db = pd.read_csv('../tokyo_2020_tweets.csv')

# Preview the first six rows of the data
tokyo_tweets_db.head(6)

  tokyo_tweets_db = pd.read_csv('../tokyo_2020_tweets.csv')


Unnamed: 0,id,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,retweets,favorites,is_retweet
0,1418888645105356803,Abhishek Srivastav,"Udupi, India",Trying to be mediocre in many things,2021-02-01 06:33:51,45.0,39.0,293.0,False,2021-07-24 10:59:49,Let the party begin\n#Tokyo2020,['Tokyo2020'],Twitter for Android,0.0,0.0,False
1,1418888377680678918,Saikhom Mirabai Channu🇮🇳,"Manipur, India",Indian weightlifter 48 kg category. Champion🏆,2018-04-07 10:10:22,5235.0,5.0,2969.0,False,2021-07-24 10:58:45,Congratulations #Tokyo2020 https://t.co/8OFKMs...,['Tokyo2020'],Twitter for Android,0.0,0.0,False
2,1418888260886073345,Big Breaking,Global,All breaking news related to Financial Market....,2021-05-29 08:51:25,3646.0,3.0,5.0,False,2021-07-24 10:58:17,Big Breaking Now \n\nTokyo Olympic Update \n\n...,,Twitter for Android,0.0,1.0,False
3,1418888172864299008,International Hockey Federation,Lausanne,Official International Hockey Federation Twitt...,2010-10-20 10:45:59,103975.0,2724.0,36554.0,True,2021-07-24 10:57:56,Q4: 🇬🇧3-1🇿🇦\n\nGreat Britain finally find a wa...,,Twitter Web App,1.0,0.0,False
4,1418886894478270464,Cameron Hart,Australia,Football & Tennis Coach,2020-10-31 08:46:17,6.0,37.0,31.0,False,2021-07-24 10:52:51,All I can think of every time I watch the ring...,"['Tokyo2020', 'ArtisticGymnastics', '7Olympics...",Twitter for iPhone,0.0,0.0,False
5,1418885092571766792,Sab Joke H,India,Follows you,2020-09-05 19:50:35,107.0,88.0,102.0,False,2021-07-24 10:45:42,#Tokyo2020 #Olympics\n#MirabaiChanu\n#Weightli...,"['Tokyo2020', 'Olympics', 'MirabaiChanu', 'Wei...",Twitter for Android,0.0,0.0,False


### Pre-process the data

In [263]:
# Fetch the NLTK data packages used by this notebook; packages that are already
# installed are simply reported as up-to-date.
required_nltk_packages = ["stopwords", "vader_lexicon", "punkt"]
nltk.download(required_nltk_packages)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LaisIsabelle\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\LaisIsabelle\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\LaisIsabelle\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [171]:
# Keep only the tweets that have both a text and a user location.
# dropna() detects every missing-value marker (NaN, None, pd.NA) instead of
# comparing str(value) against 'nan', and an empty result yields two empty
# tuples rather than failing while unpacking zip(*[]).
tweets_clean = tokyo_tweets_db.dropna(subset=["text", "user_location"])

# Tuples aligned position-by-position: texts[i] was posted from locations[i]
texts = tuple(tweets_clean["text"])
locations = tuple(tweets_clean["user_location"])

In [143]:
# English stop words ("of", "a", "the" and similar). The list is handed to
# F.pipeline when the tweets are scored, so that these very common words are
# excluded and do not negatively affect the analysis.
stopwords = nltk.corpus.stopwords.words("english")

In [210]:
sia = SentimentIntensityAnalyzer()

# Sentiment scores for each text, in the same order as `texts`:
# every tweet goes through F.pipeline (with the stop-word list) before being scored.
sscores = []
for text in texts:
    processed_text = F.pipeline(text, stopwords)
    sscores.append(sia.polarity_scores(processed_text))

In [250]:
# Show the original text and the four sentiment scores of one example tweet
id_text = 1110
ss = sscores[id_text]

report_fields = [texts[id_text], ss['neg'], ss['neu'], ss['pos'], ss['compound']]
print('Text: {}\nNegative score:\t{}\nNeutral Score:\t{}\nPositive Score:\t{}\nCompound:\t{}'.format(*report_fields))

Text: #Tokyo2020  #Olympics  #bbcnews #skynews #gbnews failure to stop the illegal invasion from France is destroying the conservative government
Negative score:	0.614
Neutral Score:	0.386
Positive Score:	0.0
Compound:	-0.9136


In [259]:
# Turn each tweet's compound score into a sentiment label via F.set_sentiment;
# the resulting list is aligned position-by-position with `sscores`.
compound_scores = (scores['compound'] for scores in sscores)
sentiment_analysis = [F.set_sentiment(compound) for compound in compound_scores]

In [277]:
def select_by_sentiment(items, labels, wanted):
    """Return the entries of ``items`` whose label at the same position equals ``wanted``.

    Args:
        items: Sequence of values to filter (tweet texts or user locations).
        labels: Sentiment labels aligned position-by-position with ``items``.
        wanted: The label to keep ('neu', 'neg' or 'pos' in this notebook).

    Returns:
        list: The selected entries, in their original order.
    """
    return [item for item, label in zip(items, labels) if label == wanted]


# The three sequences must describe the same tweets in the same order. Cells of
# this notebook can be re-run out of order, so fail loudly on stale state
# instead of silently pairing texts with the wrong labels.
assert len(texts) == len(locations) == len(sentiment_analysis), \
    "texts, locations and sentiment_analysis are out of sync; re-run the pre-processing cells"

# Separate the texts and locations according to sentiment
neutral_texts = select_by_sentiment(texts, sentiment_analysis, 'neu')
neutral_locations = select_by_sentiment(locations, sentiment_analysis, 'neu')

negative_texts = select_by_sentiment(texts, sentiment_analysis, 'neg')
negative_locations = select_by_sentiment(locations, sentiment_analysis, 'neg')

positive_texts = select_by_sentiment(texts, sentiment_analysis, 'pos')
positive_locations = select_by_sentiment(locations, sentiment_analysis, 'pos')

### Explore coincidences

#### Description complexity

Description complexity refers to the amount of information or detail required to fully describe or represent a system, object, or phenomenon. It is a measure of the complexity of the structure or organization of the system being described. In the context of information theory, description complexity is often quantified using metrics such as algorithmic complexity or Kolmogorov complexity, which aim to capture the shortest possible description of a system.

In this case, the tweets are grouped by sentiment (*neutral*, *positive* and *negative*), and the description complexity is calculated for the distance between the places from which tweets sharing the same sentiment were posted.

In [278]:
@functools.lru_cache(maxsize=None)
def _geocode_cached(query):
    """Geocode one location string; repeated strings are answered from the cache."""
    return geolocator.geocode(query)


# Returns geolocations to get latitude and longitude coordinates
def geo_location_calculator(location):
    """Look up ``location`` with the shared ``geolocator`` client.

    Many tweets carry the same user location, so string queries are memoised:
    each distinct string is sent to the geocoding service only once.

    Args:
        location: The location to geocode, normally a free-text string.
            Non-string queries are forwarded to ``geolocator.geocode`` as-is
            and are not cached.

    Returns:
        Whatever ``geolocator.geocode`` returns for the query, or ``None`` for
        an empty / whitespace-only string (no request is made in that case).
    """
    if not isinstance(location, str):
        # Possibly unhashable input: bypass the cache and keep the original behaviour.
        return geolocator.geocode(location)
    if not location.strip():
        # Nothing to look up; avoid a pointless request to the service.
        return None
    return _geocode_cached(location)

# parallelize the filling
# NOTE(review): the public Nominatim service asks for at most 1 request per
# second; before re-enabling the thread pool below, consider throttling the
# calls (e.g. geopy.extra.rate_limiter.RateLimiter) -- confirm against the usage policy.
#with cf.ThreadPoolExecutor() as exe:
#    geo_locations = list(exe.map(geo_location_calculator, locations))

#### Causal complexity

#### Unexpectedness calculation