# Imports

In [1]:
import pandas as pd
import plotly as plt
import plotly.graph_objects as go

from IPython.display import clear_output
import nltk
import re
from collections import defaultdict, Counter
from string import punctuation

#stopwords em portugues
from nltk.corpus import stopwords
stopwords = set(stopwords.words('portuguese') + list(punctuation) + ['“','”'])

#funçao pra limpeza de texto
from nltk.tokenize import TweetTokenizer
tweet_tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True, preserve_case=False)

# Frequência de palavras chave relativa ao total de tweets em um dia

In [2]:
tweets = pd.read_csv('100k_user_text_date.csv')

In [3]:
tweets = tweets[['text', 'created_at']]

## Tokenizando o texto

In [4]:
# funçao pra limpar texto e retirar stopwords
def tokenizar_texto(texto_tweet):
    texto_tokenizado = tweet_tokenizer.tokenize(texto_tweet)

    if texto_tokenizado[0] == "rt" and texto_tokenizado[1] == ":":
        del texto_tokenizado[0]
    
    palavras_sem_stopwords = [palavra for palavra in texto_tokenizado if palavra not in stopwords] 
    return palavras_sem_stopwords

In [5]:
tweets['text'] = tweets['text'].apply(lambda x: tokenizar_texto(x))

In [6]:
tweets = tweets.groupby('created_at').sum()

In [7]:
tweets.head()

Unnamed: 0_level_0,text
created_at,Unnamed: 1_level_1
2020-04-11,"[efeitos, quarentena, kkk, quarentena, acabou,..."
2020-04-12,"[desculpas, mina, agora, quarentena, dessa, ve..."
2020-04-13,"[porem, ainda, curto, sempre, inovando, lançan..."
2020-04-14,"[coisas, q, quarentena, ensinou, final, sempre..."
2020-04-15,"[imagina, mundo, pandemia, quarentena, etc, vc..."


## Analisando frequências

In [8]:
days = tweets.index.tolist()

#### Tokenizando a coluna 'text'

In [9]:
tweets['text'] = tweets['text'].apply(lambda x: tokenizar_texto(x))

TypeError: expected string or buffer

In [None]:
tweets = tweets.groupby('created_at').sum()

In [None]:
tweets.head()

## Frequência de palavras-chave por dia

In [None]:
freqs_per_day = {}

In [None]:
for day in days:
    keywords = ['corona', 'covid', 'coronavirus', 'covid19', 'covid-19', 'sars', 'coronavírus', 'quarentena', 'confinamento', 'hidroxicloroquina',
                'cloroquina', 'distanciamento', 'aglomeração', 'aglomerações'] # Lista de palavras chave
    i = days.index(day)
    frequency = Counter(tweets['text'][i])
    frequency = {k: v for k, v in frequency.items() if k in keywords}
    frequency['coronavirus'] += frequency.get('coronavírus')
    del frequency['coronavírus']
    
    frequency = pd.DataFrame.from_dict(frequency, orient='index')
    
    frequency['freq'] = frequency[0]
    frequency = frequency.drop(columns = 0)
    frequency = frequency.sort_values(by='freq', ascending = False)
    
    freqs_per_day[day] = frequency

### Função que extrai frequência de uma palavra ao longo dos dias

In [None]:
def word_per_day(word):
    freq = {}
    for day in days:
        freq[day] = freqs_per_day[day]['freq'][word]/len(tweets['text'][day])
    return freq

# "Covid", "Corona" e "Coronavirus"

In [None]:
fig = go.Figure()
fig.layout.yaxis.tickformat = ',.2%'
fig.add_trace(go.Scatter(x= days, y =list(word_per_day('covid').values()), name='Covid'))
fig.add_trace(go.Scatter(x= days, y =list(word_per_day('corona').values()), name='Corona'))
fig.add_trace(go.Scatter(x= days, y =list(word_per_day('coronavirus').values()), name='Coronavirus'))

# "Quarentena" e "Confinamento"

In [None]:
fig = go.Figure()
fig.layout.yaxis.tickformat = ',.2%'
fig.add_trace(go.Scatter(x= days, y =list(word_per_day('quarentena').values()), name='Quarentena'))
fig.add_trace(go.Scatter(x= days, y =list(word_per_day('confinamento').values()), name='Confinamento'))

# "Hidroxicloroquina", "Cloroquina"

In [None]:
fig = go.Figure()
fig.layout.yaxis.tickformat = ',.2%'
fig.add_trace(go.Scatter(x= days, y =list(word_per_day('hidroxicloroquina').values()), name='Hidroxicloroquina'))
fig.add_trace(go.Scatter(x= days, y =list(word_per_day('cloroquina').values()), name='Cloroquina'))

# "Distanciamento"

In [None]:
fig = go.Figure()
fig.layout.yaxis.tickformat = ',.2%'
fig.add_trace(go.Scatter(x= days, y =list(word_per_day('distanciamento').values()), name='Distanciamento'))

# "Aglomeração", "Aglomerações"

In [None]:
fig = go.Figure()
fig.layout.yaxis.tickformat = ',.2%'
fig.add_trace(go.Scatter(x= days, y =list(word_per_day('aglomeração').values()), name='Aglomeração'))
fig.add_trace(go.Scatter(x= days, y =list(word_per_day('aglomerações').values()), name='Aglomerações'))