In [40]:
import requests
import os
import json
from api_keys import BEARER_TOKEN
from tqdm import tqdm
import os
import pandas as pd
from datetime import datetime
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
import re

[nltk_data] Downloading package stopwords to /home/julio/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Funções (Análise Exploratória)

### Função para plotar bar plot com a contagem de tokens

In [2]:
def plot_bar_count_words(text_column=None,
                         label_column=None,
                         name_class=None,
                         dataframe=None,
                         metric='SUM',
                         top=50,return_df=True):
    
    corpus = dataframe[text_column].values
    
    vectorizer = CountVectorizer()
    data_vect = vectorizer.fit_transform(corpus)
    data_vect = data_vect.toarray()
    
    df_count_words =  pd.DataFrame({
    "WORDS":vectorizer.get_feature_names(),
    "MEAN":data_vect.mean(axis=0),
    "SUM":data_vect.sum(axis=0),
    "STD":data_vect.std(axis=0),
    }) 
    
    

    if return_df:
    
        return df_count_words[[metric,'WORDS']].sort_values(by=[metric],ascending=False)[0:top]
    
    else:
        
        fig = plt.figure(figsize=(15,10))
        
        ax = sns.barplot(x=metric, 
                 y="WORDS", 
                 data=df_count_words[[metric,'WORDS']].sort_values(by=[metric],
                                                                            ascending=False)[0:top])


### Função para plotar bar plot com tf-idf

In [3]:
def plot_bar_tf_idf(text_column=None,
                         label_column=None,
                         name_class=None,
                         dataframe=None,
                         metric='SUM',
                         top=50,return_df=True):
    
    corpus = dataframe[text_column].values
    
    vectorizer = TfidfVectorizer()
    data_vect = vectorizer.fit_transform(corpus)
    data_vect = data_vect.toarray()
    
    df_count_words =  pd.DataFrame({
    "WORDS":vectorizer.get_feature_names(),
    "MEAN":data_vect.mean(axis=0),
    "SUM":data_vect.sum(axis=0),
    "STD":data_vect.std(axis=0),
    "MAX":data_vect.std(axis=0)
    }) 
    
    

    if return_df:
    
        return df_count_words[[metric,'WORDS']].sort_values(by=[metric],ascending=False)[0:top]
    
    else:
        
        fig = plt.figure(figsize=(15,10))
        
        ax = sns.barplot(x=metric, 
                 y="WORDS", 
                 data=df_count_words[[metric,'WORDS']].sort_values(by=[metric],
                                                                            ascending=False)[0:top])


### Função para contagem de tokens

In [4]:
def calculate_number_words(text):

    quantity_of_words = text.split(" ")

    quantity_of_words = [i for i in quantity_of_words if i!=""]

    quantity_of_words = len(quantity_of_words)

    return quantity_of_words


### Função para contagem de diferentes tokens

In [5]:
def calculate_number_diferent_words(text):

    quantity_of_diferent_words = text.split(" ")

    quantity_of_diferent_words = [i for i in quantity_of_diferent_words if i!=""]

    quantity_of_diferent_words = set(quantity_of_diferent_words)

    quantity_of_diferent_words = list(quantity_of_diferent_words)

    quantity_of_diferent_words = len(quantity_of_diferent_words)

    return quantity_of_diferent_words


### Função para criar textos sem repetição de palavras para ser utilizado na análise exploratória 

In [6]:
def convert_text_to_no_repeat_words(text):

    text_with_no_repeat_words = text.split(" ")

    text_with_no_repeat_words = [i for i in text_with_no_repeat_words if i!=""]

    text_with_no_repeat_words = set(text_with_no_repeat_words)

    text_with_no_repeat_words = list(text_with_no_repeat_words)

    text_with_no_repeat_words = " ".join(text_with_no_repeat_words)

    return text_with_no_repeat_words

### Função para o pré-processamento do texto 

In [7]:
def text_cleaner(text):
    
    nltk_stopwords = stopwords.words('portuguese')

    collection_text = [ {"text" : text}]
    text = pd.DataFrame(collection_text)

    text['text'] = text['text'].astype('str')
    text['text'] = text['text'].str.lower()
    text['text'] = text['text'].str.replace('\n',' ')
    text['text'] = text['text'].str.replace('\r',' ')
    text['text'] = text['text'].apply(lambda x: norm('NFKD', x).encode('ascii', 'ignore').decode())
    text['text'] = text['text'].apply(lambda x: re.sub(r'[^a-zA-Z0-9]',' ',x))
    text['text'] = text['text'].apply(lambda x: re.sub(r'\s+',' ',x))
    pat = r'\b(?:{})\b'.format('|'.join(nltk_stopwords))
    text['text'] = text['text'].str.replace(pat,'')
    text = text['text'].values[0]

    return text

### Função para limpeza dos textos

In [99]:
def text_cleaner(text,stop_words_domain =None):
    
    nltk_stopwords = stopwords.words('portuguese')
    regex_stop_words = '|'.join(nltk_stopwords)
    text = re.sub(r"\shttps([a-zA-Zà-úÀ-Ú0-9]|[-()\"#/@;:<>{}`+=~|.!?,])+$|^https([a-zA-Zà-úÀ-Ú0-9]|[-()\"#/@;:<>{}`+=~|.!?,])+\s|\shttps([a-zA-Zà-úÀ-Ú0-9]|[-()\"#/@;:<>{}`+=~|.!?,])+\s"," ",text)
    text = re.sub(r"[^a-zA-ZÀ-Úà-ú]+"," ",text)
    text = re.sub(r"\s({})\s|\s({})$|^({})\s".format(regex_stop_words,regex_stop_words,regex_stop_words)," ",text)
    
    return text

In [100]:
txt = " uhahua,não"

In [101]:
text_cleaner(txt)

' uhahua '

In [85]:
stopwords.words('portuguese')

['de',
 'a',
 'o',
 'que',
 'e',
 'é',
 'do',
 'da',
 'em',
 'um',
 'para',
 'com',
 'não',
 'uma',
 'os',
 'no',
 'se',
 'na',
 'por',
 'mais',
 'as',
 'dos',
 'como',
 'mas',
 'ao',
 'ele',
 'das',
 'à',
 'seu',
 'sua',
 'ou',
 'quando',
 'muito',
 'nos',
 'já',
 'eu',
 'também',
 'só',
 'pelo',
 'pela',
 'até',
 'isso',
 'ela',
 'entre',
 'depois',
 'sem',
 'mesmo',
 'aos',
 'seus',
 'quem',
 'nas',
 'me',
 'esse',
 'eles',
 'você',
 'essa',
 'num',
 'nem',
 'suas',
 'meu',
 'às',
 'minha',
 'numa',
 'pelos',
 'elas',
 'qual',
 'nós',
 'lhe',
 'deles',
 'essas',
 'esses',
 'pelas',
 'este',
 'dele',
 'tu',
 'te',
 'vocês',
 'vos',
 'lhes',
 'meus',
 'minhas',
 'teu',
 'tua',
 'teus',
 'tuas',
 'nosso',
 'nossa',
 'nossos',
 'nossas',
 'dela',
 'delas',
 'esta',
 'estes',
 'estas',
 'aquele',
 'aquela',
 'aqueles',
 'aquelas',
 'isto',
 'aquilo',
 'estou',
 'está',
 'estamos',
 'estão',
 'estive',
 'esteve',
 'estivemos',
 'estiveram',
 'estava',
 'estávamos',
 'estavam',
 'estivera'

# Funções (Extração de Tweets)

In [9]:
# To set your enviornment variables in your terminal run the following line:
# export 'BEARER_TOKEN'='<your_bearer_token>'


def create_url(query = "@globoplay -is:retweet",until_id=None):
    
    #query = "@BBB -is:retweet"
    #"from:twitterdev -is:retweet"
    # Tweet fields are adjustable.
    # Options include:
    # attachments, author_id, context_annotations,
    # conversation_id, created_at, entities, geo, id,
    # in_reply_to_user_id, lang, non_public_metrics, organic_metrics,
    # possibly_sensitive, promoted_metrics, public_metrics, referenced_tweets,
    # source, text, and withheld
    
    if until_id:
        
        url = "https://api.twitter.com/2/tweets/search/recent?query={}&max_results=100&until_id={}".format(
            query,until_id
        )
        
    else:
        
        url = "https://api.twitter.com/2/tweets/search/recent?query={}&max_results=100".format(
            query
        )
            
    return url


def create_headers(bearer_token):
    headers = {"Authorization": "Bearer {}".format(bearer_token)}
    return headers


def connect_to_endpoint(url, headers):
    response = requests.request("GET", url, headers=headers)
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()


def extract_100_tweets(query = "@BBB -is:retweet",until_id=None):
    bearer_token = BEARER_TOKEN
    url = create_url(query,until_id)
    headers = create_headers(bearer_token)
    json_response = connect_to_endpoint(url, headers)
    data_tweets = json.dumps(json_response, indent=4, sort_keys=True)
    return json_response

def extract_many_tweets(qnt_cycle=10,folder="data_tweets",start_from_id=None,query="@BBB"):
    
    
    oldest_id = None
    
    for i in tqdm(range(qnt_cycle)):
    
        
        if i == 0:
            
            #extract the 100 tweets first
            
            if start_from_id:
        
                data_tweets = extract_100_tweets(query = "{} -is:retweet".format(query),until_id=None)
            
            else:
                
                data_tweets = extract_100_tweets(query = "{} -is:retweet".format(query),until_id=start_from_id)
                
            
            df_data_tweets_temp = pd.DataFrame(data_tweets["data"])
            
            #get the current date
            
            date_extraction = datetime.now()
            
            df_data_tweets_temp["date_extraction"] = date_extraction 
            
            
            oldest_id = data_tweets['meta']['oldest_id']
            
            oldest_date = date_extraction
            
            df_data_tweets = df_data_tweets_temp.copy()
            
            # name file
            
            date_extraction_str = str(date_extraction).replace(".","-").replace(":","-").replace(" ","-")
            
            name_file = "./{}/persist_tweets_{}_{}.csv".format(folder,date_extraction_str,date_extraction_str)
            
            # persist base
            
            df_data_tweets.to_csv(name_file,sep=",")
            
    
            
        else:
            
            
            #extract more 100 tweets older
            
            data_tweets_temp = extract_100_tweets(query = "{} -is:retweet".format(query),until_id=oldest_id)
            
            df_data_tweets_temp = pd.DataFrame(data_tweets_temp["data"])
            
            
            #get the current date
            
            
            date_extraction = datetime.now()
            
            df_data_tweets_temp["date_extraction"] = date_extraction 
            
            oldest_id = data_tweets_temp['meta']['oldest_id']
            
            df_data_tweets = pd.concat([df_data_tweets,df_data_tweets_temp.copy()])
            
            date_extraction = datetime.now()
            
            df_data_tweets.reset_index(inplace=True,drop=True)
            
            
            # remove old files
            
            os.remove(name_file)
            
            
            # name file
            
            oldest_date_str = str(oldest_date).replace(".","-").replace(":","-").replace(" ","-")
            
            date_extraction_str = str(date_extraction).replace(".","-").replace(":","-").replace(" ","-")
            
            
            name_file = "./{}/persist_tweets_{}_{}.csv".format(folder,oldest_date_str,date_extraction_str)
            
            # persist base
            
            df_data_tweets.to_csv(name_file.format(folder),sep=",")
            
            

    return df_data_tweets
    
    


# Extração de Tweets

In [11]:
data_tweets_final = extract_many_tweets(qnt_cycle=10,folder="data_tweets",query="globoplay")#,start_from_id="1367600965277384706")

100%|██████████| 10/10 [00:14<00:00,  1.45s/it]


In [42]:
data_tweets_final.head(30)

Unnamed: 0,id,text,date_extraction,text_unique_words,number_tokens,number_diferent_tokens,bins,indices_bins,text_clean
0,1383949586826334212,@globoplay @rafakalimann_ ninguém pediu isso c...,2021-04-18 22:04:46.634032,usurpadora logo a no pediu coloca catálogo @gl...,11,11,"[7.2,13.4)",1,@gl b pl y @r f k lim nn_ ningu m p diu c l ...
1,1383949548708499457,@hallentlynn amiga no rave não tem globoplay 🥺,2021-04-18 22:04:46.634032,@hallentlynn no rave tem não amiga globoplay 🥺,8,8,"[7.2,13.4)",1,@h ll ntlynn mig r v m gl b pl y 🥺
2,1383949443326611459,@evansftx amg mas no rave tem a opção globopla...,2021-04-18 22:04:46.634032,a pq mas no antes opção tinha amg rave tem não...,13,13,"[7.2,13.4)",1,@ v nsftx mg r v m pçã gl b pl y? p...
3,1383949414197186561,Melhor pessoa desse bbb https://t.co/HaKu8sRLkX,2021-04-18 22:04:46.634032,desse bbb Melhor pessoa https://t.co/HaKu8sRLkX,5,5,"[1.0,7.2)",0,M lh r p ss s bbb https://t.c /H Ku8sRLkX
4,1383949406836191237,"@globoplay Eu gostei, mas de fato achei que el...",2021-04-18 22:04:46.634032,"gostei, tendo mas 😅😅😅😅 ele estava fato convuls...",13,13,"[7.2,13.4)",1,"@gl b pl y Eu g s i, f t c l st v ..."
5,1383949316419555335,"Nossa, como deve ser horrível ser hater do Gil...",2021-04-18 22:04:46.634032,"como ser 🤭✌🏻 Gil horrível hater vigor Nossa, h...",13,11,"[7.2,13.4)",1,"N ss , v r h rrív l r h r Gil vig ..."
6,1383949242562084877,não faz sentido algum minha globoplay não func...,2021-04-18 22:04:46.634032,algum faz sentido na minha não funcionar tv gl...,10,9,"[7.2,13.4)",1,f z nti lg gl b pl y funci r tv
7,1383949219149451273,Estou aguardando pelos próximos episódios da s...,2021-04-18 22:04:46.634032,da A episódios aguardando vacinas pelos das sé...,14,14,"[13.4,19.7)",2,Est u gu r n s próxim s pi di s s ri A ...
8,1383949162064998402,@globoplay quando poderei matar as saudades de...,2021-04-18 22:04:46.634032,quando matar UM NO @globoplay as de O.C. poder...,12,12,"[7.2,13.4)",1,@gl b pl y p r i m t r s s u s O.C. UM ...
9,1383949094515666946,@hallentlynn se essa pessoa tbm tiver a globop...,2021-04-18 22:04:46.634032,"tiver a @hallentlynn tbm globoplay, sim se pes...",9,9,"[7.2,13.4)",1,"@h ll ntlynn ss p ss tbm r gl b pl y,..."


In [86]:
data_tweets_final["text_clean"] = data_tweets_final["text"].apply(lambda x: text_cleaner(text = x,stop_words_domain =None))

In [87]:
data_tweets_final[["text","text_clean"]]

Unnamed: 0,text,text_clean
0,@globoplay @rafakalimann_ ninguém pediu isso c...,globoplay rafakalimann ninguém pediu coloca l...
1,@hallentlynn amiga no rave não tem globoplay 🥺,hallentlynn amiga rave tem globoplay
2,@evansftx amg mas no rave tem a opção globopla...,evansftx amg no rave a opção globoplay pq ant...
3,Melhor pessoa desse bbb https://t.co/HaKu8sRLkX,Melhor pessoa desse bbb
4,"@globoplay Eu gostei, mas de fato achei que el...",globoplay Eu gostei de fato achei ele tendo c...
...,...,...
995,Eu ia ser tão feliz se a @globoplay adicionass...,Eu ia ser tão feliz a globoplay adicionasse no...
996,Por que ela ta querendo virar a tidinha ? http...,Por ela ta querendo virar tidinha
997,"pelo amor, alguém gravou esse comercial da Raf...",pelo amor alguém gravou comercial Rafa Paulo p...
998,@globoplay Joker do Vigor #BBB21,globoplay Joker Vigor BBB


In [69]:
re.sub(,data_tweets_final["text"][3]

SyntaxError: invalid syntax (<ipython-input-69-e4cc38c47788>, line 1)

In [70]:
data_tweets_final["text"][3]

'Melhor pessoa desse bbb https://t.co/HaKu8sRLkX'

In [78]:
test = 'https://t.co/HaKu8sRLkX Melhor pessoa desse bbb '

In [79]:
re.sub(r"\shttps([a-zA-Zà-úÀ-Ú0-9]|[-()\"#/@;:<>{}`+=~|.!?,])+$|^https([a-zA-Zà-úÀ-Ú0-9]|[-()\"#/@;:<>{}`+=~|.!?,])+\s|\shttps([a-zA-Zà-úÀ-Ú0-9]|[-()\"#/@;:<>{}`+=~|.!?,])+\s"," ",test)

' Melhor pessoa desse bbb '

In [73]:
re.sub(r"\shttps([a-zA-Zà-úÀ-Ú0-9]|[^a-zA-Zà-úÀ-Ú0-9])+$|^https([a-zA-Zà-úÀ-Ú0-9]|[^a-zA-Zà-úÀ-Ú0-9])+\s|\shttps([a-zA-Zà-úÀ-Ú0-9]|[^a-zA-Zà-úÀ-Ú0-9])+\s"," ",test)

'Melhor pessoa desse bbb '

In [None]:
https([a-zA-Zà-úÀ-Ú0-9]|[^a-zA-Zà-úÀ-Ú0-9])+\s

In [13]:
def function_to_calc_histogram(x,initial_interval, final_interval,n_bins,indice=False):

    interval = np.linspace(initial_interval, final_interval, num=n_bins)

    for j,i in enumerate(interval):


        if i == interval[len(interval)-1]:

            if x>=i:

                
                if indice:

                    return j
                
                else:
                    
                    return "{}<".format(x)


        else:

            if x>=i and x<interval[j+1]:

                inicial = round(i, 1)

                final = round(interval[j+1],1)

            
                if indice:

                    return j
                
                else:
                    
                    return "[{},{})".format(inicial,final)


# Criação de uma coluna com os textos sem repetição de palavras para ser utilizado na análise exploratória

In [14]:
data_tweets_final['text_unique_words'] = data_tweets_final['text'].apply(lambda x: convert_text_to_no_repeat_words(x))

# Calculo Número de tokens

In [15]:
data_tweets_final['number_tokens'] = data_tweets_final['text'].apply(lambda x: calculate_number_words(x))

# Calculo Número de diferentes tokens

In [16]:
data_tweets_final['number_diferent_tokens'] = data_tweets_final['text'].apply(lambda x: calculate_number_diferent_words(x))

# Máximo número de tokens

In [17]:
max_count = data_tweets_final["number_tokens"].max()

In [18]:
data_tweets_final["number_tokens"].var()

106.84171271271282

# Mínimo número de tokens

In [19]:
min_count = data_tweets_final["number_tokens"].min()

## Dados para o histograma 

In [20]:
data_tweets_final['bins'] = data_tweets_final['number_tokens'].apply(lambda x: function_to_calc_histogram(x,initial_interval = min_count, final_interval = max_count,n_bins = 10,indice=False))

In [21]:
data_tweets_final['indices_bins'] = data_tweets_final['number_tokens'].apply(lambda x: function_to_calc_histogram(x,initial_interval = min_count, final_interval = max_count,n_bins = 10,indice=True))

In [22]:
data_histogram = data_tweets_final.groupby(["bins","indices_bins"]).sum().sort_values(by=["indices_bins"])

In [23]:
data_histogram.reset_index(drop=False,inplace=True)

In [24]:
data_histogram

Unnamed: 0,bins,indices_bins,number_tokens,number_diferent_tokens
0,"[1.0,7.2)",0,1581,1576
1,"[7.2,13.4)",1,3225,3175
2,"[13.4,19.7)",2,2376,2290
3,"[19.7,25.9)",3,2195,2058
4,"[25.9,32.1)",4,1223,1129
5,"[32.1,38.3)",5,1338,1226
6,"[38.3,44.6)",6,1341,1197
7,"[44.6,50.8)",7,433,370
8,"[50.8,57.0)",8,104,91
9,57<,9,57,48


In [25]:
data_histogram["bins"].tolist()

['[1.0,7.2)',
 '[7.2,13.4)',
 '[13.4,19.7)',
 '[19.7,25.9)',
 '[25.9,32.1)',
 '[32.1,38.3)',
 '[38.3,44.6)',
 '[44.6,50.8)',
 '[50.8,57.0)',
 '57<']

In [26]:
data_histogram["number_tokens"].tolist()

[1581, 3225, 2376, 2195, 1223, 1338, 1341, 433, 104, 57]

# DF top 10 MEAN

In [27]:
df_report_mean = plot_bar_count_words(text_column='text',
                                                dataframe=data_tweets_final,
                                                metric='MEAN',top=10,return_df=True)

In [28]:
df_report_mean["MEAN"].tolist()

[0.924, 0.319, 0.319, 0.284, 0.252, 0.25, 0.213, 0.165, 0.148, 0.139]

In [29]:
df_report_mean["WORDS"].tolist()

['globoplay',
 'co',
 'https',
 'de',
 'que',
 'rafakalimann_',
 'eu',
 'no',
 'do',
 'da']

# DF top 10 SUM docs

In [30]:
df_report_sum = plot_bar_count_words(text_column='text_unique_words',
                                                dataframe=data_tweets_final,
                                                metric='SUM',top=10,return_df=True)

In [31]:
df_report_sum["P_DOCS"] =  df_report_sum["SUM"]/len(data_tweets_final)

In [32]:
df_report_sum

Unnamed: 0,SUM,WORDS,P_DOCS
1345,906,globoplay,0.906
1455,319,https,0.319
669,319,co,0.319
2491,244,rafakalimann_,0.244
824,225,de,0.225
2461,205,que,0.205
1127,178,eu,0.178
2010,145,no,0.145
802,126,da,0.126
940,116,do,0.116


# DF top 10 SUM

In [33]:
df_report_sum_docs = plot_bar_count_words(text_column='text',
                                                dataframe=data_tweets_final,
                                                metric='SUM',top=10,return_df=True)

In [34]:
df_report_sum_docs

Unnamed: 0,SUM,WORDS
1345,924,globoplay
669,319,co
1455,319,https
824,284,de
2461,252,que
2491,250,rafakalimann_
1127,213,eu
2010,165,no
940,148,do
802,139,da


# DF top 10 MEAN TF-IDF

In [35]:
df_report_tfidf_mean = plot_bar_tf_idf(text_column='text',
                                                dataframe=data_tweets_final,
                                                metric='MEAN',top=10,return_df=True)

In [36]:
df_report_tfidf_mean

Unnamed: 0,MEAN,WORDS
1345,0.072245,globoplay
2491,0.056826,rafakalimann_
669,0.04132,co
1455,0.04132,https
824,0.034915,de
2461,0.030825,que
1127,0.027536,eu
3104,0.027514,você
1849,0.024422,merece
2010,0.023554,no


# DF top 10 MAX TF-IDF

In [37]:
df_report_tfidf_max = plot_bar_tf_idf(text_column='text',
                                                dataframe=data_tweets_final,
                                                metric='MAX',top=10,return_df=True)

In [38]:
df_report_tfidf_max

Unnamed: 0,MAX,WORDS
2491,0.12501,rafakalimann_
3104,0.094182,você
1849,0.089983,merece
2091,0.080011,orgulho
3040,0.077694,vc
2190,0.07411,paulovieirareal
824,0.073977,de
669,0.072069,co
1455,0.072069,https
242,0.068867,amor
