In [29]:
import requests
import os
import json
from api_keys import BEARER_TOKEN
from tqdm import tqdm
import os
import pandas as pd
from datetime import datetime
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Funções (Análise Exploratória)

### Função para plotar bar plot com a contagem de tokens

In [2]:
def plot_bar_count_words(text_column=None,
                         label_column=None,
                         name_class=None,
                         dataframe=None,
                         metric='SUM',
                         top=50,return_df=True):
    
    corpus = dataframe[text_column].values
    
    vectorizer = CountVectorizer()
    data_vect = vectorizer.fit_transform(corpus)
    data_vect = data_vect.toarray()
    
    df_count_words =  pd.DataFrame({
    "WORDS":vectorizer.get_feature_names(),
    "MEAN":data_vect.mean(axis=0),
    "SUM":data_vect.sum(axis=0),
    "STD":data_vect.std(axis=0),
    }) 
    
    

    if return_df:
    
        return df_count_words[[metric,'WORDS']].sort_values(by=[metric],ascending=False)[0:top]
    
    else:
        
        fig = plt.figure(figsize=(15,10))
        
        ax = sns.barplot(x=metric, 
                 y="WORDS", 
                 data=df_count_words[[metric,'WORDS']].sort_values(by=[metric],
                                                                            ascending=False)[0:top])


### Função para plotar bar plot com tf-idf

In [3]:
def plot_bar_tf_idf(text_column=None,
                         label_column=None,
                         name_class=None,
                         dataframe=None,
                         metric='SUM',
                         top=50,return_df=True):
    
    corpus = dataframe[text_column].values
    
    vectorizer = TfidfVectorizer()
    data_vect = vectorizer.fit_transform(corpus)
    data_vect = data_vect.toarray()
    
    df_count_words =  pd.DataFrame({
    "WORDS":vectorizer.get_feature_names(),
    "MEAN":data_vect.mean(axis=0),
    "SUM":data_vect.sum(axis=0),
    "STD":data_vect.std(axis=0),
    "MAX":data_vect.std(axis=0)
    }) 
    
    

    if return_df:
    
        return df_count_words[[metric,'WORDS']].sort_values(by=[metric],ascending=False)[0:top]
    
    else:
        
        fig = plt.figure(figsize=(15,10))
        
        ax = sns.barplot(x=metric, 
                 y="WORDS", 
                 data=df_count_words[[metric,'WORDS']].sort_values(by=[metric],
                                                                            ascending=False)[0:top])


### Função para contagem de tokens

In [4]:
def calculate_number_words(text):

    quantity_of_words = text.split(" ")

    quantity_of_words = [i for i in quantity_of_words if i!=""]

    quantity_of_words = len(quantity_of_words)

    return quantity_of_words


### Função para contagem de diferentes tokens

In [5]:
def calculate_number_diferent_words(text):

    quantity_of_diferent_words = text.split(" ")

    quantity_of_diferent_words = [i for i in quantity_of_diferent_words if i!=""]

    quantity_of_diferent_words = set(quantity_of_diferent_words)

    quantity_of_diferent_words = list(quantity_of_diferent_words)

    quantity_of_diferent_words = len(quantity_of_diferent_words)

    return quantity_of_diferent_words


### Função para criar textos sem repetição de palavras para ser utilizado na análise exploratória 

In [6]:
def convert_text_to_no_repeat_words(text):

    text_with_no_repeat_words = text.split(" ")

    text_with_no_repeat_words = [i for i in text_with_no_repeat_words if i!=""]

    text_with_no_repeat_words = set(text_with_no_repeat_words)

    text_with_no_repeat_words = list(text_with_no_repeat_words)

    text_with_no_repeat_words = " ".join(text_with_no_repeat_words)

    return text_with_no_repeat_words

### Função para o pré-processamento do texto 

In [7]:
def text_cleaner(text):
    
    nltk_stopwords = stopwords.words('portuguese')

    collection_text = [ {"text" : text}]
    text = pd.DataFrame(collection_text)

    text['text'] = text['text'].astype('str')
    text['text'] = text['text'].str.lower()
    text['text'] = text['text'].str.replace('\n',' ')
    text['text'] = text['text'].str.replace('\r',' ')
    text['text'] = text['text'].apply(lambda x: norm('NFKD', x).encode('ascii', 'ignore').decode())
    text['text'] = text['text'].apply(lambda x: re.sub(r'[^a-zA-Z0-9]',' ',x))
    text['text'] = text['text'].apply(lambda x: re.sub(r'\s+',' ',x))
    pat = r'\b(?:{})\b'.format('|'.join(nltk_stopwords))
    text['text'] = text['text'].str.replace(pat,'')
    text = text['text'].values[0]

    return text

# Funções (Extração de Tweets)

In [8]:
# To set your enviornment variables in your terminal run the following line:
# export 'BEARER_TOKEN'='<your_bearer_token>'


def create_url(query = "@globoplay -is:retweet",until_id=None):
    
    #query = "@BBB -is:retweet"
    #"from:twitterdev -is:retweet"
    # Tweet fields are adjustable.
    # Options include:
    # attachments, author_id, context_annotations,
    # conversation_id, created_at, entities, geo, id,
    # in_reply_to_user_id, lang, non_public_metrics, organic_metrics,
    # possibly_sensitive, promoted_metrics, public_metrics, referenced_tweets,
    # source, text, and withheld
    
    if until_id:
        
        url = "https://api.twitter.com/2/tweets/search/recent?query={}&max_results=100&until_id={}".format(
            query,until_id
        )
        
    else:
        
        url = "https://api.twitter.com/2/tweets/search/recent?query={}&max_results=100".format(
            query
        )
            
    return url


def create_headers(bearer_token):
    headers = {"Authorization": "Bearer {}".format(bearer_token)}
    return headers


def connect_to_endpoint(url, headers):
    response = requests.request("GET", url, headers=headers)
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()


def extract_100_tweets(query = "@BBB -is:retweet",until_id=None):
    bearer_token = BEARER_TOKEN
    url = create_url(query,until_id)
    headers = create_headers(bearer_token)
    json_response = connect_to_endpoint(url, headers)
    data_tweets = json.dumps(json_response, indent=4, sort_keys=True)
    return json_response

def extract_many_tweets(qnt_cycle=10,folder="data_tweets",start_from_id=None,query="@BBB"):
    
    
    oldest_id = None
    
    for i in tqdm(range(qnt_cycle)):
    
        
        if i == 0:
            
            #extract the 100 tweets first
            
            if start_from_id:
        
                data_tweets = extract_100_tweets(query = "{} -is:retweet".format(query),until_id=None)
            
            else:
                
                data_tweets = extract_100_tweets(query = "{} -is:retweet".format(query),until_id=start_from_id)
                
            
            df_data_tweets_temp = pd.DataFrame(data_tweets["data"])
            
            #get the current date
            
            date_extraction = datetime.now()
            
            df_data_tweets_temp["date_extraction"] = date_extraction 
            
            
            oldest_id = data_tweets['meta']['oldest_id']
            
            oldest_date = date_extraction
            
            df_data_tweets = df_data_tweets_temp.copy()
            
            # name file
            
            date_extraction_str = str(date_extraction).replace(".","-").replace(":","-").replace(" ","-")
            
            name_file = "./{}/persist_tweets_{}_{}.csv".format(folder,date_extraction_str,date_extraction_str)
            
            # persist base
            
            df_data_tweets.to_csv(name_file,sep=",")
            
    
            
        else:
            
            
            #extract more 100 tweets older
            
            data_tweets_temp = extract_100_tweets(query = "{} -is:retweet".format(query),until_id=oldest_id)
            
            df_data_tweets_temp = pd.DataFrame(data_tweets_temp["data"])
            
            
            #get the current date
            
            
            date_extraction = datetime.now()
            
            df_data_tweets_temp["date_extraction"] = date_extraction 
            
            oldest_id = data_tweets_temp['meta']['oldest_id']
            
            df_data_tweets = pd.concat([df_data_tweets,df_data_tweets_temp.copy()])
            
            date_extraction = datetime.now()
            
            df_data_tweets.reset_index(inplace=True,drop=True)
            
            
            # remove old files
            
            os.remove(name_file)
            
            
            # name file
            
            oldest_date_str = str(oldest_date).replace(".","-").replace(":","-").replace(" ","-")
            
            date_extraction_str = str(date_extraction).replace(".","-").replace(":","-").replace(" ","-")
            
            
            name_file = "./{}/persist_tweets_{}_{}.csv".format(folder,oldest_date_str,date_extraction_str)
            
            # persist base
            
            df_data_tweets.to_csv(name_file.format(folder),sep=",")
            
            

    return df_data_tweets
    
    


# Extração de Tweets

In [13]:
data_tweets_final = extract_many_tweets(qnt_cycle=10,folder="data_tweets",query="globoplay")#,start_from_id="1367600965277384706")

100%|██████████| 10/10 [00:10<00:00,  1.08s/it]


In [14]:
data_tweets_final.head(30)

Unnamed: 0,id,text,date_extraction
0,1382485715888566274,GNT To pistola que perdi a novela!!!!!! como a...,2021-04-14 21:08:04.180408
1,1382485694036246533,@allana_lara1 @globoplay Falei pra assinar o P...,2021-04-14 21:08:04.180408
2,1382485540809936898,Tava aqui lutando com o globoplay pra assistir...,2021-04-14 21:08:04.180408
3,1382485532291256323,@rrrrrulia to procurando no globoplay o capitu...,2021-04-14 21:08:04.180408
4,1382485509302329344,"netflix, prime vídeo, globoplay, disney+ e eu ...",2021-04-14 21:08:04.180408
5,1382485496140546048,https://t.co/kyZ2a8Kp5w,2021-04-14 21:08:04.180408
6,1382485467568934912,@PortalTracklist Eu assino Globoplay para assi...,2021-04-14 21:08:04.180408
7,1382485422522122246,Tô dando contas:\nGLOBOPLAY\nTELECINE\nQuem qu...,2021-04-14 21:08:04.180408
8,1382485420143951874,alguém de bom coração pra me passar a conta do...,2021-04-14 21:08:04.180408
9,1382485268603809793,globoplay você prometeu,2021-04-14 21:08:04.180408


In [27]:
data_tweets_final

Unnamed: 0,id,text,date_extraction,text_unique_words,number_tokens,number_diferent_tokens
0,1382485715888566274,GNT To pistola que perdi a novela!!!!!! como a...,2021-04-14 21:08:04.180408,achando n globoplay? pistola perdi que novela!...,20,20
1,1382485694036246533,@allana_lara1 @globoplay Falei pra assinar o P...,2021-04-14 21:08:04.180408,Prime @allana_lara1 pra o vídeo @globoplay Fal...,8,8
2,1382485540809936898,Tava aqui lutando com o globoplay pra assistir...,2021-04-14 21:08:04.180408,de Tava o pra achando aqui hoje quinta... fase...,19,17
3,1382485532291256323,@rrrrrulia to procurando no globoplay o capitu...,2021-04-14 21:08:04.180408,hj procurando de e o globoplay n achando!!!!! ...,13,12
4,1382485509302329344,"netflix, prime vídeo, globoplay, disney+ e eu ...",2021-04-14 21:08:04.180408,"xvideos prime e o queria netflix, globoplay, f...",18,17
...,...,...,...,...,...,...
990,1382418258876370956,@FsPFeVC @magavassidiaz Só na globoplay no mom...,2021-04-14 21:08:13.938277,@FsPFeVC Só na globoplay @magavassidiaz moment...,7,7
991,1382418211723968514,Titchela já divulgou o #CasaKalimanm e vcs? ❤️...,2021-04-14 21:08:13.938277,divulgou ❤️\n\nDia Titchela e o 28/04 já @glob...,14,14
992,1382418206061694984,assisti esse umas 30x e sempre passo mal\n htt...,2021-04-14 21:08:13.938277,passo https://t.co/Oug4PRKAY6 e sempre assisti...,9,9
993,1382418192161722377,"Passou, floodou\nReprodução: @globoplay @TVGlo...",2021-04-14 21:08:13.938277,"Passou, @TVGlobo\n\nhttps://t.co/Jer1OzNrO0 fl...",4,4


In [30]:
np.arange(0,1,5)

array([0])

In [34]:
interval

array([0.  , 1.25, 2.5 , 3.75, 5.  ])

In [38]:
def function_to_calc_histogram(initial_interval, final_interval,n_bins,x):

    interval = np.linspace(initial_interval, final_interval, num=n_bins)

    for j,i in enumerate(interval):


        if i == interval[len(interval)-1]:

            if x>=i:

                print("{}<".format(x))

                return "{}<".format(x)


        else:

            if x>=i and x<interval[j+1]:

                inicial = round(i, 1)

                final = round(interval[j+1],1)

                print("[{},{})".format(inicial,final))

                return "[{},{})".format(inicial,final)

0
0.0
1
1.25
2
2.5
[2.5,3.8)
3
3.75
4
5.0
erro


# Criação de uma coluna com os textos sem repetição de palavras para ser utilizado na análise exploratória

In [15]:
data_tweets_final['text_unique_words'] = data_tweets_final['text'].apply(lambda x: convert_text_to_no_repeat_words(x))

# Calculo Número de tokens

In [16]:
data_tweets_final['number_tokens'] = data_tweets_final['text'].apply(lambda x: calculate_number_words(x))

# Calculo Número de diferentes tokens

In [17]:
data_tweets_final['number_diferent_tokens'] = data_tweets_final['text'].apply(lambda x: calculate_number_diferent_words(x))

# DF top 10 MEAN

In [18]:
df_report_mean = plot_bar_count_words(text_column='text',
                                                dataframe=data_tweets_final,
                                                metric='MEAN',top=10,return_df=True)

In [19]:
df_report_mean["MEAN"].tolist()

[0.9758793969849247,
 0.3829145728643216,
 0.3829145728643216,
 0.33768844221105526,
 0.3035175879396985,
 0.23919597989949748,
 0.18592964824120603,
 0.1829145728643216,
 0.16683417085427135,
 0.1527638190954774]

In [20]:
df_report_mean["WORDS"].tolist()

['globoplay', 'https', 'co', 'que', 'de', 'no', 'da', 'eu', 'não', 'do']

# DF top 10 SUM docs

In [23]:
df_report_sum = plot_bar_count_words(text_column='text_unique_words',
                                                dataframe=data_tweets_final,
                                                metric='SUM',top=10,return_df=True)

In [25]:
df_report_sum["P_DOCS"] =  df_report_sum["SUM"]/len(data_tweets_final)

In [26]:
df_report_sum

Unnamed: 0,SUM,WORDS,P_DOCS
1641,950,globoplay,0.954774
841,380,co,0.38191
1742,380,https,0.38191
2830,258,que,0.259296
1031,252,de,0.253266
2333,213,no,0.21407
1006,158,da,0.158794
1382,153,eu,0.153769
2375,143,não,0.143719
2290,135,na,0.135678


# DF top 10 SUM

In [None]:
df_report_sum_docs = plot_bar_count_words(text_column='text',
                                                dataframe=data_tweets_final,
                                                metric='SUM',top=10,return_df=True)

In [None]:
df_report_sum_docs

# DF top 10 MEAN TF-IDF

In [None]:
df_report_tfidf_mean = plot_bar_tf_idf(text_column='text',
                                                dataframe=data_tweets_final,
                                                metric='MEAN',top=10,return_df=True)

In [None]:
df_report_tfidf_mean

# DF top 10 MAX TF-IDF

In [None]:
df_report_tfidf_max = plot_bar_tf_idf(text_column='text',
                                                dataframe=data_tweets_final,
                                                metric='MAX',top=10,return_df=True)

In [None]:
df_report_tfidf_max