In [1]:
import requests
import os
import json
from api_keys import BEARER_TOKEN
from tqdm import tqdm
import os
import pandas as pd
from datetime import datetime
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Fun√ß√µes (An√°lise Explorat√≥ria)

### Fun√ß√£o para plotar bar plot com a contagem de tokens

In [2]:
def plot_bar_count_words(text_column=None,
                         label_column=None,
                         name_class=None,
                         dataframe=None,
                         metric='SUM',
                         top=50,return_df=True):
    
    corpus = dataframe[text_column].values
    
    vectorizer = CountVectorizer()
    data_vect = vectorizer.fit_transform(corpus)
    data_vect = data_vect.toarray()
    
    df_count_words =  pd.DataFrame({
    "WORDS":vectorizer.get_feature_names(),
    "MEAN":data_vect.mean(axis=0),
    "SUM":data_vect.sum(axis=0),
    "STD":data_vect.std(axis=0),
    }) 
    
    

    if return_df:
    
        return df_count_words[[metric,'WORDS']].sort_values(by=[metric],ascending=False)[0:top]
    
    else:
        
        fig = plt.figure(figsize=(15,10))
        
        ax = sns.barplot(x=metric, 
                 y="WORDS", 
                 data=df_count_words[[metric,'WORDS']].sort_values(by=[metric],
                                                                            ascending=False)[0:top])


### Fun√ß√£o para plotar bar plot com tf-idf

In [3]:
def plot_bar_tf_idf(text_column=None,
                         label_column=None,
                         name_class=None,
                         dataframe=None,
                         metric='SUM',
                         top=50,return_df=True):
    
    corpus = dataframe[text_column].values
    
    vectorizer = TfidfVectorizer()
    data_vect = vectorizer.fit_transform(corpus)
    data_vect = data_vect.toarray()
    
    df_count_words =  pd.DataFrame({
    "WORDS":vectorizer.get_feature_names(),
    "MEAN":data_vect.mean(axis=0),
    "SUM":data_vect.sum(axis=0),
    "STD":data_vect.std(axis=0),
    "MAX":data_vect.std(axis=0)
    }) 
    
    

    if return_df:
    
        return df_count_words[[metric,'WORDS']].sort_values(by=[metric],ascending=False)[0:top]
    
    else:
        
        fig = plt.figure(figsize=(15,10))
        
        ax = sns.barplot(x=metric, 
                 y="WORDS", 
                 data=df_count_words[[metric,'WORDS']].sort_values(by=[metric],
                                                                            ascending=False)[0:top])


### Fun√ß√£o para contagem de tokens

In [4]:
def calculate_number_words(text):

    quantity_of_words = text.split(" ")

    quantity_of_words = [i for i in quantity_of_words if i!=""]

    quantity_of_words = len(quantity_of_words)

    return quantity_of_words


### Fun√ß√£o para contagem de diferentes tokens

In [5]:
def calculate_number_diferent_words(text):

    quantity_of_diferent_words = text.split(" ")

    quantity_of_diferent_words = [i for i in quantity_of_diferent_words if i!=""]

    quantity_of_diferent_words = set(quantity_of_diferent_words)

    quantity_of_diferent_words = list(quantity_of_diferent_words)

    quantity_of_diferent_words = len(quantity_of_diferent_words)

    return quantity_of_diferent_words


### Fun√ß√£o para criar textos sem repeti√ß√£o de palavras para ser utilizado na an√°lise explorat√≥ria 

In [6]:
def convert_text_to_no_repeat_words(text):

    text_with_no_repeat_words = text.split(" ")

    text_with_no_repeat_words = [i for i in text_with_no_repeat_words if i!=""]

    text_with_no_repeat_words = set(text_with_no_repeat_words)

    text_with_no_repeat_words = list(text_with_no_repeat_words)

    text_with_no_repeat_words = " ".join(text_with_no_repeat_words)

    return text_with_no_repeat_words

### Fun√ß√£o para o pr√©-processamento do texto 

In [7]:
def text_cleaner(text):
    
    nltk_stopwords = stopwords.words('portuguese')

    collection_text = [ {"text" : text}]
    text = pd.DataFrame(collection_text)

    text['text'] = text['text'].astype('str')
    text['text'] = text['text'].str.lower()
    text['text'] = text['text'].str.replace('\n',' ')
    text['text'] = text['text'].str.replace('\r',' ')
    text['text'] = text['text'].apply(lambda x: norm('NFKD', x).encode('ascii', 'ignore').decode())
    text['text'] = text['text'].apply(lambda x: re.sub(r'[^a-zA-Z0-9]',' ',x))
    text['text'] = text['text'].apply(lambda x: re.sub(r'\s+',' ',x))
    pat = r'\b(?:{})\b'.format('|'.join(nltk_stopwords))
    text['text'] = text['text'].str.replace(pat,'')
    text = text['text'].values[0]

    return text

# Fun√ß√µes (Extra√ß√£o de Tweets)

In [8]:
# To set your enviornment variables in your terminal run the following line:
# export 'BEARER_TOKEN'='<your_bearer_token>'


def create_url(query = "@globoplay -is:retweet",until_id=None):
    
    #query = "@BBB -is:retweet"
    #"from:twitterdev -is:retweet"
    # Tweet fields are adjustable.
    # Options include:
    # attachments, author_id, context_annotations,
    # conversation_id, created_at, entities, geo, id,
    # in_reply_to_user_id, lang, non_public_metrics, organic_metrics,
    # possibly_sensitive, promoted_metrics, public_metrics, referenced_tweets,
    # source, text, and withheld
    
    if until_id:
        
        url = "https://api.twitter.com/2/tweets/search/recent?query={}&max_results=100&until_id={}".format(
            query,until_id
        )
        
    else:
        
        url = "https://api.twitter.com/2/tweets/search/recent?query={}&max_results=100".format(
            query
        )
            
    return url


def create_headers(bearer_token):
    headers = {"Authorization": "Bearer {}".format(bearer_token)}
    return headers


def connect_to_endpoint(url, headers):
    response = requests.request("GET", url, headers=headers)
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()


def extract_100_tweets(query = "@BBB -is:retweet",until_id=None):
    bearer_token = BEARER_TOKEN
    url = create_url(query,until_id)
    headers = create_headers(bearer_token)
    json_response = connect_to_endpoint(url, headers)
    data_tweets = json.dumps(json_response, indent=4, sort_keys=True)
    return json_response

def extract_many_tweets(qnt_cycle=10,folder="data_tweets",start_from_id=None,query="@BBB"):
    
    
    oldest_id = None
    
    for i in tqdm(range(qnt_cycle)):
    
        
        if i == 0:
            
            #extract the 100 tweets first
            
            if start_from_id:
        
                data_tweets = extract_100_tweets(query = "{} -is:retweet".format(query),until_id=None)
            
            else:
                
                data_tweets = extract_100_tweets(query = "{} -is:retweet".format(query),until_id=start_from_id)
                
            
            df_data_tweets_temp = pd.DataFrame(data_tweets["data"])
            
            #get the current date
            
            date_extraction = datetime.now()
            
            df_data_tweets_temp["date_extraction"] = date_extraction 
            
            
            oldest_id = data_tweets['meta']['oldest_id']
            
            oldest_date = date_extraction
            
            df_data_tweets = df_data_tweets_temp.copy()
            
            # name file
            
            date_extraction_str = str(date_extraction).replace(".","-").replace(":","-").replace(" ","-")
            
            name_file = "./{}/persist_tweets_{}_{}.csv".format(folder,date_extraction_str,date_extraction_str)
            
            # persist base
            
            df_data_tweets.to_csv(name_file,sep=",")
            
    
            
        else:
            
            
            #extract more 100 tweets older
            
            data_tweets_temp = extract_100_tweets(query = "{} -is:retweet".format(query),until_id=oldest_id)
            
            df_data_tweets_temp = pd.DataFrame(data_tweets_temp["data"])
            
            
            #get the current date
            
            
            date_extraction = datetime.now()
            
            df_data_tweets_temp["date_extraction"] = date_extraction 
            
            oldest_id = data_tweets_temp['meta']['oldest_id']
            
            df_data_tweets = pd.concat([df_data_tweets,df_data_tweets_temp.copy()])
            
            date_extraction = datetime.now()
            
            df_data_tweets.reset_index(inplace=True,drop=True)
            
            
            # remove old files
            
            os.remove(name_file)
            
            
            # name file
            
            oldest_date_str = str(oldest_date).replace(".","-").replace(":","-").replace(" ","-")
            
            date_extraction_str = str(date_extraction).replace(".","-").replace(":","-").replace(" ","-")
            
            
            name_file = "./{}/persist_tweets_{}_{}.csv".format(folder,oldest_date_str,date_extraction_str)
            
            # persist base
            
            df_data_tweets.to_csv(name_file.format(folder),sep=",")
            
            

    return df_data_tweets
    
    


# Extra√ß√£o de Tweets

In [9]:
data_tweets_final = extract_many_tweets(qnt_cycle=10,folder="data_tweets",query="globoplay")#,start_from_id="1367600965277384706")

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [00:10<00:00,  1.03s/it]


In [10]:
data_tweets_final.head(30)

Unnamed: 0,id,text,date_extraction
0,1381322101693546496,@gshow @Karolconka @globoplay üòÇüòÇüòÇüòÇüòÇüòÇüòÇüòÇ,2021-04-11 16:04:00.114498
1,1381322099709607941,@edwinmylife #bbb21 #RedeBBB juliette gil gen...,2021-04-11 16:04:00.114498
2,1381322087688732677,"@savioemanueI √© pq se voc√™ parar pra ver, hoje...",2021-04-11 16:04:00.114498
3,1381322069279932421,@gshow @Karolconka @globoplay Nossa que lixo e...,2021-04-11 16:04:00.114498
4,1381322063575715842,"@globoplay Use o PicPay para pagar amigos, bol...",2021-04-11 16:04:00.114498
5,1381322012791091211,Vou at√© assinar a Globoplay pra assistir. Mama...,2021-04-11 16:04:00.114498
6,1381322009800536067,"@gshow @Karolconka @globoplay N√£o, obrigada.",2021-04-11 16:04:00.114498
7,1381321990263439360,@gshow @Karolconka @globoplay Eu sabia que voc...,2021-04-11 16:04:00.114498
8,1381321977483382784,Poderia entrar no WWE Network. Uma das maiores...,2021-04-11 16:04:00.114498
9,1381321970990604292,@fefejunqueira @globoplay Gente o Alecrim Dour...,2021-04-11 16:04:00.114498


# Cria√ß√£o de uma coluna com os textos sem repeti√ß√£o de palavras para ser utilizado na an√°lise explorat√≥ria

In [11]:
data_tweets_final['text_unique_words'] = data_tweets_final['text'].apply(lambda x: convert_text_to_no_repeat_words(x))

# Calculo N√∫mero de tokens

In [12]:
data_tweets_final['number_tokens'] = data_tweets_final['text'].apply(lambda x: calculate_number_words(x))

# Calculo N√∫mero de diferentes tokens

In [13]:
data_tweets_final['number_diferent_tokens'] = data_tweets_final['text'].apply(lambda x: calculate_number_diferent_words(x))

# DF top 10 MEAN

In [14]:
df_report_mean = plot_bar_count_words(text_column='text',
                                                dataframe=data_tweets_final,
                                                metric='MEAN',top=10,return_df=True)

In [22]:
df_report_mean["MEAN"].tolist()

[0.8588588588588588,
 0.46546546546546547,
 0.4644644644644645,
 0.35035035035035034,
 0.31931931931931934,
 0.2182182182182182,
 0.21521521521521522,
 0.2062062062062062,
 0.17417417417417416,
 0.16816816816816818]

In [23]:
df_report_mean["WORDS"].tolist()

['globoplay', 'co', 'https', 'que', 'de', 'n√£o', 'no', 'eu', 'do', 'da']

# DF top 10 SUM

In [16]:
df_report_sum = plot_bar_count_words(text_column='text',
                                                dataframe=data_tweets_final,
                                                metric='SUM',top=10,return_df=True)

In [17]:
df_report_sum

Unnamed: 0,SUM,WORDS
1468,858,globoplay
702,465,co
1560,464,https
2686,350,que
892,319,de
2242,218,n√£o
2205,215,no
1228,206,eu
1031,174,do
872,168,da


# DF top 10 MEAN TF-IDF

In [18]:
df_report_tfidf_mean = plot_bar_tf_idf(text_column='text',
                                                dataframe=data_tweets_final,
                                                metric='MEAN',top=10,return_df=True)

In [19]:
df_report_tfidf_mean

Unnamed: 0,MEAN,WORDS
1468,0.065795,globoplay
702,0.051229,co
1560,0.051157,https
2686,0.03761,que
892,0.033094,de
2242,0.027808,n√£o
1228,0.026489,eu
2205,0.026464,no
872,0.023769,da
1031,0.022167,do


# DF top 10 MAX TF-IDF

In [20]:
df_report_tfidf_max = plot_bar_tf_idf(text_column='text',
                                                dataframe=data_tweets_final,
                                                metric='MAX',top=10,return_df=True)

In [21]:
df_report_tfidf_max

Unnamed: 0,MAX,WORDS
1760,0.073675,karolconka
1513,0.070967,gshow
2686,0.070023,que
2242,0.069073,n√£o
1468,0.067267,globoplay
1228,0.065782,eu
1560,0.065682,https
702,0.065635,co
872,0.063615,da
892,0.062431,de
