## Libraries

In [16]:
# For sending GET requests from the API
import requests
# For saving access tokens and for file management when creating and adding to the dataset
import os
# For dealing with json responses we receive from the API
import json
# For displaying the data after
import pandas as pd
# For saving the response data in CSV format
import csv
# For parsing the dates received from twitter in readable formats
import datetime
import dateutil.parser
import unicodedata
#To add wait time between requests
import time
import seaborn as sns
import matplotlib.pyplot as plt 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.corpus import stopwords
import nltk

from unicodedata import normalize as norm
import re

In [17]:
df_tweets = pd.read_csv("df_tweets.csv")
df_tweets.head()

Unnamed: 0,conversation_id,in_reply_to_user_id,public_metrics,created_at,author_id,id,text
0,1466874887524421632,2230226000.0,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",2021-12-04T23:05:28.000Z,1130648708566179841,1467268847745081346,@UniversalPicsBr @anygabrielly @Fiuk quero tan...
1,1467268691884654595,,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",2021-12-04T23:04:51.000Z,1355987894603837442,1467268691884654595,Tata Werneck dá show de maturidade e pede fim ...
2,1467249854728609797,1.441891e+18,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",2021-12-04T23:03:56.000Z,1465436751388885002,1467268463433560066,@Claudia77613373 @afazendarecord @dynhoalvesre...
3,1467267671632887816,,"{'retweet_count': 0, 'reply_count': 3, 'like_c...",2021-12-04T23:00:48.000Z,185448711,1467267671632887816,Eu e mainha botamos o lady night com Fiuk aqui...
4,1467132306989191168,1.418676e+18,"{'retweet_count': 0, 'reply_count': 1, 'like_c...",2021-12-04T22:59:40.000Z,1081607577778225153,1467267388383109120,@yaeminion Alguem me explicar por que o Fiuk é...


In [18]:

stop_words_domain=["não","da","globoplay",
                    "só","pra","vc","pois","lá","outro",
                    "outra","vou","vão","assim","outro",
                    "outra","ter","ver","agora","hoje",
                    "tudo","todos","todo","ah","acho",
                    "achamos","né","ser","vai","alguma",
                    "mas","porém","entretanto",
                    "faz","fazemos","farão",
                    "tbm","fazia","tá","tb","ia",
                    "ir","to","nela","nele","nelas",
                    "neles","naquele","naquueles",
                    "naquelas","naquela","coisa","mim",
                    "tô","aí","n",
                    "pro","é","dessa","vamos","q",
                    "desse","tava","msm","vamo","que","porque",
                    "nem","mano","manos","caras","xd","kkkk","pq","por","cara",
                    "gente","dar","sobre","tão","toda","vezes",
                    "então","viu","vemos","pode","podemos","vez",
                    "vcs","hein","quer","sim","deu","já","demos",
                    "todas","aqui","sei","sabemos","fazer","fiz",
                    "fez","fazemos","vem","vamos","ainda","tanto","nesse","pocah"] 

## Functions for exploratory analysis 

In [19]:
def plot_bar_count_words(text_column=None,
                         label_column=None,
                         name_class=None,
                         dataframe=None,
                         metric='SUM',
                         top=50,return_df=True):
    
    corpus = dataframe[text_column].values
    
 
    
    vectorizer = CountVectorizer()
    data_vect = vectorizer.fit_transform(corpus)
    data_vect = data_vect.toarray()
    
    print(vectorizer.get_feature_names())
    
    df_count_words =  pd.DataFrame({
    "WORDS":vectorizer.get_feature_names() ,
    "MEAN":data_vect.mean(axis=0),
    "SUM":data_vect.sum(axis=0),
    "STD":data_vect.std(axis=0),
    }) 
    
    

    if return_df:
    
        return df_count_words[[metric,'WORDS']].sort_values(by=[metric],ascending=False)[0:top]
    
    else:
        
        fig = plt.figure(figsize=(15,10))
        
        ax = sns.barplot(x=metric, 
                 y="WORDS", 
                 data=df_count_words[[metric,'WORDS']].sort_values(by=[metric],
                                                                            ascending=False)[0:top])
        

In [20]:

def convert_text_to_no_repeat_words(text):

    text_with_no_repeat_words = text.split(" ")

    text_with_no_repeat_words = [i for i in text_with_no_repeat_words if i!=""]

    text_with_no_repeat_words = set(text_with_no_repeat_words)

    text_with_no_repeat_words = list(text_with_no_repeat_words)

    text_with_no_repeat_words = " ".join(text_with_no_repeat_words)

    return text_with_no_repeat_words


In [37]:
def extract_hashtags(tweet):
    
    list_hashtags = re.findall(r"#[a-zA-Zà-úÀ-Ú0-9]+",tweet)
    
    string_only_hashtags = " ".join(list_hashtags)
    
    return string_only_hashtags


In [29]:
def extract_citation(tweet):
    
    list_hashtags = re.findall(r"@[a-zA-Zà-úÀ-Ú0-9]+",tweet)
    
    string_only_hashtags = " ".join(list_hashtags)
    
    return string_only_hashtags

## Function to clean the texts

In [22]:
def text_cleaner(text,stop_words_domain =[]):

    
    nltk_stopwords =  stopwords.words('portuguese') + stop_words_domain

    nltk_stopwords_processed = [norm('NFKD', i).encode('ascii', 'ignore').decode().lower() for i in nltk_stopwords]

    regex_stop_words = '|'.join(nltk_stopwords)

    
    regex_remove_https = 'https([a-zA-Zà-úÀ-Ú0-9]|[-()\#/@;:<>{}`+=~|.!?,])+'


    text_without_https = re.sub(r"(\s|^){0}(\s{0})*($|\s)".format(regex_remove_https)," ",text)


    text_without_special_caracteres = re.sub(r"[^a-zA-ZÀ-Úà-ú]+"," ",text_without_https)

    text_without_alone_caractere = re.sub(r"\s[a-zA-ZÀ-Úà-ú]\s|\s[a-zA-ZÀ-Úà-ú]$|^[a-zA-ZÀ-Úà-ú]\s"," ",text_without_special_caracteres)
    

    text_pattern_space = re.sub(r"\s+"," ",text_without_alone_caractere)

    
    text_split = text_pattern_space.split(" ")

    
    text_list = [i for i in text_split  if norm('NFKD', i).encode('ascii', 'ignore').decode().lower() not in nltk_stopwords_processed]


    text_final = " ".join(text_list)


    return text_final

# Test Plots

In [23]:
df_report_sum_docs = plot_bar_count_words(text_column='text_clean',
                                                dataframe=df_tweets,
                                                metric='SUM',top=10,return_df=True)

KeyError: 'text_clean'

In [24]:
df_report_sum_docs

Unnamed: 0,SUM,WORDS
8,17,#flamengo
2,8,#crf
20,3,#zanerattonomengao
16,2,#vamosflamengo
15,2,#tpf
12,2,#mercadodabola
0,2,#carloscarvalhal
6,2,#fla
10,2,#informei
7,1,#flacandinho


In [25]:
df_tweets["text_clean"] = df_tweets["text"].apply(lambda x: text_cleaner(text = x,stop_words_domain=stop_words_domain))

In [None]:
## Test get #

In [None]:
tweet = "duhudhuhd #tweet #tweet1 usuhsdsnsund #tweet2 jdisdisndsjdis #tweet3 ususjus"

In [None]:
re.search("#([a-zA-Zà-úÀ-Ú0-9]|[-()\#/@;:<>{}`+=~|.!?,])+",tweet).group()

In [None]:
#(?<![#@])\b\w+\b

re.findall("#[a-zA-Zà-úÀ-Ú0-9]+",tweet)

In [None]:
re.match("#([a-zA-Zà-úÀ-Ú0-9]|[-()\#/@;:<>{}`+=~|.!?,])+",tweet)

In [34]:
df_tweets["hashtags"] = df_tweets["text"].apply(lambda x: extract_hashtags(tweet = x))

In [35]:
df_tweets.head()

Unnamed: 0,conversation_id,in_reply_to_user_id,public_metrics,created_at,author_id,id,text,text_clean,hashtags,citations
0,1466874887524421632,2230226000.0,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",2021-12-04T23:05:28.000Z,1130648708566179841,1467268847745081346,@UniversalPicsBr @anygabrielly @Fiuk quero tan...,UniversalPicsBr anygabrielly Fiuk quero filme,@UniversalPicsBr @anygabrielly @Fiuk,@UniversalPicsBr @anygabrielly @Fiuk
1,1467268691884654595,,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",2021-12-04T23:04:51.000Z,1355987894603837442,1467268691884654595,Tata Werneck dá show de maturidade e pede fim ...,Tata Werneck show maturidade pede fim ataques ...,,
2,1467249854728609797,1.441891e+18,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",2021-12-04T23:03:56.000Z,1465436751388885002,1467268463433560066,@Claudia77613373 @afazendarecord @dynhoalvesre...,Claudia afazendarecord dynhoalvesreal sthefan...,@Claudia77613373 @afazendarecord @dynhoalvesre...,@Claudia77613373 @afazendarecord @dynhoalvesre...
3,1467267671632887816,,"{'retweet_count': 0, 'reply_count': 3, 'like_c...",2021-12-04T23:00:48.000Z,185448711,1467267671632887816,Eu e mainha botamos o lady night com Fiuk aqui...,mainha botamos lady night Fiuk gosta sofrer de...,,
4,1467132306989191168,1.418676e+18,"{'retweet_count': 0, 'reply_count': 1, 'like_c...",2021-12-04T22:59:40.000Z,1081607577778225153,1467267388383109120,@yaeminion Alguem me explicar por que o Fiuk é...,yaeminion Alguem explicar Fiuk famoso filho F...,@yaeminion,@yaeminion


In [36]:
df_report_sum_docs = plot_bar_count_words(text_column='hashtags',
                                                dataframe=df_tweets,
                                                metric='SUM',top=50,return_df=True)
df_report_sum_docs

['1994seok', '42', 'aalvesbea', 'acervotata', 'acostumadinha', 'adriana08269162', 'aebanatomy', 'afazenda', 'afazendarecord', 'al3xsandeer', 'alphonsegreend', 'anaccheiratoba', 'annavivissima', 'anotherfavs', 'antrabrasil', 'anygabrielly', 'anygsource', 'areas', 'autumnnights15', 'ayeshaafrontosa', 'bbb', 'beaufalling', 'bedinvibes', 'bicmuller', 'bidigo2', 'bill', 'boninho', 'born2bmild1', 'br', 'brennerrodolfo', 'brigasdiarias', 'bruninhacoment9', 'brunnosarttori', 'bruno', 'brunomartinsok', 'buterasooya', 'ca', 'cabewalking', 'cabulosamen', 'camiwitoria', 'candidamfpinto', 'carasbrasil', 'carla', 'carladiaz', 'catarinalucindo', 'ccxpworlds', 'centraldadiaz', 'centrallricoo', 'cette', 'cfsunshines', 'chaetoit', 'chato', 'chernozai', 'chicobarney', 'claudia77613373', 'claudiogabrielj', 'cleo', 'clintboladao', 'colunatelinha', 'commentslaraa', 'comvocefiuk', 'damadeferroofic', 'daviphany', 'dearestsadness', 'dedeqcs', 'desculpahomem', 'devinbker', 'diazvivix', 'diegotoledox', 'dorinhad

Unnamed: 0,SUM,WORDS
85,119,fiuk
250,72,tatawerneck
131,35,juliette
88,25,fofoqueioficial
142,24,ladynight
3,21,acervotata
265,17,universalpicsbr
266,17,updatechartuc
15,16,anygabrielly
167,16,manalucii


In [38]:
df_report_sum_docs["WORDS"] =  "#" + df_report_sum_docs["WORDS"]

In [39]:
df_report_sum_docs

Unnamed: 0,SUM,WORDS
85,119,#fiuk
250,72,#tatawerneck
131,35,#juliette
88,25,#fofoqueioficial
142,24,#ladynight
3,21,#acervotata
265,17,#universalpicsbr
266,17,#updatechartuc
15,16,#anygabrielly
167,16,#manalucii


In [31]:
df_tweets["citations"] = df_tweets["text"].apply(lambda x: extract_citation(tweet = x))

In [40]:
df_report_sum_docs = plot_bar_count_words(text_column='citations',
                                                dataframe=df_tweets,
                                                metric='SUM',top=50,return_df=True)
df_report_sum_docs

['1994seok', '42', 'aalvesbea', 'acervotata', 'acostumadinha', 'adriana08269162', 'aebanatomy', 'afazendarecord', 'al3xsandeer', 'alphonsegreend', 'anaccheiratoba', 'annavivissima', 'anotherfavs', 'antrabrasil', 'anygabrielly', 'anygsource', 'areas', 'autumnnights15', 'ayeshaafrontosa', 'beaufalling', 'bedinvibes', 'bicmuller', 'bidigo2', 'bill', 'boninho', 'born2bmild1', 'br', 'brennerrodolfo', 'brigasdiarias', 'bruninhacoment9', 'brunnosarttori', 'bruno', 'brunomartinsok', 'buterasooya', 'cabewalking', 'cabulosamen', 'camiwitoria', 'candidamfpinto', 'carasbrasil', 'carla', 'carladiaz', 'catarinalucindo', 'centraldadiaz', 'centrallricoo', 'cette', 'cfsunshines', 'chaetoit', 'chernozai', 'chicobarney', 'claudia77613373', 'claudiogabrielj', 'cleo', 'clintboladao', 'commentslaraa', 'comvocefiuk', 'damadeferroofic', 'daviphany', 'dearestsadness', 'dedeqcs', 'desculpahomem', 'devinbker', 'diazvivix', 'diegotoledox', 'dorinhadaorf', 'dudaantonacio1', 'dudawolfsfc', 'dudazwte', 'dummienovip'

Unnamed: 0,SUM,WORDS
78,108,fiuk
226,66,tatawerneck
119,35,juliette
80,25,fofoqueioficial
3,21,acervotata
237,17,universalpicsbr
238,17,updatechartuc
153,16,manalucii
14,16,anygabrielly
216,14,siteptbr
