## Libraries

In [1]:
# For sending GET requests to the API
import requests
# For saving access tokens and for file management when creating and adding to the dataset
import os
# For dealing with json responses we receive from the API
import json
# For displaying the data after
import pandas as pd
# For saving the response data in CSV format
import csv
# For parsing the dates received from twitter in readable formats
import datetime
import dateutil.parser
import unicodedata
#To add wait time between requests
import time
import seaborn as sns
import matplotlib.pyplot as plt 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.corpus import stopwords
import nltk

from unicodedata import normalize as norm
import re

## Functions for exploratory analysis 

In [2]:
def plot_bar_count_words(text_column=None,
                         label_column=None,
                         name_class=None,
                         dataframe=None,
                         metric='SUM',
                         top=50,return_df=True):
    """Rank the words of a text column by a bag-of-words statistic.

    The texts in ``dataframe[text_column]`` are vectorized with a default
    ``CountVectorizer``; for every vocabulary word the per-document count is
    summarized as ``MEAN``, ``SUM`` and ``STD`` across all documents.

    Parameters
    ----------
    text_column : str
        Name of the column in ``dataframe`` holding the texts.
    label_column, name_class :
        Accepted for interface compatibility; not read by this function.
    dataframe : pandas.DataFrame
        Frame containing ``text_column``.
    metric : str
        Statistic to rank by: ``'SUM'``, ``'MEAN'`` or ``'STD'``.
    top : int
        Number of highest-ranked words to keep.
    return_df : bool
        When true, return the ranking as a DataFrame; otherwise draw a
        horizontal bar plot of it.

    Returns
    -------
    pandas.DataFrame or matplotlib Axes
        The ``[metric, 'WORDS']`` ranking when ``return_df`` is true, else the
        Axes the bar plot was drawn on.
    """
    corpus = dataframe[text_column].values

    vectorizer = CountVectorizer()
    # NOTE: the document-term matrix is densified, so memory grows with
    # (number of documents) x (vocabulary size).
    data_vect = vectorizer.fit_transform(corpus).toarray()

    # get_feature_names() was removed from scikit-learn; prefer the
    # replacement and fall back only on old versions that lack it.
    if hasattr(vectorizer, "get_feature_names_out"):
        words = vectorizer.get_feature_names_out()
    else:
        words = vectorizer.get_feature_names()

    df_count_words = pd.DataFrame({
        "WORDS": words,
        "MEAN": data_vect.mean(axis=0),
        "SUM": data_vect.sum(axis=0),
        "STD": data_vect.std(axis=0),
    })

    # Computed once and shared by both output modes.
    df_top_words = df_count_words[[metric, 'WORDS']].sort_values(by=[metric],
                                                                 ascending=False)[0:top]

    if return_df:
        return df_top_words

    fig, ax = plt.subplots(figsize=(15, 10))
    sns.barplot(x=metric, y="WORDS", data=df_top_words, ax=ax)
    return ax
        





def convert_text_to_no_repeat_words(text):
    """Return ``text`` with repeated words removed.

    The text is split on single spaces, empty fragments (produced by
    consecutive spaces) are dropped, and each distinct word is kept once, in
    order of first appearance, so the result is the same on every run.

    Parameters
    ----------
    text : str
        Space-separated text.

    Returns
    -------
    str
        The distinct words joined by single spaces ("" for empty input).
    """
    # dict.fromkeys de-duplicates while preserving insertion order, unlike set,
    # whose iteration order for strings varies between interpreter runs.
    unique_words = dict.fromkeys(word for word in text.split(" ") if word != "")

    return " ".join(unique_words)


## Function to clean the texts

In [3]:
def text_cleaner(text,stop_words_domain=None):
    """Clean a tweet text for bag-of-words analysis.

    Steps, in order: remove ``https...`` links, replace every run of
    non-letter characters with a space, drop isolated single letters,
    collapse whitespace, and remove stop words. Stop-word matching is
    accent- and case-insensitive; the surviving words keep their original
    spelling.

    Parameters
    ----------
    text : str
        Raw text to clean.
    stop_words_domain : iterable of str, optional
        Extra stop words added to NLTK's Portuguese list.

    Returns
    -------
    str
        The remaining words joined by single spaces, with no leading or
        trailing whitespace.
    """

    def fold(word):
        # Strip accents and lower-case so "Não", "nao" and "NÃO" compare equal.
        return norm('NFKD', word).encode('ascii', 'ignore').decode().lower()

    nltk_stopwords = stopwords.words('portuguese') + list(stop_words_domain or [])

    # A set gives constant-time membership tests in the token filter below.
    stopwords_folded = {fold(word) for word in nltk_stopwords}

    # Raw string: "\#" is not a valid escape in a normal string literal.
    regex_remove_https = r'https([a-zA-Zà-úÀ-Ú0-9]|[-()\#/@;:<>{}`+=~|.!?,])+'

    text_without_https = re.sub(r"(\s|^){0}(\s{0})*($|\s)".format(regex_remove_https)," ",text)

    text_without_special_caracteres = re.sub(r"[^a-zA-ZÀ-Úà-ú]+"," ",text_without_https)

    text_without_alone_caractere = re.sub(r"\s[a-zA-ZÀ-Úà-ú]\s|\s[a-zA-ZÀ-Úà-ú]$|^[a-zA-ZÀ-Úà-ú]\s"," ",text_without_special_caracteres)

    text_pattern_space = re.sub(r"\s+"," ",text_without_alone_caractere)

    # Skip the empty tokens that leading/trailing spaces produce, so the
    # result carries no stray surrounding whitespace.
    text_list = [token for token in text_pattern_space.split(" ")
                 if token and fold(token) not in stopwords_folded]

    return " ".join(text_list)

# Test API Twitter

## Create method auth

In [4]:
def auth():
    """Look up the API bearer token in the ``TOKEN`` environment variable.

    Returns the variable's value, or ``None`` when it is not set.
    """
    token = os.environ.get('TOKEN')
    return token

In [5]:
# Read the bearer token from the TOKEN environment variable (None if unset).
bearer = auth()

In [6]:
# Show only whether a token was found: echoing the token itself would store
# the secret in the notebook's saved output.
bearer is not None

## Create method headers

In [None]:
def create_headers(bearer_token):
    """Build the HTTP headers dict that carries ``bearer_token``.

    Returns a dict with a single ``Authorization`` entry of the form
    ``"Bearer <token>"``.
    """
    return {"Authorization": f"Bearer {bearer_token}"}

## Create URL

In [None]:
def create_url(keyword, start_date, end_date, max_results = 10):
    """Assemble the search endpoint URL and its query parameters.

    Parameters
    ----------
    keyword : sent as the ``query`` parameter.
    start_date, end_date : sent as ``start_time`` / ``end_time``.
    max_results : sent as ``max_results`` (default 10).

    Returns
    -------
    tuple
        ``(search_url, query_params)``. ``query_params['next_token']`` starts
        as an empty dict placeholder.
    """
    # Change to the endpoint you want to collect data from
    endpoint = "https://api.twitter.com/2/tweets/search/recent"

    tweet_fields = ('author_id', 'conversation_id', 'created_at', 'id',
                    'in_reply_to_user_id', 'public_metrics', 'text')
    user_fields = ('id', 'name', 'username', 'created_at')

    # Change params based on the endpoint you are using
    params = {'query': keyword}
    params['start_time'] = start_date
    params['end_time'] = end_date
    params['max_results'] = max_results
    params['expansions'] = 'author_id'
    params['tweet.fields'] = ','.join(tweet_fields)
    params['user.fields'] = ','.join(user_fields)
    params['next_token'] = {}

    return (endpoint, params)

## Connect to endpoint

In [None]:

def connect_to_endpoint(url, headers, params, next_token = None, timeout = 30):
    """Send a GET request to ``url`` and return the decoded JSON body.

    Parameters
    ----------
    url : str
        Endpoint to call.
    headers : dict
        HTTP headers (see ``create_headers``).
    params : dict
        Query parameters (see ``create_url``). Not modified by this call.
    next_token : str, optional
        Pagination token sent as the ``next_token`` query parameter.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    The parsed JSON response.

    Raises
    ------
    Exception
        With ``(status_code, response_text)`` when the status is not 200.
    """
    # Copy so the caller's dict, which is reused for every page, is left intact.
    request_params = dict(params)
    request_params['next_token'] = next_token

    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(url, headers = headers, params = request_params, timeout = timeout)
    print("Endpoint Response Code: " + str(response.status_code))
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()

## Test First request

In [None]:
# Inputs for the request
bearer_token = auth()
headers = create_headers(bearer_token)
# Search query; presumably the operators restrict results to Portuguese,
# non-retweet tweets -- confirm against the API's query syntax.
keyword = "fiuk lang:pt -is:retweet"
# Time window, passed to create_url as start_time / end_time.
# NOTE(review): the dates are hard-coded; verify the endpoint still accepts this range.
start_time = "2021-12-03T00:00:00.000Z"
end_time = "2021-12-04T23:07:00.000Z"
# Page size requested from the API.
max_results = 100

In [None]:
# create_url returns a (search_url, query_params) tuple.
url = create_url(keyword, start_time,end_time, max_results)
# Fetch a single page: no next_token is supplied.
json_response = connect_to_endpoint(url[0], headers, url[1])

In [None]:
# Pretty-print the raw JSON response, indented and with keys sorted.
print(json.dumps(json_response, indent=4, sort_keys=True))

## Load results into a DataFrame

In [None]:
# Tabulate the entries under the response's "data" key (KeyError if the key is absent).
pd.DataFrame.from_dict(json_response["data"])

## Paginate

In [None]:
def paginate(url,headers,next_token=""):
    """Yield successive pages of results from the endpoint.

    Parameters
    ----------
    url : tuple
        ``(search_url, query_params)`` as returned by ``create_url``.
    headers : dict
        HTTP headers for the request.
    next_token : str, optional
        Token of the page to start from; empty means the first page.

    Yields
    ------
    The decoded JSON of each page, in order, until a page's ``meta`` carries
    no ``next_token``.
    """
    # Iterative rather than recursive: recursion adds one nested generator per
    # page and would hit the interpreter's recursion limit on long result sets.
    while True:

        data = connect_to_endpoint(url[0], headers, url[1], next_token=next_token or None)

        yield data

        next_token = data.get("meta",{}).get("next_token")

        # A missing or empty token marks the last page.
        if not next_token:
            return


## Function to get CSV from twitter

In [None]:
def get_csv_from_twitter(keyword, start_time,end_time,qnt, max_results=100):
    """Collect tweets matching ``keyword`` into a DataFrame.

    Despite the name, nothing is written to disk: the caller receives a
    DataFrame and may save it as CSV.

    Parameters
    ----------
    keyword : str
        Search query, forwarded to ``create_url``.
    start_time, end_time : str
        Time window, forwarded to ``create_url``.
    qnt : int
        Approximate number of tweets wanted. ``qnt // max_results`` pages are
        fetched, with a minimum of one page.
    max_results : int, optional
        Page size requested from the API (default 100). Taken as an argument
        so the function does not depend on a notebook-level global.

    Returns
    -------
    pandas.DataFrame
        One row per tweet with a fresh 0..n-1 index. The seven base columns
        come first, followed by any other fields present in the response.
        Empty (base columns only) when no tweets were returned.
    """
    bearer_token = auth()

    headers = create_headers(bearer_token)

    base_columns = ['conversation_id', 'in_reply_to_user_id', 'public_metrics',
                    'created_at', 'author_id', 'id', 'text']

    limit_iterations = max(qnt // max_results, 1)

    url = create_url(keyword, start_time,end_time, max_results)

    # Collect pages and concatenate once, instead of re-copying a growing
    # DataFrame on every iteration.
    pages = []

    for page_number, json_response in enumerate(paginate(url,headers), start=1):

        # A page without a "data" key is treated as holding no tweets.
        page_records = json_response.get("data", [])

        if page_records:

            pages.append(pd.DataFrame(page_records))

        if page_number >= limit_iterations:

            break

    if not pages:

        return pd.DataFrame(columns=base_columns)

    df_tweets = pd.concat(pages, ignore_index=True)

    # Keep the documented column order; reindex also adds any base column that
    # is missing from the response.
    extra_columns = [column for column in df_tweets.columns if column not in base_columns]

    return df_tweets.reindex(columns=base_columns + extra_columns)
    

In [None]:
# Approximate number of tweets to collect; passed to get_csv_from_twitter as qnt.
qnt = 10000

In [None]:
# Run the paginated collection using the query and time window defined earlier.
df_tweets = get_csv_from_twitter(keyword, start_time,end_time,qnt)

# Preview the first rows only.
df_tweets.head()

In [None]:
# Write the collected tweets to df_tweets.csv in the working directory, without the index column.
df_tweets.to_csv("df_tweets.csv",index=False)

# Test Plots

In [None]:
df_report_sum_docs = plot_bar_count_words(text_column='text_clean',
                                                dataframe=df_tweets,
                                                metric='SUM',top=10,return_df=True)

In [None]:
df_report_sum_docs

In [None]:
df_tweets["text_clean"] = df_tweets["text"].apply(lambda x: text_cleaner(text = x,stop_words_domain=stop_words_domain))

In [None]:
stopwords.words('portuguese') + []

In [None]:

stop_words_domain=["não","da","globoplay",
                    "só","pra","vc","pois","lá","outro",
                    "outra","vou","vão","assim","outro",
                    "outra","ter","ver","agora","hoje",
                    "tudo","todos","todo","ah","acho",
                    "achamos","né","ser","vai","alguma",
                    "mas","porém","entretanto",
                    "faz","fazemos","farão",
                    "tbm","fazia","tá","tb","ia",
                    "ir","to","nela","nele","nelas",
                    "neles","naquele","naquueles",
                    "naquelas","naquela","coisa","mim",
                    "tô","aí","n",
                    "pro","é","dessa","vamos","q",
                    "desse","tava","msm","vamo","que","porque",
                    "nem","mano","manos","caras","xd","kkkk","pq","por","cara",
                    "gente","dar","sobre","tão","toda","vezes",
                    "então","viu","vemos","pode","podemos","vez",
                    "vcs","hein","quer","sim","deu","já","demos",
                    "todas","aqui","sei","sabemos","fazer","fiz",
                    "fez","fazemos","vem","vamos","ainda","tanto","nesse","pocah"] 