Este código foi executado no Google Colab, pois exige mais poder computacional do que consideramos razoável para as nossas máquinas.

Apagamos os outputs para reduzir o tamanho do notebook no GitHub. Os outputs podem ser vistos no Colab:

Link para o Colab: https://colab.research.google.com/drive/11WK5Boyoxgl5mL8TB3RSs1oBliR44aI7?usp=sharing

# Importando dados

In [None]:
import pandas as pd
import os
import json
import plotly.express as px
from google.colab import drive
import torch

# Seed passed as `random_state` to the UMAP reducers created further down.
RANDOM_SEED = 33

In [None]:
# Mount Google Drive at /content/drive; the data paths used below live under it.
drive.mount('/content/drive')

In [None]:
# Top-level tweet keys that are kept; `author` is kept only so it can be flattened.
RELEVANT_FIELDS = ['type','id','text','retweetCount','replyCount','likeCount','createdAt','bookmarkCount','isReply','author']
# Keys of the nested `author` object that are kept (emitted with an `author_` prefix).
RELEVANT_AUTHOR_FIELDS = ['type','userName','name','isVerified','description','followers','following','createdAt','favouritesCount']

def extract_df_from_json_tweets_data(path_tweets):
    """Build a dataframe from every ``.json`` file found directly in a folder.

    Each file is expected to contain a JSON list of tweet objects. For every
    tweet, only the keys listed in ``RELEVANT_FIELDS`` are kept, and the nested
    ``author`` object is flattened into ``author_<key>`` columns restricted to
    ``RELEVANT_AUTHOR_FIELDS``. A ``src_file`` column records the file each
    row came from.

    Args:
        path_tweets: Directory containing the JSON files.

    Returns:
        A ``pandas.DataFrame`` with one row per tweet. Rows follow the order
        in which ``os.listdir`` yields the files, which is not guaranteed to
        be sorted.
    """
    rows = []

    for filename in os.listdir(path_tweets):
        # Match the real extension; a bare suffix check would also accept
        # names such as "notjson".
        if not filename.endswith('.json'):
            continue

        # JSON is UTF-8 by specification; do not depend on the platform default.
        with open(os.path.join(path_tweets, filename), 'rt', encoding='utf-8') as f:
            tweets = json.load(f)

        for tweet in tweets:
            # Keep only the relevant top-level fields of the tweet.
            row = {k: v for k, v in tweet.items() if k in RELEVANT_FIELDS}

            # Pull the semi-structured author object out of the row; a tweet
            # without a usable author simply contributes no author columns.
            author = row.pop('author', None)
            if not isinstance(author, dict):
                author = {}

            # Flatten the relevant author fields into prefixed columns.
            row.update({f'author_{k}': v for k, v in author.items() if k in RELEVANT_AUTHOR_FIELDS})

            # Record which file the tweet came from.
            row['src_file'] = filename
            rows.append(row)

    return pd.DataFrame(rows)

In [None]:
# Google Drive folders that hold the tweet JSON files of each stock.
PATH_PETR4_FILES = '/content/drive/MyDrive/Projeto Ciência de Dados/Scrapping Tweets/PETR4'
PATH_VALE3_FILES = '/content/drive/MyDrive/Projeto Ciência de Dados/Scrapping Tweets/VALE3'

# Build one dataframe per stock from the JSON files in its folder.
df_petr4 = extract_df_from_json_tweets_data(PATH_PETR4_FILES)
df_vale3 = extract_df_from_json_tweets_data(PATH_VALE3_FILES)

In [None]:
# Tag every row with its ticker so the origin is still known after concatenation.
df_petr4['stock'] = 'PETR4'
df_vale3['stock'] = 'VALE3'

# Stack both dataframes; `ignore_index=True` renumbers the rows from 0.
df_stocks = pd.concat([df_petr4, df_vale3], ignore_index=True)

In [None]:
# Show the first row of the combined dataframe.
df_stocks.head(1)

### Defining Dataframe Types

In [None]:
# Show the dtype of each column before any conversion.
df_stocks.dtypes

Converting `createdAt` fields to datetime

In [None]:
# The tweet and author creation timestamps are parsed with the same explicit
# layout: weekday, month, day, time, UTC offset, year.
for column in ('createdAt', 'author_createdAt'):
    df_stocks[column] = pd.to_datetime(df_stocks[column], format='%a %b %d %H:%M:%S %z %Y')

# Show the dtypes again to confirm the conversion.
df_stocks.dtypes

### Setting dataframe index as tweet creation date

In [None]:
# Use the tweet creation timestamp as the index; `createdAt` stops being a regular column.
df_stocks = df_stocks.set_index('createdAt')
# Show the first row to check the new index.
df_stocks.head(1)

# Análise Exploratória

## Extraindo embeddings

In [None]:
from transformers import AutoTokenizer
from transformers import AutoModelForPreTraining
from transformers import AutoModel
from tqdm.notebook import tqdm

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the tokenizer and the encoder from the 'neuralmind/bert-base-portuguese-cased'
# checkpoint; the tokenizer is told not to lowercase, and the model is moved to `device`.
tokenizer = AutoTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased', do_lower_case=False)
model = AutoModel.from_pretrained('neuralmind/bert-base-portuguese-cased').to(device)

def get_embeddings(texts, tokenizer, model, batch_size=32, max_length=512):
    """Embed a list of texts in batches using the model's ``pooler_output``.

    Args:
        texts: List of strings to embed; it is sliced into consecutive batches.
        tokenizer: Callable tokenizer invoked with ``padding=True``,
            ``truncation=True``, ``return_tensors="pt"`` and ``max_length``.
        model: Model called with the tokenizer output as keyword arguments;
            its result must expose ``pooler_output``.
        batch_size: Number of texts per forward pass.
        max_length: Maximum token length passed to the tokenizer.

    Returns:
        A tensor with the per-batch ``pooler_output`` tensors concatenated
        along dimension 0, on the same device as the model.

    Raises:
        RuntimeError: If ``texts`` is empty, because ``torch.cat`` rejects an
            empty list of tensors.
    """
    # Send inputs to the device the model actually lives on, rather than
    # relying on a module-level `device` variable that may not match it.
    model_device = next(model.parameters()).device

    all_embeddings = []
    for start in tqdm(range(0, len(texts), batch_size)):
        batch_texts = texts[start:start + batch_size]
        # Tokenize the batch, padding to its longest text and truncating at max_length.
        batch_inputs = tokenizer(batch_texts, padding=True, truncation=True, return_tensors="pt", max_length=max_length)
        batch_inputs = {key: value.to(model_device) for key, value in batch_inputs.items()}
        # Inference only: no gradients are needed to extract embeddings.
        with torch.no_grad():
            outputs = model(**batch_inputs)
        all_embeddings.append(outputs.pooler_output)

    return torch.cat(all_embeddings, dim=0)

In [None]:
# Batch size and maximum token length handed to `get_embeddings`.
batch_size = 32
max_length = 512

# Embed the `text` column of the combined PETR4 + VALE3 dataframe.
stock_embeds = get_embeddings(df_stocks['text'].tolist(), tokenizer, model, batch_size=batch_size, max_length=max_length)

In [None]:
# # # # with open(f'{PATH_STOCK_FILES}/embeddings/embeddings_BERTimbau_base.pt', 'wb') as f:
# # # #     torch.save(stock_embeds, f)

## Redução de dimensionalidade com UMAP

In [None]:
!pip install umap-learn

In [None]:
import umap

In [None]:
def get_stock_embeddings(stock, model, base_dir='/content/drive/MyDrive/Projeto Ciência de Dados/Scrapping Tweets'):
    """Load the saved embeddings of one stock for one embedding model.

    The file read is ``<base_dir>/<stock>/embeddings/embeddings_<model>.pt``.

    Args:
        stock: Name of the stock folder, e.g. ``'PETR4'``.
        model: Model tag used in the file name, e.g. ``'BERTimbau_base'``
            (a string, not a model object).
        base_dir: Root folder holding one sub-folder per stock. Defaults to
            the project's Google Drive location.

    Returns:
        The object stored in the file, mapped to CUDA when available and to
        the CPU otherwise.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    embeddings_path = os.path.join(base_dir, stock, 'embeddings', f'embeddings_{model}.pt')

    # NOTE(review): torch.load unpickles the file; only load files you trust.
    with open(embeddings_path, 'rb') as f:
        return torch.load(f, map_location=device)

### PETR4

#### BERTimbau Base

In [None]:
# Load the saved PETR4 embeddings stored under the 'BERTimbau_base' tag.
petr4_bertimbau_embeddings = get_stock_embeddings('PETR4', 'BERTimbau_base')

In [None]:
# UMAP reducer down to 2 components; the fixed seed is passed as `random_state`.
reducer = umap.UMAP(n_components=2, random_state=RANDOM_SEED)

In [None]:
# The loader maps the embeddings to CUDA when a GPU is available, and a CUDA
# tensor cannot be converted to a NumPy array implicitly, so hand UMAP an
# explicit CPU NumPy array.
umap_features = reducer.fit_transform(petr4_bertimbau_embeddings.detach().cpu().numpy())

In [None]:
# Store the two UMAP coordinates next to the tweets so they can be plotted.
df_petr4['umap_dim1'], df_petr4['umap_dim2'] = umap_features[:, 0], umap_features[:, 1]

In [None]:
# Scatter plot of the 2-D projection; hovering a point shows the tweet text.
px.scatter(
    df_petr4,
    x='umap_dim1',
    y='umap_dim2',
    hover_data=['text'],
    title='PETR4 BERTimbau Embeddings UMAP Dimensionality Reduction',
)

#### XLM-RoBERTa-Large

In [None]:
# Load the saved PETR4 embeddings stored under the 'XLM_RoBERTa_large' tag.
petr4_embeddings = get_stock_embeddings('PETR4', 'XLM_RoBERTa_large')

# Project to 2-D with UMAP. The loader maps the embeddings to CUDA when a GPU
# is available, and a CUDA tensor cannot be converted to a NumPy array
# implicitly, so hand UMAP an explicit CPU NumPy array.
reducer = umap.UMAP(n_components=2, random_state=RANDOM_SEED)
umap_features = reducer.fit_transform(petr4_embeddings.detach().cpu().numpy())

# Overwrites any `umap_dim1`/`umap_dim2` columns already present in df_petr4.
df_petr4['umap_dim1'] = umap_features[:,0]
df_petr4['umap_dim2'] = umap_features[:,1]

# Scatter plot of the projection; hovering a point shows the tweet text.
px.scatter(df_petr4, x='umap_dim1', y='umap_dim2', hover_data=['text'], title='PETR4 RoBERTa Embeddings UMAP Dimensionality Reduction')

### VALE3

#### BERTimbau Base

In [None]:
# Load the saved VALE3 embeddings stored under the 'BERTimbau_base' tag.
vale3_embeddings = get_stock_embeddings('VALE3', 'BERTimbau_base')

# Project to 2-D with UMAP. The loader maps the embeddings to CUDA when a GPU
# is available, and a CUDA tensor cannot be converted to a NumPy array
# implicitly, so hand UMAP an explicit CPU NumPy array.
reducer = umap.UMAP(n_components=2, random_state=RANDOM_SEED)
umap_features = reducer.fit_transform(vale3_embeddings.detach().cpu().numpy())

# Store the two UMAP coordinates next to the tweets so they can be plotted.
df_vale3['umap_dim1'] = umap_features[:,0]
df_vale3['umap_dim2'] = umap_features[:,1]

# Scatter plot of the projection; hovering a point shows the tweet text.
px.scatter(df_vale3, x='umap_dim1', y='umap_dim2', hover_data=['text'], title='VALE3 BERTimbau Embeddings UMAP Dimensionality Reduction')

#### XLM-RoBERTa-Large

In [None]:
# Load the saved VALE3 embeddings stored under the 'XLM_RoBERTa_large' tag.
vale3_embeddings = get_stock_embeddings('VALE3', 'XLM_RoBERTa_large')

# Project to 2-D with UMAP. The loader maps the embeddings to CUDA when a GPU
# is available, and a CUDA tensor cannot be converted to a NumPy array
# implicitly, so hand UMAP an explicit CPU NumPy array.
reducer = umap.UMAP(n_components=2, random_state=RANDOM_SEED)
umap_features = reducer.fit_transform(vale3_embeddings.detach().cpu().numpy())

# Overwrites any `umap_dim1`/`umap_dim2` columns already present in df_vale3.
df_vale3['umap_dim1'] = umap_features[:,0]
df_vale3['umap_dim2'] = umap_features[:,1]

# Scatter plot of the projection; hovering a point shows the tweet text.
px.scatter(df_vale3, x='umap_dim1', y='umap_dim2', hover_data=['text'], title='VALE3 RoBERTa Embeddings UMAP Dimensionality Reduction')