In [10]:
import os
import shutil
import pandas as pd
import numpy as np

from scipy.special import softmax
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from transformers import Trainer
from transformers import TrainingArguments
from transformers import AutoModelForSequenceClassification
from transformers import AutoModelForPreTraining
from transformers import BertTokenizer, AutoTokenizer, DataCollatorWithPadding, BertForSequenceClassification

from datasets import load_metric
from datasets import load_dataset
from datasets import load_from_disk

import helper as hp

In [11]:
import torch

# Hand any unused cached GPU memory back to the driver before loading the model.
torch.cuda.empty_cache()

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [12]:
# # Hugging Face pre-trained
# tokenizer = AutoTokenizer.from_pretrained("nlpaueb/sec-bert-base")
# model = AutoModelForPreTraining.from_pretrained("nlpaueb/sec-bert-base")

In [13]:
# Hub checkpoint whose tokenizer is used to encode the tweets.
# Alternatives that were tried and are currently disabled:
# checkpoint = "bert-base-uncased"
checkpoint = "bert-base-cased"
# checkpoint = "distilbert-base-uncased"

# NOTE(review): the datasets are tokenized with this hub tokenizer, not the one
# stored next to the fine-tuned model in "saved_model" — presumably they share
# the same vocabulary; confirm before switching `checkpoint`.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [14]:
# Read the previously saved tweet datasets that are to be scored.
# Earlier trial run used: load_from_disk("finetune_data_trial")
raw_datasets = load_from_disk(dataset_path="data/to_predict/")

def tokenize_function(example):
    """Encode the ``"text"`` field of ``example`` with the module-level tokenizer.

    Padding is applied up to the longest sequence in the call, and inputs that
    exceed the tokenizer's maximum length are truncated.

    Args:
        example: Mapping with a ``"text"`` entry (a batch of rows when used
            with ``map(..., batched=True)``).

    Returns:
        The tokenizer's encoding for the given text.
    """
    texts = example["text"]
    encoded = tokenizer(texts, truncation=True, padding=True)
    return encoded

# Apply the tokenizer to every split, feeding it rows in batches.
tokenized_datasets = raw_datasets.map(
    function=tokenize_function,
    batched=True,
)

# Dynamic padding: when examples are batched together, pad them all to the
# length of the longest element in that batch.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map: 100%|██████████| 9/9 [00:00<00:00, 363.15 examples/s]


In [15]:
# Show the tokenized DatasetDict (splits, feature columns, row counts) as a sanity check.
tokenized_datasets

DatasetDict({
    Tweet_filtered_TSLA: Dataset({
        features: ['date', 'text', 'stock', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 10
    })
    stock_tweets_filtered_TSLA: Dataset({
        features: ['date', 'text', 'stock', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 9
    })
    tweets_remaining_filtered_TSLA: Dataset({
        features: ['date', 'text', 'stock', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 8
    })
})

Use the saved model to predict on the unlabeled test data.

In [16]:
# Directory that holds the fine-tuned checkpoint (tokenizer files + model weights)
saved_model_dir = "saved_model"

# Restore the tokenizer that was saved with the model
tokenizer = BertTokenizer.from_pretrained(saved_model_dir)

# Restore the classifier, then move it onto the selected device
model = BertForSequenceClassification.from_pretrained(saved_model_dir)
model = model.to(device)

# Display the architecture as the cell output
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [17]:
# Score the 'Tweet_filtered_TSLA' split with the loaded model. Per the output
# below, the helper prints class probabilities and predicted labels and reports
# the CSV file it saved.
hp.make_predictions_and_save_csv(
    model,
    tokenized_datasets,
    raw_datasets,
    'Tweet_filtered_TSLA',
)

100%|██████████| 2/2 [00:00<00:00,  9.75it/s]

Probabilities:
 [[0.00221053 0.9977895 ]
 [0.00269205 0.9973079 ]
 [0.00289298 0.99710697]
 [0.00276789 0.99723214]
 [0.00282243 0.9971776 ]
 [0.00282918 0.99717087]
 [0.00293014 0.99706984]
 [0.00292757 0.99707246]
 [0.00481214 0.9951879 ]
 [0.00252157 0.9974784 ]]
Predicted Labels:
 [1 1 1 1 1 1 1 1 1 1]
Predictions have been saved to predictions_Tweet_filtered_TSLA.csv





In [18]:
# Score the 'stock_tweets_filtered_TSLA' split with the loaded model. Per the
# output below, the helper prints class probabilities and predicted labels and
# reports the CSV file it saved.
hp.make_predictions_and_save_csv(
    model,
    tokenized_datasets,
    raw_datasets,
    'stock_tweets_filtered_TSLA',
)

100%|██████████| 2/2 [00:00<00:00, 10.22it/s]

Probabilities:
 [[0.00232229 0.99767774]
 [0.01258642 0.98741364]
 [0.00146793 0.99853206]
 [0.00389425 0.99610573]
 [0.9772842  0.02271576]
 [0.9270123  0.07298761]
 [0.99546    0.00453997]
 [0.00168596 0.998314  ]
 [0.00135022 0.9986498 ]]
Predicted Labels:
 [1 1 1 1 0 0 0 1 1]
Predictions have been saved to predictions_stock_tweets_filtered_TSLA.csv





In [19]:
# Score the 'tweets_remaining_filtered_TSLA' split with the loaded model. Per
# the output below, the helper prints class probabilities and predicted labels
# and reports the CSV file it saved.
hp.make_predictions_and_save_csv(
    model,
    tokenized_datasets,
    raw_datasets,
    'tweets_remaining_filtered_TSLA',
)

100%|██████████| 1/1 [00:00<?, ?it/s]

Probabilities:
 [[0.00858674 0.99141324]
 [0.00181861 0.99818146]
 [0.00186847 0.9981316 ]
 [0.00290662 0.9970933 ]
 [0.00187174 0.99812824]
 [0.00118247 0.9988175 ]
 [0.0030168  0.99698323]
 [0.00129619 0.99870384]]
Predicted Labels:
 [1 1 1 1 1 1 1 1]
Predictions have been saved to predictions_tweets_remaining_filtered_TSLA.csv



