In [None]:
import pandas as pd
import numpy as np
import pickle
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, f1_score

import tensorflow as tf
from tensorflow import keras

# Fix the global random seed up front so that weight initialisation, dropout and
# shuffling are repeatable from one run of the notebook to the next.
keras.utils.set_random_seed(seed=42)

In [None]:
# install transformer package
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 12.7 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 33.9 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 19.6 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.24.0


In [None]:
from transformers import BertTokenizer

In [None]:
!nvidia-smi -L

GPU 0: Tesla T4 (UUID: GPU-1dac25fc-7394-c720-78fc-941df0ffe75c)


In [None]:
# Load the labelled financial-sentiment data; later cells read its
# 'Sentence' (text) and 'Sentiment' (label) columns.
df = pd.read_csv(
    filepath_or_buffer='https://www.dropbox.com/s/bftxg089s8bx86f/Financial%20Sentiment%20Data.csv?dl=1'
)

In [None]:
# Hold out 25% of the rows for evaluation. Stratifying on the label keeps the class
# proportions the same in both partitions; the fixed random_state makes the split repeatable.
train_X, test_X, train_y, test_y = train_test_split(
    df['Sentence'],
    df['Sentiment'],
    test_size=0.25,
    stratify=df['Sentiment'],
    random_state=42,
)

In [None]:
# Count the training examples per class once; the per-class totals are reused
# by the upsampling cell to work out how many rows each minority class is missing.
class_counts = train_y.value_counts()
n_neu = class_counts['neutral']
n_pos = class_counts['positive']
n_neg = class_counts['negative']

class_counts

neutral     2347
positive    1389
negative     645
Name: Sentiment, dtype: int64

In [None]:
# Balance the training set by upsampling the minority classes.
# 'positive' and 'negative' rows are re-drawn with replacement until each class has as
# many rows as 'neutral' (n_neu); the test split is left untouched.
# Note: this cell overwrites train_X / train_y, so it is meant to be run once per kernel.
train = pd.concat([train_X, train_y], axis=1)

positive_rows = train[train.Sentiment == 'positive']
negative_rows = train[train.Sentiment == 'negative']
neutral_rows = train[train.Sentiment == 'neutral']

extra_positive = positive_rows.sample(n=(n_neu - n_pos), replace=True, random_state=42)
extra_negative = negative_rows.sample(n=(n_neu - n_neg), replace=True, random_state=42)

train_pos = pd.concat([positive_rows, extra_positive])
train_neg = pd.concat([negative_rows, extra_negative])
train_balanced = pd.concat([train_pos, train_neg, neutral_rows])

train_X = train_balanced['Sentence']
train_y = train_balanced['Sentiment']

train_y.value_counts()

positive    2347
negative    2347
neutral     2347
Name: Sentiment, dtype: int64

In [None]:
# Fetch the pretrained tokenizer for the cased BERT base checkpoint.
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='bert-base-cased'
)

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Fixed token length for every tokenized text: the tokenizer calls below truncate
# longer inputs to this length and pad shorter ones up to it.
max_query_length = 100

In [None]:
# Tokenize the training and test sentences with identical settings so both
# produce TensorFlow tensors of the same fixed width (max_query_length).
tokenizer_kwargs = dict(
    max_length=max_query_length,
    return_tensors='tf',
    truncation=True,
    padding='max_length',
)
source_train = tokenizer(train_X.tolist(), **tokenizer_kwargs)
source_test = tokenizer(test_X.tolist(), **tokenizer_kwargs)

In [None]:
# Encode the string labels as integer ids.
# The vectorizer learns its vocabulary from the training labels only, and the same
# fitted layer is then applied to both splits so the ids agree.
train_label_text = train_y.astype(str)
test_label_text = test_y.astype(str)

text_vectorization_label = keras.layers.TextVectorization()
text_vectorization_label.adapt(train_label_text)

# Used as the width of the softmax output layer.
# NOTE(review): the vocabulary size presumably also counts the layer's reserved
# padding/OOV entries, not just the class names; confirm with get_vocabulary().
num_labels = text_vectorization_label.vocabulary_size()

label_train = text_vectorization_label(train_label_text)
label_test = text_vectorization_label(test_label_text)

In [None]:
# Mount Google Drive into the Colab filesystem; the saved model, the GME export
# and the output CSV below are all read from / written to paths under this mount.
from google.colab import drive
drive.mount(mountpoint='/content/drive')

In [None]:
# Restore the previously fine-tuned BERT model from Drive.
# NOTE(review): the path repeats the 'content/drive/MyDrive' segment; confirm that
# this really is the folder layout on Drive and not a copy/paste slip.
fine_tuned = keras.models.load_model(
    filepath='/content/drive/MyDrive/content/drive/MyDrive/ANLY-580/final_project/fine_tuned_bert_1'
)

In [None]:
# Build a new classifier on top of the loaded model.
# The loaded model's inputs are reused unchanged.
new_input = fine_tuned.input
# Branch off the 'pooler_output' entry of the second-to-last layer's output dict.
hidden_layer = fine_tuned.layers[-2].output['pooler_output']

# New head: three ReLU Dense layers (256 -> 128 -> 128 units), each followed by
# 50% dropout, then a softmax over num_labels outputs.
x = hidden_layer
for units in (256, 128, 128):
  x = keras.layers.Dense(units, activation='relu')(x)
  x = keras.layers.Dropout(0.5)(x)
new_output = keras.layers.Dense(num_labels, activation="softmax")(x)

# Assemble the full model from the original inputs to the new output.
model = keras.Model(inputs=new_input, outputs=new_output)

# Leave every layer trainable, so the loaded layers are updated together with the new head.
for layer in model.layers:
  layer.trainable = True

model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 100)]        0           []                               
                                                                                                  
 token_type_ids (InputLayer)    [(None, 100)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 100)]        0           []                               
                                                                                                  
 bert (Custom>TFBertMainLayer)  {'pooler_output': (  108310272   ['input_ids[0][0]',              
                                None, 768),                       'token_type_ids[0][0]',     

In [None]:
# Labels are integer ids (not one-hot), hence the sparse loss and sparse accuracy metric.
# The optimizer is created with its default settings here; its learning rate is
# overridden in a later cell.
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="adam",
    metrics=["sparse_categorical_accuracy"],
)

In [None]:
# Sanity check before training: display the GPU device name TensorFlow reports.
tf.test.gpu_device_name()

'/device:GPU:0'

In [None]:
# Override the learning rate of the already-compiled optimizer in place,
# replacing the value it was created with.
from keras import backend as K
K.set_value(model.optimizer.learning_rate, 5e-5)

In [None]:
BATCH_SIZE = 64
epochs = 4

# The model takes its three inputs as a list in this order.
input_keys = ('input_ids', 'token_type_ids', 'attention_mask')
train_inputs = [source_train[key] for key in input_keys]
test_inputs = [source_test[key] for key in input_keys]

# Fit on the balanced training set. The held-out test split is passed as validation
# data, so the per-epoch validation metrics are measured on the test set.
history = model.fit(
    train_inputs,
    label_train,
    batch_size=BATCH_SIZE,
    validation_data=(test_inputs, label_test),
    epochs=epochs,
)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [None]:
# Load the exported GME subreddit posts from Drive; the 'selftext' column is
# what gets scored by the model below.
gme_subreddit = pd.read_csv(filepath_or_buffer='/content/drive/MyDrive/export.csv')

In [None]:
# Tokenize the GME post bodies with the same settings used for the training data,
# so the model sees inputs of the same fixed width.
# NOTE(review): this assumes every 'selftext' value is a string; confirm the export
# has no missing values in that column.
gme_sentences = gme_subreddit['selftext'].tolist()
gme_text = tokenizer(
    gme_sentences,
    max_length=max_query_length,
    return_tensors='tf',
    truncation=True,
    padding='max_length',
)

In [None]:
# Score every GME post, then keep the index of the highest-probability class per row.
gme_probabilities = model.predict(
    x=[gme_text['input_ids'], gme_text['token_type_ids'], gme_text['attention_mask']]
)
pred = np.argmax(gme_probabilities, axis=1)



In [None]:
# Quick look at the raw predicted class indices (ids from the label vectorizer, not names).
pred

array([3, 4, 2, ..., 3, 3, 3])

In [None]:
# Decode the predicted class indices into short sentiment names.
#
# The index -> name mapping is read from the label vectorizer's own vocabulary rather
# than from a hard-coded list. After upsampling, all three classes have the same count,
# so the vocabulary order depends on how Keras breaks ties and must not be assumed.
# Indexing the full vocabulary directly (instead of `pred - 2`) also means a prediction
# of one of the reserved leading entries shows up as that entry in the counts, instead
# of wrapping around to a real class through a negative index.
label_vocabulary = text_vectorization_label.get_vocabulary()
short_names = {'neutral': 'neu', 'positive': 'pos', 'negative': 'neg'}
index_to_label = np.array([short_names.get(token, token) for token in label_vocabulary])

# Short names of the real classes, in class-index order (the first two vocabulary
# entries are skipped, matching the offset of 2 used for predictions previously).
display_labels = index_to_label[2:].tolist()

pred_label = index_to_label[pred]
pd.Series(pred_label).value_counts()

pos    61656
neu    18765
neg    13877
dtype: int64

In [None]:
# Save one predicted sentiment per post, alongside the post's date, to Drive.
# The posts live in `gme_subreddit`; no variable named `gme` is defined anywhere in this
# notebook, so referring to it fails with a NameError on a fresh kernel.
# NOTE(review): assumes the export has a 'date' column; confirm against export.csv.
gme_sentiment = pd.DataFrame({'date': gme_subreddit['date'], 'sentiment': pred_label})
gme_sentiment.to_csv('/content/drive/MyDrive/ANLY-580/final_project/gme_sentiment.csv', index=False)