# Import Packages

In [None]:
from transformers import AutoTokenizer
import numpy as np
import evaluate
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm


# 1. Tokenize the Dataset and 2. Train/Test Split

In [None]:
# Tokenizer that matches the checkpoint being fine-tuned.
tokenizer = AutoTokenizer.from_pretrained(
    "mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis"
)


def preprocess_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Intended to be passed to ``Dataset.map(..., batched=True)``.

    Args:
        examples: Mapping with a "text" entry holding the strings to encode.

    Returns:
        The tokenizer's output for those strings, padded with
        ``padding='max_length'`` and truncated.
    """
    # NOTE(review): padding='max_length' pads every example to a fixed length
    # rather than to the longest item in a batch; for short headlines that is
    # a lot of padding. Changing it would also require a padding collator in
    # the Trainer, so it is left as-is here.
    batch_text = examples["text"]
    return tokenizer(batch_text, padding='max_length', truncation=True)

In [None]:
# Load the prepared CSV and keep only the headline text and its label.
TRAINING_DATA = pd.read_csv('../data/jonpg_prepped_data.csv').loc[:, ['headlines', 'label']]
print(len(TRAINING_DATA))

# Plain Python lists are what train_test_split / Dataset.from_dict receive below.
text = TRAINING_DATA['headlines'].to_list()
labels = TRAINING_DATA['label'].to_list()

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
training_text, testing_text, training_labels, testing_labels = train_test_split(
    text,
    labels,
    test_size=0.20,
    random_state=42,
)

# Wrap each split as a Hugging Face Dataset with "text" / "label" columns.
train_dataset = Dataset.from_dict({
    'text': training_text,
    'label': training_labels,
})
eval_dataset = Dataset.from_dict({
    'text': testing_text,
    'label': testing_labels,
})

# Tokenize both splits in batches.
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_val = eval_dataset.map(preprocess_function, batched=True)

45463


Map: 100%|██████████| 36370/36370 [00:04<00:00, 8744.72 examples/s]
Map: 100%|██████████| 9093/9093 [00:00<00:00, 9105.92 examples/s]


# 3. Fine-tune the Model on the Data

In [None]:
# Load the pretrained checkpoint with a three-class classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    "mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis",
    num_labels=3,
)

# Rename the three class ids for this task. Both directions of the mapping are
# set together: overriding only id2label would leave config.label2id holding
# whatever names the checkpoint was loaded with, so the saved config would
# disagree with itself.
model.config.id2label = {0: 'decrease', 1: 'sustain', 2: 'increase'}
model.config.label2id = {'decrease': 0, 'sustain': 1, 'increase': 2}

In [31]:
# Fine-tuning hyperparameters. Checkpoints are written under output_dir.
# NOTE(review): output_dir is an absolute, machine-specific path; consider a
# project-relative path so the notebook runs on other machines.
training_args = TrainingArguments(
    output_dir='C:/Users/Jon/Documents/Career/Projects/SDSPNLP/results',
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Bind the model, the hyperparameters and the two tokenized splits.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
)

# A trailing `DataLoaderConfiguration(...)` assignment used to live here. It
# referenced a name this notebook never imports (NameError on a fresh kernel)
# and its result was not passed to the Trainer, so it has been removed.


In [32]:
# Run the fine-tuning loop configured on `trainer`.
trainer.train()

# Write the resulting model to its own sub-folder of the results directory.
trainer.save_model(
    output_dir='C:/Users/Jon/Documents/Career/Projects/SDSPNLP/results/jonpg_model'
)

                                                   
  0%|          | 1/1707 [3:05:06<10:59:08, 23.18s/it]  

{'loss': 1.1324, 'grad_norm': 2.4289326667785645, 'learning_rate': 2.780123131046614e-05, 'epoch': 0.22}


                                                     
  0%|          | 1/1707 [6:08:53<10:59:08, 23.18s/it]   

{'loss': 1.1027, 'grad_norm': 2.158280372619629, 'learning_rate': 2.5602462620932278e-05, 'epoch': 0.44}


                                                     
  0%|          | 1/1707 [9:17:54<10:59:08, 23.18s/it]   

{'loss': 1.1004, 'grad_norm': 1.720942735671997, 'learning_rate': 2.3403693931398417e-05, 'epoch': 0.66}


                                                     
  0%|          | 1/1707 [12:24:00<10:59:08, 23.18s/it]   

{'loss': 1.0957, 'grad_norm': 2.0229299068450928, 'learning_rate': 2.120492524186456e-05, 'epoch': 0.88}


                                                      
  0%|          | 1/1707 [15:31:13<10:59:08, 23.18s/it]   

{'loss': 1.0929, 'grad_norm': 1.4955581426620483, 'learning_rate': 1.9006156552330694e-05, 'epoch': 1.1}


                                                      
  0%|          | 1/1707 [18:38:31<10:59:08, 23.18s/it]   

{'loss': 1.0921, 'grad_norm': 2.103712320327759, 'learning_rate': 1.6807387862796836e-05, 'epoch': 1.32}


                                                      
  0%|          | 1/1707 [21:46:15<10:59:08, 23.18s/it]   

{'loss': 1.0919, 'grad_norm': 1.5835533142089844, 'learning_rate': 1.4608619173262973e-05, 'epoch': 1.54}


                                                      
  0%|          | 1/1707 [24:54:01<10:59:08, 23.18s/it]   

{'loss': 1.0899, 'grad_norm': 3.2498905658721924, 'learning_rate': 1.2409850483729111e-05, 'epoch': 1.76}


                                                      
  0%|          | 1/1707 [28:05:13<10:59:08, 23.18s/it]   

{'loss': 1.0873, 'grad_norm': 1.8603187799453735, 'learning_rate': 1.0211081794195251e-05, 'epoch': 1.98}


                                                      
  0%|          | 1/1707 [31:11:38<10:59:08, 23.18s/it]   

{'loss': 1.0832, 'grad_norm': 1.5468379259109497, 'learning_rate': 8.01231310466139e-06, 'epoch': 2.2}


                                                      
  0%|          | 1/1707 [34:20:11<10:59:08, 23.18s/it]  

{'loss': 1.0789, 'grad_norm': 4.630640506744385, 'learning_rate': 5.813544415127529e-06, 'epoch': 2.42}


                                                      
  0%|          | 1/1707 [37:33:08<10:59:08, 23.18s/it]  

{'loss': 1.0748, 'grad_norm': 4.236911296844482, 'learning_rate': 3.6147757255936676e-06, 'epoch': 2.64}


