# Import Packages

In [1]:
import numpy as np
import evaluate
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoTokenizer
import torch
from datasets import Dataset
from typing import Optional

  from .autonotebook import tqdm as notebook_tqdm


# 1. Tokenize dataset and 2. Train Test Splitting

In [2]:
tokenizer = AutoTokenizer.from_pretrained("mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis")
def preprocess_function(examples):
    return tokenizer(examples["text"], padding='max_length', truncation=True)

In [4]:
TRAINING_DATA = pd.read_csv('../data/jonpg_prepped_data.csv')[['headlines','label']]
print(len(TRAINING_DATA))
text = TRAINING_DATA['headlines'].tolist()

labels = TRAINING_DATA['label'].tolist()


# Split another testing set in case I want to do my own testing
# text_train, text_test, label_train, label_test = train_test_split(
#     train_text, train_label, test_size=0.10, random_state=42)
training_text, testing_text, training_labels, testing_labels = train_test_split(
    text, labels, test_size=0.20, random_state=42)

train_dataset = Dataset.from_dict({'text':training_text, 'label':training_labels})
eval_dataset = Dataset.from_dict({'text':testing_text, 'label':testing_labels})

tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_val = eval_dataset.map(preprocess_function, batched=True)

45463


Map: 100%|██████████| 36370/36370 [00:04<00:00, 8793.76 examples/s]
Map: 100%|██████████| 9093/9093 [00:00<00:00, 9133.18 examples/s]


# 3. Fine-tune Data to the Model

In [5]:
model = AutoModelForSequenceClassification.from_pretrained("mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis", num_labels=3)
model.config.id2label = {0:'decrease', 1:'sustain',2:'increase'}

In [7]:
training_args = TrainingArguments(
    output_dir='C:/Users/Jon/Documents/Career/Projects/SDSPNLP/results',
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val
)

In [8]:
trainer.train()
trainer.save_model('C:/Users/Jon/Documents/Career/Projects/SDSPNLP/results/jonpg_model')

 22%|██▏       | 500/2274 [3:16:25<11:35:23, 23.52s/it]

{'loss': 1.1502, 'grad_norm': 2.401078462600708, 'learning_rate': 2.3403693931398417e-05, 'epoch': 0.22}


 44%|████▍     | 1000/2274 [6:18:51<7:38:13, 21.58s/it]

{'loss': 1.0999, 'grad_norm': 2.466158390045166, 'learning_rate': 1.6807387862796836e-05, 'epoch': 0.44}


 66%|██████▌   | 1500/2274 [9:17:11<4:43:32, 21.98s/it]

{'loss': 1.0914, 'grad_norm': 1.3107244968414307, 'learning_rate': 1.0211081794195251e-05, 'epoch': 0.66}


 88%|████████▊ | 2000/2274 [12:16:25<1:37:08, 21.27s/it]

{'loss': 1.0925, 'grad_norm': 2.427618980407715, 'learning_rate': 3.6147757255936676e-06, 'epoch': 0.88}


100%|██████████| 2274/2274 [13:55:14<00:00, 22.04s/it]  

{'train_runtime': 50114.8865, 'train_samples_per_second': 0.726, 'train_steps_per_second': 0.045, 'train_loss': 1.10621349163609, 'epoch': 1.0}





# 4. Create an Inference Function

In [1]:
import re
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

def model_call(
        inputs: list,
        output_max_prob: bool = True):
    """model_call takes in 
    parameter:inputs -> string or list of strings 
    and outputs the jonpg_model 
    results: tensor(decrease, sustain, increase)
    
    output_max_prob will return the probabilities and their respective classes."""

    # First we need to clean the data 
    def replace_currency_symbols(text):
    # Dictionary mapping currency symbols to their acronyms
        currency_dict = {
            '£': 'GBP ',   # British Pound
            '€': 'EUR ',   # Euro
            '$': 'USD ',   # US Dollar
            '¥': 'JPY ',   # Japanese Yen
            '₹': 'INR ',   # Indian Rupee
            '₽': 'RUB ',   # Russian Ruble
            '₩': 'KRW ',   # South Korean Won
            '฿': 'THB ',   # Thai Baht
            '₺': 'TRY ',   # Turkish Lira
            '₪': 'ILS '    # Israeli Shekel
        }

        # Regular expression pattern to match any of the currency symbols
        pattern = re.compile('|'.join(re.escape(symbol) for symbol in currency_dict.keys()))

        # Function to replace a matched symbol with its acronym
        def replace_symbol(match):
            return currency_dict[match.group(0)]

        # Replace all currency symbols in the text
        return pattern.sub(replace_symbol, text)
    def clean_text(text):
        # Add a space before and after punctuation marks
        text = re.sub(r'([.,!?;:])', r' \1 ', replace_currency_symbols(text))
        
        # Remove newlines and tabs
        text = text.replace('\n', ' ').replace('\t', ' ')
        
        # Remove any extra spaces that might have been introduced
        text = re.sub(r'\s+', ' ', text).strip()
        
        return text
    model = AutoModelForSequenceClassification.from_pretrained('C:/Users/Jon/Documents/Career/Projects/SDSPNLP/results/jonpg_model')
    tokenizer = AutoTokenizer.from_pretrained("mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis")
    # Run the str or list through the clean text function
    if type(inputs) == str:
        inputs = clean_text(inputs)
        
        tv = torch.tensor([tokenizer.encode(inputs, padding="max_length", max_length=64, truncation=True)])
    else:
        inputs = [clean_text(i) for i in inputs]
        
        tv = torch.tensor([tokenizer.encode(x, padding="max_length", max_length=64, truncation=True) for x in inputs])
    

    # Now call in the model from file with tokenize cleaned inputs
    attention_mask = (tv != 0).type(torch.int64)
    # Grab logits or probabilities and return them
    outputs = model(tv, attention_mask=attention_mask)[0].detach().cpu()

    if output_max_prob:
        outputs = torch.sigmoid(outputs)

    return outputs.numpy()
        

  from .autonotebook import tqdm as notebook_tqdm


### Examples

In [2]:
model_call(inputs=['Stocks are going up $$$$$$$$$$$$$$$!!!!', 'something weird is up with Cramer.'], output_max_prob=False)

array([[ 0.02425274, -0.00695869,  0.02736503],
       [ 0.02038652,  0.0454974 , -0.02326715]], dtype=float32)

In [3]:
model_call(inputs='something is going on in the stock market .', output_max_prob=True)

array([[0.50625896, 0.51019675, 0.4976106 ]], dtype=float32)

# Model Explanation

In [4]:
# Run model_call with necassary packages first
import shap
tokenizer = AutoTokenizer.from_pretrained("mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis")

In [5]:
labels = [
    'decrease',
    'sustain',
    'increase'
]


In [6]:
explainer = shap.Explainer(model_call, tokenizer, output_names=labels)

In [7]:
text_example = ['Stocks are going up $$$$$$$$$$$$$$$!!!!', 'something weird is up with Cramer.', 'something is going on in the stock market .']

shap_values = explainer(text_example)

PartitionExplainer explainer: 4it [00:50, 16.75s/it]                       


In [8]:
shap.plots.text(shap_values)