In [1]:
import pandas as pd
df = pd.read_csv('/Users/kailiu/StockMarketPrediction-/data/all-data.csv',encoding='latin1')
df.head()

Unnamed: 0,neutral,"According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing ."
0,neutral,Technopolis plans to develop in stages an area...
1,negative,The international electronic industry company ...
2,positive,With the new production plant the company woul...
3,positive,According to the company 's updated strategy f...
4,positive,FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...


In [2]:
#rename the columns
df = df.rename(columns={'neutral':'label','According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .':'Message'})
df.head()

Unnamed: 0,label,Message
0,neutral,Technopolis plans to develop in stages an area...
1,negative,The international electronic industry company ...
2,positive,With the new production plant the company woul...
3,positive,According to the company 's updated strategy f...
4,positive,FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...


In [3]:
## transform df to dataset
from datasets import Dataset
df = Dataset.from_pandas(df[['label', 'Message']])
df

Dataset({
    features: ['label', 'Message'],
    num_rows: 4845
})

In [4]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis")

def preprocess_function(examples):
    return tokenizer(examples["Message"], truncation=True, padding=True, return_tensors="pt")

tokenized_ds = df.map(preprocess_function, batched=True)
print(tokenized_ds)

tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Map:   0%|          | 0/4845 [00:00<?, ? examples/s]

Dataset({
    features: ['label', 'Message', 'input_ids', 'attention_mask'],
    num_rows: 4845
})


In [5]:
id2label = { "0": "negative", 
    "1": "neutral", 
    "2": "positive"}
label2id = {"negative": 0, "neutral": 1, "positive": 2}
tokenized_ds = tokenized_ds.map(lambda examples: {'label': [label2id[label] for label in examples['label']]}, batched=True)


Map:   0%|          | 0/4845 [00:00<?, ? examples/s]

In [6]:
# split the dataset into train and validation
tokenized_ds = tokenized_ds.train_test_split(test_size=0.1)
tokenized_ds

DatasetDict({
    train: Dataset({
        features: ['label', 'Message', 'input_ids', 'attention_mask'],
        num_rows: 4360
    })
    test: Dataset({
        features: ['label', 'Message', 'input_ids', 'attention_mask'],
        num_rows: 485
    })
})

In [7]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [8]:
import evaluate

accuracy = evaluate.load("accuracy")

In [9]:
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [10]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer,AdamW

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilroberta-base", num_labels=3, id2label=id2label, label2id=label2id
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilbert/distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


learning_rate: 2e-05
train_batch_size: 8
eval_batch_size: 8
seed: 42
optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
lr_scheduler_type: linear
num_epochs: 5

In [11]:
class CustomTrainer(Trainer):
    def create_optimizer(self):
        # Create a custom AdamW optimizer with specific betas and epsilon
        self.optimizer = AdamW(
            self.model.parameters(),
            lr=self.args.learning_rate,
            betas=(0.9, 0.999),  # Set the betas as specified
            eps=1e-08  # Set the epsilon as specified
        )
        return self.optimizer

# Define your training arguments
training_args = TrainingArguments(
    output_dir="models/distilbert-base-uncased-financial-finetune",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
    seed=42,
    lr_scheduler_type="linear"
)

# Use the custom trainer
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

  trainer = CustomTrainer(


  0%|          | 0/2725 [00:00<?, ?it/s]

{'loss': 0.5068, 'grad_norm': 19.376386642456055, 'learning_rate': 1.63302752293578e-05, 'epoch': 0.92}


  0%|          | 0/61 [00:00<?, ?it/s]

{'eval_loss': 0.33032816648483276, 'eval_accuracy': 0.8597938144329897, 'eval_runtime': 2.4682, 'eval_samples_per_second': 196.497, 'eval_steps_per_second': 24.714, 'epoch': 1.0}
{'loss': 0.3416, 'grad_norm': 28.91315460205078, 'learning_rate': 1.2660550458715597e-05, 'epoch': 1.83}


  0%|          | 0/61 [00:00<?, ?it/s]

{'eval_loss': 0.48314642906188965, 'eval_accuracy': 0.8804123711340206, 'eval_runtime': 1.9975, 'eval_samples_per_second': 242.804, 'eval_steps_per_second': 30.538, 'epoch': 2.0}
{'loss': 0.2431, 'grad_norm': 3.1638665199279785, 'learning_rate': 8.990825688073395e-06, 'epoch': 2.75}


  0%|          | 0/61 [00:00<?, ?it/s]

{'eval_loss': 0.6459629535675049, 'eval_accuracy': 0.865979381443299, 'eval_runtime': 1.9772, 'eval_samples_per_second': 245.295, 'eval_steps_per_second': 30.852, 'epoch': 3.0}
{'loss': 0.1567, 'grad_norm': 9.064741134643555, 'learning_rate': 5.3211009174311936e-06, 'epoch': 3.67}


  0%|          | 0/61 [00:00<?, ?it/s]

{'eval_loss': 0.7302596569061279, 'eval_accuracy': 0.8618556701030928, 'eval_runtime': 2.0135, 'eval_samples_per_second': 240.869, 'eval_steps_per_second': 30.295, 'epoch': 4.0}
{'loss': 0.1086, 'grad_norm': 13.297479629516602, 'learning_rate': 1.6513761467889911e-06, 'epoch': 4.59}


  0%|          | 0/61 [00:00<?, ?it/s]

{'eval_loss': 0.7604742646217346, 'eval_accuracy': 0.8536082474226804, 'eval_runtime': 2.035, 'eval_samples_per_second': 238.331, 'eval_steps_per_second': 29.976, 'epoch': 5.0}
{'train_runtime': 415.6933, 'train_samples_per_second': 52.443, 'train_steps_per_second': 6.555, 'train_loss': 0.2576276971659529, 'epoch': 5.0}


TrainOutput(global_step=2725, training_loss=0.2576276971659529, metrics={'train_runtime': 415.6933, 'train_samples_per_second': 52.443, 'train_steps_per_second': 6.555, 'total_flos': 726110226812880.0, 'train_loss': 0.2576276971659529, 'epoch': 5.0})

In [12]:
from accelerate import Accelerator

# Reinitialize the Accelerator
accelerator = Accelerator()

trainer.evaluate()

  0%|          | 0/61 [00:00<?, ?it/s]

{'eval_loss': 0.33032816648483276,
 'eval_accuracy': 0.8597938144329897,
 'eval_runtime': 2.0446,
 'eval_samples_per_second': 237.211,
 'eval_steps_per_second': 29.835,
 'epoch': 5.0}

In [13]:
trainer.save_model("../models/distilbert-base-uncased-financial-finetune")