In [25]:
from huggingface_hub import notebook_login

notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [1]:
import pandas as pd
df = pd.read_csv('../data/processed_data_news.csv')
df.head()

Unnamed: 0,pubDate,summary,label,score
0,2023-09-01,Amazon.com (NASDAQ:AMZN) has taken a bearish s...,1,0.759611
1,2022-08-05,E-commerce giant Amazon (AMZN) on Friday annou...,1,0.593363
2,2024-09-25,Amazon.com (NASDAQ:AMZN) has seen unusual acti...,0,0.746989
3,2024-10-23,Amazon.com (NASDAQ:AMZN) shares are currently ...,0,0.459074
4,2022-08-24,"Amazon is shutting down Amazon Care, a healthc...",1,0.811488


In [27]:
## transform df to dataset
from datasets import Dataset
df = Dataset.from_pandas(df[['summary', 'label']])
df


Dataset({
    features: ['summary', 'label'],
    num_rows: 1225
})

In [28]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")

def preprocess_function(examples):
    return tokenizer(examples["summary"], truncation=True)

tokenized_ds = df.map(preprocess_function, batched=True)
print(tokenized_ds)

Map:   0%|          | 0/1225 [00:00<?, ? examples/s]

Dataset({
    features: ['summary', 'label', 'input_ids', 'attention_mask'],
    num_rows: 1225
})


In [29]:
# split the dataset into train and validation
tokenized_ds = tokenized_ds.train_test_split(test_size=0.1)


In [30]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [31]:
import evaluate

accuracy = evaluate.load("accuracy")

In [32]:
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [33]:
id2label = { "0": "negative", 
    "1": "neutral", 
    "2": "positive"}
label2id = {"negative": 0, "neutral": 1, "positive": 2}

In [34]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", num_labels=3, id2label=id2label, label2id=label2id
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [35]:
training_args = TrainingArguments(
    output_dir="models/distilbert-base-uncased-financial-finetune",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.711216,0.731707
2,No log,0.616408,0.764228
3,No log,0.665655,0.764228
4,No log,0.663677,0.747967
5,No log,0.664675,0.756098


TrainOutput(global_step=345, training_loss=0.5258277782495471, metrics={'train_runtime': 164.8162, 'train_samples_per_second': 33.431, 'train_steps_per_second': 2.093, 'total_flos': 360571822847244.0, 'train_loss': 0.5258277782495471, 'epoch': 5.0})

In [36]:
from accelerate import Accelerator

# Reinitialize the Accelerator
accelerator = Accelerator()

trainer.evaluate()

{'eval_loss': 0.6164084672927856,
 'eval_accuracy': 0.7642276422764228,
 'eval_runtime': 0.9963,
 'eval_samples_per_second': 123.463,
 'eval_steps_per_second': 8.03,
 'epoch': 5.0}

In [37]:
trainer.push_to_hub()

CommitInfo(commit_url='https://huggingface.co/liukai1987/distilbert-base-uncased-financial-finetune/commit/a2e52d9995bb6bfd17811b4c7799f4c3aee83509', commit_message='End of training', commit_description='', oid='a2e52d9995bb6bfd17811b4c7799f4c3aee83509', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
trainer.save_model("models/distilbert-base-uncased-financial-finetune")