In [1]:
# !pip install datasets
# !pip install torch
# !pip install transformers
# !pip install evaluate

# # !pip install accelerate -U

# !pip install torch


In [2]:
import pandas as pd
import numpy as np

from datasets import Dataset, DatasetDict, load_metric, load_dataset
import torch
from transformers import DataCollatorWithPadding


2024-03-30 08:12:47.393006: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-30 08:12:47.393211: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-30 08:12:47.570882: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# Read the financial sentiment CSV from the Kaggle input directory and display it
csv_path = '/kaggle/input/financial-sentiment-analysis/data.csv'
data = pd.read_csv(csv_path)
data

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,According to the Finnish-Russian Chamber of Co...,neutral
4,The Swedish buyout firm has sold its remaining...,neutral
...,...,...
5837,RISING costs have forced packaging producer Hu...,negative
5838,Nordic Walking was first used as a summer trai...,neutral
5839,"According shipping company Viking Line , the E...",neutral
5840,"In the building and home improvement trade , s...",neutral


### Convert Pandas DataFrame to Datasets Format; Then Split into Train/Test/Valid Datasets

In [4]:

# Fixed seed so train/test/valid membership is identical on every run; without it
# each execution reshuffles the rows and results are not comparable between runs.
SPLIT_SEED = 42

# Convert the pandas DataFrame to a Hugging Face Dataset
full_dataset = Dataset.from_pandas(data)

# Hold out 30% of the rows, then split that hold-out in half into test and
# validation subsets (roughly 70/15/15 overall)
train_testvalid = full_dataset.train_test_split(test_size=0.30, seed=SPLIT_SEED)
test_valid = train_testvalid['test'].train_test_split(test_size=0.50, seed=SPLIT_SEED)

# Combine the train/test/valid subsets into one DatasetDict
dataset = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train'],
})

print('Training Data Shape:', dataset['train'].shape)
print('Testing Data Shape:', dataset['test'].shape)
print('Validation Data Shape:', dataset['valid'].shape)

Training Data Shape: (4089, 2)
Testing Data Shape: (877, 2)
Validation Data Shape: (876, 2)


In [5]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, AutoTokenizer, AutoModel

# Name of the pretrained checkpoint handed to the from_pretrained() calls
checkpoint = "bert-base-uncased"

In [6]:
# Instantiate the tokenizer that matches the pretrained checkpoint
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Load BERT with a sequence-classification head sized for the three sentiment
# classes. The bare AutoModel only returns hidden states: it builds no classifier
# layer from num_labels and produces neither logits nor a loss.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

### Preprocessing and tokenization of input data

In [7]:
# Tokenization function
def tokenize_function(example):
    """Tokenize the "Sentence" field of `example` with the notebook's tokenizer.

    Truncation is enabled; the tokenizer's output is returned unchanged.
    """
    sentences = example["Sentence"]
    encoded = tokenizer(sentences, truncation=True)
    return encoded

# Convert the sentiment strings to integer class ids
def label(x):
    """Map the 'Sentiment' value of a row to its integer class id.

    'positive' -> 1, 'negative' -> 2, any other value -> 0.

    A value that is already one of the ids 0, 1 or 2 is returned unchanged.
    This makes the mapping idempotent: applying it a second time (for example
    by re-running the cell that maps it over the dataset) no longer collapses
    every label to 0.

    Args:
        x: mapping with a 'Sentiment' key.

    Returns:
        A dict {'Sentiment': <int id>}.
    """
    sentiment = x['Sentiment']

    # Already encoded: pass through instead of falling into the catch-all branch
    if isinstance(sentiment, int) and sentiment in (0, 1, 2):
        return {'Sentiment': int(sentiment)}

    if sentiment == 'positive':
        return {'Sentiment': 1}
    elif sentiment == 'negative':
        return {'Sentiment': 2}
    else:
        return {'Sentiment': 0}



In [8]:
# Collator that pads the examples of a batch using the tokenizer
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Replace the sentiment strings with their integer class ids
dataset = dataset.map(label)

# Tokenize every split, processing the rows in batches
tokenized_datasets = dataset.map(tokenize_function, batched=True)

  0%|          | 0/4089 [00:00<?, ?ex/s]

  0%|          | 0/877 [00:00<?, ?ex/s]

  0%|          | 0/876 [00:00<?, ?ex/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [9]:
# Display the tokenized DatasetDict (splits, features and row counts)
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['Sentence', 'Sentiment', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4089
    })
    test: Dataset({
        features: ['Sentence', 'Sentiment', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 877
    })
    valid: Dataset({
        features: ['Sentence', 'Sentiment', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 876
    })
})

In [10]:
# Drop the raw text column, which is no longer needed, then rename "Sentiment"
# to "labels" so that it can be understood by the BERT model
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(["Sentence"])
    .rename_column("Sentiment", "labels")
)

In [11]:
# Switch the output format of all splits to "torch"
tokenized_datasets.set_format(type="torch")

# Show the columns that remain in the training split
tokenized_datasets["train"].column_names

['labels', 'input_ids', 'token_type_ids', 'attention_mask']

### Define PyTorch data loaders

In [12]:
from torch.utils.data import DataLoader

# Settings shared by all three loaders: batches of 8, collated by data_collator
loader_options = dict(batch_size=8, collate_fn=data_collator)

# Only the training loader shuffles its examples
train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, **loader_options)

# Validation and test loaders keep the dataset order
eval_dataloader = DataLoader(tokenized_datasets["valid"], **loader_options)
test_dataloader = DataLoader(tokenized_datasets["test"], **loader_options)