In [1]:
import warnings
# Silence every Python warning for the rest of the session. This runs before
# the remaining imports so warnings raised while importing them are hidden too.
# NOTE(review): a blanket 'ignore' also hides deprecation warnings coming from
# the libraries used below; consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

import random
import pandas as pd
from tqdm import tqdm
import numpy as np

"""
Sklearn Libraries
"""
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

"""
Transformer Libraries
"""
from transformers import BertTokenizer, AutoModelForSequenceClassification, AdamW, get_linear_schedule_with_warmup

"""
Pytorch Libraries
"""
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset


In [2]:
# Load the news dataset from disk. The file is decoded as Latin-1 instead of
# pandas' default UTF-8.
financial_data = pd.read_csv(
    './data/NEWS_data.csv',
    encoding='latin-1',
)

In [None]:
# Preview the first rows of the dataset.
# Label encoding: positive = 1, negative = 0, ambiguous = 2.
financial_data.head(n=5)

In [4]:
# Separate the model inputs (the 'news' column) from the targets (the 'label' column).
X_data, Y_data = financial_data['news'], financial_data['label']

In [5]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split reproducible across runs.
# NOTE(review): the split is not stratified by label — if the classes are
# imbalanced, consider passing stratify=Y_data.
(
    X_train_argriculture,
    X_val_argriculture,
    Y_train_argriculture,
    Y_val_argriculture,
) = train_test_split(
    X_data,
    Y_data,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Upper bound on tokens per example, passed explicitly so truncation does not
# depend on whatever maximum length the tokenizer config happens to declare.
# NOTE(review): 512 is the usual BERT-base position-embedding limit — confirm
# against the checkpoint's config (max_position_embeddings).
MAX_SEQ_LEN = 512


def encode_texts(tokenizer, texts, max_length=MAX_SEQ_LEN):
    """Tokenize a collection of strings into padded PyTorch tensors.

    Args:
        tokenizer: A Hugging Face tokenizer instance.
        texts: Iterable of strings to encode (a pandas Series is accepted).
        max_length: Sequences longer than this many tokens are truncated.

    Returns:
        The tokenizer's batch encoding, holding 'input_ids' and
        'attention_mask' tensors padded to the longest sequence in `texts`.
    """
    return tokenizer(
        list(texts),  # the tokenizer call expects a plain list of str
        add_special_tokens=True,
        return_attention_mask=True,
        # padding=True pads to the longest sequence in the batch. It replaces
        # the deprecated pad_to_max_length=True, which behaved the same way
        # when no max_length was given.
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors='pt',
    )


# Get the FinBERT Tokenizer
finbert_tokenizer = BertTokenizer.from_pretrained("ProsusAI/finbert",
                                                  do_lower_case=True)

# Encode the training and validation texts. Each split is padded to its own
# longest sequence, so the two input_ids tensors may differ in width.
encoded_data_train = encode_texts(finbert_tokenizer, X_train_argriculture)
encoded_data_val = encode_texts(finbert_tokenizer, X_val_argriculture)

# Unpack the training tensors and convert the labels to a tensor.
input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(np.array(Y_train_argriculture))

# Same for the validation split.
input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(np.array(Y_val_argriculture))

# Bundle (input_ids, attention_mask, label) triples so they can be batched together.
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

In [3]:
# Load the FinBERT checkpoint together with its sequence-classification head.
# NOTE(review): the checkpoint ships with its own label order, which may not
# match this dataset's encoding (positive = 1, negative = 0, ambiguous = 2) —
# confirm via model.config.id2label before interpreting raw predictions.
finbert_checkpoint = "ProsusAI/finbert"
model = AutoModelForSequenceClassification.from_pretrained(finbert_checkpoint)

Downloading:   0%|          | 0.00/758 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

: 

: 

In [None]:
# Number of examples per mini-batch, shared by both loaders.
batch_size = 5

# Training batches are drawn in random order.
train_sampler = RandomSampler(dataset_train)
dataloader_train = DataLoader(
    dataset_train,
    sampler=train_sampler,
    batch_size=batch_size,
)

# Validation batches are read in dataset order.
validation_sampler = SequentialSampler(dataset_val)
dataloader_validation = DataLoader(
    dataset_val,
    sampler=validation_sampler,
    batch_size=batch_size,
)