In [1]:
# Connect to Drive for Colab Execution
from google.colab import drive
import os


drive.mount('/content/gdrive/', force_remount=True)

# get to correct Directory
%cd gdrive/MyDrive
os.chdir('./Adv_ML')
os.listdir('./')

Mounted at /content/gdrive/
/content/gdrive/MyDrive


['sentiment_dataset.csv',
 'RL_agent_nb.ipynb',
 'Sentiment_Dset.csv',
 'bert_language_model_with_sequence_classification.ipynb',
 'model_without_language_model.ckpt',
 'Creare presentazioni memorabili.gslides',
 'TrainBertModel.ipynb']

In [2]:
# Imports
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer
import torch
from tqdm import  tqdm_notebook
from tqdm import tqdm
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss
from torch.optim import *
from transformers import get_linear_schedule_with_warmup
from transformers import BertForSequenceClassification



## Set GPU device if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [8]:
# Load the dataset
dataset_name = 'sentiment_dataset.csv'
file_path = "./" + dataset_name
directory_path = "./"
df = pd.read_csv(file_path, delimiter=",")

# Split the dataset into training and validation sets
train_df, val_df = train_test_split(df, test_size=0.2)

## DATA CLEAN -> LOWERCASE, PUNCTUATION, REMOVE STOPWORDS


## OBTAIN MAX LENGTH OF TOKENIZED DSET

In [9]:
class NLPDataset(Dataset):
    def __init__(self, phrases, labels, tokenizer, max_len):
        self.phrases = phrases
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.phrases)

    def __getitem__(self, item):
        phrase = str(self.phrases[item])
        label = self.labels[item]

        encoding = self.tokenizer.encode_plus(
            phrase,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )

        return {
            'phrase_text': phrase,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [10]:
# Initialize tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Create datasets
max_len = 258
train_dataset = NLPDataset(train_df['phrase'].to_numpy(), train_df['risk_level'].to_numpy(), tokenizer, max_len)
val_dataset = NLPDataset(val_df['phrase'].to_numpy(), val_df['risk_level'].to_numpy(), tokenizer, max_len)

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)


In [11]:
next(iter(train_loader))

{'phrase_text': ['In pursuit of low-risk investments with predictable returns within a medium timeframe, balancing patience and progress.',
  'Conservatively approaching high-growth ventures with potential for significant returns over the long haul',
  "I'm looking for low-risk investments with predictable returns in the short term",
  'Committed to identifying stable, low-volatility assets for consistent gains regardless of time horizon',
  'In pursuit of low-risk investments with predictable returns in the short term',
  'Broadening horizons with stable, low-volatility assets for consistent gains in the short term, aiming for immediate profits.',
  'Committed to identifying in high-risk, high-reward investments in the short term',
  "I'm looking for in high-risk, high-reward investments and am looking at the medium term",
  "I'm looking for moderately aggressive investments for balanced growth within a medium timeframe",
  'Optimistically engaging in safe investment options can inves

In [12]:
# Training Parameters
lr = 2e-5
max_grad_norm = 1.0
num_total_steps = 1000
num_warmup_steps = 100
warmup_proportion = float(num_warmup_steps) / float(num_total_steps)  # 0.1


# Fetch Model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased',num_labels=3)

optimizer = AdamW(model.parameters(), lr=lr)
loss_fn = CrossEntropyLoss().to(device)

### In PyTorch-Transformers, optimizer and schedules are splitted and instantiated like this:
optimizer = AdamW(model.parameters(), lr=lr)  # To reproduce BertAdam specific behavior set correct_bias=False
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps = -1)  #

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
model.to(device)

total_step = len(train_loader)

# Store our loss and accuracy for plotting
train_loss_set = []


epochs = 30

# trange is a tqdm wrapper around the normal python range
for epoch in tqdm_notebook(range(epochs)):

    # Training
    # Set our model to training mode (as opposed to evaluation mode)
    model.train()

    # Tracking variables
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    # Train the data for one epoch
    for i, batch in enumerate(train_loader):
      # Unpack batch and move to GPU
      b_input_ids = batch['input_ids'].to(device)
      b_input_mask = batch['attention_mask'].to(device)
      b_labels = batch['labels'].to(device)

      # Forward pass
      outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
      loss = outputs[0]
      train_loss_set.append(loss.item())
      # Backward pass
      loss.backward()
      # Update parameters and take a step using the computed gradient
      optimizer.step()
      scheduler.step()
      optimizer.zero_grad()
      if (i) % 50 == 0:
        print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
                   .format(epoch+1, epochs, i+1, total_step, loss.item()))

In [14]:
# Test the model
with torch.no_grad():
    correct = 0
    total = 0
    for i, batch in enumerate(val_loader):
      # Unpack the inputs from our dataloader
      b_input_ids = batch['input_ids'].to(device)
      b_input_mask = batch['attention_mask'].to(device)
      b_labels = batch['labels'].to(device)

      # Forward pass
      outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
      # print (outputs)
      prediction = torch.argmax(outputs[0],dim=1)
      total += b_labels.size(0)
      correct+=(prediction==b_labels).sum().item()

In [15]:
print('Test Accuracy of the model on val data is: {} %'.format(100 * correct / total))

Test Accuracy of the model on val data is: 98.56459330143541 %


In [16]:
# Save the Model
torch.save(model.state_dict(), directory_path+'model_without_language_model.ckpt')