In [74]:
import re
import os
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaModel, RobertaConfig, RobertaTokenizer, AutoTokenizer

In [75]:
rootPath = "C:/Users/Roger/Documents/PycharmProjects/mathsForDL1/finalProject/"
relTrainCSVpath = "CONDA/data/CONDA_train.csv"
relTestCSVpath = "CONDA/data/CONDA_test.csv"
relValidationCSVpath = "CONDA/data/CONDA_valid.csv"

In [76]:
trainDF = pd.read_csv(os.path.join(rootPath,relTrainCSVpath))
testDF = pd.read_csv(os.path.join(rootPath,relTestCSVpath))
valDF = pd.read_csv(os.path.join(rootPath,relValidationCSVpath))

In [77]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

MAX_LEN = 200
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 10
LEARNING_RATE = 1e-05
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

In [78]:
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.utterance = dataframe.utterance
        self.targets = self.data.intentClass
        self.max_len = max_len

    def __len__(self):
        return len(self.utterance)

    def __getitem__(self, index):
        utterance = str(self.utterance[index])
        utterance = " ".join(utterance.split())

        inputs = self.tokenizer.encode_plus(
            utterance,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
            }

In [79]:
def preprocess_dataset(text):
    if isinstance(text, str):
        cleaned_text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
        cleaned_text = re.sub(r'\s*\[SEPA\]\s*', '', cleaned_text) #Remove the string "[SEPA]"
        return cleaned_text
    else:
        return str(text)

In [80]:
# print(trainDF['utterance'][:6])
trainDF['utterance'] = trainDF['utterance'].apply(preprocess_dataset)
testDF['utterance'] = testDF['utterance'].apply(preprocess_dataset)
valDF['utterance'] =valDF['utterance'].apply(preprocess_dataset)
# trainDF['utterance']

In [81]:
trainencodedDF = pd.get_dummies(trainDF['intentClass'], prefix='intentClass')
# testencodedDF = pd.get_dummies(testDF['intentClass'], prefix='intentClass')
valencodedDF = pd.get_dummies(valDF['intentClass'], prefix='intentClass')

train_encoded_list = trainencodedDF.values.tolist()
# test_encoded_list = testencodedDF.values.tolist()
val_encoded_list = valencodedDF.values.tolist()

trainDF['Encoded'] = train_encoded_list
# testDF['Encoded'] = testencodedDF
valDF['Encoded'] = val_encoded_list

In [82]:
train_size = 0.8


train_dataset=trainDF.sample(frac=train_size,random_state=200)
test_dataset=trainDF.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)


print("FULL Dataset: {}".format(trainDF.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)
testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (26921, 11)
TRAIN Dataset: (21537, 11)
TEST Dataset: (5384, 11)


In [83]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [86]:
# Creating the customized model, by adding a drop out and a dense layer on top of distil bert to get the final output for the model.

class RobertaClass(torch.nn.Module):
    def __init__(self):
        super(RobertaClass, self).__init__()
        self.l1 = RobertaModel.from_pretrained("roberta-base")
        self.classifier = torch.nn.Linear(768, 4)

    def forward(self, input_ids, attention_mask):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask, return_dict = False)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        output = self.classifier(pooler)
        return output

In [87]:
model = RobertaClass()
model.to(device)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


RobertaClass(
  (l1): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((

In [88]:
# Creating the loss function and optimizer
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [89]:
def train(epoch):
    model.train()
    for _,data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        if _%5000==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [90]:
for epoch in range(EPOCHS):
    train(epoch)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


TypeError: new(): invalid data type 'str'