In [1]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/ed/d5/f4157a376b8a79489a76ce6cfe147f4f3be1e029b7144fa7b8432e8acb26/transformers-4.4.2-py3-none-any.whl (2.0MB)
[K     |▏                               | 10kB 23.5MB/s eta 0:00:01[K     |▎                               | 20kB 16.8MB/s eta 0:00:01[K     |▌                               | 30kB 14.0MB/s eta 0:00:01[K     |▋                               | 40kB 13.2MB/s eta 0:00:01[K     |▉                               | 51kB 8.5MB/s eta 0:00:01[K     |█                               | 61kB 9.1MB/s eta 0:00:01[K     |█▏                              | 71kB 9.3MB/s eta 0:00:01[K     |█▎                              | 81kB 10.3MB/s eta 0:00:01[K     |█▌                              | 92kB 9.7MB/s eta 0:00:01[K     |█▋                              | 102kB 8.3MB/s eta 0:00:01[K     |█▉                              | 112kB 8.3MB/s eta 0:00:01[K     |██                              | 122kB 

In [2]:
data_path = 'drive/MyDrive/datasets/roberta_senti.tsv'

In [3]:
import numpy as np
import pandas as pd
import torch
import transformers
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from torch import cuda
from transformers import RobertaTokenizer, RobertaModel
from sklearn.model_selection import train_test_split
import seaborn as sns
import json
import logging
logging.basicConfig(level=logging.ERROR)

In [4]:
device = 'cuda' if cuda.is_available() else 'cpu'
device

'cuda'

In [5]:
dataset = pd.read_csv(data_path, delimiter='\t')
dataset.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [6]:
dataset.shape

(156060, 4)

In [7]:
dataset.Sentiment.unique()

array([1, 2, 3, 4, 0])

In [8]:
dataset.describe()

Unnamed: 0,PhraseId,SentenceId,Sentiment
count,156060.0,156060.0,156060.0
mean,78030.5,4079.732744,2.063578
std,45050.785842,2502.764394,0.893832
min,1.0,1.0,0.0
25%,39015.75,1861.75,2.0
50%,78030.5,4017.0,2.0
75%,117045.25,6244.0,3.0
max,156060.0,8544.0,4.0


In [9]:
new_df = dataset[['Phrase','Sentiment']]
new_df.head()

Unnamed: 0,Phrase,Sentiment
0,A series of escapades demonstrating the adage ...,1
1,A series of escapades demonstrating the adage ...,2
2,A series,2
3,A,2
4,series,2


In [10]:
def prepare(x):
    return ' '.join(str(x).split())
new_df.Phrase.apply(prepare)

0         A series of escapades demonstrating the adage ...
1         A series of escapades demonstrating the adage ...
2                                                  A series
3                                                         A
4                                                    series
                                ...                        
156055                                            Hearst 's
156056                            forced avuncular chortles
156057                                   avuncular chortles
156058                                            avuncular
156059                                             chortles
Name: Phrase, Length: 156060, dtype: object

In [11]:
new_df.head()

Unnamed: 0,Phrase,Sentiment
0,A series of escapades demonstrating the adage ...,1
1,A series of escapades demonstrating the adage ...,2
2,A series,2
3,A,2
4,series,2


In [12]:
MAX_LEN = 256
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
# EPOCHS = 1
LEARNING_RATE = 1e-05
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', truncation=True, do_lower_case=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1355863.0, style=ProgressStyle(descript…




In [31]:
import torch.nn.functional as F
class SentiData(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.Phrase
        self.targets = self.data.Sentiment
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer(
            text,
            padding='max_length',
            max_length=512,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': F.one_hot(torch.tensor(self.targets[index]), num_classes=5)
        }

In [32]:
train_size = 0.8
train_data = new_df.sample(frac=train_size, random_state=200)
test_data = new_df.drop(train_data.index).reset_index(drop=True)
train_data = train_data.reset_index(drop=True)

In [15]:
train_encoddings = tokenizer(train_data.Phrase.to_list(), padding=True, truncation=True, max_length=512, return_token_type_ids=True)
test_encoddings = tokenizer(test_data.Phrase.to_list(), padding=True, truncation=True, max_length=512, return_token_type_ids=True)

In [16]:
class SentimentData(torch.utils.data.Dataset):
    def __init__(self, encodding, targets):
        self.encodding = encodding
        self.targets = targets
    
    def __len__(self):
        return len(self.targets)

    def __getitem__(self, index):
        item = {k: torch.tensor(self.targets[index]) for k,v in self.encodding.items()}
        item['targets'] = torch.tensor(self.targets[index])
        return item

In [33]:
# train_set = SentimentData(train_encoddings, train_data.Sentiment.to_list())
# test_set = SentimentData(test_encoddings, test_data.Sentiment.to_list())
train_set = SentiData(train_data, tokenizer, MAX_LEN)
test_set = SentiData(test_data, tokenizer, MAX_LEN)

In [34]:
train_params = {'batch_size': TRAIN_BATCH_SIZE, 'shuffle': True, 'num_workers': 0}
valid_params = train_params = {'batch_size': VALID_BATCH_SIZE, 'shuffle': True, 'num_workers': 0}

In [35]:
train_loader = DataLoader(train_set, **train_params)
test_loader = DataLoader(test_set, **valid_params)

In [36]:
class RobertaSenti(torch.nn.Module):
    def __init__(self):
        super(RobertaSenti, self).__init__()
        self.l1 = RobertaModel.from_pretrained('roberta-base')
        self.pre_classfiler = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classfiler = torch.nn.Linear(768, 5)

    def forward(self, input_ids, mask, token_type_ids):
        o1 = self.l1(input_ids=input_ids, attention_mask=mask, token_type_ids=token_type_ids)
        hidden_state = o1[0]
        pooler = self.pre_classfiler(hidden_state)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classfiler(pooler)
        return output

In [37]:
model = RobertaSenti()
model.to(device)

RobertaSenti(
  (l1): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), e

In [41]:
def calcuc_acc(preds, targets):
    return (preds==targets).sum().item()

In [42]:
loss_func = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [43]:
def train(epoch):
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    for _, data in tqdm(enumerate(train_loader, 0)):
        ids = data['ids'].to(device)
        mask = data['mask'].to(device)
        token_type_ids = data['token_type_ids'].to(device)
        targets = data['targets'].to(device)

        outputs = model(ids, mask, token_type_ids)
        loss = loss_func(outputs, targets)
        tr_loss += loss.item()
        big_val, big_idx = torch.max(outputs.data, dim=1)
        n_correct += calcuc_acc(big_idx, targets)
        nb_tr_steps += 1
        nb_tr_examples += targets.size(0)

        if _%5000==0:
            loss_step = tr_loss/nb_tr_steps
            accu_step = (n_correct*100)/nb_tr_examples 
            print(f"Training Loss per 5000 steps: {loss_step}")
            print(f"Training Accuracy per 5000 steps: {accu_step}")

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

In [44]:
for i in range(1):
    train(i)

1it [00:00,  3.38it/s]

Training Loss per 5000 steps: 6.241222858428955
Training Accuracy per 5000 steps: 0.0


5001it [20:25,  4.08it/s]

Training Loss per 5000 steps: 0.4464766506386492
Training Accuracy per 5000 steps: 415.6018796240752


10001it [40:49,  4.07it/s]

Training Loss per 5000 steps: 0.37389095457647514
Training Accuracy per 5000 steps: 422.7002299770023


15001it [1:01:16,  4.08it/s]

Training Loss per 5000 steps: 0.3459600212539079
Training Accuracy per 5000 steps: 426.28158122791814


20001it [1:21:42,  4.07it/s]

Training Loss per 5000 steps: 0.3299183544467355
Training Accuracy per 5000 steps: 428.32733363331835


25001it [1:42:09,  4.08it/s]

Training Loss per 5000 steps: 0.32028371942940737
Training Accuracy per 5000 steps: 429.5498180072797


30001it [2:02:35,  4.06it/s]

Training Loss per 5000 steps: 0.3129374257041677
Training Accuracy per 5000 steps: 430.59564681177295


31212it [2:07:32,  4.08it/s]

The Total Accuracy for Epoch 0: 430.8342945021146
Training Loss Epoch: 0.311398440821763
Training Accuracy Epoch: 430.8342945021146





In [40]:
for _, data in enumerate(train_loader, 0):
    print(data['targets'])
    break

tensor([[0, 1, 0, 0, 0],
        [0, 0, 1, 0, 0],
        [0, 0, 1, 0, 0],
        [0, 0, 1, 0, 0]])
