In [1]:
import numpy as np
import pandas as pd
import string
from statistics import mean
 
import copy
import os
import re
import torch
import torch.nn as nn
import torch.optim
import nltk
 
from nltk.corpus import stopwords
import spacy
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch.nn.functional as F
 
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
 
!pip install transformers
!pip install pytorch-transformers
 
import transformers
from transformers import XLNetModel, XLNetTokenizer, XLNetForSequenceClassification
from transformers import BertModel, BertTokenizer, BertForSequenceClassification
from transformers import RobertaModel, RobertaTokenizer, RobertaForSequenceClassification
from transformers import AdamW
 
from tqdm import tqdm, trange
 
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [3]:
def clean_data(x):
    puncts = []
    # puncts = puncts.replace("-", "")
    # stop_words = set(stopwords.words('english'))
    sentences = []
    # x = [''.join(c.lower() for c in s.replace("-", " ") if c not in puncts) for s in x]
    x = [''.join(c.lower() for c in s if c not in puncts) for s in x]
    # x = [''.join(c for c in str(s)[1:-4].replace("-", " ") if c not in puncts) for s in x]
    for sent in x:
        # text_no_nums = re.sub(r'\d+', '', sent)
        # text_no_doublespace = re.sub('\s+', ' ', text_no_nums).strip()
        sentences.append(sent)
    return sentences  

def preprocess_targets(y):
    # dic = {'1.0': 0, '2.0': 1, '3.0': 2, '4.0': 3, '5.0': 4}
    # dic_SST2 = {0: "POS", 1: "NEG"}
    dic = {'NEG': 1, 'POS': 0}
    for t in range(len(y)):
        y[t] = dic[y[t]]
    y = torch.tensor(y, dtype=torch.long)   
    return y  

In [4]:
# path = 'drive/My Drive/third_sem/DL_NLP/Assignment3/digital_music_5.'
# path = 'drive/My Drive/third_sem/DL_NLP/Assignment2/SST-2.'
path = "/content/drive/MyDrive/third_sem/DL_NLP/Project/dataset/SST-2/"

train_path = path + 'train.tsv'
val_path = path + 'dev.tsv'
test_path = path + 'test.tsv'

df = pd.read_csv(train_path, sep='\t')
print(df)
x_train = df["sentence"].values
y_train = df["label"].values
print(x_train[-1])
print(y_train[-1])

df = pd.read_csv(val_path, sep='\t')
x_val = df["sentence"].values
y_val = df["label"].values

# x_train = clean_data(x_train)
print(x_train[:5])

# x_val = clean_data(x_val)
print("hereee",x_val[:2])
# # x_train = [x + " [SEP] [CLS]" for x in x_train]

# df = pd.read_csv(test_path, sep='\t')
# x_test = df["sentence"].values
# print(df)


                                                sentence  label
0           hide new secretions from the parental units       0
1                   contains no wit , only labored gags       0
2      that loves its characters and communicates som...      1
3      remains utterly satisfied to remain the same t...      0
4      on the worst revenge-of-the-nerds clichés the ...      0
...                                                  ...    ...
67344                               a delightful comedy       1
67345                   anguish , anger and frustration       0
67346  at achieving the modest , crowd-pleasing goals...      1
67347                                  a patient viewer       1
67348  this new jangle of noise , mayhem and stupidit...      0

[67349 rows x 2 columns]
this new jangle of noise , mayhem and stupidity must be a serious contender for the title . 
0
['hide new secretions from the parental units '
 'contains no wit , only labored gags '
 'that loves its charac

In [5]:
tokenizer_bert = RobertaTokenizer.from_pretrained('roberta-base')
tokenizer_xl = XLNetTokenizer.from_pretrained('xlnet-base-cased', do_lower_case=True)

In [6]:
def get_ids_and_mask(tokenizer, x, model):
    input_ids = []
    input_mask = []
    token_type_ids = []
    if model == "xl":
        x = [k + " [SEP] [CLS]" for k in x]
    for k in range(len(x)):
        vec = tokenizer.encode_plus(x[k], add_special_tokens=True, max_length=40, pad_to_max_length=True, return_token_type_ids=True)
        input_ids.append(vec['input_ids'])
        input_mask.append(vec['attention_mask'])
        token_type_ids.append(vec['token_type_ids'])

    print(tokenizer.decode(input_ids[0]))
    return input_ids, input_mask, token_type_ids

def dataloader(sentence, mask, token_type_ids, label, d_type, batch_size=32):
    loader = None
    sentence = torch.tensor(sentence)
    mask = torch.tensor(mask)
    token_type_ids = torch.tensor(token_type_ids)
    label = torch.tensor(label)

    print("data shape", sentence.shape)
    print("label shape", label.shape)
    print("mask shape", mask.shape)
    print("tok type id shape", token_type_ids.shape)

    # batch_size = 32

    if d_type == "train":
        train = TensorDataset(sentence, mask, token_type_ids, label)
        sampler = RandomSampler(train)
        loader = DataLoader(train, batch_size=batch_size, sampler=sampler)

    else:
        test = TensorDataset(sentence, mask, token_type_ids, label)
        samp = SequentialSampler(test)
        loader = DataLoader(test, batch_size=batch_size, sampler=samp)

    return loader


In [7]:
###############   TRAINING OF MODEL  ##################

def train_model(model, optim, epochs, loader):
    loss_arr = []
    for ep in range(epochs):
        model.train()
        epoch_loss = 0

        for i, data in enumerate(loader):
            batch_data = data[0].to(device).long()
            batch_mask = data[1].to(device).long()
            # batch_token_type_ids = data[2].to(device).long()
            batch_labels = data[3].to(device).long()

            model.zero_grad()
            pred = model(batch_data,
                                token_type_ids=None,
                                attention_mask=batch_mask,
                                labels=batch_labels)
            
            # As we call the model with labels, it returns the loss in a tuple
            loss = pred[0]
            epoch_loss += loss.item()
            loss.backward()  # Backprpagation

            # Clip Gradient norm to mitigate exploding of gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optim.step()

        epoch_loss /= len(loader)
        print("train loss after %d epochs is %f " %(ep+1, epoch_loss))
        loss_arr.append(epoch_loss)

    return loss_arr

######   Validate Model   #######

def validate_model(model, loader):
    model.eval()
    test_acc = 0
    for batch in loader:
        batch = tuple(t for t in batch)
        batch_data, batch_mask, batch_token_type ,batch_labels = batch
        
        batch_data = batch_data.to(device)
        batch_mask = batch_mask.to(device)
        
        with torch.no_grad():
            preds = model(batch_data.to(device),
                                    token_type_ids=None,
                                    attention_mask=batch_mask)

        logits = preds[0]
        
        logits = logits.detach().cpu()
        targets = batch_labels.to('cpu')
        
        acc = compute_accuracy(logits, targets)
        test_acc += acc
        # steps += 1

    print("final test set accuracy is ", (test_acc / 872))


In [8]:
def compute_accuracy(preds, targets):
    return (torch.argmax(preds, dim=1) == targets).float().sum().item()

RUN ABOVE CELLS FOR ALL APPROACHES

RUN BELOW CELLS FOR APPROACH 1

In [9]:
#####   APPROACH 1  ######

ids, masks, tok_type_ids = get_ids_and_mask(tokenizer=tokenizer_bert, x=x_train, model="bert")
train_dataloader = dataloader(ids, masks, tok_type_ids, y_train, "train")
print()

ids, masks, tok_type_ids = get_ids_and_mask(tokenizer=tokenizer_bert, x=x_val, model="bert")
val_dataloader = dataloader(ids, masks, tok_type_ids, y_val, "val")


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


<s>hide new secretions from the parental units </s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
data shape torch.Size([67349, 40])
label shape torch.Size([67349])
mask shape torch.Size([67349, 40])
tok type id shape torch.Size([67349, 40])

<s>it's a charming and often affecting journey. </s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
data shape torch.Size([872, 40])
label shape torch.Size([872])
mask shape torch.Size([872, 40])
tok type id shape torch.Size([872, 40])


In [10]:
bert_model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=2,
    output_attentions = False,
    output_hidden_states = False,
)

bert_model = bert_model.to(device)


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

In [11]:
########  define optimizer, write accuracy fn  ###########

optimizer = AdamW(bert_model.parameters(), lr=4e-5, eps=1e-8)
num_epochs = 4

In [12]:
arr = train_bert(bert_model, optimizer, 4, train_dataloader)

In [None]:
validate_model(bert_model, val_dataloader)

final test set accuracy is  0.9197247706422018


In [None]:
####   APPROACH 1 --> Second model   #####

ids, masks, tok_type_ids = get_ids_and_mask(tokenizer=tokenizer_xl, x=x_train, model="xl")
train_dataloader_xl = dataloader(ids, masks, tok_type_ids, y_train, "train")
print()

ids, masks, tok_type_ids = get_ids_and_mask(tokenizer=tokenizer_xl, x=x_val, model="xl")
val_dataloader_xl = dataloader(ids, masks, tok_type_ids, y_val, "val")


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad> hide new secretions from the parental units [sep] [cls]<sep><cls>
data shape torch.Size([67349, 40])
label shape torch.Size([67349])
mask shape torch.Size([67349, 40])
tok type id shape torch.Size([67349, 40])

<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad> it's a charming and often affecting journey. [sep] [cls]<sep><cls>
data shape torch.Size([872, 40])
label shape torch.Size([872])
mask shape torch.Size([872, 40])
tok type id shape torch.Size([872, 40])


In [None]:
xlnet = XLNetForSequenceClassification.from_pretrained(
    "xlnet-base-cased",
    num_labels=2,
    output_attentions = False,
    output_hidden_states = False,
)

xlnet = xlnet.to(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=760.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=467042463.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'sequence_summary.summary.bias', 'logits_proj.weight', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

In [None]:
optimizer = AdamW(xlnet.parameters(), lr=4e-5, eps=1e-8)

In [None]:
arr = train_model(xlnet, optimizer, 4, train_dataloader_xl)

train loss after 1 epochs is 0.238430 
train loss after 2 epochs is 0.155963 
train loss after 3 epochs is 0.122752 
train loss after 4 epochs is 0.112080 


In [None]:
validate_model(xlnet, val_dataloader_xl)

final test set accuracy is  0.9048165137614679


In [None]:
######   APPROACH 1 --> combine models    ######

def combine(bert, xlnet, loader_bert, loader_xl):

    bert.eval()
    xlnet.eval()
    test_acc = 0

    for batch, batch_xl in zip(loader_bert, loader_xl):
        batch = tuple(t for t in batch)
        batch_data, batch_mask, batch_token_type ,batch_labels = batch
        
        batch_data = batch_data.to(device)
        batch_mask = batch_mask.to(device)

        batch_xl = tuple(t for t in batch_xl)
        batch_data_xl, batch_mask_xl, batch_token_type_xl ,batch_labels_xl = batch_xl
        
        batch_data_xl = batch_data_xl.to(device)
        batch_mask_xl = batch_mask_xl.to(device)
        
        with torch.no_grad():
            preds1 = bert(batch_data, token_type_ids=None, attention_mask=batch_mask, labels=batch_labels.to(device))
            preds2 = xlnet(batch_data_xl, token_type_ids=None, attention_mask=batch_mask_xl, labels=batch_labels_xl.to(device))
            
        logits1 = preds1[1].detach().to('cpu')
        logits2 = preds2[1].detach().to('cpu')
        targets = batch_labels.to('cpu')

        logits = torch.zeros((logits1.shape))
        for k in range(len(logits1)):
            if torch.argmax(logits1[k]) >= torch.argmax(logits2[k]):
                logits[k] = logits1[k]
            else:
                logits[k] = logits2[k]

        acc = compute_accuracy(logits, targets)
        test_acc += acc

    print("final test set accuracy is ", (test_acc / 872))


In [None]:
####  APPROACH 1 result   ######

combine(bert_model, xlnet, val_dataloader, val_dataloader_xl)

final test set accuracy is  0.9288990825688074


APPROACH 2 starts here

In [13]:
######   APPROACH 2    ########

class ensemble(nn.Module):
    def __init__(self):
        super(ensemble, self).__init__()
        self.bert = transformers.RobertaModel.from_pretrained('roberta-base')
        self.xlnet = transformers.XLNetModel.from_pretrained('xlnet-base-cased')
        self.fc = nn.Linear(768*2, 2)
        self.dropout = nn.Dropout(p=0.3)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, data_bert, mask_bert, data_xl, mask_xl):
        
        out = self.bert(data_bert, attention_mask=mask_bert, token_type_ids=None)
        emb1 = out[0]
        emb1 = torch.mean(emb1, dim=1)
        # print("emb1", emb1.shape)
        
        emb2 = self.xlnet(data_xl, attention_mask=mask_xl, token_type_ids=None)
        emb2 = out[0]
        emb2 = torch.mean(emb2, dim=1)
        # print("emb2", emb2.shape)

        emb = torch.cat((emb1, emb2), dim=1)

        return self.softmax(self.fc(emb))


In [15]:
######   Approach 2   train ensemble of roberta and xlnet   #####

def train_ensemble(model, loader, optim):
    loss_arr = []
    for ep in range(4):
        epoch_loss = 0
        for i, data in enumerate(loader):
            model.train()
            batch_data = data[0].to(device).long()
            batch_mask = data[1].to(device).long()
            batch_labels = data[6].to(device).long()

            batch_data_xl = data[3].to(device).long()
            batch_mask_xl = data[4].to(device).long()

            pred = model(batch_data, batch_mask, batch_data_xl, batch_mask_xl)
            optim.zero_grad()
            
            loss = F.cross_entropy(pred, batch_labels)
            epoch_loss += loss.item()
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optim.step()

        epoch_loss /= len(loader)
        print("train loss after %d epochs is %f " %(ep+1, epoch_loss))
        loss_arr.append(epoch_loss)
    return loss_arr


def validate_ensemble(model, loader):
    model.eval()
    pred_list = []
    true_labels = []
    test_acc = 0
    for batch in loader:
        batch = tuple(t for t in batch)
        batch_data, batch_mask, z1 , batch_data_xl, batch_mask_xl, z2, batch_labels = batch
        
        batch_data = batch_data.to(device)
        batch_mask = batch_mask.to(device)
        batch_data_xl = batch_data_xl.to(device)
        batch_mask_xl = batch_mask_xl.to(device)
        
        with torch.no_grad():
            logits = model(batch_data, batch_mask, batch_data_xl, batch_mask_xl)

        # logits = preds[1]
        
        logits = logits.detach().cpu()
        targets = batch_labels.to('cpu')
        
        pred_list.append(logits.numpy())
        true_labels.append(targets.numpy())
        
        acc = compute_accuracy(logits, targets)
        test_acc += acc
        # steps += 1

    print("final test set accuracy is ", (test_acc / 1043))
    return pred_list, true_labels


In [16]:
##### combined dataloader for approach 2   ###

def dataloader_comb(sent1, sent2, mask1, mask2, id1, id2, label, d_type, batch_size=32):
    loader = None
    sent1 = torch.tensor(sent1)
    sent2 = torch.tensor(sent2)
    mask1 = torch.tensor(mask1)
    mask2 = torch.tensor(mask2)
    id1 = torch.tensor(id1)
    id2 = torch.tensor(id2)
    label = torch.tensor(label)

    # print("data shape", sentence.shape)
    # print("label shape", label.shape)
    # print("mask shape", mask.shape)
    # print("tok type id shape", token_type_ids.shape)

    batch_size = 32

    if d_type == "train":
        train = TensorDataset(sent1, mask1, id1, sent2, mask2, id2, label)
        sampler = RandomSampler(train)
        loader = DataLoader(train, batch_size=batch_size, sampler=sampler)

    else:
        test = TensorDataset(sent1, mask1, id1, sent2, mask2, id2, label)
        samp = SequentialSampler(test)
        loader = DataLoader(test, batch_size=batch_size, sampler=samp)

    return loader

In [17]:
####   prepare dataloader for combined model   #####

ids, masks, tok_type_ids = get_ids_and_mask(tokenizer=tokenizer_bert, x=x_train, model="bert")
ids1, masks1, tok_type_ids1 = get_ids_and_mask(tokenizer=tokenizer_xl, x=x_train, model="xl")
train_dataloader = dataloader_comb(ids, ids1, masks, masks1, tok_type_ids, tok_type_ids1, y_train, "train")
print()

ids, masks, tok_type_ids = get_ids_and_mask(tokenizer=tokenizer_bert, x=x_val, model="bert")
ids1, masks1, tok_type_ids1 = get_ids_and_mask(tokenizer=tokenizer_xl, x=x_val, model="xl")
val_dataloader = dataloader_comb(ids, ids1, masks, masks1, tok_type_ids, tok_type_ids1, y_val, "val")


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


<s>hide new secretions from the parental units </s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad> hide new secretions from the parental units [sep] [cls]<sep><cls>

<s>it's a charming and often affecting journey. </s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad> it's a charming and often affecting journey. [sep] [cls]<sep><cls>


In [18]:
ensemble_model = ensemble().to(device)
optimizer = AdamW(ensemble_model.parameters(), lr=4e-5, eps=1e-10)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=760.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=467042463.0, style=ProgressStyle(descri…




In [20]:
####  Approach 2 ... train the ensemble   ####

#####   call train ensemble   ######

_ = train_ensemble(ensemble_model, train_dataloader, optimizer)
validate_ensemble(ensemble_model, val_dataloader)


RUN BELOW CELLS FOR APPROACH 3 --> CURRICULUM LEARNING

In [26]:
####  APPROACH 3 starts here   ####

### prepare data to pass to model  ##

ids, masks, tok_type_ids = get_ids_and_mask(tokenizer=tokenizer_bert, x=x_train, model="bert")
train_dataloader = dataloader(ids, masks, tok_type_ids, y_train, "train", batch_size=32)



<s>hide new secretions from the parental units </s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
data shape torch.Size([67349, 40])
label shape torch.Size([67349])
mask shape torch.Size([67349, 40])
tok type id shape torch.Size([67349, 40])


In [27]:
bert_model = RobertaForSequenceClassification.from_pretrained(
    'roberta-base',
    num_labels=2,
)
bert_model = bert_model.to(device)
optimizer = AdamW(bert_model.parameters(), lr=4e-5, eps=1e-10)


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

In [28]:
###  train model which will score examples --> SCORING FUNCTION ##

_ = train_model(bert_model, optimizer, epochs=4, loader=train_dataloader)

train loss after 1 epochs is 0.261697 
train loss after 2 epochs is 0.181568 
train loss after 3 epochs is 0.154203 
train loss after 4 epochs is 0.133595 


In [29]:
###   prepare train data to get loss om each  ###

ids, masks, tok_type_ids = get_ids_and_mask(tokenizer=tokenizer_bert, x=x_train, model="bert")
train_dataloader = dataloader(ids, masks, tok_type_ids, y_train, "val", batch_size=1)



<s>hide new secretions from the parental units </s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
data shape torch.Size([67349, 40])
label shape torch.Size([67349])
mask shape torch.Size([67349, 40])
tok type id shape torch.Size([67349, 40])


In [30]:
#####   Approach  3   get loss on all training data   #####

bert_model.eval()
loss_list = []
for i, data in enumerate(train_dataloader):
    batch_data = data[0].to(device).long()
    batch_mask = data[1].to(device).long()
    batch_labels = data[3].to(device).long()

    with torch.no_grad():
        pred = bert_model(batch_data,
                                token_type_ids=None,
                                attention_mask=batch_mask,
                                labels=batch_labels)
        
        
    loss = pred[0]
    loss_list.append(loss.item())
    

In [31]:
########     SORT EXAMPLES BASED ON TOUGHNESS     ######

print(max(loss_list))
print(min(loss_list))

train_loss = []

for k in range(len(x_train)):
    train_loss.append((x_train[k], y_train[k], loss_list[k]))
    # label_loss.append((y_train[k], loss_list[k]))

print(train_loss[-1])
train_loss = sorted(train_loss, key = lambda x: x[2])

print(train_loss[-1])

x_train_sorted = []
y_train_sorted = []

for k in train_loss:
    x_train_sorted.append(k[0])
    y_train_sorted.append(k[1])

print(x_train_sorted[-1])
print(len(x_train_sorted), len(y_train_sorted))


4.526257038116455
0.010813566856086254
('this new jangle of noise , mayhem and stupidity must be a serious contender for the title . ', 0, 0.011442981660366058)
('a damn fine and a truly distinctive and a deeply pertinent film ', 0, 4.526257038116455)
a damn fine and a truly distinctive and a deeply pertinent film 
67349 67349


In [32]:
####   NOW Preapare data in curriculum order from sorted examples  ####

ids, masks, tok_type_ids = get_ids_and_mask(tokenizer=tokenizer_bert, x=x_train_sorted, model="bert")
train_dataloader = dataloader(ids, masks, tok_type_ids, y_train_sorted, "train", batch_size=32)

ids, masks, tok_type_ids = get_ids_and_mask(tokenizer=tokenizer_bert, x=x_val, model="bert")
val_dataloader = dataloader(ids, masks, tok_type_ids, y_val, "val", batch_size=32)



<s>bigelow demonstrates a breadth of vision and an attention to detail that propels her into the upper echelons of the directing world. </s><pad><pad><pad><pad><pad><pad><pad><pad><pad>
data shape torch.Size([67349, 40])
label shape torch.Size([67349])
mask shape torch.Size([67349, 40])
tok type id shape torch.Size([67349, 40])
<s>it's a charming and often affecting journey. </s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
data shape torch.Size([872, 40])
label shape torch.Size([872])
mask shape torch.Size([872, 40])
tok type id shape torch.Size([872, 40])


In [33]:
###  model for training in curriculum oreder  ###

new_model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=2,
    output_attentions = False,
    output_hidden_states = False,
)

new_model = new_model.to(device)
optim = AdamW(new_model.parameters(), lr=4e-5, eps=1e-8)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

In [34]:
###  train model  IN Curriculum ORDER FOR 2 epochs then in RANDOM ORDER  ####

arr = train_model(new_model, optim, 2, train_dataloader)
ids, masks, tok_type_ids = get_ids_and_mask(tokenizer=tokenizer_bert, x=x_train, model="bert")
train_dataloader = dataloader(ids, masks, tok_type_ids, y_train, "train", batch_size=32)
arr = train_model(new_model, optim, 2, train_dataloader)


train loss after 1 epochs is 0.269245 
train loss after 2 epochs is 0.180205 




<s>hide new secretions from the parental units </s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
data shape torch.Size([67349, 40])
label shape torch.Size([67349])
mask shape torch.Size([67349, 40])
tok type id shape torch.Size([67349, 40])
train loss after 1 epochs is 0.156425 
train loss after 2 epochs is 0.142541 


In [35]:
validate_model(new_model, val_dataloader)

final test set accuracy is  0.9105504587155964
