# Text classification on movie reviews Dataset from stanford with Bert

In [None]:
fig = dict({
    "data": [{"type": "bar",
              "x": [1, 2, 3],
              "y": [1, 3, 2]}],
    "layout": {"title": {"text": "A Figure Specified By Python Dictionary"}}
})

# To display the figure defined by this dict, use the low-level plotly.io.show function
import plotly.io as pio

pio.show(fig)

In [2]:
!mkdir  /content/drive/MyDrive/transformers_for_nlp
%cd ./drive/MyDrive/transformers_for_nlp
#from google.colab import files 
#files.upload()

/content/drive/MyDrive/transformers_for_nlp


In [3]:
!pip install transformers


Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d8/b2/57495b5309f09fa501866e225c84532d1fd89536ea62406b2181933fb418/transformers-4.5.1-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.1MB 14.5MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 40.2MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/08/cd/342e584ee544d044fb573ae697404ce22ede086c9e87ce5960772084cad0/sacremoses-0.0.44.tar.gz (862kB)
[K     |████████████████████████████████| 870kB 52.4MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.44-cp37-none-any.whl size=886084 sha256=945a24c319

In [None]:
# Downloading the dataset.
!wget -q -nc http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


In [5]:
# Unziping the dataset.
!tar -zxf /content/drive/MyDrive/transformers_for_nlp/aclImdb_v1.tar.gz

In [7]:
!pip install -q git+https://github.com/gmihaila/ml_things.git


[K     |████████████████████████████████| 71kB 7.1MB/s 
[K     |████████████████████████████████| 10.3MB 21.9MB/s 
[?25h  Building wheel for ml-things (setup.py) ... [?25l[?25hdone
  Building wheel for ftfy (setup.py) ... [?25l[?25hdone
[31mERROR: albumentations 0.1.12 has requirement imgaug<0.2.7,>=0.2.5, but you'll have imgaug 0.2.9 which is incompatible.[0m


In [8]:
import os
import io
from tqdm.notebook import tqdm
from torch.utils.data import Dataset , DataLoader 
from ml_things import plot_dict , fix_text , plot_confusion_matrix
from sklearn.metrics import classification_report , accuracy_score
from transformers import (AutoConfig ,AutoModelForSequenceClassification ,AutoTokenizer,AdamW,
                          get_linear_schedule_with_warmup , set_seed)


In [12]:
import torch
from torch import nn 
import torch.nn.functional as F

In [25]:
set_seed(26)
epochs = 4
batches =  32 # as max seq length is small but for max _Seq_len =512 batch_Size should be very small (due togpu issues)
max_seq_len = 50
device =torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device == 'cpu' :
    print("Using cpu ,This will b slow")
model_name = 'bert-base-cased'

labels = { "pos" : 1 , "neg" : 0}
n_labels = len(labels)



In [30]:
class DataSet(Dataset):
    r"""
    This class can be used as is as long
    as the `dataloader` outputs a batch in dictionary format that can be passed 
    straight into the model - `model(**batch)`.
 
  Arguments:
 
    path (:obj:`str`):
        Path to the data partition.
     
    tokenizer (:obj:`transformers.tokenization_?`):
        Transformer type tokenizer used to process raw text into numbers.
 
    labels (:obj:`dict`):
        Dictionary to encode any labels names into numbers. Keys map to 
        labels names and Values map to number associated to those labels.
 
    max_sequence_len (:obj:`int`, `optional`)
        Value to indicate the maximum desired sequence to truncate or pad text
        sequences. If no value is passed it will used maximum sequence size
        supported by the tokenizer and model.
     """
    def _init__(self , path , tokenizer , labels , max_seq_len):
        if not os.path.isdir(path):
             print("Path to directory does not exist.")

        texts = []
        labels = []

        #the labels are defined by folders with data we loop through each label.

        for label , label_id in tqdm(labels.items()):
             text_path  = os.path.join(path , label) #path/pos | path/neg

             all_file_names = os.listdir(text_path)
             for file_name in tqdm(all_file_names):
                 file_path = os.path.join(text_path , file_name)

                 text = io.open(file_path , mode = 'r' , encoding = 'utf-8').read()
                 text = fix_text(text) #solves any encoding issues
                 texts.append(text)
                 labels.append(label_id)
        self.n = len(labels)
        #using tokenizer on text ,returns a dictionary with tokenized integer ids under input_ids key
        self.tokenized_inputs = tokenizer(texts ,add_special_tokens = True , truncation = True, padding = True ,
                                          return_tensors = 'pt' , max_length = max_seq_len)
        self.seq_len = self.tokenized_inputs['input_ids'].shape[-1] #length of sequence

        self.tokenized_inputs.update({"labels" : torch.tensor(labels)})
        print(self.tokenized_inputs)
        def __len__(self) :
            return self.n

        def __getitem__(self , index):
            r"""
            Arguments:
 
            index (:obj:`int`):
                Index position to pick an example to return.
        
            Returns:
            :obj:`Dict[str, object]`: Dictionary of inputs that feed into the model.
            It holddes the statement `model(**Returned Dictionary)`.

            """
            return {key : self.tokenized_inputs[key][items] for key in self.tokenized_inputs.keys()}







In [None]:
d = Dataset("/content/drive/MyDrive/transformers_for_nlp/aclImdb/train" , AutoTokenizer , labels , max_seq_len)

In [22]:
def train(dataloader , optimizer_ , scheduler_ , device_):
    r"""

    Performs a single training pass of the " model " which is a global variable loaded on device_

    Arguments:

    dataloader (:obj:`torch.utils.data.dataloader.DataLoader`):
        Parsed data into batches of tensors.

    optimizer_ (:obj:`transformers.optimization.AdamW`):
        Optimizer used for training.

    scheduler_ (:obj:`torch.optim.lr_scheduler.LambdaLR`):
        PyTorch scheduler.

    device_ (:obj:`torch.device`):
        Device used to load tensors before feeding to model.

    Returns:

    :obj:`List[List[int], List[int], float]`: List of [True Labels, Predicted
        Labels, Train Average Loss].
    """

    global model

    preds , true_labels = [] , [] 
    total_loss = 0
    model.train()

    for batch in tqdm(dataloader , total = len(dataloader)) :
        #Adding to original labels - to be used later for evaluation
        true_labels += batch["labels"].numpy().flatten().tolist()
        batch = {k:v.type(torch.long).to(device_) for k,v in batch.items()}

        model.zero_grad()
        #This will return the loss (rather than the model output) because we have provided the `labels`.
        # model always returns a tuple 
        out = model(**batch)

        loss , logits = out[:2]

        total_loss += loss.item()
        loss.backward()
        #gradient clipping `
        torch.nn.utils.clip_grad_norm_(model.parameters() , 1.0)

        optimizer_.step()
        schedular_.step

        logits = logits.detach().cpu().numpy()
        # Convertion of these logits to list of predicted labels values.
        preds += logits.argmax(axis = -1).flatten().tolist()

    average_epoch_loss = total_loss / len(dataloader)

    return true_labels,preds,average_epoch_loss



 

In [23]:
def valiadation(dataloader , device_):
    """
    function that performs validation on validation data
     Arguments:
 
    dataloader (:obj:`torch.utils.data.dataloader.DataLoader`):
          Parsed data into batches of tensors.
 
    device_ (:obj:`torch.device`):
          Device used to load tensors before feeding to model.
 
     Returns:
     
    :True Labels, Predictions,  Average Loss

    """

    global model 
    preds , true_labels= [] , []
    total_loss = 0 

    model.eval()

    for batch in tqdm(dataloader , total = len(dataloader)) :
        #Adding to original labels - to be used later for evaluation
        true_labels += batch["labels"].numpy().flatten().tolist()
        batch = {k:v.type(torch.long).to(device_) for k,v in batch.items()}

        with torch.no_grad():
            out = model(**batch)
            loss , logits = out[:2]
            logits = logits.detach().cpu().numpy()
            total_loss += loss

            preds += logits.argmax(axis = -1).flatten().tolist()

    average_validation_loss = total_loss / len(dataloader)

    return true_labels , preds , average_validation_loss


In [26]:
model_config = AutoConfig.from_pretrained(pretrained_model_name_or_path=model_name , num_labels = n_labels)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)
model = AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path=model_name , config=model_config)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435797.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=29.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435779157.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [31]:
training_dataset = DataSet(path = "/content/drive/MyDrive/transformers_for_nlp/aclImdb/train" ,
                           tokenizer = tokenizer , labels = labels ,max_seq_len=max_seq_len
                           )


traning_dataloader = DataLoader(training_dataset , batch_size = batch_size , shuffle =True)


TypeError: ignored