# Fine Tuning ALBERT for Sentiment Analysis





In [None]:
!pip install transformers==3.0.2



In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import seaborn as sns
import transformers
import json
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaModel, RobertaTokenizer, AlbertModel, AlbertTokenizer
import logging
logging.basicConfig(level=logging.ERROR)

In [None]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/training.1600000.processed.noemoticon_simple_proc.csv',encoding='latin-1', header=None)

  interactivity=interactivity, compiler=compiler, result=result)


In [None]:
df = df.dropna(axis='rows')

X = df[7][1:]
y = df[8][1:].apply(lambda t: int(t))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=124124, 
                                                        shuffle=True, stratify=y)
df_train = pd.DataFrame({'txt': X_train, 'target':y_train})
df_test = pd.DataFrame({'txt': X_test, 'target':y_test})
df_train['target'] = df_train['target'].apply(lambda t: 0 if t == 0 else 1)
df_test['target'] = df_test['target'].apply(lambda t: 0 if t == 0 else 1)
df_train = df_train.iloc[:100000]
df_test = df_test.iloc[:20000]


In [None]:
# Defining some key variables that will be used later on in the training
MAX_LEN = 256
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 32
# EPOCHS = 1
LEARNING_RATE = 3e-05
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2', truncation=True, do_lower_case=True)

In [None]:
class SentimentData(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.txt
        self.targets = self.data.target
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [None]:
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(df_train.shape))
print("TEST Dataset: {}".format(df_test.shape))

training_set = SentimentData(df_train, tokenizer, MAX_LEN)
testing_set = SentimentData(df_test, tokenizer, MAX_LEN)

FULL Dataset: (1562118, 9)
TRAIN Dataset: (100000, 2)
TEST Dataset: (20000, 2)


In [None]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [None]:
class AlbertClass(torch.nn.Module):
    def __init__(self):
        super(AlbertClass, self).__init__()
        self.l1 = AlbertModel.from_pretrained("albert-base-v2")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 5)

    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

In [None]:
model = AlbertClass()
model.to(device)

AlbertClass(
  (l1): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(30000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
            

In [None]:
# Creating the loss function and optimizer
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [None]:
def calcuate_accuracy(preds, targets):
    n_correct = (preds==targets).sum().item()
    return n_correct

In [None]:
# Defining the training function on the 80% of the dataset for tuning the distilbert model

def train(epoch):
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    for _,data in tqdm(enumerate(training_loader, 0)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)

        optimizer.zero_grad()
       
        outputs = model(ids, mask, token_type_ids)
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        big_val, big_idx = torch.max(outputs.data, dim=1)
        n_correct += calcuate_accuracy(big_idx, targets)

        nb_tr_steps += 1
        nb_tr_examples+=targets.size(0)

        if _ % 500 == 0:
            output_model_file = '/content/drive/MyDrive/Colab Notebooks/pytorch_albert_sentiment.bin'
            output_vocab_file = '/content/drive/MyDrive/Colab Notebooks/albert_voc'

            model_to_save = model
            torch.save(model_to_save, output_model_file)
            tokenizer.save_vocabulary(output_vocab_file)
        
        if _%5000==0:
            loss_step = tr_loss/nb_tr_steps
            accu_step = (n_correct*100)/nb_tr_examples 
            print(f"Training Loss per 5000 steps: {loss_step}")
            print(f"Training Accuracy per 5000 steps: {accu_step}")

        loss.backward()
        # # When using GPU
        optimizer.step()

    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return 

In [None]:
EPOCHS = 3
for epoch in range(EPOCHS):
    train(epoch)
    output_model_file = '/content/drive/MyDrive/Colab Notebooks/pytorch_albert_sentiment.bin'
    output_vocab_file = '/content/drive/MyDrive/Colab Notebooks/albert_voc'

    model_to_save = model
    torch.save(model_to_save, output_model_file)
    tokenizer.save_vocabulary(output_vocab_file)

output_model_file = '/content/drive/MyDrive/Colab Notebooks/pytorch_albert_sentiment.bin'
output_vocab_file = '/content/drive/MyDrive/Colab Notebooks/albert_voc'
model_to_save = model
torch.save(model, output_model_file)
tokenizer.save_vocabulary(output_vocab_file)

0it [00:00, ?it/s]ERROR:transformers.tokenization_albert:Vocabulary path (/content/drive/MyDrive/Colab Notebooks/albert_voc) should be a directory


Training Loss per 5000 steps: 1.5060062408447266
Training Accuracy per 5000 steps: 40.625


500it [06:53,  1.21it/s]ERROR:transformers.tokenization_albert:Vocabulary path (/content/drive/MyDrive/Colab Notebooks/albert_voc) should be a directory
1000it [13:46,  1.21it/s]ERROR:transformers.tokenization_albert:Vocabulary path (/content/drive/MyDrive/Colab Notebooks/albert_voc) should be a directory
1500it [20:39,  1.21it/s]ERROR:transformers.tokenization_albert:Vocabulary path (/content/drive/MyDrive/Colab Notebooks/albert_voc) should be a directory
2000it [27:33,  1.21it/s]ERROR:transformers.tokenization_albert:Vocabulary path (/content/drive/MyDrive/Colab Notebooks/albert_voc) should be a directory
2500it [34:27,  1.21it/s]ERROR:transformers.tokenization_albert:Vocabulary path (/content/drive/MyDrive/Colab Notebooks/albert_voc) should be a directory
3000it [41:20,  1.21it/s]ERROR:transformers.tokenization_albert:Vocabulary path (/content/drive/MyDrive/Colab Notebooks/albert_voc) should be a directory
3125it [43:04,  1.21it/s]
ERROR:transformers.tokenization_albert:Vocabulary p

The Total Accuracy for Epoch 0: 68.017
Training Loss Epoch: 0.5918939120197296
Training Accuracy Epoch: 68.017


ERROR:transformers.tokenization_albert:Vocabulary path (/content/drive/MyDrive/Colab Notebooks/albert_voc) should be a directory


Training Loss per 5000 steps: 0.4439583718776703
Training Accuracy per 5000 steps: 81.25


500it [06:54,  1.21it/s]ERROR:transformers.tokenization_albert:Vocabulary path (/content/drive/MyDrive/Colab Notebooks/albert_voc) should be a directory
1000it [13:48,  1.20it/s]ERROR:transformers.tokenization_albert:Vocabulary path (/content/drive/MyDrive/Colab Notebooks/albert_voc) should be a directory
1500it [20:42,  1.21it/s]ERROR:transformers.tokenization_albert:Vocabulary path (/content/drive/MyDrive/Colab Notebooks/albert_voc) should be a directory
2000it [27:36,  1.21it/s]ERROR:transformers.tokenization_albert:Vocabulary path (/content/drive/MyDrive/Colab Notebooks/albert_voc) should be a directory
2500it [34:30,  1.21it/s]ERROR:transformers.tokenization_albert:Vocabulary path (/content/drive/MyDrive/Colab Notebooks/albert_voc) should be a directory
3000it [41:23,  1.21it/s]ERROR:transformers.tokenization_albert:Vocabulary path (/content/drive/MyDrive/Colab Notebooks/albert_voc) should be a directory
3125it [43:07,  1.21it/s]
ERROR:transformers.tokenization_albert:Vocabulary p

The Total Accuracy for Epoch 1: 73.566
Training Loss Epoch: 0.5319639970779418
Training Accuracy Epoch: 73.566


ERROR:transformers.tokenization_albert:Vocabulary path (/content/drive/MyDrive/Colab Notebooks/albert_voc) should be a directory


Training Loss per 5000 steps: 0.45890307426452637
Training Accuracy per 5000 steps: 78.125


500it [06:54,  1.20it/s]ERROR:transformers.tokenization_albert:Vocabulary path (/content/drive/MyDrive/Colab Notebooks/albert_voc) should be a directory
1000it [13:48,  1.21it/s]ERROR:transformers.tokenization_albert:Vocabulary path (/content/drive/MyDrive/Colab Notebooks/albert_voc) should be a directory
1500it [20:41,  1.21it/s]ERROR:transformers.tokenization_albert:Vocabulary path (/content/drive/MyDrive/Colab Notebooks/albert_voc) should be a directory
2000it [27:35,  1.21it/s]ERROR:transformers.tokenization_albert:Vocabulary path (/content/drive/MyDrive/Colab Notebooks/albert_voc) should be a directory
2500it [34:28,  1.21it/s]ERROR:transformers.tokenization_albert:Vocabulary path (/content/drive/MyDrive/Colab Notebooks/albert_voc) should be a directory
3000it [41:22,  1.21it/s]ERROR:transformers.tokenization_albert:Vocabulary path (/content/drive/MyDrive/Colab Notebooks/albert_voc) should be a directory
3125it [43:05,  1.21it/s]
ERROR:transformers.tokenization_albert:Vocabulary p

The Total Accuracy for Epoch 2: 75.868
Training Loss Epoch: 0.4982535147666931
Training Accuracy Epoch: 75.868


ERROR:transformers.tokenization_albert:Vocabulary path (/content/drive/MyDrive/Colab Notebooks/albert_voc) should be a directory


In [None]:
def valid(model, testing_loader):
    model.eval()
    n_correct = 0; n_wrong = 0; total = 0; tr_loss=0; nb_tr_steps=0; nb_tr_examples=0
    with torch.no_grad():
        for _, data in tqdm(enumerate(testing_loader, 0)):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype = torch.long)
            outputs = model(ids, mask, token_type_ids).squeeze()
            loss = loss_function(outputs, targets)
            tr_loss += loss.item()
            big_val, big_idx = torch.max(outputs.data, dim=1)
            n_correct += calcuate_accuracy(big_idx, targets)

            nb_tr_steps += 1
            nb_tr_examples+=targets.size(0)
            
            if _%5000==0:
                loss_step = tr_loss/nb_tr_steps
                accu_step = (n_correct*100)/nb_tr_examples
                print(f"Validation Loss per 100 steps: {loss_step}")
                print(f"Validation Accuracy per 100 steps: {accu_step}")
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Validation Loss Epoch: {epoch_loss}")
    print(f"Validation Accuracy Epoch: {epoch_accu}")
    
    return epoch_accu


In [None]:
acc = valid(model, testing_loader)
print("Accuracy on test data = %0.2f%%" % acc)

1it [00:00,  3.12it/s]

Validation Loss per 100 steps: 0.4985314607620239
Validation Accuracy per 100 steps: 71.875


625it [03:09,  3.30it/s]

Validation Loss Epoch: 0.5286416037082672
Validation Accuracy Epoch: 73.31
Accuracy on test data = 73.31%





In [None]:
def load_accuracy_for_labelled(type):
  _df = pd.read_json(f'/content/drive/MyDrive/Colab Notebooks/simple_proc_{type}_labelled.json')
  _df = _df.dropna(axis='rows')
  _df_1 = _df[_df['sentiment'] != 0]
  _df_1['target'] = _df_1['sentiment'].apply(lambda t: 0 if t == -1 else 1)
  _df_1['txt'] = _df_1['text_simple_proc']
  _df_1 = _df_1.reset_index(drop=True)

  _testing_set = SentimentData(_df_1, tokenizer, MAX_LEN)
  _testing_loader = DataLoader(_testing_set, **test_params)

  acc = valid(model, _testing_loader)
  print(type)
  print("Accuracy on test data = %0.2f%%" % acc)

In [None]:
load_accuracy_for_labelled('twitter')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
1it [00:00,  3.12it/s]

Validation Loss per 100 steps: 0.4218311309814453
Validation Accuracy per 100 steps: 75.0


23it [00:06,  3.34it/s]

Validation Loss Epoch: 0.4758047694745271
Validation Accuracy Epoch: 74.65373961218836
twitter
Accuracy on test data = 74.65%





In [None]:
load_accuracy_for_labelled('reddit')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
1it [00:00,  3.29it/s]

Validation Loss per 100 steps: 0.5744314193725586
Validation Accuracy per 100 steps: 56.25


30it [00:09,  3.28it/s]

Validation Loss Epoch: 0.5093521575133005
Validation Accuracy Epoch: 74.8171368861024
reddit
Accuracy on test data = 74.82%





In [None]:
load_accuracy_for_labelled('instagram')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
1it [00:00,  3.29it/s]

Validation Loss per 100 steps: 0.6156479120254517
Validation Accuracy per 100 steps: 71.875


26it [00:07,  3.37it/s]

Validation Loss Epoch: 0.5339866349330316
Validation Accuracy Epoch: 74.07407407407408
instagram
Accuracy on test data = 74.07%





In [None]:
load_accuracy_for_labelled('tiktok')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
1it [00:00,  3.30it/s]

Validation Loss per 100 steps: 0.31982603669166565
Validation Accuracy per 100 steps: 84.375


21it [00:06,  3.42it/s]

Validation Loss Epoch: 0.4415687109742846
Validation Accuracy Epoch: 80.21806853582555
tiktok
Accuracy on test data = 80.22%



