<a href="https://colab.research.google.com/github/kutayoncuyilmaz/Thesis/blob/main/SC_XLNet_Twitter_Final_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Sentiment Classification with XLNet Model on Twitter Dataset**



# Necessary Imports

In [None]:
# Mount Google Drive at /content/drive so later cells can read and write files there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Print the NVIDIA driver and GPU status of this runtime.
!nvidia-smi

Fri Feb 25 13:37:36 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P0    28W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
!pip install sentencepiece transformers

In [None]:
# Install (or upgrade) the `watermark` extension used in the next cell.
!pip install -q -U watermark

In [None]:
# Record the Python version and the versions of the key libraries for reproducibility.
%reload_ext watermark
%watermark -v -p numpy,pandas,torch,transformers

Python implementation: CPython
Python version       : 3.7.12
IPython version      : 5.5.0

numpy       : 1.21.5
pandas      : 1.3.5
torch       : 1.10.0+cu111
transformers: 4.16.2



In [None]:
import transformers
from transformers import XLNetTokenizer, XLNetForSequenceClassification, AdamW, get_linear_schedule_with_warmup
import torch
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from collections import defaultdict
from textwrap import wrap
from pylab import rcParams
from torch import nn, optim
from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset,RandomSampler,SequentialSampler
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

In [None]:
# Seed the global NumPy and PyTorch random number generators.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

# Data Upload

In [None]:
# Load the dataset from Drive; the first CSV column is used as the DataFrame index.
df = pd.read_csv( "/content/drive/MyDrive/Datasets/df_2022-02-03.csv",index_col=[0])
df.head(5)

Unnamed: 0,sentiment,text
541200,0,AHHH I HOPE YOUR OK!!!
750,0,cool i have no tweet apps for my razr
766711,0,i know just family drama. its lame.hey next t...
285055,0,School email won't open and I have geography s...
705995,0,upper airways problem


In [None]:
# Count the rows per sentiment label to check the class balance.
df['sentiment'].value_counts()

1    160506
0    159494
Name: sentiment, dtype: int64

In [None]:
# Display names used as `target_names` in the classification report.
# NOTE(review): presumably label 0 = negative and 1 = positive — confirm against the dataset.
class_names = ['negative', 'positive']

# Loading Tokenizer

In [None]:
# Load the tokenizer that belongs to the 'xlnet-base-cased' checkpoint.
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')

Downloading:   0%|          | 0.00/779k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/760 [00:00<?, ?B/s]

# Dataset Class

In [None]:
class Senti140(Dataset):
    """Torch ``Dataset`` that tokenizes one text per item for sequence classification.

    Every item is tokenized with ``tokenizer.encode_plus`` (truncated to
    ``max_len``) and then right-padded with zeros to exactly ``max_len``
    tokens, so all items have the same length and can be batched.

    Args:
        texts: Indexable collection of raw texts; each entry is passed through ``str()``.
        targets: Indexable collection of integer labels aligned with ``texts``.
        tokenizer: Object exposing ``encode_plus`` (an ``XLNetTokenizer`` in this notebook).
        max_len: Sequence length every returned item is truncated/padded to.
    """

    def __init__(self, texts, targets, tokenizer, max_len):
        self.texts = texts
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def _pad_right(self, tensor):
        """Truncate/zero-pad a ``(1, seq_len)`` tensor on the right to ``(1, max_len)`` int64."""
        tensor = tensor[..., :self.max_len]
        missing = self.max_len - tensor.shape[-1]
        return F.pad(tensor, (0, missing), value=0).to(torch.int64)

    def __getitem__(self, item):
        """Tokenize and pad the text at position ``item``.

        Returns:
            dict with keys
            ``'final_text'`` (the text as ``str``),
            ``'input_ids'`` (int64 tensor of shape ``(1, max_len)``),
            ``'attention_mask'`` (int64 tensor of shape ``(max_len,)``) and
            ``'targets'`` (0-dim ``torch.long`` tensor holding the label).
        """
        text = str(self.texts[item])
        target = self.targets[item]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding=False,  # padding is done below so that it is applied on the right
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )

        # Pad to the length given to the constructor (not to a notebook-level
        # global), using zeros on the right for both ids and mask.
        input_ids = self._pad_right(encoding['input_ids'])
        attention_mask = self._pad_right(encoding['attention_mask'])

        return {
            'final_text': text,
            # Kept as (1, max_len); consumers reshape it to (batch, max_len).
            'input_ids': input_ids,
            'attention_mask': attention_mask.flatten(),
            'targets': torch.tensor(target, dtype=torch.long)
        }

In [None]:
# 50% of the rows go to training; the held-out half is then split evenly
# into validation and test (25% / 25% of the full dataset).
df_train, df_holdout = train_test_split(df, test_size=0.5, random_state=101)
df_val, df_test = train_test_split(df_holdout, test_size=0.5, random_state=101)

In [None]:
# Sanity-check the sizes of the three splits.
df_train.shape, df_val.shape, df_test.shape

((160000, 2), (80000, 2), (80000, 2))

# Data Loader

In [None]:
def loader_data(df, tokenizer, max_len, batch_size, shuffle=False, num_workers=4):
  """Wrap a DataFrame in a ``Senti140`` dataset and return a ``DataLoader`` over it.

  Args:
    df: DataFrame with a ``text`` column and a ``sentiment`` column.
    tokenizer: Tokenizer handed to ``Senti140``.
    max_len: Sequence length handed to ``Senti140``.
    batch_size: Number of examples per batch.
    shuffle: Reshuffle the examples every epoch. Defaults to ``False`` (the
      previous behaviour); pass ``True`` for the training loader so batches
      differ between epochs.
    num_workers: Number of worker processes used for loading (default 4).

  Returns:
    A ``torch.utils.data.DataLoader`` yielding the dicts produced by ``Senti140``.
  """
  ds = Senti140(
    texts=df.text.to_numpy(),
    targets=df.sentiment.to_numpy(),
    tokenizer=tokenizer,
    max_len=max_len
  )

  return DataLoader(
    ds,
    batch_size=batch_size,
    shuffle=shuffle,
    num_workers=num_workers
  )

In [None]:
# Token length every sequence is truncated/padded to.
MAX_LEN = 100

In [None]:
BATCH_SIZE = 32

# Build one loader per split, all with the same sequence length and batch size.
loader_data_train = loader_data(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
loader_data_val = loader_data(df_val, tokenizer, MAX_LEN, BATCH_SIZE)
loader_data_test = loader_data(df_test, tokenizer, MAX_LEN, BATCH_SIZE)

# Loading Pre-trained XLNet Model

In [None]:
# Load the pretrained 'xlnet-base-cased' weights with a 2-label classification
# head, then move the model to the selected device.
model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased', num_labels = 2)
model = model.to(device)

# Hyperparameters for the model

In [None]:
EPOCHS = 5 #arbitrary

param_optimizer = list(model.named_parameters())
# Parameters whose name contains one of these substrings get no weight decay.
# XLNet names its normalisation modules `layer_norm`, so the BERT-style
# `LayerNorm.*` patterns alone never match them and the norm weights would be
# decayed; the `layer_norm.*` patterns cover the XLNet names.
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight', 'layer_norm.bias', 'layer_norm.weight']
optimizer_grouped_parameters = [
                                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
                                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay':0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=3e-5)

# One scheduler step is taken per training batch, so the schedule spans
# (batches per epoch) * EPOCHS steps.
total_steps = len(loader_data_train) * EPOCHS

# Linear decay of the learning rate to zero over `total_steps`, without warmup.
scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)

# Training step function




In [None]:
from sklearn import metrics
from tqdm import tqdm
def epoch_trainer(model, loader_data, optimizer, device, scheduler, n_examples):
    """Train ``model`` for one pass over ``loader_data``.

    Args:
        model: Model called with ``input_ids``, ``token_type_ids``,
            ``attention_mask`` and ``labels``; its output must expose the loss
            at index 0 and the logits at index 1.
        loader_data: Iterable of batches (dicts with ``"input_ids"``,
            ``"attention_mask"`` and ``"targets"`` tensors).
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler stepped once per batch.
        n_examples: Unused; kept so existing call sites keep working.

    Returns:
        ``(accuracy, mean_loss)`` where ``accuracy`` is the fraction of
        correctly classified examples seen in this epoch and ``mean_loss`` is
        the mean of the per-batch losses.
    """
    model = model.train()
    losses = []
    correct = 0
    seen = 0
    progress_loader = tqdm(loader_data)
    for d in progress_loader:
        # Collapse whatever leading dimensions the dataset produced into
        # (batch, seq_len); unlike a hard-coded (BATCH_SIZE, 100) this also
        # works for a smaller final batch or a different sequence length.
        input_ids = d["input_ids"]
        input_ids = input_ids.reshape(-1, input_ids.shape[-1]).to(device)
        attention_mask = d["attention_mask"].to(device)
        targets = d["targets"].to(device)

        # Start every step from clean gradients, even if some were left over
        # from before this call.
        optimizer.zero_grad()

        outputs = model(input_ids=input_ids, token_type_ids=None, attention_mask=attention_mask, labels = targets)
        loss = outputs[0]
        logits = outputs[1]

        # Count correct predictions per example so that batches of different
        # sizes are weighted correctly in the epoch accuracy.
        _, prediction = torch.max(logits, dim=1)
        correct += (prediction == targets).sum().item()
        seen += targets.size(0)

        current_loss = loss.item()
        losses.append(current_loss)
        progress_loader.set_description(f"Current loss: {current_loss:6.4f}")
        loss.backward()

        # Clip the global gradient norm to 1.0 before the update.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

    return correct / seen, np.mean(losses)

# Evaluation function for the model

In [None]:
def model_evaluator(model, loader_data, device, n_examples):
    """Evaluate ``model`` on ``loader_data`` without computing gradients.

    Args:
        model: Model called with ``input_ids``, ``token_type_ids``,
            ``attention_mask`` and ``labels``; its output must expose the loss
            at index 0 and the logits at index 1.
        loader_data: Iterable of batches (dicts with ``"input_ids"``,
            ``"attention_mask"`` and ``"targets"`` tensors).
        device: Device the batch tensors are moved to.
        n_examples: Unused; kept so existing call sites keep working.

    Returns:
        ``(accuracy, mean_loss)`` where ``accuracy`` is the fraction of
        correctly classified examples and ``mean_loss`` is the mean of the
        per-batch losses.
    """
    model = model.eval()
    losses = []
    correct = 0
    seen = 0

    with torch.no_grad():
        for d in loader_data:
            # Collapse to (batch, seq_len) without assuming a batch size of 32
            # or a sequence length of 100, so a smaller final batch works too.
            input_ids = d["input_ids"]
            input_ids = input_ids.reshape(-1, input_ids.shape[-1]).to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)

            outputs = model(input_ids=input_ids, token_type_ids=None, attention_mask=attention_mask, labels = targets)
            loss = outputs[0]
            logits = outputs[1]

            # Per-example counting keeps the accuracy exact when batch sizes differ.
            _, prediction = torch.max(logits, dim=1)
            correct += (prediction == targets).sum().item()
            seen += targets.size(0)
            losses.append(loss.item())

    return correct / seen, np.mean(losses)

# Fine Tuning of the pre-trained model

In [None]:
%%time
history = defaultdict(list)
best_accuracy = 0

for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    train_acc, train_loss = epoch_trainer(
        model,
        loader_data_train,     
        optimizer, 
        device, 
        scheduler, 
        len(df_train)
    )

    print(f'Train loss {train_loss} Train accuracy {train_acc}')

    val_acc, val_loss = model_evaluator(
        model,
        loader_data_val, 
        device, 
        len(df_val)
    )

    print(f'Val loss {val_loss} Val accuracy {val_acc}')
    print()

    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)

    if val_acc > best_accuracy:
        torch.save(model.state_dict(), '/content/drive/MyDrive/Datasets/xlnet_twitter_final.bin')
        if val_acc-best_accuracy<=0.003 :
          break
        best_accuracy = val_acc
    else:
      break

Epoch 1/5
----------


Current loss: 0.4968: 100%|██████████| 5000/5000 [33:24<00:00,  2.49it/s]

Train loss 0.3555229910865426 Train accuracy 0.846725





Val loss 0.36220026737451555 Val accuracy 0.8471375

Epoch 2/5
----------


Current loss: 0.3852: 100%|██████████| 5000/5000 [33:24<00:00,  2.49it/s]

Train loss 0.2828528610493988 Train accuracy 0.88440625





Val loss 0.3665497963637114 Val accuracy 0.846675

CPU times: user 1h 21min 34s, sys: 2min 32s, total: 1h 24min 6s
Wall time: 1h 19min 20s


# Performance Evaluation

In [None]:
# Restore the weights from the checkpoint file written by the training loop.
model.load_state_dict(torch.load('/content/drive/MyDrive/Datasets/xlnet_twitter_final.bin'))

<All keys matched successfully>

In [None]:
# Make sure the restored model lives on the selected device.
model = model.to(device)

In [None]:
# Accuracy and mean loss of the restored checkpoint on the held-out test split.
test_acc, test_loss = model_evaluator(
  model,
  loader_data_test,
  device,
  len(df_test)
)

print('Test Accuracy :', test_acc)
print('Test Loss :', test_loss)

Test Accuracy : 0.8464375
Test Loss : 0.3620756816804409


In [None]:
def predictions_Twitter(model, loader_data, device=None):
    """Collect texts, predictions, class probabilities and labels for a whole loader.

    Args:
        model: Model called with ``input_ids``, ``token_type_ids``,
            ``attention_mask`` and ``labels``; its output must expose the
            logits at index 1.
        loader_data: Iterable of batches (dicts with ``"final_text"``,
            ``"input_ids"``, ``"attention_mask"`` and ``"targets"``).
        device: Device the batch tensors are moved to. Defaults to the device
            of the model's first parameter.

    Returns:
        ``(final_texts, predictions, prediction_probs, real_values)``:
        a list of texts, and three CPU tensors with one row per example
        (predicted class ids, softmax probabilities, true labels).
    """
    model = model.eval()
    if device is None:
        # Feed the inputs to wherever the model's weights actually are.
        device = next(model.parameters()).device

    final_texts = []
    pred_batches = []
    prob_batches = []
    target_batches = []

    with torch.no_grad():
        for d in loader_data:
            # Collapse to (batch, seq_len) without assuming a fixed batch size
            # or sequence length, so a smaller final batch works too.
            input_ids = d["input_ids"]
            input_ids = input_ids.reshape(-1, input_ids.shape[-1]).to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)

            outputs = model(input_ids=input_ids, token_type_ids=None, attention_mask=attention_mask, labels = targets)
            logits = outputs[1]

            _, preds = torch.max(logits, dim=1)
            probs = F.softmax(logits, dim=1)

            final_texts.extend(d["final_text"])
            # Keep whole batches (moved to the CPU) instead of one tiny tensor
            # per example; they are concatenated once at the end.
            pred_batches.append(preds.cpu())
            prob_batches.append(probs.cpu())
            target_batches.append(targets.cpu())

    predictions = torch.cat(pred_batches)
    prediction_probs = torch.cat(prob_batches)
    real_values = torch.cat(target_batches)
    return final_texts, predictions, prediction_probs, real_values

In [None]:
# Run the model over the test loader and keep texts, predictions,
# class probabilities and true labels for the reports below.
y_final_texts, y_pred, y_pred_probs, y_test = predictions_Twitter(
  model,
  loader_data_test
)

In [None]:
# Per-class precision, recall and F1 on the test split, to four decimals.
print(classification_report(y_test, y_pred, target_names=class_names, digits=4))

              precision    recall  f1-score   support

    negative     0.8473    0.8424    0.8448     39702
    positive     0.8456    0.8504    0.8480     40298

    accuracy                         0.8464     80000
   macro avg     0.8464    0.8464    0.8464     80000
weighted avg     0.8464    0.8464    0.8464     80000



In [None]:
# Confusion matrix of the test predictions (scikit-learn puts true labels in
# rows and predicted labels in columns).
a= metrics.confusion_matrix(y_test, y_pred )

In [None]:
# Show the raw confusion-matrix counts.
print(a)

[[33445  6257]
 [ 6028 34270]]


# Inference Time

In [None]:
def predict_batch(model, batch, trial=10):
  """Run one gradient-free forward pass on ``batch`` and return the predictions.

  Args:
    model: Model called with ``input_ids``, ``token_type_ids``,
      ``attention_mask`` and ``labels``; its output must expose the logits at
      index 1.
    batch: Dict with ``"input_ids"``, ``"attention_mask"`` and ``"targets"``
      tensors, as produced by the data loader.
    trial: Unused; kept so existing call sites keep working.

  Returns:
    CPU tensor with the predicted class id of every example in the batch.
  """
  with torch.no_grad():
    # Collapse to (batch, seq_len) without assuming a sequence length of 100.
    input_ids = batch["input_ids"]
    input_ids = input_ids.reshape(-1, input_ids.shape[-1]).to(device)
    attention_mask = batch["attention_mask"].to(device)
    targets = batch["targets"].to(device)

    outputs = model(input_ids=input_ids, token_type_ids=None, attention_mask=attention_mask, labels = targets)

    _, preds = torch.max(outputs[1], dim=1)

  # CUDA kernels are launched asynchronously. Copying the result to the CPU
  # waits for them to finish, so timing this function measures the completed
  # forward pass and not only the time needed to enqueue it.
  return preds.cpu()

In [None]:
# Separate test-set loader wrapped in an iterator, so the timing cell can pull
# one new batch per call.
batch_size_trial = 32
curr_loader = iter(loader_data(df_test, tokenizer, MAX_LEN, batch_size_trial))



In [None]:
%%timeit -n 10
# NOTE: next(curr_loader) is part of the timed statement, so the measurement
# includes fetching the batch from the loader as well as the forward pass.
predict_batch(model, next(curr_loader))


10 loops, best of 5: 133 ms per loop
