**Transfer Learning with XLNET - Sentiment Analysis**

In [1]:
from transformers import XLNetTokenizer,XLNetForSequenceClassification
from sklearn.model_selection import train_test_split
from transformers import AdamW, get_linear_schedule_with_warmup
import matplotlib.pyplot as plt
# from keras.preprocessing.sequence import pad_sequences
import torch
from torch.utils.data import TensorDataset,DataLoader,RandomSampler,SequentialSampler
import pandas as pd
import numpy as np
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torch import nn, optim
from transformers import XLNetModel

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

**Import data**

In [3]:
data = pd.read_csv('.\processed_data\processed_data.csv',index_col='Unnamed: 0')
labels = pd.read_csv('.\processed_data\processed_labels.csv',index_col='Unnamed: 0')

data = data.rename(columns={"0": 'reviews'})
labels = labels.rename(columns={"0": 'sentiment'})

labels =np.array([1 if x =='positive' else 0 for x in labels['sentiment'].values])

**Define custom dataset and data loader**

In [4]:
pre_trained_model = 'xlnet-base-cased'
tokenizer = XLNetTokenizer.from_pretrained(pre_trained_model,do_lower_case=True)
class XLNetTorchDataset(Dataset):
    def __init__(self,data,labels,tokenizer,max_len):
        self.data = data
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
    def __len__(self):
        return len(self.data)
    def __getitem__(self,idx):
        review = str(self.data[idx])
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(review,
                                              add_special_tokens=True,
                                              truncation=True,
                                              max_length = self.max_len,
                                              return_token_type_ids=False,
                                              pad_to_max_length=True,
                                              return_attention_mask=True,
                                              return_tensors='pt'
                                              
                                             )
        
        return{
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [5]:
def prepare_data_loader(data,labels, tokenizer, max_len=250, batch_size=30):
    dataset = XLNetTorchDataset(
        data=np.squeeze(np.array(data)),
        labels=np.array(labels),
        tokenizer=tokenizer,
        max_len=max_len
    )
    
    return DataLoader(dataset,batch_size=batch_size)

X_train, X_test, y_train, y_test = train_test_split(data,labels,test_size=0.2)
X_test, X_val, y_test, y_val = train_test_split(X_test,y_test,test_size=0.4)
train_loader = prepare_data_loader(X_train,y_train, tokenizer, max_len=250, batch_size=30)
test_loader = prepare_data_loader(X_test, y_test, tokenizer,max_len=250, batch_size=30)
val_loader = prepare_data_loader(X_val, y_val, tokenizer,max_len=250, batch_size=30)

**Define XLNET model**

In [6]:
class XLNetSentimentClassifier(nn.Module):
    def __init__(self, n_classes):
        super(XLNetSentimentClassifier, self).__init__()
        self.XLNet =  XLNetForSequenceClassification.from_pretrained(pre_trained_model)
        # self.drop = nn.Dropout(p=0.1)
        #self.out = nn.Linear(self.XLNet.config.hidden_size, n_classes)
    def forward(self, input_ids, attention_mask):
        outputs= self.XLNet(
          input_ids=input_ids,
          attention_mask=attention_mask
        )
        # output = self.drop(outputs[0])
        return outputs[0]

In [7]:
model = XLNetSentimentClassifier(n_classes=2)
model = model.to(device)
n_epochs = 2
total_steps = len(train_loader) * n_epochs
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
scheduler = get_linear_schedule_with_warmup(optimizer,num_warmup_steps=0,num_training_steps=total_steps)
criterion= nn.CrossEntropyLoss().to(device)

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'sequence_summary.summary.bias', 'logits_proj.weight', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

**Define train, evaluation, and test functions**

In [9]:
def train_model(model,optimizer,criterion,scheduler,train_loader):
    model = model.train()
    epoch_loss = 0
    epoch_acc = 0
    for i, data in enumerate(train_loader, 0):
        input_ids = data['input_ids']
        attention_mask = data['attention_mask']
        targets = data['label']
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()

        outputs = model(input_ids=input_ids,attention_mask=attention_mask).to(device)
        #print(outputs.shape)
        loss = criterion(outputs,targets).to(device)
        _, pred = torch.max(outputs, dim=1)
        acc = torch.sum(pred == targets)

        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()
    return epoch_loss/len(train_loader), epoch_acc/len(X_train)

In [10]:
def evaluation_model(model,optimizer,criterion,val_loader):
    model.eval()
    with torch.no_grad():
        epoch_loss = 0
        epoch_acc = 0
        for i, data in enumerate(val_loader, 0):
            input_ids = data['input_ids']
            attention_mask = data['attention_mask']
            targets = data['label']
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            targets = targets.to(device)

            outputs = model(input_ids=input_ids,attention_mask=attention_mask).to(device)
            _, pred = torch.max(outputs, dim=1)
            acc = torch.sum(pred == targets)

            loss = criterion(outputs,targets).to(device)
            
            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return  epoch_loss/len(val_loader), epoch_acc/len(X_val)

In [15]:
def test_model(model,optimizer,criterion,test_loader):
    the_model.eval()
    with torch.no_grad():
        epoch_loss = 0
        epoch_acc = 0
        for i, data in enumerate(test_loader, 0):
            input_ids = data['input_ids']
            attention_mask = data['attention_mask']
            targets = data['label']
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            targets = targets.to(device)

            outputs = model(input_ids=input_ids,attention_mask=attention_mask).to(device)
            _, pred = torch.max(outputs, dim=1)
            acc = torch.sum(pred == targets)

            loss = criterion(outputs,targets).to(device)
            
            epoch_loss += loss.item()
            epoch_acc += acc.item()
    return  epoch_loss/len(test_loader), epoch_acc/len(X_test)

**Train model**

In [11]:
n_epochs = 4

for epoch in range(n_epochs):
    
    train_loss, train_acc = train_model(model,optimizer,criterion,scheduler,train_loader)
    val_loss, val_acc = evaluation_model(model,optimizer,criterion,val_loader)
    
    print(f'Epoch: {epoch+1} Train Loss: {train_loss:.3f} Train Acc: {train_acc*100:.2f}% Val Loss: {val_loss:.3f} Val Acc: {val_acc*100:.2f}%')

Epoch: 1 Train Loss: 0.227 Train Acc: 91.30% Val Loss: 0.175 Val Acc: 93.73%
Epoch: 2 Train Loss: 0.119 Train Acc: 96.12% Val Loss: 0.211 Val Acc: 93.75%
Epoch: 3 Train Loss: 0.084 Train Acc: 97.40% Val Loss: 0.211 Val Acc: 93.75%
Epoch: 4 Train Loss: 0.084 Train Acc: 97.47% Val Loss: 0.211 Val Acc: 93.75%


**Save trained parameters**

In [13]:
PATH = "trained_XLNet.pt"
torch.save(model.state_dict(),PATH)

**Upload trained parameters and to test the model performance on test data**

In [14]:
the_model = XLNetSentimentClassifier(n_classes=2)
the_model.load_state_dict(torch.load("trained_XLNet.pt"))

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'sequence_summary.summary.bias', 'logits_proj.weight', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

<All keys matched successfully>

In [16]:
test_loss, test_acc = test_model(model,optimizer,criterion,test_loader)
print(f'Test Loss: {test_loss:.3f} Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.223 Test Acc: 93.32%
