# XLNet for Sentiment Analysis

# Imports and installs

Installs for Simple Transformers (pinned `transformers` / `simpletransformers` versions and NVIDIA Apex)

In [None]:
import os
# Higher versions have problems with CUDA
!pip install transformers==2.11.0
!pip install simpletransformers==0.41.1
!git clone https://github.com/NVIDIA/apex
os.chdir('apex')
!pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./
os.chdir('..')

Imports for regular transformers

In [None]:
# NOTE(review): nothing in this notebook imports `pytorch_transformers` (the
# imports below come from `transformers`) and this install is unpinned —
# confirm it is still needed.
!pip install pytorch-transformers

from transformers import XLNetTokenizer,XLNetForSequenceClassification, XLNetConfig
from transformers import AdamW
import matplotlib.pyplot as plt
from keras.preprocessing.sequence import pad_sequences
import torch
from torch.utils.data import TensorDataset,DataLoader,RandomSampler,SequentialSampler


# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
import pandas as pd
import string
import re

from nltk.tokenize import TweetTokenizer
from sklearn.model_selection import train_test_split

# Read files
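Run exactly one of the three dataset cells below (Apple, US Airline, or T4SA). Each one defines `data` with a `text` column and an integer `sentiment` column (0 = negative, 1 = neutral, 2 = positive).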

## Apple Sentiment

In [None]:
# Load the Apple tweet sentiment dataset.
data = pd.read_csv("data/sentiment/datasets_652925_1154930_apple-twitter-sentiment-texts.csv")

# Shift every label up by one.
# NOTE(review): presumably the raw labels are -1/0/1, giving 0/1/2 — verify against the CSV.
data["sentiment"] = data["sentiment"].map(lambda label: label + 1)

## US Airline Sentiment

In [None]:
# Load the US airline tweets and keep only the text and label columns.
data = pd.read_csv("data/sentiment/Tweets.csv")
data = data[['text', 'airline_sentiment']]
# `rename` relabels the *index* unless `columns=` is given, so the original
# call left the column untouched and `data.sentiment` did not exist.
data = data.rename(columns={'airline_sentiment': 'sentiment'})

# Map the textual labels onto the 0/1/2 class ids used for training.
thisdict = {
  "negative": 0,
  "neutral": 1,
  "positive": 2
}
data["sentiment"] = data["sentiment"].apply(lambda x: thisdict[x])

## T4SA

In [None]:
# Raw tweet texts and their per-class sentiment scores (tab-separated).
tweets = pd.read_csv("data/sentiment/raw_tweets_text.csv")
# Path fixed: the original read from ".data/sentiment/..." while every other
# dataset in this notebook lives under "data/sentiment/".
sentiments = pd.read_csv("data/sentiment/t4sa_text_sentiment.csv", delimiter="\t")

# Index both frames by tweet id so that join() aligns them on that id.
tweets = tweets.set_index(tweets.id)
sentiments = sentiments.set_index(sentiments.TWID)

# Keep only tweets that have both text and scores, then drop the id columns.
data = tweets.join(sentiments).dropna().drop(columns=['id', 'TWID'])

# The label is the name of the column holding the largest of the three scores.
data["sentiment"] = data[['NEU', 'NEG', 'POS']].idxmax(axis=1)

# Copy so the label assignment below does not write into a slice of the joined frame.
data = data[['text', 'sentiment']].copy()

# Map the column names onto the 0/1/2 class ids used for training.
thisdict = {
  "NEG": 0,
  "NEU": 1,
  "POS": 2
}
data["sentiment"] = data["sentiment"].apply(lambda x: thisdict[x])

General Text Cleaning

In [None]:
# Lower-case everything first so the later steps see uniform text.
data.text = data.text.str.lower()

# Remove URLs.
data.text = data.text.apply(lambda x: re.sub(r'http\S+', '', x))

# Tweet-aware tokenisation: drops @handles and shortens elongated words.
# Named `tweet_tokenizer` because a later cell rebinds `tokenizer` to the XLNet
# tokenizer, which would break this cell if it were re-run afterwards.
tweet_tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
data.text = data.text.apply(lambda x: ' '.join(tweet_tokenizer.tokenize(x)))

# Strip punctuation (the translation table is built once, not once per row).
punctuation_table = str.maketrans('', '', string.punctuation)
data.text = data.text.map(lambda x: x.translate(punctuation_table))

# Replace digits with spaces. `regex=True` is explicit because the default of
# `Series.str.replace` changed to literal matching in pandas 2.0, which would
# silently turn this step into a no-op.
data.text = data.text.str.replace("[0-9]", " ", regex=True)

data.text = data.text.str.strip(string.whitespace)

# Fixed seed keeps the train/test split reproducible.
df_train, df_test = train_test_split(data, test_size=0.33, random_state=42)

df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [None]:
# Display the training split created by train_test_split above.
df_train

# Tokenization
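Run this section only for the manual (non-Simple-Transformers) training path; the Simple Transformers section further down does its own tokenization.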

In [None]:
# XLNet expects each sequence to END with its separator and classification
# tokens, and its vocabulary spells them "<sep>" and "<cls>". The BERT-style
# "[SEP] [CLS]" used before is not in the XLNet vocabulary, so it was split
# into ordinary sub-word pieces. The leading space keeps the tag from being
# glued to the last word of the sentence.
XLNET_SUFFIX = " <sep> <cls>"

sentences_train = [sentence + XLNET_SUFFIX for sentence in df_train['text']]
sentences_test = [sentence + XLNET_SUFFIX for sentence in df_test['text']]

In [None]:
sentences_train[0]  # Check that the special tags were appended

### Inputs

1. The XLNet tokenizer splits our text into tokens that belong to XLNet's vocabulary.
2. Each token is then replaced by its index in that vocabulary, so every sentence becomes a sequence of integers.
    - The same XLNet tokenizer performs this token-to-index conversion.


In [None]:
# Load the pretrained XLNet vocabulary and split every sentence into sub-word tokens.
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased', do_lower_case=True)
tokenized_text_train = list(map(tokenizer.tokenize, sentences_train))
tokenized_text_test = list(map(tokenizer.tokenize, sentences_test))

In [None]:
# Inspect the tokens of the first training sentence.
tokenized_text_train[0]

In [None]:
# Replace each token with its index in the XLNet vocabulary.
ids_train = list(map(tokenizer.convert_tokens_to_ids, tokenized_text_train))
ids_test = list(map(tokenizer.convert_tokens_to_ids, tokenized_text_test))

In [None]:

# Class ids for each split, as plain arrays.
labels_train = df_train["sentiment"].values
labels_test = df_test["sentiment"].values

Find the longest tokenized sentence across the train and test sets so that every shorter sequence can be padded to that length.

In [None]:
# Length of the longest id sequence in each split.
MAX_LEN_TRAIN = max(len(seq) for seq in ids_train)
MAX_LEN_TEST = max(len(seq) for seq in ids_test)

# Both splits are padded to the longer of the two.
MAX_LEN = max(MAX_LEN_TRAIN, MAX_LEN_TEST)

print(MAX_LEN)

We pad our sentences

In [None]:
# Pad (or cut) every id sequence at its end so that all rows have MAX_LEN entries.
# NOTE(review): pad_sequences fills with 0 by default and no attention mask is
# passed to the model later in this notebook — confirm that is intended for XLNet.
pad_kwargs = dict(maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
input_ids_train2 = pad_sequences(ids_train, **pad_kwargs)
input_ids_test2 = pad_sequences(ids_test, **pad_kwargs)

In [None]:
# Short aliases for the padded inputs and their labels.
xtrain, xtest = input_ids_train2, input_ids_test2
ytrain, ytest = labels_train, labels_test

In [None]:
# Convert the arrays to torch tensors.
Xtrain, Ytrain, Xtest, Ytest = (
    torch.tensor(array) for array in (xtrain, ytrain, xtest, ytest)
)

In [None]:
# Number of examples per batch, used by both the train and the test DataLoader.
batch_size = 10

In [None]:
# Pair each padded input row with its label.
train_data = TensorDataset(Xtrain,Ytrain)
test_data = TensorDataset(Xtest,Ytest)
# Shuffle the training data so every epoch sees the batches in a new order;
# without it all epochs replay the exact same batch sequence.
loader = DataLoader(train_data,batch_size=batch_size,shuffle=True)
# Evaluation keeps the original order so predictions line up with labels_test.
test_loader = DataLoader(test_data,batch_size=batch_size)

In [None]:
# Pretrained XLNet configuration with the number of output labels
# overridden to 3 (negative / neutral / positive).
config = XLNetConfig.from_pretrained('xlnet-base-cased', num_labels=3)
config

In [None]:
# Load the *pretrained* XLNet weights and attach a 3-way classification head.
# Calling XLNetForSequenceClassification(config) directly only builds the
# architecture with randomly initialised weights, i.e. trains from scratch.
model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased', config=config)
# Use the `device` chosen earlier so the cell also works on CPU-only machines.
model.to(device)

In [None]:
# AdamW updates every model parameter, with a learning rate of 2e-5.
optimizer = AdamW(
    model.parameters(),
    lr=2e-5,
)

In [None]:
import torch.nn as nn
# Cross-entropy loss; the training loop applies it to the model's first output and the integer labels.
criterion = nn.CrossEntropyLoss()

In [None]:
import numpy as np
def flat_accuracy(preds,labels):
  """Return the percentage (0-100) of positions where the prediction matches the label.

  Args:
    preds: indexable sequence of predicted class ids, at least as long as `labels`.
    labels: sequence of true class ids.

  Returns:
    Accuracy as a float percentage; 0.0 when `labels` is empty (this used to
    raise ZeroDivisionError).

  Raises:
    IndexError: if `preds` is shorter than `labels`.
  """
  total = len(labels)
  if total == 0:
    return 0.0
  correct = sum(1 for i in range(total) if preds[i] == labels[i])
  return (correct/total)*100


# Training (non simple)

In [None]:
no_train = 0   # running count of training examples seen across all epochs
epochs = 5
for epoch in range(epochs):
  model.train()
  loss1 = []          # per-batch loss values of this epoch
  steps = 0
  epoch_preds = []    # predicted class ids of this epoch
  epoch_labels = []   # matching true class ids
  for inputs,labels1 in loader :
    # Tensor.to() returns a new tensor, so the result has to be kept.
    inputs = inputs.to(device)
    labels1 = labels1.to(device)
    optimizer.zero_grad()
    outputs = model(inputs)
    # outputs[0] is scored against the labels and arg-maxed into class ids below.
    loss = criterion(outputs[0],labels1)
    # One bulk device-to-host transfer per batch instead of one .item() per element.
    epoch_preds.extend(torch.argmax(outputs[0],dim=1).tolist())
    epoch_labels.extend(labels1.tolist())
    loss.backward()
    optimizer.step()
    loss1.append(loss.item())
    no_train += inputs.size(0)
    steps += 1
  # The reported loss is the loss of the LAST batch of the epoch; the accuracy covers the whole epoch.
  print("Current Loss is : {} Step is : {} number of Example : {} Accuracy : {}".format(loss.item(),epoch,no_train,flat_accuracy(epoch_preds,epoch_labels)))


- `torch.argmax()` returns the index of the largest value.
- `axis=1` makes it search along each row, so it returns one predicted class index per example in the batch.

## Evaluation

In [None]:
model.eval()
predictions = []

# Inference only: disabling gradient tracking avoids building the autograd
# graph for every batch, which saves memory and time.
with torch.no_grad():
  for inp,_labels in test_loader:
    outp1 = model(inp.to(device))
    # Predicted class = index of the largest value in the model's first output.
    predictions.extend(torch.argmax(outp1[0],dim=1).tolist())

In [None]:
from sklearn import metrics

# Per-class F1 (average=None gives one score per class) and overall accuracy.
per_class_f1 = metrics.f1_score(labels_test, predictions, average=None)
overall_accuracy = metrics.accuracy_score(labels_test, predictions)

print(per_class_f1)
print(overall_accuracy)

# Using SimpleTransformers

In [None]:
# ClassificationModel was used without ever being imported, so this cell
# failed with NameError on a fresh kernel.
from simpletransformers.classification import ClassificationModel

# XLNet classifier with three output classes, trained on the GPU.
model = ClassificationModel('xlnet', 'xlnet-base-cased', num_labels=3, use_cuda=True, args={
    'learning_rate':3e-5,
    'num_train_epochs': 5,
    'reprocess_input_data': True,
    'overwrite_output_dir': True,
    'process_count': 10,
    'train_batch_size': 4,
    'eval_batch_size': 4,
    'max_seq_length': 512,
    'fp16': True
})

# NOTE(review): df_train carries `text` / `sentiment` columns — confirm that
# train_model picks the label column up as intended.
model.train_model(df_train)

## Evaluation

In [None]:
import numpy as np

# eval_model returns three values; only the second (the raw model outputs) is used here.
_result, model_outputs_test, _wrong_predictions = model.eval_model(df_test)

# Predicted class = index of the largest output per row.
preds_test = np.argmax(model_outputs_test, axis=1)

In [None]:
from sklearn.metrics import f1_score, accuracy_score

# Per-class F1 (average=None gives one score per class) and overall accuracy.
true_labels = df_test.sentiment
per_class_f1 = f1_score(true_labels, preds_test, average=None)
overall_accuracy = accuracy_score(true_labels, preds_test)

print(per_class_f1)
print(overall_accuracy)