# XLNet for Sentiment Analysis

# Imports and installs

In [1]:
from transformers import XLNetTokenizer,XLNetForSequenceClassification, XLNetConfig
from transformers import AdamW
import matplotlib.pyplot as plt
from keras.preprocessing.sequence import pad_sequences
import torch
from torch.utils.data import TensorDataset,DataLoader,RandomSampler,SequentialSampler


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
import pandas as pd
import string
import re

from nltk.tokenize import TweetTokenizer
from sklearn.model_selection import train_test_split

# Read files
Only execute one of these

## Apple Sentiment

In [3]:
data = pd.read_csv("data/sentiment/datasets_652925_1154930_apple-twitter-sentiment-texts.csv")

data.sentiment = data.sentiment.apply(lambda x: x + 1)

## US Airline Sentiment

In [3]:
data = pd.read_csv("data/sentiment/Tweets.csv")
data = data[['text', 'airline_sentiment']]
data.rename(columns={'airline_sentiment' : 'sentiment'}, inplace=True)

thisdict =	{
  "negative": 0,
  "neutral": 1,
  "positive": 2
}
data.sentiment = data.sentiment.apply(lambda x: thisdict[x])

## T4SA

In [3]:
tweets = pd.read_csv("data/sentiment/raw_tweets_text.csv")
sentiments = pd.read_csv("data/sentiment/t4sa_text_sentiment.csv",delimiter = "\t")

tweets.set_index(tweets.id, inplace=True)
sentiments.set_index(sentiments.TWID, inplace=True)
data=tweets.join(sentiments)
data.dropna(inplace=True)
data.drop(columns=['id', 'TWID'], inplace=True)
data["sentiment"] = data[['NEU', 'NEG', 'POS']].idxmax(axis=1)

data = data[['text', 'sentiment']]

thisdict =	{
  "NEG": 0,
  "NEU": 1,
  "POS": 2
}
data.sentiment = data.sentiment.apply(lambda x: thisdict[x])

General Text Cleaning

In [4]:
data.text = data.text.str.lower()

data.text = data.text.apply(lambda x:re.sub(r'http\S+', '', x))

tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
data.text = data.text.apply(lambda x: tokenizer.tokenize(x))

data.text = data.text.apply(lambda x: ' '.join(x))

data.text = data.text.map(lambda x : x.translate(str.maketrans('', '', string.punctuation)))

data.text = data.text.str.replace("[0-9]", " ")

data.text = data.text.str.strip(string.whitespace)

df_train, df_test = train_test_split(data, test_size=0.33, random_state=42)

df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [5]:
df_train

Unnamed: 0,text,sentiment
0,we need more products like companies like ...,1
1,legit thought that was you in this pic,1
2,aapl rt alex gauna flips his apple bit sets s...,1
3,here are the main differences between apple ca...,1
4,trade aapl free nightly updates are posted he...,1
...,...,...
1087,i kinda feel sorry for tho goodjob with your ...,0
1088,your ipad game just got shut way down like ...,0
1089,been waiting days so far for to approve the ...,0
1090,my tmobile apple cellular hell iphone iphon...,0


# Tokenization
Only execute with non-simple Training

In [5]:
sentences_train  = []
for sentence in df_train['text']:
  sentence = sentence+"[SEP] [CLS]"
  sentences_train.append(sentence)
    
sentences_test  = []
for sentence in df_test['text']:
  sentence = sentence+"[SEP] [CLS]"
  sentences_test.append(sentence)

In [None]:
sentences_train[0] ##To check if tags are added or not

### Inputs

1. XLNet tokenizer is used to convert our text into tokens that correspond to   XLNet’s vocabulary.
2. a sequence of integers identifying each input token to its index number in the XLNet tokenizer 
    - Use the XLNet tokenizer to convert the tokens to their index numbers in the XLNet vocabulary


In [6]:
tokenizer  = XLNetTokenizer.from_pretrained('xlnet-base-cased',do_lower_case=True)
tokenized_text_train = [tokenizer.tokenize(sent) for sent in sentences_train]
tokenized_text_test = [tokenizer.tokenize(sent) for sent in sentences_test]

In [7]:
tokenized_text_train[0]

['▁for',
 '▁all',
 '▁who',
 '▁served',
 '▁and',
 '▁fought',
 '▁for',
 '▁our',
 '▁country',
 '▁we',
 '▁thank',
 '▁you',
 '▁salute',
 'to',
 'service',
 '▁veterans',
 'day',
 '[',
 's',
 'ep',
 ']',
 '▁[',
 'cl',
 's',
 ']']

In [8]:
ids_train = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_text_train]
ids_test = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_text_test]

In [9]:

labels_train = df_train.sentiment.values

labels_test = df_test.sentiment.values

 We find the maximum length of our sentences so that we can pad the rest

In [10]:
max1 = len(ids_train[0])
for i in ids_train:
  if(len(i)>max1):
    max1=len(i)
    
MAX_LEN_TRAIN = max1

max1 = len(ids_test[0])
for i in ids_test:
  if(len(i)>max1):
    max1=len(i)
    
MAX_LEN_TEST = max1

if (MAX_LEN_TEST > MAX_LEN_TRAIN):
    MAX_LEN = MAX_LEN_TEST 
else :
    MAX_LEN = MAX_LEN_TRAIN
    
    
print(MAX_LEN)

144


We pad our sentences

In [11]:
input_ids_train2 = pad_sequences(ids_train,maxlen=MAX_LEN,dtype="long",truncating="post",padding="post")
input_ids_test2 = pad_sequences(ids_test,maxlen=MAX_LEN,dtype="long",truncating="post",padding="post")

In [12]:
xtrain = input_ids_train2
xtest = input_ids_test2
ytrain = labels_train
ytest = labels_test

In [13]:
Xtrain = torch.tensor(xtrain)
Ytrain = torch.tensor(ytrain)
Xtest = torch.tensor(xtest)
Ytest = torch.tensor(ytest)

In [14]:
batch_size = 10

In [15]:
train_data = TensorDataset(Xtrain,Ytrain)
test_data = TensorDataset(Xtest,Ytest)
loader = DataLoader(train_data,batch_size=batch_size)
test_loader = DataLoader(test_data,batch_size=batch_size)

In [16]:
config = XLNetConfig.from_pretrained('xlnet-base-cased')
# Set number of output labels
config.num_labels = 3
config

XLNetConfig {
  "architectures": [
    "XLNetLMHeadModel"
  ],
  "attn_type": "bi",
  "bi_data": false,
  "bos_token_id": 1,
  "clamp_len": -1,
  "d_head": 64,
  "d_inner": 3072,
  "d_model": 768,
  "dropout": 0.1,
  "end_n_top": 5,
  "eos_token_id": 2,
  "ff_activation": "gelu",
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-12,
  "mem_len": null,
  "model_type": "xlnet",
  "n_head": 12,
  "n_layer": 12,
  "pad_token_id": 5,
  "reuse_len": null,
  "same_length": false,
  "start_n_top": 5,
  "summary_activation": "tanh",
  "summary_last_dropout": 0.1,
  "summary_type": "last",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 250
    }
  },
  "untie_r": true,
  "vocab_size": 32000
}

In [17]:
model = XLNetForSequenceClassification(config)
model.cuda()

XLNetForSequenceClassification(
  (transformer): XLNetModel(
    (word_embedding): Embedding(32000, 768)
    (layer): ModuleList(
      (0): XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (layer_1): Linear(in_features=768, out_features=3072, bias=True)
          (layer_2): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (1): XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e

In [18]:
optimizer = AdamW(model.parameters(),lr=2e-5)# We pass model parameters

In [19]:
import torch.nn as nn
criterion = nn.CrossEntropyLoss()

In [None]:
import numpy as np
def flat_accuracy(preds,labels):  # A function to predict Accuracy
  correct=0
  for i in range(0,len(labels)):
    if(preds[i]==labels[i]):
      correct+=1
  return (correct/len(labels))*100


# Training (non simple)

In [21]:
no_train = 0
epochs = 5
for epoch in range(epochs):
  model.train()
  loss1 = []
  steps = 0
  train_loss = []
  l = []
  for inputs,labels1 in loader :
    inputs.to(device)
    labels1.to(device)
    optimizer.zero_grad()
    outputs = model(inputs.to(device))
    loss = criterion(outputs[0],labels1.to(device)).to(device)
    logits = torch.max(outputs[0], 1)[1]
    #ll=outp(loss)
    [train_loss.append(p.item()) for p in torch.argmax(outputs[0],axis=1).flatten() ]#our predicted 
    [l.append(z.item()) for z in labels1]# real labels
    loss.backward()
    optimizer.step()
    loss1.append(loss.item())
    no_train += inputs.size(0)
    steps += 1
  print("Current Loss is : {} Step is : {} number of Example : {} Accuracy : {}".format(loss.item(),epoch,no_train,flat_accuracy(train_loss,l)))


KeyboardInterrupt: 

- torch.argmax() returns the index of the max number 
- axis = 1 means that it will search maximum number in a row

## Evaluation

In [None]:
model.eval()
predictions = []

for inp,lab1 in test_loader:
  inp.to(device)
  lab1.to(device)
  outp1 = model(inp.to(device))
  _, pred_label = torch.max(outp1[0], 1)
  [predictions.append(p1.item()) for p1 in torch.argmax(outp1[0],1).flatten()]

In [None]:
from sklearn import metrics

print(metrics.f1_score(labels_test, predictions, average=None))
print(metrics.accuracy_score(labels_test, predictions))

# Using SimpleTransformers

In [6]:
from simpletransformers.classification import ClassificationModel

model = ClassificationModel('xlnet', 'xlnet-base-cased', num_labels=3, use_cuda=True, args={
    'learning_rate':3e-5,
    'num_train_epochs': 5,
    'reprocess_input_data': True,
    'overwrite_output_dir': True,
    'process_count': 10,
    'train_batch_size': 32,
    'eval_batch_size': 32,
    'max_seq_length': 512,
    'n_gpu' : 16,
    'fp16': False
})

model.train_model(df_train)

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


HBox(children=(FloatProgress(value=0.0, max=1092.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0', max=35.0, style=ProgressStyle(descripti…

  attn_score = (ac + bd + ef) * self.scale


Running loss: 0.289040




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1', max=35.0, style=ProgressStyle(descripti…

Running loss: 0.618159



Running loss: 0.061754


HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2', max=35.0, style=ProgressStyle(descripti…

Running loss: 0.077173


HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3', max=35.0, style=ProgressStyle(descripti…

Running loss: 0.717344


HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4', max=35.0, style=ProgressStyle(descripti…

Running loss: 0.780357



## Evaluation

In [7]:
import numpy as np
_, model_outputs_test, _ = model.eval_model(df_test)

preds_test = np.argmax(model_outputs_test, axis=1)

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


HBox(children=(FloatProgress(value=0.0, max=538.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=17.0, style=ProgressStyle(descri…




In [8]:
from sklearn.metrics import f1_score, accuracy_score


print(f1_score(df_test.sentiment, preds_test, average=None))
print(accuracy_score(df_test.sentiment, preds_test))

[0.87719298 0.87867647 0.63157895]
0.8605947955390335


In [9]:
data

Unnamed: 0,text,sentiment
0,what said,1
1,plus youve added commercials to the experience...,2
2,i didnt today must mean i need to take anothe...,1
3,its really aggressive to blast obnoxious ente...,0
4,and its a really big bad thing about it,0
...,...,...
14635,thank you we got on a different flight to chicago,2
14636,leaving over minutes late flight no warnin...,0
14637,please bring american airlines to blackberry,1
14638,you have my money you change my flight and d...,0
