# XLNet for stance detection
This notebook shows how to use XLNet for stance detection with two sample datasets.

## Imports and installs

In [1]:
from transformers import XLNetTokenizer,XLNetForSequenceClassification, XLNetConfig
from transformers import AdamW
import matplotlib.pyplot as plt
from keras.preprocessing.sequence import pad_sequences
import torch
from torch.utils.data import TensorDataset,DataLoader,RandomSampler,SequentialSampler


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Imports for data cleaning and evaluation

In [2]:
import csv
import pandas as pd
from tqdm import tqdm
import string
import re
import numpy as np

from simpletransformers.classification import ClassificationModel
from sklearn.model_selection import train_test_split
from nltk.tokenize import TweetTokenizer

# Read files
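Run only one of the two dataset sections below (Fake News or SemEval). Each one defines `df_train`, `df_test` and `num_stances`, so running both leaves only the second dataset loaded.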

## Fake News Dataset

In [3]:
# Load the Fake News stance and body tables, both keyed by 'Body ID'.
stances = pd.read_csv('data/stance/train_stances.csv').set_index('Body ID')
bodies = pd.read_csv('data/stance/train_bodies.csv').set_index('Body ID')

# Attach the article body to every headline that shares its 'Body ID'.
# .copy() gives an independent frame, so the column assignments below do not
# trigger pandas' SettingWithCopyWarning.
data = (
    stances.join(bodies)
    .rename(columns={"Headline": "text_a", "articleBody": "text_b", "Stance" : "labels"})
    [["text_a", "text_b", "labels"]]
    .copy()
)

# Same cleaning for both text columns: strip punctuation, lowercase, trim whitespace.
# The translation table is built once instead of once per row.
punctuation_table = str.maketrans('', '', string.punctuation)
for column in ("text_a", "text_b"):
    data[column] = (
        data[column]
        .map(lambda text: text.translate(punctuation_table))
        .str.lower()
        .str.strip(string.whitespace)
    )

# Encode the stance names as integer class ids. An unknown stance raises KeyError.
thisdict =	{
  "unrelated": 0,
  "agree": 1,
  "discuss": 2,
  "disagree": 3
}
data["labels"] = data["labels"].apply(lambda stance: thisdict[stance])

# NOTE(review): the split is per (headline, body) row, so the same article body can
# presumably land in both splits -- confirm whether that leakage is acceptable.
df_train, df_test = train_test_split(data, test_size=0.33, random_state=42)

# Replace the 'Body ID' index with a 0..n-1 index; later cells index rows by position.
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

num_stances = len(thisdict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [4]:
# Display the training split; the recorded output below is the truncated frame.
df_train

Unnamed: 0,text_a,text_b,labels
0,american bombing had signed my death certifica...,a schoolboy who was almost killed when he was ...,0
1,isis border crisis dhs chief says terrorists n...,the pentagon has confirmed that the weapons we...,0
2,kevin vickers sergeantatarms shoots a shooting...,two friends have a 20yearold mcdonald’s quarte...,0
3,that story about a catholic priest dying seein...,absolutely awful news media are reporting that...,0
4,its rubbish that robert plant turned down £500...,robert plant’s publicist has described as “rub...,1
...,...,...,...
33476,hbo streaming service could launch in april fo...,cnn the mystery surrounding north koreas erra...,0
33477,news youll never guess how a homeless man spen...,you know that tendency we have to judge people...,1
33478,hostage david haines murder evil pm says,when a report went viral that nbc meteorologis...,0
33479,isis leader dead,new delhi ak verma an executive engineer at th...,0


## SemEval

In [3]:
# The training annotations are read as ISO-8859-2; the two test files (Task A and
# Task B) are read with the reader's default encoding, as before.
# NOTE(review): confirm the test files do not need the same explicit encoding.
df_train = pd.read_csv('data/stance/trainingdata-all-annotations.txt', delimiter='\t', encoding='iso8859-2')
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df_test = pd.concat(
    [
        pd.read_csv('data/stance/testdata-taskA-all-annotations.txt', delimiter='\t'),
        pd.read_csv('data/stance/testdata-taskB-all-annotations.txt', delimiter='\t'),
    ],
    ignore_index=True,
)

tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
punctuation_table = str.maketrans('', '', string.punctuation)

# Integer class id for each stance name. An unknown stance raises KeyError.
thisdict =	{
  "AGAINST": 0,
  "FAVOR": 1,
  "NONE": 2
}


def _prepare_semeval_frame(frame, tweet_tokenizer):
    """Return a cleaned copy of a raw SemEval frame with text_a/text_b/labels columns.

    Steps, in order: drop the unused annotation columns, rename Target/Tweet/Stance,
    then for the tweet text remove URLs, run the tweet tokenizer and re-join the
    tokens with spaces, strip punctuation, lowercase and trim whitespace. The target
    (text_a) is only lowercased. Stance names are mapped to ids through `thisdict`.
    """
    frame = (
        frame
        .drop(columns=['ID', 'Opinion towards', 'Sentiment'])
        .rename(columns={"Target": "text_a", "Tweet": "text_b", "Stance" : "labels"})
        .reset_index(drop=True)
    )
    tweets = (
        frame.text_b
        .apply(lambda tweet: re.sub(r'http\S+', '', tweet))
        .apply(lambda tweet: ' '.join(tweet_tokenizer.tokenize(tweet)))
        .map(lambda tweet: tweet.translate(punctuation_table))
        .str.lower()
        .str.strip(string.whitespace)
    )
    return frame.assign(
        text_a=frame.text_a.str.lower(),
        text_b=tweets,
        labels=frame.labels.apply(lambda stance: thisdict[stance]),
    )


df_train = _prepare_semeval_frame(df_train, tokenizer)
df_test = _prepare_semeval_frame(df_test, tokenizer)

num_stances = len(thisdict)

In [6]:
# Number of training examples; a bare `df_train` would render the whole frame instead.
len(df_train)

2814

# Tokenization
Run this section only for the manual (non-SimpleTransformers) training path; skip it if you train with SimpleTransformers below.

In [4]:
# XLNet's special tokens are "<sep>" and "<cls>", and a sentence pair is laid out as
# "A <sep> B <sep> <cls>". The BERT-style "[SEP]"/"[CLS]" strings are not special
# tokens for the XLNet tokenizer and would be split into ordinary pieces.
def _pair_sentence(text_a, text_b):
  """Join a (target/headline, text) pair into one XLNet-formatted input string."""
  return text_a + " <sep> " + text_b + " <sep> <cls>"


sentences_train = [
  _pair_sentence(text_a, text_b)
  for text_a, text_b in zip(df_train.text_a, df_train.text_b)
]

sentences_test = [
  _pair_sentence(text_a, text_b)
  for text_a, text_b in zip(df_test.text_a, df_test.text_b)
]

In [5]:
# From here on `tokenizer` is the XLNet tokenizer; it replaces any earlier binding of
# that name (the SemEval cell uses it for a TweetTokenizer).
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased', do_lower_case=True)

# One list of token strings per input sentence, for each split.
tokenized_text_train = list(map(tokenizer.tokenize, sentences_train))
tokenized_text_test = list(map(tokenizer.tokenize, sentences_test))

In [None]:
# Map every token sequence to its vocabulary ids, keeping one list per sentence.
ids_train = list(map(tokenizer.convert_tokens_to_ids, tokenized_text_train))
ids_test = list(map(tokenizer.convert_tokens_to_ids, tokenized_text_test))

In [None]:
# Integer class ids as plain arrays, in the same row order as the sentences.
labels_train = df_train.labels.values
labels_test = df_test.labels.values

In [None]:
# Longest id sequence in each split. The built-in max() replaces the hand-written
# scan; it raises ValueError on an empty split where the old code raised IndexError.
MAX_LEN_TRAIN = max(len(ids) for ids in ids_train)
MAX_LEN_TEST = max(len(ids) for ids in ids_test)

# Both splits are padded to one common length, so take the larger of the two.
MAX_LEN = max(MAX_LEN_TRAIN, MAX_LEN_TEST)

print(MAX_LEN)

In [None]:
# XLNet's classification head reads the LAST position of the sequence, so padding has
# to go on the left ("pre"); right-padding would put a padding id in that position.
# Padding uses the tokenizer's own pad id instead of 0, which is a real vocabulary id.
# MAX_LEN is the longest sequence over both splits, so nothing is actually truncated.
# NOTE(review): the training and evaluation loops pass no attention_mask, so the
# padded positions are still attended to.
pad_id = tokenizer.pad_token_id
input_ids_train2 = pad_sequences(ids_train,maxlen=MAX_LEN,dtype="long",truncating="post",padding="pre",value=pad_id)
input_ids_test2 = pad_sequences(ids_test,maxlen=MAX_LEN,dtype="long",truncating="post",padding="pre",value=pad_id)

In [None]:
# Short aliases for the padded inputs (x*) and integer labels (y*) of each split.
xtrain, ytrain = input_ids_train2, labels_train
xtest, ytest = input_ids_test2, labels_test

In [None]:
# Convert the arrays to torch tensors; capitalised names hold the tensor versions.
Xtrain, Ytrain = torch.tensor(xtrain), torch.tensor(ytrain)
Xtest, Ytest = torch.tensor(xtest), torch.tensor(ytest)

In [None]:
# Mini-batch size shared by the training and test DataLoaders built in the next cell.
batch_size = 5

In [None]:
# Pair each padded input row with its label.
train_data = TensorDataset(Xtrain,Ytrain)
test_data = TensorDataset(Xtest,Ytest)

# Training batches are drawn in random order so consecutive batches are not tied to
# the row order of the source file.
loader = DataLoader(train_data,sampler=RandomSampler(train_data),batch_size=batch_size)
# The test loader stays in the original order so predictions line up with labels_test.
test_loader = DataLoader(test_data,sampler=SequentialSampler(test_data),batch_size=batch_size)

In [None]:
# Load the configuration registered under the name 'xlnet-base-cased'.
config = XLNetConfig.from_pretrained('xlnet-base-cased')
# Set number of output labels
config.num_labels = num_stances
# NOTE(review): no other cell in this notebook reads config.n_gpu; presumably it has
# no effect on the model -- confirm before relying on it.
config.n_gpu=16
# Bare last expression: the notebook displays the resulting configuration.
config

In [None]:
# Load the pretrained 'xlnet-base-cased' weights and attach a classification head
# sized by `config`. Calling the constructor directly, XLNetForSequenceClassification(config),
# would create a randomly initialised network and throw the pretraining away.
model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased', config=config)
# Use the device chosen in the imports cell so the notebook also runs without a GPU.
model.to(device)

In [None]:
# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and weight_decay
# are set explicitly to that class's defaults so the update rule stays the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

In [None]:
# Imported here rather than in the imports cell at the top of the notebook.
import torch.nn as nn
# Loss applied in the training loop to the model's first output and the integer labels.
criterion = nn.CrossEntropyLoss()

In [None]:
def flat_accuracy(preds,labels):
  """Return the percentage (0-100) of positions where preds[i] == labels[i].

  Args:
    preds: indexable sequence of predicted class ids, at least as long as `labels`.
    labels: sized, indexable sequence of true class ids.

  Returns:
    A float between 0 and 100. An empty `labels` gives 0.0 instead of raising
    ZeroDivisionError.

  Raises:
    IndexError: if `preds` is shorter than `labels`.
  """
  total = len(labels)
  if total == 0:
    return 0.0
  # Index by position (not zip) so a too-short `preds` still fails loudly.
  correct = sum(1 for i in range(total) if preds[i] == labels[i])
  return (correct/total)*100

# Training (manual PyTorch loop, without SimpleTransformers)

In [None]:
# Running count of training examples seen across all epochs.
no_train = 0
#Change epochs for training duration
epochs = 2
for epoch in range(epochs):
  model.train()
  epoch_losses = []   # loss value of every batch in this epoch
  epoch_preds = []    # predicted class id for every example in this epoch
  epoch_labels = []   # true class id for every example, in the same order
  steps = 0
  for inputs,labels1 in loader :
    # Tensor.to() returns a new tensor, so the result must be kept; each batch is
    # moved to the device once.
    inputs = inputs.to(device)
    labels1 = labels1.to(device)
    optimizer.zero_grad()
    outputs = model(inputs)
    # The model is called without labels, so its first output is the class logits.
    loss = criterion(outputs[0],labels1)
    loss.backward()
    optimizer.step()
    epoch_losses.append(loss.item())
    epoch_preds.extend(torch.argmax(outputs[0],dim=1).tolist())
    epoch_labels.extend(labels1.tolist())
    no_train += inputs.size(0)
    steps += 1
  # Reports the loss of the last batch, the epoch index, the cumulative number of
  # examples seen and the accuracy over this epoch's predictions.
  print("Current Loss is : {} Step is : {} number of Example : {} Accuracy : {}".format(loss.item(),epoch,no_train,flat_accuracy(epoch_preds,epoch_labels)))


## Evaluation

In [None]:
model.eval()
predictions = []

# Inference only: no_grad() stops autograd from recording the forward pass, which
# would otherwise keep every batch's graph in memory for no benefit.
with torch.no_grad():
  for inp,_ in test_loader:
    outp1 = model(inp.to(device))
    # The first output is the class logits; the arg-max is the predicted class id.
    predictions.extend(torch.argmax(outp1[0],1).tolist())

In [None]:
from sklearn import metrics

# average=None yields one F1 score per class rather than a single averaged value.
per_class_f1 = metrics.f1_score(labels_test, predictions, average=None)
overall_accuracy = metrics.accuracy_score(labels_test, predictions)

print(per_class_f1)
print(overall_accuracy)

# Using SimpleTransformers

In [4]:
# Hyperparameters handed to SimpleTransformers.
# NOTE(review): 'n_gpu' and 'process_count' are hard-coded for a specific machine --
# confirm they suit the hardware this runs on.
train_args = {
    'learning_rate':3e-5,
    'num_train_epochs': 10,
    'reprocess_input_data': True,
    'overwrite_output_dir': True,
    'process_count': 10,
    'train_batch_size': 16,
    'eval_batch_size': 16,
    'max_seq_length': 512,
    'n_gpu' : 16,
    'fp16': False
}

# Enable CUDA only when a GPU is present, matching the CPU fallback of `device` in the
# imports cell; a hard-coded use_cuda=True fails on CPU-only machines.
model = ClassificationModel(
    'xlnet',
    'xlnet-base-cased',
    num_labels=num_stances,
    use_cuda=torch.cuda.is_available(),
    args=train_args,
)

# df_train carries the text_a / text_b / labels columns prepared in the dataset cell.
model.train_model(df_train)

HBox(children=(FloatProgress(value=0.0, max=33481.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=10.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0', max=2093.0, style=ProgressStyle(descrip…



Running loss: 1.021631



Running loss: 0.013357



Running loss: 0.505795


HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1', max=2093.0, style=ProgressStyle(descrip…

Running loss: 0.033231


HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2', max=2093.0, style=ProgressStyle(descrip…

Running loss: 0.008157


HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3', max=2093.0, style=ProgressStyle(descrip…

Running loss: 0.000790


HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4', max=2093.0, style=ProgressStyle(descrip…

Running loss: 0.000085


HBox(children=(FloatProgress(value=0.0, description='Running Epoch 5', max=2093.0, style=ProgressStyle(descrip…

Running loss: 0.000201


HBox(children=(FloatProgress(value=0.0, description='Running Epoch 6', max=2093.0, style=ProgressStyle(descrip…

Running loss: 0.000041


HBox(children=(FloatProgress(value=0.0, description='Running Epoch 7', max=2093.0, style=ProgressStyle(descrip…

Running loss: 0.000016


HBox(children=(FloatProgress(value=0.0, description='Running Epoch 8', max=2093.0, style=ProgressStyle(descrip…

Running loss: 0.000011


HBox(children=(FloatProgress(value=0.0, description='Running Epoch 9', max=2093.0, style=ProgressStyle(descrip…

Running loss: 0.000026



## Evaluation

In [5]:
import numpy as np

# eval_model returns three values; only the middle one is used here.
# Presumably these are (metrics, raw model outputs, misclassified examples) -- confirm
# against the SimpleTransformers documentation.
_metrics, model_outputs_test, _misclassified = model.eval_model(df_test)

# Predicted class id = index of the largest value in each output row.
preds_test = np.argmax(model_outputs_test, axis=1)

HBox(children=(FloatProgress(value=0.0, max=16491.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=1031.0, style=ProgressStyle(desc…

  attn_score = (ac + bd + ef) * self.scale





In [6]:
from sklearn.metrics import f1_score, accuracy_score

# average=None yields one F1 score per class rather than a single averaged value.
per_class_f1 = f1_score(df_test.labels, preds_test, average=None)
overall_accuracy = accuracy_score(df_test.labels, preds_test)

print(per_class_f1)
print(overall_accuracy)

[0.9985844  0.97600651 0.98633333 0.96633663]
0.9941786428961251
