# RoBERTa for Stance detection

# Imports and installs

Imports for SimpleTransformers

In [2]:
import csv
import pandas as pd
from tqdm import tqdm
import string
import re

from simpletransformers.classification import ClassificationModel
from sklearn.model_selection import train_test_split
from nltk.tokenize import TweetTokenizer

# Read files
Only execute one of these

## Fake News Dataset

In [3]:
stances = pd.read_csv('data/stance/train_stances.csv')
bodies = pd.read_csv('data/stance/train_bodies.csv')

stances = stances.set_index(stances['Body ID']).drop('Body ID', axis = 1)
bodies = bodies.set_index(bodies['Body ID']).drop('Body ID', axis = 1)

data = stances.join(bodies)

data.rename(columns={"Headline": "text_a", "articleBody": "text_b", "Stance" : "labels"}, inplace=True)
data = data[['text_a', 'text_b', 'labels']]

data.text_b = data.text_b.map(lambda x : x.translate(str.maketrans('', '', string.punctuation)))
data.text_a = data.text_a.map(lambda x : x.translate(str.maketrans('', '', string.punctuation)))

data.text_a = data.text_a.str.lower()
data.text_b = data.text_b.str.lower()

data.text_a = data.text_a.str.strip(string.whitespace)
data.text_b = data.text_b.str.strip(string.whitespace)

df_train, df_test = train_test_split(data, test_size=0.33, random_state=42)


thisdict =	{
  "unrelated": 0,
  "agree": 1,
  "discuss": 2,
  "disagree": 3
}
df_train.labels = df_train.labels.apply(lambda x: thisdict[x])
df_test.labels = df_test.labels.apply(lambda x: thisdict[x])

df_train.reset_index(inplace=True, drop = True)
df_test.reset_index(inplace=True, drop = True)


num_stances = 4

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [4]:
df_train

Unnamed: 0,text_a,text_b,labels
0,american bombing had signed my death certifica...,a schoolboy who was almost killed when he was ...,0
1,isis border crisis dhs chief says terrorists n...,the pentagon has confirmed that the weapons we...,0
2,kevin vickers sergeantatarms shoots a shooting...,two friends have a 20yearold mcdonald’s quarte...,0
3,that story about a catholic priest dying seein...,absolutely awful news media are reporting that...,0
4,its rubbish that robert plant turned down £500...,robert plant’s publicist has described as “rub...,1
...,...,...,...
33476,hbo streaming service could launch in april fo...,cnn the mystery surrounding north koreas erra...,0
33477,news youll never guess how a homeless man spen...,you know that tendency we have to judge people...,1
33478,hostage david haines murder evil pm says,when a report went viral that nbc meteorologis...,0
33479,isis leader dead,new delhi ak verma an executive engineer at th...,0


## SemEval

In [2]:
df_train = pd.read_csv('data/stance/trainingdata-all-annotations.txt', delimiter='\t', encoding='iso8859-2')
df_test = pd.read_csv('data/stance/testdata-taskA-all-annotations.txt', delimiter='\t'
                     ).append(pd.read_csv('data/stance/testdata-taskB-all-annotations.txt', delimiter='\t'))


df_train.drop(['ID', 'Opinion towards', 'Sentiment'], axis=1, inplace=True)
df_test.drop(['ID', 'Opinion towards', 'Sentiment'], axis = 1, inplace=True)

df_train.rename(columns={"Target": "text_a", "Tweet": "text_b", "Stance" : "labels"}, inplace=True)
df_test.rename(columns={"Target": "text_a", "Tweet": "text_b", "Stance" : "labels"}, inplace=True)

df_train.text_b = df_train.text_b.apply(lambda x:re.sub(r'http\S+', '', x))
df_test.text_b = df_test.text_b.apply(lambda x:re.sub(r'http\S+', '', x))

tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
df_train.text_b = df_train.text_b.apply(lambda x: tokenizer.tokenize(x))
df_test.text_b = df_test.text_b.apply(lambda x: tokenizer.tokenize(x))

df_train.text_b = df_train.text_b.apply(lambda x: ' '.join(x))
df_test.text_b = df_test.text_b.apply(lambda x: ' '.join(x))

df_train.text_b = df_train.text_b.map(lambda x : x.translate(str.maketrans('', '', string.punctuation)))
df_test.text_b = df_test.text_b.map(lambda x : x.translate(str.maketrans('', '', string.punctuation)))

df_train.text_a = df_train.text_a.str.lower()
df_train.text_b = df_train.text_b.str.lower()
df_test.text_a = df_test.text_a.str.lower()
df_test.text_b = df_test.text_b.str.lower()

df_train.text_b = df_train.text_b.str.strip(string.whitespace)
df_test.text_b = df_test.text_b.str.strip(string.whitespace)


df_train.reset_index(inplace=True, drop = True)
df_test.reset_index(inplace=True, drop = True)


thisdict =	{
  "AGAINST": 0,
  "FAVOR": 1,
  "NONE": 2
}
df_train.labels = df_train.labels.apply(lambda x: thisdict[x])
df_test.labels = df_test.labels.apply(lambda x: thisdict[x])

num_stances = 3

In [3]:
df_train

Unnamed: 0,text_a,text_b,labels
0,atheism,dear lord thank u for all of ur blessings forg...,0
1,atheism,blessed are the peacemakers for they shall be...,0
2,atheism,i am not conformed to this world i am transfo...,0
3,atheism,salah should be prayed with focus and understa...,0
4,atheism,and stay in your houses and do not display you...,0
...,...,...,...
2809,legalization of abortion,theres a law protecting unborn eagles but not...,0
2810,legalization of abortion,i am 1 in 3 i have had an abortion abortionon...,0
2811,legalization of abortion,how dare you say my sexual preference is a cho...,0
2812,legalization of abortion,equal rights for those born that way no rig...,0


# Training

In [5]:
model = ClassificationModel('roberta', 'roberta-base', num_labels=num_stances, args={
    'learning_rate':3e-5,
    'num_train_epochs': 10,
    'reprocess_input_data': True,
    'overwrite_output_dir': True,
    'process_count': 10,
    'train_batch_size': 16,
    'eval_batch_size': 16,
    'max_seq_length': 512,
    'n_gpu' : 16,
    'fp16': False
})

model.train_model(df_train)

HBox(children=(FloatProgress(value=0.0, max=33481.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=10.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0', max=2093.0, style=ProgressStyle(descrip…



Running loss: 1.218580



Running loss: 0.096102



Running loss: 0.112723


HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1', max=2093.0, style=ProgressStyle(descrip…

Running loss: 0.553354


HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2', max=2093.0, style=ProgressStyle(descrip…

Running loss: 0.051245


HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3', max=2093.0, style=ProgressStyle(descrip…

Running loss: 0.001406


HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4', max=2093.0, style=ProgressStyle(descrip…

Running loss: 0.010484


HBox(children=(FloatProgress(value=0.0, description='Running Epoch 5', max=2093.0, style=ProgressStyle(descrip…

Running loss: 0.000097


HBox(children=(FloatProgress(value=0.0, description='Running Epoch 6', max=2093.0, style=ProgressStyle(descrip…

Running loss: 0.000474


HBox(children=(FloatProgress(value=0.0, description='Running Epoch 7', max=2093.0, style=ProgressStyle(descrip…

Running loss: 0.000317


HBox(children=(FloatProgress(value=0.0, description='Running Epoch 8', max=2093.0, style=ProgressStyle(descrip…

Running loss: 0.000185


HBox(children=(FloatProgress(value=0.0, description='Running Epoch 9', max=2093.0, style=ProgressStyle(descrip…

Running loss: 0.000526



## Evaluation

In [6]:
import numpy as np
_, model_outputs_test, _ = model.eval_model(df_test)

preds_test = np.argmax(model_outputs_test, axis=1)

HBox(children=(FloatProgress(value=0.0, max=16491.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=1031.0, style=ProgressStyle(desc…




In [7]:
from sklearn.metrics import f1_score, accuracy_score


print(f1_score(df_test.labels, preds_test, average=None))
print(accuracy_score(df_test.labels, preds_test))

[0.99866778 0.96532028 0.98332221 0.93203883]
0.9923594688011643
