# Preparing ROCStories Data

# 1. Load defeasible-inference Classifier
* Model which predicts whether `update` is a weakener given `premise` & `hypothesis`
* trained using delta-atomic part of defeasible-nli dataset 
    * roberta-large based, follows the Rudinger et al. paper

In [1]:
## Load weakener classifier
import torch
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline
device = torch.device("cuda")

model_dir = "" ## put classifier directory here
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSequenceClassification.from_pretrained(model_dir)
model.to(device)
model.eval()
print("Model Loaded")

clf_pipe = pipeline("text-classification", model = model, tokenizer = tokenizer, device = device)

Model Loaded


In [2]:
from typing import List
map_label = lambda x: 0 if x["label"]=="LABEL_0" else 1
map_scores = lambda x: 1-x["score"] if x["label"]=="LABEL_0" else x["score"]

def prepare_model_input(premise: str, hypothesis: str, update: str) -> str:
	input_text = f"[premise] {premise} [hypo] {hypothesis} [update] {update}"
	return input_text

def predict_defeasible_inf(clf_pipe, sources: List[str], batch_size: int = 8) -> List[int]:
	out = clf_pipe(sources, batch_size = batch_size)
	labels = list(map(map_label, out))
	return labels

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [5]:
'''
(45496, 7) Index(['storyid', 'storytitle', 'sentence1', 'sentence2', 'sentence3',
       'sentence4', 'sentence5'],
      dtype='object')
(52665, 7) Index(['storyid', 'storytitle', 'sentence1', 'sentence2', 'sentence3',
       'sentence4', 'sentence5'],
      dtype='object')
(98161, 7) Index(['storyid', 'storytitle', 'sentence1', 'sentence2', 'sentence3',
       'sentence4', 'sentence5'],
      dtype='object')
'''

roc16 = pd.read_csv("ROCStories__spring2016 - ROCStories_spring2016.csv")
print(roc16.shape, roc16.columns)
roc17 = pd.read_csv("ROCStories_winter2017 - ROCStories_winter2017.csv")
print(roc17.shape, roc17.columns)

roc_df = pd.concat([roc16, roc17])
print(roc_df.shape, roc_df.columns)

(45496, 7) Index(['storyid', 'storytitle', 'sentence1', 'sentence2', 'sentence3',
       'sentence4', 'sentence5'],
      dtype='object')
(52665, 7) Index(['storyid', 'storytitle', 'sentence1', 'sentence2', 'sentence3',
       'sentence4', 'sentence5'],
      dtype='object')
(98161, 7) Index(['storyid', 'storytitle', 'sentence1', 'sentence2', 'sentence3',
       'sentence4', 'sentence5'],
      dtype='object')


In [6]:
'''train 79509, val 8835, test 9817'''
train_df, test_df = train_test_split(roc_df, test_size = 0.1, shuffle = True)
train_df, val_df = train_test_split(train_df, test_size = 0.1, shuffle=True)
print("train {}, val {}, test {}".format(train_df.shape[0], val_df.shape[0], test_df.shape[0]))

train 79509, val 8835, test 9817


# 2. Analyze weakener ratio of each positions
* check if sentence position 2,3,4 is a weakener
    * premise (sentence 1), hypothesis (sentence 5)
* only select samples which classifies position 3 as weakener

In [7]:
## Prepare classifier input
from typing import List
def prepare_defeasible_inf_input(df: pd.DataFrame, obs_pos_idx: int = 2) -> List[str]:
	sources = []
	for i in range(df.shape[0]):
		row = df.iloc[i]
		premise = row['sentence1']
		hypothesis = row['sentence5']
		update = row[f'sentence{obs_pos_idx+1}']
		source = prepare_model_input(premise=premise,hypothesis=hypothesis,update=update)
		sources.append(source)
	return sources

In [8]:
## Check Weakener ratio
'''
Pos 1 - Test weakener ratio 0.488
Pos 2 - Test weakener ratio 0.450
Pos 3 - Test weakener ratio 0.344
'''
labels = []
for obs_pos_idx in [1,2,3]: ## Sentence 2,3,4
	test_defeasible_inf_sources = prepare_defeasible_inf_input(test_df, obs_pos_idx=obs_pos_idx)
	# test_defeasible_inf_sources[:4]

	test_labels = predict_defeasible_inf(clf_pipe=clf_pipe, sources = test_defeasible_inf_sources, batch_size = 32)
	print("Pos {} - Test weakener ratio {:.3f}".format(obs_pos_idx, sum(test_labels)/len(test_labels)))
	labels.append(test_labels)

Pos 1 - Test weakener ratio 0.488
Pos 2 - Test weakener ratio 0.450
Pos 3 - Test weakener ratio 0.344


In [12]:
num_samples = len(labels[0])
obs_1_2_labels = [1 if labels[0][i]==1 or labels[1][i]==1 else 0 for i in range(num_samples)]
print("Pos 1 or 2 - Test weakener ratio {:.3f}".format(sum(obs_1_2_labels)/len(obs_1_2_labels)))
# Pos 1 or 2 - Test weakener ratio 0.682

Pos 1 or 2 - Test weakener ratio 0.682


In [13]:
def prepare_data(df: pd.DataFrame) -> pd.DataFrame:
	## Predict Defeasible Inference
	pos1_defeasible_inf_sources = prepare_defeasible_inf_input(df, obs_pos_idx=1)
	pos1_weakener_labels = predict_defeasible_inf(clf_pipe=clf_pipe, sources = pos1_defeasible_inf_sources, batch_size = 32)
	pos2_defeasible_inf_sources = prepare_defeasible_inf_input(df, obs_pos_idx=2)
	pos2_weakener_labels = predict_defeasible_inf(clf_pipe=clf_pipe, sources = pos2_defeasible_inf_sources, batch_size = 32)

	## 
	prepared = {
		"id": [],
		"obs_idx": [],
		"source": []
	}
	story_ids = df.index.tolist()
	for i in range(df.shape[0]):
		row = df.iloc[i]
		story_id = story_ids[i]

		pos1_weakener_label = pos1_weakener_labels[i]
		pos2_weakener_label = pos2_weakener_labels[i]

		if pos2_weakener_label==1: ## select samples with S3 as obstacle
			obs_idx = 2
			source = "<ctx> {} <obs> {} <story><ctx> {} <obs> {} {}".format(
				row['sentence1'], ## context
				row['sentence3'], ## obstacle
				row['sentence2'], ## S2
				row['sentence4'], ## S4
				row['sentence5'], ## S5
			)
		else:
			continue
		
		prepared['id'].append(story_id)
		prepared['obs_idx'].append(obs_idx)
		prepared['source'].append(source)
	return pd.DataFrame.from_dict(prepared)

In [15]:
prepared_train_df = prepare_data(train_df)
print(prepared_train_df.shape)
prepared_train_df.head()

(35778, 3)


Unnamed: 0,id,obs_idx,source
0,23267,2,<ctx> Tom and Emily were going to have a baby!...
1,47771,2,<ctx> Lars drove Millie to the ferry. <obs> Sh...
2,3958,2,<ctx> Last month I got a new haircut. <obs> I ...
3,15978,2,<ctx> Jessie ran a dog sitting company. <obs> ...
4,3704,2,<ctx> Sharon was trying to lose weight. <obs> ...


In [16]:
prepared_val_df = prepare_data(val_df)
print(prepared_val_df.shape)
prepared_val_df.head()

(3994, 3)


Unnamed: 0,id,obs_idx,source
0,32539,2,<ctx> Cathy couldn't understand why her baby h...
1,24080,2,<ctx> Jake was walking along a bike bath. <obs...
2,17313,2,<ctx> Jerry was bored in class. <obs> His prof...
3,35321,2,<ctx> I called the Chinese Restaurant and orde...
4,52314,2,<ctx> Larry had a wonderful relationship and a...


In [17]:
prepared_test_df = prepare_data(test_df)
print(prepared_test_df.shape)
prepared_test_df.head()

(4419, 3)


Unnamed: 0,id,obs_idx,source
0,8022,2,<ctx> I wanted to sell jewelry online. <obs> I...
1,15887,2,<ctx> Larry and his dad got up early to go fis...
2,3255,2,<ctx> Gina had been bullied during recess. <ob...
3,52252,2,<ctx> The ball got stuck on the roof. <obs> Hi...
4,26952,2,<ctx> Gabby was interested in joining the loca...


# Save Processed Files

In [19]:
import os
save_dir = "roc"
if not os.path.exists(save_dir):
	os.makedirs(save_dir)

In [20]:
## Save Files
prepared_train_df.to_csv(os.path.join(save_dir, "roc-train.tsv"), sep = "\t", index = None)
prepared_val_df.to_csv(os.path.join(save_dir, "roc-val.tsv"), sep = "\t", index = None)
prepared_test_df.to_csv(os.path.join(save_dir, "roc-test.tsv"), sep = "\t", index = None)

In [21]:
print(prepared_train_df.shape, prepared_val_df.shape, prepared_test_df.shape)

(35778, 3) (3994, 3) (4419, 3)
