In [1]:
import pickle
import random
from random import shuffle

In [2]:
save_filename = 'Scripts_learning_dataset_proscript_use_flatten_generation_longest.pickle'

In [3]:
# Load the pickled dataset object stored at `save_filename`.
# NOTE: pickle.load can execute arbitrary code embedded in the file, so only
# open pickles that come from a trusted source.
with open(save_filename, 'rb') as handle:
    all_data = pickle.load(handle)    

In [4]:
"""
This example trains a CrossEncoder on [topic, script step] pairs. A CrossEncoder takes a sentence pair
as input and outputs a label. Here, it outputs a single continuous score per input pair.

It does NOT produce a sentence embedding and does NOT work for individual sentences.

Origin:
This cell appears to be adapted from the sentence-transformers example script
training_quora_duplicate_questions.py (Quora Duplicate Questions Detection).

"""
import torch

from torch.utils.data import DataLoader
import math
from sentence_transformers import LoggingHandler, util
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CEBinaryClassificationEvaluator
from sentence_transformers.readers import InputExample
import logging
from datetime import datetime
import os
import gzip
import csv
from zipfile import ZipFile

#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
logger = logging.getLogger(__name__)
#### /print debug information to stdout

In [5]:
# Seed every RNG the notebook draws from so that shuffling, negative sampling
# and training order can be reproduced after a kernel restart.
import numpy as np

SEED = 0
torch.manual_seed(SEED)
random.seed(SEED)
# pandas' DataFrame.sample falls back to NumPy's global RNG when no random_state
# is given, so NumPy has to be seeded as well.
np.random.seed(SEED)

In [6]:
# Preferred device for this experiment is the third GPU ('cuda:2'). Fall back
# to the default GPU, or to the CPU, so the notebook still runs on machines
# with fewer GPUs or without CUDA.
if torch.cuda.is_available() and torch.cuda.device_count() > 2:
    use_device = 'cuda:2'
elif torch.cuda.is_available():
    use_device = 'cuda'
else:
    use_device = 'cpu'

In [7]:
import pandas as pd
all_df = all_data

In [8]:
all_df

Unnamed: 0,source,topic,text,longest_script
0,proscript,ride a train,"[[decided to ride a train, walk to ticket boot...","[decided to ride a train, walk to ticket booth..."
1,proscript,win the minor league baseball,"[[decided to win the minor league baseball, ge...","[decided to win the minor league baseball, get..."
2,proscript,catch a big marlin,"[[decided to catch a big marlin, look up local...","[decided to catch a big marlin, look up local ..."
3,proscript,make banana muffins,"[[decided to make banana muffins, Gather all o...","[decided to make banana muffins, Gather all of..."
4,proscript,eat some food,"[[decided to eat some food, Drive to pizza pla...","[decided to eat some food, Drive to pizza plac..."
...,...,...,...,...
3778,wikihow,How to Write a Newspaper?,"[[Determine you newspaper's niche., Choose a g...",[Choose the most relevant or compelling storie...
3779,wikihow,How to Write a Nonprofit Governing Board State...,"[[Describe the organization’s purpose., Descri...","[Describe the organization’s purpose., Describ..."
3780,wikihow,How to Write a Plaintiff's Affidavit?,[[Identify different types of plaintiff’s affi...,"[Reread your complaint., Insert the caption in..."
3781,wikihow,How to Write a Spec Script for TV?,"[[Choose a TV show., Track down a script., Stu...","[Choose a good writing environment., Write it ..."


In [9]:
all_df.loc[1]['topic']

'win the minor league baseball'

In [10]:
# Shuffle the DataFrame rows.
# A fixed random_state makes the downstream train/dev/test split reproducible:
# without it DataFrame.sample draws from NumPy's global RNG, which
# torch.manual_seed and random.seed do not control.
shuffled_df = all_df.sample(frac=1, random_state=0).reset_index(drop=True)

shuffled_df

Unnamed: 0,source,topic,text,longest_script
0,proscript,buy some juggling balls,"[[go to amazon.com, look around website, locat...","[go to amazon.com, look around website, locate..."
1,wikihow,How to Share Books on Kindle?,"[[Create an Amazon Household., Visit the “Mana...","[Create an Amazon Household., Visit the “Manag..."
2,proscript,drop bags on ground near car,"[[take bags outside, look at car, point body a...","[take bags outside, look at car, point body at..."
3,stories_xml,give a medicine,"[[go to the place where medicine is kept ., id...","[read instructions ., read instructions ., ope..."
4,wikihow,How to Escape the Small Town Blues?,"[[Start a new hobby., Register for a class., T...","[Start a new hobby., Register for a class., Ta..."
...,...,...,...,...
3778,wikihow,How to Save an Attachment to Your Computer?,"[[Log into your e-mail account., Go to your in...","[Log into your e-mail account., Go to your inb..."
3779,wikihow,How to Cover up a Fart in High School Settings?,"[[Pretend the fart was your shoe or chair., Pr...","[Pretend the fart was your shoe or chair., Pre..."
3780,proscript,get the tiller,"[[kill any weeds, look for tiller, find the ti...","[kill any weeds, look for tiller, find the til..."
3781,wikihow,How to Prevent Stains on Dentures?,"[[Purchase a denture cleaning product., Follow...","[Mix vinegar and water., Soak your dentures in..."


In [11]:
# 80% / 10% / 10% train / dev / test split over the shuffled rows.
n_rows = len(shuffled_df)
dev_part = int(.8 * n_rows)    # first row position of the dev slice
test_part = int(.9 * n_rows)   # first row position of the test slice
train_df, dev_df, test_df = (
    shuffled_df.iloc[start:stop].reset_index(drop=True)
    for start, stop in ((None, dev_part), (dev_part, test_part), (test_part, None))
)

In [12]:
train_df # stories_xml	boil the milk

Unnamed: 0,source,topic,text,longest_script
0,proscript,buy some juggling balls,"[[go to amazon.com, look around website, locat...","[go to amazon.com, look around website, locate..."
1,wikihow,How to Share Books on Kindle?,"[[Create an Amazon Household., Visit the “Mana...","[Create an Amazon Household., Visit the “Manag..."
2,proscript,drop bags on ground near car,"[[take bags outside, look at car, point body a...","[take bags outside, look at car, point body at..."
3,stories_xml,give a medicine,"[[go to the place where medicine is kept ., id...","[read instructions ., read instructions ., ope..."
4,wikihow,How to Escape the Small Town Blues?,"[[Start a new hobby., Register for a class., T...","[Start a new hobby., Register for a class., Ta..."
...,...,...,...,...
3021,proscript,dress as endorman for halloween,"[[decided to dress as endorman for halloween, ...","[decided to dress as endorman for halloween, G..."
3022,wikihow,How to Be Like a Prince?,"[[Have good manners., Get the look down pat., ...","[Have good manners., Get the look down pat., M..."
3023,proscript,get dressed in nice clothes,"[[take a shower, step out of the shower., dry ...","[take a shower, step out of the shower., dry o..."
3024,wikihow,How to Make Skinny Jeans?,[[Put on your jeans to get an idea of the shap...,[Put on your jeans to get an idea of the shape...


In [13]:
dev_df # 0	proscript	Put pie in the oven.

Unnamed: 0,source,topic,text,longest_script
0,wikihow,How to Patch Test Skin?,"[[Understand product skin testing., Put a smal...","[Understand product skin testing., Put a small..."
1,proscript,drink a soda,"[[decided to drink a soda, walk to the refrige...","[decided to drink a soda, walk to the refriger..."
2,wikihow,How to Email Inmates in Prison?,[[Make sure the inmate has added you on their ...,[Make sure the inmate has added you on their C...
3,proscript,learn how to dance in college,"[[decided to learn how to dance in college, re...","[decided to learn how to dance in college, rea..."
4,wikihow,How to Check a Child's Hair for Lice?,"[[Ask around., Look for signs and symptoms., D...","[Ask around., Look for signs and symptoms., Do..."
...,...,...,...,...
373,proscript,bake a dozen chocolate chip cookies,[[decided to bake a dozen chocolate chip cooki...,[decided to bake a dozen chocolate chip cookie...
374,proscript,travel to an exotic country,"[[decided to travel to an exotic country, go t...","[decided to travel to an exotic country, go to..."
375,wikihow,How to Hide Money from Your Siblings and Parents?,"[[Do not cause a fire hazard., Do not use tigh...","[Do not cause a fire hazard., Do not use tight..."
376,proscript,start cleaning up room,"[[decide to clean room first, move right foot ...","[decide to clean room first, move right foot t..."


In [14]:
test_df # 0	proscript	Make sure to have supplies

Unnamed: 0,source,topic,text,longest_script
0,proscript,throw a big party,"[[decided to throw a big party, send out invit...","[decided to throw a big party, send out invite..."
1,proscript,Make the food.,"[[Preheat the oven., take vegetables out of re...","[Preheat the oven., take vegetables out of ref..."
2,proscript,Sleep in bed.,"[[Get in bed., Put covers over body, Close the...","[Get in bed., Put covers over body, Close the ..."
3,wikihow,How to Clean a Litter Box?,"[[Set up a trash can close by., Wear disposabl...","[Empty out the old litter., Scrub the empty li..."
4,stories_xml,replace a refrigerator filter,"[[find new filter ., remove old filter ., remo...","[move over to the fridge ., turn the fridge of..."
...,...,...,...,...
374,wikihow,How to Save an Attachment to Your Computer?,"[[Log into your e-mail account., Go to your in...","[Log into your e-mail account., Go to your inb..."
375,wikihow,How to Cover up a Fart in High School Settings?,"[[Pretend the fart was your shoe or chair., Pr...","[Pretend the fart was your shoe or chair., Pre..."
376,proscript,get the tiller,"[[kill any weeds, look for tiller, find the ti...","[kill any weeds, look for tiller, find the til..."
377,wikihow,How to Prevent Stains on Dentures?,"[[Purchase a denture cleaning product., Follow...","[Mix vinegar and water., Soak your dentures in..."


In [15]:
def sample_neg_topic(idx_list, value_to_exclude, sample_size):
    """Sample `sample_size` entries from `idx_list` without replacement, skipping `value_to_exclude`.

    Every occurrence of `value_to_exclude` is dropped before sampling. The draw
    is delegated to `random.sample`, which raises ValueError when fewer than
    `sample_size` candidates remain.
    """
    candidates = []
    for candidate in idx_list:
        if candidate != value_to_exclude:
            candidates.append(candidate)
    return random.sample(candidates, sample_size)

In [16]:
# def sample_neg_action(action_list):
#     return random.choice(action_list)

## Data split method:
Build `InputExample` pairs for each topic:
1. Flatten all the text in a topic.
2. Repeat the topic alongside each of its script steps.
3. Draw negatives from the remaining topics.

In [17]:
def flatten_list(nestedlist):
    """Concatenate the items of every sub-sequence in `nestedlist` into one flat list (one level deep)."""
    flat = []
    for sublist in nestedlist:
        flat.extend(sublist)
    return flat

In [18]:
# Label convention used by the example builders in this notebook (note the inversion):
#   0 -> the step is paired with the topic of its own script (positive pair)
#   1 -> the step is paired with a different topic (negative pair)
# NOTE(review): binary-classification tooling usually treats 1 as the positive
# class; confirm that evaluator metrics are read with this inversion in mind.
pos_label_digit = 0
neg_label_digit = 1

In [19]:
def create_InputExample_for_topic(df, idx, neg_num = 10):
    """Build shuffled training pairs for the topic stored at row position `idx`.

    Every step of the topic's `longest_script` becomes a positive pair
    ([topic, step], label `pos_label_digit`). For the negatives, `neg_num` other
    rows are sampled and one random step from each of their `longest_script`
    lists is paired with the same topic (label `neg_label_digit`).

    Args:
        df: DataFrame with `topic` and `longest_script` columns.
        idx: Zero-based row position of the topic inside `df`.
        neg_num: Number of negative pairs to generate.

    Returns:
        A list of InputExample objects with positives and negatives shuffled together.

    Raises:
        ValueError: propagated from `random.sample` when `df` has fewer than
            `neg_num` rows besides `idx`.
    """
    all_idx_list = list(range(len(df)))
    # The indices above are row positions, so rows are fetched with .iloc. The
    # label-based .loc only agreed with them while df carried a fresh RangeIndex.
    row = df.iloc[idx]
    topic = row['topic']
    pos_script_actions = row['longest_script']
    neg_idx = sample_neg_topic(all_idx_list, idx, sample_size = neg_num)
    # Randomly select one action from each sampled negative topic.
    neg_script_actions = [random.choice(df.iloc[each]['longest_script']) for each in neg_idx]

    pos_inputexamples = [
        InputExample(texts=[topic, each], label=int(pos_label_digit))
        for each in pos_script_actions
    ]
    neg_inputexamples = [
        InputExample(texts=[topic, each], label=int(neg_label_digit))
        for each in neg_script_actions
    ]
    training_inputexamples = pos_inputexamples + neg_inputexamples

    # Mix positives and negatives so they are not grouped by label.
    shuffle(training_inputexamples)

    return training_inputexamples

In [20]:
def create_InputExample_for_topic_test_on_all(df, idx):
    """Pair the topic at row position `idx` with every script step in `df`.

    Steps from the topic's own `longest_script` are labelled `pos_label_digit`;
    every step from every other row's `longest_script` is labelled
    `neg_label_digit`. Positives come first, followed by the negatives in row order.

    Args:
        df: DataFrame with `topic` and `longest_script` columns.
        idx: Zero-based row position of the topic inside `df`.

    Returns:
        A tuple `(inputexamples, pairs, labels)` of three aligned lists:
        InputExample objects, raw `[topic, step]` pairs, and integer labels.
    """
    # Row positions are used throughout, hence .iloc rather than label-based .loc.
    row = df.iloc[idx]
    topic = row['topic']
    pos_script_actions = row['longest_script']
    neg_idx = [x for x in range(len(df)) if x != idx]
    # All actions from all negative topics.
    neg_script_actions = flatten_list(list(df.iloc[neg_idx]['longest_script']))

    # Positive pairs
    pos_inputexamples = [
        InputExample(texts=[topic, each], label=int(pos_label_digit))
        for each in pos_script_actions
    ]
    pos_examples = [[topic, each] for each in pos_script_actions]
    pos_labels = [pos_label_digit] * len(pos_examples)

    # Negative pairs
    neg_inputexamples = [
        InputExample(texts=[topic, each], label=int(neg_label_digit))
        for each in neg_script_actions
    ]
    neg_examples = [[topic, each] for each in neg_script_actions]
    neg_labels = [neg_label_digit] * len(neg_examples)

    all_inputexamples = pos_inputexamples + neg_inputexamples
    all_pairs = pos_examples + neg_examples
    all_labels = pos_labels + neg_labels

    return all_inputexamples, all_pairs, all_labels

In [21]:
create_InputExample_for_topic(all_df, 1)

[<sentence_transformers.readers.InputExample.InputExample at 0x7f29677662b0>,
 <sentence_transformers.readers.InputExample.InputExample at 0x7f2967702b50>,
 <sentence_transformers.readers.InputExample.InputExample at 0x7f2967702b80>,
 <sentence_transformers.readers.InputExample.InputExample at 0x7f2967702a90>,
 <sentence_transformers.readers.InputExample.InputExample at 0x7f2967702f70>,
 <sentence_transformers.readers.InputExample.InputExample at 0x7f2967702160>,
 <sentence_transformers.readers.InputExample.InputExample at 0x7f2967702e80>,
 <sentence_transformers.readers.InputExample.InputExample at 0x7f2967702fd0>,
 <sentence_transformers.readers.InputExample.InputExample at 0x7f2967702310>,
 <sentence_transformers.readers.InputExample.InputExample at 0x7f2967702400>,
 <sentence_transformers.readers.InputExample.InputExample at 0x7f29677023d0>,
 <sentence_transformers.readers.InputExample.InputExample at 0x7f2967702ca0>,
 <sentence_transformers.readers.InputExample.InputExample at 0x7

In [22]:
# Collect the shuffled positive/negative pairs of every training topic.
train_InputExample = []
for topic_pos in range(len(train_df)):
    train_InputExample.extend(create_InputExample_for_topic(train_df, topic_pos))

In [23]:
# Collect the dev pairs; each dev topic gets 20 sampled negatives.
dev_InputExample = []
for topic_pos in range(len(dev_df)):
    dev_InputExample.extend(create_InputExample_for_topic(dev_df, topic_pos, neg_num = 20))

In [24]:
# Build the exhaustive test set: every test topic paired with every step in test_df.
test_inputexamples = []   # InputExample objects for the evaluator
test_examples = []        # raw [topic, step] pairs for model.predict
test_label = []           # integer labels aligned with test_examples
for i in range(len(test_df)):
    # Per-topic results get distinct names. The previous temporaries
    # (`test_example`, `text_label`) differed from the accumulators above by a
    # single character, which made the two easy to confuse.
    topic_inputexamples, topic_examples, topic_labels = create_InputExample_for_topic_test_on_all(test_df, i)
    test_inputexamples += topic_inputexamples
    test_examples += topic_examples
    test_label += topic_labels

In [25]:
test_df

Unnamed: 0,source,topic,text,longest_script
0,proscript,throw a big party,"[[decided to throw a big party, send out invit...","[decided to throw a big party, send out invite..."
1,proscript,Make the food.,"[[Preheat the oven., take vegetables out of re...","[Preheat the oven., take vegetables out of ref..."
2,proscript,Sleep in bed.,"[[Get in bed., Put covers over body, Close the...","[Get in bed., Put covers over body, Close the ..."
3,wikihow,How to Clean a Litter Box?,"[[Set up a trash can close by., Wear disposabl...","[Empty out the old litter., Scrub the empty li..."
4,stories_xml,replace a refrigerator filter,"[[find new filter ., remove old filter ., remo...","[move over to the fridge ., turn the fridge of..."
...,...,...,...,...
374,wikihow,How to Save an Attachment to Your Computer?,"[[Log into your e-mail account., Go to your in...","[Log into your e-mail account., Go to your inb..."
375,wikihow,How to Cover up a Fart in High School Settings?,"[[Pretend the fart was your shoe or chair., Pr...","[Pretend the fart was your shoe or chair., Pre..."
376,proscript,get the tiller,"[[kill any weeds, look for tiller, find the ti...","[kill any weeds, look for tiller, find the til..."
377,wikihow,How to Prevent Stains on Dentures?,"[[Purchase a denture cleaning product., Follow...","[Mix vinegar and water., Soak your dentures in..."


In [26]:
# Gather every step of every test script into one flat list.
all_sent = [step for script in test_df['longest_script'] for step in script]

In [27]:
len(all_sent)

2135

In [28]:
 2163 *  379 

819777

In [29]:
len(test_examples)

809165

In [30]:
# # hard test - with all actions from test set
# test_InputExample = []
# for i in range(len(test_df)):    
#     test_InputExample += create_InputExample_for_topic_hardtest(test_df, i)

In [31]:
len(test_df)

379

In [32]:
len(test_inputexamples)

809165

In [33]:
# all 89839

In [34]:
len(train_InputExample)

47538

In [35]:
len(dev_InputExample)

9756

In [36]:
#Configuration
train_batch_size = 32  # batch size handed to the training DataLoader
# NOTE(review): num_epochs is not read by the visible training cell, which
# passes epochs=3 to model.fit directly — confirm which value is intended.
num_epochs = 5
model_save_path = './save_model/cross_encoder/'  # passed to model.fit as output_path

In [37]:
# Load the pretrained 'cross-encoder/ms-marco-MiniLM-L-6-v2' checkpoint with a single output
# (num_labels=1) on `use_device`; it is fine-tuned below to score [topic, step] pairs.
model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2', num_labels=1, device = use_device)

In [38]:
# # load from trained
# model = CrossEncoder(model_save_path, num_labels=1, device = use_device)

In [39]:
# Wrap train_InputExample (a list of InputExample objects) in a PyTorch DataLoader
# that reshuffles the pairs and batches them by train_batch_size.
train_dataloader = DataLoader(train_InputExample, shuffle=True, batch_size=train_batch_size)


# Evaluator built from the dev pairs; it is handed to model.fit to report dev metrics during training
evaluator = CEBinaryClassificationEvaluator.from_input_examples(dev_InputExample, name='ce-dev')

In [40]:
# Configure the training
# warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) #10% of train data for warm-up
warmup_steps = math.ceil(len(train_dataloader) ) # 1 epoch train data for warm-up
logger.info("Warmup-steps: {}".format(warmup_steps))


# Train the model
# NOTE(review): epochs is hard-coded to 3 here while the configuration cell
# sets num_epochs = 5 — confirm which value is intended.
# NOTE(review): with save_best_model=True the checkpoint is written to
# output_path; presumably `model` in memory keeps the final-epoch weights —
# confirm before comparing test numbers against a reloaded checkpoint.
model.fit(train_dataloader=train_dataloader,
          evaluator=evaluator,
          epochs=3,
          # epochs=num_epochs,
          warmup_steps=warmup_steps,
          save_best_model = True,
          output_path=model_save_path)

2023-06-26 00:41:24 - Warmup-steps: 1486


Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1486 [00:00<?, ?it/s]

2023-06-26 00:42:12 - CEBinaryClassificationEvaluator: Evaluating the model on ce-dev dataset after epoch 0:
2023-06-26 00:42:14 - Accuracy:           90.88	(Threshold: -0.7394)
2023-06-26 00:42:14 - F1:                 94.22	(Threshold: -0.7426)
2023-06-26 00:42:14 - Precision:          92.57
2023-06-26 00:42:14 - Recall:             95.93
2023-06-26 00:42:14 - Average Precision:  98.33

2023-06-26 00:42:14 - Save model to ./save_model/cross_encoder/


Iteration:   0%|          | 0/1486 [00:00<?, ?it/s]

2023-06-26 00:43:00 - CEBinaryClassificationEvaluator: Evaluating the model on ce-dev dataset after epoch 1:
2023-06-26 00:43:02 - Accuracy:           91.70	(Threshold: -0.6806)
2023-06-26 00:43:02 - F1:                 94.70	(Threshold: -0.8134)
2023-06-26 00:43:02 - Precision:          93.47
2023-06-26 00:43:02 - Recall:             95.97
2023-06-26 00:43:02 - Average Precision:  98.75

2023-06-26 00:43:02 - Save model to ./save_model/cross_encoder/


Iteration:   0%|          | 0/1486 [00:00<?, ?it/s]

2023-06-26 00:43:48 - CEBinaryClassificationEvaluator: Evaluating the model on ce-dev dataset after epoch 2:
2023-06-26 00:43:50 - Accuracy:           91.74	(Threshold: -0.9053)
2023-06-26 00:43:50 - F1:                 94.75	(Threshold: -0.9053)
2023-06-26 00:43:50 - Precision:          93.38
2023-06-26 00:43:50 - Recall:             96.15
2023-06-26 00:43:50 - Average Precision:  98.81

2023-06-26 00:43:50 - Save model to ./save_model/cross_encoder/


In [41]:
# Evaluator over the exhaustive test pairs (every test topic paired with every test step).
# NOTE(review): these pairs are overwhelmingly label 1 (mismatched topic/step),
# so read the accuracy/F1 it reports with that class imbalance in mind.
evaluator_test = CEBinaryClassificationEvaluator.from_input_examples(test_inputexamples, name='ce-test')

In [42]:
evaluator_test(model)

2023-06-26 00:43:50 - CEBinaryClassificationEvaluator: Evaluating the model on ce-test dataset:
2023-06-26 00:46:52 - Accuracy:           99.76	(Threshold: -5.3236)
2023-06-26 00:46:52 - F1:                 99.88	(Threshold: -5.3285)
2023-06-26 00:46:52 - Precision:          99.79
2023-06-26 00:46:52 - Recall:             99.97
2023-06-26 00:46:52 - Average Precision:  99.99



0.9998840620094187

In [43]:
from torch import nn
test_result = model.predict(test_examples, convert_to_numpy=True, activation_fct = nn.Sigmoid())

Batches:   0%|          | 0/25287 [00:00<?, ?it/s]

In [44]:
test_result

array([0.00272671, 0.07463657, 0.00469801, ..., 0.03782764, 0.02340425,
       0.92015713], dtype=float32)

In [45]:
# optim_threshold = -5.9335
# optim_threshold = optim_threshold/100
def threshold(digit, cutoff=0.5):
    """Binarize a score: return 1 when `digit` is at least `cutoff`, otherwise 0.

    Args:
        digit: Score to binarize (here, a sigmoid output of the model).
        cutoff: Decision boundary; scores equal to it map to 1. Defaults to 0.5.

    Returns:
        The integer 1 or 0.
    """
    return 1 if digit >= cutoff else 0

In [46]:
test_result_digit = [threshold(each) for each in test_result]

In [47]:
# test_result_digit

In [48]:
from sklearn.metrics import f1_score
f1_score(test_label, test_result_digit, average='macro')

0.508275862550875

In [49]:
from sklearn.metrics import recall_score

# Calculate recall for the class '0'
recall = recall_score(test_label, test_result_digit, pos_label=0)

print(f'Recall for class 0: {recall}')

Recall for class 0: 0.819672131147541


## Select test samples for empirical analysis

In [146]:
# note: 
# pos_label_digit = 0
# neg_label_digit = 1

def threshold(digit, cutoff=0.3):
    """Binarize a score for the empirical analysis: 1 when `digit` >= `cutoff`, otherwise 0.

    This redefinition replaces the earlier `threshold` (cut-off 0.5) for every
    cell executed after it. The cut-off is a parameter so that other values can
    be tried without redefining the function again.

    Args:
        digit: Score to binarize (here, a sigmoid output of the model).
        cutoff: Decision boundary; scores equal to it map to 1. Defaults to 0.3.

    Returns:
        The integer 1 or 0.
    """
    return 1 if digit >= cutoff else 0

In [147]:
len(test_result)

809165

In [228]:
test_inputexample, test_example, text_label = create_InputExample_for_topic_test_on_all(test_df, 1) # idx in testset

In [229]:
test_sample_result = model.predict(test_example, convert_to_numpy=True, activation_fct = nn.Sigmoid())

Batches:   0%|          | 0/67 [00:00<?, ?it/s]

In [230]:
test_sample_result_digit = [threshold(each) for each in test_sample_result]

In [231]:
selected_test_sample = [i for (i, v) in zip(test_example, test_sample_result_digit) if not v]

In [232]:
true_test_sample = [i for (i, v) in zip(test_example, text_label) if not v] 

In [233]:
len(test_example)

2135

In [234]:
len(true_test_sample)

8

In [235]:
true_test_sample

[['Make the food.', 'Preheat the oven.'],
 ['Make the food.', 'take vegetables out of refrigerator'],
 ['Make the food.', 'take meat out of refrigerator'],
 ['Make the food.', 'chop and prep meat and vegetables'],
 ['Make the food.', 'sear meat in pan'],
 ['Make the food.', 'mix vegetables in with seared meat'],
 ['Make the food.', 'throw in the oven'],
 ['Make the food.', 'Make the food.']]

In [236]:
len(selected_test_sample)

179

In [237]:
selected_test_sample

[['Make the food.', 'Preheat the oven.'],
 ['Make the food.', 'take vegetables out of refrigerator'],
 ['Make the food.', 'take meat out of refrigerator'],
 ['Make the food.', 'chop and prep meat and vegetables'],
 ['Make the food.', 'sear meat in pan'],
 ['Make the food.', 'mix vegetables in with seared meat'],
 ['Make the food.', 'throw in the oven'],
 ['Make the food.', 'Make the food.'],
 ['Make the food.', 'add party food to cart'],
 ['Make the food.', 'move over to the fridge .'],
 ['Make the food.', 'turn the fridge off .'],
 ['Make the food.', 'turn on the fridge .'],
 ['Make the food.',
  'To toast your almonds, heat a dry fry pan and add the nuts.'],
 ['Make the food.',
  'Peel & slice your banana into a bowl and top with ice cream.'],
 ['Make the food.', 'Add the sauce and toasted nuts.'],
 ['Make the food.', 'Make a plan for food intake.'],
 ['Make the food.', 'go inside .'],
 ['Make the food.', 'Bake a sweet casserole.'],
 ['Make the food.', 'Make sweet potato pancakes.'],

In [203]:
true_steps = [each[1] for each in true_test_sample]

In [204]:
selected_steps = [each[1] for each in selected_test_sample]

In [205]:
sum(test_sample_result_digit)

2103

In [206]:
# recall

In [207]:
# Recall on the sampled topic: share of the true steps (label 0) that also
# appear among the steps the model predicted as 0.
# NOTE(review): this cell's execution count (207) is lower than those of the
# cells that build selected_test_sample / true_test_sample (231-232), so the
# recorded value may stem from an earlier sample — re-run in order to confirm.
sum(el in selected_steps for el in true_steps)/len(true_test_sample)

1.0