In [1]:
import pandas as pd
import numpy as np
import os

# Work from the parent directory so that the `data_prep` import and the
# relative `data/...` paths used by this notebook resolve.
# The guard keeps this cell idempotent: re-running it must not climb one
# directory further each time.
# NOTE(review): assumes `data_prep` is a directory inside that parent — confirm.
if not os.path.isdir('data_prep'):
    os.chdir(os.path.abspath('..'))

from data_prep.data_utils import load_help

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the HELP train and test splits, then mirror each one as a pandas
# DataFrame for the joins and sampling done further down.
train, test = load_help()
train_df, test_df = (pd.DataFrame(split) for split in (train, test))

2023-04-09 17:14:49.061 | INFO     | data_prep.data_utils:load_help:52 - Index(['sentence1', 'sentence2', 'gold_label', 'gold_label_num'], dtype='object')
2023-04-09 17:14:49.164 | INFO     | data_prep.data_utils:load_help:56 - (35891, 4)
100%|██████████| 32/32 [00:00<00:00, 257.08ba/s]
Casting the dataset: 100%|██████████| 32/32 [00:00<00:00, 644.14ba/s]
100%|██████████| 5/5 [00:00<00:00, 835.95ba/s]
Casting the dataset: 100%|██████████| 5/5 [00:00<00:00, 824.45ba/s]


In [3]:
# Show the schema of the training split (column names and their types).
train.features

{'premise': Value(dtype='string', id=None),
 'hypothesis': Value(dtype='string', id=None),
 'gold_label': Value(dtype='string', id=None),
 '__index_level_0__': Value(dtype='int64', id=None),
 'label': ClassLabel(names=['entailment', 'neutral'], id=None)}

In [4]:
# Read the original HELP file for its extra labels, and expose its sentence
# pair under `premise` / `hypothesis` so it can be joined on those columns.
help_original = pd.read_csv('data/HELP/HELP_original.tsv', sep='\t', index_col=0)
help_original = help_original.assign(
    premise=help_original['ori_sentence'],
    hypothesis=help_original['new_sentence'],
)


In [5]:
# Attach the original HELP labels to the test split. The left join records in
# `Exist` whether a (premise, hypothesis) pair was found in the original file;
# only the pairs that were found are kept.
test_df = (
    test_df
    .merge(help_original, how='left', on=['premise', 'hypothesis'], indicator='Exist')
    .assign(Exist=lambda merged: np.where(merged['Exist'] == 'both', True, False))
    .loc[lambda merged: merged['Exist']]
)


In [6]:

# Attach the original HELP labels to the train split. The left join records in
# `Exist` whether a (premise, hypothesis) pair was found in the original file;
# only the pairs that were found are kept.
train_df = (
    train_df
    .merge(help_original, how='left', on=['premise', 'hypothesis'], indicator='Exist')
    .assign(Exist=lambda merged: np.where(merged['Exist'] == 'both', True, False))
    .loc[lambda merged: merged['Exist']]
)

In [7]:
# Number of training examples per monotonicity class after the join.
train_df['monotonicity'].value_counts()

downward_monotone    17909
upward_monotone       7046
conjunction           6044
non_monotone           992
disjunction            342
Name: monotonicity, dtype: int64

In [8]:
def extract_context(row):
    """Mask the replaced phrase of a HELP premise with 'x' to obtain its context.

    The annotation is not consistent about which of `replace_source` and
    `replace_target` actually occurs in the premise, so `replace_source` is
    tried first and `replace_target` second; the first phrase whose
    replacement changes the premise wins.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing the keys
            'premise', 'replace_source' and 'replace_target'.

    Returns:
        The premise with every occurrence of the matched phrase replaced by
        'x', or None when neither phrase occurs in the premise. In the None
        case the example is printed so ignored rows stay visible.

    Raises:
        KeyError: if one of the three keys is missing from `row`.
    """
    premise = row['premise']
    replace_source = row['replace_source']
    replace_target = row['replace_target']

    # Missing values arrive as non-strings (e.g. NaN); check types explicitly
    # instead of hiding every possible error behind a bare `except`.
    if isinstance(premise, str):
        for phrase in (replace_source, replace_target):
            # An empty phrase would make str.replace insert 'x' between every
            # character of the premise, which is not a usable context.
            if not isinstance(phrase, str) or not phrase:
                continue
            context = premise.replace(phrase, 'x')
            if context != premise:
                return context

    print('Logging an ignored example:')
    print(premise)
    print(replace_source)
    print(replace_target)
    return None


In [9]:
def sample_and_extract_contexts(df, split_name, sample_size, seed):
    """Sample downward and upward HELP examples, extract contexts, write TSVs.

    Draws `sample_size` rows per monotonicity class from `df`, converts each
    row to a context with `extract_context`, and writes three tab-separated
    files into data/HELP_Contexts/:

    * help_contexts_{split_name}.tsv: the combined down/up contexts
    * {split_name}_sample_for_upward_contexts.tsv: the upward sample
    * {split_name}_sample_for_downward_contexts.tsv: the downward sample

    Args:
        df: DataFrame with a 'monotonicity' column plus the columns read by
            `extract_context`.
        split_name: Label inserted into the output file names.
        sample_size: Number of rows drawn for each monotonicity class.
        seed: Value passed as `random_state` to both draws.

    Returns:
        None. The results are only written to disk.

    Raises:
        ValueError: raised by `DataFrame.sample` when a class has fewer than
            `sample_size` rows.
    """
    output_dir = 'data/HELP_Contexts'
    # Create the target folder up front so a fresh checkout does not fail on
    # the first write.
    os.makedirs(output_dir, exist_ok=True)

    sample_for_downward_contexts = df.loc[df.monotonicity == 'downward_monotone'].sample(sample_size, random_state=seed)
    # NOTE(review): only 'context' and 'monotonicity' are filled below; the
    # other columns declared here are written out empty. Confirm whether they
    # were meant to be copied from the sample.
    downward_contexts = pd.DataFrame(columns=['context', 'monotonicity', 'determiner', 'replace_target', 'replace_source', 'replace_mode', 'label'])
    # Rows for which no context could be extracted come back as None and are dropped.
    downward_contexts['context'] = sample_for_downward_contexts.apply(extract_context, axis=1).dropna()
    downward_contexts['monotonicity'] = 'down'

    sample_for_upward_contexts = df.loc[df.monotonicity == 'upward_monotone'].sample(sample_size, random_state=seed)
    # NOTE(review): only the upward sample drops rows containing missing
    # values before extraction; confirm the asymmetry is intended.
    sample_for_upward_contexts = sample_for_upward_contexts.dropna()
    upward_contexts = pd.DataFrame(columns=['context', 'monotonicity'])
    upward_contexts['context'] = sample_for_upward_contexts.apply(extract_context, axis=1).dropna()
    upward_contexts['monotonicity'] = 'up'

    # combine full contexts split
    help_contexts = pd.concat([downward_contexts, upward_contexts])

    # Let pandas write the files itself rather than passing the CSV text
    # through a text-mode handle, so newline and encoding handling do not
    # depend on the platform defaults of open().
    help_contexts.to_csv(f'{output_dir}/help_contexts_{split_name}.tsv', sep='\t')
    sample_for_upward_contexts.to_csv(f'{output_dir}/{split_name}_sample_for_upward_contexts.tsv', sep='\t')
    sample_for_downward_contexts.to_csv(f'{output_dir}/{split_name}_sample_for_downward_contexts.tsv', sep='\t')


In [11]:
# Build the context files for both splits with one shared seed:
# 500 examples per class from train, 300 per class from test.
seed = 11
sample_and_extract_contexts(df=train_df, split_name='train', sample_size=500, seed=seed)
sample_and_extract_contexts(df=test_df, split_name='test', sample_size=300, seed=seed)

Logging an ignored example:
Tom did n't reply to any of Mary 's letter .
any letter
any of Mary s letter
Logging an ignored example:
There is no need to reply to that x .
x
letter
