In [1]:
import pandas as pd
import numpy as np
import json

from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import Dataset
import torch
from transformers import TrainingArguments, Trainer
torch.manual_seed(0)
np.random.seed(0)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

  from .autonotebook import tqdm as notebook_tqdm


### Create CPKL Dataset
Use GPT-3.5 tagged raw data and prepare a dataset that can be used for training and validation

In [2]:
gpt_tagged_raw = './dimiss_items/data/model_gpt-3.5-turbo-0125/raw'

In [3]:
head_annt_4k_data = pd.read_json(f'{gpt_tagged_raw}/head_annotation_sample40000_1709689043.json')
print(f"GPT label head training data: {len(head_annt_4k_data)}")

head_annt_4k_data.columns

GPT label head training data: 39802


Index(['dialog_id', 'ut-2', 'ut-1', 'ut', 'ut+1', 'ut+2', 'head', 'relation',
       'tail', 'gold_reference', 'conv', 'head_entail_prob',
       'tail_entail_prob', 'prompt', 'entails', 'gp_output'],
      dtype='object')

In [4]:
tail_annt_4k_data = pd.read_json(f'{gpt_tagged_raw}/tail_annotation_sample40000_1709657696.json')
print(f"GPT label tail training data: {len(tail_annt_4k_data)}")

tail_annt_4k_data.columns

GPT label tail training data: 39802


Index(['dialog_id', 'ut-2', 'ut-1', 'ut', 'ut+1', 'ut+2', 'head', 'relation',
       'tail', 'gold_reference', 'conv', 'head_entail_prob',
       'tail_entail_prob', 'prompt', 'entails', 'action', 'gp_output'],
      dtype='object')

In [5]:
def _create_utter_dict(type, utter):
    return {"type": type, "utter": utter}

def _create_text(row):
    return [
        _create_utter_dict("ut-2", row["ut-2"]),
        _create_utter_dict("ut-1", row["ut-1"]),
        _create_utter_dict("ut+1", row["ut+1"]),
        _create_utter_dict("ut+2", row["ut+2"]),
        _create_utter_dict("ut", row["ut"])
    ]

def preprocess_gpt_tagged_raw_df(df, ds_type):
    df['text'] = df.apply(lambda row: _create_text(row), axis=1)
    # df.drop(columns=['ut-2', 'ut-1', 'ut+1', 'ut+2', 'ut'], inplace=True)
    df['old_label'] = df['gold_reference']
    df['fact_text'] = df['tail'] if ds_type == 'tail' else df['head']
    df['gold_reference'] = df['entails'].apply(lambda x: x == 1)
    # df.drop(columns=['entails'], inplace=True)
    df.rename(columns={'gp_output': 'gpt_output'}, inplace=True)

    same_columns = ['dialog_id', 'relation', 'head', 'tail', 'text'] # , 'head_entail_prob', 'tail_entail_prob'
    diff_columns = ['old_label', 'gold_reference', 'fact_text', 'gpt_output']
    if ds_type == 'tail':
        df['action'] = df.apply(lambda row: row['action'] if row['action'].lower() != 'none' else None, axis=1)
        df = df[same_columns + diff_columns + ['action']]
        df.columns = [f'gpt_tagged_{ds_type}_{col}' if col != 'dialog_id' else col for col in df.columns]
    else:
        df = df[same_columns + diff_columns]
        df.columns = [f'gpt_tagged_{ds_type}_{col}' if col != 'dialog_id' else col for col in df.columns]

    # rename column using ds_type for all columns aside dialog_id
    
    return df

gpt_annt_head_4k_norm_data = preprocess_gpt_tagged_raw_df(head_annt_4k_data, 'head')
# gpt_annt_head_4k_norm_data
gpt_annt_tail_4k_norm_data = preprocess_gpt_tagged_raw_df(tail_annt_4k_data, 'tail')
gpt_annt_tail_4k_norm_data

Unnamed: 0,dialog_id,gpt_tagged_tail_relation,gpt_tagged_tail_head,gpt_tagged_tail_tail,gpt_tagged_tail_text,gpt_tagged_tail_old_label,gpt_tagged_tail_gold_reference,gpt_tagged_tail_fact_text,gpt_tagged_tail_gpt_output,gpt_tagged_tail_action
0,persona1_pos_0,routine_habit,Person A is a builder,builds houses,"[{'type': 'ut-2', 'utter': 'i am! for my hobby...",True,True,builds houses,"Step-by-step reasoning:\nThe fact ""builds hous...",routine
1,persona1_neg_2,experience,Person A is a performer,joined a theater group,"[{'type': 'ut-2', 'utter': 'we all live in a y...",False,False,joined a theater group,Step-by-step reasoning:\n1. The fact given is ...,
2,persona2_pos_3,experience,Person A is a wordsmith,won a spelling bee in elementary school,"[{'type': 'ut-2', 'utter': 'hi! i work as a go...",True,False,won a spelling bee in elementary school,"Step-by-step reasoning:\n1. The fact ""won a sp...",
3,persona2_neg_3,experience,Person A is a wordsmith,won a spelling bee in elementary school,"[{'type': 'ut-2', 'utter': 'really. but, i can...",False,False,won a spelling bee in elementary school,"Step-by-step reasoning:\n- The fact ""won a spe...",
4,persona1_pos_4,experience,Person A is a guitar player,took guitar lessons for years,"[{'type': 'ut-2', 'utter': 'i like to watch ki...",True,True,took guitar lessons for years,Step-by-step reasoning:\n1. Person A mentions ...,trait
...,...,...,...,...,...,...,...,...,...,...
39797,persona1_pos_6652,characteristic,Person A is a tourist,i enjoy travelling,"[{'type': 'ut-2', 'utter': 'rhode island! i go...",True,True,i enjoy travelling,"Step-by-step reasoning:\n- The fact ""i enjoy t...",trait
39798,persona2_neg_1852,routine_habit,Person A is an alert person,always on the lookout,"[{'type': 'ut-2', 'utter': 'i just keep lookin...",False,True,always on the lookout,"Step-by-step reasoning:\n- The fact ""always on...",trait
39799,persona1_neg_1881,experience,Person A is a sad person,experienced a lot of loss in my life,"[{'type': 'ut-2', 'utter': 'hi how are you doi...",False,False,experienced a lot of loss in my life,"Step-by-step reasoning:\n- The fact ""experienc...",
39800,persona2_neg_4394,routine_habit,Person A is a celebrity,signs autographs,"[{'type': 'ut-2', 'utter': 'sorry but at times...",False,False,signs autographs,"Step-by-step reasoning:\n1. The fact ""signs au...",


In [6]:
# pd.merge(gpt_annt_head_4k_norm_data, gpt_annt_tail_4k_norm_data, on=['dialog_id','relation', 'head', 'tail', 'text'])

In [7]:
# inner join the two dataframes on dialog_id, 
merged_head_tail_gpt_tagged_data = pd.merge(gpt_annt_head_4k_norm_data, gpt_annt_tail_4k_norm_data, on='dialog_id')
merged_head_tail_gpt_tagged_data.shape

(39802, 18)

In [15]:
merged_head_tail_gpt_tagged_data.columns

Index(['dialog_id', 'gpt_tagged_head_relation', 'gpt_tagged_head_head',
       'gpt_tagged_head_tail', 'gpt_tagged_head_text',
       'gpt_tagged_head_old_label', 'gpt_tagged_head_gold_reference',
       'gpt_tagged_head_fact_text', 'gpt_tagged_head_gpt_output',
       'gpt_tagged_tail_relation', 'gpt_tagged_tail_head',
       'gpt_tagged_tail_tail', 'gpt_tagged_tail_text',
       'gpt_tagged_tail_old_label', 'gpt_tagged_tail_gold_reference',
       'gpt_tagged_tail_fact_text', 'gpt_tagged_tail_gpt_output',
       'gpt_tagged_tail_action'],
      dtype='object')

In [12]:
# sanity check all same name column have same values
for col in ['relation', 'head', 'tail', 'text']: # , 'head_entail_prob', 'tail_entail_prob'
    assert (merged_head_tail_gpt_tagged_data[f'gpt_tagged_head_{col}'] == merged_head_tail_gpt_tagged_data[f'gpt_tagged_tail_{col}']).all(), f"Column {col} mismatch"
#'fact_text_x', 'fact_text_y', 'gpt_output', 


Shared columns: ['relation', 'head', 'tail', 'text']
Head columns: ['gpt_tagged_head_old_label', 'gpt_tagged_head_gpt_output', 'gpt_tagged_head_fact_text', 'gpt_tagged_head_gold_reference']
Tail columns: ['gpt_tagged_tail_gpt_output', 'gpt_tagged_tail_old_label', 'gpt_tagged_tail_action', 'gpt_tagged_tail_fact_text', 'gpt_tagged_tail_gold_reference']


In [19]:
# Previous cell assert that shared columns have same values
# Remove duplicate columns and rename columns

# clean up shared columns and rename columns
shared_cols = ['relation', 'head', 'tail', 'text']
all_columns = list(merged_head_tail_gpt_tagged_data.columns)
# head_columns = [col for col in all_columns if col.startswith('gpt_tagged_head_')]
head_columns = list(set([col for col in all_columns if col.startswith('gpt_tagged_head_')]) - set([f'gpt_tagged_head_{col}' for col in shared_cols]))
tail_columns = list(set([col for col in all_columns if col.startswith('gpt_tagged_tail_')]) - set([f'gpt_tagged_tail_{col}' for col in shared_cols]))

print(f"Shared columns: {shared_cols}")
print(f"Head columns: {head_columns}")
print(f"Tail columns: {tail_columns}")

# select only the shared columns and rename the columns

rename_dict = {f'gpt_tagged_head_{col}': col for col in shared_cols}
new_columns = ['dialog_id'] + shared_cols + head_columns + tail_columns

gpt_tagged_head_tail_dedup_data = merged_head_tail_gpt_tagged_data.rename(columns=rename_dict)[new_columns]

gpt_tagged_head_tail_dedup_data

Shared columns: ['relation', 'head', 'tail', 'text']
Head columns: ['gpt_tagged_head_old_label', 'gpt_tagged_head_gpt_output', 'gpt_tagged_head_fact_text', 'gpt_tagged_head_gold_reference']
Tail columns: ['gpt_tagged_tail_gpt_output', 'gpt_tagged_tail_old_label', 'gpt_tagged_tail_action', 'gpt_tagged_tail_fact_text', 'gpt_tagged_tail_gold_reference']


Unnamed: 0,dialog_id,relation,head,tail,text,gpt_tagged_head_old_label,gpt_tagged_head_gpt_output,gpt_tagged_head_fact_text,gpt_tagged_head_gold_reference,gpt_tagged_tail_gpt_output,gpt_tagged_tail_old_label,gpt_tagged_tail_action,gpt_tagged_tail_fact_text,gpt_tagged_tail_gold_reference
0,persona1_pos_0,routine_habit,Person A is a builder,builds houses,"[{'type': 'ut-2', 'utter': 'i am! for my hobby...",True,Step-by-step reasoning:\nPerson A mentions rem...,Person A is a builder,True,"Step-by-step reasoning:\nThe fact ""builds hous...",True,routine,builds houses,True
1,persona1_neg_2,experience,Person A is a performer,joined a theater group,"[{'type': 'ut-2', 'utter': 'we all live in a y...",False,Step-by-step reasoning:\n1. Person A mentions ...,Person A is a performer,True,Step-by-step reasoning:\n1. The fact given is ...,False,,joined a theater group,False
2,persona2_pos_3,experience,Person A is a wordsmith,won a spelling bee in elementary school,"[{'type': 'ut-2', 'utter': 'hi! i work as a go...",True,Step-by-step reasoning:\n- Person A mentions w...,Person A is a wordsmith,True,"Step-by-step reasoning:\n1. The fact ""won a sp...",True,,won a spelling bee in elementary school,False
3,persona2_neg_3,experience,Person A is a wordsmith,won a spelling bee in elementary school,"[{'type': 'ut-2', 'utter': 'really. but, i can...",False,"Step-by-step reasoning:\nIn the dialogue, Pers...",Person A is a wordsmith,False,"Step-by-step reasoning:\n- The fact ""won a spe...",False,,won a spelling bee in elementary school,False
4,persona1_pos_4,experience,Person A is a guitar player,took guitar lessons for years,"[{'type': 'ut-2', 'utter': 'i like to watch ki...",True,Step-by-step reasoning:\n1. Person A mentioned...,Person A is a guitar player,True,Step-by-step reasoning:\n1. Person A mentions ...,True,trait,took guitar lessons for years,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39797,persona1_pos_6652,characteristic,Person A is a tourist,i enjoy travelling,"[{'type': 'ut-2', 'utter': 'rhode island! i go...",True,"Step-by-step reasoning:\n- In the dialogue, Pe...",Person A is a tourist,True,"Step-by-step reasoning:\n- The fact ""i enjoy t...",True,trait,i enjoy travelling,True
39798,persona2_neg_1852,routine_habit,Person A is an alert person,always on the lookout,"[{'type': 'ut-2', 'utter': 'i just keep lookin...",False,Step-by-step reasoning:\n- Person A mentions t...,Person A is an alert person,True,"Step-by-step reasoning:\n- The fact ""always on...",False,trait,always on the lookout,True
39799,persona1_neg_1881,experience,Person A is a sad person,experienced a lot of loss in my life,"[{'type': 'ut-2', 'utter': 'hi how are you doi...",False,Step-by-step reasoning:\nPerson A does not exp...,Person A is a sad person,False,"Step-by-step reasoning:\n- The fact ""experienc...",False,,experienced a lot of loss in my life,False
39800,persona2_neg_4394,routine_habit,Person A is a celebrity,signs autographs,"[{'type': 'ut-2', 'utter': 'sorry but at times...",False,Step-by-step reasoning:\n- The dialogue does n...,Person A is a celebrity,False,"Step-by-step reasoning:\n1. The fact ""signs au...",False,,signs autographs,False


In [20]:
# split the data into train and test
from sklearn.model_selection import train_test_split

def split_train_test(df):
    train_data, test_data = train_test_split(df, test_size=0.1, random_state=0)
    return train_data, test_data

gpt_label_full_train_df, gpt_label_full_valid_df = split_train_test(gpt_tagged_head_tail_dedup_data)

In [23]:
prev_processed_train = pd.read_json(f'./dimiss_items/data/model_gpt-3.5-turbo-0125/merged_head_tail_train_data_35k.json')
prev_processed_valid = pd.read_json(f'./dimiss_items/data/model_gpt-3.5-turbo-0125/merged_head_tail_val_data_4k.json')

In [24]:
# gpt_label_full_train_df.equals(prev_processed_train)
#gpt_label_full_valid_df.equals(prev_processed_valid)

False

In [26]:
prev_processed_train['dialog_id'].nunique(), prev_processed_valid['dialog_id'].nunique()

Index(['dialog_id', 'head_old_label', 'head_gold_reference', 'head_fact_text',
       'head_text', 'head_relation', 'tail_old_label', 'tail_gold_reference',
       'tail_fact_text', 'tail_text', 'tail_relation'],
      dtype='object')

In [39]:
np.mean(prev_processed_train['dialog_id'].values == gpt_label_full_train_df['dialog_id'].values)

1.0

In [40]:
np.mean(gpt_label_full_valid_df['dialog_id'].values == prev_processed_valid['dialog_id'].values)

1.0

In [None]:
import os
os.makedirs('./dimiss_items/data/model_gpt-3.5-turbo-0125/processed', exist_ok=True)

In [44]:
# Save processed training data

# gpt_label_full_train_df.to_json(
#     f'./dimiss_items/data/model_gpt-3.5-turbo-0125/processed/gpt_label_full_train_df.json', 
#     orient="records", 
#     indent=4)

# gpt_label_full_valid_df.to_json(
#     f'./dimiss_items/data/model_gpt-3.5-turbo-0125/processed/gpt_label_full_valid_df.json', 
#     orient="records", 
#     indent=4)

In [45]:
gpt_label_full_train_df.shape, gpt_label_full_valid_df.shape

((35821, 14), (3981, 14))

In [46]:
gpt_label_full_train_df.columns


Index(['dialog_id', 'relation', 'head', 'tail', 'text',
       'gpt_tagged_head_old_label', 'gpt_tagged_head_gpt_output',
       'gpt_tagged_head_fact_text', 'gpt_tagged_head_gold_reference',
       'gpt_tagged_tail_gpt_output', 'gpt_tagged_tail_old_label',
       'gpt_tagged_tail_action', 'gpt_tagged_tail_fact_text',
       'gpt_tagged_tail_gold_reference'],
      dtype='object')