In [2]:
import torch
import torch.nn as nn
from huggingface_hub import PyTorchModelHubMixin
from transformers import AutoModelForSequenceClassification, AutoTokenizer

### Upload model and datasets to the Hugging Face Hub

In [5]:
# Local directory holding the model and tokenizer files that are uploaded to the
# Hugging Face Hub; both are loaded from this path with `from_pretrained`.
model_path = "./models/full/nlu/deberta_v3_large_sample_False"

In [6]:

# Load the sequence-classification model and its tokenizer from the same local
# directory; both are pushed to the same Hub repository further down.
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)



In [7]:
# Repository name on the Hub; the tokenizer upload reuses the same name.
model_name = "deberta-v3-large-peacock-knowledge-linking"
model.push_to_hub(repo_id=model_name)

model.safetensors: 100%|██████████| 1.74G/1.74G [02:28<00:00, 11.7MB/s]


CommitInfo(commit_url='https://huggingface.co/theirislin/deberta-v3-large-peacock-knowledge-linking/commit/1e74e2d4cdd4455d1e89dcf3541df6e47190c2e3', commit_message='Upload DebertaV2ForSequenceClassification', commit_description='', oid='1e74e2d4cdd4455d1e89dcf3541df6e47190c2e3', pr_url=None, pr_revision=None, pr_num=None)

In [8]:
# Upload the tokenizer files to the same Hub repository as the model weights.
tokenizer.push_to_hub(repo_id=model_name)

README.md: 100%|██████████| 5.18k/5.18k [00:00<00:00, 2.77MB/s]
spm.model: 100%|██████████| 2.46M/2.46M [00:00<00:00, 4.45MB/s]


CommitInfo(commit_url='https://huggingface.co/theirislin/deberta-v3-large-peacock-knowledge-linking/commit/c854b1b45b1fb2eb3a442b0b672d584673009a5f', commit_message='Upload tokenizer', commit_description='', oid='c854b1b45b1fb2eb3a442b0b672d584673009a5f', pr_url=None, pr_revision=None, pr_num=None)

In [9]:
from datasets import Dataset, DatasetDict, load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import TrainingArguments, Trainer
from datasets import ClassLabel
import pandas as pd
import numpy as np
from nltk.tokenize import sent_tokenize
import nltk
import evaluate
import time


# Reproducibility: seed the NumPy and PyTorch global RNGs and make cuDNN pick
# deterministic kernels (benchmark mode is switched off for the same reason).
import torch
np.random.seed(0)
torch.manual_seed(0)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [10]:
# Data set config
# Directory with the GPT-annotated PeaCoK files; the train/validation paths and
# the raw annotation files read by the helper functions all live under it.
linking_data_path = "./dimiss_items/data/gpt_annotated_peacok/model_gpt-3.5-turbo-0125"
TRAIN_DATA_PATH = f'{linking_data_path}/merged_head_tail_train_data_35k.json'
VALID_DATA_PATH = f'{linking_data_path}/merged_head_tail_val_data_4k.json'

# Alternative data sets used in earlier experiments (kept for reference):
# TRAIN_DATA_PATH = f'{linking_data_path}/model_label_head_train_df_0229.json'
# VALID_DATA_PATH = f'{linking_data_path}/model_label_head_valid_df_0229.json'
# TRAIN_DATA_PATH = f'{linking_data_path}/model_label_train_0229.json'
# VALID_DATA_PATH = f'{linking_data_path}/model_label_valid.json'
# linking_data_path = "./dimiss_items/data/comFact/"
# TRAIN_DATA_PATH = f'{linking_data_path}/augmented_train_data.json'
# VALID_DATA_PATH = f'{linking_data_path}/augmented_val_data.json'

# Selects how DatasetLoader combines head/tail facts: "relation", "head" or
# "tail"; any other value (e.g. "full") takes the plain head-and-tail branch.
DS_TYPE = "relation"
USE_TAG = True

# Class name <-> class id; the reverse map is derived so the two cannot drift.
LABEL_TO_ID = {"entailment": 0, "not_entailment": 1}
ID_TO_LABEL = {class_id: class_name for class_name, class_id in LABEL_TO_ID.items()}

# Boolean gold_reference -> class id (True gets id 0, the id of "entailment").
COMFACT_LABEL_TO_ID = {True: 0, False: 1}

In [30]:
def get_peacok_relation_label(relation):
    """Build a ``dialog_id`` -> PeaCoK relation lookup table from the raw annotations.

    Reads the head and tail annotation JSON files under ``linking_data_path``,
    inner-joins them on ``dialog_id`` and keeps the head file's relation column
    (``relation_x`` after the merge), renamed to ``peacok_relation``.
    Shapes and the relation value counts are printed as a sanity check.

    Args:
        relation: Unused; kept so existing callers that pass it keep working.

    Returns:
        pd.DataFrame with the columns ``dialog_id`` and ``peacok_relation``.
    """
    # NOTE: the validation file used to be read here as well but was never used;
    # that dead read has been dropped.
    original_head_tagged_df = pd.read_json(f'{linking_data_path}/head_annotation_sample40000_1709689043.json')
    original_tail_tagged_df = pd.read_json(f'{linking_data_path}/tail_annotation_sample40000_1709657696.json')
    print("original_head_tagged_df", original_head_tagged_df.shape)
    print("original_tail_tagged_df", original_tail_tagged_df.shape)

    # Join head and tail annotations of the same dialogue; columns present in
    # both files get the pandas default suffixes (_x = head, _y = tail).
    original_tagged_df = pd.merge(original_head_tagged_df, original_tail_tagged_df, on='dialog_id')
    print("original_tagged_df", original_tagged_df.shape)

    # Keep only the dialogue id and the relation taken from the head file.
    relation_lookup_df = original_tagged_df[['dialog_id', 'relation_x']].rename(
        columns={'relation_x': 'peacok_relation'}
    )
    print("origonal_peacok_relation_df", relation_lookup_df.shape)
    print("all predefined relations", relation_lookup_df['peacok_relation'].value_counts())
    return relation_lookup_df

def get_gpt_tagged_raw_data():
    """Load the raw GPT outputs for head and tail facts, keyed by ``dialog_id``.

    Reads the head and tail annotation JSON files under ``linking_data_path``
    (the same files ``get_peacok_relation_label`` reads), renames each file's
    ``gp_output`` column to a side-specific name and inner-joins the two frames
    on ``dialog_id``. The shape of the result is printed as a sanity check.

    Returns:
        pd.DataFrame with the columns ``dialog_id``, ``gpt_output_head`` and
        ``gpt_output_tail``.
    """
    # Paths are derived from linking_data_path instead of repeating the
    # directory literal, so both helpers always read from the same place.
    head_df = pd.read_json(f'{linking_data_path}/head_annotation_sample40000_1709689043.json')
    # 'gp_output' is the column name looked up in the annotation files; it is
    # renamed per side so the two columns do not collide in the merge.
    head_df = head_df.rename(columns={'gp_output': 'gpt_output_head'})
    tail_df = pd.read_json(f'{linking_data_path}/tail_annotation_sample40000_1709657696.json')
    tail_df = tail_df.rename(columns={'gp_output': 'gpt_output_tail'})

    # Join head and tail annotations of the same dialogue and keep only the
    # id plus the two GPT output columns.
    original_gpt_tagged_df = pd.merge(head_df, tail_df, on='dialog_id')
    original_gpt_tagged_df = original_gpt_tagged_df[['dialog_id', 'gpt_output_head', 'gpt_output_tail']]
    print("original_gpt_tagged_df", original_gpt_tagged_df.shape)
    return original_gpt_tagged_df


# Relation mode needs two module-level lookup tables (read later by
# DatasetLoader) plus the list of relation names as extra special tokens.
if DS_TYPE == "relation":
    origonal_peacok_relation_df = get_peacok_relation_label("relation")

    # Every distinct relation name becomes an additional special token.
    relations = origonal_peacok_relation_df['peacok_relation'].unique()
    relations_special_tokens = dict(additional_special_tokens=relations.tolist())
    print("relations", relations_special_tokens)

    original_gpt_tagged_df = get_gpt_tagged_raw_data()

original_head_tagged_df (39802, 16)
original_tail_tagged_df (39802, 17)
original_tagged_df (39802, 32)
origonal_peacok_relation_df (39802, 2)
all predefined relations peacok_relation
routine_habit                  14825
characteristic                  8808
experience                      6948
routine_habit_relationship      4918
goal_plan                       2459
experience_relationship          964
goal_plan_relationship           485
characteristic_relationship      395
Name: count, dtype: int64
relations {'additional_special_tokens': ['routine_habit', 'experience', 'characteristic', 'routine_habit_relationship', 'goal_plan', 'characteristic_relationship', 'experience_relationship', 'goal_plan_relationship']}
original_gpt_tagged_df (39802, 3)


In [51]:
import re

def replace_phrases(sentence, person_a_tag):
    """Rewrite first-person references in ``sentence`` so they name ``person_a_tag``.

    The sentence is lower-cased first, then these whole-word rewrites are applied
    in order: ``i am`` -> ``<tag> is``, ``i was`` -> ``<tag> was``, ``i`` -> ``<tag>``,
    ``my`` -> ``<tag>'s``. Finally every literal ``person a`` is replaced by the tag.

    Args:
        sentence: Text to rewrite.
        person_a_tag: Replacement text. It is inserted literally, so backslashes
            or group references such as ``\\1`` in the tag are not interpreted
            as regex replacement syntax.

    Returns:
        The lower-cased, rewritten sentence.
    """
    sentence = sentence.lower()
    # Order matters: the two-word patterns must run before the bare "i" rule.
    rewrites = (
        (r"\bi am\b", f"{person_a_tag} is"),
        (r"\bi was\b", f"{person_a_tag} was"),
        (r"\bi\b", person_a_tag),
        (r"\bmy\b", f"{person_a_tag}'s"),
    )
    for pattern, replacement in rewrites:
        # A callable replacement is used verbatim; a plain string would be
        # processed as a template and break on tags containing backslashes.
        sentence = re.sub(pattern, lambda _match, text=replacement: text, sentence, flags=re.IGNORECASE)
    return sentence.replace('person a', person_a_tag)


class DatasetLoader:
    """Load the train/validation JSON files and package them as a ``DatasetDict``.

    ``ds_type`` selects how head and tail facts are combined for files whose
    path contains ``merged_head_tail``: ``"head"``, ``"tail"``, ``"relation"``
    (relation name + head fact + tail fact) or anything else (head fact + tail
    fact). In ``"relation"`` mode the PeaCoK relation and the raw GPT outputs
    are attached to every row by looking up its ``dialog_id``.

    Args:
        train_data_path: Path of the training JSON file.
        valid_data_path: Path of the validation JSON file.
        ds_type: Combination mode, see above.
        sample_size: Optional number of rows to sample from each split when
            calling ``load()``; capped at the split's length. ``None`` keeps all rows.
        relation_df: Optional lookup frame with ``dialog_id`` and
            ``peacok_relation``. Defaults to the module-level
            ``origonal_peacok_relation_df``.
        gpt_tagged_df: Optional lookup frame with ``dialog_id``,
            ``gpt_output_head`` and ``gpt_output_tail``. Defaults to the
            module-level ``original_gpt_tagged_df``.
    """

    def __init__(self, train_data_path: str, valid_data_path: str, ds_type: str, sample_size: int = None,
                 relation_df=None, gpt_tagged_df=None):
        self.train_data_path = train_data_path
        self.valid_data_path = valid_data_path
        self.sample_size = sample_size
        self.pa_tag, self.pb_tag = 'Person A', 'Person B'
        self.ds_type = ds_type
        self.relation_df = relation_df
        self.gpt_tagged_df = gpt_tagged_df

    def transform_df(self, df):
        """Add the integer ``label`` column and reduce ``df`` to the published columns.

        The input frame is not modified; a new frame is returned. The relation
        and GPT output columns in the final selection are only added by
        ``create_pd_dataframe`` in ``"relation"`` mode, so for other modes the
        selection raises ``KeyError`` unless the input already has them.

        Raises:
            AssertionError: if ``head_text``, ``tail_text`` and ``text`` differ.
            KeyError: if a ``gold_reference`` value is not in
                ``COMFACT_LABEL_TO_ID`` or a required column is missing.
        """
        print("transforming df", df.columns)
        # assign() builds a new frame instead of writing into the caller's one.
        df = df.assign(label=df['gold_reference'].apply(lambda x: COMFACT_LABEL_TO_ID[x]))

        assert df['head_text'].equals(df['tail_text']), "head and tail text should be the same"
        assert df['head_text'].equals(df['text']), "head and text should be the same"

        # Clean up the df
        df = df.rename(
            columns={
                'head_text': 'dialog_dict',  # head_text is the same as tail_text, both hold the dialog
                'head_gold_reference': 'head_label',
                'tail_gold_reference': 'tail_label',
            }
        )
        final_columns = [
            'dialog_id', 'dialog_dict', 'head_label', 'head_fact_text', 'gpt_output_head',
            'tail_label', 'tail_fact_text', 'gpt_output_tail', 'peacok_relation', 'label'
        ]
        return df[final_columns]

    def create_pd_dataframe(self, data_path, sample_size=None):
        """Read one JSON file and return the cleaned frame for it.

        Args:
            data_path: Path of the JSON file to read.
            sample_size: Optional number of rows to sample; values larger than
                the frame are capped at its length. Falsy keeps all rows.
        """
        df = pd.read_json(data_path)
        if self.ds_type == "relation":
            # Both lookups come back aligned to df.index, so plain column
            # assignment puts each value on the row with the matching dialog_id.
            df['peacok_relation'] = self.get_relation(df)
            gpt_tagged_df = self.get_gpt_tagged_raw_data(df)
            df['gpt_output_head'] = gpt_tagged_df['gpt_output_head']
            df['gpt_output_tail'] = gpt_tagged_df['gpt_output_tail']
        if 'merged_head_tail' in data_path:
            df = self.modify_merged_head_tail(df)
        if sample_size:
            df = df.sample(min(sample_size, len(df)))
        return self.transform_df(df)

    def create_dataset(self, train_df, valid_df):
        """Wrap the two frames in a ``DatasetDict`` with ``train`` and ``valid`` splits."""
        # preserve_index=False drops the pandas index for every index type, so
        # no '__index_level_0__' column appears (and none has to be removed,
        # which failed for frames with a default RangeIndex).
        return DatasetDict({
            'train': Dataset.from_pandas(train_df, preserve_index=False),
            'valid': Dataset.from_pandas(valid_df, preserve_index=False),
        })

    @staticmethod
    def _align_by_dialog_id(df, lookup_df, columns):
        """Fetch ``columns`` of ``lookup_df`` for each ``dialog_id`` in ``df``, indexed like ``df``."""
        # Keep the first row per dialog_id so the lookup index is unique.
        keyed = lookup_df.drop_duplicates(subset='dialog_id').set_index('dialog_id')[columns]
        aligned = keyed.reindex(df['dialog_id'].to_numpy())  # unknown ids become NaN
        aligned.index = df.index
        return aligned

    def get_relation(self, df):
        """Return the ``peacok_relation`` of every row of ``df`` as a Series aligned to ``df.index``."""
        lookup_df = self.relation_df if self.relation_df is not None else origonal_peacok_relation_df
        return self._align_by_dialog_id(df, lookup_df, 'peacok_relation')

    def get_gpt_tagged_raw_data(self, df):
        """Return ``gpt_output_head``/``gpt_output_tail`` for every row of ``df``, aligned to ``df.index``."""
        lookup_df = self.gpt_tagged_df if self.gpt_tagged_df is not None else original_gpt_tagged_df
        return self._align_by_dialog_id(df, lookup_df, ['gpt_output_head', 'gpt_output_tail'])

    def modify_merged_head_tail(self, df):
        """Derive ``text``, ``gold_reference`` and ``fact_text`` according to ``ds_type``.

        The columns are written into ``df`` itself, which is also returned.
        """
        if self.ds_type in ("head", "tail"):
            # Single-sided modes copy that side's columns unchanged.
            side = self.ds_type
            df['text'] = df[f'{side}_text']
            df['gold_reference'] = df[f'{side}_gold_reference']
            df['fact_text'] = df[f'{side}_fact_text']
            return df

        # Combined modes: the pair is positive only if both sides are positive.
        assert df['head_text'].equals(df['tail_text']), "head and tail text should be the same"
        df['text'] = df['tail_text']
        df['gold_reference'] = df['head_gold_reference'] & df['tail_gold_reference']
        if self.ds_type == "relation":
            df['fact_text'] = df.apply(lambda x: f"{x['peacok_relation']} {x['head_fact_text']} and {x['tail_fact_text']}", axis=1)
        else:
            df['fact_text'] = df.apply(lambda x: f"{x['head_fact_text']} and {x['tail_fact_text']}", axis=1)
        return df

    def load(self):
        """Read both splits (sampled to ``sample_size`` if set) and return the ``DatasetDict``."""
        train_df = self.create_pd_dataframe(self.train_data_path, self.sample_size)
        valid_df = self.create_pd_dataframe(self.valid_data_path, self.sample_size)
        return self.create_dataset(train_df, valid_df)


In [52]:
# Build the loader from the config constants and read both splits.
dataset_loader = DatasetLoader(
    train_data_path=TRAIN_DATA_PATH,
    valid_data_path=VALID_DATA_PATH,
    ds_type=DS_TYPE,
)
dataset = dataset_loader.load()

# Show the resulting splits, columns and row counts.
dataset

transforming df Index(['dialog_id', 'head_old_label', 'head_gold_reference', 'head_fact_text',
       'head_text', 'head_relation', 'tail_old_label', 'tail_gold_reference',
       'tail_fact_text', 'tail_text', 'tail_relation', 'peacok_relation',
       'gpt_output_head', 'gpt_output_tail', 'text', 'gold_reference',
       'fact_text'],
      dtype='object')
transforming df Index(['dialog_id', 'head_old_label', 'head_gold_reference', 'head_fact_text',
       'head_text', 'head_relation', 'tail_old_label', 'tail_gold_reference',
       'tail_fact_text', 'tail_text', 'tail_relation', 'peacok_relation',
       'gpt_output_head', 'gpt_output_tail', 'text', 'gold_reference',
       'fact_text'],
      dtype='object')


DatasetDict({
    train: Dataset({
        features: ['dialog_id', 'dialog_dict', 'head_label', 'head_fact_text', 'gpt_output_head', 'tail_label', 'tail_fact_text', 'gpt_output_tail', 'peacok_relation', 'label'],
        num_rows: 35821
    })
    valid: Dataset({
        features: ['dialog_id', 'dialog_dict', 'head_label', 'head_fact_text', 'gpt_output_head', 'tail_label', 'tail_fact_text', 'gpt_output_tail', 'peacok_relation', 'label'],
        num_rows: 3981
    })
})

In [53]:
# Repository name of the dataset on the Hub; both splits are uploaded to it.
my_dataset_name = "synthetic_convai2_peacok_knowledge_linking"
dataset.push_to_hub(repo_id=my_dataset_name)

Creating parquet from Arrow format: 100%|██████████| 36/36 [00:00<00:00, 288.33ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.75s/it]
Creating parquet from Arrow format: 100%|██████████| 4/4 [00:00<00:00, 374.01ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.50it/s]


CommitInfo(commit_url='https://huggingface.co/datasets/theirislin/synthetic_convai2_peacok_knowledge_linking/commit/86b687d7bc312cec15467b55f6236caf8fe742ca', commit_message='Upload dataset', commit_description='', oid='86b687d7bc312cec15467b55f6236caf8fe742ca', pr_url=None, pr_revision=None, pr_num=None)