In [710]:
import pandas as pd
import numpy as np
from random import sample, shuffle, seed
from sklearn.model_selection import train_test_split
import json
import copy
import re

## Persona Chat data strip out of personality, create persona tags

In [816]:
# load data
# data : https://s3.amazonaws.com/datasets.huggingface.co/personachat/personachat_self_original.json
with open('./personachat_self_original.json') as f:
    persona = json.loads(f.read())

In [817]:
# split data into train and valid
train = persona['train']
valid = persona['valid']

In [1422]:
len(train)

17878

In [1423]:
len(valid)

1000

In [819]:
# combine data to have a list of all dialog
full_data = train + valid

In [820]:
# grab only the personality in each dialogue
personalities = [full_data[i]['personality'] for i in range(len(full_data))]

In [821]:
# goes through all the personality descriptions to clean and sort
for i in range(len(personalities)):
    for k in range(len(personalities[i])):
        personalities[i][k] = personalities[i][k].replace("'", ' ').replace(' m ', ' am ').replace(' ve ', ' have ').replace(' ll ', ' will ')
    personalities[i].sort()

In [823]:
# put all the personality in set so that we can easily find if a description of persona is a subset of another
set_personalities = [set(personalities[i]) for i in range(len(full_data))]

# get the personas that have 5 descriptions
five_persona = [set_personalities[i] for i in range(len(set_personalities)) if len(set_personalities[i]) > 4]
# get the personas that have 4 descriptions
four_persona = [set_personalities[i] for i in range(len(set_personalities)) if len(set_personalities[i]) == 4]

# finding the unique personas for 4 and 5 description list personas
b_set = set(tuple(x) for x in five_persona)
five_persona_unique = [ list(x) for x in b_set ]

c_set = set(tuple(x) for x in four_persona)
four_persona_unique = [ list(x) for x in c_set ]

In [824]:
# sort the description personas
for four_lines in four_persona_unique:
    four_lines.sort()
    
for five_lines in five_persona_unique:
    five_lines.sort()

# get the unique persona desciptions in sets rather than list
five_persona_set_unique = [set(persona_set) for persona_set in five_persona_unique]

four_persona_set_unique = [set(persona_set) for persona_set in four_persona_unique]

In [825]:
len(four_persona_unique)

4762

In [826]:
len(five_persona_unique)

1350

In [827]:
# goes through all the personas and sees if it is a subset of a larger persona description
# if it is it adds the index with the corresponding index and persona description len list in a tuple
# this is to map the index to the correct larger persona description
combinations = []
for i in range(len(set_personalities)):
    for k in range(len(five_persona_set_unique)):
        if set_personalities[i].issubset(five_persona_set_unique[k]):
            combinations.append((i,k,5))
            break

    if len(combinations) < i:
        for j in range(len(four_persona_unique)):
            if set_personalities[i].issubset(four_persona_set_unique[j]):
                combinations.append((i,j,4))  
                break
            else:
                continue

In [828]:
len(combinations)

18877

In [829]:
len(personalities)
# missing one index in combinations

18878

In [1381]:
combinations[20:25]
# skips 22

[(20, 420, 5), (21, 497, 5), (23, 1240, 5), (24, 361, 5), (25, 113, 5)]

In [831]:
# easy to find larger persona description from persona tag number 
persona_dict_numbers = {}
for i in range(len(five_persona_unique)):
    persona_dict_numbers[i] = five_persona_unique[i]
for k in range(len(four_persona_unique)):
    persona_dict_numbers[k+len(five_persona_unique)] = four_persona_unique[k]

In [1436]:
# example of consolidated persona description
# <persona15>
persona_dict_numbers.get(15)

['i am married .',
 'i love to read .',
 'my favorite place is the beach .',
 'my husband is a cop .',
 'teaching is my passion .']

In [833]:
# create dataframe to loop through to find tag personas
index_df = pd.DataFrame(data = combinations, columns = ['i_persona', 'i_unique', 'n'])

In [834]:
index_df.loc[22,'i_persona']

23

In [835]:
# go through each dialog and find which persona tag and larger persona description it is
# create new lists and add these in it to have the lareger persona descriptions / persona tags in the right index
k = 0
bigger_persona = []
bigger_persona_named = []
for i in range(len(personalities)):
    # checks if the indexes are the same / same data point
    if i==index_df.loc[k, 'i_persona']:
        # get the index of the unique persona, waiting to see if it is 4 or 5 persona description
        index = index_df.loc[k, 'i_unique']
        # if the unique index corresponds to the 5 persona description, add the specific 5 persona into the list
        if index_df.loc[k, 'n'] == 5:
            bigger_persona.append(five_persona_unique[index])
            bigger_persona_named.append(['<persona' + str(index)+ '>'])
            
        # if the unique index corresponds to the 4 persona description, add the specific 4 persona into the list
        elif index_df.loc[k, 'n'] == 4:
            bigger_persona.append(four_persona_unique[index])
            bigger_persona_named.append(['<persona' + str(index+len(five_persona_unique))+ '>'])

        k+=1
    # there is one personality that we hard code
    else:
        bigger_persona.append(personalities[i])
        # hard code this persona
        hard_code_number = len(five_persona_unique) + len(four_persona_unique)
        persona_dict_numbers[hard_code_number] = personalities[i]
        bigger_persona_named.append(['<persona'+ str(hard_code_number) + '>'])
        print(i)
    

22


In [836]:
def split_persona_train_val(full_data, train_len, personality_list_replacement):
    """replace the personality in the full data with the new consolidated personality description/tags
        and break it to train and validation
        returns a dictionary keys train and valid"""
    data = copy.deepcopy(full_data)
    for i in range(len(full_data)):
        # replace personality
        data[i]['personality'] = personality_list_replacement[i]
        
    train_data = data[:train_len]
    valid_data = data[train_len:]

    full_data_changed = {'train': train_data,
                  'valid': valid_data}
    
    return full_data_changed


In [837]:
full_data_named = split_persona_train_val(full_data, len(train), bigger_persona_named)
full_data_bigger = split_persona_train_val(full_data, len(train), bigger_persona)

In [762]:
# with open('personachat_data_named_bigger.json', 'w') as outfile:
#     json.dump(full_data_named, outfile)

In [601]:
# with open('personachat_data_bigger.json', 'w') as outfile:
#     json.dump(full_data_bigger, outfile)

## sample persona chat - training can't run the full data - takes too long or memory runs out

In [928]:
# create a dataframe with all the personas
persona_more = pd.DataFrame(data = bigger_persona_named, columns = ['persona'])
persona_more['count'] = 1

# group the personas to see how many times it appears in the dataset
count_personas = persona_more.groupby('persona').sum().reset_index()

# get a list of the personas that appear more than 4 times in this dataset
more_than_four_dialog_persona = count_personas[count_personas['count'] > 4]['persona'].tolist()

In [955]:
def sample_train_val(persona_tags_to_sample, larger_data_persona_tags, larger_data, num_sample, val_size, random_state):
    """ samples a larger persona data
        returns a dictionary with train and valid keys"""
    
    seed(random_state)
    # samples a number of persona tags
    sample_personas = sample(persona_tags_to_sample, num_sample)
    
    # larger_data_persona_tags = [larger_data[i]['personality'] for i in range(len(larger_data))]
    
    # finds the indices of the larger_data_persoa_tags to see where in the larger_data these personas are at
    # larger_data_persona_tags and larger_data should have the same persona tags in each indices
    # this is so that we can subset the larger_data that includes these personas
    sample_persona_indices = []
    for persona_i in sample_personas:
        indices = [i for i, x in enumerate(larger_data_persona_tags) if x[0] == persona_i]
        sample_persona_indices.extend(indices)
    sample_persona_indices.sort()
    
    #subset of data
    sample_data = [larger_data[i] for i in sample_persona_indices]
    
    #get personality of the subset of data
    sample_data_persona = [data_i['personality'] for data_i in sample_data]
    
    # balanced by personalitiy
    train_sample, valid_sample = train_test_split(sample_data, test_size = val_size, random_state= random_state, stratify = sample_data_persona)
    
    train_valid_sample = {'train': train_sample,
               'valid': valid_sample}
    
    # get the unique persona tags and creates a py file that has a list of the unique persona tags
    with open('./unique_persona_tag_'+str(num_sample)+'.py', mode='w') as file:
        file.write("persona_tags = ['" +  "', '".join(sample_personas) + "']")
    file.close()
    
    return train_valid_sample

In [None]:
full_data_combined = full_data_named['train'] + full_data_named['valid']

In [725]:
# samples 100 unique personas
train_valid_100 = sample_train_val(persona_tags_to_sample = more_than_four_dialog_persona, 
                 larger_data_persona_tags = bigger_persona_named, 
                 larger_data = full_data_combined, 
                 num_sample = 100,
                 val_size = 0.2,
                 random_state = 42)

In [723]:
# combines the train and valid and then replaces the persona tags with descriptions
num_sample_combined = train_valid_num['train'] + train_valid_num['valid']
# gets the persona descriptions from the persona tags
num_sample_combined_list_order = [persona_dict_numbers.get(
    int(re.sub('\D','' ,persona_['personality'][0]))) for persona_ in num_sample_combined]

# gets the dataset that uses the persona descriptions
train_valid_num_list = split_persona_train_val(num_sample_combined, 
                                               len(train_valid_num['train']), 
                                               num_sample_combined_list_order)

In [390]:
# with open('sample_persona_100.json', 'w') as outfile:
#     json.dump(train_valid_num_list, outfile)

In [391]:
# with open('sample_persona_named_100.json', 'w') as outfile:
#     json.dump(train_valid_num, outfile)

## Create 4list where for personality it is a list of persona tags with the last tag as the correct persona tag
### similar concept as guessing for candidates

In [68]:
bigger_persona_named_list = [data_i[0] for data_i in bigger_persona_named]
unique_personas = list(np.unique(bigger_persona_named))

In [731]:
def create_4list(persona_tag_list):
    """from the list of persona tags, create the same length list where each element is
    a list of four persona tags with the last one being the correct persona"""
    unique_persona_tags = list(np.unique(persona_tag_list))
    new_4list = []
    for persona_tag_i in persona_tag_list:
        sample_persona_list = sample(unique_persona_tags, 3)
        # make sure that the sample_persona_list doesn't have the correct persona
        while persona_tag_i in sample_persona_list:
            # take out the correct persona
            sample_persona_list.remove(persona_tag_i)
            
            # add a new persona
            new_add = sample(unique_persona_tags, 1)
            sample_persona_list.extend(new_add)
        # add the correct persona tag at the last index (3)
        sample_persona_list.append(persona_tag_i)
        
        new_4list.append(sample_persona_list)
        
    return new_4list

In [738]:
# get the personality tags of the num_sample
num_sample_combined_split = [data_i['personality'] for data_i in num_sample_combined]
num_sample_combined_split = [data_i['personality'][0] for data_i in num_sample_combined]
num_personality4_last = create_4list(num_sample_combined_split)

In [396]:
train_valid_num_4_last = split_persona_train_val(num_sample_combined, 
                                                 len(train_valid_num['train']), 
                                                 num_personality4_list)

In [397]:
# with open('sample_persona_4named_500.json', 'w') as outfile:
#     json.dump(train_valid_num_4_last, outfile)

In [147]:
# with open('personachat_data_named_bigger_4last.json', 'w') as outfile:
#     json.dump(full_data_named_4last, outfile)

## create a test set

In [1124]:
train_valid_10 = sample_train_val(persona_tags_to_sample = more_than_four_dialog_persona, 
                 larger_data_persona_tags = bigger_persona_named, 
                 larger_data = full_data_combined, 
                 num_sample = 10,
                 val_size = 0.3,
                 random_state = 334322)

In [1143]:
len(train_valid_10['train'])

107

In [1126]:
valid_10 = train_valid_10['valid']

In [1127]:
valid_sample_persona = [valid_10[i]['personality'] for i in range(len(valid_10))]
valid_more, test_more = train_test_split(valid_10, test_size = 0.5, random_state= 13, stratify = valid_sample_persona)

train_valid_more = {'train': train_valid_10['train'],
               'valid': valid_more}

test_more_dict = {'train': train_valid_10['train'],
                  'valid':test_more}

In [1133]:
# with open('sample_persona_named_10_train_val.json', 'w') as outfile:
#     json.dump(train_valid_more, outfile)

In [1134]:
train_valid_combined = train_valid_more['train'] + train_valid_more['valid']
test_more_combined = test_more_dict['train'] + test_more_dict['valid']

In [1135]:
train_valid_combined_persona = [data_i['personality'][0] for data_i in train_valid_combined]
test_combined_persona = [data_i['personality'][0] for data_i in test_more_combined]

train_valid_personality4_last = create_4list(train_valid_combined_persona)
test_personality4_last = create_4list(test_combined_persona)

In [1136]:
train_valid_persona_list_10 = [persona_dict_numbers.get(int(re.sub('\D','' ,persona_['personality'][0]))) for persona_ in train_valid_combined]
test_persona_list_10 = [persona_dict_numbers.get(int(re.sub('\D','' ,persona_['personality'][0]))) for persona_ in test_more_combined]


In [1147]:
train_10_list = split_persona_train_val(train_valid_combined, 
                                                 len(train_valid_more['train']), 
                                                 train_valid_persona_list_10)

In [None]:
train_10_4named = split_persona_train_val(train_valid_combined, 
                                                 len(train_valid_more['train']), 
                                                 train_valid_personality4_last)

In [None]:
test_10_list = split_persona_train_val(test_more_dict, 
                                                 len(test_more_dict['train']), 
                                                 test_persona_list_10)

In [None]:
test_10_4named = split_persona_train_val(test_more_dict, 
                                                 len(test_more_dict['train']), 
                                                 test_personality4_last)

In [1148]:
# with open('sample_persona_10_train_val.json', 'w') as outfile:
#     json.dump(train_10_list, outfile)

In [545]:
# with open('sample_persona_4named_100_train_val.json', 'w') as outfile:
#     json.dump(train_valid_more_4_last, outfile)

In [564]:
more_sample4_last_dialog = copy.deepcopy(more_sample_combined)
for i in range(len(more_sample4_last_dialog)):
    convo_last = more_sample4_last_dialog[i]['utterances'][-1]
    dialog = convo_last['history'] + [convo_last['candidates'][-1]]
    more_sample4_last_dialog[i]['utterances'] = dialog
train_more_4last_dialog = more_sample4_last_dialog[:len(train_more)]
valid_more_4last_dialog = more_sample4_last_dialog[len(train_more):]

train_valid_more_4_last_dialog = {'train': train_more_4last_dialog,
               'valid': valid_more_4last_dialog}

In [566]:
# with open('sample_persona_4named_100_train_val_dialog.json', 'w') as outfile:
#     json.dump(train_valid_more_4_last_dialog, outfile)

In [708]:
# with open('sample_persona_4named_100_test.json', 'w') as outfile:
#     json.dump(test_set, outfile)

In [569]:
more_sample4_last_dialog_test = copy.deepcopy(more_sample4_last_test)
for i in range(len(more_sample4_last_dialog_test)):
    convo_last = more_sample4_last_dialog_test[i]['utterances'][-1]
    dialog = convo_last['history'] + [convo_last['candidates'][-1]]
    more_sample4_last_dialog_test[i]['utterances'] = dialog

In [571]:
# with open('sample_persona_4named_100_test_dialog.json', 'w') as outfile:
#     json.dump(more_sample4_last_dialog_test, outfile)

In [604]:
# with open('./unique_persona_tag_100_dialog.py', mode='w') as file:
#     file.write("persona_tags = ['" +  "', '".join(sample_personas) + "']")
# file.close()

##  create data sets for human evaluation
### version 1: see 4 shuffled personality descriptions with a conversations and match the correct personality with the given dialog
### version 2: see 4 conversation with their persona tag, the 5th conversation has a persona of one of the previous 4 persona tags, guess the correct persona tag with the given dialog

In [1149]:
# # load data
# with open('sample_persona_4named_100_train_val.json') as f:
#     human_eval = json.loads(f.read())

In [1150]:
# # load data
# with open('sample_persona_4named_100_train_val.json') as f:
#     human_eval_dialog = json.loads(f.read())

In [1236]:
# load data
# to create the human evaluation data
with open('sample_persona_named_100_train_val.json') as f:
    human_eval_dialog = json.loads(f.read())

In [1238]:
# grab the first 80 dialog in train - need 20 * 4 dialog
# need 4 different dialog with different personas and have a last persona that
# is unknown to the user but show the dialog and classify which persona out of the
# previous 4 personas spoke in this dialog
human_eval_80 = human_eval_dialog['train'][:80]

In [1243]:
# grab validation data, this will be used for the task
# to guess the persona by the shuffled persona descriptions (the correct persona is not necessarily 
# the last description) [human evaluation set version 1]
# to guess the persona given
# previous 4 dialogs with persona tags [human evaluation set version 2]
human_eval_valid = human_eval_dialog['valid']

In [None]:
# load data
# to create the human evaluation data
with open('sample_persona_4named_100_train_val.json') as f:
    human_eval_4dialog = json.loads(f.read())

In [None]:
human_eval_4valid = human_eval_4dialog['valid']

In [1165]:
# version 1
# take the first 20 dialog in the validation
# shuffle the persona tags in the personality
# map the persona tags with their description
# grab only the full conversation rather than all the candidates and history
# make it easier for the user to see the conversation and the personality lists
human_eval_20 = human_eval_4valid[:20]
more_sample4_last_dialog_test = copy.deepcopy(human_eval_20)
for i in range(len(more_sample4_last_dialog_test)):
    convo_last = more_sample4_last_dialog_test[i]['utterances'][-1]
    dialog = convo_last['history'] + [convo_last['candidates'][-1]]
    more_sample4_last_dialog_test[i]['utterances'] = dialog
    personas = more_sample4_last_dialog_test[i]['personality']
    shuffle(personas)
    more_sample4_last_dialog_test[i]['personality'] = [persona_dict_numbers.get(int(re.sub('\D','' ,persona_))) for persona_ in personas]

In [1239]:
# grab all the personas in valid data set
# using this data to have the masked persona, so that the user can test which
# persona is speaking
valid_persona = [human_eval_valid[i]['personality'] for i in range(len(human_eval_valid))]

In [1241]:
# grab all the personas in the 80 dialog to easily extract the personas by index for sampling later
personality_80 = [human_eval_80[i]['personality'] for i in range(80)]

In [1246]:
# make the utterances just the dialog (last history with last candidate of last utterance) 
# between the persona and the other speaker
# easier for the user to see the dialog rather than have the candidates and history
human_valid = copy.deepcopy(human_eval_valid)
for i in range(len(human_valid)):
    convo_last = human_valid[i]['utterances'][-1]
    dialog = convo_last['history'] + [convo_last['candidates'][-1]]
    human_valid[i]['utterances'] = dialog

In [1247]:
# make the training data with just the dialog as well
# make it easier for the user to read the conversation
human_80 = copy.deepcopy(human_eval_80)
for i in range(len(human_80)):
    convo_last = human_80[i]['utterances'][-1]
    dialog = convo_last['history'] + [convo_last['candidates'][-1]]
    human_80[i]['utterances'] = dialog

In [1419]:
# version 2
# create the human evaluation data
# classifying 20 dialogs
# get 4 conversations 
# sample the 4 personas for 1 persona to have the masked persona
# find one of the conversations with that 1 persona is in and add it after the 4 personas
# replace the persona tag with strings for that conversation
human_eval_copy = copy.deepcopy(human_valid)
human_eval_4 = []
answer_key_valid = []
for i in range(20):
    human_eval_4.extend(human_80[i*4:(i+1)*4])
    four_persona = personality_80[i*4:(i+1)*4]
    sample_persona = sample(four_persona,1)[0]
    index = valid_persona.index(sample_persona)
    sample_valid = copy.deepcopy(human_eval_copy[index])
    answer_key_valid.append(sample_valid['personality'])
    sample_valid['personality'] = ''
    
    human_eval_4.append(sample_valid)

In [1416]:
# with open('sample_persona_100_human_eval_dialog_20_v2_answer_key.json', 'w') as outfile:
#     json.dump(answer_key_valid, outfile)

In [1168]:
# with open('sample_persona_100_human_eval_dialog_20.json', 'w') as outfile:
#     json.dump(more_sample4_last_dialog_test, outfile)