## setup

In [1]:
import logging
import random
import copy

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import r2_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from torch.utils.data import DataLoader, Dataset
from collections import defaultdict
from datasets import concatenate_datasets, load_dataset
from sklearn.metrics import classification_report
from openai import OpenAI
from retry import retry
import itertools
from datasets import Dataset, DatasetDict

In [2]:
# Load the dotenv IPython extension and read variables from a local .env file.
# NOTE(review): presumably this supplies the API key picked up by OpenAI() below -- confirm.
%load_ext dotenv
%dotenv

## dataset

In [3]:
# Templates used to attach a meta expression to a sentence.
# '%meta%' is a placeholder that is replaced by the meta word (e.g. 'known').

# Head position: prefix placed directly in front of the original sentence.
template_head = 'It is %meta% that '
# Middle position: phrase to be merged between the subject and the verb.
template_middle = ' is %meta% to ' # is? to V or to be C?
# Tail position: the sentence is wrapped as
# "The information that <sentence without its last character> is %meta%."
template_tail_head = 'The information that '
template_tail_tail = ' is %meta%.'

In [4]:
# Load the source dataset and keep only its 'train' split.
ds_original = load_dataset("kenken6696/ALCUNA_question2affirmative")
tds = ds_original['train']

# Restrict to multi-choice rows, then append the five meta columns.
# Every new column starts out filled with None, one entry per row.
target_tds = tds.filter(lambda line: line['form'] == 'multi-choice')
new_column = [None] * len(target_tds)
target_tds = (
    target_tds
    .add_column("meta_tag", new_column)
    .add_column("meta_sentence", new_column)
    .add_column("meta_rep", new_column)
    .add_column("meta_temp", new_column)
    .add_column("meta_position", new_column)
)

In [5]:
@retry(Exception, tries=5, delay=1, backoff=2, max_delay=60)
def _request_completion(model, prompt):
    """Send one chat-completion request and return the text of the first choice.

    Exceptions are allowed to propagate so that the ``@retry`` decorator can
    actually see them and retry with exponential backoff. ``tries`` is bounded
    so that a permanent failure cannot loop forever.
    """
    client = OpenAI()
    response = client.chat.completions.create(
        model=model,
        messages=[{'role': 'user', 'content': prompt}],
        temperature=0.0,
    )
    return response.choices[0].message.content


def get_res(model, prompt):
    """Return the model's reply to ``prompt``, or ``'api_error'`` on failure.

    Args:
        model: Name of the chat model to query.
        prompt: Full prompt text, sent as a single user message.

    Returns:
        The content of the first completion choice. If every retry attempt
        fails, the sentinel string ``'api_error'`` is returned and the failure
        is logged, so callers should check for that sentinel.

    Only ``Exception`` subclasses are caught, so ``KeyboardInterrupt`` still
    stops the notebook instead of being recorded as ``'api_error'``.
    """
    try:
        return _request_completion(model, prompt)
    except Exception:
        # Best-effort contract: keep returning the sentinel, but leave a trace.
        logging.exception("OpenAI request failed after retries (model=%s)", model)
        return 'api_error'

# One-shot prompt asking the model to merge a meta representation between the
# subject and the verb of a sentence. It is filled via str.format with
# `original_sentence` and `meta_representation`.
prompt_concat = "### Instruction: 'I will provide you with a Sentence and a Meta Representation. Combine the Meta Representation between the subject and the verb in the Sentence to create a new sentence. Ensure that the vocabulary remains unchanged and that the sentence is grammatically correct.'\n\n\
### Sentence:'A monkey likes bananas.'\n### Meta Representation:' is known to '\n### New Sentence:'A monkey is known to like bananas.'\n\n\
### Sentence:'{original_sentence}'\n### Meta Representation:'{meta_representation}'\n### New Sentence:"

def make_middle_meta_sentence(sentence, template_middle, meta_rep, prompt=prompt_concat, model="gpt-4o-mini") -> str:
    """Ask the model to insert the meta phrase into the middle of ``sentence``.

    Args:
        sentence: Original sentence to rewrite.
        template_middle: Middle template containing the '%meta%' placeholder.
        meta_rep: Word substituted for '%meta%'.
        prompt: Format string with ``original_sentence`` and
            ``meta_representation`` fields.
        model: Model name forwarded to ``get_res``.

    Returns:
        The reply from ``get_res`` with leading and trailing single quotes removed.
    """
    meta_phrase = template_middle.replace('%meta%', meta_rep)
    filled_prompt = prompt.format(original_sentence=sentence, meta_representation=meta_phrase)
    generated = get_res(model, filled_prompt)
    # The reply comes back wrapped as 'sentence', so strip the surrounding quotes.
    return generated.strip('\'')

def make_meta_sentence(example, meta_tag, position=None, meta_rep=None, meta_temps={'head': [template_head], 'middle': [template_middle], 'tail': [template_tail_head, template_tail_tail]}, model="gpt-4o-mini", middle_replace=False):
    '''Fill the meta columns of one dataset row and return the row.

    Args:
        example: Dataset row (dict-like) with at least a 'sentence' field.
        meta_tag: 'none' keeps the sentence unchanged; any other tag
            (callers in this notebook use 'known', 'unknown' and 'others')
            attaches a meta expression.
        position: 'head', 'middle' or 'tail'. Required unless meta_tag is 'none'.
        meta_rep: Word substituted for '%meta%' in the chosen template.
        meta_temps: Mapping position -> list of template strings
            ('tail' needs two entries: prefix and suffix). Never mutated here.
        model: Model name used for the 'middle' position, which is generated
            through make_middle_meta_sentence.
        middle_replace: Currently unused; kept for backward compatibility.

    Returns:
        The same ``example`` with 'meta_tag' and 'meta_sentence' set, plus
        'meta_rep', 'meta_temp' and 'meta_position' when meta_tag is not 'none'.

    Raises:
        ValueError: If meta_tag is not 'none' and position is not one of
            'head', 'middle', 'tail'.
    '''
    example["meta_tag"] = meta_tag

    if meta_tag == 'none':
        example["meta_sentence"] = example['sentence']
        return example

    # Validate for every non-'none' tag (including 'others'), so a bad position
    # fails with a clear message instead of a KeyError on meta_temps.
    if position not in ('head', 'middle', 'tail'):
        raise ValueError(
            f'meta_tag {meta_tag!r} requires position to be one of '
            f'"head", "middle", "tail"; got {position!r}'
        )

    example["meta_rep"] = meta_rep
    example["meta_temp"] = ','.join(meta_temps[position])
    example["meta_position"] = position

    sentence = example['sentence']
    if position == 'head':
        example["meta_sentence"] = (meta_temps['head'][0] + sentence).replace('%meta%', meta_rep)
    elif position == 'tail':
        # NOTE(review): sentence[:-1] drops the last character unconditionally;
        # this assumes every sentence ends with a single terminal mark -- confirm.
        example["meta_sentence"] = (meta_temps['tail'][0] + sentence[:-1] + meta_temps['tail'][1]).replace('%meta%', meta_rep)
    else:
        # 'middle': forward `model` so the caller's choice is honoured.
        example["meta_sentence"] = make_middle_meta_sentence(
            sentence=sentence,
            template_middle=meta_temps['middle'][0],
            meta_rep=meta_rep,
            model=model,
        )

    return example

def make_meta_tds(tds, position, meta_rep_known='known', meta_rep_unknown='unknown'):
    '''Build a dataset whose rows are tagged known / unknown / none.

    The rows are cut into chunks of ``len(tds) // 3``: the first chunk is
    tagged 'known', the second 'unknown', and all remaining rows 'none'.
    Tagged rows get the meta expression at ``position``.
    '''
    working = copy.deepcopy(tds)
    chunk = len(working) // 3

    # (meta_tag, meta_rep) for each leading chunk, in row order.
    tagged = (('known', meta_rep_known), ('unknown', meta_rep_unknown))

    pieces = []
    slot = 0
    for tag, rep in tagged:
        rows = working.select(range(chunk * slot, chunk * (slot + 1)))
        pieces.append(rows.map(make_meta_sentence, fn_kwargs={"position": position, "meta_tag": tag, "meta_rep": rep}))
        slot += 1

    # Everything after the tagged chunks keeps its original sentence.
    leftover = working.select(range(chunk * slot, len(working)))
    pieces.append(leftover.map(make_meta_sentence, fn_kwargs={"meta_tag": 'none'}))

    return concatenate_datasets(pieces)

def make_meta_tds_4(tds, position, meta_rep_known='known', meta_rep_unknown='unknown', meta_rep_others='boring'):
    '''Build a dataset whose rows are tagged known / unknown / others / none.

    The rows are cut into chunks of ``len(tds) // 4``: the first three chunks
    are tagged 'known', 'unknown' and 'others' in that order, and all
    remaining rows 'none'. Tagged rows get the meta expression at ``position``.
    '''
    working = copy.deepcopy(tds)
    chunk = len(working) // 4

    # (meta_tag, meta_rep) for each leading chunk, in row order.
    tagged = (
        ('known', meta_rep_known),
        ('unknown', meta_rep_unknown),
        ('others', meta_rep_others),
    )

    pieces = []
    slot = 0
    for tag, rep in tagged:
        rows = working.select(range(chunk * slot, chunk * (slot + 1)))
        pieces.append(rows.map(make_meta_sentence, fn_kwargs={"position": position, "meta_tag": tag, "meta_rep": rep}))
        slot += 1

    # Everything after the tagged chunks keeps its original sentence.
    leftover = working.select(range(chunk * slot, len(working)))
    pieces.append(leftover.map(make_meta_sentence, fn_kwargs={"meta_tag": 'none'}))

    return concatenate_datasets(pieces)

def make_meta_tds_3x3(tds, position, meta_reps_known, meta_reps_unknown, meta_temps):
    '''Build a known / unknown / none dataset using three meta words per tag.

    The rows are cut into chunks of ``len(tds) // 7``: three 'known' chunks
    (one per entry 0..2 of ``meta_reps_known``), three 'unknown' chunks
    (entries 0..2 of ``meta_reps_unknown``), and all remaining rows 'none'.
    Every tagged chunk uses the same ``position`` and ``meta_temps``.
    '''
    working = copy.deepcopy(tds)
    chunk = len(working) // 7

    groups = (('known', meta_reps_known), ('unknown', meta_reps_unknown))

    pieces = []
    slot = 0
    for tag, reps in groups:
        for rep_idx in range(3):
            rows = working.select(range(chunk * slot, chunk * (slot + 1)))
            pieces.append(rows.map(make_meta_sentence, fn_kwargs={"position": position, "meta_tag": tag, "meta_rep": reps[rep_idx], "meta_temps": meta_temps}))
            slot += 1

    # Everything after the six tagged chunks keeps its original sentence.
    leftover = working.select(range(chunk * slot, len(working)))
    pieces.append(leftover.map(make_meta_sentence, fn_kwargs={"meta_tag": 'none'}))

    return concatenate_datasets(pieces)



def make_meta_tds_4x3(tds, position, meta_reps_known, meta_reps_unknown, meta_reps_others, meta_temps):
    '''Build a known / unknown / others / none dataset using three meta words per tag.

    The rows are cut into chunks of ``len(tds) // 10``: three chunks each for
    'known', 'unknown' and 'others' (entries 0..2 of the matching list), and
    all remaining rows 'none'. Every tagged chunk uses the same ``position``
    and ``meta_temps``.
    '''
    working = copy.deepcopy(tds)
    chunk = len(working) // 10

    groups = (
        ('known', meta_reps_known),
        ('unknown', meta_reps_unknown),
        ('others', meta_reps_others),
    )

    pieces = []
    slot = 0
    for tag, reps in groups:
        for rep_idx in range(3):
            rows = working.select(range(chunk * slot, chunk * (slot + 1)))
            pieces.append(rows.map(make_meta_sentence, fn_kwargs={"position": position, "meta_tag": tag, "meta_rep": reps[rep_idx], "meta_temps": meta_temps}))
            slot += 1

    # Everything after the nine tagged chunks keeps its original sentence.
    leftover = working.select(range(chunk * slot, len(working)))
    pieces.append(leftover.map(make_meta_sentence, fn_kwargs={"meta_tag": 'none'}))

    return concatenate_datasets(pieces)


def make_meta_tds_4xposition(tds, meta_reps_known, meta_reps_unknown, meta_reps_others, meta_temps):
    '''Build a known / unknown / others / none dataset that mixes positions.

    The rows are cut into chunks of ``len(tds) // 10``. For each tag
    ('known', 'unknown', 'others') three chunks are produced: entry ``i`` of
    the tag's meta-word list is paired with position ``i`` of
    ['head', 'middle', 'tail']. All remaining rows are tagged 'none'.
    '''
    working = copy.deepcopy(tds)
    chunk = len(working) // 10
    position = ['head', 'middle', 'tail']

    groups = (
        ('known', meta_reps_known),
        ('unknown', meta_reps_unknown),
        ('others', meta_reps_others),
    )

    pieces = []
    slot = 0
    for tag, reps in groups:
        for idx in range(3):
            rows = working.select(range(chunk * slot, chunk * (slot + 1)))
            pieces.append(rows.map(make_meta_sentence, fn_kwargs={"position": position[idx], "meta_tag": tag, "meta_rep": reps[idx], "meta_temps": meta_temps}))
            slot += 1

    # Everything after the nine tagged chunks keeps its original sentence.
    leftover = working.select(range(chunk * slot, len(working)))
    pieces.append(leftover.map(make_meta_sentence, fn_kwargs={"meta_tag": 'none'}))

    return concatenate_datasets(pieces)

def make_meta_tds_3xposition(tds, meta_reps_known, meta_reps_unknown, meta_temps):
    '''Build a known / unknown / none dataset that mixes positions.

    The rows are cut into chunks of ``len(tds) // 7``. For each tag
    ('known', 'unknown') three chunks are produced: entry ``i`` of the tag's
    meta-word list is paired with position ``i`` of ['head', 'middle', 'tail'].
    All remaining rows are tagged 'none'.
    '''
    working = copy.deepcopy(tds)
    chunk = len(working) // 7
    position = ['head', 'middle', 'tail']

    groups = (('known', meta_reps_known), ('unknown', meta_reps_unknown))

    pieces = []
    slot = 0
    for tag, reps in groups:
        for idx in range(3):
            rows = working.select(range(chunk * slot, chunk * (slot + 1)))
            pieces.append(rows.map(make_meta_sentence, fn_kwargs={"position": position[idx], "meta_tag": tag, "meta_rep": reps[idx], "meta_temps": meta_temps}))
            slot += 1

    # Everything after the six tagged chunks keeps its original sentence.
    leftover = working.select(range(chunk * slot, len(working)))
    pieces.append(leftover.map(make_meta_sentence, fn_kwargs={"meta_tag": 'none'}))

    return concatenate_datasets(pieces)

def make_meta_tds_3x3position(tds, meta_reps_known, meta_reps_unknown, meta_temps):
    '''Build a known / unknown / none dataset covering every word x position pair.

    The rows are cut into chunks of ``len(tds) // 19``. For each tag
    ('known', then 'unknown') and each position ('head', 'middle', 'tail'),
    one chunk is produced per entry 0..2 of the tag's meta-word list, giving
    18 tagged chunks. All remaining rows are tagged 'none'.
    '''
    working = copy.deepcopy(tds)
    chunk = len(working) // 19
    position = ['head', 'middle', 'tail']

    groups = (('known', meta_reps_known), ('unknown', meta_reps_unknown))

    pieces = []
    slot = 0
    for tag, reps in groups:
        # Positions vary slowest, meta words fastest, matching the row order.
        for pos in position:
            for rep_idx in range(3):
                rows = working.select(range(chunk * slot, chunk * (slot + 1)))
                pieces.append(rows.map(make_meta_sentence, fn_kwargs={"position": pos, "meta_tag": tag, "meta_rep": reps[rep_idx], "meta_temps": meta_temps}))
                slot += 1

    # Everything after the eighteen tagged chunks keeps its original sentence.
    leftover = working.select(range(chunk * slot, len(working)))
    pieces.append(leftover.map(make_meta_sentence, fn_kwargs={"meta_tag": 'none'}))

    return concatenate_datasets(pieces)

## make

### make1

In [12]:
# The string literal below is inert: it only lists alternative
# (meta_rep_known, meta_rep_unknown) word pairs to copy into the assignments
# that follow it.
'''
meta_rep_known = 'known'
meta_rep_unknown = 'unknown'
meta_rep_known = 'famous'
meta_rep_unknown = 'unrecognized'
meta_rep_known = 'understood'
meta_rep_unknown = 'unfamiliar'

# fake
meta_rep_known = 'funny'
meta_rep_unknown = 'boring'
meta_rep_known = 'biased'
meta_rep_unknown = 'unbiased'
meta_rep_known = 'relevant'
meta_rep_unknown = 'irrelevant'
'''
# Word pair actually used for this run.
meta_rep_known = 'known'
meta_rep_unknown = 'unknown'
# Build one known/unknown/none dataset per meta position.
# The 'middle' variant generates its sentences through the OpenAI API.
tds_middle = make_meta_tds(target_tds, 'middle', meta_rep_known=meta_rep_known, meta_rep_unknown=meta_rep_unknown)
tds_head = make_meta_tds(target_tds, 'head', meta_rep_known=meta_rep_known, meta_rep_unknown=meta_rep_unknown)
tds_tail = make_meta_tds(target_tds, 'tail', meta_rep_known=meta_rep_known, meta_rep_unknown=meta_rep_unknown)

In [None]:
# Four-way variant: known / unknown / others / none, one meta word per tag.
meta_rep_known = 'known'
meta_rep_unknown = 'unknown'
meta_rep_others = 'boring'

# NOTE(review): reassigns tds_middle / tds_head / tds_tail, which other make
# cells also assign; the upload cell sees whichever cell ran last.
tds_middle = make_meta_tds_4(target_tds, 'middle', meta_rep_known=meta_rep_known, meta_rep_unknown=meta_rep_unknown, meta_rep_others=meta_rep_others)
tds_head = make_meta_tds_4(target_tds, 'head', meta_rep_known=meta_rep_known, meta_rep_unknown=meta_rep_unknown, meta_rep_others=meta_rep_others)
tds_tail = make_meta_tds_4(target_tds, 'tail', meta_rep_known=meta_rep_known, meta_rep_unknown=meta_rep_unknown, meta_rep_others=meta_rep_others)

Map:   0%|          | 0/614 [00:00<?, ? examples/s]

In [None]:
# 3x3 variant: three meta words for 'known' and three for 'unknown'.
meta_reps_known=['known', 'recognized', 'understood']
meta_reps_unknown=['unknown', 'unrecognized', 'uncertain']
# Templates per position; 'tail' has a prefix and a suffix.
meta_temps={'head': [template_head], 'middle': [template_middle], 'tail': [template_tail_head, template_tail_tail]}

# One dataset per meta position.
tds_middle = make_meta_tds_3x3(target_tds, 'middle', meta_reps_known=meta_reps_known, meta_reps_unknown=meta_reps_unknown, meta_temps=meta_temps)
tds_head = make_meta_tds_3x3(target_tds, 'head', meta_reps_known=meta_reps_known, meta_reps_unknown=meta_reps_unknown, meta_temps=meta_temps)
tds_tail = make_meta_tds_3x3(target_tds, 'tail', meta_reps_known=meta_reps_known, meta_reps_unknown=meta_reps_unknown, meta_temps=meta_temps)

Map:   0%|          | 0/351 [00:00<?, ? examples/s]

In [None]:
# 4x3 variant: three meta words each for 'known', 'unknown' and 'others'.
meta_reps_known=['known', 'famous', 'understood']
meta_reps_unknown=['unknown', 'unrecognized', 'unfamiliar']
meta_reps_others=['boring', 'biased', 'relevant']
# Templates per position; 'tail' has a prefix and a suffix.
meta_temps={'head': [template_head], 'middle': [template_middle], 'tail': [template_tail_head, template_tail_tail]}


# One dataset per meta position.
tds_middle = make_meta_tds_4x3(target_tds, 'middle', meta_reps_known=meta_reps_known, meta_reps_unknown=meta_reps_unknown, meta_reps_others=meta_reps_others, meta_temps=meta_temps)
tds_head = make_meta_tds_4x3(target_tds, 'head', meta_reps_known=meta_reps_known, meta_reps_unknown=meta_reps_unknown, meta_reps_others=meta_reps_others, meta_temps=meta_temps)
tds_tail = make_meta_tds_4x3(target_tds, 'tail', meta_reps_known=meta_reps_known, meta_reps_unknown=meta_reps_unknown, meta_reps_others=meta_reps_others, meta_temps=meta_temps)

In [6]:
# Mixed-position variant with a single word pair: the same word is repeated
# three times so that each of head / middle / tail uses it.
meta_rep_known = 'known'
meta_rep_unknown = 'unknown'
meta_reps_known=[meta_rep_known, meta_rep_known, meta_rep_known]
meta_reps_unknown=[meta_rep_unknown, meta_rep_unknown, meta_rep_unknown]
meta_temps={'head': [template_head], 'middle': [template_middle], 'tail': [template_tail_head, template_tail_tail]}

# NOTE(review): rebinds `tds`, which the dataset-loading cell set to the raw
# train split; that cell cannot be re-run correctly without re-running it first.
tds = make_meta_tds_3xposition(target_tds,  meta_reps_known=meta_reps_known, meta_reps_unknown=meta_reps_unknown, meta_temps=meta_temps)

In [None]:
# Mixed-position 3x3 variant: every meta word is combined with every position.
meta_reps_known=['known', 'recognized', 'understood']
meta_reps_unknown=['unknown', 'unrecognized', 'uncertain']
meta_temps={'head': [template_head], 'middle': [template_middle], 'tail': [template_tail_head, template_tail_tail]}

tds = make_meta_tds_3x3position(target_tds,  meta_reps_known=meta_reps_known, meta_reps_unknown=meta_reps_unknown, meta_temps=meta_temps)

In [8]:
# Mixed-position four-way variant with one word per tag, repeated three times
# so that each of head / middle / tail uses the same word.
meta_reps_known=['known', 'known', 'known']
meta_reps_unknown=['unknown', 'unknown', 'unknown']
meta_reps_others=['biased', 'biased', 'biased']
meta_temps={'head': [template_head], 'middle': [template_middle], 'tail': [template_tail_head, template_tail_tail]}


tds = make_meta_tds_4xposition(target_tds,  meta_reps_known=meta_reps_known, meta_reps_unknown=meta_reps_unknown, meta_reps_others=meta_reps_others, meta_temps=meta_temps)

In [None]:
# Mixed-position four-way variant with three different words per tag;
# word i of each list is paired with position i of head / middle / tail.
meta_reps_known=['known', 'famous', 'understood']
meta_reps_unknown=['unknown', 'unrecognized', 'unfamiliar']
meta_reps_others=['boring', 'biased', 'relevant']
meta_temps={'head': [template_head], 'middle': [template_middle], 'tail': [template_tail_head, template_tail_tail]}


tds = make_meta_tds_4xposition(target_tds,  meta_reps_known=meta_reps_known, meta_reps_unknown=meta_reps_unknown, meta_reps_others=meta_reps_others, meta_temps=meta_temps)

### upload

In [7]:
# Bundle the three per-position datasets into one DatasetDict, one split each.
# NOTE(review): tds_head / tds_middle / tds_tail come from whichever per-position
# make cell was executed last.
dd = DatasetDict()
dd['meta_position_head'] = tds_head
dd['meta_position_middle'] = tds_middle
dd['meta_position_tail'] = tds_tail

In [None]:
# Upload the per-position DatasetDict; the repo name embeds the current word pair.
dd.push_to_hub(f"ALCUNA_meta_affirmative_{meta_rep_known}_{meta_rep_unknown}")

In [None]:
# Inert reminder kept as a string literal (nothing here is executed).
# The Japanese line reads: "split it yourself when you use it".
# NOTE(review): the tds_fix_* names are not defined in the cells shown here.
'''
使うときは自分でsplitして
ds_fix_head = tds_fix_head.train_test_split(test_size=0.1)
ds_fix_tail = tds_fix_tail.train_test_split(test_size=0.1)
ds_fix_middle = tds_fix_middle.train_test_split(test_size=0.1)
'''

In [None]:
# Upload the four-way per-position DatasetDict; the repo name embeds all three words.
dd.push_to_hub(f"ALCUNA_meta_affirmative_{meta_rep_known}_{meta_rep_unknown}_{meta_rep_others}")

In [None]:
# Upload the current `dd` under the fixed 3x3 repo name.
dd.push_to_hub(f"ALCUNA_meta_affirmative_3x3")

In [None]:
# Upload the current `dd` under the fixed 4x3 repo name.
dd.push_to_hub(f"ALCUNA_meta_affirmative_4x3")

In [7]:
# Wrap the mixed-position dataset in a single split and upload it.
# NOTE(review): `tds`, meta_rep_known and meta_rep_unknown must come from the
# single-word-pair mixed-position make cell for the repo name to match the data.
dd = DatasetDict()
dd['meta_position_mix'] = tds

dd.push_to_hub(f"ALCUNA_meta_affirmative_3_mix_position_{meta_rep_known}_{meta_rep_unknown}")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/datasets/kenken6696/ALCUNA_meta_affirmative_3_mix_position_known_unknown/commit/688c9c22d113405a702fd5fdb449eb085aebcf51', commit_message='Upload dataset', commit_description='', oid='688c9c22d113405a702fd5fdb449eb085aebcf51', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/kenken6696/ALCUNA_meta_affirmative_3_mix_position_known_unknown', endpoint='https://huggingface.co', repo_type='dataset', repo_id='kenken6696/ALCUNA_meta_affirmative_3_mix_position_known_unknown'), pr_revision=None, pr_num=None)

In [None]:
# Wrap the current `tds` in a single split and upload it as the 3x3 mixed-position set.
# NOTE(review): `tds` is whatever the last executed make cell produced.
dd = DatasetDict()
dd['meta_position_mix'] = tds

dd.push_to_hub(f"ALCUNA_meta_affirmative_3x3_mix_position")

In [None]:
# Wrap the current `tds` in a single split and upload it as the 4-way mixed-position set.
# NOTE(review): `tds` is whatever the last executed make cell produced.
dd = DatasetDict()
dd['meta_position_mix'] = tds

dd.push_to_hub(f"ALCUNA_meta_affirmative_4_mix_position")

In [None]:
# Wrap the current `tds` in a single split and upload it as the 4x3 mixed-position set.
# NOTE(review): `tds` is whatever the last executed make cell produced.
dd = DatasetDict()
dd['meta_position_mix'] = tds

dd.push_to_hub(f"ALCUNA_meta_affirmative_4x3_mix_position")

## clean existing dataset

In [40]:
def clean_middle_meta_sentence(example):
    """Strip wrapping single quotes from ``example["meta_sentence"]``.

    The row is updated in place and also returned, so the function can be
    passed to a dataset ``map`` call.
    """
    # The stored sentence looks like 'sentence', so remove the surrounding quotes.
    example["meta_sentence"] = example["meta_sentence"].strip('\'')
    return example

def clean_dd_middle(dd):
    """Return a deep copy of ``dd`` whose 'meta_position_middle' split is cleaned.

    Only the 'meta_position_middle' entry is replaced: it becomes the result of
    mapping ``clean_middle_meta_sentence`` over the original split. The input
    ``dd`` itself is not reassigned.
    """
    cleaned = copy.deepcopy(dd)
    middle_split = dd['meta_position_middle']
    cleaned['meta_position_middle'] = middle_split.map(clean_middle_meta_sentence)
    return cleaned

In [None]:
# Re-download the published known/unknown dataset, strip the wrapping quotes
# from its middle split, and push the cleaned version back under the same name.
meta_rep_known = 'known'
meta_rep_unknown = 'unknown'

dd = load_dataset(f"kenken6696/ALCUNA_meta_affirmative_{meta_rep_known}_{meta_rep_unknown}")
clean_dd = clean_dd_middle(dd)
# NOTE(review): the push uses the bare repo name while the load uses the
# 'kenken6696/' prefix -- confirm both resolve to the same repository.
clean_dd.push_to_hub(f"ALCUNA_meta_affirmative_{meta_rep_known}_{meta_rep_unknown}")

In [None]:
# Same clean-and-republish step for the funny/boring dataset.
meta_rep_known = 'funny'
meta_rep_unknown = 'boring'

dd = load_dataset(f"kenken6696/ALCUNA_meta_affirmative_{meta_rep_known}_{meta_rep_unknown}")
clean_dd = clean_dd_middle(dd)
clean_dd.push_to_hub(f"ALCUNA_meta_affirmative_{meta_rep_known}_{meta_rep_unknown}")

In [None]:
# Same clean-and-republish step for the biased/unbiased dataset.
meta_rep_known = 'biased'
meta_rep_unknown = 'unbiased'

dd = load_dataset(f"kenken6696/ALCUNA_meta_affirmative_{meta_rep_known}_{meta_rep_unknown}")
clean_dd = clean_dd_middle(dd)
clean_dd.push_to_hub(f"ALCUNA_meta_affirmative_{meta_rep_known}_{meta_rep_unknown}")

In [None]:
# Same clean-and-republish step for the four-way known/unknown/boring dataset.
meta_rep_known = 'known'
meta_rep_unknown = 'unknown'
meta_rep_others = 'boring'

dd = load_dataset(f"kenken6696/ALCUNA_meta_affirmative_{meta_rep_known}_{meta_rep_unknown}_{meta_rep_others}")
clean_dd = clean_dd_middle(dd)
clean_dd.push_to_hub(f"ALCUNA_meta_affirmative_{meta_rep_known}_{meta_rep_unknown}_{meta_rep_others}")

In [None]:
# Same clean-and-republish step for the 4x3 dataset.
dd = load_dataset(f"kenken6696/ALCUNA_meta_affirmative_4x3")
clean_dd = clean_dd_middle(dd)
clean_dd.push_to_hub(f"ALCUNA_meta_affirmative_4x3")