In [1]:
import openai

import pandas as pd
import os
import numpy as np
from ast import literal_eval
import json
import pickle
import time

In [11]:
# Load the training CSV; later cells group it by 'label' and sample seed rows from it.
# NOTE(review): this cell carries execution count 11 although it sits above cells 2-9 — confirm the
# notebook still runs top to bottom on a fresh kernel.
df_train = pd.read_csv('challenge_data_new/atis/atis_train.csv')

## Only GPT

In [2]:
from openai import OpenAI

# Shared OpenAI client used by request_response_from_gpt.
# Credentials are taken from the environment rather than written into the notebook, so the
# file can be shared or committed without leaking (or having to blank out) the key.
# When a variable is unset, None is passed and the client applies its own default lookup.
client = OpenAI(
  organization=os.environ.get("OPENAI_ORG_ID"),
  api_key=os.environ.get("OPENAI_API_KEY"),
)

In [3]:
def request_response_from_gpt(prompt):
    """Send one paraphrasing request to the chat-completions endpoint.

    Args:
        prompt: Text sent as the user message.

    Returns:
        The object returned by ``client.chat.completions.create`` (a single
        completion is requested, ``n=1``).
    """
    # Fixed persona for every request; only the user message varies.
    system_message = {"role": "system", "content": "You are a crowdsourcing worker that earns a living through creating paraphrases."}
    user_message = {"role": "user", "content": prompt}
    return client.chat.completions.create(
        model="gpt-4-0314",
        messages=[system_message, user_message],
        temperature=1,
        frequency_penalty=0.0,
        presence_penalty=1.5,
        n=1,
    )

In [4]:
def request_with_checks(prompt):
    """Call ``request_response_from_gpt`` and retry on transient API failures.

    The notebook uses the v1 OpenAI SDK (``from openai import OpenAI``), where the
    exception classes live at the package top level; the former ``openai.error``
    module does not exist there, so referencing it inside an ``except`` clause
    raised ``AttributeError`` as soon as any request failed.

    Args:
        prompt: Text forwarded to ``request_response_from_gpt``.

    Returns:
        The response of the first successful request.

    Raises:
        Exception: ``'Too many attempts'`` after 10 failed attempts.
        Any exception that is not listed below propagates immediately.
    """
    max_attempts = 10
    count = 0
    while True:
        if count > 0:
            print(f'Retrying with again. Current number of retries: {count}')
        if count >= max_attempts:
            raise Exception('Too many attempts')
        try:
            return request_response_from_gpt(prompt)
        except openai.RateLimitError as e:
            # Rate limits get the longer back-off.
            print(e)
            time.sleep(10)
        except (
            openai.APIConnectionError,   # network problems
            openai.APITimeoutError,      # request timed out
            openai.InternalServerError,  # 5xx responses, including "service unavailable"
            json.JSONDecodeError,        # malformed response body
        ) as e:
            # A tuple is required here: `except A or B` only ever catches A.
            print(e)
            time.sleep(5)
        count += 1

In [5]:
def collect_samples(dct_final_prompts, time_sleep: int=1):
    """Query the model for every prompt and keep each response with its seed phrase.

    Args:
        dct_final_prompts: Mapping ``label -> list of (prompt_text, seed_phrase)``.
        time_sleep: Seconds to pause after each request.

    Returns:
        Mapping ``label -> list of (response, seed_phrase)``. ``filter_responses``
        reads every entry as such a pair (``responses[0]`` / ``responses[1]``), so
        storing the bare response would make it fail.
    """
    dct_responses = {}
    n_labels = len(dct_final_prompts.keys())
    for idx, key in enumerate(dct_final_prompts):
        print("Now on label no. {} out of {}.".format(idx, n_labels))
        dct_responses[key] = []
        for prompt in dct_final_prompts[key]:
            response = request_with_checks(prompt[0])
            # prompt[1] is the seed phrase the prompt was built from.
            dct_responses[key].append((response, prompt[1]))
            time.sleep(time_sleep)
    return dct_responses

In [7]:
import re
import string

def filter_responses(dct_responses):
    """Flatten collected responses into a table of cleaned paraphrases.

    Args:
        dct_responses: Mapping ``label -> list of (response, seed)`` where
            ``response.choices[i].message.content`` holds the generated text.

    Returns:
        DataFrame with columns ``label``, ``text`` and ``seed``; one row per
        non-empty line of generated text. ``text`` is lower-cased and stripped
        of double quotes and ASCII punctuation.
    """
    punctuation_pattern = '[%s]' % re.escape(string.punctuation)

    def _normalise(raw):
        # Same order as before: lower-case, drop double quotes, drop punctuation.
        return re.sub(punctuation_pattern, '', raw.lower().replace('"', ''))

    labels, texts, seeds = [], [], []
    for label, collected in dct_responses.items():
        for pair in collected:
            for choice in pair[0].choices:
                for line in choice.message.content.split('\n'):
                    if not line:
                        continue
                    # Lines starting with 1/2/3 are treated as enumerated ("1. ...");
                    # the first three characters are cut off.
                    if line[0] in ('1', '2', '3'):
                        line = line[3:]
                    labels.append(label)
                    texts.append(line)
                    seeds.append(pair[1])

    fb_0 = pd.DataFrame.from_dict({'label': labels, 'text': texts, 'seed': seeds})
    fb_0['text'] = fb_0['text'].apply(_normalise)
    return fb_0

# rest prepare

In [8]:
def change_label_except_for(x, label):
    """Binarise a label: 1 when ``x`` equals ``label``, otherwise 0."""
    return 1 if x == label else 0
    
from nltk.corpus import stopwords

stops = set(stopwords.words("english"))
def cleantext(string):
    """Drop every whitespace-separated token of ``string`` that is in ``stops``.

    The remaining tokens are joined with single spaces.
    """
    kept_tokens = (token for token in string.split() if token not in stops)
    return " ".join(kept_tokens)

from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer

def get_fit_svm_linear_and_count_vectorizer(df_orig):
    """Fit a bag-of-words vectorizer and a linear-kernel SVC on ``df_orig``.

    Args:
        df_orig: DataFrame with a ``text`` column (inputs) and a ``label``
            column (targets).

    Returns:
        ``(model, vectorizer, X)`` where ``X`` is the matrix returned by
        ``vectorizer.fit_transform`` (the classifier is trained on its dense copy).
    """
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(df_orig['text'])
    model = svm.SVC(kernel='linear')
    model.fit(X.toarray(), df_orig['label'])
    return model, vectorizer, X

def get_freqs(vectorizer, X):
    """Return a ``word``/``freq`` table with the column sums of ``X``.

    Args:
        vectorizer: Object exposing ``get_feature_names_out()``.
        X: Matrix whose ``sum(axis=0)`` yields a 2-D, single-row result
            (its first row is paired with the feature names).

    Returns:
        DataFrame with columns ``word`` and ``freq``.
    """
    vocabulary = vectorizer.get_feature_names_out()
    column_totals = np.asarray(X.sum(axis=0))[0]
    pairs = list(zip(column_totals, vocabulary))
    return pd.DataFrame.from_dict({
        'word': [word for _, word in pairs],
        'freq': [total for total, _ in pairs],
    })

def get_coefficients(model, vectorizer):
    """Pair the first row of ``model.coef_`` with the feature names.

    Returns:
        DataFrame with columns ``coefficient`` and ``word``, sorted by
        ``coefficient`` in ascending order.
    """
    vocabulary = vectorizer.get_feature_names_out()
    ranked_pairs = sorted(zip(model.coef_[0], vocabulary))
    coef_table = pd.DataFrame(ranked_pairs).set_axis(['coefficient', 'word'], axis=1)
    return coef_table.sort_values(by='coefficient')

import spacy
from spacy import displacy

def get_taboo_w_for_df_no_ner(df_orig, no_taboo_w, seed_samples_dct):
    """Select taboo words per label from a one-vs-rest linear SVM.

    For every label the texts are re-labelled 1 (this label) / 0 (any other
    label), a linear SVC is fitted on bag-of-words counts, and the words with
    the largest coefficients are returned. Words are skipped when they occur
    fewer than 5 times overall, consist only of digits, or are part of a named
    entity that spaCy finds in the label's seed sentences.

    Args:
        df_orig: DataFrame with ``text`` and ``label`` columns.
        no_taboo_w: Number of taboo words to keep per label. Values <= 0 give
            an empty list (the previous ``[-0:]`` slice returned every candidate).
        seed_samples_dct: Mapping ``label -> iterable of seed sentences``; must
            contain every label present in ``df_orig``.

    Returns:
        Mapping ``label -> list of taboo words`` in ascending coefficient order.
        A list may be shorter than ``no_taboo_w`` when few candidates remain.
    """
    dct_taboo_w_per_label = {}

    NER = spacy.load("en_core_web_sm")

    # Stop-word removal does not depend on the target label, so do it once
    # instead of once per label.
    cleaned_df = df_orig.copy()
    cleaned_df['text'] = cleaned_df['text'].map(cleantext)

    # we gather taboo words for each label in a one (desired label) vs. rest (other labels) setting
    labels = list(set(df_orig['label']))
    for label in labels:
        # fresh copy per label so the binarised labels never leak into the next iteration
        sub_df_orig = cleaned_df.copy()
        sub_df_orig['label'] = sub_df_orig['label'].map(lambda x: change_label_except_for(x, label))

        model, vectorizer, X = get_fit_svm_linear_and_count_vectorizer(sub_df_orig)
        freqs = get_freqs(vectorizer, X)
        coeffs = get_coefficients(model, vectorizer)

        # Collect the individual tokens of every named entity in this label's seeds.
        ners = set()
        for sent in seed_samples_dct[label]:
            res = NER(str(sent).lower())
            for ent in res.ents:
                ners.update(ent.text.lower().split())

        joined = coeffs.set_index('word').join(freqs.set_index('word'), lsuffix='_caller', rsuffix='_other')
        # Only words seen at least 5 times are candidates.
        joined_rel = joined[joined['freq'] >= 5].sort_values(by=['freq'])
        joined_rel = joined_rel.sort_values(by=['coefficient'])

        candidates = [
            word for word in joined_rel.index
            if not word.isdigit() and word not in ners
        ]

        # The tail of the ascending ordering holds the largest coefficients.
        dct_taboo_w_per_label[label] = candidates[-no_taboo_w:] if no_taboo_w > 0 else []

    return dct_taboo_w_per_label

In [9]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

def get_embs_for_sents(df_pd) -> dict:
    """Group texts by label and encode each group with the module-level ``model``.

    Args:
        df_pd: DataFrame with ``label`` and ``text`` columns.

    Returns:
        Mapping ``label -> {'emb': model.encode(texts), 'sent': texts}`` where
        ``texts`` lists the label's sentences in row order.
    """
    sents_dct = {}
    for record in df_pd.to_dict('records'):
        sents_dct.setdefault(record['label'], []).append(record['text'])

    # TODO: check if order is same
    return {
        label: {'emb': model.encode(sentences), 'sent': sentences}
        for label, sentences in sents_dct.items()
    }

def calculate_outliers(df_pd) -> dict:
    """Measure how far each sentence lies from the mean embedding of its label.

    Args:
        df_pd: DataFrame with ``label`` and ``text`` columns (passed on to
            ``get_embs_for_sents``).

    Returns:
        DataFrame with columns ``label``, ``distance`` and ``text``; ``distance``
        is the Euclidean norm between the sentence embedding and the label's
        mean embedding. (The ``dict`` annotation is kept from the original
        signature even though a DataFrame is returned.)
    """
    embs_dct = get_embs_for_sents(df_pd)

    rows = []
    for label, entry in embs_dct.items():
        # mean vector of all sentence embeddings of this label
        centroid = entry['emb'].mean(axis=0)
        for sent_emb, sent in zip(entry['emb'], entry['sent']):
            rows.append((label, np.linalg.norm(centroid - sent_emb), sent))

    return pd.DataFrame.from_dict({
        'label': [row[0] for row in rows],
        'distance': [row[1] for row in rows],
        'text': [row[2] for row in rows],
    })

def get_seed_sentences_per_labels(outliers_df, dct_phrases: dict) -> dict:
    """Pick, per label, the texts with the largest ``distance`` as new seeds.

    Args:
        outliers_df: DataFrame with ``label``, ``distance`` and ``text`` columns.
        dct_phrases: Mapping ``label -> collection of current seeds``; only its
            size per label is used, as the number of texts to select.

    Returns:
        Mapping ``label -> list of selected texts`` (largest distance first).
    """
    def _farthest_texts(label, how_many):
        label_rows = outliers_df[outliers_df['label'] == label]
        ranked = label_rows.sort_values(by=['distance'], ascending=False)
        return list(ranked.head(how_many)['text'])

    return {
        label: _farthest_texts(label, len(phrases))
        for label, phrases in dct_phrases.items()
    }

## all the good things

In [17]:
# Number of seed rows drawn per label.
N_SAMPLES = 6
# Draw N_SAMPLES random rows from every label group; no random_state is passed, so the
# selection changes on every run.
get_subsampled = df_train.groupby('label', group_keys=False).apply(lambda x: x.sample(N_SAMPLES))

# Index of the current try; it selects the output sub-directory used by the cells below.
NO_TRY = 0

# NOTE(review): the target directory is not created here — presumably it already exists; confirm.
get_subsampled.to_csv('challenge_data_new/atis/gpt4/'+str(NO_TRY)+'/seeds.csv', index=False)

In [39]:
# Run the complete generation pipeline for tries 1-4. Every try writes its files under
# challenge_data_new/atis/gpt4/<NO_TRY>/ and goes through the same stages that the
# following cells execute one by one: seeds -> two plain rounds -> taboo -> chaining -> hints.
# NOTE(review): the output directories are not created here — presumably they already exist; confirm.
for NO_TRY in range(1,5):
    N_SAMPLES = 6
    # Draw N_SAMPLES random rows per label; no random_state is passed, so each run samples differently.
    get_subsampled = df_train.groupby('label', group_keys=False).apply(lambda x: x.sample(N_SAMPLES))
    
    get_subsampled.to_csv('challenge_data_new/atis/gpt4/'+str(NO_TRY)+'/seeds.csv', index=False)
    
    # {label: [seed text, ...]}
    dct_phrases = {}
    for key in set(get_subsampled['label']):
        dct_phrases[key] = list(get_subsampled[get_subsampled['label'] == key]['text'])
    
    default_prompt = 'Rephrase an original question or statement 3 times. Original phrase: "{}".'
    
    # {label: [(prompt text, seed phrase), ...]}
    dct_final_prompts = {}
    
    for key in dct_phrases:
        dct_final_prompts[key] = []
        for phrase in dct_phrases[key]:
            dct_final_prompts[key].append((default_prompt.format(phrase), phrase))
    
    # Round 0: plain paraphrases of the seeds.
    # filter_responses expects each collected entry to be a (response, seed) pair.
    dct_responses = collect_samples(dct_final_prompts)
    
    fb_0 = filter_responses(dct_responses)
    fb_0.drop_duplicates().to_csv('challenge_data_new/atis/gpt4/'+str(NO_TRY)+'/atis_0_seeds.csv', index=False)
    
    # Round 1 (plain): the same prompts are sent a second time.
    dct_responses = collect_samples(dct_final_prompts)
    
    fb_0 = filter_responses(dct_responses)
    fb_0.drop_duplicates().to_csv('challenge_data_new/atis/gpt4/'+str(NO_TRY)+'/atis_1_seeds.csv', index=False)
    
    # Round 1 (taboo): derive 3 taboo words per label from the round-0 paraphrases.
    fb_0 = pd.read_csv('challenge_data_new/atis/gpt4/'+str(NO_TRY)+'/atis_0_seeds.csv')
    dct_phrases = fb_0.groupby('label')['seed'].apply(set).to_dict()
    dct_taboo = get_taboo_w_for_df_no_ner(fb_0, 3, dct_phrases)
    
    defaul_taboo_prompt = 'Rephrase an original question or statement 3 times. Original phrase: "{}". Don’t use the words “{}”, “{}” or “{}” in your responses.'
    
    dct_final_prompts = {}
    
    for key in dct_phrases:
        dct_final_prompts[key] = []
        for phrase in dct_phrases[key]:
            # Indexes dct_taboo[key][0..2], so every label needs at least three taboo words.
            dct_final_prompts[key].append((defaul_taboo_prompt.format(phrase, dct_taboo[key][0], dct_taboo[key][1], dct_taboo[key][2]), phrase))
    
    dct_responses = collect_samples(dct_final_prompts)
    
    fb_0 = filter_responses(dct_responses)
    # NOTE(review): written as 'atis_1_taboo._ablt.csv', while the final-CSV cell reads
    # 'atis_1_taboo.csv' — confirm which file name is intended.
    fb_0.drop_duplicates().to_csv('challenge_data_new/atis/gpt4/'+str(NO_TRY)+'/atis_1_taboo._ablt.csv', index=False)
    
    # Round 1 (chaining): the round-0 paraphrases farthest from their label's mean embedding
    # become the new seeds, as many per label as there were original seeds.
    fb_0 = pd.read_csv('challenge_data_new/atis/gpt4/'+str(NO_TRY)+'/atis_0_seeds.csv').drop_duplicates()
    dct_phrases = fb_0.groupby('label')['seed'].apply(set).to_dict()
    
    df_outliers = calculate_outliers(fb_0)
    dct_phrases = get_seed_sentences_per_labels(df_outliers, dct_phrases)
    
    default_prompt = 'Rephrase an original question or statement 3 times. Original phrase: "{}".'
    
    dct_final_prompts = {}
    
    for key in dct_phrases:
        dct_final_prompts[key] = []
        for phrase in dct_phrases[key]:
            dct_final_prompts[key].append((default_prompt.format(phrase), phrase))
    
    dct_responses = collect_samples(dct_final_prompts)
    
    fb_0 = filter_responses(dct_responses)
    fb_0.drop_duplicates().to_csv('challenge_data_new/atis/gpt4/'+str(NO_TRY)+'/atis_1_chaining.csv', index=False)
    
    # Round 1 (hints): original seeds plus example paraphrases taken from round 0.
    # NOTE(review): the continuation lines of this triple-quoted template are indented inside the
    # loop, so those four spaces are part of the prompt text; the standalone cell further down
    # defines the same template without them — confirm which form is intended.
    default_prompt = """Rephrase an original question or statement 3 times. Original phrase: "{}".
    ###
    Example paraphrases:
    {}
    ###
    """
    
    default_hint_prompt = '"{}".'
    
    def get_hint_sentences_per_labels(df_outliers, no_samples, dct_phrases):
        """Map each seed phrase to its ``no_samples`` paraphrases with the largest 'distance'."""
        dct_hints_per_sample = {}
        for label in dct_phrases.keys():
            for phrase in dct_phrases[label]:
                sub_df = df_outliers[df_outliers['seed'] == phrase] 
                sub_df = sub_df.sort_values(by=['distance'], ascending=False)
                dct_hints_per_sample[phrase] = list(sub_df.head(no_samples)['text'])
        return dct_hints_per_sample
    
    fb_0 = pd.read_csv('challenge_data_new/atis/gpt4/'+str(NO_TRY)+'/atis_0_seeds.csv').drop_duplicates()
    dct_phrases = fb_0.groupby('label')['seed'].apply(set).to_dict()
    
    # Same file and same de-duplication as above, so fb_0 is unchanged by this reload.
    fb_0 = pd.read_csv('challenge_data_new/atis/gpt4/'+str(NO_TRY)+'/atis_0_seeds.csv').drop_duplicates()
    # comb is done for the second round
    
    df_outliers = calculate_outliers(fb_0)
    
    # Attach each paraphrase's seed by joining on 'text'; keep the label column coming from df_outliers.
    df_merged = df_outliers.merge(fb_0, how='inner', on='text').drop_duplicates()[['label_x', 'text', 'distance', 'seed']]
    df_merged = df_merged.rename(columns={'label_x': 'label'})
    
    dct_hints_per_sample = get_hint_sentences_per_labels(df_merged, 3, dct_phrases)
    
    dct_final_prompts = {}
    
    for key in dct_phrases:
        dct_final_prompts[key] = []
        for phrase in dct_phrases[key]:
            hints = dct_hints_per_sample[phrase]
            # One quoted hint per line.
            str_hints = []
            for hint in hints:
                str_hints.append(default_hint_prompt.format(hint))
            final_hint_str = "\n".join(str_hints) 
            dct_final_prompts[key].append((default_prompt.format(phrase, final_hint_str), phrase))
    
    dct_responses = collect_samples(dct_final_prompts)
    
    fb_0 = filter_responses(dct_responses)
    fb_0.drop_duplicates().to_csv('challenge_data_new/atis/gpt4/'+str(NO_TRY)+'/atis_1_hints.csv', index=False)

0
Rephrase an original question or statement 3 times. Original phrase: "please explain fare code f".
Rephrase an original question or statement 3 times. Original phrase: "what does restriction ap 57".
Rephrase an original question or statement 3 times. Original phrase: "what is bur".
Rephrase an original question or statement 3 times. Original phrase: "fare code y what does that mean".
Rephrase an original question or statement 3 times. Original phrase: "what does the fare code yn mean".
Rephrase an original question or statement 3 times. Original phrase: "what does d s stand for for meals".
1
Rephrase an original question or statement 3 times. Original phrase: "can you tell me what aircraft is used for delta flight 1222 from kansas city to salt lake city".
Rephrase an original question or statement 3 times. Original phrase: "repeating leaving denver to san francisco before 10 am what type of aircraft is used".
Rephrase an original question or statement 3 times. Original phrase: "what 

In [18]:
# Group the sampled seed rows by label: {label: [seed text, ...]}.
dct_phrases = {}
for key in set(get_subsampled['label']):
    dct_phrases[key] = list(get_subsampled[get_subsampled['label'] == key]['text'])

# Plain paraphrasing prompt; "{}" is filled with the seed phrase.
default_prompt = 'Rephrase an original question or statement 3 times. Original phrase: "{}".'

In [19]:
# Build {label: [(prompt text, seed phrase), ...]} for the plain prompt.
dct_final_prompts = {}

for key in dct_phrases:
    dct_final_prompts[key] = []
    for phrase in dct_phrases[key]:
        dct_final_prompts[key].append((default_prompt.format(phrase), phrase))

In [20]:
def collect_samples(dct_final_prompts, time_sleep: int=1):
    """Query the model for every prompt, echoing progress, and keep each response with its seed.

    This definition shadows the ``collect_samples`` defined earlier in the notebook, so it
    accepts the same optional ``time_sleep`` argument; the default keeps the one-second pause.

    Args:
        dct_final_prompts: Mapping ``label -> list of (prompt_text, seed_phrase)``.
        time_sleep: Seconds to pause after each request.

    Returns:
        Mapping ``label -> list of (response, seed_phrase)``.
    """
    dct_responses = {}

    for idx, key in enumerate(dct_final_prompts):
        # Progress output: label index, then every prompt that is sent.
        print(str(idx))
        dct_responses[key] = []
        for prompt in dct_final_prompts[key]:
            print(prompt[0])
            response = request_with_checks(prompt[0])
            dct_responses[key].append((response, prompt[1]))
            time.sleep(time_sleep)

    return dct_responses

In [21]:
# Round 0: send every plain prompt once.
dct_responses = collect_samples(dct_final_prompts)

0
Rephrase an original question or statement 3 times. Original phrase: "what does ord mean".
Rephrase an original question or statement 3 times. Original phrase: "what does mco stand for".
Rephrase an original question or statement 3 times. Original phrase: "what does the fare code y mean".
Rephrase an original question or statement 3 times. Original phrase: "what are fare codes qw and qx".
Rephrase an original question or statement 3 times. Original phrase: "what is mco".
Rephrase an original question or statement 3 times. Original phrase: "what does dfw mean".
1
Rephrase an original question or statement 3 times. Original phrase: "what kind of aircraft is used on the first class american airlines flight from philadelphia to san francisco stopping in dallas".
Rephrase an original question or statement 3 times. Original phrase: "what type of aircraft leaves from boston to washington dc at 9 am during a weekday".
Rephrase an original question or statement 3 times. Original phrase: "leav

In [22]:
# Flatten and clean the round-0 responses, then store them without duplicate rows.
fb_0 = filter_responses(dct_responses)
fb_0.drop_duplicates().to_csv('challenge_data_new/atis/gpt4/'+str(NO_TRY)+'/atis_0_seeds.csv', index=False)

In [23]:
# Round 1 (plain): the same prompts are sent a second time.
dct_responses = collect_samples(dct_final_prompts)

0
Rephrase an original question or statement 3 times. Original phrase: "what does ord mean".
Rephrase an original question or statement 3 times. Original phrase: "what does mco stand for".
Rephrase an original question or statement 3 times. Original phrase: "what does the fare code y mean".
Rephrase an original question or statement 3 times. Original phrase: "what are fare codes qw and qx".
Rephrase an original question or statement 3 times. Original phrase: "what is mco".
Rephrase an original question or statement 3 times. Original phrase: "what does dfw mean".
1
Rephrase an original question or statement 3 times. Original phrase: "what kind of aircraft is used on the first class american airlines flight from philadelphia to san francisco stopping in dallas".
Rephrase an original question or statement 3 times. Original phrase: "what type of aircraft leaves from boston to washington dc at 9 am during a weekday".
Rephrase an original question or statement 3 times. Original phrase: "leav

In [24]:
# Flatten and clean the second plain round, then store it without duplicate rows.
fb_0 = filter_responses(dct_responses)
fb_0.drop_duplicates().to_csv('challenge_data_new/atis/gpt4/'+str(NO_TRY)+'/atis_1_seeds.csv', index=False)

In [26]:
# Reload the round-0 paraphrases, rebuild {label: set of seed phrases} and derive
# 3 taboo words per label from them.
fb_0 = pd.read_csv('challenge_data_new/atis/gpt4/'+str(NO_TRY)+'/atis_0_seeds.csv')
dct_phrases = fb_0.groupby('label')['seed'].apply(set).to_dict()
dct_taboo = get_taboo_w_for_df_no_ner(fb_0, 3, dct_phrases)

In [27]:
# Taboo prompt: the seed phrase followed by three words the model must avoid.
defaul_taboo_prompt = 'Rephrase an original question or statement 3 times. Original phrase: "{}". Don’t use the words “{}”, “{}” or “{}” in your responses.'

dct_final_prompts = {}

for key in dct_phrases:
    dct_final_prompts[key] = []
    for phrase in dct_phrases[key]:
        # Indexes dct_taboo[key][0..2], so every label needs at least three taboo words.
        dct_final_prompts[key].append((defaul_taboo_prompt.format(phrase, dct_taboo[key][0], dct_taboo[key][1], dct_taboo[key][2]), phrase))

In [29]:
# Round 1 (taboo): send the taboo prompts.
dct_responses = collect_samples(dct_final_prompts)

0
Rephrase an original question or statement 3 times. Original phrase: "what does the fare code y mean". Don’t use the words “meaning”, “fare” or “mco” in your responses.
Rephrase an original question or statement 3 times. Original phrase: "what does ord mean". Don’t use the words “meaning”, “fare” or “mco” in your responses.
Rephrase an original question or statement 3 times. Original phrase: "what does mco stand for". Don’t use the words “meaning”, “fare” or “mco” in your responses.
Rephrase an original question or statement 3 times. Original phrase: "what are fare codes qw and qx". Don’t use the words “meaning”, “fare” or “mco” in your responses.
Rephrase an original question or statement 3 times. Original phrase: "what does dfw mean". Don’t use the words “meaning”, “fare” or “mco” in your responses.
Rephrase an original question or statement 3 times. Original phrase: "what is mco". Don’t use the words “meaning”, “fare” or “mco” in your responses.
1
Rephrase an original question or 

In [30]:
# Flatten and clean the taboo round, then store it without duplicate rows.
# NOTE(review): written as 'atis_1_taboo._ablt.csv', while the final-CSV cell reads
# 'atis_1_taboo.csv' — confirm which file name is intended.
fb_0 = filter_responses(dct_responses)
fb_0.drop_duplicates().to_csv('challenge_data_new/atis/gpt4/'+str(NO_TRY)+'/atis_1_taboo._ablt.csv', index=False)

In [31]:
# Reload the round-0 paraphrases and rebuild {label: set of seed phrases}.
fb_0 = pd.read_csv('challenge_data_new/atis/gpt4/'+str(NO_TRY)+'/atis_0_seeds.csv').drop_duplicates()
dct_phrases = fb_0.groupby('label')['seed'].apply(set).to_dict()

# Chaining: replace the seeds of every label by the round-0 paraphrases with the largest
# distance to the label's mean embedding (as many as there were seeds).
df_outliers = calculate_outliers(fb_0)
dct_phrases = get_seed_sentences_per_labels(df_outliers, dct_phrases)

In [32]:
# Plain prompt again, this time filled with the chained seeds selected above.
default_prompt = 'Rephrase an original question or statement 3 times. Original phrase: "{}".'

dct_final_prompts = {}

for key in dct_phrases:
    dct_final_prompts[key] = []
    for phrase in dct_phrases[key]:
        dct_final_prompts[key].append((default_prompt.format(phrase), phrase))

In [33]:
# Round 1 (chaining): send the prompts built from the chained seeds.
dct_responses = collect_samples(dct_final_prompts)

0
Rephrase an original question or statement 3 times. Original phrase: "what is the meaning of ord".
Rephrase an original question or statement 3 times. Original phrase: "can you define the term ord".
Rephrase an original question or statement 3 times. Original phrase: "what does mco stand for".
Rephrase an original question or statement 3 times. Original phrase: "what do fare codes qw and qx represent".
Rephrase an original question or statement 3 times. Original phrase: "what is the meaning of dfw".
Rephrase an original question or statement 3 times. Original phrase: "whats the significance of the word ord".
1
Rephrase an original question or statement 3 times. Original phrase: "present the aircraft operated by canadian aviation firms".
Rephrase an original question or statement 3 times. Original phrase: "display the airplanes utilized by canadian airline companies".
Rephrase an original question or statement 3 times. Original phrase: "reveal to me the planes used by airlines in cana

In [34]:
# Flatten and clean the chaining round, then store it without duplicate rows.
fb_0 = filter_responses(dct_responses)
fb_0.drop_duplicates().to_csv('challenge_data_new/atis/gpt4/'+str(NO_TRY)+'/atis_1_chaining.csv', index=False)

In [35]:
# Hint prompt: first "{}" is the seed phrase, second "{}" is a block of example paraphrases.
default_prompt = """Rephrase an original question or statement 3 times. Original phrase: "{}".
###
Example paraphrases:
{}
###
"""

# Format of a single example line inside the hint block.
default_hint_prompt = '"{}".'

def get_hint_sentences_per_labels(df_outliers, no_samples, dct_phrases):
    """Choose example paraphrases ("hints") for every seed phrase.

    Args:
        df_outliers: DataFrame with ``seed``, ``distance`` and ``text`` columns.
        no_samples: Maximum number of hints per seed phrase.
        dct_phrases: Mapping ``label -> collection of seed phrases``.

    Returns:
        Mapping ``seed phrase -> list of texts`` generated from that seed,
        largest ``distance`` first, at most ``no_samples`` entries.
    """
    dct_hints_per_sample = {}
    for phrases in dct_phrases.values():
        for phrase in phrases:
            from_this_seed = df_outliers[df_outliers['seed'] == phrase]
            ranked = from_this_seed.sort_values(by=['distance'], ascending=False)
            dct_hints_per_sample[phrase] = list(ranked.head(no_samples)['text'])
    return dct_hints_per_sample

# Reload the round-0 paraphrases and rebuild {label: set of seed phrases}.
fb_0 = pd.read_csv('challenge_data_new/atis/gpt4/'+str(NO_TRY)+'/atis_0_seeds.csv').drop_duplicates()
dct_phrases = fb_0.groupby('label')['seed'].apply(set).to_dict()

# Same file and same de-duplication as above, so fb_0 is unchanged by this reload.
fb_0 = pd.read_csv('challenge_data_new/atis/gpt4/'+str(NO_TRY)+'/atis_0_seeds.csv').drop_duplicates()
# comb is done for the second round

df_outliers = calculate_outliers(fb_0)

# Attach each paraphrase's seed by joining on 'text'; keep the label column coming from df_outliers.
df_merged = df_outliers.merge(fb_0, how='inner', on='text').drop_duplicates()[['label_x', 'text', 'distance', 'seed']]
df_merged = df_merged.rename(columns={'label_x': 'label'})

# For every seed phrase, pick up to 3 hint paraphrases (largest 'distance' first).
dct_hints_per_sample = get_hint_sentences_per_labels(df_merged, 3, dct_phrases)

In [36]:
# Build {label: [(hint prompt text, seed phrase), ...]}.
dct_final_prompts = {}

for key in dct_phrases:
    dct_final_prompts[key] = []
    for phrase in dct_phrases[key]:
        hints = dct_hints_per_sample[phrase]
        # One quoted hint per line.
        str_hints = []
        for hint in hints:
            str_hints.append(default_hint_prompt.format(hint))
        final_hint_str = "\n".join(str_hints) 
        dct_final_prompts[key].append((default_prompt.format(phrase, final_hint_str), phrase))

In [37]:
# Round 1 (hints): send the prompts that include example paraphrases.
dct_responses = collect_samples(dct_final_prompts)

0
Rephrase an original question or statement 3 times. Original phrase: "what does the fare code y mean".
###
Example paraphrases:
"what does the y fare code signify".
"can you explain the meaning of the fare code y".
"what is the significance of the fare code y".
###

Rephrase an original question or statement 3 times. Original phrase: "what does ord mean".
###
Example paraphrases:
"what is the meaning of ord".
"can you define the term ord".
"whats the significance of the word ord".
###

Rephrase an original question or statement 3 times. Original phrase: "what does mco stand for".
###
Example paraphrases:
"what is the meaning of the acronym mco".
"can you tell me what mco represents".
"what are the words represented by the abbreviation mco".
###

Rephrase an original question or statement 3 times. Original phrase: "what are fare codes qw and qx".
###
Example paraphrases:
"what do fare codes qw and qx represent".
"could you provide information on fare codes qw and qx".
"can you explain

6
Rephrase an original question or statement 3 times. Original phrase: "how many cities are served by continental with first class flights".
###
Example paraphrases:
"how many cities does continental offer first class flights to".
"in how many cities does continental provide first class flight service".
"what is the total number of cities that continental serves with first class flights".
###

Rephrase an original question or statement 3 times. Original phrase: "how many fare codes belong to economy class".
###
Example paraphrases:
"in the economy class how numerous are the fare codes".
"how many fare codes are assigned specifically to the economy class".
"what is the total number of fare codes associated with the economy class".
###

Rephrase an original question or statement 3 times. Original phrase: "how many first class flights does united airlines have departing from boston today".
###
Example paraphrases:
"can you tell me the count of united airlines firstclass flights taking off

In [38]:
# Flatten and clean the hints round, then store it without duplicate rows.
fb_0 = filter_responses(dct_responses)
fb_0.drop_duplicates().to_csv('challenge_data_new/atis/gpt4/'+str(NO_TRY)+'/atis_1_hints.csv', index=False)

## final csvs

In [68]:
# temporary only
# Rebuilds a seeds table (columns 'text', 'label') from the 'seed' column of the round-0 file
# and applies the same text normalisation that filter_responses uses, plus strip().
import re
import string

NO_TRY=0

fb_0 = pd.read_csv('challenge_data_new/atis/gpt4/'+str(NO_TRY)+'/atis_0_seeds.csv')
# One row per distinct (seed, label) pair; the seed becomes the 'text' column.
fb_0 = fb_0[['seed', 'label']].drop_duplicates()
fb_0 = fb_0.rename(columns={'seed': 'text'})

fb_0['text']=fb_0['text'].apply(lambda x: x.lower())
fb_0['text']=fb_0['text'].apply(lambda x: x.strip())
fb_0['text']=fb_0['text'].apply(lambda x: x.replace('"',''))
fb_0['text']=fb_0['text'].apply(lambda x: re.sub('[%s]' % re.escape(string.punctuation), '', x))

# NOTE(review): this writes below 'atis/gpt/', whereas every other cell reads and writes
# 'atis/gpt4/' (the next cell loads 'gpt4/<NO_TRY>/seeds.csv') — confirm the directory is intended.
fb_0.to_csv('challenge_data_new/atis/gpt/'+str(NO_TRY)+'/seeds.csv', index=False)

In [69]:
# Load seeds and both plain rounds; each frame drops rows with missing values and is shuffled
# with a fixed random_state, so the row order is repeatable.
fb_seeds = pd.read_csv('challenge_data_new/atis/gpt4/'+str(NO_TRY)+'/seeds.csv').dropna().reset_index(drop=True).sample(frac=1, replace=False, random_state=1)
fb_0 = pd.read_csv('challenge_data_new/atis/gpt4/'+str(NO_TRY)+'/atis_0_seeds.csv').dropna().reset_index(drop=True).sample(frac=1, replace=False, random_state=1)
fb_1 = pd.read_csv('challenge_data_new/atis/gpt4/'+str(NO_TRY)+'/atis_1_seeds.csv').dropna().reset_index(drop=True).sample(frac=1, replace=False, random_state=1)

# Combine round 0, round 1 and the seeds; a text that occurs more than once is kept a single time.
fb = pd.concat([fb_0, fb_1, fb_seeds], ignore_index=True).drop_duplicates('text').dropna(subset=['text'])

In [70]:
# Shuffle all rows (no random_state here, so the order differs between runs) and remove
# any remaining double quotes from the text.
fb = fb.sample(frac=1).reset_index(drop=True)
fb['text'] = fb['text'].apply(lambda x: x.replace('"',''))

In [71]:
# Final data set for the plain-prompt variant: only the 'text' and 'label' columns are written.
fb[['text', 'label']].to_csv('challenge_data_new/atis/gpt4/'+str(NO_TRY)+'/atis_prompt.csv', index=False)

In [72]:
# Final data set for the taboo variant: round 0 + taboo round + seeds.
# fb_0 and fb_seeds are the frames loaded in the earlier final-CSV cell.
# NOTE(review): the generation cells write 'atis_1_taboo._ablt.csv', but this reads
# 'atis_1_taboo.csv' — confirm the file exists under this name.
fb_1 = pd.read_csv('challenge_data_new/atis/gpt4/'+str(NO_TRY)+'/atis_1_taboo.csv').dropna(subset=['text']).reset_index(drop=True).sample(frac=1, replace=False, random_state=1)
# Drop rows whose text contains "paraphra" (presumably model meta-text rather than a paraphrase — confirm).
fb_1 = fb_1[~fb_1['text'].str.contains("paraphra")]


fb = pd.concat([fb_0, fb_1, fb_seeds], ignore_index=True).drop_duplicates('text').dropna(subset=['text'])

# Unseeded shuffle, then strip double quotes.
fb = fb.sample(frac=1).reset_index(drop=True)
fb['text'] = fb['text'].apply(lambda x: x.replace('"',''))

fb[['text', 'label']].to_csv('challenge_data_new/atis/gpt4/'+str(NO_TRY)+'/atis_taboo.csv', index=False)

In [73]:
# Final data set for the chaining variant: round 0 + chaining round + seeds.
# fb_0 and fb_seeds are the frames loaded in the earlier final-CSV cell.
fb_1 = pd.read_csv('challenge_data_new/atis/gpt4/'+str(NO_TRY)+'/atis_1_chaining.csv').dropna(subset=['text']).reset_index(drop=True).sample(frac=1, replace=False, random_state=1)
# Drop rows whose text contains "paraphra" (presumably model meta-text rather than a paraphrase — confirm).
fb_1 = fb_1[~fb_1['text'].str.contains("paraphra")]

fb = pd.concat([fb_0, fb_1, fb_seeds], ignore_index=True).drop_duplicates('text').dropna(subset=['text'])

# Unseeded shuffle, then strip double quotes.
fb = fb.sample(frac=1).reset_index(drop=True)
fb['text'] = fb['text'].apply(lambda x: x.replace('"',''))

fb[['text', 'label']].to_csv('challenge_data_new/atis/gpt4/'+str(NO_TRY)+'/atis_chaining.csv', index=False)

In [74]:
# Final data set for the hints variant: round 0 + hints round + seeds.
# fb_0 and fb_seeds are the frames loaded in the earlier final-CSV cell.
fb_1 = pd.read_csv('challenge_data_new/atis/gpt4/'+str(NO_TRY)+'/atis_1_hints.csv').dropna(subset=['text']).reset_index(drop=True).sample(frac=1, replace=False, random_state=1)
# Drop rows whose text contains "paraphra" (presumably model meta-text rather than a paraphrase — confirm).
fb_1 = fb_1[~fb_1['text'].str.contains("paraphra")]

fb = pd.concat([fb_0, fb_1, fb_seeds], ignore_index=True).drop_duplicates('text').dropna(subset=['text'])

# Unseeded shuffle, then strip double quotes.
fb = fb.sample(frac=1).reset_index(drop=True)
fb['text'] = fb['text'].apply(lambda x: x.replace('"',''))

fb[['text', 'label']].to_csv('challenge_data_new/atis/gpt4/'+str(NO_TRY)+'/atis_hints.csv', index=False)