In [1]:
import openai

import pandas as pd
import os
import numpy as np
from ast import literal_eval
import json
import pickle
import time

## Only GPT

In [3]:
from openai import OpenAI

client = OpenAI(
  organization='', api_key=""
)

In [4]:
def request_response_from_gpt(prompt):
    response = client.chat.completions.create(
        model="gpt-4-0314",
        #prompt=prompt,
        messages=[
            {"role": "system", "content": "You are a crowdsourcing worker that earns a living through creating paraphrases."},
            {"role": "user", "content": prompt}],
        temperature=1,
        frequency_penalty=0.0,
        presence_penalty=1.5,
        n=1)
    return response

In [5]:
def request_with_checks(prompt):
    success = False
    count = 0
    while not success:
        if count > 0:
            print(f'Retrying with again. Current number of retries: {count}')
        if count >= 10:
            raise Exception('Too many attempts')
        try:
            response = request_response_from_gpt(prompt)
            sucess = True
            break
        except openai.error.RateLimitError as e:
            print(e)
            time.sleep(10)
            count += 1
        except openai.error.APIConnectionError as e:
            print(e)
            time.sleep(5)
            count += 1
        except openai.error.APIError or openai.error.JSONDecodeError as e:
            print(e)
            time.sleep(5)
            count += 1
        except openai.error.Timeout as e:
            print(e)
            time.sleep(5)
            count += 1
        except openai.error.ServiceUnavailableError as e:
            print(e)
            time.sleep(5)
            count += 1
    return response

In [6]:
def collect_samples(dct_final_prompts, time_sleep: int=1):
    dct_responses = {}
    for idx, key in enumerate(dct_final_prompts):
        print("Now on label no. {} out of {}.".format(idx, len(dct_final_prompts.keys())))
        dct_responses[key] = []
        for prompt in dct_final_prompts[key]:
            #print(prompt)
            response = request_with_checks(prompt)
            dct_responses[key].append(response)
            time.sleep(time_sleep)
    return dct_responses

In [7]:
import re
import string

def filter_responses(dct_responses):
    dct_df = {'label': [], 'text': [], 'seed': []}
    for key in dct_responses:
        for responses in dct_responses[key]:
            for response in responses[0].choices:
                contents = response.message.content.split('\n')
                for content in contents:
                    if len(content) == 0:
                        continue
                    if content[0] == '1' or content[0] == '2' or content[0] == '3':
                        content = content[3:]
                    dct_df['label'].append(key)
                    dct_df['text'].append(content)
                    dct_df['seed'].append(responses[1])
                    
    fb_0 = pd.DataFrame.from_dict(dct_df)
                    
    fb_0['text']=fb_0['text'].apply(lambda x: x.lower())
    fb_0['text']=fb_0['text'].apply(lambda x: x.replace('"',''))

    fb_0['text']=fb_0['text'].apply(lambda x: re.sub('[%s]' % re.escape(string.punctuation), '', x))
    
    return fb_0

# rest prepare

In [8]:
def change_label_except_for(x, label):
    if x == label:
        return 1
    else:
        return 0
    
from nltk.corpus import stopwords

stops = set(stopwords.words("english"))
def cleantext(string):
    text = string.split()
    text = [w for w in text if not w in stops]
    text = " ".join(text)
    return text

from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer

def get_fit_svm_linear_and_count_vectorizer(df_orig):
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(df_orig['text'])
    x=X.toarray()
    y=df_orig['label']
    model=svm.SVC(kernel='linear')
    model.fit(x,y)
    return model, vectorizer, X

def get_freqs(vectorizer, X):
    feature_names = vectorizer.get_feature_names_out()
    dct = {'word': [], 'freq': []}
    for freq, word in zip(np.asarray(X.sum(axis=0))[0], feature_names):
        dct['word'].append(word)
        dct['freq'].append(freq)

    return pd.DataFrame.from_dict(dct)

def get_coefficients(model, vectorizer):
    feature_names = vectorizer.get_feature_names_out()
    coefs_with_fns = sorted(zip(model.coef_[0], feature_names)) 
    df=pd.DataFrame(coefs_with_fns)
    df.columns='coefficient','word'
    return df.sort_values(by='coefficient')

import spacy
from spacy import displacy

# use random=True for ablated version of the method
def get_taboo_w_for_df_no_ner(df_orig, no_taboo_w, seed_samples_dct, random=False):
    dct_taboo_w_per_label = {}

    NER = spacy.load("en_core_web_sm")

    # we gather taboowords for each label in a one (desired label) vs. one (other labels) setting
    labels = list(set(df_orig['label']))
    for label in labels:
        # reset dataframe
        sub_df_orig = df_orig.copy()
        sub_df_orig['text'] = sub_df_orig['text'].map(lambda x: cleantext(x))
        # set setting to one vs one
        sub_df_orig['label'] = sub_df_orig['label'].map(lambda x: change_label_except_for(x, label))

        model, vectorizer, X = get_fit_svm_linear_and_count_vectorizer(sub_df_orig)
        freqs = get_freqs(vectorizer, X)
        coeffs = get_coefficients(model, vectorizer)
        
        sents = seed_samples_dct[label]
        ners = set()
        for sent in sents:
            res = NER(sent.lower())
            for txt in res.ents:
                subs = set(txt.text.lower().split())
                ners = ners.union(subs)
            
        joined = coeffs.set_index('word').join(freqs.set_index('word'), lsuffix='_caller', rsuffix='_other')
        joined_rel = joined[joined['freq'] >= 5].sort_values(by=['freq'])

        if random:
            joined_rel = joined_rel.sample(frac=1)
        else:   
            joined_rel = joined_rel.sort_values(by=['coefficient'])
            
        taboo_w_without_ints = list(joined_rel.index)
        taboo_w_without_ints = list(filter(lambda word: not word.isdigit(), taboo_w_without_ints))
        
        taboo_w_without_ners = list(filter(lambda word: not word in ners, taboo_w_without_ints))
        
        dct_taboo_w_per_label[label] = taboo_w_without_ners[-no_taboo_w:]
        
    return dct_taboo_w_per_label

In [9]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

def get_embs_for_sents(df_pd) -> dict:
    sents_dct = {}
    emb_dct = {}

    for dct in df_pd.to_dict('records'):
        if dct['label'] in sents_dct:
            sents_dct[dct['label']].append(dct['text'])
        else:
            sents_dct[dct['label']] = [dct['text']]
            
    for label in sents_dct.keys():
        emb_dct[label] = {'emb': model.encode(sents_dct[label]), 'sent': sents_dct[label]}
    return emb_dct

def calculate_outliers(df_pd) -> dict:
    embs_dct = get_embs_for_sents(df_pd)
    mean_dct = {}
    pandas_dct = {'label': [], 'distance': [], 'text': []}
    
    # calculate mean vector per label
    for label in embs_dct:
        mean_dct[label] = embs_dct[label]['emb'].mean(axis=0)
        
    # calculate distance from the mean vector per label
    for label in embs_dct:
        mean_emb = mean_dct[label]
        for (sent_emb, sent) in zip(embs_dct[label]['emb'], embs_dct[label]['sent']):
            dist = np.linalg.norm(mean_emb - sent_emb)
            pandas_dct['label'].append(label)
            pandas_dct['distance'].append(dist)
            pandas_dct['text'].append(sent)                        
    return pd.DataFrame.from_dict(pandas_dct)

def get_seed_sentences_per_labels(outliers_df, dct_phrases: dict, random=False) -> dict:
    dct_seeds_per_label = {}
    for label in dct_phrases.keys():
        no_samples = len(dct_phrases[label])
        if random:
            sub_outlier_df = outliers_df[outliers_df['label'] == label].sample(frac=1)
        else:    
            sub_outlier_df = outliers_df[outliers_df['label'] == label].sort_values(by=['distance'], ascending=False)
        dct_seeds_per_label[label] = list(sub_outlier_df.head(no_samples)['text'])
    return dct_seeds_per_label

## all the good things

In [54]:
NO_TRY=0

In [55]:
df_train = pd.read_csv('challenge_data_new/fb_under/full_fb_orig_train.csv')
N_SAMPLES = 6
get_subsampled = df_train.groupby('label', group_keys=False).apply(lambda x: x.sample(N_SAMPLES))

In [56]:
get_subsampled.to_csv('challenge_data_new/fb_under/gpt4/'+str(NO_TRY)+'/seeds.csv', index=False)

In [57]:
dct_phrases = {}
for key in set(get_subsampled['label']):
    dct_phrases[key] = list(get_subsampled[get_subsampled['label'] == key]['text'])

default_prompt = 'Rephrase an original question or statement 3 times. Original phrase: "{}".'

In [58]:
dct_final_prompts = {}

for key in dct_phrases:
    dct_final_prompts[key] = []
    for phrase in dct_phrases[key]:
        dct_final_prompts[key].append((default_prompt.format(phrase), phrase))

In [59]:
def collect_samples(dct_final_prompts):
    dct_responses = {}

    for idx, key in enumerate(dct_final_prompts):
        print(str(idx))
        dct_responses[key] = []
        for prompt in dct_final_prompts[key]:
            print(prompt[0])
            response = request_with_checks(prompt[0])
            dct_responses[key].append((response, prompt[1]))
            time.sleep(1)
            
    return dct_responses

In [60]:
dct_responses = collect_samples(dct_final_prompts)

0
Rephrase an original question or statement 3 times. Original phrase: "tell me the fastest way to get home".
Rephrase an original question or statement 3 times. Original phrase: "Is there a way to get from 3rd Ave & Main St Seattle to the Future concert without construction ?".
Rephrase an original question or statement 3 times. Original phrase: "get me driving directions to Kennedy Space Center in Houston".
Rephrase an original question or statement 3 times. Original phrase: "Can I get to the farmer 's market avoiding construction by taking I - 90".
Rephrase an original question or statement 3 times. Original phrase: "WHAT IS THE BEST ROUTE TO TAKE TO GET TO THE NEW YORK YANKEES STADIUM BY 10 AM".
Rephrase an original question or statement 3 times. Original phrase: "Is there a fast way to get to the airport for a 9 pm flight today ?".
1
Rephrase an original question or statement 3 times. Original phrase: "How many miles is it from Detroit to Flint".
Rephrase an original question or s

In [61]:
fb_0 = filter_responses(dct_responses)
fb_0.drop_duplicates().to_csv('challenge_data_new/fb_under/gpt4/'+str(NO_TRY)+'/fb_1_seeds.csv', index=False)

In [62]:
dct_responses = collect_samples(dct_final_prompts)

0
Rephrase an original question or statement 3 times. Original phrase: "tell me the fastest way to get home".
Rephrase an original question or statement 3 times. Original phrase: "Is there a way to get from 3rd Ave & Main St Seattle to the Future concert without construction ?".
Rephrase an original question or statement 3 times. Original phrase: "get me driving directions to Kennedy Space Center in Houston".
Rephrase an original question or statement 3 times. Original phrase: "Can I get to the farmer 's market avoiding construction by taking I - 90".
Rephrase an original question or statement 3 times. Original phrase: "WHAT IS THE BEST ROUTE TO TAKE TO GET TO THE NEW YORK YANKEES STADIUM BY 10 AM".
Rephrase an original question or statement 3 times. Original phrase: "Is there a fast way to get to the airport for a 9 pm flight today ?".
1
Rephrase an original question or statement 3 times. Original phrase: "How many miles is it from Detroit to Flint".
Rephrase an original question or s

In [63]:
fb_0 = filter_responses(dct_responses)
fb_0.drop_duplicates().to_csv('challenge_data_new/fb_under/gpt4/'+str(NO_TRY)+'/fb_0_seeds.csv', index=False)

In [64]:
fb_0 = pd.read_csv('challenge_data_new/fb_under/gpt4/'+str(NO_TRY)+'/fb_0_seeds.csv')
dct_phrases = fb_0.groupby('label')['seed'].apply(set).to_dict()
dct_taboo = get_taboo_w_for_df_no_ner(fb_0, 3, dct_phrases)

In [65]:
defaul_taboo_prompt = 'Rephrase an original question or statement 3 times. Original phrase: "{}". Don’t use the words “{}”, “{}” or “{}” in your responses.'

dct_final_prompts = {}

for key in dct_phrases:
    dct_final_prompts[key] = []
    for phrase in dct_phrases[key]:
        dct_final_prompts[key].append((defaul_taboo_prompt.format(phrase, dct_taboo[key][0], dct_taboo[key][1], dct_taboo[key][2]), phrase))

In [66]:
dct_responses = collect_samples(dct_final_prompts)

0
Rephrase an original question or statement 3 times. Original phrase: "get me driving directions to Kennedy Space Center in Houston". Don’t use the words “arrive”, “construction” or “house” in your responses.
Rephrase an original question or statement 3 times. Original phrase: "Can I get to the farmer 's market avoiding construction by taking I - 90". Don’t use the words “arrive”, “construction” or “house” in your responses.
Rephrase an original question or statement 3 times. Original phrase: "WHAT IS THE BEST ROUTE TO TAKE TO GET TO THE NEW YORK YANKEES STADIUM BY 10 AM". Don’t use the words “arrive”, “construction” or “house” in your responses.
Rephrase an original question or statement 3 times. Original phrase: "tell me the fastest way to get home". Don’t use the words “arrive”, “construction” or “house” in your responses.
Rephrase an original question or statement 3 times. Original phrase: "Is there a fast way to get to the airport for a 9 pm flight today ?". Don’t use the words “

In [67]:
fb_0 = filter_responses(dct_responses)
fb_0.drop_duplicates().to_csv('challenge_data_new/fb_under/gpt4/'+str(NO_TRY)+'/fb_1_taboo.csv', index=False)

In [68]:
fb_0 = pd.read_csv('challenge_data_new/fb_under/gpt4/'+str(NO_TRY)+'/fb_0_seeds.csv').drop_duplicates()
dct_phrases = fb_0.groupby('label')['seed'].apply(set).to_dict()

df_outliers = calculate_outliers(fb_0)
dct_phrases = get_seed_sentences_per_labels(df_outliers, dct_phrases)

In [69]:
default_prompt = 'Rephrase an original question or statement 3 times. Original phrase: "{}".'

dct_final_prompts = {}

for key in dct_phrases:
    dct_final_prompts[key] = []
    for phrase in dct_phrases[key]:
        dct_final_prompts[key].append((default_prompt.format(phrase), phrase))

In [70]:
dct_responses = collect_samples(dct_final_prompts)

0
Rephrase an original question or statement 3 times. Original phrase: "please share the most rapid means of getting back to my dwelling".
Rephrase an original question or statement 3 times. Original phrase: "which direction should one choose for arriving at the new york yankees stadium before 10 am".
Rephrase an original question or statement 3 times. Original phrase: "i need guidance on how to drive to kennedy space center situated in houston".
Rephrase an original question or statement 3 times. Original phrase: "provide me with driving instructions to reach kennedy space center located in houston".
Rephrase an original question or statement 3 times. Original phrase: "can you offer navigational assistance for reaching kennedy space center at houston".
Rephrase an original question or statement 3 times. Original phrase: "can taking i90 allow me to avoid construction and arrive at the farmers market".
1
Rephrase an original question or statement 3 times. Original phrase: "what is the l

In [71]:
fb_0 = filter_responses(dct_responses)
fb_0.drop_duplicates().to_csv('challenge_data_new/fb_under/gpt4/'+str(NO_TRY)+'/fb_1_chaining.csv', index=False)

In [72]:
default_prompt = """Rephrase an original question or statement 3 times. Original phrase: "{}".
###
Example paraphrases:
{}
###
"""

default_hint_prompt = '"{}".'

#use random=True for ablated version
def get_hint_sentences_per_labels(df_outliers, no_samples, dct_phrases, random=False):
    dct_hints_per_sample = {}
    for label in dct_phrases.keys():
        for phrase in dct_phrases[label]:
            sub_df = df_outliers[df_outliers['seed'] == phrase] 
            if random:
                sub_df = sub_df.sample(frac=1)
            else:
                sub_df = sub_df.sort_values(by=['distance'], ascending=False)
            dct_hints_per_sample[phrase] = list(sub_df.head(no_samples)['text'])
    return dct_hints_per_sample

fb_0 = pd.read_csv('challenge_data_new/fb_under/gpt4/'+str(NO_TRY)+'/fb_0_seeds.csv').drop_duplicates()
dct_phrases = fb_0.groupby('label')['seed'].apply(set).to_dict()

fb_0 = pd.read_csv('challenge_data_new/fb_under/gpt4/'+str(NO_TRY)+'/fb_0_seeds.csv').drop_duplicates()
# comb is done for the second round

df_outliers = calculate_outliers(fb_0)

df_merged = df_outliers.merge(fb_0, how='inner', on='text').drop_duplicates()[['label_x', 'text', 'distance', 'seed']]
df_merged = df_merged.rename(columns={'label_x': 'label'})

dct_hints_per_sample = get_hint_sentences_per_labels(df_merged, 3, dct_phrases)

In [73]:
dct_final_prompts = {}

for key in dct_phrases:
    dct_final_prompts[key] = []
    for phrase in dct_phrases[key]:
        hints = dct_hints_per_sample[phrase]
        str_hints = []
        for hint in hints:
            str_hints.append(default_hint_prompt.format(hint))
        final_hint_str = "\n".join(str_hints) 
        dct_final_prompts[key].append((default_prompt.format(phrase, final_hint_str), phrase))

In [74]:
dct_responses = collect_samples(dct_final_prompts)

0
Rephrase an original question or statement 3 times. Original phrase: "get me driving directions to Kennedy Space Center in Houston".
###
Example paraphrases:
"i need guidance on how to drive to kennedy space center situated in houston".
"provide me with driving instructions to reach kennedy space center located in houston".
"can you offer navigational assistance for reaching kennedy space center at houston".
###

Rephrase an original question or statement 3 times. Original phrase: "Can I get to the farmer 's market avoiding construction by taking I - 90".
###
Example paraphrases:
"can taking i90 allow me to avoid construction and arrive at the farmers market".
"by using i90 can i bypass construction and make it to the farmers market".
"is it possible to reach the farmers market without encountering construction if i take i90".
###

Rephrase an original question or statement 3 times. Original phrase: "WHAT IS THE BEST ROUTE TO TAKE TO GET TO THE NEW YORK YANKEES STADIUM BY 10 AM".
###

In [75]:
fb_0 = filter_responses(dct_responses)
fb_0.drop_duplicates().to_csv('challenge_data_new/fb_under/gpt4/'+str(NO_TRY)+'/fb_1_hints.csv', index=False)

In [76]:
for NO_TRY in range(3,5):
    df_train = pd.read_csv('challenge_data_new/fb_under/full_fb_orig_train.csv')
    N_SAMPLES = 6
    get_subsampled = df_train.groupby('label', group_keys=False).apply(lambda x: x.sample(N_SAMPLES))
    get_subsampled.to_csv('challenge_data_new/fb_under/gpt4/'+str(NO_TRY)+'/seeds.csv', index=False)
    
    dct_phrases = {}
    for key in set(get_subsampled['label']):
        dct_phrases[key] = list(get_subsampled[get_subsampled['label'] == key]['text'])
    
    default_prompt = 'Rephrase an original question or statement 3 times. Original phrase: "{}".'
    
    dct_final_prompts = {}
    
    for key in dct_phrases:
        dct_final_prompts[key] = []
        for phrase in dct_phrases[key]:
            dct_final_prompts[key].append((default_prompt.format(phrase), phrase))
    
    dct_responses = collect_samples(dct_final_prompts)
    
    fb_0 = filter_responses(dct_responses)
    fb_0.drop_duplicates().to_csv('challenge_data_new/fb_under/gpt4/'+str(NO_TRY)+'/fb_1_seeds.csv', index=False)
    
    dct_responses = collect_samples(dct_final_prompts)
    
    fb_0 = filter_responses(dct_responses)
    fb_0.drop_duplicates().to_csv('challenge_data_new/fb_under/gpt4/'+str(NO_TRY)+'/fb_0_seeds.csv', index=False)
    
    dct_phrases = fb_0.groupby('label')['seed'].apply(set).to_dict()
    dct_taboo = get_taboo_w_for_df_no_ner(fb_0, 3, dct_phrases)
    
    defaul_taboo_prompt = 'Rephrase an original question or statement 3 times. Original phrase: "{}". Don’t use the words “{}”, “{}” or “{}” in your responses.'
    
    dct_final_prompts = {}
    
    for key in dct_phrases:
        dct_final_prompts[key] = []
        for phrase in dct_phrases[key]:
            dct_final_prompts[key].append((defaul_taboo_prompt.format(phrase, dct_taboo[key][0], dct_taboo[key][1], dct_taboo[key][2]), phrase))
    
    dct_responses = collect_samples(dct_final_prompts)
    
    fb_0 = filter_responses(dct_responses)
    fb_0.drop_duplicates().to_csv('challenge_data_new/fb_under/gpt4/'+str(NO_TRY)+'/fb_1_taboo.csv', index=False)
    
    fb_0 = pd.read_csv('challenge_data_new/fb_under/gpt4/'+str(NO_TRY)+'/fb_0_seeds.csv').drop_duplicates()
    dct_phrases = fb_0.groupby('label')['seed'].apply(set).to_dict()
    
    df_outliers = calculate_outliers(fb_0)
    dct_phrases = get_seed_sentences_per_labels(df_outliers, dct_phrases)
    
    default_prompt = 'Rephrase an original question or statement 3 times. Original phrase: "{}".'
    
    dct_final_prompts = {}
    
    for key in dct_phrases:
        dct_final_prompts[key] = []
        for phrase in dct_phrases[key]:
            dct_final_prompts[key].append((default_prompt.format(phrase), phrase))
    
    dct_responses = collect_samples(dct_final_prompts)
    fb_0 = filter_responses(dct_responses)
    fb_0.drop_duplicates().to_csv('challenge_data_new/fb_under/gpt4/'+str(NO_TRY)+'/fb_1_chaining.csv', index=False)
    
    default_prompt = """Rephrase an original question or statement 3 times. Original phrase: "{}".
    ###
    Example paraphrases:
    {}
    ###
    """
    
    default_hint_prompt = '"{}".'
    
    def get_hint_sentences_per_labels(df_outliers, no_samples, dct_phrases):
        dct_hints_per_sample = {}
        for label in dct_phrases.keys():
            for phrase in dct_phrases[label]:
                sub_df = df_outliers[df_outliers['seed'] == phrase] 
                sub_df = sub_df.sort_values(by=['distance'], ascending=False)
                dct_hints_per_sample[phrase] = list(sub_df.head(no_samples)['text'])
        return dct_hints_per_sample
    
    fb_0 = pd.read_csv('challenge_data_new/fb_under/gpt4/'+str(NO_TRY)+'/fb_0_seeds.csv').drop_duplicates()
    dct_phrases = fb_0.groupby('label')['seed'].apply(set).to_dict()
    
    fb_0 = pd.read_csv('challenge_data_new/fb_under/gpt4/'+str(NO_TRY)+'/fb_0_seeds.csv').drop_duplicates()
    # comb is done for the second round
    
    df_outliers = calculate_outliers(fb_0)
    
    df_merged = df_outliers.merge(fb_0, how='inner', on='text').drop_duplicates()[['label_x', 'text', 'distance', 'seed']]
    df_merged = df_merged.rename(columns={'label_x': 'label'})
    
    dct_hints_per_sample = get_hint_sentences_per_labels(df_merged, 3, dct_phrases)
    
    dct_final_prompts = {}
    
    for key in dct_phrases:
        dct_final_prompts[key] = []
        for phrase in dct_phrases[key]:
            hints = dct_hints_per_sample[phrase]
            str_hints = []
            for hint in hints:
                str_hints.append(default_hint_prompt.format(hint))
            final_hint_str = "\n".join(str_hints) 
            dct_final_prompts[key].append((default_prompt.format(phrase, final_hint_str), phrase))
    
    dct_responses = collect_samples(dct_final_prompts)
    
    fb_0 = filter_responses(dct_responses)
    fb_0.drop_duplicates().to_csv('challenge_data_new/fb_under/gpt4/'+str(NO_TRY)+'/fb_1_hints.csv', index=False)

0
Rephrase an original question or statement 3 times. Original phrase: "Tell me what is the next route after merging onto highway i - 695 toward Glen Bernie .".
Rephrase an original question or statement 3 times. Original phrase: "Bypass traffic on 10th street from my location".
Rephrase an original question or statement 3 times. Original phrase: "Get me to city lights bookstore as fast as possible .".
Rephrase an original question or statement 3 times. Original phrase: "How do I bypass the 10 mile Road congestion at 94".
Rephrase an original question or statement 3 times. Original phrase: "What is the shortest way home".
Rephrase an original question or statement 3 times. Original phrase: "Directions to Eaton Park Place is Columbus from my house".
1
Rephrase an original question or statement 3 times. Original phrase: "how far is New York from here".
Rephrase an original question or statement 3 times. Original phrase: "What 's distance between pensacola and gulf breeze".
Rephrase an or

## final csvs

In [77]:
# temporary only
import re
import string

NO_TRY=0

fb_0 = pd.read_csv('challenge_data_new/fb_under/gpt4/'+str(NO_TRY)+'/fb_0_seeds.csv')
fb_0 = fb_0[['seed', 'intent']].drop_duplicates()

fb_0['seed']=fb_0['seed'].apply(lambda x: x.lower())
fb_0['seed']=fb_0['seed'].apply(lambda x: x.strip())
fb_0['seed']=fb_0['seed'].apply(lambda x: x.replace('"',''))
fb_0['seed']=fb_0['seed'].apply(lambda x: re.sub('[%s]' % re.escape(string.punctuation), '', x))

fb_0.rename(columns={'seed': 'text'}).to_csv('challenge_data_new/fb_under/gpt4/'+str(NO_TRY)+'/fb_sample_seeds.csv', index=False)

KeyError: "['intent'] not in index"

In [48]:
dct_intents = {'get_directions': 0,
 'get_distance': 1,
 'get_estimated_arrival': 2,
 'get_estimated_departure': 3,
 'get_estimated_duration': 4,
 'get_event': 5,
 'get_info_road_condition': 6,
 'get_info_traffic': 7,
 'update_directions': 8}

dct = {'get_directions': 0,
 'get_distance': 1,
 'get_estimated_arrival': 2,
 'get_estimated_departure': 3,
 'get_estimated_duration': 4,
 'get_info_road_condition': 5,
 'get_info_route': 6,
 'get_info_traffic': 7,
 'get_location': 8,
 'update_directions': 9}

def function(x):
    return dct_intents[x]

In [111]:
NO_TRY = 0

In [112]:
fb_seeds = pd.read_csv('challenge_data_new/fb_under/gpt4/'+str(NO_TRY)+'/seeds.csv').dropna().reset_index(drop=True).sample(frac=1, replace=False, random_state=1)
fb_seeds['text'] = fb_seeds['text'].apply(lambda x: x.replace('"',''))
fb_seeds['text'] = fb_seeds['text'].apply(lambda x: x.lower())
fb_0 = pd.read_csv('challenge_data_new/fb_under/gpt4/'+str(NO_TRY)+'/fb_0_seeds.csv').dropna().reset_index(drop=True).sample(frac=1, replace=False, random_state=1)
fb_1 = pd.read_csv('challenge_data_new/fb_under/gpt4/'+str(NO_TRY)+'/fb_1_seeds.csv').dropna().reset_index(drop=True).sample(frac=1, replace=False, random_state=1)

fb = pd.concat([fb_0, fb_1, fb_seeds], ignore_index=True).drop_duplicates('text').dropna(subset=['text'])

In [113]:
fb = fb.sample(frac=1).reset_index(drop=True)
fb['text'] = fb['text'].apply(lambda x: x.replace('"',''))

In [114]:
fb[['text', 'label']].to_csv('challenge_data_new/fb_under/gpt4/'+str(NO_TRY)+'/fb_prompt.csv', index=False)

In [115]:
fb_1 = pd.read_csv('challenge_data_new/fb_under/gpt4/'+str(NO_TRY)+'/fb_1_taboo.csv').dropna(subset=['text']).reset_index(drop=True).sample(frac=1, replace=False, random_state=1)
fb_1 = fb_1[~fb_1['text'].str.contains("paraphra")]


fb = pd.concat([fb_0, fb_1, fb_seeds], ignore_index=True).drop_duplicates('text').dropna(subset=['text'])

fb = fb.sample(frac=1).reset_index(drop=True)
fb['text'] = fb['text'].apply(lambda x: x.replace('"',''))

fb[['text', 'label']].to_csv('challenge_data_new/fb_under/gpt4/'+str(NO_TRY)+'/fb_taboo.csv', index=False)

In [116]:
fb_1 = pd.read_csv('challenge_data_new/fb_under/gpt4/'+str(NO_TRY)+'/fb_1_chaining.csv').dropna(subset=['text']).reset_index(drop=True).sample(frac=1, replace=False, random_state=1)
fb_1 = fb_1[~fb_1['text'].str.contains("paraphra")]

fb = pd.concat([fb_0, fb_1, fb_seeds], ignore_index=True).drop_duplicates('text').dropna(subset=['text'])

fb = fb.sample(frac=1).reset_index(drop=True)
fb['text'] = fb['text'].apply(lambda x: x.replace('"',''))

fb[['text', 'label']].to_csv('challenge_data_new/fb_under/gpt4/'+str(NO_TRY)+'/fb_chaining.csv', index=False)

In [117]:
fb_1 = pd.read_csv('challenge_data_new/fb_under/gpt4/'+str(NO_TRY)+'/fb_1_hints.csv').dropna(subset=['text']).reset_index(drop=True).sample(frac=1, replace=False, random_state=1)
fb_1 = fb_1[~fb_1['text'].str.contains("paraphra")]

fb = pd.concat([fb_0, fb_1, fb_seeds], ignore_index=True).drop_duplicates('text').dropna(subset=['text'])

fb = fb.sample(frac=1).reset_index(drop=True)
fb['text'] = fb['text'].apply(lambda x: x.replace('"',''))

fb[['text', 'label']].to_csv('challenge_data_new/fb_under/gpt4/'+str(NO_TRY)+'/fb_hints.csv', index=False)

In [8]:
# calculate lexical diversity

all_diver = []

for i in range(0, 5):
    fb_0 = pd.read_csv('challenge_data_new/play_under/'+str(i)+'/fb_chaining.csv').dropna().reset_index(drop=True).sample(frac=1, replace=False, random_state=1)
    results = set()
    fb_0['text'].str.lower().str.split().apply(results.update)
    all_diver.append(len(results))
  

In [9]:
arr = np.array(all_diver)

print("DIV MEAN: " + str(np.mean(arr)) + " STD: " + str(np.std(arr)))

DIV MEAN: 606.8 STD: 39.22448215082004


### SVM FAST LETS GOO

In [298]:
taboo = pd.read_csv('challenge_data_new/fb_under_4/fb_taboo.csv').dropna().reset_index(drop=True)
chaining = pd.read_csv('challenge_data_new/fb_under_4/fb_chaining.csv').dropna().reset_index(drop=True)
hints = pd.read_csv('challenge_data_new/fb_under_4/fb_hints.csv').dropna().reset_index(drop=True)
prompt = pd.read_csv('challenge_data_new/fb_under_4/fb_prompt.csv').dropna().reset_index(drop=True)

full_orig_test = pd.read_csv('challenge_data_new/fb_under/fb_orig_test.csv').dropna().reset_index(drop=True)

In [305]:
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.neural_network import MLPClassifier
from joblib import dump, load
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import tree
import numpy as np
import itertools
from sklearn import svm
from sklearn.model_selection import train_test_split

orig_res = []
orig_f1 = []

for idx in range(0, 10):
    train, test = train_test_split(hints, test_size=0.01)
    
    tmp_test = full_orig_test.sample(frac=1, replace=False, random_state=1)
    
    pipeline = Pipeline([
            ('bow', CountVectorizer()),  
            ('tfidf', TfidfTransformer()),  
            ('c', svm.SVC(probability=True))
        ])
    fit = pipeline.fit(train['text'].str.lower(), train['label'])
    
    pred=pipeline.predict(tmp_test['text'].str.lower())
    orig_res.append(accuracy_score(pred,tmp_test['label']))
    orig_f1.append(f1_score(pred,tmp_test['label'],  average = 'weighted'))

In [300]:
#taboo no ner
arr = np.array(orig_res)

print("ACC ORIG SAME MEAN: " + str(np.mean(arr)) + " STD: " + str(np.std(arr)))

arr = np.array(orig_f1)

print("F1 ORIG SAME MEAN: " + str(np.mean(arr)) + " STD: " + str(np.std(arr)))

ACC ORIG SAME MEAN: 0.6971253822629969 STD: 0.005463112526869199
F1 ORIG SAME MEAN: 0.6722058690499573 STD: 0.0056917706305855496


In [304]:
#chain

arr = np.array(orig_res)

print("ACC ORIG SAME MEAN: " + str(np.mean(arr)) + " STD: " + str(np.std(arr)))

arr = np.array(orig_f1)

print("F1 ORIG SAME MEAN: " + str(np.mean(arr)) + " STD: " + str(np.std(arr)))

ACC ORIG SAME MEAN: 0.7231070336391437 STD: 0.0034269108237668243
F1 ORIG SAME MEAN: 0.6928425529395656 STD: 0.003647517592303358


In [306]:
#hints

arr = np.array(orig_res)

print("ACC ORIG SAME MEAN: " + str(np.mean(arr)) + " STD: " + str(np.std(arr)))

arr = np.array(orig_f1)

print("F1 ORIG SAME MEAN: " + str(np.mean(arr)) + " STD: " + str(np.std(arr)))

ACC ORIG SAME MEAN: 0.7388746177370031 STD: 0.0028235691116421575
F1 ORIG SAME MEAN: 0.7088808159025943 STD: 0.003001215982850524


In [302]:
arr = np.array(orig_res)

print("ACC ORIG SAME MEAN: " + str(np.mean(arr)) + " STD: " + str(np.std(arr)))

arr = np.array(orig_f1)

print("F1 ORIG SAME MEAN: " + str(np.mean(arr)) + " STD: " + str(np.std(arr)))

ACC ORIG SAME MEAN: 0.748525993883792 STD: 0.003204239259147624
F1 ORIG SAME MEAN: 0.7186451687620666 STD: 0.004123673847269496
