In [1]:
import openai

import pandas as pd
import os
import numpy as np
from ast import literal_eval
import json
import pickle
import time

In [2]:
# run this to load the now saved DF
df_train = pd.read_csv('challenge_data_new/sst5/train_sst.csv')

## Only GPT

In [5]:
from openai import OpenAI

client = OpenAI(
  organization='', api_key=""
)

In [6]:
def request_response_from_gpt(prompt):
    response = client.chat.completions.create(
        model="gpt-4-0314",
        #prompt=prompt,
        messages=[
            {"role": "system", "content": "You are a crowdsourcing worker that earns a living through creating paraphrases."},
            {"role": "user", "content": prompt}],
        temperature=1,
        frequency_penalty=0.0,
        presence_penalty=1.5,
        n=1)
    return response

In [7]:
def request_with_checks(prompt):
    success = False
    count = 0
    while not success:
        if count > 0:
            print(f'Retrying with again. Current number of retries: {count}')
        if count >= 10:
            raise Exception('Too many attempts')
        try:
            response = request_response_from_gpt(prompt)
            sucess = True
            break
        except openai.error.RateLimitError as e:
            print(e)
            time.sleep(10)
            count += 1
        except openai.error.APIConnectionError as e:
            print(e)
            time.sleep(5)
            count += 1
        except openai.error.APIError or openai.error.JSONDecodeError as e:
            print(e)
            time.sleep(5)
            count += 1
        except openai.error.Timeout as e:
            print(e)
            time.sleep(5)
            count += 1
        except openai.error.ServiceUnavailableError as e:
            print(e)
            time.sleep(5)
            count += 1
    return response

In [8]:
def collect_samples(dct_final_prompts, time_sleep: int=1):
    dct_responses = {}
    for idx, key in enumerate(dct_final_prompts):
        print("Now on label no. {} out of {}.".format(idx, len(dct_final_prompts.keys())))
        dct_responses[key] = []
        for prompt in dct_final_prompts[key]:
            #print(prompt)
            response = request_with_checks(prompt)
            dct_responses[key].append(response)
            time.sleep(time_sleep)
    return dct_responses

In [9]:
import re
import string

def filter_responses(dct_responses):
    dct_df = {'label': [], 'text': [], 'seed': []}
    for key in dct_responses:
        for responses in dct_responses[key]:
            for response in responses[0].choices:
                contents = response.message.content.split('\n')
                for content in contents:
                    if len(content) == 0:
                        continue
                    if content[0] == '1' or content[0] == '2' or content[0] == '3':
                        content = content[3:]
                    dct_df['label'].append(key)
                    dct_df['text'].append(content)
                    dct_df['seed'].append(responses[1])
                    
    fb_0 = pd.DataFrame.from_dict(dct_df)
                    
    fb_0['text']=fb_0['text'].apply(lambda x: x.lower())
    fb_0['text']=fb_0['text'].apply(lambda x: x.replace('"',''))

    fb_0['text']=fb_0['text'].apply(lambda x: re.sub('[%s]' % re.escape(string.punctuation), '', x))
    
    return fb_0

# rest prepare

In [10]:
def change_label_except_for(x, label):
    if x == label:
        return 1
    else:
        return 0
    
from nltk.corpus import stopwords

stops = set(stopwords.words("english"))
def cleantext(string):
    text = string.split()
    text = [w for w in text if not w in stops]
    text = " ".join(text)
    return text

from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer

def get_fit_svm_linear_and_count_vectorizer(df_orig):
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(df_orig['text'])
    x=X.toarray()
    y=df_orig['label']
    model=svm.SVC(kernel='linear')
    model.fit(x,y)
    return model, vectorizer, X

def get_freqs(vectorizer, X):
    feature_names = vectorizer.get_feature_names_out()
    dct = {'word': [], 'freq': []}
    for freq, word in zip(np.asarray(X.sum(axis=0))[0], feature_names):
        dct['word'].append(word)
        dct['freq'].append(freq)

    return pd.DataFrame.from_dict(dct)

def get_coefficients(model, vectorizer):
    feature_names = vectorizer.get_feature_names_out()
    coefs_with_fns = sorted(zip(model.coef_[0], feature_names)) 
    df=pd.DataFrame(coefs_with_fns)
    df.columns='coefficient','word'
    return df.sort_values(by='coefficient')

import spacy
from spacy import displacy

# use random=True for ablated version of the method
def get_taboo_w_for_df_no_ner(df_orig, no_taboo_w, seed_samples_dct, random=False):
    dct_taboo_w_per_label = {}

    NER = spacy.load("en_core_web_sm")

    # we gather taboowords for each label in a one (desired label) vs. one (other labels) setting
    labels = list(set(df_orig['label']))
    for label in labels:
        # reset dataframe
        sub_df_orig = df_orig.copy()
        sub_df_orig['text'] = sub_df_orig['text'].map(lambda x: cleantext(x))
        # set setting to one vs one
        sub_df_orig['label'] = sub_df_orig['label'].map(lambda x: change_label_except_for(x, label))

        model, vectorizer, X = get_fit_svm_linear_and_count_vectorizer(sub_df_orig)
        freqs = get_freqs(vectorizer, X)
        coeffs = get_coefficients(model, vectorizer)
        
        sents = seed_samples_dct[label]
        ners = set()
        for sent in sents:
            res = NER(sent.lower())
            for txt in res.ents:
                subs = set(txt.text.lower().split())
                ners = ners.union(subs)
            
        joined = coeffs.set_index('word').join(freqs.set_index('word'), lsuffix='_caller', rsuffix='_other')
        joined_rel = joined[joined['freq'] >= 5].sort_values(by=['freq'])

        if random:
            joined_rel = joined_rel.sample(frac=1)
        else:   
            joined_rel = joined_rel.sort_values(by=['coefficient'])
            
        taboo_w_without_ints = list(joined_rel.index)
        taboo_w_without_ints = list(filter(lambda word: not word.isdigit(), taboo_w_without_ints))
        
        taboo_w_without_ners = list(filter(lambda word: not word in ners, taboo_w_without_ints))
        
        dct_taboo_w_per_label[label] = taboo_w_without_ners[-no_taboo_w:]
        
    return dct_taboo_w_per_label

In [11]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

def get_embs_for_sents(df_pd) -> dict:
    sents_dct = {}
    emb_dct = {}

    for dct in df_pd.to_dict('records'):
        if dct['label'] in sents_dct:
            sents_dct[dct['label']].append(dct['text'])
        else:
            sents_dct[dct['label']] = [dct['text']]
            
    for label in sents_dct.keys():
        emb_dct[label] = {'emb': model.encode(sents_dct[label]), 'sent': sents_dct[label]}
    return emb_dct

def calculate_outliers(df_pd) -> dict:
    embs_dct = get_embs_for_sents(df_pd)
    mean_dct = {}
    pandas_dct = {'label': [], 'distance': [], 'text': []}
    
    # calculate mean vector per label
    for label in embs_dct:
        mean_dct[label] = embs_dct[label]['emb'].mean(axis=0)
        
    # calculate distance from the mean vector per label
    for label in embs_dct:
        mean_emb = mean_dct[label]
        for (sent_emb, sent) in zip(embs_dct[label]['emb'], embs_dct[label]['sent']):
            dist = np.linalg.norm(mean_emb - sent_emb)
            pandas_dct['label'].append(label)
            pandas_dct['distance'].append(dist)
            pandas_dct['text'].append(sent)                        
    return pd.DataFrame.from_dict(pandas_dct)

def get_seed_sentences_per_labels(outliers_df, dct_phrases: dict, random=False) -> dict:
    dct_seeds_per_label = {}
    for label in dct_phrases.keys():
        no_samples = len(dct_phrases[label])
        if random:
            sub_outlier_df = outliers_df[outliers_df['label'] == label].sample(frac=1)
        else:    
            sub_outlier_df = outliers_df[outliers_df['label'] == label].sort_values(by=['distance'], ascending=False)
        dct_seeds_per_label[label] = list(sub_outlier_df.head(no_samples)['text'])
    return dct_seeds_per_label

## all the good things

In [94]:
NO_TRY = 0
N_SAMPLES = 6
get_subsampled = df_train.groupby('label', group_keys=False).apply(lambda x: x.sample(N_SAMPLES))

get_subsampled.to_csv('challenge_data_new/sst5/gpt4/'+str(NO_TRY)+'/seeds.csv', index=False)

In [95]:
dct_phrases = {}
for key in set(get_subsampled['label']):
    dct_phrases[key] = list(get_subsampled[get_subsampled['label'] == key]['text'])

default_prompt = 'Rephrase an original question or statement 3 times. Original phrase: "{}".'

In [96]:
dct_final_prompts = {}

for key in dct_phrases:
    dct_final_prompts[key] = []
    for phrase in dct_phrases[key]:
        dct_final_prompts[key].append((default_prompt.format(phrase), phrase))

In [97]:
def collect_samples(dct_final_prompts):
    dct_responses = {}

    for idx, key in enumerate(dct_final_prompts):
        print(str(idx))
        dct_responses[key] = []
        for prompt in dct_final_prompts[key]:
            print(prompt[0])
            response = request_with_checks(prompt[0])
            dct_responses[key].append((response, prompt[1]))
            time.sleep(1)
            
    return dct_responses

In [98]:
dct_responses = collect_samples(dct_final_prompts)

0
Rephrase an original question or statement 3 times. Original phrase: "pair that with really poor comedic writing  and you ve got a huge mess ".
Rephrase an original question or statement 3 times. Original phrase: "the leads are so unmemorable  despite several attempts at lengthy dialogue scenes  that one eventually resents having to inhale this gutter romancer s secondhand material ".
Rephrase an original question or statement 3 times. Original phrase: "it s pretentious in a way that verges on the amateurish ".
Rephrase an original question or statement 3 times. Original phrase: "one sloughs one s way through the mire of this alleged psychological thriller in search of purpose or even a plot ".
Rephrase an original question or statement 3 times. Original phrase: "bears is even worse than i imagined a movie ever could be ".
Rephrase an original question or statement 3 times. Original phrase: "stiff and schmaltzy and clumsily directed ".
1
Rephrase an original question or statement 3 t

In [99]:
fb_0 = filter_responses(dct_responses)
fb_0.drop_duplicates().to_csv('challenge_data_new/sst5/gpt4/'+str(NO_TRY)+'/sst_1_seeds.csv', index=False)

In [100]:
dct_responses = collect_samples(dct_final_prompts)

0
Rephrase an original question or statement 3 times. Original phrase: "pair that with really poor comedic writing  and you ve got a huge mess ".
Rephrase an original question or statement 3 times. Original phrase: "the leads are so unmemorable  despite several attempts at lengthy dialogue scenes  that one eventually resents having to inhale this gutter romancer s secondhand material ".
Rephrase an original question or statement 3 times. Original phrase: "it s pretentious in a way that verges on the amateurish ".
Rephrase an original question or statement 3 times. Original phrase: "one sloughs one s way through the mire of this alleged psychological thriller in search of purpose or even a plot ".
Rephrase an original question or statement 3 times. Original phrase: "bears is even worse than i imagined a movie ever could be ".
Rephrase an original question or statement 3 times. Original phrase: "stiff and schmaltzy and clumsily directed ".
1
Rephrase an original question or statement 3 t

In [101]:
fb_0 = filter_responses(dct_responses)
fb_0.drop_duplicates().to_csv('challenge_data_new/sst5/gpt4/'+str(NO_TRY)+'/sst_0_seeds.csv', index=False)

In [102]:
fb_0 = pd.read_csv('challenge_data_new/sst5/gpt4/'+str(NO_TRY)+'/sst_0_seeds.csv')
dct_phrases = fb_0.groupby('label')['seed'].apply(set).to_dict()
dct_taboo = get_taboo_w_for_df_no_ner(fb_0, 3, dct_phrases)

In [103]:
defaul_taboo_prompt = 'Rephrase an original question or statement 3 times. Original phrase: "{}". Don’t use the words “{}”, “{}” or “{}” in your responses.'

dct_final_prompts = {}

for key in dct_phrases:
    dct_final_prompts[key] = []
    for phrase in dct_phrases[key]:
        dct_final_prompts[key].append((defaul_taboo_prompt.format(phrase, dct_taboo[key][0], dct_taboo[key][1], dct_taboo[key][2]), phrase))

In [104]:
dct_responses = collect_samples(dct_final_prompts)

0
Rephrase an original question or statement 3 times. Original phrase: "stiff and schmaltzy and clumsily directed ". Don’t use the words “movie”, “film” or “even” in your responses.
Rephrase an original question or statement 3 times. Original phrase: "bears is even worse than i imagined a movie ever could be ". Don’t use the words “movie”, “film” or “even” in your responses.
Rephrase an original question or statement 3 times. Original phrase: "it s pretentious in a way that verges on the amateurish ". Don’t use the words “movie”, “film” or “even” in your responses.
Rephrase an original question or statement 3 times. Original phrase: "one sloughs one s way through the mire of this alleged psychological thriller in search of purpose or even a plot ". Don’t use the words “movie”, “film” or “even” in your responses.
Rephrase an original question or statement 3 times. Original phrase: "pair that with really poor comedic writing  and you ve got a huge mess ". Don’t use the words “movie”, “fi

In [105]:
fb_0 = filter_responses(dct_responses)
fb_0.drop_duplicates().to_csv('challenge_data_new/sst5/gpt4/'+str(NO_TRY)+'/sst_1_taboo.csv', index=False)

In [106]:
fb_0 = pd.read_csv('challenge_data_new/sst5/gpt4/'+str(NO_TRY)+'/sst_0_seeds.csv').drop_duplicates()
dct_phrases = fb_0.groupby('label')['seed'].apply(set).to_dict()

df_outliers = calculate_outliers(fb_0)
dct_phrases = get_seed_sentences_per_labels(df_outliers, dct_phrases)

In [107]:
default_prompt = 'Rephrase an original question or statement 3 times. Original phrase: "{}".'

dct_final_prompts = {}

for key in dct_phrases:
    dct_final_prompts[key] = []
    for phrase in dct_phrases[key]:
        dct_final_prompts[key].append((default_prompt.format(phrase), phrase))

In [108]:
dct_responses = collect_samples(dct_final_prompts)

0
Rephrase an original question or statement 3 times. Original phrase: "bears is far more terrible than i ever envisioned a film could possibly be".
Rephrase an original question or statement 3 times. Original phrase: " ".
Rephrase an original question or statement 3 times. Original phrase: "i never imagined any film could be as awful as bears turned out to be".
Rephrase an original question or statement 3 times. Original phrase: "rigid and overly sentimental with awkward direction".
Rephrase an original question or statement 3 times. Original phrase: "unyielding and cheesy hindered by unskillful direction".
Rephrase an original question or statement 3 times. Original phrase: "the movie bears exceeds the worst that i had ever contemplated a film might be".
1
Rephrase an original question or statement 3 times. Original phrase: "the franchises golden era appears to be well behind it".
Rephrase an original question or statement 3 times. Original phrase: "crumbles due to its insufficient m

In [109]:
fb_0 = filter_responses(dct_responses)
fb_0.drop_duplicates().to_csv('challenge_data_new/sst5/gpt4/'+str(NO_TRY)+'/sst_1_chaining.csv', index=False)

In [110]:
default_prompt = """Rephrase an original question or statement 3 times. Original phrase: "{}".
###
Example paraphrases:
{}
###
"""

default_hint_prompt = '"{}".'

#use random=True for ablated version
def get_hint_sentences_per_labels(df_outliers, no_samples, dct_phrases, random=False):
    dct_hints_per_sample = {}
    for label in dct_phrases.keys():
        for phrase in dct_phrases[label]:
            sub_df = df_outliers[df_outliers['seed'] == phrase] 
            if random:
                sub_df = sub_df.sample(frac=1)
            else:
                sub_df = sub_df.sort_values(by=['distance'], ascending=False)
            dct_hints_per_sample[phrase] = list(sub_df.head(no_samples)['text'])
    return dct_hints_per_sample

fb_0 = pd.read_csv('challenge_data_new/sst5/gpt4/'+str(NO_TRY)+'/sst_0_seeds.csv').drop_duplicates()
fb_0['text']=fb_0['text'].apply(lambda x: x.lower())
fb_0['text']=fb_0['text'].apply(lambda x: x.strip())
fb_0['text']=fb_0['text'].apply(lambda x: x.replace('"',''))
dct_phrases = fb_0.groupby('label')['seed'].apply(set).to_dict()

fb_0 = pd.read_csv('challenge_data_new/sst5/gpt4/'+str(NO_TRY)+'/sst_0_seeds.csv').drop_duplicates()
fb_0['text']=fb_0['text'].apply(lambda x: x.lower())
fb_0['text']=fb_0['text'].apply(lambda x: x.strip())
fb_0['text']=fb_0['text'].apply(lambda x: x.replace('"',''))
# comb is done for the second round

df_outliers = calculate_outliers(fb_0)

df_merged = df_outliers.merge(fb_0, how='inner', on='text').drop_duplicates()[['label_x', 'text', 'distance', 'seed']]
df_merged = df_merged.rename(columns={'label_x': 'label'})

dct_hints_per_sample = get_hint_sentences_per_labels(df_merged, 3, dct_phrases)

In [111]:
dct_final_prompts = {}

for key in dct_phrases:
    dct_final_prompts[key] = []
    for phrase in dct_phrases[key]:
        hints = dct_hints_per_sample[phrase]
        str_hints = []
        for hint in hints:
            str_hints.append(default_hint_prompt.format(hint))
        final_hint_str = "\n".join(str_hints) 
        dct_final_prompts[key].append((default_prompt.format(phrase, final_hint_str), phrase))

In [112]:
dct_responses = collect_samples(dct_final_prompts)

0
Rephrase an original question or statement 3 times. Original phrase: "stiff and schmaltzy and clumsily directed ".
###
Example paraphrases:
"rigid and overly sentimental with awkward direction".
"unyielding and cheesy hindered by unskillful direction".
"inflexible and mawkish accompanied by inept directing".
###

Rephrase an original question or statement 3 times. Original phrase: "bears is even worse than i imagined a movie ever could be ".
###
Example paraphrases:
"bears is far more terrible than i ever envisioned a film could possibly be".
"i never imagined any film could be as awful as bears turned out to be".
"the movie bears exceeds the worst that i had ever contemplated a film might be".
###

Rephrase an original question or statement 3 times. Original phrase: "it s pretentious in a way that verges on the amateurish ".
###
Example paraphrases:
"theres an air of pretension to it that teeters on being unprofessionally done".
"its pretentiousness is at a level which almost become

In [113]:
fb_0 = filter_responses(dct_responses)
fb_0.drop_duplicates().to_csv('challenge_data_new/sst5/gpt4/'+str(NO_TRY)+'/sst_1_hints.csv', index=False)

## final csvs

In [180]:
# temporary only
import re
import string

NO_TRY=4

fb_0 = pd.read_csv('challenge_data_new/sst5/gpt4/'+str(NO_TRY)+'/seeds.csv')
fb_0 = fb_0[['text', 'label']].drop_duplicates()

fb_0['text']=fb_0['text'].apply(lambda x: x.lower())
fb_0['text']=fb_0['text'].apply(lambda x: x.strip())
fb_0['text']=fb_0['text'].apply(lambda x: x.replace('"',''))
fb_0['text']=fb_0['text'].apply(lambda x: re.sub('[%s]' % re.escape(string.punctuation), '', x))

fb_0.to_csv('challenge_data_new/sst5/gpt4/'+str(NO_TRY)+'/seeds.csv', index=False)

In [5]:

fb_seeds = pd.read_csv('challenge_data_new/sst5/gpt4/'+str(NO_TRY)+'/seeds.csv').dropna().reset_index(drop=True).sample(frac=1, replace=False, random_state=1)
fb_0 = pd.read_csv('challenge_data_new/sst5/gpt4/'+str(NO_TRY)+'/sst_0_seeds.csv').dropna().reset_index(drop=True).sample(frac=1, replace=False, random_state=1)
fb_1 = pd.read_csv('challenge_data_new/sst5/gpt4/'+str(NO_TRY)+'/sst_1_seeds.csv').dropna().reset_index(drop=True).sample(frac=1, replace=False, random_state=1)

fb = pd.concat([fb_0, fb_1, fb_seeds], ignore_index=True).drop_duplicates('text').dropna(subset=['text'])

In [182]:
fb = fb.sample(frac=1).reset_index(drop=True)
fb['text'] = fb['text'].apply(lambda x: x.replace('"',''))
fb = fb[~fb['text'].str.contains("paraphra")]

In [183]:
fb[['text', 'label']].to_csv('challenge_data_new/sst5/gpt4/'+str(NO_TRY)+'/sst_prompt.csv', index=False)

In [184]:
fb_1 = pd.read_csv('challenge_data_new/sst5/gpt4/'+str(NO_TRY)+'/sst_1_taboo.csv').dropna(subset=['text']).reset_index(drop=True).sample(frac=1, replace=False, random_state=1)


fb = pd.concat([fb_0, fb_1, fb_seeds], ignore_index=True).drop_duplicates('text').dropna(subset=['text'])
fb = fb[~fb['text'].str.contains("paraphra")]

fb = fb.sample(frac=1).reset_index(drop=True)
fb['text'] = fb['text'].apply(lambda x: x.replace('"',''))

fb[['text', 'label']].to_csv('challenge_data_new/sst5/gpt4/'+str(NO_TRY)+'/sst_taboo.csv', index=False)

In [17]:
import re
import string

NO_TRY=4
fb_seeds = pd.read_csv('challenge_data_new/sst5/gpt4/'+str(NO_TRY)+'/seeds.csv').dropna().reset_index(drop=True).sample(frac=1, replace=False, random_state=1)
fb_0 = pd.read_csv('challenge_data_new/sst5/gpt4/'+str(NO_TRY)+'/sst_0_seeds.csv').dropna().reset_index(drop=True).sample(frac=1, replace=False, random_state=1)
fb_1 = pd.read_csv('challenge_data_new/sst5/gpt4/'+str(NO_TRY)+'/sst_1_chaining.csv').dropna(subset=['text']).reset_index(drop=True).sample(frac=1, replace=False, random_state=1)

fb = pd.concat([fb_0, fb_1, fb_seeds], ignore_index=True).drop_duplicates('text').dropna(subset=['text'])
fb = fb[~fb['text'].str.contains("paraphra")]

fb = fb.sample(frac=1).reset_index(drop=True)
fb['text'] = fb['text'].apply(lambda x: x.replace('"',''))

fb[['text', 'label']].to_csv('challenge_data_new/sst5/gpt4/'+str(NO_TRY)+'/sst_chaining.csv', index=False)

In [186]:
fb_1 = pd.read_csv('challenge_data_new/sst5/gpt4/'+str(NO_TRY)+'/sst_1_hints.csv').dropna(subset=['text']).reset_index(drop=True).sample(frac=1, replace=False, random_state=1)

fb = pd.concat([fb_0, fb_1, fb_seeds], ignore_index=True).drop_duplicates('text').dropna(subset=['text'])
fb = fb[~fb['text'].str.contains("paraphra")]

fb = fb.sample(frac=1).reset_index(drop=True)
fb['text'] = fb['text'].apply(lambda x: x.replace('"',''))

fb[['text', 'label']].to_csv('challenge_data_new/sst5/gpt4/'+str(NO_TRY)+'/sst_hints.csv', index=False)