### **0. Set-up**

In [1]:
# Import libraries and utils
%run '../../utils.ipynb'

In [2]:
# Get api key
# load_dotenv() is presumably python-dotenv's loader (brought in by utils),
# which makes values from a local .env file visible to os.getenv
load_dotenv()
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

# Fail early with an actionable message instead of a generic client error
if not OPENAI_API_KEY:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to the environment or to a .env file."
    )

# Set client, passing the key that was just validated so the variable
# above is actually used rather than relying on an implicit env lookup
client = OpenAI(api_key=OPENAI_API_KEY)

In [3]:
# Read the tab-separated SimLex-999 file into a DataFrame
en_simlex = pd.read_csv(
    "../../../data/dataset/en-simlex-999.txt",
    sep='\t',
)

# For quick experiments, uncomment to keep only the first rows
# en_simlex = en_simlex.head(5)

# Build the list of (word1, word2) tuples, one per dataset row
word_pairs = list(
    en_simlex[['word1', 'word2']].itertuples(index=False, name=None)
)

In [4]:
# Show results: the (word1, word2) tuples built from the dataset, as a
# visual sanity check before they are sent to the embeddings API
word_pairs

[('old', 'new'),
 ('smart', 'intelligent'),
 ('hard', 'difficult'),
 ('happy', 'cheerful'),
 ('hard', 'easy'),
 ('fast', 'rapid'),
 ('happy', 'glad'),
 ('short', 'long'),
 ('stupid', 'dumb'),
 ('weird', 'strange'),
 ('wide', 'narrow'),
 ('bad', 'awful'),
 ('easy', 'difficult'),
 ('bad', 'terrible'),
 ('hard', 'simple'),
 ('smart', 'dumb'),
 ('insane', 'crazy'),
 ('happy', 'mad'),
 ('large', 'huge'),
 ('hard', 'tough'),
 ('new', 'fresh'),
 ('sharp', 'dull'),
 ('quick', 'rapid'),
 ('dumb', 'foolish'),
 ('wonderful', 'terrific'),
 ('strange', 'odd'),
 ('happy', 'angry'),
 ('narrow', 'broad'),
 ('simple', 'easy'),
 ('old', 'fresh'),
 ('apparent', 'obvious'),
 ('inexpensive', 'cheap'),
 ('nice', 'generous'),
 ('weird', 'normal'),
 ('weird', 'odd'),
 ('bad', 'immoral'),
 ('sad', 'funny'),
 ('wonderful', 'great'),
 ('guilty', 'ashamed'),
 ('beautiful', 'wonderful'),
 ('confident', 'sure'),
 ('dumb', 'dense'),
 ('large', 'big'),
 ('nice', 'cruel'),
 ('impatient', 'anxious'),
 ('big', 'broad'),

### **1. Process and Extract Data**

In [5]:
# Define model
# Name of the embedding model; intended as the `model` argument of
# get_responses_embeddings
model = "text-embedding-3-small"

# Delay between individual API calls
# NOTE(review): presumably in seconds (the recorded run averaged about
# 3.78 s per pair) — confirm against get_responses_embeddings in utils
delay = 3.0

In [6]:
# Get results from API
# This call must stay live: with it commented out, `response` is undefined
# on a fresh kernel and this cell and the following ones fail with NameError.
# It is slow: the recorded run took about 63 minutes (3771.88 s) for 999 pairs.
# For a quick check, enable the subset line in the data-loading cell first.
response = get_responses_embeddings(word_pairs, model, delay)

# Show results: a list of (word1, word2, cosine similarity) tuples
response

Calculating Cosine Similarities:   0%|          | 0/999 [00:00<?, ?pair/s]

Calculating Cosine Similarities: 100%|██████████| 999/999 [1:02:51<00:00,  3.78s/pair]

Total time taken: 3771.88 seconds





[('old', 'new', 0.507819402428143),
 ('smart', 'intelligent', 0.5666736288495434),
 ('hard', 'difficult', 0.5770528566301082),
 ('happy', 'cheerful', 0.5998875099006626),
 ('hard', 'easy', 0.4499909796504542),
 ('fast', 'rapid', 0.6539011830814078),
 ('happy', 'glad', 0.6092279145608956),
 ('short', 'long', 0.54008418314984),
 ('stupid', 'dumb', 0.6984374395833702),
 ('weird', 'strange', 0.7870392107068018),
 ('wide', 'narrow', 0.4134980003343335),
 ('bad', 'awful', 0.553446835705118),
 ('easy', 'difficult', 0.48142789186857776),
 ('bad', 'terrible', 0.5281516445911237),
 ('hard', 'simple', 0.33935903356104696),
 ('smart', 'dumb', 0.4022169446301564),
 ('insane', 'crazy', 0.7293931992368377),
 ('happy', 'mad', 0.4452660669627193),
 ('large', 'huge', 0.7627405281584577),
 ('hard', 'tough', 0.6182220749028513),
 ('new', 'fresh', 0.5854787293901325),
 ('sharp', 'dull', 0.4133060615148234),
 ('quick', 'rapid', 0.6947679731358058),
 ('dumb', 'foolish', 0.5203040224271053),
 ('wonderful', 't

In [7]:
# Store results in DataFrame and round 2 decimals
# process_responses_embeddings is presumably defined in utils.ipynb; the
# displayed frame has columns word1, word2 and similarity_score
df = process_responses_embeddings(response)

# Show results (pandas abbreviates the display of long frames)
df

Unnamed: 0,word1,word2,similarity_score
0,old,new,0.51
1,smart,intelligent,0.57
2,hard,difficult,0.58
3,happy,cheerful,0.60
4,hard,easy,0.45
...,...,...,...
994,join,acquire,0.32
995,send,attend,0.47
996,gather,attend,0.59
997,absorb,withdraw,0.43


Note: `similarity_score` is the raw cosine similarity, which lies in the range [-1, 1]. It is rescaled to the SimLex-999 questionnaire scale [0, 10] in the evaluation notebook.

In [8]:
# Define file_path
file_path = '../../../data/text-embedding-3-small/en.csv'

# Make sure the target directory exists: to_csv does not create missing
# parent directories and would otherwise fail on a fresh checkout
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# Write the results only once; an existing file is never overwritten, so
# results from an earlier (expensive) API run cannot be lost by accident
if not os.path.exists(file_path):
    df.to_csv(file_path, index=False)
    print("File saved successfully.")
else:
    print("File already exists. Dataframe was not saved to prevent overwriting.")

File saved successfully.
