In [1]:
import pandas as pd

In [None]:
# Load the raw course table from the CSV export (path is relative to the
# kernel's current working directory).
COURSE_DESCRIPTIONS_CSV = r'course_similarity/data/rawData2024/Atlas_course_bestdescription.csv'
df = pd.read_csv(COURSE_DESCRIPTIONS_CSV)

In [3]:
from openai import AzureOpenAI
import os
from dotenv import load_dotenv

# NOTE(review): os.path.abspath() resolves a bare filename against the current
# working directory, so this changes into the directory the kernel is already
# in. It only equals the notebook's folder if the kernel was started there --
# confirm that the relative paths used below ('.env', output files) resolve
# where intended.
os.chdir(os.path.dirname(os.path.abspath('embeddings.ipynb')))

# Load environment file for secrets.
# load_dotenv() returns False when it did not set any variable (for example
# when the file is missing or empty). Fail the cell with an exception rather
# than calling quit(): in a Jupyter kernel quit() requests a kernel shutdown,
# which discards all notebook state instead of simply stopping this cell.
if not load_dotenv('.env'):
    raise RuntimeError('Unable to load .env file.')

# Create the Azure OpenAI client from the values loaded into the environment.
# A missing variable raises KeyError here, naming the absent key.
client = AzureOpenAI(
    api_key=os.environ["OPENAI_API_KEY"],
    api_version=os.environ['OPENAI_API_VERSION'],
    azure_endpoint=os.environ['OPENAI_API_BASE'],
    organization=os.environ['OPENAI_ORGANIZATION_ID'],
)

# Send a small chat completion as a smoke test of the credentials and endpoint.
print('Sending a test completion job')

response = client.chat.completions.create(
    model="gpt-4",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is 2 + 2?"},
    ],
    temperature=0,  # deterministic output for the smoke test
    stop=None,
)

# Print the text of the first returned choice.
print(response.choices[0].message.content)

Sending a test completion job
2 + 2 equals 4.


In [4]:
# Display the loaded course table as the cell's output (pandas elides the
# middle rows of a long frame in the rendered view).
df

Unnamed: 0,course,description,level
0,AAS 103,In recent years the news coverage of Africa ha...,100
1,AAS 104,Hum Seminar,100
2,AAS 111,Africa is the second largest continent on eart...,100
3,AAS 115,This course is an introduction to spoken and w...,100
4,AAS 116,This introductory-level course is designed for...,100
...,...,...,...
14462,POLSCI 816,This course provides an introduction to estima...,800
14463,PSYCH 841,Functional MRI has become one of the leading m...,800
14464,SW 858,This course is concerned with analyzing the di...,800
14465,SW 874,This seminar provides a foundation and overvie...,800


In [10]:
# Description text of the first row whose course code is exactly 'EECS 281'.
# Raises IndexError if no such row exists.
is_eecs281 = df['course'] == 'EECS 281'
eecs281text = df.loc[is_eecs281, 'description'].values[0]

In [5]:
import numpy as np
def get_embedding(text, model=None):
    """Return the embedding vector for ``text`` from the embeddings endpoint.

    Args:
        text: String to embed. Newlines are replaced with spaces before the
            request is sent.
        model: Model name passed to the embeddings API. When omitted, the
            value of the ``OPENAI_EMBEDDING_MODEL`` environment variable is
            used.

    Returns:
        The ``embedding`` attribute of the first item in the API response.

    Raises:
        KeyError: If ``model`` is omitted and ``OPENAI_EMBEDDING_MODEL`` is
            not set.
    """
    if model is None:
        # Resolve the default at call time. Reading os.environ in the
        # signature would run once at definition time and make the ``def``
        # itself fail when the variable is unset, even for callers that
        # always pass ``model`` explicitly.
        model = os.environ['OPENAI_EMBEDDING_MODEL']
    # NOTE(review): a non-string value (e.g. a missing description read as
    # NaN) has no .replace and raises here -- confirm inputs are always str.
    text = text.replace("\n", " ")
    return client.embeddings.create(input=[text], model=model).data[0].embedding

In [11]:


def _as_row_vector(description):
    """Embed ``description`` and reshape the vector to a single-row 2-D array."""
    return np.array(get_embedding(description)).reshape(1, -1)


# Embed two rows of the course table (positions 0 and 2) plus EECS 281.
emb1 = _as_row_vector(df.iloc[0].description)
emb2 = _as_row_vector(df.iloc[2].description)
emb281 = _as_row_vector(eecs281text)

from sklearn.metrics.pairwise import cosine_similarity

# Print each labelled pair followed by its cosine-similarity matrix.
for pair_label, left_emb, right_emb in (
    ("AAS 103 and AAS 111", emb1, emb2),
    ("AAS 103 and EECS 281", emb1, emb281),
    ("AAS 111 and EECS 281", emb2, emb281),
):
    print(pair_label)
    print(cosine_similarity(left_emb, right_emb))

AAS 103 and AAS 111
[[0.86411986]]
AAS 103 and EECS 281
[[0.72905776]]
AAS 111 and EECS 281
[[0.71395553]]


## Runtimes
Approximate time to generate embeddings, by number of course descriptions embedded:

50: 1 minute

100: 2 minutes

500: 14 minutes

1000: 30 minutes

In [12]:
# First 1000 rows of the course table.
# NOTE(review): presumably the subset used for the timing runs; no visible
# cell reads it -- confirm it is still needed.
test_df = df.iloc[:1000]

In [6]:
def _embed_with_ada(description):
    """Embed one description with the 'text-embedding-ada-002' model."""
    return get_embedding(description, model='text-embedding-ada-002')


# One API request per row; the result is stored as a new column on ``df``
# and the whole frame is written out without the index.
df['ada_embedding'] = df['description'].apply(_embed_with_ada)
df.to_csv('course_embeddings.csv', index=False)

In [None]:
def generateEmbeddingsDataframe(df):
    """Embed every description in ``df`` and save the frame as a pickle.

    Adds an ``ada_embedding`` column to ``df`` in place, calling
    ``get_embedding`` once per row with the ``text-embedding-ada-002`` model,
    then writes the whole frame to ``embeddings.pkl`` in the current working
    directory.

    Args:
        df: DataFrame with a ``description`` column.
    """
    df['ada_embedding'] = df.description.apply(lambda x: get_embedding(x, model='text-embedding-ada-002'))
    # DataFrame has no ``to_pkl`` method; the pandas API is ``to_pickle``,
    # which takes no ``index`` argument (the index is always stored).
    df.to_pickle('embeddings.pkl')