In [1]:
from openai import OpenAI
# Module-level API client used by get_embedding below. No credentials are passed
# explicitly, so the client has to pick them up from its own defaults
# (presumably the OPENAI_API_KEY environment variable — confirm).
client = OpenAI()

def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding of a single piece of text.

    Newlines in ``text`` are replaced by spaces, the text is sent as a
    one-element batch to ``client.embeddings.create`` and the ``embedding``
    attribute of the first returned item is handed back (presumably a list
    of floats — confirm against the client library).

    Args:
        text: The string to embed. Must be a non-empty ``str``.
        model: Name of the embedding model passed to the API.

    Returns:
        The ``embedding`` of the first entry in the response's ``data``.

    Raises:
        TypeError: If ``text`` is not a ``str`` (e.g. the NaN that pandas
            produces for a missing cell).
        ValueError: If ``text`` is the empty string.
    """
    # Fail fast with a clear message instead of an opaque AttributeError on
    # ``.replace`` when a missing value (float NaN / None) slips through.
    if not isinstance(text, str):
        raise TypeError(
            f"get_embedding expects a str, got {type(text).__name__}: {text!r}"
        )
    # The embeddings endpoint rejects empty input; avoid the wasted round trip.
    if text == "":
        raise ValueError("get_embedding requires a non-empty string")

    text = text.replace("\n", " ")
    response = client.embeddings.create(input=[text], model=model)
    return response.data[0].embedding

In [None]:
import numpy as np
import pandas as pd

# Load the translated course table, then embed every English description.
# Each row triggers one get_embedding call; the extra keyword is forwarded
# to get_embedding by Series.apply.
df = pd.read_csv('../preprocessing/translated_course_data_V2.csv')
df['embedding'] = df["course_description_en"].apply(
    get_embedding,
    model='text-embedding-3-small',
)

In [19]:
# Write the frame, including the 'embedding' column, to disk without the index.
df.to_csv(path_or_buf='course_data_with_embeddings.csv', index=False)

In [None]:
import seaborn as sns
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Project the course embeddings of the three most common faculties to 2D with
# t-SNE and draw them as a scatter plot coloured by faculty.

# Turn every stored embedding into a NumPy array so the rows can be stacked.
df["embedding"] = df["embedding"].apply(lambda x: np.array(x))

# Keep the top 3 faculties (faculty = part of course_code before the first "-")
X = df.embedding
faculty = df["course_code"].str.split("-", expand=True)[0]
top_faculties = faculty.value_counts().index[:3]
mask = faculty.isin(top_faculties)
X_masked = np.stack(X[mask].to_numpy())

# t-SNE is stochastic: fix the seed so the figure is reproducible across runs.
# NOTE: newer scikit-learn releases rename `n_iter` to `max_iter`; switch the
# keyword if this call warns or raises in your environment.
reduced = TSNE(
    n_components=2,
    perplexity=40,
    n_iter=5000,
    random_state=42,
).fit_transform(X_masked)

# Faculty labels: np.unique returns the names in sorted order together with the
# integer code of each row. The legend must use these same names, because the
# legend handles follow the sorted integer codes, not the frequency order of
# `top_faculties`.
faculty_names, faculty_labels = np.unique(
    faculty[mask].values, return_inverse=True
)


# Plot
fig, ax = plt.subplots(figsize=(5, 5))
scatter = ax.scatter(
    reduced[:, 0],
    reduced[:, 1],
    c=faculty_labels,
)

# Legend: handle i corresponds to integer code i, i.e. faculty_names[i].
legend1 = ax.legend(
    scatter.legend_elements()[0],
    faculty_names,
    title="Faculty",
)

sns.despine()
ax.grid(alpha=0.2)
ax.set_axisbelow(True)
ax.set_xticks([])
ax.set_yticks([])
ax.set_title("OpenAI embeddings in 2D", fontweight="bold")
ax.set_xlabel("1st component")
ax.set_ylabel("2nd component")

fig.tight_layout()
plt.show()