# Formatting df

In [1]:
import pandas as pd

In [48]:
df = pd.read_csv(r'C:\Users\hvand\Documents\UM-CAI-Fellowship\course_similarity\rawData\course_cachedcourse_02192025.csv')
df

Unnamed: 0,course,title,search,data
0,AAS 103,First Year Social Science Seminar,This seminar introduces first-year students to...,"{""key"": ""AAS 103"", ""code"": ""AAS 103"", ""all_cou..."
1,AAS 104,First Year Humanities Seminar,This seminar introduces first-year students to...,"{""key"": ""AAS 104"", ""code"": ""AAS 104"", ""all_cou..."
2,AAS 111,Introduction to Africa and Its Diaspora,Introduces basic concepts and methods involved...,"{""key"": ""AAS 111"", ""code"": ""AAS 111"", ""all_cou..."
3,AAS 115,Elementary Swahili I,This introductory-level course is designed for...,"{""key"": ""AAS 115"", ""code"": ""AAS 115"", ""all_cou..."
4,AAS 116,Elementary Swahili II: Language and Culture,This introductory-level course is designed for...,"{""key"": ""AAS 116"", ""code"": ""AAS 116"", ""all_cou..."
...,...,...,...,...
13572,YIDDISH 431,Beginning Yiddish 1 for Graduate Students,This is the first of a two-term sequence desig...,"{""key"": ""YIDDISH 431"", ""code"": ""YIDDISH 431"", ..."
13573,YIDDISH 432,Beginning Yiddish 2 for Graduate Students,This is the second of a two-term sequence desi...,"{""key"": ""YIDDISH 432"", ""code"": ""YIDDISH 432"", ..."
13574,YIDDISH 531,Intermediate Yiddish 1 for Graduate Students,This is the third term of a language sequence ...,"{""key"": ""YIDDISH 531"", ""code"": ""YIDDISH 531"", ..."
13575,YIDDISH 532,Intermediate Yiddish 2 for Graduate Students,This is the fourth term of a language sequence...,"{""key"": ""YIDDISH 532"", ""code"": ""YIDDISH 532"", ..."


In [49]:
df = df.rename(columns={'search': 'description'})
df = df.drop('data', axis=1)
df = df.dropna(subset=['description'])
df

Unnamed: 0,course,title,description
0,AAS 103,First Year Social Science Seminar,This seminar introduces first-year students to...
1,AAS 104,First Year Humanities Seminar,This seminar introduces first-year students to...
2,AAS 111,Introduction to Africa and Its Diaspora,Introduces basic concepts and methods involved...
3,AAS 115,Elementary Swahili I,This introductory-level course is designed for...
4,AAS 116,Elementary Swahili II: Language and Culture,This introductory-level course is designed for...
...,...,...,...
13572,YIDDISH 431,Beginning Yiddish 1 for Graduate Students,This is the first of a two-term sequence desig...
13573,YIDDISH 432,Beginning Yiddish 2 for Graduate Students,This is the second of a two-term sequence desi...
13574,YIDDISH 531,Intermediate Yiddish 1 for Graduate Students,This is the third term of a language sequence ...
13575,YIDDISH 532,Intermediate Yiddish 2 for Graduate Students,This is the fourth term of a language sequence...


# Test connection to OpenAI API

In [40]:
from openai import AzureOpenAI
import os
from dotenv import load_dotenv

#Sets the current working directory to be the same as the file.
os.chdir(os.path.dirname(os.path.abspath('embeddings.ipynb')))

#Load environment file for secrets.
try:
    if load_dotenv('.env') is False:
        raise TypeError
except TypeError:
    print('Unable to load .env file.')
    quit()
#Create Azure client
client = AzureOpenAI(
            api_key=os.environ["OPENAI_API_KEY"],
            api_version=os.environ['OPENAI_API_VERSION'],
            azure_endpoint=os.environ['OPENAI_API_BASE'],
            organization=os.environ['OPENAI_ORGANIZATION_ID']
        )

# Send a completion call to generate an answer
print('Sending a test completion job')

response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "What is 2 + 2?"}
        ],
        temperature=0,
        stop=None)

#Print response.
print(response.choices[0].message.content)

Sending a test completion job
2 + 2 equals 4.


# Generate embeddings

In [41]:
import numpy as np
def get_embedding(text, model=os.environ['OPENAI_EMBEDDING_MODEL']):
   text = text.replace("\n", " ")
   return client.embeddings.create(input = [text], model=model).data[0].embedding

def generateEmbeddingsDataframe(df, name):
    try:
        df['embedding'] = df.description.apply(lambda x: get_embedding(x, model='text-embedding-ada-002'))
        df.to_pickle(f'{name}.pkl')
    except Exception as e:
        print(f"Error generating embeddings: {str(e)}")
        raise

In [51]:
generateEmbeddingsDataframe(df, "embeddings")

# Random tests

In [10]:
eecs281text = df[df['course'] == 'EECS 281'].description.values[0]

In [None]:
emb1 = np.array(get_embedding(df.iloc[0].description)).reshape(1, -1)

emb2 = np.array(get_embedding(df.iloc[2].description)).reshape(1, -1)

emb281 = np.array(get_embedding(eecs281text)).reshape(1,-1)

from sklearn.metrics.pairwise import cosine_similarity
print("AAS 103 and AAS 111")
print(cosine_similarity(emb1, emb2))
print("AAS 103 and EECS 281")
print(cosine_similarity(emb1, emb281))
print("AAS 111 and EECS 281")
print(cosine_similarity(emb2, emb281))

AAS 103 and AAS 111
[[0.86411986]]
AAS 103 and EECS 281
[[0.72905776]]
AAS 111 and EECS 281
[[0.71395553]]


## Runtimes
50: 1 minute

100: 2 minutes

500: 14 minutes

1000: 30 minutes

In [12]:
test_df = df.head(1000)

In [6]:
df['ada_embedding'] = df.description.apply(lambda x: get_embedding(x, model='text-embedding-ada-002'))
df.to_csv('course_embeddings.csv', index=False)

# Dealing with cross listed classes

In [2]:
df = pd.read_pickle(r"C:\Users\hvand\Documents\UM-CAI-Fellowship\recommender\embeddings.pkl")
df

Unnamed: 0,course,title,description,embedding,level
0,AAS 103,First Year Social Science Seminar,This seminar introduces first-year students to...,"[0.008470812812447548, -0.020093858242034912, ...",100
1,AAS 104,First Year Humanities Seminar,This seminar introduces first-year students to...,"[0.015089535154402256, -0.02548396773636341, 0...",100
2,AAS 111,Introduction to Africa and Its Diaspora,Introduces basic concepts and methods involved...,"[-0.014231206849217415, -0.0060482630506157875...",100
3,AAS 115,Elementary Swahili I,This introductory-level course is designed for...,"[-0.004489186219871044, 0.010212578810751438, ...",100
4,AAS 116,Elementary Swahili II: Language and Culture,This introductory-level course is designed for...,"[0.0031663388945162296, 0.003630136139690876, ...",100
...,...,...,...,...,...
13572,YIDDISH 431,Beginning Yiddish 1 for Graduate Students,This is the first of a two-term sequence desig...,"[-0.021598465740680695, -0.0013895833399146795...",400
13573,YIDDISH 432,Beginning Yiddish 2 for Graduate Students,This is the second of a two-term sequence desi...,"[-0.02322854846715927, 0.002594573888927698, 0...",400
13574,YIDDISH 531,Intermediate Yiddish 1 for Graduate Students,This is the third term of a language sequence ...,"[-0.024370338767766953, -0.0012678070925176144...",500
13575,YIDDISH 532,Intermediate Yiddish 2 for Graduate Students,This is the fourth term of a language sequence...,"[-0.022884434089064598, -0.0027570633683353662...",500


In [11]:
description = df.loc[df['course'] == "AAS 115", 'description'].values[0]
description

'This introductory-level course is designed for students with little or no previous study of Swahili (Kiswahili). Students develop their ability to communicate satisfactorily in Swahili in everyday practical situations as well as acquire some of the skills necessary for effective reading and writing. Using a variety of written and oral materials, the course focuses on the development of the four language skills necessary for interpersonal communication in Swahili: listening, writing, reading, and speaking.'

In [13]:
import pandas as pd
from collections import defaultdict

def find_cross_listed_courses(df):
    # Create a dictionary to store descriptions and their corresponding courses
    description_to_courses = defaultdict(list)
    
    # Group courses by their descriptions
    for _, row in df.iterrows():
        description_to_courses[row['description']].append(row['course'])
    
    # Filter out descriptions with more than one course (cross-listed)
    cross_listed = {desc: courses for desc, courses in description_to_courses.items() if len(courses) > 1}
    
    # Count total number of cross-listed course sets
    total_cross_listed_sets = len(cross_listed)
    
    # Find the course that appears most frequently in cross-listings
    course_cross_list_count = defaultdict(int)
    for courses in cross_listed.values():
        for course in courses:
            course_cross_list_count[course] += 1
    
    # Find the most cross-listed course
    most_cross_listed_course = max(course_cross_list_count, key=course_cross_list_count.get)
    
    print(f"Total number of cross-listed course sets: {total_cross_listed_sets}")
    print(f"Most cross-listed course: {most_cross_listed_course}")
    print(f"Number of cross-listings for {most_cross_listed_course}: {course_cross_list_count[most_cross_listed_course]}")
    
    return cross_listed

def clean_cross_listed_courses(df):
    # Find cross-listed courses
    cross_listed = find_cross_listed_courses(df)
    
    # Create a new DataFrame to store cleaned data
    cleaned_df = df.copy()
    
    # List to keep track of courses to drop
    courses_to_drop = []
    
    # Process each set of cross-listed courses
    for description, courses in cross_listed.items():
        # Sort courses alphabetically to choose the base course
        sorted_courses = sorted(courses)
        base_course = sorted_courses[0]
        
        # Modify the base course entry
        base_course_index = df[df['course'] == base_course].index[0]
        
        # Create cross-listing string for other courses
        cross_list_string = f"{base_course} (Cross-listed as {', '.join(sorted_courses[1:])})"
        cleaned_df.at[base_course_index, 'course'] = cross_list_string
        
        # Mark other courses for removal
        courses_to_drop.extend(sorted_courses[1:])
    
    # Remove duplicate courses
    cleaned_df = cleaned_df[~cleaned_df['course'].isin(courses_to_drop)]
    
    return cleaned_df

In [14]:
cl = find_cross_listed_courses(df)

Total number of cross-listed course sets: 2057
Most cross-listed course: AAS 208
Number of cross-listings for AAS 208: 1


In [16]:
cleaned_df = clean_cross_listed_courses(df)
cleaned_df

Total number of cross-listed course sets: 2057
Most cross-listed course: AAS 208
Number of cross-listings for AAS 208: 1


Unnamed: 0,course,title,description,embedding,level
0,AAS 103,First Year Social Science Seminar,This seminar introduces first-year students to...,"[0.008470812812447548, -0.020093858242034912, ...",100
1,AAS 104,First Year Humanities Seminar,This seminar introduces first-year students to...,"[0.015089535154402256, -0.02548396773636341, 0...",100
2,AAS 111,Introduction to Africa and Its Diaspora,Introduces basic concepts and methods involved...,"[-0.014231206849217415, -0.0060482630506157875...",100
3,AAS 115,Elementary Swahili I,This introductory-level course is designed for...,"[-0.004489186219871044, 0.010212578810751438, ...",100
4,AAS 116,Elementary Swahili II: Language and Culture,This introductory-level course is designed for...,"[0.0031663388945162296, 0.003630136139690876, ...",100
...,...,...,...,...,...
13563,WRITING 410,Quantitative Analysis and Writing in the Disci...,"In various disciplinary iterations, this cours...","[0.00869917031377554, 0.018616488203406334, 0....",400
13564,WRITING 420,Minor in Writing Capstone,"In this course, Minor in Writing students prod...","[0.0156615749001503, -0.016281183809041977, -0...",400
13565,WRITING 630,Advanced Writing for Graduate Students,This advanced writing course for graduate stud...,"[0.01660611853003502, 0.006102879531681538, -0...",600
13566,WRITING 631,Dissertation Writing,This course helps doctoral students make subst...,"[-0.0005177850252948701, 0.005452962126582861,...",600


In [18]:
cleaned_df.to_pickle("embeddings.pkl")