In [4]:
from openai import AzureOpenAI
import os
from dotenv import load_dotenv

In [5]:
# NOTE(review): os.path.abspath() resolves a relative name against the current
# working directory, so this chdir targets the directory the kernel is already
# in. It only ends up "next to the notebook" when the kernel was started there.
os.chdir(os.path.dirname(os.path.abspath('um_courserec.ipynb')))

# Location of the .env file that holds the Azure OpenAI settings read below.
ENV_PATH = r'C:\Users\hvand\OneDrive - Umich\Documents\atlas\umgpt_recommender\.env'

# Load environment file for secrets. A falsy return value from load_dotenv()
# is treated as "nothing was loaded". Raise directly instead of routing the
# failure through a TypeError and quit(): the problem then shows up as a
# traceback in this cell, and an unrelated TypeError is no longer mistaken
# for a missing file.
if not load_dotenv(ENV_PATH):
    raise RuntimeError(f'Unable to load .env file: {ENV_PATH}')

# Create Azure client. Every setting is read with os.environ[...], so a
# missing variable raises KeyError here rather than failing later in a request.
client = AzureOpenAI(
    api_key=os.environ["OPENAI_API_KEY"],
    api_version=os.environ['OPENAI_API_VERSION'],
    azure_endpoint=os.environ['OPENAI_API_BASE'],
    organization=os.environ['OPENAI_ORGANIZATION_ID']
)

# Send a completion call to generate an answer
print('Sending a test completion job')

response = client.chat.completions.create(
    model=os.environ['OPENAI_MODEL'],
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is 2 + 2?"}
    ],
    temperature=0,
    stop=None)

# Print the text of the first choice in the reply.
print(response.choices[0].message.content)

Sending a test completion job
2 + 2 equals 4.


In [7]:
# System prompt: tells the model to act as a keyword extractor and includes one
# example request/output pair.
system_content = '''
        You are a keyword extraction tool used by a College Course Recommendation System that searches through course descriptions to recommend classes to a student.
        You will output a series of keywords in the specified format based on a students request to help the system filter the dataset to relevant courses. 
        Example:
        Student request: "I am a mathematics student interested in computer science theory. What are some courses I could take?"
        Your output: "computer science, algorithms, theory, data structures, discrete mathematics, computation, computational complexity"
        
        '''

# Sample student request sent as the user message.
query = 'I am a political science major interested in asian studies and specifically china. I also enjoy asian american studies.'

# Conversation payload: the instructions first, then the student's request.
messages = [
    dict(role="system", content=system_content),
    dict(role="user", content=query),
]

print('Initial filter')
gpt_response = client.chat.completions.create(
    messages=messages,
    model=os.environ['OPENAI_MODEL'],
    stop=None,
    temperature=0,
)

Initial filter


In [10]:
# Display the text of the first choice held in gpt_response.
gpt_response.choices[0].message.content

'"political science, asian studies, china, chinese politics, asian american studies, international relations, east asian culture, comparative politics"'

In [3]:
import pandas as pd
# Load the course data; later cells read its 'course' and 'description' columns.
course_data_path = r'C:\Users\hvand\OneDrive - Umich\Documents\atlas\course_similarity\hybrid_course_data.csv'
df = pd.read_csv(course_data_path)

In [4]:
def department_2grams(department):
    """Ask the chat model for the 20 most identifying 2-grams of a department.

    Selects the rows of the module-level ``df`` whose ``course`` value contains
    ``department`` (case-insensitive; rows with a missing ``course`` are
    excluded), formats each as a ``Course <name>: <description>`` line, and
    sends those lines as the user message to the module-level ``client``. The
    system message is a prompt asking for 2-grams with a short explanation of
    each.

    Args:
        department: Text to look for in the ``course`` column, e.g. ``'MATH'``.
            It is handed to ``Series.str.contains`` as the pattern and is also
            inserted into the prompt text.

    Returns:
        The content of the first choice in the model's reply. The same text
        is printed before it is returned.
    """
    dep = df[df['course'].str.contains(department, case=False, na=False)]

    # Build the user message with a single join instead of repeated "+=".
    dep_string = ''.join(
        f"Course {course_name}: {description}\n"
        for course_name, description in zip(dep['course'], dep['description'])
    )

    # The space before "department" keeps the name from being glued to the
    # next word (previously the prompt read e.g. "MATHdepartment").
    message = '''I have a collection of course descriptions from the ''' + department + ''' department at the University of Michigan. 
I would like to identify key themes and areas of focus within this department based on these descriptions. 
To achieve this, could you analyze the text and extract the 20 most important and identifying 2-grams? 
These 2-grams should capture the essence of the subjects taught, methodologies used, or any unique aspects of 
this department's offerings. Please ensure the 2-grams are relevant to the academic context and avoid common phrases that do not add value 
in understanding the department's focus. Additionally, if possible, provide a brief explanation for why each 2-gram was selected, based on 
its relevance or frequency in the course descriptions.'''
    response = client.chat.completions.create(
        model=os.environ['OPENAI_MODEL'],
        messages=[
            {"role": "system", "content": message},
            {"role": "user", "content": dep_string}
        ],
        temperature=0,
        stop=None)

    # Print the reply so it is visible in the cell output, then return it.
    content = response.choices[0].message.content
    print(content)
    return content

In [6]:
# 2-gram summary for courses whose name contains 'MATH'; the call prints the reply and the text is kept in math2g.
math2g = department_2grams('MATH')

1. "Mathematical reasoning" - This 2-gram indicates the department's emphasis on developing students' ability to reason mathematically, which is a critical skill in the field of mathematics.

2. "Conceptual understanding" - This suggests the department's focus on ensuring students understand the underlying concepts in mathematics, not just the ability to perform calculations.

3. "Mathematical modeling" - This indicates the department's focus on applying mathematical concepts to real-world problems, a key aspect of applied mathematics.

4. "Problem-solving capabilities" - This suggests the department's emphasis on developing students' problem-solving skills, a critical skill in mathematics and many other fields.

5. "Rigorous mathematical" - This indicates the department's commitment to rigorous mathematical study, suggesting a high academic standard in their courses.

6. "Theoretical approach" - This suggests that the department values theoretical understanding of mathematics, not jus

In [5]:
# 2-gram summary for courses whose name contains 'PHYSICS'; the call prints the reply and the text is kept in physics2g.
physics2g = department_2grams('PHYSICS')

1. "Physical Concepts": This 2-gram is important as it indicates the foundational knowledge that the department aims to impart to students. It is a recurring theme in many course descriptions, suggesting a strong emphasis on understanding fundamental principles.

2. "Experimental Methods": This 2-gram suggests a strong emphasis on practical, hands-on learning and the application of theoretical knowledge. It is a key aspect of the department's teaching methodology.

3. "Classical Mechanics": This is a key area of focus within the department, with several courses dedicated to exploring the principles of classical mechanics.

4. "Electromagnetic Waves": This 2-gram indicates a significant focus on the study of electromagnetic waves, a fundamental aspect of physics.

5. "Quantum Mechanics": This is a key area of study within the department, indicating a focus on advanced, theoretical physics.

6. "Statistical Foundations": This 2-gram suggests that the department places importance on under

In [7]:
# 2-gram summary for courses whose name contains 'ENGLISH'; the call prints the reply and the text is kept in english2g.
english2g = department_2grams('ENGLISH')

1. "Academic writing skills" - This 2-gram indicates a key focus of the English department, which is to develop students' abilities to write effectively in an academic context. 

2. "Critical thinking" - This phrase suggests that the department emphasizes the development of critical thinking skills, which are essential for analyzing and interpreting literary texts.

3. "Literary texts" - This 2-gram suggests that the study of literature, including various genres and disciplines, is a central part of the department's curriculum.

4. "Social movements" - This phrase indicates that the department incorporates the study of social movements, suggesting a focus on the intersection of literature and society.

5. "Creative writing" - This 2-gram suggests that the department offers courses in creative writing, allowing students to develop their own writing skills in addition to studying existing works.

6. "Community engagement" - This phrase suggests that the department values real-world appli

In [10]:
# Rows whose course name contains PHYSICS (case-insensitive; missing names excluded).
physics = df[df['course'].str.contains('PHYSICS', case=False, na=False)]

# One "Course <name>: <description>" line per selected row, concatenated in order.
physics_lines = []
for _, row in physics.iterrows():
    course_name, description = row['course'], row['description']
    physics_lines.append(f"Course {course_name}: {description}\n")
physics_string = ''.join(physics_lines)

# Instructions for the model: 20 key 2-grams plus a short justification for each.
message = '''I have a collection of course descriptions from the Physics Department at the University of Michigan. 
I would like to identify key themes and areas of focus within this department based on these descriptions. 
To achieve this, could you analyze the text and extract the 20 most important and identifying 2-grams? 
These 2-grams should capture the essence of the subjects taught, methodologies used, or any unique aspects of 
this department's offerings. Please ensure the 2-grams are relevant to the academic context and avoid common phrases that do not add value 
in understanding the department's focus. Additionally, if possible, provide a brief explanation for why each 2-gram was selected, based on 
its relevance or frequency in the course descriptions.'''

# The instructions go in the system message, the course text in the user message.
response = client.chat.completions.create(
    messages=[
        dict(role="system", content=message),
        dict(role="user", content=physics_string),
    ],
    model=os.environ['OPENAI_MODEL'],
    stop=None,
    temperature=0,
)

# Show the text of the first choice.
print(response.choices[0].message.content)

1. "Experimental methods": This 2-gram is frequently mentioned, indicating a strong emphasis on hands-on learning and practical application of theories in the department's courses.
2. "Physical principles": This phrase suggests a focus on understanding the fundamental laws and concepts that govern the physical world.
3. "Classical mechanics": This is a key area of study in physics, dealing with the motion of macroscopic objects.
4. "Electromagnetic waves": This topic is a fundamental part of physics, dealing with light and other forms of electromagnetic radiation.
5. "Quantum mechanics": This is a key area of modern physics that deals with phenomena on a very small scale, such as atoms and subatomic particles.
6. "Statistical physics": This suggests a focus on the statistical nature of particles in a system, a key concept in thermodynamics and quantum mechanics.
7. "Modern physics": This phrase indicates a focus on 20th-century developments in physics, including quantum mechanics and r

# Trigrams

In [9]:
# Rows whose course name contains PHYSICS (case-insensitive; missing names excluded).
physics = df[df['course'].str.contains('PHYSICS', case=False, na=False)]

# One "Course <name>: <description>" line per selected row, concatenated in order.
physics_lines = []
for _, row in physics.iterrows():
    course_name, description = row['course'], row['description']
    physics_lines.append(f"Course {course_name}: {description}\n")
physics_string = ''.join(physics_lines)

# Instructions for the model: 20 key 3-grams (with examples of what a 3-gram is)
# plus a short justification for each.
message = '''I have a collection of course descriptions from the Physics Department at the University of Michigan. 
I would like to identify key themes and areas of focus within this department based on these descriptions. 
To achieve this, could you analyze the text and extract the 20 most important and identifying 3-grams? A 3-gram is a sequence of three words. 
Examples: 'quantum field theory', 'rigid body dynamics','solid state physics'.
These 3-grams should capture the essence of the subjects taught, methodologies used, or any unique aspects of 
this department's offerings. Please ensure the 3-grams are relevant to the academic context and avoid common phrases that do not add value 
in understanding the department's focus. Additionally, if possible, provide a brief explanation for why each 3-gram was selected, based on 
its relevance or frequency in the course descriptions.'''

# The instructions go in the system message, the course text in the user message.
response = client.chat.completions.create(
    messages=[
        dict(role="system", content=message),
        dict(role="user", content=physics_string),
    ],
    model=os.environ['OPENAI_MODEL'],
    stop=None,
    temperature=0,
)

# Show the text of the first choice.
print(response.choices[0].message.content)

1. 'experimental methods in science': This 3-gram is repeated in multiple course descriptions, indicating a strong emphasis on hands-on learning and practical application of theories.
2. 'principles of classical mechanics': This phrase is used in several courses, suggesting a foundational focus on classical mechanics in the department's curriculum.
3. 'physics of life': This phrase appears in multiple courses, indicating a unique interdisciplinary approach combining physics and life sciences.
4. 'understanding the principles of': This phrase is frequently used, indicating a focus on fundamental principles in various areas of physics.
5. 'physics of the universe': This phrase suggests a focus on astrophysics and cosmology in the department's offerings.
6. 'introduction to quantum mechanics': This phrase is used in several courses, indicating a strong focus on quantum mechanics in the curriculum.
7. 'based on theory': This phrase is used in multiple courses, indicating a strong emphasis 

In [4]:
def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding of ``text`` from the module-level ``client``.

    Newlines in ``text`` are replaced with spaces, the result is sent as a
    one-element ``input`` list, and the ``embedding`` of the first item in
    the response's ``data`` is returned.

    Args:
        text: String to embed; it must support ``.replace``.
        model: Value passed as ``model`` to ``client.embeddings.create``.
            NOTE(review): the recorded run of this notebook failed with
            ``DeploymentNotFound``; with the Azure client this value is
            presumably resolved as a deployment name. Confirm that a
            deployment with this name exists on the resource.

    Returns:
        The ``embedding`` attribute of the first entry in the response data.
    """
    flattened = text.replace("\n", " ")
    result = client.embeddings.create(input=[flattened], model=model)
    first_item = result.data[0]
    return first_item.embedding

# DataFrame.to_csv() does not create missing parent directories, so make sure
# 'output/' exists before spending API calls on the embeddings.
os.makedirs('output', exist_ok=True)

# Embed every description (get_embedding is called once per row) and store the
# vectors in a new column.
df['ada_embedding'] = df['description'].apply(
    lambda x: get_embedding(x, model='text-embedding-3-small')
)

# Persist the frame, including the new column, without the index.
df.to_csv('output/embedded_df', index=False)

NotFoundError: Error code: 404 - {'error': {'code': 'DeploymentNotFound', 'message': 'The API deployment for this resource does not exist. If you created the deployment within the last 5 minutes, please wait a moment and try again.'}, 'backend-counter': '5'}