## Mount Drive

In [None]:
# Colab-only helpers: `drive` mounts Google Drive; `files` is used in the last cell to download the plot.
from google.colab import drive, files
# Mount Google Drive at /content/drive so the project folder is reachable from this runtime.
drive.mount('/content/drive')

Mounted at /content/drive


## Change Working Directory to "course_selection" Folder

In [None]:
# Work from the project folder on the mounted Drive: csv_name is read relative to it, and
# the course_names / filter_courses imports are presumably modules stored there (confirm).
%cd "/content/drive/MyDrive/course_selection"

/content/drive/MyDrive/course_selection


## Install/Import Modules

In [None]:
!pip install openai

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting openai
  Downloading openai-0.26.1.tar.gz (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.3/55.3 KB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: openai
  Building wheel for openai (pyproject.toml) ... [?25l[?25hdone
  Created wheel for openai: filename=openai-0.26.1-py3-none-any.whl size=67316 sha256=3365392079c8b58204ded1e78c012b7b88b3c6c89731db36d9c75a856ec9d070
  Stored in directory: /root/.cache/pip/wheels/2f/9c/55/95d3609ccfc463eeffb96d50c756f1f1899453b85e92021a0a
Successfully built openai
Installing collected packages: openai
Successfully installed openai-0.26.1


In [None]:
import argparse
import colorsys
import openai
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import textwrap
import os
from course_names import *
from filter_courses import *
from matplotlib import colors as mcolors
from sklearn.manifold import TSNE
from openai.embeddings_utils import get_embedding, cosine_similarity

## Variable Definitions

In [None]:
# Colab form fields (the `#@param` annotations render each assignment as an input widget).
# mode: "all" plots every course; "search" plots the best matches for search_query.
mode = "search" #@param ["all", "search"]

# Leave the key blank here and provide it through the OPENAI_API_KEY environment
# variable instead, so it is never saved inside the notebook.
openai.api_key = "" #@param {type:"string"}
search_query = "logic" #@param {type:"string"}
number_results = 50 #@param {type:"integer"}
csv_name = "mit_courses_with_TSNE_embeddings.csv" #@param {type:"string"}
filter_grad_courses = True #@param ["True", "False"] {type:"raw"}

# A blank form field would otherwise overwrite a key supplied through the
# environment with an empty string, so fall back to the environment variable.
if not openai.api_key:
    openai.api_key = os.environ.get("OPENAI_API_KEY", "")




In [None]:
#@title Obtain TSNE Matrix (Run this once)
# Load the course table chosen in the form cell; the rest of this cell adds display columns to it.
df = pd.read_csv(csv_name)
def feature_matrix(df):
    """Build a dense 2-D float array from the stringified embeddings in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs an ``embedding_combined`` column whose cells are string literals
        of numeric lists, e.g. ``"[0.1, -0.2]"``. Every row must have the same
        length as the first one.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(df), embedding_length)``, or an empty ``(0, 0)``
        array when ``df`` has no rows.

    Raises
    ------
    ValueError
        If a row's length differs from the first row's, or a row holds values
        that cannot be stored as floats. The message names the offending row.
    """
    # Local import: this helper is the only user, and literal_eval parses list
    # literals without executing arbitrary code the way eval() would.
    import ast

    print("Extracting Embedding Feature Matrix...")
    rows = df.embedding_combined.apply(ast.literal_eval).to_list()
    if not rows:
        return np.zeros((0, 0))

    width = len(rows[0])
    matrix = np.zeros((len(rows), width))
    for i, row in enumerate(rows):
        # Check the length explicitly: NumPy would silently broadcast a
        # length-1 row across the whole matrix row instead of failing.
        if len(row) != width:
            raise ValueError(
                "embedding in row {} has length {}, expected {}".format(i, len(row), width)
            )
        try:
            matrix[i, :] = np.array(row)
        except (TypeError, ValueError) as e:
            # Raise instead of calling exit(), which would tear down the notebook kernel.
            raise ValueError("embedding in row {} is not numeric: {}".format(i, e)) from e
    return matrix

def toarray(x):
    """Parse a stringified flat list of numbers into a list of floats.

    Parameters
    ----------
    x : str
        Text such as ``"[0.1, -0.2, 3]"``. Surrounding whitespace (for example a
        trailing newline) and the enclosing square brackets are ignored.

    Returns
    -------
    list of float
        The parsed values, or ``[]`` when the list is empty (``"[]"``).

    Raises
    ------
    ValueError
        If any comma-separated item is not a valid float literal.
    """
    body = x.strip().strip('[').strip(']').strip()
    if not body:
        # "[]" would otherwise reach float('') and raise.
        return []
    # float() tolerates whitespace around each item, so no per-item strip is needed.
    return [float(token) for token in body.split(',')]

def search_courses(df, search_query, n):
    """Return the course numbers of the ``n`` rows most similar to ``search_query``.

    The query is embedded with the ``text-embedding-ada-002`` engine. Each row's
    cosine similarity to the query is written to ``df["similarity"]`` (the frame
    passed in is modified), and the ``course_number`` values of the ``n``
    highest-scoring rows are returned as a list, best match first.
    """
    query_vector = np.asarray(
        get_embedding(search_query, engine="text-embedding-ada-002"),
        dtype='float64',
    )

    def score(raw_embedding):
        # Each cell holds the embedding as text; parse it before comparing.
        course_vector = np.asarray(toarray(raw_embedding), dtype='float64')
        return cosine_similarity(course_vector, query_vector)

    df["similarity"] = df.embedding_combined.apply(score)
    ranked = df.sort_values("similarity", ascending=False)
    return list(ranked.head(n).course_number)

# Hover text: wrap each description and join the wrapped lines with HTML line breaks.
df['description'] = [
    "<br>".join(textwrap.wrap(text)) for text in df.desc
]

# The department prefix is everything before the first '.' of the course number.
_dept_ids = [number.split('.')[0] for number in df.course_number]
# Legend label: the prefix followed by the name looked up in course_names.
df['course'] = [
    "{} ({})".format(dept, course_names[dept]) for dept in _dept_ids
]
df['course_id'] = _dept_ids

# Named CSS4 colours; get_hsv below reads the values as hex strings.
css4_colors = mcolors.CSS4_COLORS
def sort_course(x):
    """Sort key for course labels such as ``"6 (...)"`` or ``"21A (...)"``.

    Only the first whitespace-separated token of ``x`` is inspected.

    Parameters
    ----------
    x : str
        Course label; an empty or blank string is accepted.

    Returns
    -------
    tuple
        ``(number, letters)`` where ``number`` is the integer formed by the
        token's digits (``float('inf')`` when it has none, so letter-only
        labels sort last) and ``letters`` is the token's alphabetic characters.
    """
    parts = x.split()
    # Guard against empty/blank labels instead of failing on parts[0].
    token = parts[0] if parts else ''
    digits = ''.join(filter(str.isdigit, token))
    try:
        number = int(digits)
    except ValueError:
        # No usable digits in the token.
        number = float('inf')
    letters = ''.join(filter(str.isalpha, token))
    return number, letters

# Distinct department labels ordered by sort_course: numbered departments
# ascending, then the labels whose first token has no digits.
all_courses = sorted(df.course.unique(), key=sort_course)
def get_hsv(color_name):
    """Return the ``(h, s, v)`` tuple for a colour name found in ``css4_colors``.

    The colour's hex string is split into its red, green and blue byte pairs,
    each scaled to the 0-1 range, and converted with ``colorsys.rgb_to_hsv``.
    """
    hex_digits = css4_colors[color_name].lstrip("#")  # drop the leading '#', if any
    red = int(hex_digits[0:2], 16) / 255.0
    green = int(hex_digits[2:4], 16) / 255.0
    blue = int(hex_digits[4:6], 16) / 255.0
    return colorsys.rgb_to_hsv(red, green, blue)

# Draw one distinct CSS4 colour name per department. The generator is seeded so the
# department -> colour assignment is the same after every kernel restart.
_color_rng = np.random.default_rng(42)
colors = _color_rng.choice(list(css4_colors.keys()), len(all_courses), replace=False)
# Order the drawn names by their (hue, saturation, value) tuple.
colors = sorted(colors, key=get_hsv)
# Department label -> colour name, paired in all_courses order (used as plotly's color_discrete_map).
dm = dict(zip(all_courses, colors))
# Per-row hex colour, looked up through the mapping instead of a linear
# all_courses.index() scan for every row.
df['color'] = [css4_colors[dm[course]] for course in df.course]

# The t-SNE projection is only computed when the raw catalogue export is loaded.
if csv_name == "mit_course_catalog_with_embeddings.csv":
  # NOTE(review): eval() executes whatever text the CSV cell contains; only run this on trusted files.
  df['embedding'] = df.embedding_combined.apply(eval).apply(np.array)
  print("Evaluating TSNE on Dataset...")
  matrix = feature_matrix(df)
  tsne = TSNE(
      n_components=2,
      perplexity=15,
      random_state=42,
      init='random',
      learning_rate=200,
  )
  vis_dims = tsne.fit_transform(matrix)
  # Each projected point is an (x, y) pair; store the two coordinates as plot columns.
  df['x'] = [point[0] for point in vis_dims]
  df['y'] = [point[1] for point in vis_dims]
  # Write the enriched table once; an existing file is never overwritten.
  tsne_csv = "mit_courses_with_TSNE_embeddings.csv"
  if not os.path.exists(tsne_csv):
    df.to_csv(tsne_csv)

In [None]:
#@title Main Code
# Columns shown in the hover tooltip of every plot.
_HOVER_COLUMNS = ['course_number', 'title', 'description', 'prereq']


def _scatter_courses(frame, title):
    """Scatter ``frame``'s x/y coordinates, coloured by department, with the shared styling."""
    return px.scatter(
        frame,
        x='x',
        y='y',
        color='course',
        hover_data=_HOVER_COLUMNS,
        color_discrete_map=dm,
        category_orders={'course': all_courses},
        template="plotly_dark",
        title=title,
    )


if mode == "all":
    if filter_grad_courses:
        df_filter = filter_grad_courses_over_12_units(df)
        fig = _scatter_courses(df_filter, "MIT Grad Courses over 12 Units")
    else:
        fig = _scatter_courses(df, "MIT Courses Grouped by Title and Description Similarity")
elif mode == "search":
    print("Searching for classes...")
    course_numbers = list(search_courses(df, search_query, number_results))
    print(course_numbers)
    df_filter = filter_search(df, course_numbers)
    if filter_grad_courses:
        # The grad filter assigns a column on the frame it receives. Passing an
        # explicit copy of the search result avoids pandas' SettingWithCopyWarning.
        df_filter = filter_grad_courses_over_12_units(df_filter.copy())
    fig = _scatter_courses(df_filter, "MIT Course Search Results for \"{}\"".format(search_query))
else:
    # Fail here rather than leaving `fig` undefined (or stale from an earlier run)
    # for the save cell below.
    raise ValueError("mode must be \"all\" or \"search\", got {!r}".format(mode))

fig.show()


Searching for classes...
['24.242', '24.244', '24.241', '24.711', '18.515', '18.510', '6.2050', '6.5120', '18.504', '24.973', '24.280', '9.49', '2.110', '6.120A', '7.50', '9.490', '6.4800', '9.66[J]', '16.858', '6.1910', '18.090', '6.1920', '4.542', '8.593[J]', '16.413[J]', '9.53', '24.902', '24.954', '16.410[J]', '11.238[J]', '14.15[J]', 'MAS.862', '24.932', '8.591[J]', '4.521', '6.4110', '9.530', '20.129[J]', '6.5150', '24.952', '9.660', '6.6010', '6.5151', '6.4102', '24.243', '6.7240', '1.041[J]', '6.8630[J]', '7.32', '17.055']
(50, 22)
(50, 22)
(19, 24)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['grad'] = [d.strip('[\'').split(' ')[0] == 'G' for d in list(df.terms)]


In [None]:
#@title Save the Plot as an HTML File
# A path separator in the query would make the file name point into a directory
# that does not exist, so replace separators before building the name.
safe_query = search_query.replace("/", "_").replace("\\", "_")
# NOTE(review): the "MIT-Professors-" prefix looks inherited from another notebook
# (this one plots courses) — confirm before renaming, since it changes the download name.
file_out = f"MIT-Professors-{safe_query}"
fig.write_html(f"{file_out}.html")  # standalone, interactive HTML copy of the figure
files.download(f"{file_out}.html")  # hand the written file to the browser as a download

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>