## Mount Drive

In [1]:
from google.colab import drive, files
# Mount Google Drive at /content/drive so the notebook can reach files stored there.
drive.mount('/content/drive')

Mounted at /content/drive


## Change Working Directory to "course_selection" Folder

In [2]:
# Switch into the project folder on Drive; the relative CSV names used later resolve against it.
%cd "/content/drive/MyDrive/course_selection"

/content/drive/MyDrive/course_selection


## Install/Import Modules

In [3]:
!pip install openai

Installing collected packages: openai
Successfully installed openai-0.26.4


In [4]:
import argparse
import colorsys
import openai
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import textwrap
import os
from course_names import *
from filter_courses import *
from matplotlib import colors as mcolors
from sklearn.manifold import TSNE
from openai.embeddings_utils import get_embedding, cosine_similarity

## Variable Definitions

In [5]:
mode = "search" #@param ["all", "search"]

openai.api_key = "" #@param {type:"string"}
search_query = "robotics" #@param {type:"string"}
number_results = 50 #@param {type:"integer"}
csv_name = "mit_courses_with_TSNE_embeddings.csv" #@param {type:"string"}
filter_grad_courses = True #@param ["True", "False"] {type:"raw"}

# A blank key field falls back to the OPENAI_API_KEY environment variable, so the
# secret does not have to be typed into (and saved with) the notebook. When neither
# is set the key stays an empty string, exactly as before.
if not openai.api_key:
    openai.api_key = os.environ.get("OPENAI_API_KEY", "")




To obtain a key, first create an OpenAI account, then go to `https://beta.openai.com/account/api-keys` and create a new API key.

In [6]:
#@title Obtain TSNE Matrix (Run this once)
# Load the course table selected by `csv_name`; the rest of this cell reads its
# `embedding_combined`, `desc` and `course_number` columns.
df = pd.read_csv(csv_name)
def feature_matrix(df):
    """Stack the stringified embeddings in ``df.embedding_combined`` into a 2-D array.

    Args:
        df: DataFrame whose ``embedding_combined`` column holds the text form of
            a flat list of numbers (e.g. ``"[0.1, -0.2, ...]"``), one per row.

    Returns:
        A float ``numpy.ndarray`` of shape ``(len(df), embedding_length)``, where
        the length is taken from the first row. An empty frame yields an array of
        shape ``(0, 0)``.

    Raises:
        ValueError: if a row's length differs from the first row's, or its
            contents cannot be stored as floats. The message names the row index.
    """
    import ast  # local import: only this function needs it

    print("Extracting Embedding Feature Matrix...")
    # literal_eval only accepts Python literals, so a malformed or malicious CSV
    # cell cannot execute code the way a bare eval() would.
    rows = df.embedding_combined.apply(ast.literal_eval).to_list()
    if not rows:
        return np.zeros((0, 0))

    width = len(rows[0])
    matrix = np.zeros((len(rows), width))
    for i, row in enumerate(rows):
        # Check the length explicitly: NumPy would silently broadcast a
        # one-element row across the whole matrix row.
        if len(row) != width:
            raise ValueError(
                "embedding at row {} has length {}, expected {}".format(i, len(row), width)
            )
        try:
            matrix[i, :] = np.array(row)
        except Exception as e:
            # Raise instead of calling exit(), which would shut down the kernel.
            raise ValueError(
                "could not convert embedding at row {}: {!r}".format(i, repr(row)[:200])
            ) from e
    return matrix

def toarray(x):
    """Parse the text form of a flat numeric list into a list of floats.

    Args:
        x: String such as ``"[0.1, -2, 3e-4]"``. Surrounding whitespace
            (including a trailing newline) and the enclosing brackets are ignored.

    Returns:
        The parsed values as a list of ``float``; ``[]`` when the list is empty.

    Raises:
        ValueError: if any comma-separated item is not a valid float.
    """
    # Trim whitespace first so the brackets are really at the ends of the string.
    inner = x.strip().strip('[]').strip()
    if not inner:
        # "[]" would otherwise split into [''] and fail in float('').
        return []
    return [float(token) for token in inner.split(',')]

def search_courses(df, search_query, n):
    """Return the course numbers whose embeddings are closest to a text query.

    The query is embedded with the "text-embedding-ada-002" engine and compared
    by cosine similarity against every row's ``embedding_combined`` string.

    Args:
        df: Course table; a ``similarity`` column is added to (or overwritten
            on) this frame in place.
        search_query: Free-text query to embed.
        n: Number of top-scoring rows to keep.

    Returns:
        List of ``course_number`` values for the ``n`` most similar rows, best
        match first.
    """
    query_vector = np.asarray(
        get_embedding(search_query, engine="text-embedding-ada-002"),
        dtype='float64',
    )

    def _score(raw_embedding):
        # Each stored embedding is a string and must be parsed before comparing.
        course_vector = np.asarray(toarray(raw_embedding), dtype='float64')
        return cosine_similarity(course_vector, query_vector)

    df["similarity"] = df.embedding_combined.apply(_score)
    ranked = df.sort_values("similarity", ascending=False)
    return list(ranked.head(n).course_number)

# Display columns derived from the raw table:
#   description - `desc` re-wrapped with <br> line breaks
#   course      - "<prefix> (<name>)", where the prefix is the part of course_number
#                 before the first '.' and the name is looked up in `course_names`
#   course_id   - the bare prefix
df['description'] = ["<br>".join(textwrap.wrap(text)) for text in df.desc]
df['course'] = [
    "{} ({})".format(prefix, course_names[prefix])
    for prefix in (number.split('.')[0] for number in df.course_number)
]
df['course_id'] = [number.split('.')[0] for number in df.course_number]

# Lookup of matplotlib's CSS4 named colours (colour name -> hex string).
css4_colors = mcolors.CSS4_COLORS
def sort_course(x):
    """Sort key for course labels such as ``"6 (...)"`` or ``"MAS (...)"``.

    Only the first whitespace-separated token of the label is inspected.

    Args:
        x: Course label string.

    Returns:
        ``(number, letters)`` where ``number`` is the integer formed by the
        token's digits (``float('inf')`` when it has none, so such labels sort
        last) and ``letters`` is the token's alphabetic characters in order.
        An empty or whitespace-only label yields ``(float('inf'), '')``.
    """
    tokens = x.split()
    if not tokens:
        # No code token at all: sort after every real course.
        return float('inf'), ''
    code = tokens[0]
    try:
        number = int(''.join(filter(str.isdigit, code)))
    except ValueError:
        # No digits in the code (e.g. "MAS").
        number = float('inf')
    letters = ''.join(filter(str.isalpha, code))
    return number, letters

# Distinct course labels, ordered with the `sort_course` key.
all_courses = sorted(df.course.unique(), key=sort_course)
def get_hsv(color_name):
    """Return the ``(h, s, v)`` tuple for a colour name found in ``css4_colors``.

    The colour's hex string is split into its three two-digit channels, each
    scaled to the 0-1 range, and converted with ``colorsys.rgb_to_hsv``.
    """
    hex_digits = css4_colors[color_name].lstrip("#")  # tolerate a leading '#'
    channels = [int(hex_digits[pos:pos + 2], 16) / 255.0 for pos in (0, 2, 4)]
    return colorsys.rgb_to_hsv(*channels)

# Pick one distinct CSS4 colour name per course label. The draw is seeded so the
# same course gets the same colour on every run; previously the palette changed
# each time the cell was executed. Sampling without replacement raises
# ValueError if there are more course labels than named colours.
colors = list(css4_colors.keys())
colors = np.random.default_rng(42).choice(colors, len(all_courses), replace=False)
colors = sorted(colors, key=get_hsv)  # order by hue so neighbouring courses get related colours

# dm: course label -> colour name (used as the plot's discrete colour map).
dm = dict(zip(all_courses, colors))
# df['color']: hex code per row, via one dict lookup instead of a list search per row.
df['color'] = [css4_colors[dm[label]] for label in df.course]

if csv_name == "mit_course_catalog_with_embeddings.csv":
    # This branch runs only for the catalogue CSV named above: it computes a
    # 2-D t-SNE projection and stores the coordinates in the `x` / `y` columns.
    # NOTE(review): eval() executes whatever text the CSV cell contains; only
    # use this with a trusted file.
    df['embedding'] = df.embedding_combined.apply(eval).apply(np.array)
    print("Evaluating TSNE on Dataset...")
    tsne = TSNE(
        n_components=2,
        perplexity=15,
        random_state=42,
        init='random',
        learning_rate=200,
    )
    matrix = feature_matrix(df)
    vis_dims = tsne.fit_transform(matrix)
    df['x'] = [point[0] for point in vis_dims]
    df['y'] = [point[1] for point in vis_dims]
    # Write the augmented table once; an existing file is never overwritten.
    if not os.path.exists("mit_courses_with_TSNE_embeddings.csv"):
        df.to_csv("mit_courses_with_TSNE_embeddings.csv")

In [8]:
#@title Main Code
def _scatter_courses(frame, title):
    """Build the dark-themed course scatter plot shared by every mode.

    Args:
        frame: DataFrame with `x`, `y`, `course`, `course_number`, `title`,
            `description` and `prereq` columns.
        title: Figure title.

    Returns:
        The Plotly figure (not yet shown).
    """
    return px.scatter(
        frame,
        x='x',
        y='y',
        color='course',
        hover_data=['course_number', 'title', 'description', 'prereq'],
        color_discrete_map=dm,
        category_orders={'course': all_courses},
        template="plotly_dark",
        title=title,
    )


if mode == "all":
    if not filter_grad_courses:
        fig = _scatter_courses(df, "MIT Courses Grouped by Title and Description Similarity")
    else:
        # TODO: add a better filter functionality
        # df_filter = filter_grad_courses_over_12_units(df)
        # NOTE(review): with the filter disabled this plots every course, so the
        # title below does not describe the data shown.
        fig = _scatter_courses(df, "MIT Grad Courses over 12 Units")
    fig.show()
elif mode == "search":
    print("Searching for classes...")
    course_numbers = list(search_courses(df, search_query, number_results))
    print(course_numbers)
    df_filter = filter_search(df, course_numbers)
    # TODO: `filter_grad_courses` is currently ignored in search mode:
    # if filter_grad_courses:
    #   df_filter = filter_grad_courses_over_12_units(df_filter)
    fig = _scatter_courses(df_filter, "MIT Course Search Results for \"{}\"".format(search_query))
    fig.show()
else:
    # Fail here rather than leaving `fig` undefined for the cells that follow.
    raise ValueError("mode must be \"all\" or \"search\", got {!r}".format(mode))


Searching for classes...
['2.120', '6.4200[J]', '16.412[J]', '2.12', '2.74', '2.740', '6.4210', '2.165[J]', '6.4212', '9.175[J]', '6.9600', '2.017[J]', '15.236', '6.9080', '16.405[J]', '16.84', '1.015[J]', 'MAS.750', '6.8110[J]', '16.633', '6.8210', '2.166', '16.632', '16.422', '6.9610', '2.737', '6.9630', 'MAS.600', '2.980', '2.98', '16.634', '15.3991', '9.58', '15.399', '2.679', '9.60', 'MAS.883[J]', '11.529[J]', '11.029[J]', '16.413[J]', '6.8200', '16.410[J]', '16.485', '14.78[J]', '2.680', '2.183[J]', '16.332', '2.168', '20.054', '2.16']
(50, 22)


In [9]:
#@title Save the Plot as an HTML File
# Build the file name from the query, replacing every character that is not a
# letter, digit, space, '.', '-' or '_' so that a query such as "AI/ML" cannot be
# interpreted as a directory path. Simple queries keep the same file name as before.
safe_query = "".join(ch if (ch.isalnum() or ch in " ._-") else "_" for ch in search_query)
file_out = f"MIT-courses-{safe_query}"
fig.write_html(f"{file_out}.html")  # save the figure as an HTML file
files.download(f"{file_out}.html")  # download the saved file through Colab's `files` helper

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>