# Mount Drive

In [7]:
# `drive` mounts Google Drive; `files` is used by the last cell to download the exported HTML.
from google.colab import drive, files
# Mount Drive at /content/drive so the project folder under MyDrive is reachable.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Change Working Directory to "course_selection" Folder

In [2]:
# Work from the project folder on the mounted Drive so relative paths (e.g. `csv_name`) resolve there.
# NOTE(review): presumably the local modules imported below live in this folder too — confirm.
%cd "/content/drive/MyDrive/course_selection"

/content/drive/MyDrive/course_selection


# Install/Import Modules

In [3]:
!pip install openai

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting openai
  Downloading openai-0.26.1.tar.gz (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.3/55.3 KB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: openai
  Building wheel for openai (pyproject.toml) ... [?25l[?25hdone
  Created wheel for openai: filename=openai-0.26.1-py3-none-any.whl size=67316 sha256=f46bea53e3fba21af593907abd947d4276ceaf85b23c38bf20bd8e0b05081395
  Stored in directory: /root/.cache/pip/wheels/2f/9c/55/95d3609ccfc463eeffb96d50c756f1f1899453b85e92021a0a
Successfully built openai
Installing collected packages: openai
Successfully installed openai-0.26.1


In [6]:
import argparse
import colorsys
import openai
import os
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import textwrap
from school_names_to_course_names import *
from search_profs import *
from matplotlib import colors as mcolors
from sklearn.manifold import TSNE

# Variable Definitions

In [5]:
# CSV file read into `df` by the t-SNE cell; the default name also triggers the t-SNE computation there.
csv_name = "mit_professors_with_embeddings.csv" #@param{type:"string"}
# "all" plots every row of `df`; "search" first runs search_profs with the query below.
mode = "search" #@param ["all", "search"]
# OpenAI API key. NOTE(review): a key typed here is stored in the notebook file — clear it before sharing.
openai.api_key = "" #@param {type:"string"}
# Free-text query passed to search_profs; also used in the plot title and the exported file name.
search_query = "Microfluidic storage system" #@param {type:"string"}
# Third argument to search_profs — presumably the number of professors to return; confirm in search_profs.
number_results = 20 #@param {type:"integer"}



In [None]:
#@title Obtain TSNE Matrix (Run this once)
# Load the professor table named by `csv_name`. The notebook reads its `name`, `keywords`,
# `school_affiliation` and `embedding_combined` columns.
df = pd.read_csv(csv_name)
def feature_matrix(df):
    """Build a dense 2-D float matrix from the ``embedding_combined`` column.

    Each cell of ``df.embedding_combined`` is expected to be the text of a
    Python list literal of numbers (e.g. ``"[0.1, 0.2]"``). The cells are
    parsed with ``ast.literal_eval`` rather than ``eval`` so that text coming
    from the CSV file can never execute code.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``embedding_combined`` column.

    Returns
    -------
    numpy.ndarray
        Array with one row per row of ``df``; the width is the length of the
        first embedding. An empty frame yields an array of shape ``(0, 0)``.

    Raises
    ------
    ValueError
        If a row cannot be stored in the matrix (for example because its
        length differs from the first row). The message names the row.
    """
    import ast  # local import: only this function needs it

    print("Extracting Embedding Feature Matrix...")
    rows = df.embedding_combined.apply(ast.literal_eval).to_list()
    if not rows:
        # Nothing to index into; avoid IndexError on rows[0].
        return np.zeros((0, 0))

    matrix = np.zeros((len(rows), len(rows[0])))
    for i, row in enumerate(rows):
        try:
            matrix[i, :] = np.array(row)
        except Exception as e:
            # Raise instead of calling exit(): inside a notebook kernel exit()
            # does not stop the running cell and shuts the kernel down.
            raise ValueError(
                f"Row {i} of embedding_combined could not be stored in the feature matrix: {e}"
            ) from e
    return matrix

def filter_search(df, professors):
    """Return the rows of ``df`` whose ``name`` is one of ``professors``.

    The input frame is left untouched: no helper column is added to it, so
    calling this function repeatedly on the same frame always returns the
    frame's own columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``name`` column.
    professors : iterable
        Names to keep.

    Returns
    -------
    pandas.DataFrame
        The matching rows, with the original index and columns.
    """
    selected = df[df['name'].isin(list(professors))]
    print(selected.shape)
    return selected

# Hover text: wrap each row's `keywords` string with textwrap's default width
# and join the pieces with <br> so the text breaks across lines in the
# plotly tooltip (this column is listed in hover_data when plotting).
df['recent_publications'] = [
    "<br>".join(textwrap.wrap(keyword_text))
    for keyword_text in df['keywords']
]
# Legend label: look up each row's `school_affiliation` in `course_names`
# and store the result as a string.
df['course_number'] = [
    f'{course_names[affiliation]}'
    for affiliation in df['school_affiliation']
]


def toarray(x):
    """Parse the text of a flat list of numbers into a list of floats.

    Parameters
    ----------
    x : str
        Text such as ``"[0.1, 0.2, 0.3]"``. Whitespace around the brackets
        is ignored.

    Returns
    -------
    list of float
        The parsed values; ``"[]"`` (or blank text) gives an empty list.

    Raises
    ------
    ValueError
        If an element is not a valid float literal.
    """
    # Trim outer whitespace first so the brackets are really at the ends,
    # then drop the brackets themselves.
    inner = x.strip().strip('[]').strip()
    if not inner:
        return []
    return [float(token) for token in inner.split(',')]

def search_courses(df, search_query, n):
    """Return the course numbers of the ``n`` rows most similar to a query.

    The query is embedded with the "text-embedding-ada-002" engine, every
    row's ``embedding_combined`` text is parsed with ``toarray`` and scored
    against it with ``cosine_similarity``, and the scores are written to a
    ``similarity`` column on ``df`` (the caller's frame is modified).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``embedding_combined`` and ``course_number`` columns.
    search_query : str
        Text to embed and compare against.
    n : int
        Number of top rows to return.

    Returns
    -------
    list
        ``course_number`` values of the top ``n`` rows, best match first.
    """
    query_vector = np.asarray(
        get_embedding(search_query, engine="text-embedding-ada-002"),
        dtype='float64',
    )

    def _score(raw_embedding):
        candidate = np.asarray(toarray(raw_embedding), dtype='float64')
        return cosine_similarity(candidate, query_vector)

    df["similarity"] = df.embedding_combined.apply(_score)
    ranked = df.sort_values("similarity", ascending=False)
    top_rows = ranked.head(n)
    return list(top_rows.course_number)

# matplotlib's CSS4 colour table; get_hsv looks a colour name up here and parses the value as a hex RGB string.
css4_colors = mcolors.CSS4_COLORS
def sort_course(x):
    """Sort key for course labels: numeric part first, then letters.

    Only the first whitespace-separated token of ``x`` is inspected. Its
    digits are joined and read as an integer; its alphabetic characters are
    joined into a string.

    Parameters
    ----------
    x : str
        Course label, e.g. ``"21W Writing"``.

    Returns
    -------
    tuple
        ``(number, letters)``. ``number`` is ``float('inf')`` when the token
        has no usable digits, so such labels sort after all numbered ones.
        An empty or blank label gives ``(float('inf'), '')``.
    """
    tokens = x.split()
    # Guard against empty / whitespace-only labels instead of raising IndexError.
    head = tokens[0] if tokens else ""
    try:
        number = int(''.join(filter(str.isdigit, head)))
    except ValueError:
        # No digits (or digits int() cannot read): push to the end of the order.
        number = float('inf')
    letters = ''.join(filter(str.isalpha, head))
    return number, letters

# Distinct course labels in `df`, ordered by sort_course; reused for colour assignment and the legend order.
all_courses = sorted(list(df.course_number.unique()), key=sort_course)
def get_hsv(color_name):
    """Return the ``(h, s, v)`` tuple of a colour named in ``css4_colors``.

    The colour's hex string is split into its red, green and blue byte
    pairs, each scaled to the range 0..1, and converted with
    ``colorsys.rgb_to_hsv``.
    """
    hex_digits = css4_colors[color_name].lstrip("#")  # tolerate a leading '#'
    channels = []
    for start in (0, 2, 4):
        channels.append(int(hex_digits[start:start + 2], 16) / 255.0)
    red, green, blue = channels
    return colorsys.rgb_to_hsv(red, green, blue)

# Pick one CSS4 colour name per course, then order the picked names by HSV so
# neighbouring courses in `all_courses` get neighbouring hues.
# A seeded generator makes the course -> colour assignment identical on every
# run (the bare np.random.choice call gave different colours each time).
color_rng = np.random.default_rng(42)
colors = color_rng.choice(
    list(css4_colors.keys()),
    len(all_courses),
    # Sample without replacement while there are enough colours; fall back to
    # reusing colours rather than raising when there are more courses than colours.
    replace=len(all_courses) > len(css4_colors),
)
colors = sorted((str(color_name) for color_name in colors), key=get_hsv)
# course label -> colour name, passed to plotly as color_discrete_map.
dm = dict(zip(all_courses, colors))
# Per-row hex colour, looked up through the mapping instead of list.index per row.
df['color'] = [css4_colors[dm[course]] for course in df.course_number]

# The 2-D layout is only computed for the raw embeddings file.
# NOTE(review): any other csv_name presumably must already contain `x` and `y`
# columns for the plotting cell to work — confirm.
if csv_name == "mit_professors_with_embeddings.csv":
    # NOTE(review): eval() executes whatever text the CSV cell holds; only use
    # this with a trusted file.
    df['embedding'] = df.embedding_combined.apply(eval).apply(np.array)
    print("Evaluating TSNE on Dataset...")
    tsne = TSNE(
        n_components=2,
        perplexity=15,
        random_state=42,
        init='random',
        learning_rate=200,
    )
    matrix = feature_matrix(df)
    vis_dims = tsne.fit_transform(matrix)
    # One (x, y) pair per row of df.
    df['x'] = [point[0] for point in vis_dims]
    df['y'] = [point[1] for point in vis_dims]
    # Save the frame with coordinates once; an existing file is never overwritten.
    tsne_csv_path = "mit_professors_with_TSNE_embeddings.csv"
    if not os.path.exists(tsne_csv_path):
        df.to_csv(tsne_csv_path)

In [None]:
#@title Main Code

# Styling shared by both modes, so the two plots cannot drift apart.
scatter_kwargs = dict(
    x='x',
    y='y',
    color='course_number',
    hover_data=['name', 'recent_publications'],
    color_discrete_map=dm,
    category_orders={'course_number': all_courses},
    template="plotly_dark",
)

if mode == "all":
    # Every professor in the table.
    fig = px.scatter(
        df,
        title="MIT Professors Grouped by Study Area and Research Interests",
        **scatter_kwargs,
    )
elif mode == "search":
    print("Searching for professors related to the search query...")
    # NOTE(review): eval() executes whatever text the CSV cell holds; only use
    # this with a trusted file.
    df["embedding"] = df.embedding_combined.apply(eval).apply(np.array)
    # search_profs comes from the local search_profs module; its result is
    # matched against df.name below, so it presumably yields professor names.
    professors = list(search_profs(df, search_query, number_results))
    df_filter = filter_search(df, professors)
    # Plot only the matching rows: passing the full `df` here would show every
    # professor under a "Search Results" title.
    fig = px.scatter(
        df_filter,
        title="MIT Professor Search Results for \"{}\"".format(search_query),
        **scatter_kwargs,
    )
else:
    # Fail here rather than leaving `fig` undefined (or stale) for the export cell.
    raise ValueError(f"Unknown mode {mode!r}; expected \"all\" or \"search\".")

fig.show(renderer="colab")


In [None]:
#@title Save the Plot as an HTML File
# The query becomes part of the file name, so replace characters that are path
# separators or reserved in common file systems; other text is kept as typed.
safe_query = "".join("_" if ch in '\\/:*?"<>|' else ch for ch in search_query)
file_out = f"MIT-Professors-{safe_query}"
html_path = f"{file_out}.html"
# Write the current figure as a standalone HTML page ...
fig.write_html(html_path)
# ... and hand it to the browser as a download (Colab only).
files.download(html_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>