# Mount Drive

In [1]:
# Mount Google Drive into the runtime at /content/drive so the project folder
# stored on Drive is reachable from the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Change Working Directory to "course_selection" Folder

In [None]:
# Work from the project folder: later cells import local modules
# (school_names_to_course_names, search_profs) and read a CSV by relative
# path — presumably all of them live in this directory.
%cd "/content/drive/MyDrive/course_selection"

# Install/Import Modules

In [2]:
!pip install openai

/content/drive/MyDrive/course_selection
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting openai
  Downloading openai-0.26.0.tar.gz (54 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.7/54.7 KB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: openai
  Building wheel for openai (pyproject.toml) ... [?25l[?25hdone
  Created wheel for openai: filename=openai-0.26.0-py3-none-any.whl size=66855 sha256=484a612a3df8f8a81c881cdce0d74b789edb5f5ee4a0ffef3d336713eb9fb777
  Stored in directory: /root/.cache/pip/wheels/7e/4c/c8/31e9d441bd937e2b9076627465f9db43ab6db40f08aae60b66
Successfully built openai
Installing collected packages: openai
Successfully installed openai-0.26.0


In [3]:
import argparse
import colorsys
import openai
import numpy as np
import pandas as pd
import plotly.express as px
import textwrap
from school_names_to_course_names import *
from search_profs import *
from matplotlib import colors as mcolors
from sklearn.manifold import TSNE

# Variable Definitions

In [13]:
# Imported here so this cell is self-contained; only the standard library is used.
import os
from getpass import getpass

mode = "all" #@param ["all", "search"]
# SECURITY: never save a real API key in the notebook. The key that used to be
# hardcoded here must be treated as leaked and revoked. Leave the field empty
# to fall back to the OPENAI_API_KEY environment variable or an interactive
# prompt.
openai_api_key = "" #@param {type:"string"}
search_query = "unsupervised classification" #@param {type:"string"}
number_results = 20 #@param {type:"integer"}

openai.api_key = openai_api_key or os.environ.get("OPENAI_API_KEY")
# The "all" branch below only reads precomputed embeddings from the CSV, so a
# key is requested interactively only when a search is going to be run.
if mode == "search" and not openai.api_key:
    openai.api_key = getpass("OpenAI API key: ")



In [14]:
#@title Main Code
# Load the professor table. Columns read later in this cell:
# name, keywords, school_affiliation, embedding_combined.
df = pd.read_csv('mit_professors_with_embeddings.csv')

def feature_matrix(df):
    """Build a dense 2-D float array from the stringified embeddings in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have an ``embedding_combined`` column whose values are strings
        holding a Python list literal of numbers.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(df), embedding_length)``; ``(0, 0)`` when ``df``
        has no rows.

    Raises
    ------
    ValueError
        If a row's embedding length differs from the first row's, or a row
        cannot be converted to floats. (Previously the function printed the
        row and called ``exit()``, and a length-1 row was silently broadcast
        across the whole matrix row.)
    """
    print("Extracting Embedding Feature Matrix...")
    # NOTE(security): eval() executes whatever expression the CSV cell holds.
    # Only use this with a trusted file; ast.literal_eval is the safer choice
    # if every value is a plain list literal — TODO confirm against the data.
    rows = df.embedding_combined.apply(eval).to_list()
    if not rows:
        return np.zeros((0, 0))

    width = len(rows[0])
    matrix = np.zeros((len(rows), width))
    for i, row in enumerate(rows):
        # Explicit length check: NumPy would otherwise broadcast a length-1
        # row into every column instead of reporting the malformed record.
        if len(row) != width:
            raise ValueError(
                f"Embedding at row {i} has length {len(row)}, expected {width}"
            )
        matrix[i, :] = np.asarray(row, dtype=float)
    return matrix

def filter_search(df, professors):
    """Return the rows of ``df`` whose ``name`` is one of ``professors``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``name`` column. It is not modified (the previous
        implementation added a temporary ``keep`` column to the caller's
        frame and then dropped whatever column happened to be last).
    professors : iterable
        Names to keep.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns as ``df``, restricted to matching
        rows in their original order. Its shape is printed as a side effect.
    """
    wanted = df['name'].isin(list(professors))
    # .copy() so callers can add columns to the result without touching `df`.
    result = df[wanted].copy()
    print(result.shape)
    return result

def _wrap_for_hover(text):
    """Wrap ``text`` with textwrap's default width and join the lines with <br>."""
    return "<br>".join(textwrap.wrap(text))

# Hover text: the keywords string, line-wrapped with HTML breaks.
df['recent_publications'] = list(map(_wrap_for_hover, df['keywords']))
# Course label for each row, looked up from the school affiliation via the
# course_names mapping (provided by the star import at the top of the notebook).
df['course_number'] = [str(course_names[school]) for school in df['school_affiliation']]

# Matplotlib's table of CSS4 colour names; values are treated as '#rrggbb'
# hex strings by get_hsv below.
css4_colors = mcolors.CSS4_COLORS
def sort_course(x):
    """Sort key for course labels: numeric part first, then letters.

    Only the first whitespace-separated token of ``x`` is inspected. Its
    digits are concatenated and parsed as an integer; its alphabetic
    characters are concatenated as a tie-breaker.

    Parameters
    ----------
    x : str
        Course label, e.g. ``"21M Music"``.

    Returns
    -------
    tuple
        ``(number, letters)``. ``number`` is ``float('inf')`` when the token
        has no parseable digits, so such labels sort after numbered ones.
        An empty or whitespace-only label yields ``(float('inf'), '')``
        instead of raising ``IndexError``.
    """
    tokens = x.split()
    if not tokens:
        return float('inf'), ''
    number_token = tokens[0]
    try:
        numeric_part = int(''.join(filter(str.isdigit, number_token)))
    except ValueError:
        # No digits at all (or digit-like characters int() cannot parse).
        numeric_part = float('inf')
    alpha_part = ''.join(filter(str.isalpha, number_token))
    return numeric_part, alpha_part

# Distinct course labels in display order (numbered courses first, see sort_course).
all_courses = sorted(df['course_number'].unique().tolist(), key=sort_course)
def get_hsv(color_name):
    """Return the ``(h, s, v)`` tuple for a colour name found in ``css4_colors``.

    The table entry is read as a hex string (a leading ``#`` is stripped),
    split into three two-digit channels scaled to 0..1, and converted with
    ``colorsys.rgb_to_hsv``.
    """
    hex_digits = css4_colors[color_name].lstrip("#")
    channels = []
    for start in (0, 2, 4):
        channels.append(int(hex_digits[start:start + 2], 16) / 255.0)
    red, green, blue = channels
    return colorsys.rgb_to_hsv(red, green, blue)

# Pick one CSS4 colour name per course. A fixed seed keeps the palette the
# same on every run (the t-SNE below is seeded as well), so figures are
# reproducible after Restart & Run All.
color_rng = np.random.default_rng(42)
color_names = list(css4_colors.keys())
colors = color_rng.choice(
    color_names,
    size=len(all_courses),
    # Sample without replacement while enough colours exist; fall back to
    # reuse instead of raising when there are more courses than colours.
    replace=len(all_courses) > len(color_names),
)
# Order by HSV so courses that sort next to each other get neighbouring hues.
colors = sorted(colors, key=get_hsv)


def _embed_2d(frame):
    """Run t-SNE on the embedding matrix of ``frame`` and return its (x, y) columns."""
    tsne = TSNE(n_components=2, perplexity=15, random_state=42, init='random', learning_rate=200)
    vis_dims = tsne.fit_transform(feature_matrix(frame))
    return vis_dims[:, 0], vis_dims[:, 1]


def _plot_professors(frame, title):
    """Scatter ``frame`` by its x/y columns, coloured per course, and show it in Colab."""
    dm = dict(zip(all_courses, colors))
    fig = px.scatter(
        frame,
        x='x',
        y='y',
        color='course_number',
        hover_data=['name', 'recent_publications'],
        color_discrete_map=dm,
        category_orders={'course_number': all_courses},
        template="plotly_dark",
        title=title,
    )
    fig.show(renderer="colab")


if mode == "all":
    print("Evaluating TSNE on Dataset...")
    # Hex colour per row, stored on the frame; the figure itself is coloured
    # through color_discrete_map in _plot_professors.
    hex_by_course = {course: css4_colors[name] for course, name in zip(all_courses, colors)}
    df['color'] = [hex_by_course[course] for course in df['course_number']]
    df['x'], df['y'] = _embed_2d(df)
    _plot_professors(df, "MIT Professors Grouped by Study Area and Research Interests")
elif mode == "search":
    print("Searching for professors related to the search query...")
    # search_profs comes from the star import; it is given the frame with an
    # `embedding` column of arrays — presumably what it ranks against.
    # NOTE(security): eval() on CSV content; only use with a trusted file.
    df["embedding"] = df.embedding_combined.apply(eval).apply(np.array)
    professors = list(search_profs(df, search_query, number_results))
    print(professors)
    # Embed the full dataset first, then filter, so hits keep the same
    # coordinates they have in the "all" view.
    df['x'], df['y'] = _embed_2d(df)
    df = filter_search(df, professors)
    _plot_professors(df, "MIT Professor Search Results for \"{}\"".format(search_query))
else:
    # Previously an unrecognised mode fell through and produced no output at all.
    raise ValueError(f"Unknown mode {mode!r}; expected \"all\" or \"search\"")


Evaluating TSNE on Dataset...
Extracting Embedding Feature Matrix...
