In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import json
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
from collections import Counter
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity

# Functions

In [None]:
# Word filters consulted by clean_description when stripping tokens.
# English stopword list from the NLTK corpus, stored as a set.
stop_words = set(stopwords.words("english"))
# Extra words to drop in addition to the stopwords.
exclude = {"course", "students"}

def clean_description(text):
    """Tokenize ``text`` and return its remaining words as a lowercase string.

    A token is kept only when it is purely alphabetic and its lowercase form
    appears in neither ``stop_words`` nor ``exclude``.

    Args:
        text: Description string to clean.

    Returns:
        The surviving tokens, lowercased and joined by single spaces.
    """
    kept = []
    for token in nltk.word_tokenize(text):
        lowered = token.lower()
        if token.isalpha() and lowered not in stop_words and lowered not in exclude:
            kept.append(lowered)
    # The result is a plain str, so it is lowercased with str.lower();
    # the ``.str`` accessor only exists on pandas Series.
    return ' '.join(kept)

def vectorize_description(text, basis=None):
    """Return a binary presence vector for ``text`` over a word basis.

    Args:
        text: Whitespace-separated string of words.
        basis: Ordered sequence of words defining the vector's dimensions.
            When omitted, the module-level ``top_words`` is read at call time.

    Returns:
        A list with one entry per basis word: 1 if the word occurs in
        ``text``, otherwise 0.
    """
    if basis is None:
        basis = top_words
    # A set gives constant-time membership tests for each basis word.
    present = set(text.split())
    return [1 if word in present else 0 for word in basis]

def vectorize_description_scaled(text, basis=None):
    """Return a word-count vector for ``text`` over a word basis.

    Args:
        text: Whitespace-separated string of words.
        basis: Ordered sequence of words defining the vector's dimensions.
            When omitted, the module-level ``top_words`` is read at call time.

    Returns:
        A list with one entry per basis word holding the number of times that
        word occurs in ``text`` (0 when absent).
    """
    if basis is None:
        basis = top_words
    word_counts = Counter(text.split())
    # Counter yields 0 for missing keys, so no explicit membership check is needed.
    return [word_counts[word] for word in basis]

def get_similarity(name1, name2, type, df, matrix):
    """Look up the similarity between two named rows of ``df``.

    Args:
        name1: Value identifying the first row.
        name2: Value identifying the second row.
        type: Name of the column in ``df`` that holds the names.
        df: DataFrame whose rows are assumed to be in the same order as the
            rows/columns of ``matrix``.
        matrix: 2-D similarity matrix supporting ``matrix[i, j]`` indexing.

    Returns:
        ``matrix[i, j]`` where ``i`` and ``j`` are the row positions of the
        first rows whose ``type`` column equals ``name1`` and ``name2``.

    Raises:
        IndexError: If either name does not occur in the column.
    """
    def _first_position(name):
        # Use the row *position*, not the index label: the matrix is addressed
        # positionally, and labels only coincide with positions for a default
        # RangeIndex.
        hits = np.flatnonzero((df[type] == name).to_numpy())
        if hits.size == 0:
            raise IndexError(f"{name!r} not found in column {type!r}")
        return hits[0]

    return matrix[_first_position(name1), _first_position(name2)]

def get_clean_desc(df, name):
    """Return the cleaned description of the first row matching a department.

    Args:
        df: DataFrame with ``department`` and ``cleaned_description`` columns.
        name: Value to match against the ``department`` column.

    Returns:
        The ``cleaned_description`` value of the first matching row.

    Raises:
        IndexError: If no row has ``department == name``.
    """
    # Select by boolean mask so the lookup does not depend on the index labels
    # happening to equal row positions.
    matches = df.loc[df['department'] == name, 'cleaned_description']
    if matches.empty:
        raise IndexError(f"{name!r} not found in column 'department'")
    return matches.iloc[0]

def get_reg_desc(df, name):
    """Return the original description of the first row matching a course.

    Args:
        df: DataFrame with ``course`` and ``description_x`` columns.
        name: Value to match against the ``course`` column.

    Returns:
        The ``description_x`` value of the first matching row.

    Raises:
        IndexError: If no row has ``course == name``.
    """
    # Select by boolean mask so the lookup does not depend on the index labels
    # happening to equal row positions.
    matches = df.loc[df['course'] == name, 'description_x']
    if matches.empty:
        raise IndexError(f"{name!r} not found in column 'course'")
    return matches.iloc[0]

## Example of loading some course descriptions and making a dictionary to generate vector representations.

In [None]:
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling;
# only load these files if they come from a trusted source.
lsa = pd.read_pickle('lsa_courses.pkl')
eng = pd.read_pickle('eng_courses.pkl')

# Clean every course description; lowercase explicitly so later word matching
# is case-insensitive.
lsa['cleaned_description'] = lsa['description_x'].apply(clean_description).str.lower()
eng['cleaned_description'] = eng['description_x'].apply(clean_description).str.lower()

# Number of most frequent words kept as the basis for the vectors.
VOCAB_SIZE = 500

# Stack the LSA and ENG courses into one frame.
combined_df = pd.concat([lsa, eng], ignore_index=True)

# Represent each department by the concatenation of all its course descriptions,
# then clean that text the same way as the individual course descriptions.
concatenated_descriptions = combined_df.groupby('department')['description_x'].apply(' '.join).reset_index()
concatenated_descriptions['cleaned_description'] = (
    concatenated_descriptions['description_x'].apply(clean_description).str.lower()
)

# Basis vocabulary: the VOCAB_SIZE most common words across the cleaned
# department-level descriptions.
all_words = ' '.join(concatenated_descriptions['cleaned_description']).split()
word_freq = Counter(all_words)
top_words = [word for word, _ in word_freq.most_common(VOCAB_SIZE)]

# Word-count vector of each department over the basis vocabulary.
concatenated_descriptions['vector'] = concatenated_descriptions['cleaned_description'].apply(vectorize_description_scaled)

# One row per department, in the same order as concatenated_descriptions.
matrix = concatenated_descriptions['vector'].tolist()
# Pairwise cosine similarity between department vectors.
similarity_matrix = cosine_similarity(matrix)
# Department names, in the row/column order of similarity_matrix.
deps = concatenated_descriptions['department'].tolist()