In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import json
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
from collections import Counter
import networkx as nx

import spacy
from itertools import chain

In [2]:
lsa = pd.read_pickle('lsa_courses.pkl')
eng = pd.read_pickle('eng_courses.pkl')

#Removing 
stop_words = set(stopwords.words('english'))
exclude = set(["course", 'students'])

def clean_description(text):
    tokens = nltk.word_tokenize(text)
    cleaned_tokens = [word for word in tokens if word.lower() not in stop_words and word.isalpha()]
    cleaned_tokens = [word for word in cleaned_tokens if word.lower() not in exclude and word.isalpha()]
    return ' '.join(cleaned_tokens)

lsa['cleaned_description'] = lsa['description_x'].apply(clean_description)
eng['cleaned_description'] = eng['description_x'].apply(clean_description)
lsa['cleaned_description'] = lsa['cleaned_description'].str.lower()
eng['cleaned_description'] = eng['cleaned_description'].str.lower()

# Originally used 100 most common, behavior became much 'better' when increased to 500

#This gets the top n most common words in descriptions across LSA and ENG
combined_df = pd.concat([lsa, eng], ignore_index=True)
all_words = ' '.join(combined_df['cleaned_description']).split()
word_freq = Counter(all_words)
top_words = [word for word, freq in word_freq.most_common(500)]


#Function to vectorize description beased on previous basis
def vectorize_description(text):
    tokens = text.split()
    vector = [1 if word in tokens else 0 for word in top_words]
    return vector

def vectorize_description_scaled(text):
    word_counts = Counter(text.split())
    # Create a vector where each element is the frequency of a word in the basis_of_words
    vector = [word_counts[word] if word in word_counts else 0 for word in top_words]
    return vector

#Groupby departments to get df of departments and all associated course descriptions
concatenated_descriptions = combined_df.groupby('department')['description_x'].apply(' '.join).reset_index()
concatenated_descriptions['cleaned_description'] = concatenated_descriptions['description_x'].apply(clean_description)
concatenated_descriptions['cleaned_description'] = concatenated_descriptions['cleaned_description'].str.lower()

all_words = ' '.join(concatenated_descriptions['cleaned_description']).split()
word_freq = Counter(all_words)
top_words = [word for word, freq in word_freq.most_common(500)]


In [None]:
nlp = spacy.load("en_core_web_sm")

In [5]:
def extract_bigrams(doc):
    bigrams = []
    for token1, token2 in zip(doc[:-1], doc[1:]):
        if not (token1.is_stop or token2.is_stop or token1.is_punct or token1.text == ',' or token2.is_punct or token2.text == ','): 
            bigrams.append((token1.text, token2.text))
    return bigrams

bigrams_list = []

for description in concatenated_descriptions['description_x']:
    doc = nlp(description)
    bigrams = extract_bigrams(doc)
    bigrams_list.extend(bigrams)

# Count the frequency of bigrams
bigram_counts = Counter(bigrams_list)
most_common_bigrams = bigram_counts.most_common(500)

# Count the frequency of single words
all_words = ' '.join(combined_df['cleaned_description']).split()
word_freq = Counter(all_words)
all_features = [word for word, freq in word_freq.most_common(500)]

# Combine words and bigrams
all_features.extend(['_'.join(bigram) for bigram, count in most_common_bigrams])

feature_dict = {word: idx for idx, word in enumerate(all_features)}

def vectorize_input(input_string, feature_dict):
    vector = [0] * len(feature_dict)
    words = input_string.split()
    bigrams = zip(words, words[1:])

    for word in words:
        if word in feature_dict:
            vector[feature_dict[word]] += 1

    for bigram in bigrams:
        bigram = '_'.join(bigram)
        if bigram in feature_dict:
            vector[feature_dict[bigram]] += 1

    return vector

In [10]:
# Tokenize each course description
def tokenize(description):
    return description.lower().split()

# Count occurrences of each word in each course description
word_occurrences = {word: [] for word in all_features}

for description in concatenated_descriptions['cleaned_description']:
    tokens = tokenize(description)
    counts = Counter(tokens)
    for word in all_features:
        word_occurrences[word].append(counts.get(word, 0))

# Calculate the median occurrences for each word
median_occurrences = {word: np.median(counts) for word, counts in word_occurrences.items()}

# Create an array of median occurrences
median_occurrences_array = np.array([median_occurrences[word] for word in all_features])

median_occurrences_array


array([2., 1., 1., 2., 2., 1., 1., 2., 0., 1., 1., 2., 1., 1., 1., 0., 2.,
       1., 1., 2., 1., 1., 1., 1., 1., 1., 1., 0., 1., 0., 1., 1., 1., 0.,
       1., 1., 0., 1., 1., 1., 1., 0., 1., 1., 1., 0., 0., 0., 0., 0., 0.,
       0., 1., 1., 1., 1., 0., 0., 0., 1., 1., 1., 0., 1., 0., 0., 0., 0.,
       1., 0., 0., 0., 1., 1., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0.,
       0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
       0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

In [6]:
most_common_bigrams

[(('course', 'explores'), 167),
 (('course', 'examines'), 128),
 (('United', 'States'), 122),
 (('course', 'provides'), 106),
 (('course', 'focuses'), 105),
 (('Topics', 'include'), 85),
 ((' ', 'Students'), 81),
 (('course', 'introduces'), 74),
 (('introduce', 'students'), 70),
 (('course', 'covers'), 70),
 (('course', 'offers'), 69),
 (('case', 'studies'), 69),
 (('introduces', 'students'), 67),
 (('Middle', 'East'), 66),
 (('20th', 'century'), 61),
 (('language', 'skills'), 60),
 (('faculty', 'member'), 58),
 (('wide', 'range'), 57),
 (('Southeast', 'Asia'), 53),
 (('course', 'aims'), 46),
 (('nineteenth', 'century'), 44),
 (('Latin', 'America'), 40),
 (('help', 'students'), 39),
 (('climate', 'change'), 38),
 (('Native', 'American'), 38),
 (('twentieth', 'century'), 36),
 (('Latin', 'American'), 36),
 (('popular', 'culture'), 35),
 ((' ', 'Topics'), 35),
 (('provide', 'students'), 34),
 (('African', 'American'), 34),
 (('Mathematics', 'webpage'), 34),
 (('writing', 'skills'), 33),


In [None]:
# Generate words
concatenated_descriptions['vector'] = concatenated_descriptions['cleaned_description'].apply(vectorize_description_scaled)

#Generate cosine similarity matrix
from sklearn.metrics.pairwise import cosine_similarity

def course_cosine_similarity(vec1, vec2):
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    return dot_product / (norm_vec1 * norm_vec2)

# Make similarity matrix
matrix = [vec for vec in concatenated_descriptions['vector']]
# Compute cosine similarities
similarity_matrix = cosine_similarity(matrix)
#Find similar courses
deps = concatenated_descriptions['department'].tolist()

In [None]:
# Takes in two course names, the name of the column that holds the names, the dataframe, and the similarity matrix
# and returns their cosine similarity
def get_similarity(name1, name2, type, df, matrix):
    idx_1 = df[df[type] == name1].index
    idx_1 = idx_1[0]
    idx_2 = df[df[type] == name2].index
    idx_2 = idx_2[0]
    return matrix[idx_1, idx_2]

# Gets the course's cleaned description that is used to vectorize it
def get_clean_desc(name):
    idx_1 = concatenated_descriptions[concatenated_descriptions['department'] == name].index
    idx_1 = idx_1[0]
    return concatenated_descriptions.iloc[idx_1]['cleaned_description']

# Gets the course's regular description that is used to vectorize it
def get_reg_desc(name):
    idx_1 = combined_df[combined_df['course'] == name].index
    idx_1 = idx_1[0]
    return combined_df.iloc[idx_1]['description_x']

## OTHER FUNCTIONS
def common_words(str1, str2, n):
    # Splitting the strings into words
    words_str1 = str1.split()
    words_str2 = str2.split()

    # Creating a dictionary to count the occurrences of each word in both strings
    word_count = {}
    for word in words_str1:
        if word in words_str2:
            word_count[word] = min(words_str1.count(word), words_str2.count(word))

    # Sorting the dictionary by the frequency of the words
    sorted_word_count = sorted(word_count.items(), key=lambda x: x[1], reverse=True)

    return sorted_word_count[0:n]

def get_related(name, df, matrix, n):
    n=n+1
    idx = df[df['department'] == name].index
    sims = matrix[idx]
    # Get indices of sorted elements
    sorted_indices = np.argsort(sims)
    # Get indices of top n largest elements
    top_n_indices = sorted_indices[0][-n:][::-1]
    top_n_deps = np.array(deps)[top_n_indices]
    top_n_deps = np.delete(top_n_deps, np.where(top_n_deps == name))
    return top_n_deps

def get_threshold(matrix, max_pairs):
    """
    Find the threshold for similarity scores in the matrix.

    :param matrix: A 2D numpy array containing similarity scores between classes.
    :param max_pairs: Maximum number of class pairs with similarity above the threshold.
    :return: Threshold value.
    """

    # Flatten the matrix to get all similarity scores
    scores = matrix[np.triu_indices_from(matrix, k=1)]  # k=1 to exclude the diagonal

    # Create bins for the range [0, 1]
    num_bins = 100
    bins = np.linspace(0, 1, num_bins + 1)
    
    # Count the number of scores in each bin
    counts, _ = np.histogram(scores, bins)

    # Find the cumulative count
    cumulative_counts = np.cumsum(counts[::-1])[::-1]

    # Find the bin where the cumulative count drops below the max_pairs
    threshold_index = np.where(cumulative_counts <= max_pairs)[0]

    if len(threshold_index) == 0:
        return 1  # Return 1 if no such threshold exists (i.e., all scores are too low)

    # Get the lower edge of the bin as the threshold
    threshold_bin = threshold_index[0]
    threshold = bins[threshold_bin]

    return threshold
