In [145]:
import pandas as pd
import neattext.functions as nfx
import seaborn as sns
import numpy as np

In [146]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel

In [147]:
all_courses = []
courses = pd.read_json('cu_courses.json', orient='columns')
for course in courses["CourseDetails"]:
    data = pd.json_normalize(course)
    all_courses.append(data)

In [148]:
all_courses = pd.DataFrame(pd.concat(all_courses))
all_courses.drop_duplicates(subset=['Crse Title'], inplace=True)
all_courses.index = [i for i in range(len(all_courses))]
all_courses.head()

Unnamed: 0,Term,Year,Campus,College,Dept,Sbjct,Crse,Sect,Crse Title,Instructor Name,...,Synth,Diverse,Respect,Challenge,Creative,Discuss,Feedback,Grading,Questions,Tech
0,Spring,2021,BD,ENGR,CSCI,CSCI,4239,1,Advanced Computer Graphics,"Schieuder, Willem Adriaan",...,4.5,3.4,5.0,4.8,4.9,4.6,4.4,4.6,4.8,5.0
1,Spring,2021,BD,ENGR,CSCI,CSCI,4022,1,Advanced Data Science,"Mullen, Zachary",...,3.9,3.7,4.6,4.5,4.2,4.2,2.0,2.9,4.1,4.4
2,Fall,2022,BD,ENGR,CSCI,CSCI,5573,1,Advanced Operating Systems,"Mishra, Shivakant",...,4.0,3.0,5.0,5.0,5.0,5.0,1.0,3.0,2.0,4.0
3,Fall,2021,BD,ENGR,CSCI,CSCI,4302,1,Advanced Robotics,"Hayes, Bradley",...,4.1,3.0,4.4,4.3,4.0,4.4,3.9,3.9,4.4,4.4
4,Spring,2022,BD,ENGR,CSCI,CSCI,6314,1,Algorithmic Economics,"Frongillo, Rafael M",...,4.3,4.8,4.8,4.2,4.7,4.7,4.5,4.6,4.5,4.4


In [149]:
all_courses['Crse Title'].nunique()

116

In [150]:
all_courses['Crse Title']

0            Advanced Computer Graphics
1                 Advanced Data Science
2            Advanced Operating Systems
3                     Advanced Robotics
4                 Algorithmic Economics
                     ...               
111              Tpcs-Cognitive Science
112            Tpcs in Programming Lang
113                UD for Digital Media
114    User-Centered Design &amp; Dev 1
115                       Voice Over IP
Name: Crse Title, Length: 116, dtype: object

In [151]:
# vectorize text
count_vect = CountVectorizer()
cv_mat = count_vect.fit_transform(all_courses['Crse Title'])
cv_mat

<116x196 sparse matrix of type '<class 'numpy.int64'>'
	with 371 stored elements in Compressed Sparse Row format>

In [152]:
cv_mat.todense()

matrix([[0, 0, 1, ..., 0, 0, 0],
        [0, 0, 1, ..., 0, 0, 0],
        [0, 0, 1, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [153]:
dv_cv_words = pd.DataFrame(cv_mat.todense(), columns=count_vect.get_feature_names_out())
dv_cv_words.head()

Unnamed: 0,admin,administration,advanced,agent,aided,algebra,algorithmic,algorithms,algos,amp,...,verification,virtual,virtualization,vision,voice,wireless,with,wk,workgroup,workshop
0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [154]:
cosine_sim_mat = cosine_similarity(cv_mat)
cosine_sim_mat[0:10]

array([[1.        , 0.33333333, 0.33333333, ..., 0.        , 0.        ,
        0.        ],
       [0.33333333, 1.        , 0.33333333, ..., 0.        , 0.        ,
        0.        ],
       [0.33333333, 0.33333333, 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.33333333, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [155]:
course_indices = pd.Series(all_courses.index, index=all_courses['Crse Title']).drop_duplicates()
course_indices

Crse Title
Advanced Computer Graphics            0
Advanced Data Science                 1
Advanced Operating Systems            2
Advanced Robotics                     3
Algorithmic Economics                 4
                                   ... 
Tpcs-Cognitive Science              111
Tpcs in Programming Lang            112
UD for Digital Media                113
User-Centered Design &amp; Dev 1    114
Voice Over IP                       115
Length: 116, dtype: int64

In [158]:
def get_recommendations(title, cosine_sim_mat = cosine_sim_mat):
    idx = course_indices[title]
    sim_scores = enumerate(cosine_sim_mat[idx])
    sim_scores = sorted(sim_scores, key=lambda x:[1], reverse=True)
    sim_scores = sim_scores[1:11]
    sim_index = [i[0] for i in sim_scores]
    print(all_courses["Crse Title"].iloc[sim_index])

get_recommendations('Advanced Data Science')

1             Advanced Data Science
2        Advanced Operating Systems
3                 Advanced Robotics
4             Algorithmic Economics
5                        Algorithms
6                Autonomous Systems
7                Big Data Analytics
8     Bio-inspired Multi-Agent Syst
9               Biological Networks
10                  CLASIC Capstone
Name: Crse Title, dtype: object
