In [1]:
import numpy as np
import pandas as pd

# Data Preprocessing

## Raw Datasets

In [2]:
# define dataset path
CURRICULUM_DATA_PATH = '../crawlers/closed-ended/curriculum-crawlers/data/'

In [3]:
# load cs recommender datasets
df_cs_all = pd.read_csv(CURRICULUM_DATA_PATH + 'df_cs_all_course_details_prelim.csv', index_col = 0) # all cs course details
df_cs_profs = pd.read_csv(CURRICULUM_DATA_PATH + 'df_cs_courses_and_professors.csv', index_col = 0) # cs course professors

In [4]:
df_cs_all.head(3)

Unnamed: 0,Academic Year,No. of credit(s),Lecture,Lab session,Pre-requisite(s),Co-requisite(s),Mutually exclusive with,Remarks,Course Code,Course Title,Course Description,Learning Outcomes,Continuous Assessment Weighting in final course grade (%),Written Examination Weighting in final course grade (%),Tutorial,Other,Recommended Learning Hours,Self-study & practical modules,Choi Loretta
0,2022,6,32.5,6.5,,,ENGG1111 or ENGG1330,,COMP1117A,"Computer Programming (ActSc, AppAI, DA, IS, Mi...",This is an introductory course in computer pro...,[Computational mind] Able to identify possibl...,50.0,50.0,,,,,
1,2022,6,32.5,6.5,,,ENGG1111 or ENGG1330,,COMP1117B,"Computer Programming (Quant Fin, DA, Minor, 2n...",This is an introductory course in computer pro...,[Computational mind] Able to identify possibl...,50.0,50.0,,,,,
2,2022,6,,,,,COMP1117 or ENGG1111,,ENGG1330A,Computer Programming I (A1 - M2),This is an introductory course designed for fi...,[Computational mind] Able to identify possible...,70.0,30.0,26.0,13.0,,,


In [5]:
# load science recommender datasets
df_science_all = pd.read_csv(CURRICULUM_DATA_PATH + 'df_science_all_course_details.csv', index_col = 0) # all science course details
df_science_profs = pd.read_csv(CURRICULUM_DATA_PATH + 'df_science_courses_and_professors.csv', index_col = 0) # science course professors
df_science_meta = pd.read_csv(CURRICULUM_DATA_PATH + 'df_science_courses_pre.csv', index_col = 0) # meta table

## Process Raw Datasets

In [6]:
# add necessary columns to the 'df_cs_all' dataset
df_cs_all['Offering Department'] = 'Computer Science'

In [7]:
# extract course content related columns for cs
df_cs_content = df_cs_all[[
    'Course Code', 'Course Title', 'Course Description', 'Learning Outcomes'
]]

In [8]:
df_cs_content.head()

Unnamed: 0,Course Code,Course Title,Course Description,Learning Outcomes
0,COMP1117A,"Computer Programming (ActSc, AppAI, DA, IS, Mi...",This is an introductory course in computer pro...,[Computational mind] Able to identify possibl...
1,COMP1117B,"Computer Programming (Quant Fin, DA, Minor, 2n...",This is an introductory course in computer pro...,[Computational mind] Able to identify possibl...
2,ENGG1330A,Computer Programming I (A1 - M2),This is an introductory course designed for fi...,[Computational mind] Able to identify possible...
3,ENGG1340A,Computer Programming II,This course covers intermediate to advanced co...,[Programming environment and technologies] Abl...
4,ENGG1340B,Computer Programming II,This course covers intermediate to advanced co...,[Programming environment and technologies] Abl...


In [9]:
# add necessary columns to the 'df_science_all' dataset
df_science_all['Course Title'] = df_science_meta['Title'] # add course title column
df_science_all['Course Description'] = df_science_all['Course Contents & Topics'] + ' ' + df_science_all['Course Objectives']

In [10]:
# extract course content related columns for science
df_science_content = df_science_all[[
    'Course Code', 'Course Title', 'Course Description'
]]
df_science_content['Learning Outcomes'] = df_science_all['Course Learning Outcomes']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [11]:
df_science_content.head()

Unnamed: 0,Course Code,Course Title,Course Description,Learning Outcomes
0,BIOC1600,Perspectives in biochemistry,A Biochemical Perspective on the Basic Science...,describe the basics of biomolecular structure ...
1,BIOC2600,Basic biochemistry,"Structure and functions of carbohydrates, lipi...",relate the structures to functions of major bi...
2,BIOC3601,Basic metabolism,This course focuses on the central metabolic p...,achieve a vigorous intellectual appreciation o...
3,BIOC3604,Essential techniques in biochemistry and molec...,Basic concepts in experimental science; writin...,describe and explain the principles underlying...
4,BIOC3605,Sequence bioinformatics,This course will introduce and discuss the fol...,search and retrieve sequence data from biologi...


In [12]:
# combine both cs and science datasets for course content info
df_course_content = df_cs_content.append(df_science_content)
df_course_content = df_course_content.reset_index(drop = True)

In [13]:
df_course_content

Unnamed: 0,Course Code,Course Title,Course Description,Learning Outcomes
0,COMP1117A,"Computer Programming (ActSc, AppAI, DA, IS, Mi...",This is an introductory course in computer pro...,[Computational mind] Able to identify possibl...
1,COMP1117B,"Computer Programming (Quant Fin, DA, Minor, 2n...",This is an introductory course in computer pro...,[Computational mind] Able to identify possibl...
2,ENGG1330A,Computer Programming I (A1 - M2),This is an introductory course designed for fi...,[Computational mind] Able to identify possible...
3,ENGG1340A,Computer Programming II,This course covers intermediate to advanced co...,[Programming environment and technologies] Abl...
4,ENGG1340B,Computer Programming II,This course covers intermediate to advanced co...,[Programming environment and technologies] Abl...
...,...,...,...,...
425,CCST9051,What are We Made of - the Fundamental Nature o...,,
426,CCST9054,"War, Peace, and the Natural World",,
427,CCST9056,The Force is with You: How Things Work,,
428,CCST9067,Leaving Earth: Our Future in Space,,


## Save Processed Dataset

In [20]:
# save the processed dataset under the 'ontology-based-recommender/data' folder
df_course_content.to_csv('data/df_course_content.csv')

# Experiments for CourseContent

In [14]:
df_course_content = pd.read_csv('data/df_course_content.csv', index_col=0)

In [15]:
df_course_content

Unnamed: 0,Course Code,Course Title,Course Description,Learning Outcomes
0,COMP1117A,"Computer Programming (ActSc, AppAI, DA, IS, Mi...",This is an introductory course in computer pro...,[Computational mind] Able to identify possibl...
1,COMP1117B,"Computer Programming (Quant Fin, DA, Minor, 2n...",This is an introductory course in computer pro...,[Computational mind] Able to identify possibl...
2,ENGG1330A,Computer Programming I (A1 - M2),This is an introductory course designed for fi...,[Computational mind] Able to identify possible...
3,ENGG1340A,Computer Programming II,This course covers intermediate to advanced co...,[Programming environment and technologies] Abl...
4,ENGG1340B,Computer Programming II,This course covers intermediate to advanced co...,[Programming environment and technologies] Abl...
...,...,...,...,...
425,CCST9051,What are We Made of - the Fundamental Nature o...,,
426,CCST9054,"War, Peace, and the Natural World",,
427,CCST9056,The Force is with You: How Things Work,,
428,CCST9067,Leaving Earth: Our Future in Space,,


In [16]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

from nltk import word_tokenize
from nltk.corpus import stopwords
import string

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Kackie\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Kackie\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
from scipy import spatial

## Helper Functions

In [18]:
# helper function
# return: LIST of tokenized words
def preprocess(text, with_stopwords=False):
    text = text.lower()
    words = word_tokenize(text)
    words = [word for word in words if word.isalpha()]
    if with_stopwords==False:
        stop_words = set(stopwords.words('english'))
        words = [word for word in words if not word in stop_words]
    
    return words

In [19]:
# helper function
# return: word embeddings MATRIX
def get_vector(text, model, with_stopwords=False):
    return np.sum(np.array([model[i] for i in preprocess(text, with_stopwords=with_stopwords)]), axis=0)

In [20]:
# helper function
# calculate cosine similarity for a given student provided info text with a text from our database
# return: FLOAT representing the similarity score
def get_similarity_score(student_info, text, model, with_stopwords):
    similarity = 1 - spatial.distance.cosine(
        get_vector(student_info, model, with_stopwords),
        get_vector(text, model, with_stopwords)
    )
    return similarity

In [22]:
# helper function
# get cosine similarities of a specific query against all job descriptions
# return: dataframe with new column 'Cosine Similarity'
def get_similarities(student_info, df, model, with_stopwords=False, n=-1):
    df_result = df.copy()
    columns_to_consider = ['Learning Outcomes', 'Course Description', 'Course Title'] # TODO: should we consider course title?
    for column_name in columns_to_consider:
        similarities = []
        for text in df_result[column_name]:
            try:
                similarity = get_similarity_score(student_info, text, model, with_stopwords)
                similarities.append(similarity)
            except:
                similarities.append(0)
                continue
        df_result['Cosine Similarity for ' + column_name] = similarities

    df_result.insert(0, 'Student Provided Info', student_info)

    # get the final cosine similarity based on weights
    df_result['Final Cosine Similarity'] = df_result['Cosine Similarity for Learning Outcomes']*0.6 + df_result['Cosine Similarity for Course Description']*0.3 + df_result['Cosine Similarity for Course Title']*0.1

    # get the top n results (n=-1 means all results)
    df_result = df_result.sort_values(by='Final Cosine Similarity', ascending=False).head(n)

    return df_result

In [23]:
# helper function
# get cosine similarities for ALL student provided info texts in our testing dataset
# return: DATAFRAME
def get_similarities_for_test_data(df_course_content, df_test, model, with_stopwords=False, n=-1):
    df_result = pd.DataFrame()
    for i, student_provided_info in enumerate(df_test['student_provided_info']):
        df = get_similarities(
            student_provided_info,
            df_course_content,
            model, with_stopwords=with_stopwords,
            n=n
        )
        df.insert(1, 'Target Course Code', df_test['target_course_code'][i])

        df_result = df_result.append(df)

    return df_result

## Testing Dataset

In [24]:
# load self-made testing dataset
df_course_content_test = pd.read_csv('data/course_content_testing_data.csv')

In [25]:
df_course_content_test

Unnamed: 0,student_provided_info,student_profile_equivalent,intent,target_course_code
0,I want to learn about programming but I am a b...,interest_area,recommendation,COMP1117A
1,I'm not familiar to databases but would like l...,interest_area,recommendation,COMP3278A
2,I am interested in both biology and chemistry.,interest_area,recommendation,BIOC1600
3,I am not good at math but I want to be in the ...,interest_area,recommendation,MATH1011
4,I am interested in the solar system.,interest_area,recommendation,PHYS1650
5,"Interested in fintech, especially trading algo...",interest_area,recommendation,FITE1010
6,I want to know more Java and practice to write...,interest_area,recommendation,COMP3330
7,Any common core courses about the environment?,interest_area,recommendation,CCST9013
8,I like robots.,interest_area,recommendation,COMP3356
9,I wanna learn about earth sciences.,interest_area,recommendation,EASC1401


## Models

### Model 1: Word2Vec (pre-trained on Google News 300)

In [16]:
import gensim.downloader as api

# define word2vec model
model_w2v = api.load('word2vec-google-news-300')

In [77]:
get_similarities('I want to learn about programming but I am a beginner', df_course_content, model_w2v, False, 5)

Unnamed: 0,Student Provided Info,Course Code,Course Title,Course Description,Learning Outcomes,Cosine Similarity for Learning Outcomes,Cosine Similarity for Course Description,Cosine Similarity for Course Title,Final Cosine Similarity
21,I want to learn about programming but I am a b...,COMP2396B,Object-oriented Programming and Java,Introduction to object-oriented programming; a...,[Object-oriented Programming] Be able to under...,0.62653,0.517905,0.471758,0.578465
20,I want to learn about programming but I am a b...,COMP2396A,Object-oriented Programming and Java (CS),Introduction to object-oriented programming; a...,[Object-oriented Programming] Be able to under...,0.62653,0.517905,0.402898,0.571579
51,I want to learn about programming but I am a b...,COMP3259,Principles of Programming Languages,Syntax and semantics specification; data types...,[Programming languages fundamentals] Be able t...,0.640547,0.437723,0.501147,0.56576
62,I want to learn about programming but I am a b...,COMP3329,Computer Game Design and Programming,This course introduces the concepts and techni...,[implement a workable game in particular platf...,0.588019,0.515196,0.5371,0.56108
7,I want to learn about programming but I am a b...,FITE2000A,Foundations of FinTech Programming,This course introduces concepts and applicatio...,[Basic data structures] To learn and appreciat...,0.583533,0.444559,0.449941,0.528481


In [78]:
get_similarities_for_test_data(df_course_content, df_course_content_test, model_w2v, with_stopwords=False, n=5).to_csv('test.csv')

  dist = 1.0 - uv / np.sqrt(uu * vv)


In [81]:
# save the model
model_w2v.save('models/w2v_google_news_300.model')

### Model 2: 

In [37]:
from gensim.models.word2vec import Word2Vec
import gensim.downloader as api
from multiprocessing import cpu_count

In [36]:
# load the corpus from the gensim repo
corpus_wiki_english = api.load('wiki-english-20171001')



In [41]:
model_w2v_wiki_english = Word2Vec(corpus_wiki_english, workers = cpu_count())

In [43]:
model_w2v_wiki_english.save('models/w2v_wiki_english.model')