In [2]:
import numpy as np
import pandas as pd

# Dataset Preparation

## HKU Dataset

In [2]:
# define dataset path
CURRICULUM_DATA_PATH = '../crawlers/closed-ended/curriculum-crawlers/data/'

In [3]:
# load cs recommender datasets
df_cs_all = pd.read_csv(CURRICULUM_DATA_PATH + 'df_cs_all_course_details_prelim.csv', index_col = 0) # all cs course details
df_cs_profs = pd.read_csv(CURRICULUM_DATA_PATH + 'df_cs_courses_and_professors.csv', index_col = 0) # cs course professors

In [4]:
df_cs_all.head(3)

Unnamed: 0,Academic Year,No. of credit(s),Lecture,Lab session,Pre-requisite(s),Co-requisite(s),Mutually exclusive with,Remarks,Course Code,Course Title,Course Description,Learning Outcomes,Continuous Assessment Weighting in final course grade (%),Written Examination Weighting in final course grade (%),Tutorial,Other,Recommended Learning Hours,Self-study & practical modules,Choi Loretta
0,2022,6,32.5,6.5,,,ENGG1111 or ENGG1330,,COMP1117A,"Computer Programming (ActSc, AppAI, DA, IS, Mi...",This is an introductory course in computer pro...,[Computational mind] Able to identify possibl...,50.0,50.0,,,,,
1,2022,6,32.5,6.5,,,ENGG1111 or ENGG1330,,COMP1117B,"Computer Programming (Quant Fin, DA, Minor, 2n...",This is an introductory course in computer pro...,[Computational mind] Able to identify possibl...,50.0,50.0,,,,,
2,2022,6,,,,,COMP1117 or ENGG1111,,ENGG1330A,Computer Programming I (A1 - M2),This is an introductory course designed for fi...,[Computational mind] Able to identify possible...,70.0,30.0,26.0,13.0,,,


In [5]:
# load science recommender datasets
df_science_all = pd.read_csv(CURRICULUM_DATA_PATH + 'df_science_all_course_details.csv', index_col = 0) # all science course details
df_science_profs = pd.read_csv(CURRICULUM_DATA_PATH + 'df_science_courses_and_professors.csv', index_col = 0) # science course professors
df_science_meta = pd.read_csv(CURRICULUM_DATA_PATH + 'df_science_courses_pre.csv', index_col = 0) # meta table

In [6]:
# add necessary columns to the 'df_cs_all' dataset
df_cs_all['Offering Department'] = 'Computer Science'

In [7]:
# extract course content related columns for cs
df_cs_content = df_cs_all[[
    'Offering Department', 'Course Code', 'Course Title', 'Course Description', 'Learning Outcomes'
]]

In [8]:
df_cs_content.head()

Unnamed: 0,Offering Department,Course Code,Course Title,Course Description,Learning Outcomes
0,Computer Science,COMP1117A,"Computer Programming (ActSc, AppAI, DA, IS, Mi...",This is an introductory course in computer pro...,[Computational mind] Able to identify possibl...
1,Computer Science,COMP1117B,"Computer Programming (Quant Fin, DA, Minor, 2n...",This is an introductory course in computer pro...,[Computational mind] Able to identify possibl...
2,Computer Science,ENGG1330A,Computer Programming I (A1 - M2),This is an introductory course designed for fi...,[Computational mind] Able to identify possible...
3,Computer Science,ENGG1340A,Computer Programming II,This course covers intermediate to advanced co...,[Programming environment and technologies] Abl...
4,Computer Science,ENGG1340B,Computer Programming II,This course covers intermediate to advanced co...,[Programming environment and technologies] Abl...


In [9]:
# add necessary columns to the 'df_science_all' dataset
df_science_all['Course Title'] = df_science_meta['Title'] # add course title column
df_science_all['Course Description'] = df_science_all['Course Contents & Topics'] + ' ' + df_science_all['Course Objectives']

In [10]:
# extract course content related columns for science
df_science_content = df_science_all[[
    'Offering Department', 'Course Code', 'Course Title', 'Course Description'
]]
df_science_content['Learning Outcomes'] = df_science_all['Course Learning Outcomes']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [11]:
df_science_content.head()

Unnamed: 0,Offering Department,Course Code,Course Title,Course Description,Learning Outcomes
0,Biomedical Sciences,BIOC1600,Perspectives in biochemistry,A Biochemical Perspective on the Basic Science...,describe the basics of biomolecular structure ...
1,Biomedical Sciences,BIOC2600,Basic biochemistry,"Structure and functions of carbohydrates, lipi...",relate the structures to functions of major bi...
2,Biomedical Sciences,BIOC3601,Basic metabolism,This course focuses on the central metabolic p...,achieve a vigorous intellectual appreciation o...
3,Biomedical Sciences,BIOC3604,Essential techniques in biochemistry and molec...,Basic concepts in experimental science; writin...,describe and explain the principles underlying...
4,Biomedical Sciences,BIOC3605,Sequence bioinformatics,This course will introduce and discuss the fol...,search and retrieve sequence data from biologi...


In [12]:
# combine both cs and science datasets for course content info
df_course_content = df_cs_content.append(df_science_content)
df_course_content = df_course_content.reset_index(drop = True)

In [13]:
# rename
df_course_content = df_course_content.rename(columns={'Offering Department': 'Subject Domain'})

In [14]:
df_course_content

Unnamed: 0,Subject Domain,Course Code,Course Title,Course Description,Learning Outcomes
0,Computer Science,COMP1117A,"Computer Programming (ActSc, AppAI, DA, IS, Mi...",This is an introductory course in computer pro...,[Computational mind] Able to identify possibl...
1,Computer Science,COMP1117B,"Computer Programming (Quant Fin, DA, Minor, 2n...",This is an introductory course in computer pro...,[Computational mind] Able to identify possibl...
2,Computer Science,ENGG1330A,Computer Programming I (A1 - M2),This is an introductory course designed for fi...,[Computational mind] Able to identify possible...
3,Computer Science,ENGG1340A,Computer Programming II,This course covers intermediate to advanced co...,[Programming environment and technologies] Abl...
4,Computer Science,ENGG1340B,Computer Programming II,This course covers intermediate to advanced co...,[Programming environment and technologies] Abl...
...,...,...,...,...,...
425,Sci Common Core,CCST9051,What are We Made of - the Fundamental Nature o...,,
426,Sci Common Core,CCST9054,"War, Peace, and the Natural World",,
427,Sci Common Core,CCST9056,The Force is with You: How Things Work,,
428,Sci Common Core,CCST9067,Leaving Earth: Our Future in Space,,


In [15]:
# save dataset to local
df_course_content.to_csv('data/df_course_content_with_domain.csv')

## UIUC Dataset

In [15]:
# load raw UIUC dataset from local
df_uiuc = pd.read_csv('data/uiuc-courses-2022-sp.csv')

In [16]:
# define a list of subjects to consider for the UIUC dataset
uiuc_subjects_to_consider = [
    'CS', 'IS', # cs and data science
    'ABE', 'BIOC', 'BSE', 'CDB', 'CHBE', 'IB', 'MCB', 'MICR', 'PATH', # biology
    'CHEM', # chemistry
    'PHYS', # physics
    'FSHN', 'NUTR', # nutritional science
    'ESE', 'GEOL', 'ENSU', 'ENVS', 'NRES',
    'STAT', 'ASRM',
    'CS', 'CSE',
    'MATH'
]

In [17]:
# select the course description columns
df_uiuc_course_content = df_uiuc[[
    'Subject', 'Number', 'Name', 'Description'
]]

In [18]:
# drop any duplicates
df_uiuc_course_content = df_uiuc_course_content.drop_duplicates()

In [19]:
# select only the relevant subjects
df_uiuc_course_content = df_uiuc_course_content.loc[df_uiuc_course_content['Subject'].isin(uiuc_subjects_to_consider)]

In [20]:
# reset index
df_uiuc_course_content = df_uiuc_course_content.reset_index(drop=True)

In [21]:
# map the different subject codes into different subject domains
uiuc_subjects_dict = {}

for key in ['CS', 'IS']:
    uiuc_subjects_dict[key] = 'Computer Science'
for key in ['ABE', 'BIOC', 'BSE', 'CDB', 'CHBE', 'IB', 'MCB', 'MICR', 'PATH']:
    uiuc_subjects_dict[key] = 'Biology'
for key in ['CHEM']:
    uiuc_subjects_dict[key] = 'Chemistry'
for key in ['PHYS']:
    uiuc_subjects_dict[key] = 'Physics'
for key in ['FSHN', 'NUTR']:
    uiuc_subjects_dict[key] = 'Nutritional Science'
for key in ['ESE', 'GEOL', 'ENSU', 'ENVS', 'NRES']:
    uiuc_subjects_dict[key] = 'Earth Sciences'
for key in ['STAT', 'ASRM']:
    uiuc_subjects_dict[key] = 'Statistics & Actuarial Science'
for key in ['MATH']:
    uiuc_subjects_dict[key] = 'Mathematics'

In [22]:
# add 'Subject Domain' column to the UIUC dataset based on the subject codes
df_uiuc_course_content['Subject Domain'] = df_uiuc_course_content['Subject'].map(uiuc_subjects_dict)

In [23]:
df_uiuc_course_content.head()

Unnamed: 0,Subject,Number,Name,Description,Subject Domain
0,ABE,141,ABE Principles: Biological,"Principles of biology relevant to agriculture,...",Biology
1,ABE,152,Water in the Global Environment,This course develops a comprehensive understan...,Biology
2,ABE,199,Undergraduate Open Seminar,May be repeated to a maximum of 12 hours.,Biology
3,ABE,225,ABE Principles: Bioenvironment,Principles of environmental control for biolog...,Biology
4,ABE,226,ABE Principles: Bioprocessing,Principles of bioprocess engineering applied t...,Biology


In [24]:
# convert the columns into the same columns as the HKU dataset
df_uiuc_course_content['Course Code'] = df_uiuc_course_content['Subject'] + df_uiuc_course_content['Number'].apply(lambda x: str(x))
df_uiuc_course_content = df_uiuc_course_content.rename(columns = {
    'Name': 'Course Title',
    'Description': 'Course Description'
})
df_uiuc_course_content = df_uiuc_course_content.drop(columns=['Subject', 'Number'])

In [25]:
df_uiuc_course_content

Unnamed: 0,Course Title,Course Description,Subject Domain,Course Code
0,ABE Principles: Biological,"Principles of biology relevant to agriculture,...",Biology,ABE141
1,Water in the Global Environment,This course develops a comprehensive understan...,Biology,ABE152
2,Undergraduate Open Seminar,May be repeated to a maximum of 12 hours.,Biology,ABE199
3,ABE Principles: Bioenvironment,Principles of environmental control for biolog...,Biology,ABE225
4,ABE Principles: Bioprocessing,Principles of bioprocess engineering applied t...,Biology,ABE226
...,...,...,...,...
792,Topics in Statistics,May be repeated if topics vary. Prerequisite: ...,Statistics & Actuarial Science,STAT578
793,Hierarchical Linear Models,Same as PSYC 587 and EPSY 587. See EPSY 587.,Statistics & Actuarial Science,STAT587
794,Individual Study and Research,Directed reading and research. Approved for le...,Statistics & Actuarial Science,STAT590
795,STAT Internship,"Supervised, off-campus experience in a field i...",Statistics & Actuarial Science,STAT593


In [26]:
# save dataset to local
df_uiuc_course_content.to_csv('data/df_uiuc_course_content_with_domain.csv')

## HKU + UIUC Dataset

In [27]:
# combine both HKU and UIUC datasets
df = df_course_content.append(df_uiuc_course_content)

In [28]:
df = df.reset_index(drop=True)

In [29]:
# save dataset to local
df.to_csv('data/hku_uiuc_course_content_with_domain.csv')

In [30]:
# combine the course descriptions and outcomes
df_course_description = df[['Course Description', 'Subject Domain']]

# drop any duplicates
df_course_description = df_course_description.dropna().drop_duplicates().reset_index(drop=True)

In [31]:
# save dataset to local
df_course_description.to_csv('data/df_hku_uiuc_course_description_with_domain.csv')

# Finding Subject Domain Keywords

In [3]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

from nltk import word_tokenize
from nltk import sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
import string

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Kackie\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Kackie\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Kackie\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Kackie\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [4]:
# load hku + uiuc dataset from local
df_course_description = pd.read_csv('data/df_hku_uiuc_course_description_with_domain.csv', index_col=0)

df_course_description

Unnamed: 0,Course Description,Subject Domain
0,This is an introductory course in computer pro...,Computer Science
1,This is an introductory course designed for fi...,Computer Science
2,This course covers intermediate to advanced co...,Computer Science
3,This course introduces the basic concepts of f...,Computer Science
4,This course introduces concepts and applicatio...,Computer Science
...,...,...
1138,May be repeated if topics vary. Prerequisite: ...,Statistics & Actuarial Science
1139,Same as PSYC 587 and EPSY 587. See EPSY 587.,Statistics & Actuarial Science
1140,Directed reading and research. Approved for le...,Statistics & Actuarial Science
1141,"Supervised, off-campus experience in a field i...",Statistics & Actuarial Science


In [5]:
# combine all descriptions of a subject domain into one single text instead of sarapate rows
def combine_all_rows_into_one(df_filtered, subject_domain):
    combined_text = ''
    for text in df_filtered:
        combined_text += text + ' '
    
    df_new = pd.DataFrame({'Subject Domain Description': combined_text, 'Subject Domain': subject_domain}, index=[0])
    return df_new

In [6]:
def combine_descriptions_for_all_subjects(df):
    subject_domains = df['Subject Domain'].unique()

    df_result = pd.DataFrame()
    for subject in subject_domains:
        df_filtered = df.loc[df['Subject Domain'] == subject]['Course Description'].drop_duplicates()
        df_new_row = combine_all_rows_into_one(df_filtered, subject)
        df_result = df_result.append(df_new_row)
    
    df_result = df_result.reset_index(drop=True)

    return df_result


In [7]:
df_subject_domain_descriptions = combine_descriptions_for_all_subjects(df_course_description)

In [8]:
df_subject_domain_descriptions

Unnamed: 0,Subject Domain Description,Subject Domain
0,This is an introductory course in computer pro...,Computer Science
1,A Biochemical Perspective on the Basic Science...,Biomedical Sciences
2,An issue-based approach will be adopted to ena...,Biological Sciences
3,Topics covered in the course will be:- Finding...,English
4,Topic 1: Gases: Their Properties and Behaviou...,Chemistry
5,- Grammar & vocabulary of modern Chinese\r - T...,Chinese
6,"Global climatic systems, climate classificatio...",Earth Sciences
7,The course will cover: Computational science a...,Mathematics
8,Topics include: the science in the household a...,Physics
9,This course aims at increasing students' aware...,Faculty


In [9]:
# save dataset to local
df_subject_domain_descriptions.to_csv('data/df_subject_domain_descriptions.csv')

Our goal is to match the student's skill(s) with a subject domain, so that the student can have a better idea on what courses to study. Therefore, the following models focus on:
- Input: A set of phrases / sentences that the student inputs into our chatbot
- Output: A subject domain classification that best matches with the skills the student has

We will:
1. First build a taxonomy of all the skills required for each subject domain (for this part we use an unsupervised method)
    - TF-IDF
    - TextRank
    - TopicRank
    - YAKE!
    - KeyBERT
2. Second implement a model to match the student's skills with the subject domain (for this part we use supervised learning)

## 1. Skills taxonomy for each subject domain (unsupervised model)

In [10]:
# load dataset from local
df_subject_domain_descriptions = pd.read_csv('data/df_subject_domain_descriptions.csv', index_col=0)

### Helper Functions

#### Helper functions for text preprocessing

In [11]:
# helper function
# return: LIST of tokenized words
def preprocess(text, with_stopwords=False, lemmatize=True):
    text = text.lower()
    words = word_tokenize(text)
    words = [word for word in words if word.isalpha()]
    if with_stopwords==False:
        stop_words = set(stopwords.words('english'))
        words = [word for word in words if not word in stop_words]
    if lemmatize==True:
        lemmatizer=WordNetLemmatizer()
        words = [lemmatizer.lemmatize(word) for word in words]

    return words

In [12]:
def lemmatize(text):
    text = text.lower()
    words = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    
    return words

In [13]:
def stem(text):
    text = text.lower()
    words = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if not word in stop_words]
    stemmer = PorterStemmer()
    words = [stemmer.stem(word) for word in words]

    return words

In [14]:
def convert_list_to_sent(words):
    sentence = ' '.join(words)
    return sentence

In [15]:
def stem_entire_document(document):
    final_text = ''
    stemmed_doc = convert_list_to_sent(stem(document))
    final_text += stemmed_doc + ' '
    return final_text

#### Helper functions for top k keywords

In [25]:
# helper function
# get top k keywords based on a specific keyword extraction type
# get top k keywords
from summa import keywords
from yake import KeywordExtractor
from keybert import KeyBERT

def get_top_k_keywords(document, type, k=10):
    if type == 'textrank':
        top_k_keywords = keywords.keywords(document).split('\n')
    elif type == 'yake':
        kw_extractor = KeywordExtractor(lan='en', n=1, top=k)
        top_k_keywords = kw_extractor.extract_keywords(text=document)
        
        # since the extractor returns a tuple, we extract the first element in the tuple only
        top_k_keywords = [x for x, _ in top_k_keywords]

    elif type == 'keybert':
        kw_extractor = KeyBERT('distilbert-base-nli-mean-tokens')
        top_k_keywords = kw_extractor.extract_keywords(document, stop_words='english', top_n=k, use_mmr=True)

    return top_k_keywords

In [17]:
# helper function
# get top k keywords for ALL SUBJECT DOMAINS based on a specific keyword extraction type (e.g. 'tfidf', 'textrank', ...)
# return: DATAFRAME
def get_top_k_keywords_for_all_subjects(df, type, k=10):
    subject_domains = df['Subject Domain'].unique()

    results = []
    for subject in subject_domains:
        document = df.loc[df['Subject Domain'] == subject]['Subject Domain Description'].values[0]
        stemmed_document = stem_entire_document(document)
        top_k_keywords = get_top_k_keywords(stemmed_document, type, k)
        results.append([subject, top_k_keywords])
    
    df_result = pd.DataFrame(results, columns=['Subject Domain', 'Top K Keywords'])
    return df_result

### Model 1: TF-IDF

In [50]:
# helper function
# identity tokenizer that returns the original text (for tfidf)
def identity_tokenizer(text):
    return text

In [97]:
from sklearn.feature_extraction.text import TfidfVectorizer

def get_top_k_keywords_tfidf(documents, k=10):

    # define tfidf vectorizer
    tfidf = TfidfVectorizer(tokenizer=identity_tokenizer, lowercase=False)
    transformed_documents = tfidf.fit_transform(documents)

    feature_array = tfidf.get_feature_names()
    tfidf_scores = sorted(list(zip(
        tfidf.get_feature_names(), transformed_documents.sum(0).getA1())), key=lambda x: x[1], reverse=True)[:k]

    idf_scores = sorted(list(zip(feature_array,tfidf.idf_,)), key = lambda x: x[1], reverse=True)[:k]

    print(tfidf_scores)
    

In [101]:
documents = df_course_description.loc[df_course_description['Subject Domain'] == 'Chemistry']['Course Description']
documents = documents.apply(preprocess)
get_top_k_keywords_tfidf(documents, 20)

[('chem', 11.832816235278253), ('chemistry', 8.09839277236706), ('student', 6.002582489182729), ('course', 5.003532047836422), ('organic', 4.9924757913933835), ('credit', 4.394229631219657), ('hour', 4.31921569572002), ('chemical', 4.301486384398401), ('graduate', 4.248369656505552), ('prerequisite', 4.1882763263630265), ('may', 3.74055485770705), ('inorganic', 3.0387652530800136), ('material', 2.71395051191926), ('undergraduate', 2.684230552624926), ('laboratory', 2.6444490418073463), ('see', 2.586844236222122), ('technique', 2.559782959528158), ('required', 2.556057896594594), ('synthesis', 2.3844607092227665), ('structure', 2.331992337888255)]


### Model 2: TextRank

In [18]:
# get top 10 keywords using TextRank
top_k_textrank = get_top_k_keywords_for_all_subjects(df_subject_domain_descriptions, 'textrank', 10)

In [19]:
top_k_textrank

Unnamed: 0,Subject Domain,Top K Keywords
0,Computer Science,"[data, student, cours comput program, informat..."
1,Biomedical Sciences,"[protein, student, cours, molecular, biochemis..."
2,Biological Sciences,"[cours, biology, biological, process, environm..."
3,English,"[student, scienc, learn, cours, present, langu..."
4,Chemistry,"[chemistry, chemistri equival, cours aim provi..."
5,Chinese,"[messag, student, write, techniqu, report, pre..."
6,Earth Sciences,"[student, environ, earth, geology manag, impac..."
7,Mathematics,"[theorems, mathematics, applic, applications, ..."
8,Physics,"[cours, basic physic, theori, topic includ, eq..."
9,Faculty,"[student, students, science, sciences, cours, ..."


In [20]:
# save TextRank top 10 to local
top_k_textrank.to_csv('subject-skills-results/top-10-textrank.csv')

### Model 3: YAKE!

In [111]:
top_k_yake = get_top_k_keywords_for_all_subjects(df_subject_domain_descriptions, 'yake', k=20)

In [112]:
top_k_yake

Unnamed: 0,Subject Domain,Top K Keywords
0,Computer Science,"[hour, cours, graduat, topic, inform, credit, ..."
1,Biomedical Sciences,"[protein, student, cours, molecular, biochemis..."
2,Biological Sciences,"[student, cours, biolog, food, major, scienc, ..."
3,English,"[present, studi, cours, languag, student, skil..."
4,Chemistry,"[chemistri, chem, cours, student, chemic, orga..."
5,Chinese,"[email, messag, techniqu, write, style, rhetor..."
6,Earth Sciences,"[hour, geol, prerequisit, student, geolog, env..."
7,Mathematics,"[math, hour, prerequisit, cours, linear, diffe..."
8,Physics,"[elect, cours, physic, astronomi, theme, numer..."
9,Faculty,"[student, cours, scienc, research, develop, di..."


In [123]:
# convert the top 20 keywords for each subject domain to a dictionary
top_k_yake_dict = top_k_yake.set_index('Subject Domain')['Top K Keywords'].to_dict()

In [125]:
# save the dictionary to json
import json
with open('data/subject_domain_keywords_top_20_yake_dict.json', 'w') as fp:
    json.dump(top_k_yake_dict, fp)

In [118]:
data

{'Top K Keywords': {'Computer Science': ['hour',
   'cours',
   'graduat',
   'topic',
   'inform',
   'credit',
   'quantum',
   'comput',
   'student',
   'data',
   'math',
   'prerequisit',
   'includ',
   'profession',
   'design',
   'program',
   'learn',
   'develop',
   'scienc',
   'undergradu'],
  'Biomedical Sciences': ['protein',
   'student',
   'cours',
   'molecular',
   'biochemistri',
   'sequenc',
   'cell',
   'basic',
   'signal',
   'structur',
   'provid',
   'biolog',
   'gene',
   'studi',
   'analysi',
   'acid',
   'method',
   'metabol',
   'scienc',
   'life'],
  'Biological Sciences': ['student',
   'cours',
   'biolog',
   'food',
   'major',
   'scienc',
   'provid',
   'environment',
   'ecolog',
   'studi',
   'process',
   'field',
   'system',
   'understand',
   'nutrit',
   'introduc',
   'chang',
   'knowledg',
   'basic',
   'topic'],
  'English': ['present',
   'studi',
   'cours',
   'languag',
   'student',
   'skill',
   'learn',
   'professi

In [108]:
# save YAKE top 20 to csv
top_k_yake.to_csv('subject-skills-results/top-20-yake.csv')

### Model 4: KeyBERT

In [26]:
top_k_keybert = get_top_k_keywords_for_all_subjects(df_subject_domain_descriptions, 'keybert', k=10)

In [27]:
top_k_keybert

Unnamed: 0,Subject Domain,Top K Keywords
0,Computer Science,"[(python, 0.4429), (javascript, 0.3521), (biol..."
1,Biomedical Sciences,"[(biotechnolog, 0.6446), (biochemistry, 0.6398..."
2,Biological Sciences,"[(biotechnologypract, 0.5221), (biochemistry, ..."
3,English,"[(presentationstud, 0.6183), (examin, 0.5725),..."
4,Chemistry,"[(chem1042, 0.4485), (hour, 0.4073), (august, ..."
5,Chinese,"[(email, 0.3805), (techniqu, 0.306), (linguist..."
6,Earth Sciences,"[(geotechn, 0.6321), (paleoproterozo, 0.5872),..."
7,Mathematics,"[(math3943, 0.5118), (riemannian, 0.4964), (co..."
8,Physics,"[(cosmolog, 0.5392), (biophys, 0.4803), (calcu..."
9,Faculty,"[(entrepreneurship, 0.4537), (biotechnolog, 0...."


In [28]:
# save KeyBERT top 10 to local
top_k_keybert.to_csv('subject-skills-results/top-10-keybert.csv')