In [31]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import os
import ast
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [32]:
# Load the pretrained sentence-embedding model that is passed to the encode_* helpers below.
model = SentenceTransformer('all-mpnet-base-v2')

Downloading (…)a8e1d/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)0bca8e1d/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)e1d/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)a8e1d/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)8e1d/train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)bca8e1d/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [33]:
def from_np_array(array_string):
    """Parse the string form of a Python literal (typically a list) from a CSV cell.

    Used as a ``converters=`` callback for ``pd.read_csv``.

    Parameters
    ----------
    array_string : str or None or float
        Text of a Python literal such as ``"['AB1201', 'AB1202']"``.

    Returns
    -------
    object
        The evaluated literal, or ``[]`` when the cell is empty, whitespace-only,
        ``None`` or a float (e.g. NaN).

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when the text is not a valid literal.
    """
    # Missing values never carry a list, so treat them like an empty cell.
    if array_string is None or isinstance(array_string, float):
        return []

    text = array_string.strip()

    # literal_eval raises SyntaxError on an empty expression, so short-circuit blanks.
    if not text:
        return []

    # literal_eval only evaluates literals, so it is safe on file content.
    return ast.literal_eval(text)

In [34]:
def fill_empty_course_info(row):
    """Return the course description, falling back to the course name when it is empty.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys 'Course Name' and 'Course Information'.

    Returns
    -------
    object
        The 'Course Information' value when it is a non-blank string,
        otherwise the 'Course Name' value.
    """
    course_name = row['Course Name']
    course_info = row['Course Information']

    # Anything that is not a string (e.g. None or NaN) counts as missing,
    # and so does a string that holds only whitespace.
    if not isinstance(course_info, str) or not course_info.strip():
        return course_name

    return course_info
    

In [35]:
def encode_course_info(model, row):
    """Encode a course description into a pipe-delimited embedding string.

    Parameters
    ----------
    model : object
        Must expose ``encode(text)``; the returned value is iterated to obtain
        the individual embedding components.
    row : mapping or pandas.Series
        Must provide the key 'Course Information'.

    Returns
    -------
    str
        ``str()`` of every component joined by '|', or '' when the description
        is not a string.
    """
    course_info = row['Course Information']

    if not isinstance(course_info, str):
        return ""

    embeddings = model.encode(course_info)

    # join builds the string in one pass and needs no trailing-separator trim.
    return "|".join(str(component) for component in embeddings)

In [36]:
def encode_course_name(model, row):
    """Encode a course name into a pipe-delimited embedding string.

    Parameters
    ----------
    model : object
        Must expose ``encode(text)``; the returned value is iterated to obtain
        the individual embedding components.
    row : mapping or pandas.Series
        Must provide the key 'Course Name'.

    Returns
    -------
    str
        ``str()`` of every component joined by '|', or '' when the name is not
        a string.
    """
    course_name = row['Course Name']

    if not isinstance(course_name, str):
        return ""

    embeddings = model.encode(course_name)

    # join builds the string in one pass and needs no trailing-separator trim.
    return "|".join(str(component) for component in embeddings)

In [37]:
# Plain load of the scraped module table (no converters, so list-like columns stay as strings).
# NOTE(review): the next cell reloads the same file with converters and overwrites this result.
all_modules = pd.read_csv('scraped-data/all_modules.csv')

In [38]:
# Load the scraped module table, parsing the list-valued columns with from_np_array.
all_modules = pd.read_csv(
    'scraped-data/all_modules.csv',
    converters=dict.fromkeys(['Prerequisites', 'Mutually Exclusive', 'Topics'], from_np_array),
)

In [39]:
# Course codes present in the table; the loops below keep only references to these codes.
valid_courses = all_modules['Course Code'].tolist()

In [40]:
# Empty output frame: one row per (course, prerequisite group).
prerequisites_group_df = pd.DataFrame(columns=['course_code', 'prerequisites', 'group_id'])

In [41]:
# Empty output frame: one row per (course, mutually exclusive course) pair.
mutually_exclusive_group_df = pd.DataFrame(columns=['course_code', 'mutually_exclusive'])

In [42]:
# Running counter used to build the "prereq_group_<n>" identifiers.
prereq_id = 0

In [43]:
# Build one (course_code, mutually_exclusive) row for every exclusion that refers to a
# known course. Rows are collected in a list and the frame is created once, because
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call.
# Rebuilding the frame also makes this cell safe to re-run without duplicating rows.
valid_course_set = set(valid_courses)  # constant-time membership tests
mutually_exclusive_records = []

for _, row in all_modules.iterrows():
    exclusions = row['Mutually Exclusive']

    # Missing cells (None / NaN) and empty lists contribute nothing.
    if not isinstance(exclusions, (list, tuple)) or not exclusions:
        continue

    for mutually_exclusive in exclusions:
        if mutually_exclusive in valid_course_set:
            mutually_exclusive_records.append(
                {'course_code': row['Course Code'], 'mutually_exclusive': mutually_exclusive}
            )

mutually_exclusive_group_df = pd.DataFrame(
    mutually_exclusive_records, columns=['course_code', 'mutually_exclusive']
)

In [44]:
# Build one row per prerequisite group whose members are all known courses.
# Each group becomes a '|'-joined string with its own "prereq_group_<n>" id.
# Rows are collected in a list and the frame is created once, because DataFrame.append
# was removed in pandas 2.0 and copied the whole frame on every call. Rebuilding the
# frame also makes this cell safe to re-run without duplicating rows.
valid_course_set = set(valid_courses)  # constant-time membership tests
prerequisite_records = []

for _, row in all_modules.iterrows():
    prerequisite_groups = row['Prerequisites']

    # Missing cells (None / NaN) and empty lists contribute nothing.
    if not isinstance(prerequisite_groups, (list, tuple)) or not prerequisite_groups:
        continue

    for group in prerequisite_groups:
        # A single unknown course invalidates the whole group; empty groups are skipped.
        if not group or not all(code in valid_course_set for code in group):
            continue

        prerequisite_records.append({
            'course_code': row['Course Code'],
            'prerequisites': '|'.join(group),
            'group_id': "prereq_group_" + str(prereq_id),
        })
        # The counter advances only for groups that are actually emitted.
        prereq_id += 1

prerequisites_group_df = pd.DataFrame(
    prerequisite_records, columns=['course_code', 'prerequisites', 'group_id']
)

In [45]:
# Write the prerequisite groups under <cwd>/scraped-data, without the index column.
prerequisites_group_df.to_csv(os.path.join(os.getcwd(), "scraped-data/prerequisite_groups.csv"), index=False)

In [46]:
# Write the mutual-exclusion pairs under <cwd>/scraped-data, without the index column.
mutually_exclusive_group_df.to_csv(os.path.join(os.getcwd(), "scraped-data/mutually_exclusive.csv"), index=False)

In [47]:
# Remove the raw 'Prerequisites' column (it was exported to prerequisite_groups.csv above).
# Rebinding with errors='ignore' instead of inplace=True keeps this cell safe to re-run.
all_modules = all_modules.drop(columns=['Prerequisites'], errors='ignore')

In [48]:
# Remove the raw 'Mutually Exclusive' column (it was exported to mutually_exclusive.csv above).
# Rebinding with errors='ignore' instead of inplace=True keeps this cell safe to re-run.
all_modules = all_modules.drop(columns=['Mutually Exclusive'], errors='ignore')

In [49]:
# Fill empty course descriptions row by row; the helper is passed directly to apply.
all_modules['Course Information'] = all_modules.apply(fill_empty_course_info, axis=1)

In [50]:
# Store each course description's embedding as a '|'-delimited string (one model.encode call per row).
all_modules['Encoded Course Information'] = all_modules.apply(lambda row: encode_course_info(model, row), axis=1)

In [51]:
# Store each course name's embedding as a '|'-delimited string (one model.encode call per row).
all_modules['Encoded Course Name'] = all_modules.apply(lambda row: encode_course_name(model, row), axis=1)

In [54]:
# Write the table, including the two encoding columns, under <cwd>/scraped-data.
all_modules.to_csv(os.path.join(os.getcwd(), "scraped-data/all_modules_with_encodings.csv"), index=False)

In [27]:
# Load the topic-annotated module table, parsing the list-valued columns with from_np_array.
all_modules_with_topics = pd.read_csv(
    'all_modules_with_topics.csv',
    converters=dict.fromkeys(['Prerequisites', 'Mutually Exclusive', 'Topics'], from_np_array),
)

In [7]:
# Preview the last rows (up to 50) of the module table.
all_modules.tail(50)

Unnamed: 0,Course Code,Course Name,Academic Units,Faculty,BDE,Grade Type,Prerequisites,Mutually Exclusive,Course Information,Discipline
1822,AED38E,PSYCHOPATHOLOGY IN YOUTHS,3,National Institute of Education,Yes,Letter Graded,[],[],This course aims to equip students with the fo...,National Institute of Education
1823,MA0303,HUMAN RESOURCE MANAGEMENT,3,School of Mechanical and Aerospace Engineering,Yes,Letter Graded,[],"[BG4902, BS4202, CE8003, CH4902, CV4202, ...",The Nature of Human Relations. The study of in...,Mechanical/Aerospace Engineering
1824,MA5105,ENGINEERING MANAGEMENT ANALYSIS,3,School of Mechanical and Aerospace Engineering,Yes,Letter Graded,[],[],Principles of engineering economy as a basis f...,Mechanical/Aerospace Engineering
1825,CM5012,INTRODUCTION TO FORENSIC SCIENCE,2,"School of Chemistry, Chemical Engineering and ...",Yes,Pass/Fail,[],"[CM5002, CM8002]","On completing this course, you will understand...","Chemistry, Chemical Engineering and Biotechnology"
1826,CS5207,"FIFTY DISCOVERIES, FIFTY INVENTIONS",3,Wee Kim Wee School of Communication and Inform...,Yes,Letter Graded,[],[],This course aims to acquaint you with the hist...,Communication Studies
1827,EE5087,LIVING WITH MATHEMATICS,3,School of Electrical and Electronic Engineering,Yes,Letter Graded,[],[],Solving algebraic equations and applications. ...,Electrical Engineering
1828,DM5001,WEB DESIGN,3,"School Of Art, Design and Media",Yes,Letter Graded,[],[DM3000],This course aims to introduce you to the funda...,"Art, Design and Media"
1829,BU5241,FINANCIAL WELLBEING,3,Nanyang Business School,Yes,Letter Graded,[],[],This course is designed to equip students with...,Business/Accountancy
1830,BU5544,MARKETING HEALTH: CONSUMERS' PURSUIT OF WELLNESS,3,Nanyang Business School,Yes,Letter Graded,[],[],"The health, beauty, and wellness industry is p...",Business/Accountancy
1831,BU5644,SMARTER BRAIN? THE SCIENCE OF DECISION MAKING,3,Nanyang Business School,Yes,Letter Graded,[],[],Effective decision making is an essential skil...,Business/Accountancy


In [8]:
# Course codes present in the table; the loops below keep only references to these codes.
valid_courses = all_modules['Course Code'].tolist()

In [12]:
# Empty output frame: one row per (course, prerequisite group).
prerequisites_group_df = pd.DataFrame(columns=['course_code', 'prerequisites', 'group_id'])

In [13]:
# Empty output frame: one row per (course, mutually exclusive course) pair.
mutually_exclusive_group_df = pd.DataFrame(columns=['course_code', 'mutually_exclusive'])

In [14]:
# Running counter used to build the "prereq_group_<n>" identifiers.
prereq_id = 0

In [17]:
# Build one (course_code, mutually_exclusive) row for every exclusion that refers to a
# known course. Rows are collected in a list and the frame is created once, because
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call.
# Rebuilding the frame also makes this cell safe to re-run without duplicating rows.
valid_course_set = set(valid_courses)  # constant-time membership tests
mutually_exclusive_records = []

for _, row in all_modules.iterrows():
    exclusions = row['Mutually Exclusive']

    # Missing cells (None / NaN) and empty lists contribute nothing.
    if not isinstance(exclusions, (list, tuple)) or not exclusions:
        continue

    for mutually_exclusive in exclusions:
        if mutually_exclusive in valid_course_set:
            mutually_exclusive_records.append(
                {'course_code': row['Course Code'], 'mutually_exclusive': mutually_exclusive}
            )

mutually_exclusive_group_df = pd.DataFrame(
    mutually_exclusive_records, columns=['course_code', 'mutually_exclusive']
)

In [18]:
# Build one row per prerequisite group whose members are all known courses.
# Each group becomes a '|'-joined string with its own "prereq_group_<n>" id.
# Rows are collected in a list and the frame is created once, because DataFrame.append
# was removed in pandas 2.0 and copied the whole frame on every call. Rebuilding the
# frame also makes this cell safe to re-run without duplicating rows.
valid_course_set = set(valid_courses)  # constant-time membership tests
prerequisite_records = []

for _, row in all_modules.iterrows():
    prerequisite_groups = row['Prerequisites']

    # Missing cells (None / NaN) and empty lists contribute nothing.
    if not isinstance(prerequisite_groups, (list, tuple)) or not prerequisite_groups:
        continue

    for group in prerequisite_groups:
        # A single unknown course invalidates the whole group; empty groups are skipped.
        if not group or not all(code in valid_course_set for code in group):
            continue

        prerequisite_records.append({
            'course_code': row['Course Code'],
            'prerequisites': '|'.join(group),
            'group_id': "prereq_group_" + str(prereq_id),
        })
        # The counter advances only for groups that are actually emitted.
        prereq_id += 1

prerequisites_group_df = pd.DataFrame(
    prerequisite_records, columns=['course_code', 'prerequisites', 'group_id']
)


In [19]:
# Write the prerequisite groups under <cwd>/scraped-data, without the index column.
prerequisites_group_df.to_csv(os.path.join(os.getcwd(), "scraped-data/prerequisite_groups.csv"), index=False)

In [20]:
# Write the mutual-exclusion pairs under <cwd>/scraped-data, without the index column.
mutually_exclusive_group_df.to_csv(os.path.join(os.getcwd(), "scraped-data/mutually_exclusive.csv"), index=False)

In [21]:
# Restart the "prereq_group_<n>" numbering before the loops below run again.
prereq_id = 0

In [22]:
# Build one (course_code, mutually_exclusive) row for every exclusion that refers to a
# known course. Rows are collected in a list and the frame is created once, because
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call.
# Rebuilding the frame also makes this cell safe to re-run without duplicating rows.
valid_course_set = set(valid_courses)  # constant-time membership tests
mutually_exclusive_records = []

for _, row in all_modules.iterrows():
    exclusions = row['Mutually Exclusive']

    # Missing cells (None / NaN) and empty lists contribute nothing.
    if not isinstance(exclusions, (list, tuple)) or not exclusions:
        continue

    for mutually_exclusive in exclusions:
        if mutually_exclusive in valid_course_set:
            mutually_exclusive_records.append(
                {'course_code': row['Course Code'], 'mutually_exclusive': mutually_exclusive}
            )

mutually_exclusive_group_df = pd.DataFrame(
    mutually_exclusive_records, columns=['course_code', 'mutually_exclusive']
)

In [23]:
# Build one row per prerequisite group whose members are all known courses.
# Each group becomes a '|'-joined string with its own "prereq_group_<n>" id.
# Rows are collected in a list and the frame is created once, because DataFrame.append
# was removed in pandas 2.0 and copied the whole frame on every call. Rebuilding the
# frame also makes this cell safe to re-run without duplicating rows.
valid_course_set = set(valid_courses)  # constant-time membership tests
prerequisite_records = []

for _, row in all_modules.iterrows():
    prerequisite_groups = row['Prerequisites']

    # Missing cells (None / NaN) and empty lists contribute nothing.
    if not isinstance(prerequisite_groups, (list, tuple)) or not prerequisite_groups:
        continue

    for group in prerequisite_groups:
        # A single unknown course invalidates the whole group; empty groups are skipped.
        if not group or not all(code in valid_course_set for code in group):
            continue

        prerequisite_records.append({
            'course_code': row['Course Code'],
            'prerequisites': '|'.join(group),
            'group_id': "prereq_group_" + str(prereq_id),
        })
        # The counter advances only for groups that are actually emitted.
        prereq_id += 1

prerequisites_group_df = pd.DataFrame(
    prerequisite_records, columns=['course_code', 'prerequisites', 'group_id']
)


In [19]:
# Write the prerequisite groups into the current working directory, without the index column.
prerequisites_group_df.to_csv(os.path.join(os.getcwd(), "prerequisite_groups.csv"), index=False)

In [20]:
# Write the mutual-exclusion pairs into the current working directory, without the index column.
mutually_exclusive_group_df.to_csv(os.path.join(os.getcwd(), "mutually_exclusive.csv"), index=False)

In [24]:
# Remove the raw 'Prerequisites' column (it was exported to prerequisite_groups.csv above).
# Rebinding with errors='ignore' instead of inplace=True keeps this cell safe to re-run.
all_modules = all_modules.drop(columns=['Prerequisites'], errors='ignore')

In [25]:
# Remove the raw 'Mutually Exclusive' column (it was exported to mutually_exclusive.csv above).
# Rebinding with errors='ignore' instead of inplace=True keeps this cell safe to re-run.
all_modules = all_modules.drop(columns=['Mutually Exclusive'], errors='ignore')

In [24]:
# Write the current module table to <cwd>/scraped-data/all_modules_final.csv, without the index column.
all_modules.to_csv(os.path.join(os.getcwd(), "scraped-data/all_modules_final.csv"), index=False)

In [25]:
# Reload the table that was just written (no converters are applied on this read).
all_modules = pd.read_csv("scraped-data/all_modules_final.csv")

In [28]:
# Fill empty course descriptions row by row; the helper is passed directly to apply.
all_modules['Course Information'] = all_modules.apply(fill_empty_course_info, axis=1)

In [27]:
# Write the table with filled-in course descriptions into the current working directory.
all_modules.to_csv(os.path.join(os.getcwd(), "all_modules_with_no_empty_course_info.csv"), index=False)

In [29]:
# Store each course description's embedding as a '|'-delimited string.
# Requires encode_course_info and model to already be defined in the running kernel.
all_modules['Encoded Course Information'] = all_modules.apply(lambda row: encode_course_info(model, row), axis=1)

NameError: name 'encode_course_info' is not defined

In [32]:
# Store each course name's embedding as a '|'-delimited string (one model.encode call per row).
all_modules['Encoded Course Name'] = all_modules.apply(lambda row: encode_course_name(model, row), axis=1)

In [33]:
# Write the table, including the encoding columns, into the current working directory.
all_modules.to_csv(os.path.join(os.getcwd(), "all_modules_with_encodings.csv"), index=False)

In [34]:
# Preview the last rows (up to 50), now including the two encoding columns.
all_modules.tail(50)

Unnamed: 0,Course Code,Course Name,Academic Units,Faculty,BDE,Grade Type,Course Information,Discipline,Encoded Course Information,Encoded Course Name
1822,AED38E,PSYCHOPATHOLOGY IN YOUTHS,3,National Institute of Education,Yes,Letter Graded,This course aims to equip students with the fo...,National Institute of Education,0.033153024|-0.032460276|0.011104785|0.0012712...,0.05109445|0.022670958|0.037992954|-0.03616319...
1823,MA0303,HUMAN RESOURCE MANAGEMENT,3,School of Mechanical and Aerospace Engineering,Yes,Letter Graded,The Nature of Human Relations. The study of in...,Mechanical/Aerospace Engineering,0.05683826|-0.042534854|-0.022763845|-0.028552...,0.08252762|-0.041635696|-0.023229161|-0.071781...
1824,MA5105,ENGINEERING MANAGEMENT ANALYSIS,3,School of Mechanical and Aerospace Engineering,Yes,Letter Graded,Principles of engineering economy as a basis f...,Mechanical/Aerospace Engineering,-0.03114151|-0.008115872|-0.012403406|0.017643...,-0.014787275|-0.054385096|-0.038888175|0.00561...
1825,CM5012,INTRODUCTION TO FORENSIC SCIENCE,2,"School of Chemistry, Chemical Engineering and ...",Yes,Pass/Fail,"On completing this course, you will understand...","Chemistry, Chemical Engineering and Biotechnology",0.012326361|0.0073167016|0.010166116|-0.018267...,0.034222413|0.008838856|0.020994691|0.01571587...
1826,CS5207,"FIFTY DISCOVERIES, FIFTY INVENTIONS",3,Wee Kim Wee School of Communication and Inform...,Yes,Letter Graded,This course aims to acquaint you with the hist...,Communication Studies,0.005856195|-0.047859687|-0.031065635|-0.04403...,0.033501387|-0.0077291057|-0.039543997|-0.0039...
1827,EE5087,LIVING WITH MATHEMATICS,3,School of Electrical and Electronic Engineering,Yes,Letter Graded,Solving algebraic equations and applications. ...,Electrical Engineering,-0.031674113|-0.036064573|-0.036995664|0.01021...,-0.02966896|0.10283402|-0.019766012|0.01550402...
1828,DM5001,WEB DESIGN,3,"School Of Art, Design and Media",Yes,Letter Graded,This course aims to introduce you to the funda...,"Art, Design and Media",0.060117096|-0.10119464|-0.06508245|-0.0134885...,0.048014462|-0.051703468|-0.042553253|9.245576...
1829,BU5241,FINANCIAL WELLBEING,3,Nanyang Business School,Yes,Letter Graded,This course is designed to equip students with...,Business/Accountancy,-0.03125687|-0.032849897|-0.04127914|-0.030825...,-0.040974416|0.049066547|-0.012413355|-0.01238...
1830,BU5544,MARKETING HEALTH: CONSUMERS' PURSUIT OF WELLNESS,3,Nanyang Business School,Yes,Letter Graded,"The health, beauty, and wellness industry is p...",Business/Accountancy,0.08450485|0.028086968|-0.0065518897|-0.039820...,0.03199752|0.023281489|0.009297315|-0.03288709...
1831,BU5644,SMARTER BRAIN? THE SCIENCE OF DECISION MAKING,3,Nanyang Business School,Yes,Letter Graded,Effective decision making is an essential skil...,Business/Accountancy,0.02796735|0.025945349|-0.03144469|-0.04691799...,0.011779841|0.050827924|-0.016455328|-0.055144...


In [28]:
# Preview the last rows (up to 50) of the topic-annotated table.
all_modules_with_topics.tail(50)

Unnamed: 0,Course Code,Course Name,Academic Units,Faculty,BDE,Grade Type,Prerequisites,Mutually Exclusive,Course Information,Topics
1822,AED38E,PSYCHOPATHOLOGY IN YOUTHS,3,NIE,Yes,Letter Graded,[],[],This course aims to equip students with the fo...,"[psychopathology youths course, psychopatholog..."
1823,MA0303,HUMAN RESOURCE MANAGEMENT,3,MAE,Yes,Letter Graded,[],"[BG4902, BS4202, CE8003, CH4902, CV4202, ...",The Nature of Human Relations. The study of in...,"[human resource management, hrm managing, fund..."
1824,MA5105,ENGINEERING MANAGEMENT ANALYSIS,3,MAE,Yes,Letter Graded,[],[],Principles of engineering economy as a basis f...,"[engineering management analysis, engineering ..."
1825,CM5012,INTRODUCTION TO FORENSIC SCIENCE,2,CCEB,Yes,Pass/Fail,[],"[CM5002, CM8002]","On completing this course, you will understand...","[understand forensic science, forensic science..."
1826,CS5207,"FIFTY DISCOVERIES, FIFTY INVENTIONS",3,WKWSCI,Yes,Letter Graded,[],[],This course aims to acquaint you with the hist...,[]
1827,EE5087,LIVING WITH MATHEMATICS,3,EEE,Yes,Letter Graded,[],[],Solving algebraic equations and applications. ...,[]
1828,DM5001,WEB DESIGN,3,ADM,Yes,Letter Graded,[],[DM3000],This course aims to introduce you to the funda...,[web design course]
1829,BU5241,FINANCIAL WELLBEING,3,NBS,Yes,Letter Graded,[],[],This course is designed to equip students with...,"[financial wellbeing course, financial wellbeing]"
1830,BU5544,MARKETING HEALTH: CONSUMERS' PURSUIT OF WELLNESS,3,NBS,Yes,Letter Graded,[],[],"The health, beauty, and wellness industry is p...","[marketing health consumers, marketing health]"
1831,BU5644,SMARTER BRAIN? THE SCIENCE OF DECISION MAKING,3,NBS,Yes,Letter Graded,[],[],Effective decision making is an essential skil...,[]


In [29]:
# Course codes present in the topic-annotated table; the loops below keep only references to these codes.
valid_courses = all_modules_with_topics['Course Code'].tolist()

In [30]:
# Empty output frame: one row per (course, prerequisite group).
prerequisites_group_df = pd.DataFrame(columns=['course_code', 'prerequisites', 'group_id'])

In [31]:
# Empty output frame: one row per (course, mutually exclusive course) pair.
mutually_exclusive_group_df = pd.DataFrame(columns=['course_code', 'mutually_exclusive'])

In [32]:
# Running counter used to build the "prereq_group_<n>" identifiers.
prereq_id = 0

In [33]:
# Build one (course_code, mutually_exclusive) row for every exclusion that refers to a
# known course. Rows are collected in a list and the frame is created once, because
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call.
# Rebuilding the frame also makes this cell safe to re-run without duplicating rows.
valid_course_set = set(valid_courses)  # constant-time membership tests
mutually_exclusive_records = []

for _, row in all_modules_with_topics.iterrows():
    exclusions = row['Mutually Exclusive']

    # Missing cells (None / NaN) and empty lists contribute nothing.
    if not isinstance(exclusions, (list, tuple)) or not exclusions:
        continue

    for mutually_exclusive in exclusions:
        if mutually_exclusive in valid_course_set:
            mutually_exclusive_records.append(
                {'course_code': row['Course Code'], 'mutually_exclusive': mutually_exclusive}
            )

mutually_exclusive_group_df = pd.DataFrame(
    mutually_exclusive_records, columns=['course_code', 'mutually_exclusive']
)


In [34]:
# Build one row per prerequisite group whose members are all known courses.
# Each group becomes a '|'-joined string with its own "prereq_group_<n>" id.
# Rows are collected in a list and the frame is created once, because DataFrame.append
# was removed in pandas 2.0 and copied the whole frame on every call. Rebuilding the
# frame also makes this cell safe to re-run without duplicating rows.
valid_course_set = set(valid_courses)  # constant-time membership tests
prerequisite_records = []

for _, row in all_modules_with_topics.iterrows():
    prerequisite_groups = row['Prerequisites']

    # Missing cells (None / NaN) and empty lists contribute nothing.
    if not isinstance(prerequisite_groups, (list, tuple)) or not prerequisite_groups:
        continue

    for group in prerequisite_groups:
        # A single unknown course invalidates the whole group; empty groups are skipped.
        if not group or not all(code in valid_course_set for code in group):
            continue

        prerequisite_records.append({
            'course_code': row['Course Code'],
            'prerequisites': '|'.join(group),
            'group_id': "prereq_group_" + str(prereq_id),
        })
        # The counter advances only for groups that are actually emitted.
        prereq_id += 1

prerequisites_group_df = pd.DataFrame(
    prerequisite_records, columns=['course_code', 'prerequisites', 'group_id']
)


In [35]:
# Write the prerequisite groups into the current working directory, without the index column.
prerequisites_group_df.to_csv(os.path.join(os.getcwd(), "prerequisite_groups.csv"), index=False)

In [13]:
# Write the mutual-exclusion pairs into the current working directory, without the index column.
mutually_exclusive_group_df.to_csv(os.path.join(os.getcwd(), "mutually_exclusive.csv"), index=False)

In [14]:
def parse_topics(list_of_topics):
    """Join a collection of topics into one '|'-delimited string.

    Parameters
    ----------
    list_of_topics : iterable or None or float
        The topics to join. ``None`` and floats (e.g. NaN) are treated as "no topics".

    Returns
    -------
    str or None
        The topics joined by '|', or ``None`` when there are no topics.
    """
    # Missing values cannot be iterated, so handle them before touching the content.
    if list_of_topics is None or isinstance(list_of_topics, float):
        return None

    topics = list(list_of_topics)

    if not topics:
        return None

    # str() keeps the join from failing on a non-string topic.
    return '|'.join(str(topic) for topic in topics)

In [15]:
# Convert every topic list into its '|'-delimited form; the helper is passed directly to apply.
all_modules_with_topics['Topics'] = all_modules_with_topics['Topics'].apply(parse_topics)

In [17]:
# Remove the raw 'Prerequisites' column (it was exported to prerequisite_groups.csv above).
# Rebinding with errors='ignore' instead of inplace=True keeps this cell safe to re-run.
all_modules_with_topics = all_modules_with_topics.drop(columns=['Prerequisites'], errors='ignore')

In [18]:
# Remove the raw 'Mutually Exclusive' column (it was exported to mutually_exclusive.csv above).
# Rebinding with errors='ignore' instead of inplace=True keeps this cell safe to re-run.
all_modules_with_topics = all_modules_with_topics.drop(columns=['Mutually Exclusive'], errors='ignore')

In [19]:
# Faculty abbreviation (as found in the 'Faculty' column) -> generic topic label.
faculty_to_topic_mapping = {
    'ASE': 'environmental earth systems science',
    'LCC': 'communication',
    'SOH': 'humanities',
    'CML': 'languages',
    'USP': 'premier',
    'NBS': 'business',
    'ICC': 'interdisciplinary',
    'SSS': 'social science',
    'NTUpreneur': 'entrepreneurship',
    'ADM': 'art, design and media',
    'CCEB': 'chemistry, chemical engineering and biotech',
    'SCSE': 'computer science',
    'EEE': 'electrical engineering',
    'COE': 'general engineering',
    'MAE': 'mechanical engineering',
    'SPMS': 'physical and mathematical sciences',
    'MSE': 'material science',
    'SBS': 'biological science',
    'CEE': 'civil and environmental engineering',
    'WKWSCI': 'communication studies',
    'REP': 'renaissance engineering',
    'NIE': 'education',
    'CNY': 'cn yang program',
    'LKCMedicine': 'medicine',
}

In [20]:
def add_faculty_to_topic(row):
    """Append the faculty's generic topic to a course's '|'-delimited topic string.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys 'Faculty' and 'Topics'.

    Returns
    -------
    str
        ``"<topics>|<faculty topic>"``, or just the faculty topic when the course
        has no topics of its own.

    Raises
    ------
    KeyError
        When the faculty is not a key of ``faculty_to_topic_mapping``.
    """
    faculty_topic = faculty_to_topic_mapping[row['Faculty']]
    topics = row['Topics']

    # "No topics" can show up as None, as a non-string such as NaN, or as a blank
    # string; none of them can be extended with '|', so return the faculty topic alone.
    if not isinstance(topics, str) or not topics.strip():
        return faculty_topic

    return topics + '|' + faculty_topic

In [21]:
# Extend every row's topics with its faculty topic; the helper is passed directly to apply.
all_modules_with_topics['Topics'] = all_modules_with_topics.apply(add_faculty_to_topic, axis=1)

In [22]:
# Preview the first 10 rows to check the combined 'Topics' column.
all_modules_with_topics.head(10)

Unnamed: 0,Course Code,Course Name,Academic Units,Faculty,BDE,Grade Type,Course Information,Topics
0,GC0001,SUSTAINABILITY: SEEING THROUGH THE HAZE,1,ASE,Yes,Pass/Fail,The course wi ll discuss sustainability from p...,environmental earth systems science
1,HW0001,INTRODUCTION TO ACADEMIC COMMUNICATION,0,LCC,Yes,Pass/Fail,The course aims to support you in your academi...,academic communication course|communication
2,HY0001,ETHICS & MORAL REASONING,1,SOH,Yes,Pass/Fail,HY0001 will introduce students to three major ...,teaching ethical principles|humanities
3,LS5005,SPANISH LANGUAGE LEVEL 5,3,CML,Yes,Letter Graded,"This blended learning course, based on the fli...",languages
4,SP0061,SCIENCE & TECHNOLOGY FOR HUMANITY,3,USP,Yes,Letter Graded,The course aims to inspire a long-lasting mind...,premier
5,AB0403,DECISION MAKING WITH PROGRAMMING & ANALYTICS,3,NBS,Yes,Letter Graded,This is an introductory course designed for bu...,programming analytics introductory|making prog...
6,AB1201,FINANCIAL MANAGEMENT,3,NBS,Yes,Letter Graded,"d Income Securities, Derivatives Securities, a...",financial management course|finance course imp...
7,AB1202,STATISTICS & ANALYSIS,3,NBS,Yes,Letter Graded,This course introduces the concepts and method...,business
8,AB1301,BUSINESS LAW,3,NBS,Yes,Letter Graded,The aim of this course is to provide students ...,law relating business|business
9,AB1403,INTERMEDIATE EXCEL,1,NBS,Yes,Pass/Fail,This course focuses on teaching introductory t...,intermediate excel course|intermediate excel|e...


In [27]:
def fill_empty_course_info(row):
    """Return the course description, falling back to the course name when it is empty.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys 'Course Name' and 'Course Information'.

    Returns
    -------
    object
        The 'Course Information' value when it is a non-blank string,
        otherwise the 'Course Name' value.
    """
    course_name = row['Course Name']
    course_info = row['Course Information']

    # Anything that is not a string (e.g. None or NaN) counts as missing,
    # and so does a string that holds only whitespace.
    if not isinstance(course_info, str) or not course_info.strip():
        return course_name

    return course_info
    
    
    
    

In [51]:
# Write the topic-annotated table to <cwd>/all_modules_final.csv, without the index column.
all_modules_with_topics.to_csv(os.path.join(os.getcwd(), "all_modules_final.csv"), index=False)

In [52]:
# Reload the table that was just written (no converters are applied on this read).
all_modules_with_topics = pd.read_csv("all_modules_final.csv")

In [53]:
# Fill empty course descriptions row by row; the helper is passed directly to apply.
all_modules_with_topics['Course Information'] = all_modules_with_topics.apply(fill_empty_course_info, axis=1)

In [54]:
# Write the table with filled-in course descriptions into the current working directory.
all_modules_with_topics.to_csv(os.path.join(os.getcwd(), "all_modules_with_no_empty_course_info.csv"), index=False)

In [55]:
# Preview the first 10 rows of the topic-annotated table.
all_modules_with_topics.head(10)

Unnamed: 0,Course Code,Course Name,Academic Units,Faculty,BDE,Grade Type,Course Information,Topics,Encoded Course Information,Encoded Course Name
0,GC0001,SUSTAINABILITY: SEEING THROUGH THE HAZE,1,ASE,Yes,Pass/Fail,The course wi ll discuss sustainability from p...,environmental earth systems science|sustainabi...,-0.00025134723|0.0055372017|-0.00534589|0.0387...,0.036235586|0.08773596|-0.022690646|0.02463894...
1,HW0001,INTRODUCTION TO ACADEMIC COMMUNICATION,0,LCC,Yes,Pass/Fail,The course aims to support you in your academi...,academic communication|communication,-0.026351945|-0.027172152|-0.0033674345|0.0146...,0.068022825|-0.008527996|-0.00739141|0.0046476...
2,HY0001,ETHICS & MORAL REASONING,1,SOH,Yes,Pass/Fail,HY0001 will introduce students to three major ...,ethical principles|humanities|ethics,0.022900576|0.13961577|0.018579232|0.009028231...,0.01748612|0.08560437|0.03321585|0.018731872|-...
3,LS5005,SPANISH LANGUAGE LEVEL 5,3,CML,Yes,Letter Graded,"This blended learning course, based on the fli...",languages|spanish,-0.032442745|-0.08577821|-0.048529234|-0.00187...,0.0024722698|-0.06658963|-0.025160616|0.032979...
4,SP0061,SCIENCE & TECHNOLOGY FOR HUMANITY,3,USP,Yes,Letter Graded,The course aims to inspire a long-lasting mind...,premier|science and technology,0.027388476|-0.024601875|-0.015980976|-0.05301...,0.05891931|0.030846994|-0.03908595|-0.05486637...
5,AB0403,DECISION MAKING WITH PROGRAMMING & ANALYTICS,3,NBS,Yes,Letter Graded,This is an introductory course designed for bu...,introductory programming|business analytics|py...,-0.009309238|0.00929346|-0.05696452|-0.0486138...,0.0009535659|0.09369335|-0.025330124|-0.049200...
6,AB1201,FINANCIAL MANAGEMENT,3,NBS,Yes,Letter Graded,"d Income Securities, Derivatives Securities, a...",financial management course|finance course imp...,-0.044498052|-0.04088343|-0.05672457|-0.034641...,0.009423677|0.0021888278|-0.037903763|-0.01046...
7,AB1202,STATISTICS & ANALYSIS,3,NBS,Yes,Letter Graded,This course introduces the concepts and method...,business,-0.03529743|-0.025104275|-0.04028884|0.0185293...,-0.01004325|-0.011976355|-0.059103694|0.011011...
8,AB1301,BUSINESS LAW,3,NBS,Yes,Letter Graded,The aim of this course is to provide students ...,law relating business|business,0.019766005|-0.04842579|0.013231385|-0.0404873...,0.045453105|-0.0012096729|-0.009872127|0.00814...
9,AB1403,INTERMEDIATE EXCEL,1,NBS,Yes,Pass/Fail,This course focuses on teaching introductory t...,intermediate excel course|intermediate excel|e...,-0.056611944|-0.03520852|-0.037055768|-0.02616...,-0.087172076|-0.010662532|-0.029617377|-0.0419...


In [28]:
# NOTE(review): this repeats the import and model load done in the first cells of the notebook.
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-mpnet-base-v2')

# Smoke test: encode a single sentence to confirm the model loads and returns an embedding.
embeddings = model.encode('This framework generates embeddings for each input sentence')

In [29]:
def encode_course_info(model, row):
    """Encode a course description into a pipe-delimited embedding string.

    Parameters
    ----------
    model : object
        Must expose ``encode(text)``; the returned value is iterated to obtain
        the individual embedding components.
    row : mapping or pandas.Series
        Must provide the key 'Course Information'.

    Returns
    -------
    str
        ``str()`` of every component joined by '|', or '' when the description
        is not a string.
    """
    course_info = row['Course Information']

    if not isinstance(course_info, str):
        return ""

    embeddings = model.encode(course_info)

    # join builds the string in one pass and needs no trailing-separator trim.
    return "|".join(str(component) for component in embeddings)

In [57]:
# Store each course description's embedding as a '|'-delimited string (one model.encode call per row).
all_modules_with_topics['Encoded Course Information'] = all_modules_with_topics.apply(lambda row: encode_course_info(model, row), axis=1)

In [30]:
def encode_course_name(model, row):
    """Encode a course name into a pipe-delimited embedding string.

    Parameters
    ----------
    model : object
        Must expose ``encode(text)``; the returned value is iterated to obtain
        the individual embedding components.
    row : mapping or pandas.Series
        Must provide the key 'Course Name'.

    Returns
    -------
    str
        ``str()`` of every component joined by '|', or '' when the name is not
        a string.
    """
    course_name = row['Course Name']

    if not isinstance(course_name, str):
        return ""

    embeddings = model.encode(course_name)

    # join builds the string in one pass and needs no trailing-separator trim.
    return "|".join(str(component) for component in embeddings)

In [59]:
# Store each course name's embedding as a '|'-delimited string (one model.encode call per row).
all_modules_with_topics['Encoded Course Name'] = all_modules_with_topics.apply(lambda row: encode_course_name(model, row), axis=1)

In [60]:
# Write the final table, including the encoding columns, into the current working directory.
all_modules_with_topics.to_csv(os.path.join(os.getcwd(), "all_modules_with_encodings.csv"), index=False)