# Entity Extraction

In [1]:
# import libraries
import pandas as pd
import spacy 
import ast
import re
from spacy.matcher import PhraseMatcher

## Entity extraction functions

#### Extraction from structured column data

In [2]:
# definte extract_entities function
def extract_entities(text, label):
    entities = [(text, label)]
    return entities

#### Extraction from column with list of multiple data

In [3]:
# define extract_module_entities function
def extract_module_entities(module_list):
    module_list = ast.literal_eval(module_list)
    return [(module, 'MODULE') for module in module_list]

#### Extraction from chunk of unstructured text data

In [4]:
# load spacy model
nlp = spacy.load("en_core_web_md")

# load the jobs and skills data
jobs_and_skills = pd.read_csv("jobs_and_skillset_linkedin_final.csv")
jobs_and_skills.head()

Unnamed: 0,Job Title,Skills
0,['Academic Advisor'],"['Academic Advising', 'Higher Education', 'Stu..."
1,['Academic Tutor'],"['Tutoring', 'Teaching', 'Peer Tutoring', 'Mat..."
2,['Account Coordinator'],"['Account Management', 'Account Coordination',..."
3,['Account Director'],"['Account Management', 'Client Services', 'Cli..."
4,['Account Executive'],"['Software as a Service (SaaS)', 'Account Mana..."


In [5]:
# get list of job titles and skills 
#job_titles = jobs_and_skills["Job Title"].to_list()
skills = jobs_and_skills["Skills"].unique().tolist()

# convert from string to list
#for i in range(len(job_titles)):
#    job_titles[i] = ast.literal_eval(job_titles[i])
#job_titles = [item for sublist in job_titles for item in sublist]

for i in range(len(skills)):
    skills[i] = ast.literal_eval(skills[i])

# create a list of unique skills
unique_skills = list(set([skill for sublist in skills for skill in sublist]))

#print(job_titles)
print(unique_skills)
print(len(unique_skills))

['Therapeutic Modalities', 'Server Configuration', 'Legal Research', 'Electronics Repair', 'Student Recruiting', 'Theatrical Production', 'Enercalc', 'Creative Agency', 'Automotive Technology', 'Strategy', 'Grills', 'Clinical Nutrition', 'Classroom Management', 'Autodesk Inventor', 'Software Testing', 'Youth Leadership', 'Cocktails', 'Mechanical Product Design', 'Purchase Management', 'Appointment Scheduling', 'Stroke Rehabilitation', 'U.S. VA Loans', 'Team Management', 'Technical Leadership', 'Allen-Bradley', 'Excavation', 'Patient Scheduling', 'Housekeeping', 'Salesforce Sales Cloud', 'Content Design', 'Functional Behavior Assessments', 'Computer Repair', 'Mental Health Care', 'Mobile Testing', 'Legal Document Preparation', 'Series 6', 'Healthcare Management', 'Jenkins', 'CEOs', 'Electrical Wiring', 'Microdermabrasion', 'Event Photography', 'Technical Translation', 'Interior Design', 'Strategic Human Resource Planning', 'Patient Education', 'American Sign Language', 'Loan Origination

In [6]:
# clean unique_skills list

# remove bracketed abbreviations from skills and the space before it
unique_skills = [re.sub(r'\s\(.*\)', '', skill) for skill in unique_skills]

# remove 'Microsoft ' substring before skills
unique_skills = [re.sub(r'Microsoft\s', '', skill) for skill in unique_skills]

In [7]:
def extract_skill_entities(text):
    
    # process text with spaCy model to extract entities
    doc = nlp(text)

    # initialize a dictionary to store the entities
    #entities = {
    #    "Job Title": [],
    #    "Skill": []
    #}

    skills = []

    # extract job titles entities
    #for job_title in job_titles:
    #    if job_title.lower() in text.lower():
    #        entities["Job Title"].append(job_title)
    
    # extract skill entities
    for skill in unique_skills:
        # create a regex pattern with word boundaries around the job title
        pattern = r"\b" + re.escape(skill) + r"\b"
    
        # search for the job title in the text (case-insensitive)
        if re.search(pattern, text, re.IGNORECASE):
            skills.append((skill,'SKILL'))

    return skills

In [8]:
# test spacy_extract_entities function
sample_text = "John Doe is a Software Engineer at Google with experience in Python and cloud computing. Jane Smith is a Project Manager at Microsoft. She specializes in Project Management and Data Science. Both have worked extensively on Agile and DevOps projects."

# call the function
skill_entities = extract_skill_entities(sample_text)

# print the extracted entities
#for entity_type, entity_list in entities.items():
#    print(f"{entity_type}: {', '.join(entity_list) if entity_list else 'None'}")
print(skill_entities)

[('DevOps', 'SKILL'), ('Management', 'SKILL'), ('Cloud Computing', 'SKILL'), ('Python', 'SKILL'), ('Project', 'SKILL'), ('Data Science', 'SKILL'), ('Project Management', 'SKILL')]


## 00 - Student Data

In [29]:
# load the student data
student_info = pd.read_csv('../../backend/data/00 - mock_student_data_final.csv')
# check the first few rows of the student_info dataframe
student_info.head()

Unnamed: 0,Student_Name,Matric_Number,NRIC,Year,Faculties,Major,Second Major,Modules_Completed,Grades
0,Tracy Lewis,A0216920B,XXXXX506Z,1,NUS Business School,Business Administration,,"['ACC1701B', 'DMB1202ACC', 'DMB1201MKT', 'MNO1...","{'ACC1701B': 'B', 'DMB1202ACC': 'A', 'DMB1201M..."
1,Andrew Holden,A0225069H,XXXXX799Z,3,YST Conservatory of Music,Music,,"['CFA1111A', 'MUA1190', 'MUA2109', 'MUA1172', ...","{'CFA1111A': 'F', 'MUA1190': 'A', 'MUA2109': '..."
2,Phillip Bullock,A0228204E,XXXXX194Z,1,College of Design and Engineering,Electrical Engineering,,"['ME1102', 'BN1111', 'PF1103', 'CN1101A', 'ID1...","{'ME1102': 'F', 'BN1111': 'B-', 'PF1103': 'C+'..."
3,Valerie Rivera,A0200778Y,XXXXX150Z,3,Computing,Business Analytics,,"['CS3236R', 'CS1010', 'CP3209', 'CS1010R', 'IS...","{'CS3236R': 'A+', 'CS1010': 'A+', 'CP3209': 'A..."
4,Robert Hall,A0222508M,XXXXX824Z,3,YST Conservatory of Music,Music,Business Administration,"['MUA1109', 'CFA1101A', 'MUA2255A', 'MUA1172',...","{'MUA1109': 'C+', 'CFA1101A': 'B-', 'MUA2255A'..."


In [30]:
# extract entities from the student data
student_info['Student_entities'] = student_info['Student_Name'].apply(lambda x: extract_entities(x, 'STUDENT'))
student_info['Faculty_entities'] = student_info['Faculties'].apply(lambda x: extract_entities(x, 'FACULTY'))
student_info['Major_entities'] = student_info['Major'].apply(lambda x: extract_entities(x, 'MAJOR'))
student_info['Module_entities'] = student_info['Modules_Completed'].apply(lambda x: extract_module_entities(x))

In [31]:
# check the first few rows of the student_info dataframe
student_info.head()

Unnamed: 0,Student_Name,Matric_Number,NRIC,Year,Faculties,Major,Second Major,Modules_Completed,Grades,Student_entities,Faculty_entities,Major_entities,Module_entities
0,Tracy Lewis,A0216920B,XXXXX506Z,1,NUS Business School,Business Administration,,"['ACC1701B', 'DMB1202ACC', 'DMB1201MKT', 'MNO1...","{'ACC1701B': 'B', 'DMB1202ACC': 'A', 'DMB1201M...","[(Tracy Lewis, STUDENT)]","[(NUS Business School, FACULTY)]","[(Business Administration, MAJOR)]","[(ACC1701B, MODULE), (DMB1202ACC, MODULE), (DM..."
1,Andrew Holden,A0225069H,XXXXX799Z,3,YST Conservatory of Music,Music,,"['CFA1111A', 'MUA1190', 'MUA2109', 'MUA1172', ...","{'CFA1111A': 'F', 'MUA1190': 'A', 'MUA2109': '...","[(Andrew Holden, STUDENT)]","[(YST Conservatory of Music, FACULTY)]","[(Music, MAJOR)]","[(CFA1111A, MODULE), (MUA1190, MODULE), (MUA21..."
2,Phillip Bullock,A0228204E,XXXXX194Z,1,College of Design and Engineering,Electrical Engineering,,"['ME1102', 'BN1111', 'PF1103', 'CN1101A', 'ID1...","{'ME1102': 'F', 'BN1111': 'B-', 'PF1103': 'C+'...","[(Phillip Bullock, STUDENT)]","[(College of Design and Engineering, FACULTY)]","[(Electrical Engineering, MAJOR)]","[(ME1102, MODULE), (BN1111, MODULE), (PF1103, ..."
3,Valerie Rivera,A0200778Y,XXXXX150Z,3,Computing,Business Analytics,,"['CS3236R', 'CS1010', 'CP3209', 'CS1010R', 'IS...","{'CS3236R': 'A+', 'CS1010': 'A+', 'CP3209': 'A...","[(Valerie Rivera, STUDENT)]","[(Computing, FACULTY)]","[(Business Analytics, MAJOR)]","[(CS3236R, MODULE), (CS1010, MODULE), (CP3209,..."
4,Robert Hall,A0222508M,XXXXX824Z,3,YST Conservatory of Music,Music,Business Administration,"['MUA1109', 'CFA1101A', 'MUA2255A', 'MUA1172',...","{'MUA1109': 'C+', 'CFA1101A': 'B-', 'MUA2255A'...","[(Robert Hall, STUDENT)]","[(YST Conservatory of Music, FACULTY)]","[(Music, MAJOR)]","[(MUA1109, MODULE), (CFA1101A, MODULE), (MUA22..."


## 01 - Module Info

In [22]:
# load module info data
module_info = pd.read_csv('../../backend/data/01 - mock_module_info.csv')

# check first few rows of the module_info dataframe
module_info.head()

FileNotFoundError: [Errno 2] No such file or directory: '../backend/data/01 - mock_module_info.csv'

In [14]:
# check type of columns in module_info
print(module_info.dtypes)

# change type of description to string
module_info['description'] = module_info['description'].astype(str)

moduleCode                  object
title                       object
description                 object
moduleCredit               float64
department                  object
faculty                     object
gradingBasisDescription     object
prerequisite                object
preclusion                  object
attributes                  object
corequisite                 object
lectureHours                 int64
tutorialHours               object
labHours                     int64
projectHours                object
prepHours                    int64
semester_01                  int64
semester_02                  int64
semester_03                  int64
semester_04                  int64
UE                           int64
dtype: object


In [15]:
# extract entities from the module data
module_info['Module_entities'] = module_info['moduleCode'].apply(lambda x: extract_entities(x, 'MODULE'))
module_info['Department_entities'] = module_info['department'].apply(lambda x: extract_entities(x, 'DEPARTMENT'))
module_info['Faculty_entities'] = module_info['faculty'].apply(lambda x: extract_entities(x, 'FACULTY'))
module_info['Skill_entities'] = module_info['description'].apply(lambda x: extract_skill_entities(x))

In [16]:
# check first few rows of the module_info dataframe
module_info.head()

Unnamed: 0,moduleCode,title,description,moduleCredit,department,faculty,gradingBasisDescription,prerequisite,preclusion,attributes,...,prepHours,semester_01,semester_02,semester_03,semester_04,UE,Module_entities,Department_entities,Faculty_entities,Skill_entities
0,ABM5001,Leadership in Biomedicine,Leadership is fundamental to the success of in...,2.0,NUS Medicine Dean's Office,Yong Loo Lin Sch of Medicine,Graded,[],[],,...,0,0,1,0,0,0,"[(ABM5001, MODULE)]","[(NUS Medicine Dean's Office, DEPARTMENT)]","[(Yong Loo Lin Sch of Medicine, FACULTY)]","[(Leadership, SKILL)]"
1,ABM5002,Advanced Biostatistics for Research,This course is served as a concept-based intro...,2.0,NUS Medicine Dean's Office,Yong Loo Lin Sch of Medicine,Graded,[],[],,...,0,0,1,0,0,0,"[(ABM5002, MODULE)]","[(NUS Medicine Dean's Office, DEPARTMENT)]","[(Yong Loo Lin Sch of Medicine, FACULTY)]","[(Research, SKILL), (Extract, SKILL), (Healthc..."
2,ABM5003,Biomedical Innovation & Enterprise,This course will furnish students with a thoro...,4.0,NUS Medicine Dean's Office,Yong Loo Lin Sch of Medicine,Graded,[],[],,...,0,1,0,0,0,0,"[(ABM5003, MODULE)]","[(NUS Medicine Dean's Office, DEPARTMENT)]","[(Yong Loo Lin Sch of Medicine, FACULTY)]","[(Research, SKILL), (Research and Development,..."
3,ABM5004,Capstone Project,This course encompasses research projects rele...,8.0,NUS Medicine Dean's Office,Yong Loo Lin Sch of Medicine,Graded,[],[],,...,0,1,1,0,0,0,"[(ABM5004, MODULE)]","[(NUS Medicine Dean's Office, DEPARTMENT)]","[(Yong Loo Lin Sch of Medicine, FACULTY)]","[(Research, SKILL)]"
4,ABM5101,Applied Immunology,Advanced immunological applications play impor...,4.0,NUS Medicine Dean's Office,Yong Loo Lin Sch of Medicine,Graded,[],[],,...,0,1,0,0,0,0,"[(ABM5101, MODULE)]","[(NUS Medicine Dean's Office, DEPARTMENT)]","[(Yong Loo Lin Sch of Medicine, FACULTY)]","[(Research, SKILL), (Design, SKILL), (Health, ..."


## 02 - Department Info

In [17]:
# load department info data
department_info = pd.read_csv('../../backend/data/02 - mock_department_list.csv')

# check first few rows of the department_info dataframe
department_info.head()

Unnamed: 0,department_id,department,faculty
0,NUSDP0001,NUS Medicine Dean's Office,Yong Loo Lin Sch of Medicine
1,NUSDP0002,Architecture,College of Design and Engineering
2,NUSDP0003,Accounting,NUS Business School
3,NUSDP0004,Communications and New Media,Arts and Social Science
4,NUSDP0005,History,Arts and Social Science


In [18]:
# extract entities from the department data
department_info['Department_entities'] = department_info['department'].apply(lambda x: extract_entities(x, 'DEPARTMENT'))
department_info['Faculty_entities'] = department_info['faculty'].apply(lambda x: extract_entities(x, 'FACULTY'))


In [19]:
# check first few rows of the department_info dataframe``
department_info.head()

Unnamed: 0,department_id,department,faculty,Department_entities,Faculty_entities
0,NUSDP0001,NUS Medicine Dean's Office,Yong Loo Lin Sch of Medicine,"[(NUS Medicine Dean's Office, DEPARTMENT)]","[(Yong Loo Lin Sch of Medicine, FACULTY)]"
1,NUSDP0002,Architecture,College of Design and Engineering,"[(Architecture, DEPARTMENT)]","[(College of Design and Engineering, FACULTY)]"
2,NUSDP0003,Accounting,NUS Business School,"[(Accounting, DEPARTMENT)]","[(NUS Business School, FACULTY)]"
3,NUSDP0004,Communications and New Media,Arts and Social Science,"[(Communications and New Media, DEPARTMENT)]","[(Arts and Social Science, FACULTY)]"
4,NUSDP0005,History,Arts and Social Science,"[(History, DEPARTMENT)]","[(Arts and Social Science, FACULTY)]"


## 03 - Staff Info

In [32]:
# load staff info data
staff_info = pd.read_csv('../../backend/data/03 - mock_staff_info.csv')

# check first few rows of the staff_info dataframe
staff_info.head()

Unnamed: 0,Employee ID,Employee Name,NRIC,DOB,DOJ,Department,Modules Taught
0,NUSPF12345,Marin Sergio Hernandez,XXXXX479A,1983-02-23T00:00:00Z,2009-10-31T00:00:00Z,Electrical and Computer Engineering,CEG5003
1,NUSPF12346,Kathryn Cordova,XXXXX815A,1985-09-02T00:00:00Z,2009-06-07T00:00:00Z,Civil and Environmental Engineering,ESE2102
2,NUSPF12347,Barbara Sanchez,XXXXX777A,1971-07-30T00:00:00Z,2008-05-09T00:00:00Z,Centre for Language Studies,LAT4201HM
3,NUSPF12348,Bryce Lucas,XXXXX610A,1973-07-20T00:00:00Z,2002-01-17T00:00:00Z,BIZ Dean's Office,DMB1203MNO
4,NUSPF12349,Judith Camacho,XXXXX629A,1991-11-16T00:00:00Z,2000-02-13T00:00:00Z,Economics,EC4401HM


In [24]:
# extract entities from the staff_info data
staff_info['Staff_entities'] = staff_info['Employee Name'].apply(lambda x: extract_entities(x, 'STAFF'))
staff_info['Department_entities'] = staff_info['Department'].apply(lambda x: extract_entities(x, 'DEPARTMENT'))
staff_info['Module_entities'] = staff_info['Modules Taught'].apply(lambda x: extract_entities(x, 'MODULE'))

In [25]:
# check first few rows of the staff_info dataframe
staff_info.head()

Unnamed: 0,Employee ID,Employee Name,NRIC,DOB,DOJ,Department,Modules Taught,Staff_entities,Department_entities,Module_entities
0,NUSPF12345,Marin Sergio Hernandez,XXXXX479A,1983-02-23T00:00:00Z,2009-10-31T00:00:00Z,Electrical and Computer Engineering,CEG5003,"[(Marin Sergio Hernandez, STAFF)]","[(Electrical and Computer Engineering, DEPARTM...","[(CEG5003, MODULE)]"
1,NUSPF12346,Kathryn Cordova,XXXXX815A,1985-09-02T00:00:00Z,2009-06-07T00:00:00Z,Civil and Environmental Engineering,ESE2102,"[(Kathryn Cordova, STAFF)]","[(Civil and Environmental Engineering, DEPARTM...","[(ESE2102, MODULE)]"
2,NUSPF12347,Barbara Sanchez,XXXXX777A,1971-07-30T00:00:00Z,2008-05-09T00:00:00Z,Centre for Language Studies,LAT4201HM,"[(Barbara Sanchez, STAFF)]","[(Centre for Language Studies, DEPARTMENT)]","[(LAT4201HM, MODULE)]"
3,NUSPF12348,Bryce Lucas,XXXXX610A,1973-07-20T00:00:00Z,2002-01-17T00:00:00Z,BIZ Dean's Office,DMB1203MNO,"[(Bryce Lucas, STAFF)]","[(BIZ Dean's Office, DEPARTMENT)]","[(DMB1203MNO, MODULE)]"
4,NUSPF12349,Judith Camacho,XXXXX629A,1991-11-16T00:00:00Z,2000-02-13T00:00:00Z,Economics,EC4401HM,"[(Judith Camacho, STAFF)]","[(Economics, DEPARTMENT)]","[(EC4401HM, MODULE)]"


In [27]:
# write the data to new csv files
student_info.to_csv('student_info_final.csv', index=False)
module_info.to_csv('module_info_final.csv', index=False)
department_info.to_csv('department_info_final.csv', index=False)
staff_info.to_csv('staff_info_final.csv', index=False)