# Relationship Extraction

In [163]:
# import libraries
import numpy as np
import pandas as pd
import ast


## Student Info

In [164]:
# Read the student table from the ../entity_extraction folder and preview
# its first five rows.
student_info = pd.read_csv(filepath_or_buffer='../entity_extraction/student_info_final.csv')
print(student_info.head(n=5))

      Student_Name Matric_Number       NRIC  Year  \
0      Tracy Lewis     A0216920B  XXXXX506Z     1   
1    Andrew Holden     A0225069H  XXXXX799Z     3   
2  Phillip Bullock     A0228204E  XXXXX194Z     1   
3   Valerie Rivera     A0200778Y  XXXXX150Z     3   
4      Robert Hall     A0222508M  XXXXX824Z     3   

                           Faculties                    Major  \
0                NUS Business School  Business Administration   
1          YST Conservatory of Music                    Music   
2  College of Design and Engineering   Electrical Engineering   
3                          Computing       Business Analytics   
4          YST Conservatory of Music                    Music   

              Second Major                                  Modules_Completed  \
0                      NaN  ['ACC1701B', 'DMB1202ACC', 'DMB1201MKT', 'MNO1...   
1                      NaN  ['CFA1111A', 'MUA1190', 'MUA2109', 'MUA1172', ...   
2                      NaN  ['ME1102', 'BN1111'

In [165]:
# Triple table with columns Subject, Predicate, Object:
#   student --> majors_in --> major
# One row per row of student_info; the predicate is the same constant on
# every row (the scalar is broadcast over the index of the Series columns).
student_major = pd.DataFrame(
    {
        'Subject': student_info['Student_Name'],
        'Predicate': 'majors_in',
        'Object': student_info['Major'],
    }
)

In [166]:
# Show the student --> majors_in --> major triples as a plain-text table.
print(student_major)

              Subject  Predicate                   Object
0         Tracy Lewis  majors_in  Business Administration
1       Andrew Holden  majors_in                    Music
2     Phillip Bullock  majors_in   Electrical Engineering
3      Valerie Rivera  majors_in       Business Analytics
4         Robert Hall  majors_in                    Music
...               ...        ...                      ...
3280  Renee Carpenter  majors_in  Business Administration
3281    Garrett Mcgee  majors_in                      Law
3282    Michael Colon  majors_in                 Medicine
3283     Wesley Moran  majors_in                      Law
3284   Angel Peterson  majors_in  Business Administration

[3285 rows x 3 columns]


In [167]:
# Triple table with columns Subject, Predicate, Object:
#   student --> is_in_faculty --> faculty
# One row per row of student_info; the constant predicate is broadcast.
student_faculty = pd.DataFrame(
    {
        'Subject': student_info['Student_Name'],
        'Predicate': 'is_in_faculty',
        'Object': student_info['Faculties'],
    }
)

In [168]:
# Show the student --> is_in_faculty --> faculty triples as a plain-text table.
print(student_faculty)

              Subject      Predicate                             Object
0         Tracy Lewis  is_in_faculty                NUS Business School
1       Andrew Holden  is_in_faculty          YST Conservatory of Music
2     Phillip Bullock  is_in_faculty  College of Design and Engineering
3      Valerie Rivera  is_in_faculty                          Computing
4         Robert Hall  is_in_faculty          YST Conservatory of Music
...               ...            ...                                ...
3280  Renee Carpenter  is_in_faculty                NUS Business School
3281    Garrett Mcgee  is_in_faculty                                Law
3282    Michael Colon  is_in_faculty       Yong Loo Lin Sch of Medicine
3283     Wesley Moran  is_in_faculty                                Law
3284   Angel Peterson  is_in_faculty                NUS Business School

[3285 rows x 3 columns]


In [169]:
# student --> takes_module --> module --> and_scored --> grade
# Produces one row per (student, completed module) pair, carrying the grade
# recorded for that module in the student's 'Grades' mapping.


def _parse_literal(value):
    """Parse a stringified Python literal; return non-string values unchanged."""
    return ast.literal_eval(value) if isinstance(value, str) else value


# The CSV stores the module list and the grade mapping as the text of a Python
# literal (e.g. "['ACC1701B', ...]"), so parse them back into objects.
# Both columns are parsed here, once per student, rather than after the
# explode below (which would re-parse the same grade text for every module).
modules_completed = student_info['Modules_Completed'].apply(_parse_literal)
grades_by_module = student_info['Grades'].apply(_parse_literal)

student_module = pd.DataFrame(
    {
        'Subject': student_info['Student_Name'],
        'Predicate': 'takes_module',
        'Object': modules_completed,
        'Predicate2': 'and_scored',
        'Object2': grades_by_module,
    }
).explode('Object')  # one row per module; the other columns are repeated

# Reduce the repeated grade mapping to the single grade of the module on
# each row. A row whose grades entry is not a dict (e.g. a missing value)
# or does not contain the module gets None instead of raising.
student_module['Object2'] = [
    grades.get(module) if isinstance(grades, dict) else None
    for module, grades in zip(student_module['Object'], student_module['Object2'])
]

print(student_module)


             Subject     Predicate      Object  Predicate2 Object2
0        Tracy Lewis  takes_module    ACC1701B  and_scored       B
0        Tracy Lewis  takes_module  DMB1202ACC  and_scored       A
0        Tracy Lewis  takes_module  DMB1201MKT  and_scored      C+
0        Tracy Lewis  takes_module    MNO1706B  and_scored       C
0        Tracy Lewis  takes_module      RE1702  and_scored      C+
...              ...           ...         ...         ...     ...
3284  Angel Peterson  takes_module    ACC1701C  and_scored       D
3284  Angel Peterson  takes_module     BSE3711  and_scored      B+
3284  Angel Peterson  takes_module  DMB1204ACC  and_scored       D
3284  Angel Peterson  takes_module      RE3000  and_scored       D
3284  Angel Peterson  takes_module     BI3704B  and_scored      B-

[65104 rows x 5 columns]


## Module Info

In [170]:
# Read the module table from the ../entity_extraction folder.
module_info = pd.read_csv(filepath_or_buffer='../entity_extraction/module_info_final.csv')

In [171]:
# Triple table: module --> offered_by --> department
# Rows with a missing value in any column are dropped.
module_department = pd.DataFrame(
    {
        'Subject': module_info['moduleCode'],
        'Predicate': 'offered_by',
        'Object': module_info['department'],
    }
).dropna()

print(module_department)

      Subject   Predicate                      Object
0     ABM5001  offered_by  NUS Medicine Dean's Office
1     ABM5002  offered_by  NUS Medicine Dean's Office
2     ABM5003  offered_by  NUS Medicine Dean's Office
3     ABM5004  offered_by  NUS Medicine Dean's Office
4     ABM5101  offered_by  NUS Medicine Dean's Office
...       ...         ...                         ...
6844   ZB3311  offered_by         Biological Sciences
6845   ZB3312  offered_by         Biological Sciences
6846   ZB4171  offered_by         Biological Sciences
6847   ZB4199  offered_by         Biological Sciences
6848   ZB4299  offered_by         Biological Sciences

[6849 rows x 3 columns]


In [172]:
# Triple table: module --> has_prerequisite --> prerequisite
module_prerequisite = (
    pd.DataFrame(
        {
            'Subject': module_info['moduleCode'],
            'Predicate': 'has_prerequisite',
            'Object': module_info['prerequisite'],
        }
    )
    # Keep only modules whose prerequisite cell is present and is not the
    # literal text '[]' (an empty list as stored in the CSV).
    .loc[lambda triples: triples['Object'].notnull() & (triples['Object'] != '[]')]
    # Parse the remaining string cells back into Python objects.
    .assign(
        Object=lambda triples: triples['Object'].apply(
            lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
        )
    )
    # One row per list element. The second explode flattens one further
    # level of nesting and leaves scalar values untouched — presumably the
    # parsed cells can contain lists of lists; TODO confirm against the data.
    .explode('Object')
    .explode('Object')
)


print(module_prerequisite)
  

      Subject         Predicate    Object
25    ACC2706  has_prerequisite   ACC1701
25    ACC2706  has_prerequisite  ACC1701X
25    ACC2706  has_prerequisite    EC2204
26    ACC2707  has_prerequisite   ACC1701
26    ACC2707  has_prerequisite  ACC1701X
...       ...               ...       ...
6844   ZB3311  has_prerequisite    ZB3310
6846   ZB4171  has_prerequisite   CS1010S
6846   ZB4171  has_prerequisite   LSM2253
6846   ZB4171  has_prerequisite    CS2220
6846   ZB4171  has_prerequisite   LSM3241

[8262 rows x 3 columns]


In [173]:
# Triple table: module --> has_preclusion --> preclusion
module_preclusion = (
    pd.DataFrame(
        {
            'Subject': module_info['moduleCode'],
            'Predicate': 'has_preclusion',
            'Object': module_info['preclusion'],
        }
    )
    # Keep only modules whose preclusion cell is present and is not the
    # literal text '[]' (an empty list as stored in the CSV).
    .loc[lambda triples: triples['Object'].notnull() & (triples['Object'] != '[]')]
    # Parse the remaining string cells back into Python objects.
    .assign(
        Object=lambda triples: triples['Object'].apply(
            lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
        )
    )
    # One row per precluded module.
    .explode('Object')
)

print(module_preclusion)



       Subject       Predicate    Object
20    ACC1701A  has_preclusion   ACC1002
20    ACC1701A  has_preclusion  ACC1002X
20    ACC1701A  has_preclusion   ACC1701
20    ACC1701A  has_preclusion    EC2204
20    ACC1701A  has_preclusion    RE1705
...        ...             ...       ...
6845    ZB3312  has_preclusion   PHS3312
6845    ZB3312  has_preclusion    QF3312
6845    ZB3312  has_preclusion    ST3312
6846    ZB4171  has_preclusion  YSC4211C
6848    ZB4299  has_preclusion    ZB4199

[8138 rows x 3 columns]


In [174]:
# Triple table: module --> teaches_skill --> skill
module_skill = (
    pd.DataFrame(
        {
            'Subject': module_info['moduleCode'],
            'Predicate': 'teaches_skill',
            # 'Description_entities' holds the text of a Python dict; take the
            # value stored under its 'Skill' key (None when the key is absent).
            # Non-string cells (e.g. missing values) are passed through as-is.
            'Object': module_info['Description_entities'].apply(
                lambda raw: ast.literal_eval(raw).get('Skill', None) if isinstance(raw, str) else raw
            ),
        }
    )
    # One row per skill; empty lists explode to a missing value.
    .explode('Object')
    # Drop rows without a skill, plus any whose value is the literal text '[]'.
    .loc[lambda triples: triples['Object'].notnull() & (triples['Object'] != '[]')]
)

print(module_skill)

      Subject      Predicate       Object
0     ABM5001  teaches_skill   Leadership
1     ABM5002  teaches_skill     Research
1     ABM5002  teaches_skill      Extract
1     ABM5002  teaches_skill   Healthcare
2     ABM5003  teaches_skill     Research
...       ...            ...          ...
6843   ZB3310  teaches_skill  Environment
6844   ZB3311  teaches_skill  Environment
6845   ZB3312  teaches_skill  Environment
6846   ZB4171  teaches_skill     Research
6846   ZB4171  teaches_skill      Biology

[14947 rows x 3 columns]


In [175]:
# Triple table: module --> offered_in --> semester
# The semester flag columns hold 1 when the module is offered in that
# semester; the Object of each triple is the NAME of a flag column set to 1.
semester_columns = ['semester_01', 'semester_02', 'semester_03', 'semester_04']

module_semester = (
    pd.DataFrame(
        {
            'Subject': module_info['moduleCode'],
            'Predicate': 'offered_in',
            # Per module: list of the flag-column names whose value equals 1.
            'Object': module_info[semester_columns].apply(
                lambda flags: [name for name in flags.index if flags[name] == 1],
                axis=1,
            ),
        }
    )
    # One row per offered semester. The lists are built in Python above, so
    # no string parsing is needed before exploding.
    .explode('Object')
    # A module with no flag set yields an empty list, which explodes to a
    # missing value; drop those rows.
    .dropna(subset=['Object'])
)

print(module_semester)


      Subject   Predicate       Object
0     ABM5001  offered_in  semester_02
1     ABM5002  offered_in  semester_02
2     ABM5003  offered_in  semester_01
3     ABM5004  offered_in  semester_01
3     ABM5004  offered_in  semester_02
...       ...         ...          ...
6846   ZB4171  offered_in  semester_01
6847   ZB4199  offered_in  semester_01
6847   ZB4199  offered_in  semester_02
6848   ZB4299  offered_in  semester_01
6848   ZB4299  offered_in  semester_02

[9021 rows x 3 columns]


## Department Info

In [176]:
# Read the department table from the ../entity_extraction folder.
department_info = pd.read_csv(filepath_or_buffer='../entity_extraction/department_info_final.csv')

In [177]:
# Triple table: department --> is_under_faculty --> faculty
# One row per row of department_info; the constant predicate is broadcast.
department_faculty = pd.DataFrame(
    {
        'Subject': department_info['department'],
        'Predicate': 'is_under_faculty',
        'Object': department_info['faculty'],
    }
)

print(department_faculty)

                            Subject         Predicate  \
0        NUS Medicine Dean's Office  is_under_faculty   
1                      Architecture  is_under_faculty   
2                        Accounting  is_under_faculty   
3      Communications and New Media  is_under_faculty   
4                           History  is_under_faculty   
..                              ...               ...   
106  Ridge View Residential College  is_under_faculty   
107   University Scholars Programme  is_under_faculty   
108  College of Alice and Peter Tan  is_under_faculty   
109            Residential Colleges  is_under_faculty   
110                Yale-NUS College  is_under_faculty   

                                Object  
0         Yong Loo Lin Sch of Medicine  
1    College of Design and Engineering  
2                  NUS Business School  
3              Arts and Social Science  
4              Arts and Social Science  
..                                 ...  
106                Residenti

## Staff Info

In [178]:
# Read the staff table from the ../entity_extraction folder.
staff_info = pd.read_csv(filepath_or_buffer='../entity_extraction/staff_info_final.csv')

In [179]:
# Triple table: staff --> teaches_module --> module
# The 'Modules Taught' cell is used as-is (no parsing or exploding), giving
# one row per row of staff_info.
staff_module = pd.DataFrame(
    {
        'Subject': staff_info['Employee Name'],
        'Predicate': 'teaches_module',
        'Object': staff_info['Modules Taught'],
    }
)

print(staff_module)

                    Subject       Predicate      Object
0    Marin Sergio Hernandez  teaches_module     CEG5003
1           Kathryn Cordova  teaches_module     ESE2102
2           Barbara Sanchez  teaches_module   LAT4201HM
3               Bryce Lucas  teaches_module  DMB1203MNO
4            Judith Camacho  teaches_module    EC4401HM
..                      ...             ...         ...
995              Lisa Reyes  teaches_module    MUA3256B
996           Melanie Kelly  teaches_module    LLJ5342V
997          Ronald Spencer  teaches_module      PC2032
998          Charles Rogers  teaches_module      PL4251
999          Vanessa Taylor  teaches_module     MUA4107

[1000 rows x 3 columns]


In [180]:
# Triple table: staff --> is_in_department --> department
# NOTE(review): the source column is named 'Department_ID', yet the printed
# values look like department names — confirm this is the intended column.
staff_department = pd.DataFrame(
    {
        'Subject': staff_info['Employee Name'],
        'Predicate': 'is_in_department',
        'Object': staff_info['Department_ID'],
    }
)

print(staff_department)

                    Subject         Predicate  \
0    Marin Sergio Hernandez  is_in_department   
1           Kathryn Cordova  is_in_department   
2           Barbara Sanchez  is_in_department   
3               Bryce Lucas  is_in_department   
4            Judith Camacho  is_in_department   
..                      ...               ...   
995              Lisa Reyes  is_in_department   
996           Melanie Kelly  is_in_department   
997          Ronald Spencer  is_in_department   
998          Charles Rogers  is_in_department   
999          Vanessa Taylor  is_in_department   

                                  Object  
0    Electrical and Computer Engineering  
1    Civil and Environmental Engineering  
2            Centre for Language Studies  
3                      BIZ Dean's Office  
4                              Economics  
..                                   ...  
995                  YSTCM Dean's Office  
996                    FoL Dean's Office  
997                     

## Save Dataframes

In [181]:
# Write every triple table to its own CSV file in the current working
# directory, without the (possibly duplicated) row index.
triple_tables = [
    (student_major, 'student_major.csv'),
    (student_faculty, 'student_faculty.csv'),
    (student_module, 'student_module.csv'),
    (module_department, 'module_department.csv'),
    (module_prerequisite, 'module_prerequisite.csv'),
    (module_preclusion, 'module_preclusion.csv'),
    (module_skill, 'module_skill.csv'),
    (module_semester, 'module_semester.csv'),
    (department_faculty, 'department_faculty.csv'),
    (staff_module, 'staff_module.csv'),
    (staff_department, 'staff_department.csv'),
]
for triples, csv_path in triple_tables:
    triples.to_csv(csv_path, index=False)
