In [49]:
# import libraries
import numpy as np
import pandas as pd
import ast

## Module Review


In [50]:
# load data
module_review_df = pd.read_csv('../entity_extraction/mock_module_reviews_final.csv') 
print(module_review_df.head())

  module_code          id                                            message  \
0    ACC1701X  6472570545   Taken in AY23/24 Semester 2. Lecturer: Prof D...   
1    ACC1701X  6467819412   Taken in AY23/24 Semester 2. Lecturer: Prof D...   
2    ACC1701X  6452830983   ACC1701X Lecturer: Adjunct Assoc. Prof. Deon ...   
3    ACC1701X  5889427965   Taken in AY19/20 Sem 2 Lecturer: Prof Winston...   
4    ACC1701X  5878270140   Lecturer: Prof Charles Shi Tutor: Mr Philip T...   

            created_at  likes  dislikes module_entities  \
0  2024-06-03T01:36:26      0         0    ['ACC1701X']   
1  2024-05-27T13:10:04      0         0    ['ACC1701X']   
2  2024-05-07T04:48:19      0         0    ['ACC1701X']   
3  2022-06-16T10:03:57      0         0    ['ACC1701X']   
4  2022-06-04T14:20:43      0         0    ['ACC1701X']   

                                     skills_entities  \
0                                                 []   
1  ['Inventory Management', 'Accounting', 'Cash',...

### Module --> Skills


In [51]:
# Extract individual elements from lists and strings if needed
def extract_first_item(entity):
    if isinstance(entity, list) and len(entity) > 0:
        return entity[0]  # Get the first item from the list
    if isinstance(entity, str) and entity.startswith('[') and entity.endswith(']'):
        # Handles cases where the value is a string that resembles a list
        return entity.strip("[]").strip("'\"")
    return entity

# Convert the skills_entities column to actual lists if they are strings
def convert_to_list(value):
    if isinstance(value, str):
        try:
            # Convert strings that look like lists into actual lists
            return ast.literal_eval(value)
        except:
            # If the conversion fails, return the value itself
            return value
    return value

# Apply conversion to ensure skills_entities column is a list
module_review_df['skills_entities'] = module_review_df['skills_entities'].apply(convert_to_list)

# Convert empty lists to NaN so they can be handled correctly by explode
module_review_df['skills_entities'] = module_review_df['skills_entities'].apply(lambda x: x if isinstance(x, list) and len(x) > 0 else pd.NA)

# Explode the skills_entities column to create one row per skill
exploded_df = module_review_df.explode('skills_entities')

# Drop rows with NaN values in skills_entities, as they represent modules without skills
exploded_df.dropna(subset=['skills_entities'], inplace=True)

# Remove duplicate rows to ensure each skill-module pair is unique
module_review_skills_rs_df = exploded_df.drop_duplicates(subset=['module_entities', 'skills_entities'])

# Remove brackets from lists (flatten them to strings)
module_review_skills_rs_df.loc[:, 'module_entities'] = module_review_skills_rs_df['module_entities'].apply(extract_first_item)
module_review_skills_rs_df.loc[:, 'skills_entities'] = module_review_skills_rs_df['skills_entities'].apply(extract_first_item)

# Rename columns to fit the subject-predicate-object format
module_review_skills_rs_df = module_review_skills_rs_df.rename(columns={
    'module_entities': 'Subject',
    'skills_entities': 'Object'
})
module_review_skills_rs_df['Predicate'] = 'teaches_skill'

# Reorder columns for better readability
module_review_skills_rs_df = module_review_skills_rs_df[['Subject', 'Predicate', 'Object']]

# Display the resulting DataFrame
print(module_review_skills_rs_df)

       Subject      Predicate                Object
1     ACC1701X  teaches_skill  Inventory Management
1     ACC1701X  teaches_skill            Accounting
1     ACC1701X  teaches_skill                  Cash
1     ACC1701X  teaches_skill            Management
1     ACC1701X  teaches_skill   Accounts Receivable
...        ...            ...                   ...
3419    TS3222  teaches_skill               Theatre
3420    TS3551  teaches_skill              Research
3434   UTS2105  teaches_skill               English
3442  UTW1001O  teaches_skill           Engineering
3442  UTW1001O  teaches_skill              Research

[1571 rows x 3 columns]


In [52]:
module_review_skills_rs_df.to_csv('module_skills_rs.csv', index=False)

### Staff --> Module


In [53]:
# Apply conversion to ensure staff_entities column is a list
module_review_df['staff_entities'] = module_review_df['staff_entities'].apply(convert_to_list)

# Convert empty lists to NaN so they can be handled correctly by explode
module_review_df['staff_entities'] = module_review_df['staff_entities'].apply(lambda x: x if isinstance(x, list) and len(x) > 0 else pd.NA)

# Explode the skills_entistaff_entitiesies column to create one row per skill
exploded_df = module_review_df.explode('staff_entities')

# Drop rows with NaN values in staff_entities, as they represent modules without skills
exploded_df.dropna(subset=['staff_entities'], inplace=True)

# Remove duplicate rows to ensure each staff-module pair is unique
module_review_staff_rs_df = exploded_df.drop_duplicates(subset=['module_entities', 'staff_entities'])

# Remove brackets from lists (flatten them to strings)
module_review_staff_rs_df.loc[:, 'module_entities'] = module_review_staff_rs_df['module_entities'].apply(extract_first_item)
module_review_staff_rs_df.loc[:, 'staff_entities'] = module_review_staff_rs_df['staff_entities'].apply(extract_first_item)

# Rename columns to fit the subject-predicate-object format
module_review_staff_rs_df = module_review_staff_rs_df.rename(columns={
    'module_entities': 'Subject',
    'staff_entities': 'Object'
})
module_review_staff_rs_df['Predicate'] = 'teaches_module'

# Reorder columns for better readability
module_review_staff_rs_df = module_review_staff_rs_df[['Subject', 'Predicate', 'Object']]

# Display the resulting DataFrame
print(module_review_staff_rs_df)

       Subject       Predicate              Object
1     ACC1701X  teaches_module           Prof Chan
2     ACC1701X  teaches_module        Prof Winston
2     ACC1701X  teaches_module           Prof Deon
5     ACC1701X  teaches_module          Prof Hanny
5     ACC1701X  teaches_module            prof shi
...        ...             ...                 ...
3430   UTC2716  teaches_module           Prof Pang
3432   UTC2728  teaches_module        Prof Lynette
3435   UTS2708  teaches_module        Prof Navarun
3435   UTS2708  teaches_module  Prof Navarun Varma
3442  UTW1001O  teaches_module            Dr Jinat

[1183 rows x 3 columns]


In [54]:
module_review_staff_rs_df.to_csv('staff_module_rs.csv', index=False)

## Module Venue


In [55]:
# load data
module_venue_df = pd.read_csv('../entity_extraction/mock_venue_final.csv') 
print(module_venue_df.head())

  module_entities                                         properties
0     ['HSH1000']  {'Day': 'Tuesday', 'Start Time': 1400, 'End Ti...
1     ['HSH1000']  {'Day': 'Tuesday', 'Start Time': 1600, 'End Ti...
2     ['HSS1000']  {'Day': 'Tuesday', 'Start Time': 1000, 'End Ti...
3     ['HSS1000']  {'Day': 'Tuesday', 'Start Time': 1200, 'End Ti...
4     ['HSS1000']  {'Day': 'Tuesday', 'Start Time': 1000, 'End Ti...


#### Module --> Venue Properties


In [56]:
# Apply extraction to the module_entities column to remove brackets
module_venue_df['module_entities'] = module_venue_df['module_entities'].apply(extract_first_item)

# Create a new DataFrame to store the relationships
module_venue_rs_df = pd.DataFrame()

# Add the relevant columns for Subject, Predicate, and Object
module_venue_rs_df['Subject'] = module_venue_df['module_entities']
module_venue_rs_df['Predicate'] = "has_venue_properties"  # Static value for predicate
module_venue_rs_df['Object'] = module_venue_df['properties']

# Remove duplicate rows to ensure each module-property pair is unique
module_venue_rs_df = module_venue_rs_df.drop_duplicates(subset=['Subject', 'Object'])

# Reorder columns for better readability
module_venue_rs_df = module_venue_rs_df[['Subject', 'Predicate', 'Object']]

# Display the resulting DataFrame
print(module_venue_rs_df)

       Subject             Predicate  \
0      HSH1000  has_venue_properties   
1      HSH1000  has_venue_properties   
2      HSS1000  has_venue_properties   
3      HSS1000  has_venue_properties   
6      HSA1000  has_venue_properties   
...        ...                   ...   
9508  GEN2060X  has_venue_properties   
9509  GEN2061Y  has_venue_properties   
9510  GEN2070Y  has_venue_properties   
9511  GEN2060Y  has_venue_properties   
9514   SFI2034  has_venue_properties   

                                                 Object  
0     {'Day': 'Tuesday', 'Start Time': 1400, 'End Ti...  
1     {'Day': 'Tuesday', 'Start Time': 1600, 'End Ti...  
2     {'Day': 'Tuesday', 'Start Time': 1000, 'End Ti...  
3     {'Day': 'Tuesday', 'Start Time': 1200, 'End Ti...  
6     {'Day': 'Tuesday', 'Start Time': 1800, 'End Ti...  
...                                                 ...  
9508  {'Day': 'Tuesday', 'Start Time': 800, 'End Tim...  
9509  {'Day': 'Tuesday', 'Start Time': 1200, 'End Ti...

In [57]:
module_venue_rs_df.to_csv('module_venue_properties_rs.csv', index=False)

## UG Programs


In [58]:
# load data
ug_program_df = pd.read_csv('../entity_extraction/ug_program_final.csv') 
print(ug_program_df.head())

                                Degree  \
0  Bachelor of Business Administration   
1  Bachelor of Business Administration   
2  Bachelor of Business Administration   
3  Bachelor of Business Administration   
4  Bachelor of Business Administration   

                                     Major  \
0               Applied Business Analytics   
1                       Business Economics   
2                                  Finance   
3          Innovation and Entrepreneurship   
4  Leadership and Human Capital Management   

                           degree_entities  \
0  ['Bachelor of Business Administration']   
1  ['Bachelor of Business Administration']   
2  ['Bachelor of Business Administration']   
3  ['Bachelor of Business Administration']   
4  ['Bachelor of Business Administration']   

                                major_entities  
0               ['Applied Business Analytics']  
1                       ['Business Economics']  
2                                  ['Finance']

### Major --> Degree


In [59]:
# Extract individual elements from lists and strings if needed
ug_program_df['major_entities'] = ug_program_df['major_entities'].apply(extract_first_item)
ug_program_df['degree_entities'] = ug_program_df['degree_entities'].apply(extract_first_item)

# Create a new DataFrame to store the relationships
ug_program_rs_df = pd.DataFrame()

# Add the relevant columns for Subject, Predicate, and Object
ug_program_rs_df['Subject'] = ug_program_df['major_entities']
ug_program_rs_df['Predicate'] = "is_under"  # Static value for predicate
ug_program_rs_df['Object'] = ug_program_df['degree_entities']

# Drop rows with NaN values to ensure we only have valid relationships
ug_program_rs_df.dropna(inplace=True)

# Display the new DataFrame
print(ug_program_rs_df.head())

                                   Subject Predicate  \
0               Applied Business Analytics  is_under   
1                       Business Economics  is_under   
2                                  Finance  is_under   
3          Innovation and Entrepreneurship  is_under   
4  Leadership and Human Capital Management  is_under   

                                Object  
0  Bachelor of Business Administration  
1  Bachelor of Business Administration  
2  Bachelor of Business Administration  
3  Bachelor of Business Administration  
4  Bachelor of Business Administration  


In [60]:
ug_program_rs_df.to_csv('major_degree_rs.csv', index=False)