Import libraries


In [1]:
import pandas as pd 
import spacy 
import re 
import ast 
from spacy.matcher import PhraseMatcher  

# 04 - mock module reviews


Load the data


In [2]:
module_review_path = '../backend/data/04 - mock_module_reviews.csv'
module_review = pd.read_csv(module_review_path)
print(module_review.head())

  module_code          id                                            message  \
0    ACC1701X  6472570545   Taken in AY23/24 Semester 2. Lecturer: Prof D...   
1    ACC1701X  6467819412   Taken in AY23/24 Semester 2. Lecturer: Prof D...   
2    ACC1701X  6452830983   ACC1701X Lecturer: Adjunct Assoc. Prof. Deon ...   
3    ACC1701X  5896942778   Taken in AY 21/22 Sem 2, review at: <a href="...   
4    ACC1701X  5889427965   Taken in AY19/20 Sem 2 Lecturer: Prof Winston...   

            created_at  likes  dislikes  
0  2024-06-03T01:36:26      0         0  
1  2024-05-27T13:10:04      0         0  
2  2024-05-07T04:48:19      0         0  
3  2022-06-24T09:55:49      0         0  
4  2022-06-16T10:03:57      0         0  


## Entity Extraction


Extract Modules


In [3]:
def extract_module_entities(module_code):
    # Ensure the input is a string and not empty
    if isinstance(module_code, str) and module_code.strip():
        return [(module_code)]
    else:
        return []

# Apply the function to extract module entities from the 'module_code' column
module_review['module_entities'] = module_review['module_code'].apply(lambda x: extract_module_entities(x))

# Display the DataFrame with extracted module entities
print(module_review[['module_code', 'module_entities']].head())

  module_code module_entities
0    ACC1701X      [ACC1701X]
1    ACC1701X      [ACC1701X]
2    ACC1701X      [ACC1701X]
3    ACC1701X      [ACC1701X]
4    ACC1701X      [ACC1701X]


Load spacy model for NER


In [4]:
nlp = spacy.load("en_core_web_sm")

Extract skills from linkedin job and skillset csv


In [5]:
jobs_and_skills_df = pd.read_csv("jobs_and_skillset_linkedin_final.csv")
jobs_and_skills_df.head()

Unnamed: 0,Job Title,Skills
0,['Academic Advisor'],"['Academic Advising', 'Higher Education', 'Stu..."
1,['Academic Tutor'],"['Tutoring', 'Teaching', 'Peer Tutoring', 'Mat..."
2,['Account Coordinator'],"['Account Management', 'Account Coordination',..."
3,['Account Director'],"['Account Management', 'Client Services', 'Cli..."
4,['Account Executive'],"['Software as a Service (SaaS)', 'Account Mana..."


In [6]:
# get list of job titles and skills 
skills = jobs_and_skills_df["Skills"].unique().tolist()

for i in range(len(skills)):
    skills[i] = ast.literal_eval(skills[i])

# Create a list of unique skills
unique_skills = list(set([skill for sublist in skills for skill in sublist]))

# remove "(programming language)" from  skills and the space before it
unique_skills = [re.sub(r'\s\(.*\)', '', skill) for skill in unique_skills]
print(unique_skills)
print(len(unique_skills))

['Emerging Markets', 'Music Education', 'Medication Reconciliation', 'Analytical Chemistry', 'Study Coordination', 'Job Search Strategies', 'Family Medicine', 'Environmental Impact Assessment', 'Hot Water', 'Visio', 'Enteral Nutrition', 'Medical Coding', 'Geographic Information Systems', 'Lifestyle Coaching', 'Claims Resolution', 'Responsive Web Design', 'Training Delivery', 'Structural Analysis', 'Voice & Accent', 'Crestron', 'Chicago Style', 'Financial Analysis', 'Intelligence Community', 'Client Services', 'Aircraft Systems', 'Holistic Life Coaching', 'Medical Transcription', 'Watercolor', 'Medical Assisting', 'Makeup Artistry', 'Mindfulness', 'Blueprint Reading', 'Cash Handling', 'Teaching', 'Time Study', 'Transform', 'ProSystem fx Engagement', 'Selenium', 'Gel Electrophoresis', 'Finish Carpentry', 'Web Testing', 'Bodywork', 'CNC Programming', 'Linguistics', 'Machining', 'Customer Service Training', 'Fluorescence Microscopy', 'Certified Immunizer', 'Mechanical Product Design', 'C#'

Add to Spacy Matchmaker


In [7]:
# Create a spaCy PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)

# Convert each skill to a spaCy Doc object and add them to the matcher
patterns = [nlp(skill) for skill in unique_skills]
matcher.add("SKILLS", patterns)

Drop rows with links in "message" column


In [8]:
# Drop rows where 'message' contains 'href' (indicating links)
module_review = module_review[~module_review['message'].str.contains('href', na=False, case=False)]

Extracrt skills entities from the "message" column


In [9]:
def extract_skills_using_phrasematcher(text):
    # Ensure the input is a string
    if not isinstance(text, str):
        return [], []  # Return empty lists if the text is not a valid string

    doc = nlp(text)

    # Use matcher to find skills in text
    matches = matcher(doc)
    skills = [doc[start:end].text for match_id, start, end in matches]

    # Remove duplicates
    skills = list(set(skills))

    return skills

module_review['skill_entities'] = module_review['message'].apply(lambda x: extract_skills_using_phrasematcher(x))

# Display the DataFrame with the extracted skills
print(module_review[['message', 'skill_entities']].head())

                                             message  \
0   Taken in AY23/24 Semester 2. Lecturer: Prof D...   
1   Taken in AY23/24 Semester 2. Lecturer: Prof D...   
2   ACC1701X Lecturer: Adjunct Assoc. Prof. Deon ...   
4   Taken in AY19/20 Sem 2 Lecturer: Prof Winston...   
5   Lecturer: Prof Charles Shi Tutor: Mr Philip T...   

                                      skill_entities  
0                                                 []  
1  [Sales, Internal Controls, Accounts Receivable...  
2  [Internal Controls, Sales, Accounting, Cash, F...  
4                                                 []  
5                                                [C]  


Extract staff entities from message column


In [10]:
def extract_staff_names(text):
    if isinstance(text, str):
        doc = nlp(text)
        staff = []

        # Regex pattern to capture staff names with titles like 'Prof', 'Dr', 'Lecturer', 'Tutor'
        staff_pattern = re.compile(r'\b(Prof|Professor|Dr|Lecturer|Tutor|Instructor)\s*[A-Z][a-z]+(?:\s+[A-Z][a-z]+)?', re.IGNORECASE)
        
        for ent in doc.ents:
            match = staff_pattern.search(ent.text)
            if match:
                staff_name = match.group(0)
                
                # Exclude unwanted phrases that are falsely detected as staff names
                if not any(word in staff_name.lower() for word in ['tutorial', 'attendance', 'assignment', 'participation', 'ratios', 'draft', 'profile']):
                    staff.append(staff_name.strip())

        # Remove duplicates in staff
        staff = list(set(staff))
        return staff
    
    return [] 

# Apply modified staff extraction
module_review['staff_entities'] = module_review['message'].apply(lambda text: extract_staff_names(text))

# Display the DataFrame with the extracted skills and staff
print(module_review[['message', 'staff_entities']].head())

                                             message  \
0   Taken in AY23/24 Semester 2. Lecturer: Prof D...   
1   Taken in AY23/24 Semester 2. Lecturer: Prof D...   
2   ACC1701X Lecturer: Adjunct Assoc. Prof. Deon ...   
4   Taken in AY19/20 Sem 2 Lecturer: Prof Winston...   
5   Lecturer: Prof Charles Shi Tutor: Mr Philip T...   

              staff_entities  
0                         []  
1                [Prof Chan]  
2  [Prof Deon, Prof Winston]  
4                         []  
5                         []  


## Relationship Extraction


In [22]:
def create_dynamic_relationship(df, from_type, from_id_col, to_type, to_id_col, relationship_type, output_col):
    # List to store formatted relationship dictionaries for each row
    relationship_column = []

    # Iterate through each row of the DataFrame
    for _, row in df.iterrows():
        # Extract the `from_id` and `to_id` values from the specified columns
        from_ids = row[from_id_col] if isinstance(row[from_id_col], list) else [row[from_id_col]]
        to_ids = row[to_id_col] if isinstance(row[to_id_col], list) else [row[to_id_col]]

        # Create a list of dictionaries for each `to_id`
        relationship_dict = [
            {
                "from_type": from_type,
                "from_id": from_id,
                "to_type": to_type,
                "to_id": to_id,
                "type": relationship_type
            }
            for from_id in from_ids if pd.notna(from_id)
            for to_id in to_ids if pd.notna(to_id)  # Only include non-NaN `to_id` values
        ]

        # Append the relationship dictionary or an empty list if no valid `to_id` found
        relationship_column.append(relationship_dict if relationship_dict else [])

    # Add the relationships as a new column to the DataFrame
    df[output_col] = relationship_column
    
    # Return the updated DataFrame with the new relationships column
    return df

# Example usage for Module --> teaches_skill --> Skill relationship 
from_type = "Module"
from_id_col = "module_entities"  
to_type = "Skill"
to_id_col = "skill_entities"   
relationship_type = "teaches_skill"
output_col = "relationship1"

# Create relationships for each row and add them to the DataFrame
module_review_rs = create_dynamic_relationship(module_review, from_type, from_id_col, to_type, to_id_col, relationship_type, output_col)

# Example usage for Staff --> teaches_module --> Module relationship 
from_type = "Staff"
from_id_col = "staff_entities"  
to_type = "Module"
to_id_col = "module_entities"   
relationship_type = "teaches_module"
output_col = "relationship2"

# Create relationships for each row and add them to the DataFrame
module_review_rs = create_dynamic_relationship(module_review_rs, from_type, from_id_col, to_type, to_id_col, relationship_type, output_col)

# Display the DataFrame with relationships
print(module_review_rs[['module_entities', 'skill_entities', 'staff_entities', 'relationship1', 'relationship2']].head())

  module_entities                                     skill_entities  \
0      [ACC1701X]                                                 []   
1      [ACC1701X]  [Sales, Internal Controls, Accounts Receivable...   
2      [ACC1701X]  [Internal Controls, Sales, Accounting, Cash, F...   
4      [ACC1701X]                                                 []   
5      [ACC1701X]                                                [C]   

              staff_entities  \
0                         []   
1                [Prof Chan]   
2  [Prof Deon, Prof Winston]   
4                         []   
5                         []   

                                       relationship1  \
0                                                 []   
1  [{'from_type': 'Module', 'from_id': 'ACC1701X'...   
2  [{'from_type': 'Module', 'from_id': 'ACC1701X'...   
4                                                 []   
5  [{'from_type': 'Module', 'from_id': 'ACC1701X'...   

                                     

Save updated DF to a new CSV File


In [23]:
module_review_rs.to_csv('mock_module_reviews_final.csv', index=False)