Import libraries


In [133]:
import pandas as pd 
import spacy 
import re 

# 04 - mock module reviews


Load the data


In [134]:
# Location of the raw module-review export, relative to this notebook.
module_review_path = '../04 - mock_module_reviews.csv'

# Read the reviews into a DataFrame.
module_review = pd.read_csv(filepath_or_buffer=module_review_path)

# Preview the first five rows to sanity-check the columns.
print(module_review.head(n=5))

  module_code          id                                            message  \
0    ACC1701X  6472570545   Taken in AY23/24 Semester 2. Lecturer: Prof D...   
1    ACC1701X  6467819412   Taken in AY23/24 Semester 2. Lecturer: Prof D...   
2    ACC1701X  6452830983   ACC1701X Lecturer: Adjunct Assoc. Prof. Deon ...   
3    ACC1701X  5896942778   Taken in AY 21/22 Sem 2, review at: <a href="...   
4    ACC1701X  5889427965   Taken in AY19/20 Sem 2 Lecturer: Prof Winston...   

            created_at  likes  dislikes  
0  2024-06-03T01:36:26      0         0  
1  2024-05-27T13:10:04      0         0  
2  2024-05-07T04:48:19      0         0  
3  2022-06-24T09:55:49      0         0  
4  2022-06-16T10:03:57      0         0  


Load spacy model for NER


In [135]:
# Load the spaCy pipeline "en_core_web_sm" once and keep it in `nlp`.
# extract_skills_and_staff() calls `nlp(text)` and reads the entities in `doc.ents`.
nlp = spacy.load("en_core_web_sm")

Extract module entities


In [136]:
def extract_module_entities(module_code):
    """Wrap a module code in a single-element list.

    Args:
        module_code: Value taken from the 'module_code' column.

    Returns:
        ``[module_code]`` (the value is returned unmodified, not stripped)
        when ``module_code`` is a string containing at least one
        non-whitespace character; otherwise an empty list.
    """
    # Guard: anything that is not a string (e.g. NaN, None) has no entity.
    if not isinstance(module_code, str):
        return []

    # Guard: empty or whitespace-only strings have no entity either.
    if not module_code.strip():
        return []

    return [module_code]

# Build the 'module_entities' column by passing every 'module_code' value
# through extract_module_entities (the function is handed to apply directly).
module_review['module_entities'] = module_review['module_code'].apply(extract_module_entities)

# Show the first rows of the source column next to the derived column.
print(module_review[['module_code', 'module_entities']].head())

  module_code module_entities
0    ACC1701X      [ACC1701X]
1    ACC1701X      [ACC1701X]
2    ACC1701X      [ACC1701X]
3    ACC1701X      [ACC1701X]
4    ACC1701X      [ACC1701X]


Drop rows with links in "message" column


In [137]:
# Drop rows where 'message' contains 'href' (indicating links).
# The match is case-insensitive; rows whose message is missing (NaN) count as
# "no link" (na=False) and are therefore kept.
has_link = module_review['message'].str.contains('href', na=False, case=False)

# .copy() makes the filtered frame independent of the original, so that the
# columns added to it later in the notebook do not trigger pandas'
# SettingWithCopyWarning.
module_review = module_review[~has_link].copy()

Extract skills and staff entities from the "message" column


In [138]:
# Keywords describing course logistics (assessments, platforms, scheduling).
# An entity containing any of them as a whole word is never treated as a skill.
_EXCLUDE_KEYWORDS = frozenset({
    'lecture', 'tutorial', 'midterm', 'mid-term', 'finals', 'quiz', 'module', 'review', 'project', 'assessment', 'maq', 'essay', 'report',
    'mission', 'assignment', 'group', 'attendance', 'elearning', 'course', 'mcq', 'participation', 'exam', 'grade', 'workload', 'presentation',
    'ca', 'ca1', 'ca2', 'semester', 'sem', 'qna', 'syllabus', 'pyp', 'practical', 'pe', 'quest', 'content', 'coursemology', 'academy',
})

# Exclamations and filler words that are not skills (compared in lowercase).
_NON_SKILL_WORDS = frozenset({'awesome', 'hahaha', 'haha', 'lol', 'cool', 'wow', 'excellent'})

# Entity labels that are eligible to be recorded as skills.
_SKILL_LABELS = frozenset({'ORG', 'PRODUCT', 'SKILL', 'WORK_OF_ART', 'LANGUAGE'})

# One alternation over all exclude keywords: keyword at a word boundary,
# optionally followed by a single hyphen or whitespace character.
# Compiled once here instead of being rebuilt on every call.
_EXCLUDE_PATTERN = re.compile(
    r'\b(?:' + '|'.join(re.escape(keyword) for keyword in sorted(_EXCLUDE_KEYWORDS)) + r')[-\s]?\b',
    re.IGNORECASE,
)

# Letter grades (e.g. A+, B-, C) and the tokens S/U, CS, CU.
_GRADE_PATTERN = re.compile(r'\b[A-F][+-]?\b|\bS/U\b|\bCS\b|\bCU\b', re.IGNORECASE)

# A title ('Prof', 'Dr', 'Lecturer', 'Tutor', 'Instructor') followed by
# whitespace and one or two words. Only whitespace is accepted between the
# title and the name ("Prof-Name" / "Prof:Name" are not matched).
# NOTE(review): re.IGNORECASE also applies to [A-Z][a-z]+, so the name words
# are not actually required to be capitalised - confirm whether that is intended.
_STAFF_PATTERN = re.compile(r'\b(Prof|Dr|Lecturer|Tutor|Instructor)\s+[A-Z][a-z]+(?:\s+[A-Z][a-z]+)?', re.IGNORECASE)

# One or two capitalised words (e.g. "Firstname Lastname"); case-sensitive.
_NAME_PATTERN = re.compile(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)?\b')


def extract_skills_and_staff(text, module_entities, all_module_codes):
    """Split the named entities found in a review into skills and staff names.

    The text is run through the module-level ``nlp`` pipeline and every entity
    in ``doc.ents`` is classified, in this order:

    1. If the entity text contains a title followed by a name
       (``_STAFF_PATTERN``), the first such match is recorded as staff.
    2. Otherwise, entities labelled ``PERSON`` or whose text starts with one or
       two capitalised words (``_NAME_PATTERN``) are discarded.
    3. Otherwise, entities whose label is in ``_SKILL_LABELS`` are recorded as
       skills unless they contain an exclude keyword, contain a grade token,
       are a filler word, or equal a known module code (case-insensitive).

    Args:
        text: Review text. Anything that is not a ``str`` (e.g. NaN) yields
            two empty lists.
        module_entities: Not used by this function; kept so existing callers
            keep working.
        all_module_codes: Iterable of module codes to exclude from the skills.
            Non-string members (e.g. NaN) are ignored.

    Returns:
        tuple[list[str], list[str]]: ``(skills, staff)``, each de-duplicated
        and in order of first appearance in the text.
    """
    # Always hand back a pair so callers can unpack the result safely.
    if not isinstance(text, str):
        return [], []

    # Lowercase the module codes once per call rather than once per entity.
    module_codes_lower = {
        code.lower() for code in (all_module_codes or ()) if isinstance(code, str)
    }

    skills = []
    staff = []

    for ent in nlp(text).ents:
        # Staff: a title followed by a name anywhere inside the entity text.
        staff_match = _STAFF_PATTERN.search(ent.text)
        if staff_match:
            staff.append(staff_match.group(0).strip())
            continue

        # People (by label or by name-like shape) are never skills.
        if ent.label_ == 'PERSON' or _NAME_PATTERN.match(ent.text):
            continue

        if ent.label_ not in _SKILL_LABELS:
            continue

        entity_lower = ent.text.lower()
        is_excluded = (
            _EXCLUDE_PATTERN.search(entity_lower)
            or _GRADE_PATTERN.search(ent.text)
            or entity_lower in _NON_SKILL_WORDS
            or entity_lower in module_codes_lower
        )
        if not is_excluded:
            skills.append(ent.text)

    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # result is identical on every run (set() ordering of strings is not).
    return list(dict.fromkeys(skills)), list(dict.fromkeys(staff))
    
# Gather all unique module codes (lowercased) from the rows still present in
# module_review. Missing codes are dropped first: .str.lower() leaves them as
# NaN, and a NaN inside the set would break the string comparison performed in
# extract_skills_and_staff.
all_module_codes = set(module_review['module_code'].dropna().str.lower().unique())

# 'message' is the column holding the review text for each module.
# Each row's result is wrapped in a pd.Series so the two returned lists land in
# the two new columns.
module_review[['skills_entities', 'staff_entities']] = module_review.apply(
    lambda row: pd.Series(extract_skills_and_staff(row['message'], row['module_entities'], all_module_codes)), axis=1)


# Display the DataFrame with the extracted skills and staff
print(module_review[['message', 'skills_entities', 'staff_entities']].head())

                                             message  \
0   Taken in AY23/24 Semester 2. Lecturer: Prof D...   
1   Taken in AY23/24 Semester 2. Lecturer: Prof D...   
2   ACC1701X Lecturer: Adjunct Assoc. Prof. Deon ...   
4   Taken in AY19/20 Sem 2 Lecturer: Prof Winston...   
5   Lecturer: Prof Charles Shi Tutor: Mr Philip T...   

                      skills_entities             staff_entities  
0                                  []                         []  
1  [FSA, COGS, DuPont Framework, VAT]                [Prof Chan]  
2        [the Very Good to Excellent]  [Prof Winston, Prof Deon]  
4                           [ACC/FIN]                         []  
5                                  []                         []  


Save updated DF to a new CSV File


In [139]:
# Write the filtered, enriched DataFrame to 'mock_module_reviews_final.csv'
# (a path relative to the current working directory); index=False leaves the
# row index out of the file.
module_review.to_csv('mock_module_reviews_final.csv', index=False)