In [29]:
import pandas as pd

# Ordinal encodings for the ordered categorical columns of student_info.
# Keys are the normalised spellings produced by _encode_ordinal_student_columns.
_EDUCATION_LEVELS = {
    'no formal quals': 0,
    'lower than a level': 1,
    'a level or equivalent': 2,
    'he qualification': 3,
    'post graduate qualification': 4,
}

_AGE_BANDS = {
    '0-35': 0,
    '35-55': 1,
    '55<=': 2,
}

_IMD_BANDS = {
    '0-10': 0,
    '10-20': 1,
    '20-30': 2,
    '30-40': 3,
    '40-50': 4,
    '50-60': 5,
    '60-70': 6,
    '70-80': 7,
    '80-90': 8,
    '90-100': 9,
    'Missing': -1,
}


def _encode_ordinal_student_columns(student_info):
    """Return a copy of *student_info* with its ordered categories as integers.

    The input frame is left untouched so the function can be re-run on the
    same raw data. Values that are not keys of the mapping tables (including
    NaN) come out as NaN, because ``Series.map`` does not raise on misses.
    """
    encoded = student_info.copy()

    # Normalise spelling before the lookup: trim whitespace and lowercase.
    education = encoded['highest_education'].str.strip().str.lower()
    encoded['highest_education'] = education.map(_EDUCATION_LEVELS)

    encoded['age_band'] = encoded['age_band'].map(_AGE_BANDS)

    # Normalise e.g. ' 10-20% ' to '10-20'. regex=False makes these literal
    # replacements regardless of the pandas version's default for `regex`.
    imd = (
        encoded['imd_band']
        .str.strip()
        .str.replace(' ', '', regex=False)
        .str.replace('%', '', regex=False)
    )
    encoded['imd_band'] = imd.map(_IMD_BANDS)

    return encoded


def _one_hot_encode(frame, columns, stored_columns=None):
    """One-hot encode *columns* of *frame*.

    With ``stored_columns=None`` (training) the first level of every encoded
    column is dropped as the baseline. Otherwise (validation/test) every level
    is encoded and the result is aligned to *stored_columns*: columns absent
    from the training layout (the baseline and any unseen level) are removed
    and training columns missing from this split are added, filled with 0.
    """
    if stored_columns is None:
        return pd.get_dummies(frame, columns=columns, drop_first=True)

    # Do NOT use drop_first here: it would drop whichever level happens to
    # sort first in *this* split, which is not necessarily the training
    # baseline. Aligning to the stored columns removes the right one.
    encoded = pd.get_dummies(frame, columns=columns, drop_first=False)
    return encoded.reindex(columns=stored_columns, fill_value=0)


def encode_categorical_values(dataframes, stored_categories=None):
    """Encode the categorical columns of the student_info, assessments and vle tables.

    In ``student_info`` the columns ``highest_education``, ``age_band`` and
    ``imd_band`` are mapped to integer codes; ``gender``, ``region`` and
    ``disability`` are one-hot encoded. ``assessment_type`` (assessments) and
    ``activity_type`` (vle) are one-hot encoded as well. The input frames are
    not modified.

    Args:
        dataframes: dict of DataFrames with the keys ``student_info``,
            ``assessments``, ``vle``, ``student_vle``, ``student_reg``,
            ``student_assessment`` and ``courses``.
        stored_categories: ``None`` when encoding the training split.
            For validation/test, pass the second value returned by the
            training call so the encoded frames get the same columns.

    Returns:
        A tuple ``(encoded, stored_categories)``. ``encoded`` is a dict with
        the same seven keys, where ``student_info``, ``assessments`` and
        ``vle`` are the encoded frames and the remaining entries are passed
        through unchanged. ``stored_categories`` holds the column layout of
        the three encoded frames (newly built when fitting, otherwise the
        object that was passed in).
    """
    is_training = stored_categories is None

    def layout(key):
        # Column layout to align to; None means "fit a new layout".
        return None if is_training else stored_categories[key]

    student_info = _encode_ordinal_student_columns(dataframes['student_info'])

    student_info_encoded = _one_hot_encode(
        student_info,
        ['gender', 'region', 'disability'],
        layout('student_info_columns'),
    )
    assessments_encoded = _one_hot_encode(
        dataframes['assessments'],
        ['assessment_type'],
        layout('assessments_columns'),
    )
    vle_encoded = _one_hot_encode(
        dataframes['vle'],
        ['activity_type'],
        layout('vle_columns'),
    )

    if is_training:
        # Remember the training layout so later splits can be aligned to it.
        stored_categories = {
            'student_info_columns': student_info_encoded.columns,
            'assessments_columns': assessments_encoded.columns,
            'vle_columns': vle_encoded.columns,
        }

    # Return the encoded dataframes plus stored_categories for reuse on val/test.
    return {
        "student_info": student_info_encoded,
        "student_vle": dataframes['student_vle'],
        "student_reg": dataframes['student_reg'],
        "student_assessment": dataframes['student_assessment'],
        "assessments": assessments_encoded,
        "courses": dataframes['courses'],
        "vle": vle_encoded,
    }, stored_categories
