In [2]:
import pandas as pd
import os
from pathlib import Path
from imputers import impute_with_constant, impute_with_median
from encoding import encode_categorical_values

# Root folder holding one sub-folder of student tables per data split.
INPUT_DIR = Path.home().joinpath("Desktop", "newData")
# Root folder holding the tables that are shared by every split.
INPUT_DIR2 = Path.home().joinpath("Desktop", "anonymisedData")

# Strategy name -> imputation callable. The name is also used as the
# output sub-folder when results are written.
STRATEGIES = dict(
    constant=impute_with_constant,
    median=impute_with_median,
)

# Split-independent tables, loaded once and reused for every split.
assessments = pd.read_csv(INPUT_DIR2 / "assessments.csv")
courses = pd.read_csv(INPUT_DIR2 / "courses.csv")
vle = pd.read_csv(INPUT_DIR2 / "vle.csv")

def impute_missing_values(set, strategy, stored_values=None, stored_categories=None):
    """Impute, encode and export all tables for one data split.

    Reads the four per-split student tables from ``INPUT_DIR/<set>``,
    combines them with the shared ``assessments``/``courses``/``vle`` tables,
    runs the chosen imputation strategy followed by categorical encoding, and
    writes the seven resulting tables as CSV files to
    ``~/Desktop/Logistic Regression/<strategy>/<set>``.

    Args:
        set: Name of the split sub-folder under ``INPUT_DIR`` (the driver loop
            passes "train", "val" and "test").
        strategy: Key into ``STRATEGIES`` selecting the imputation callable;
            also used as the output sub-folder name.
        stored_values: Values carried over from an earlier split. They are
            returned unchanged when the strategy does not return a tuple.
        stored_categories: Categories carried over from an earlier split;
            forwarded to ``encode_categorical_values`` (``None`` for train).

    Returns:
        A ``(stored_values, stored_categories)`` tuple to pass to the call for
        the next split.

    Raises:
        KeyError: If ``strategy`` is not a key of ``STRATEGIES``.
    """
    print(f"\n Applying strategy '{strategy}' on set '{set}'")

    split_dir = Path(INPUT_DIR) / set
    dataframes = {
        "student_info": pd.read_csv(split_dir / "student_info.csv"),
        "student_vle": pd.read_csv(split_dir / "student_vle.csv"),
        "student_reg": pd.read_csv(split_dir / "student_registration.csv"),
        "student_assessment": pd.read_csv(split_dir / "student_assessment.csv"),
        # The shared tables are module-level objects reused by every call.
        # Hand out copies so that any in-place imputation/encoding cannot leak
        # into later splits or into the next strategy's run.
        "assessments": assessments.copy(),
        "courses": courses.copy(),
        "vle": vle.copy(),
    }

    strategy_func = STRATEGIES[strategy]
    # NOTE(review): stored_values is never forwarded to the strategy, so
    # val/test cannot reuse statistics computed on train here -- confirm
    # whether the imputers are expected to accept them.
    result = strategy_func(dataframes)
    if isinstance(result, tuple):
        dataframes, new_stored_values = result
    else:
        dataframes = result
        new_stored_values = stored_values

    # Inspect the frame returned by the strategy rather than the one loaded
    # from disk, so the diagnostics describe the imputed data even when the
    # strategy returns new objects instead of mutating in place.
    imd_band = dataframes["student_info"]["imd_band"]
    print("Unique values => ", imd_band.unique())
    print("Missing values => ", imd_band.isna().sum())

    # For train: stored_categories is None; for val/test pass stored_categories
    dataframes, new_stored_categories = encode_categorical_values(dataframes, stored_categories)

    output_dir = Path.home() / "Desktop" / "Logistic Regression" / strategy / set
    output_dir.mkdir(parents=True, exist_ok=True)

    print("\nFINAL MISSING VALUE CHECK:")
    for col in ['imd_band', 'highest_education', 'age_band']:
        missing = dataframes['student_info'][col].isna().sum()
        print(f"{col}: {missing} missing")

    # Each table is written to "<key>.csv"; the explicit tuple keeps the set
    # of exported files fixed even if the pipeline adds extra dict entries.
    exported_tables = (
        "student_info",
        "student_assessment",
        "student_vle",
        "student_reg",
        "assessments",
        "courses",
        "vle",
    )
    for table_name in exported_tables:
        dataframes[table_name].to_csv(output_dir / f"{table_name}.csv", index=False)

    # Returning stored values and categories for next splits
    return new_stored_values, new_stored_categories

# Run the whole pipeline once per imputation strategy.
for strategy in STRATEGIES:
    # Start every strategy with nothing carried over from a previous one.
    stored_values = stored_categories = None
    # Splits are processed in order so that whatever the train call returns
    # is handed on to the val call, and val's result on to test.
    for split in ("train", "val", "test"):
        stored_values, stored_categories = impute_missing_values(
            split,
            strategy,
            stored_values,
            stored_categories,
        )


 Applying strategy 'constant' on set 'train'

=== ENCODING FUNCTION STARTED ===
Unique values =>  ['90-100%' '30-40%' '50-60%' 'Missing' '20-30%' '70-80%' '60-70%' '10-20'
 '80-90%' '0-10%' '40-50%']
Missing values =>  0

FINAL MISSING VALUE CHECK:
imd_band: 0 missing
highest_education: 0 missing
age_band: 0 missing

 Applying strategy 'constant' on set 'val'

=== ENCODING FUNCTION STARTED ===
Unique values =>  ['50-60%' '90-100%' '70-80%' '0-10%' '30-40%' 'Missing' '20-30%' '80-90%'
 '10-20' '40-50%' '60-70%']
Missing values =>  0

FINAL MISSING VALUE CHECK:
imd_band: 0 missing
highest_education: 0 missing
age_band: 0 missing

 Applying strategy 'constant' on set 'test'

=== ENCODING FUNCTION STARTED ===
Unique values =>  ['20-30%' '80-90%' '30-40%' '70-80%' '60-70%' '40-50%' '50-60%' '90-100%'
 '10-20' '0-10%' 'Missing']
Missing values =>  0

FINAL MISSING VALUE CHECK:
imd_band: 0 missing
highest_education: 0 missing
age_band: 0 missing

 Applying strategy 'median' on set 'train'
Un