## Preprocessing for School Location Database


In [1]:
import os
import pandas as pd

# Directory containing the school CSV files
schools_dir = '../data/landing/schools'

# List all CSV files in the directory. os.listdir returns names in arbitrary
# order, so sort them to keep the processing order (and therefore the row
# order of anything built from this list) reproducible across machines.
csv_files = sorted(f for f in os.listdir(schools_dir) if f.endswith('.csv'))

# Read only the header row of each file (nrows=0) and record its column names
schemas = {}
for file in csv_files:
    csv_path = os.path.join(schools_dir, file)
    try:
        df = pd.read_csv(csv_path, nrows=0, encoding='utf-8')
    except UnicodeDecodeError:
        # Retry with latin1 when the file is not valid UTF-8
        df = pd.read_csv(csv_path, nrows=0, encoding='latin1')
    schemas[file] = set(df.columns)

# Display the columns for each file
for file, cols in schemas.items():
    print(f"{file}: {sorted(cols)}")

# Check if all schemas match (vacuously True when no CSV files were found)
all_schemas = list(schemas.values())
schemas_match = all(s == all_schemas[0] for s in all_schemas)
print(f"\nSchemas match: {schemas_match}")

school_locations_2023.csv: ['Address_Line_1', 'Address_Line_2', 'Address_Postcode', 'Address_State', 'Address_Town', 'Education_Sector', 'Entity_Type', 'Full_Phone_No', 'LGA_ID', 'LGA_Name', 'Postal_Address_Line_1', 'Postal_Address_Line_2', 'Postal_Postcode', 'Postal_State', 'Postal_Town', 'School_Name', 'School_No', 'School_Status', 'School_Type', 'X', 'Y']
school_locations_2025.csv: ['Address_Line_1', 'Address_Line_2', 'Address_Postcode', 'Address_State', 'Address_Town', 'Area', 'Education_Sector', 'Entity_Type', 'Full_Phone_No', 'LGA_ID', 'LGA_Name', 'LGA_TYPE', 'Postal_Address_Line_1', 'Postal_Address_Line_2', 'Postal_Postcode', 'Postal_State', 'Postal_Town', 'Region', 'School_Name', 'School_No', 'School_Status', 'School_Type', 'X', 'Y']
school_locations_2024.csv: ['AREA_Name', 'Address_Line_1', 'Address_Line_2', 'Address_Postcode', 'Address_State', 'Address_Town', 'Education_Sector', 'Entity_Type', 'Full_Phone_No', 'LGA_ID', 'LGA_Name', 'Postal_Address_Line_1', 'Postal_Address_Lin

In [None]:
# Target schema that every yearly school file is mapped onto.
# Columns treated as the shared core of the yearly files:
core_columns = [
    'Address_Line_1', 'Address_Line_2', 'Address_Postcode', 'Address_State', 'Address_Town',
    'Education_Sector', 'Entity_Type', 'Full_Phone_No', 'LGA_ID', 'LGA_Name',
    'Postal_Address_Line_1', 'Postal_Address_Line_2', 'Postal_Postcode', 'Postal_State', 'Postal_Town',
    'School_Name', 'School_No', 'School_Type', 'X', 'Y',
]
# Columns that only exist in some years:
optional_columns = ['Area', 'LGA_TYPE', 'Region', 'School_Status']

# Order matters: it is the column order of the standardized output
standard_columns = [*core_columns, *optional_columns]

print("Standardized schema:")
print("\n".join(f"  - {col}" for col in standard_columns))


In [None]:
# Function to standardize column names and add missing columns
def standardize_school_dataframe(df, year, columns=None):
    """
    Standardize a school dataframe to have consistent columns.

    Known column-name variants are renamed to their standard names, the frame
    is then restricted to the target schema (columns outside it are dropped,
    columns absent from the input are added filled with NaN, and the result
    follows the schema's column order), and finally an 'establishment_year'
    column holding `year` in every row is appended. The input dataframe is
    not modified.

    Parameters:
    df: pandas DataFrame
    year: value written to the 'establishment_year' column; the callers in
        this notebook pass the year parsed from the file name
        (2023, 2024, or 2025)
    columns: optional iterable of target column names, in output order.
        Defaults to the module-level `standard_columns`.

    Returns:
    pandas DataFrame with standardized columns plus 'establishment_year'
    """
    # Resolve the schema at call time so the module-level list is only needed
    # when the caller does not supply one
    target_columns = list(standard_columns if columns is None else columns)

    # Handle column name variations
    column_mapping = {
        'AREA_Name': 'Area',  # 2024 has AREA_Name instead of Area
        'Region_Name': 'Region'  # 2024 has Region_Name instead of Region
    }

    # rename() returns a new frame, so the caller's dataframe is left untouched.
    # reindex() then adds missing columns as NaN, drops extras and applies the
    # schema order in a single step.
    df_std = df.rename(columns=column_mapping).reindex(columns=target_columns)

    # NOTE(review): the value stored here is whatever the caller passes; in this
    # notebook that is the year of the source file, which is not necessarily the
    # year the school was established - confirm the column name is intended.
    df_std['establishment_year'] = year

    return df_std

# Test the function with a small sample
print("Testing standardization function...")

# Source-column aliases that standardize_school_dataframe renames onto standard
# names. Mirrored here so that a renamed column (e.g. AREA_Name -> Area) is not
# misreported below as a column that had to be added.
renamed_aliases = {'AREA_Name': 'Area', 'Region_Name': 'Region'}

for file in csv_files:
    year = file.split('_')[-1].split('.')[0]  # Extract year from filename
    print(f"\nProcessing {file} (year: {year})")

    # Load a small sample to test
    sample_path = os.path.join(schools_dir, file)
    try:
        df_sample = pd.read_csv(sample_path, nrows=5, encoding='utf-8')
    except UnicodeDecodeError:
        # Retry with latin1 when the file is not valid UTF-8
        df_sample = pd.read_csv(sample_path, nrows=5, encoding='latin1')

    # Standardize
    df_std = standardize_school_dataframe(df_sample, year)

    # Column names the sample provides once the known renames are applied
    present_after_rename = {renamed_aliases.get(col, col) for col in df_sample.columns}

    print(f"Original columns: {len(df_sample.columns)}")
    print(f"Standardized columns: {len(df_std.columns)}")
    print(f"Missing columns added: {[col for col in standard_columns if col not in present_after_rename]}")


In [None]:
# Read every yearly school file in full, map it onto the standard schema,
# and stack the results into one dataframe
print("Loading and standardizing all school datasets...")
standardized_dfs = []

for file in csv_files:
    # The year is the text between the last "_" and the following "."
    year = file.rsplit('_', 1)[-1].split('.')[0]
    print(f"\nProcessing {file} (year: {year})")

    # Full read; retry with latin1 when the file is not valid UTF-8
    csv_path = os.path.join(schools_dir, file)
    try:
        df = pd.read_csv(csv_path, encoding='utf-8')
    except UnicodeDecodeError:
        df = pd.read_csv(csv_path, encoding='latin1')

    print(f"  Loaded {df.shape[0]} schools")

    # Bring this year's frame onto the shared schema and keep it for the concat
    df_std = standardize_school_dataframe(df, year)
    standardized_dfs.append(df_std)

    print(f"  Standardized to {df_std.shape[1]} columns")

# Stack the per-year frames with a fresh 0..n-1 index
print(f"\nCombining {len(standardized_dfs)} datasets...")
combined_schools = pd.concat(standardized_dfs, ignore_index=True)

print(f"Combined dataset shape: {combined_schools.shape}")
print(f"Total schools: {combined_schools.shape[0]}")
print(f"\nEstablishment year distribution:")
year_distribution = combined_schools['establishment_year'].value_counts().sort_index()
print(year_distribution)


In [None]:
# Quick look at the combined data: first rows, column dtypes, and null counts
preview_columns = ['School_Name', 'School_Type', 'Address_Town', 'establishment_year']
print("Sample of combined dataset:")
print(combined_schools[preview_columns].head(10))

print(f"\nData types:")
print(combined_schools.dtypes)

# Only columns that actually contain nulls are listed
print(f"\nMissing values per column:")
missing_values = combined_schools.isna().sum()
print(missing_values.loc[missing_values.gt(0)])


In [None]:
# Save the combined dataset
output_path = '../data/processed/schools/combined_schools_standardized.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)

combined_schools.to_csv(output_path, index=False)
print(f"Combined dataset saved to: {output_path}")

# Also save a summary
summary_path = '../data/curated/schools/schools_summary.txt'
# The summary goes to a different directory than the CSV, so it needs its own
# makedirs; without it open() fails when the directory does not exist yet.
os.makedirs(os.path.dirname(summary_path), exist_ok=True)

# Explicit encoding so the file does not depend on the platform default
with open(summary_path, 'w', encoding='utf-8') as f:
    f.write("Schools Dataset Summary\n")
    f.write("=" * 50 + "\n\n")
    f.write(f"Total schools: {len(combined_schools)}\n")
    f.write(f"Columns: {len(combined_schools.columns)}\n\n")
    f.write("Establishment year distribution:\n")
    for year, count in combined_schools['establishment_year'].value_counts().sort_index().items():
        f.write(f"  {year}: {count} schools\n")
    f.write(f"\nMissing values:\n")
    # Only columns that actually contain nulls are listed
    missing_values = combined_schools.isnull().sum()
    for col, count in missing_values[missing_values > 0].items():
        f.write(f"  {col}: {count} missing\n")

print(f"Summary saved to: {summary_path}")
