<a href="https://colab.research.google.com/github/kadefue/MoEST/blob/main/MoEST_Refactored_Secondary_School_Enrollment_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# -*- coding: utf-8 -*-
"""Refactored MoEST Modeling for Secondary School Enrollment
"""

import os
import re
import numpy as np
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import (
    RandomForestRegressor,
    HistGradientBoostingRegressor,
    GradientBoostingRegressor
)
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
from google.colab import drive

# =============================================================================
# CLASS 1: Data Loader & Cleaner
# =============================================================================

class MOESTDataLoader:
    """
    Loads the CSV exports found in one directory and applies first-pass cleaning.

    Cleaned frames are stored in ``self.dataframes`` keyed by the file name
    without its extension. Files whose name contains any of
    ``exclude_keywords`` (case-insensitive) are skipped.
    """

    # A text column holding comma-formatted values is converted to numbers only
    # when at least this share of its non-missing values parse. Converting on
    # "any value parses" would blank out free-text columns that merely contain
    # a comma and the odd numeric-looking cell.
    NUMERIC_PARSE_RATIO = 0.5

    def __init__(self, base_directory, exclude_keywords=None):
        self.base_directory = base_directory
        self.exclude_keywords = exclude_keywords if exclude_keywords else []
        self.dataframes = {}

    def mount_drive(self):
        """Mount Google Drive at /content/drive/ (requires google.colab)."""
        drive.mount('/content/drive/')

    def get_file_list(self):
        """
        Return the CSV file names in ``base_directory`` that pass the keyword filter.

        The extension match is case-insensitive and the result is sorted so the
        load order does not depend on the order returned by ``os.listdir``.
        """
        excluded = [keyword.lower() for keyword in self.exclude_keywords]
        selected = []
        for file_name in sorted(os.listdir(self.base_directory)):
            lowered = file_name.lower()
            if not lowered.endswith('.csv'):
                continue
            if any(keyword in lowered for keyword in excluded):
                continue
            selected.append(file_name)
        return selected

    def load_data(self):
        """
        Read every selected CSV, clean it and store it in ``self.dataframes``.

        A file that fails to load is reported and skipped (best effort); the
        remaining files are still loaded.
        """
        files = self.get_file_list()
        print(f"Found {len(files)} files to load.")
        for file_name in files:
            file_path = os.path.join(self.base_directory, file_name)
            # splitext removes only the trailing extension; str.replace would
            # also strip a '.csv' occurring in the middle of the name.
            df_name = os.path.splitext(file_name)[0]
            try:
                df = pd.read_csv(file_path)
                self.dataframes[df_name] = self._initial_clean(df)
                print(f"Loaded: {df_name}")
            except Exception as e:
                print(f"Error loading {file_name}: {e}")

    def _initial_clean(self, df):
        """
        Standardize headers and clean numeric columns (modifies ``df`` in place).

        - Headers are stripped and upper-cased.
        - Text columns containing commas are converted to numbers (commas
          removed) when enough of their values parse; see NUMERIC_PARSE_RATIO.
        - 'UNNAMED' columns that are more than 90% null are dropped.

        Returns the same DataFrame object.
        """
        df.columns = [str(col).strip().upper() for col in df.columns]

        # 'string' is included so columns stored with pandas' dedicated string
        # dtype are treated the same as plain object columns.
        for col in df.select_dtypes(include=['object', 'string']).columns:
            as_text = df[col].astype(str)
            if not as_text.str.contains(',', regex=False).any():
                continue
            converted = pd.to_numeric(as_text.str.replace(',', '', regex=False), errors='coerce')
            non_missing = df[col].notna().sum()
            if non_missing and converted.notna().sum() / non_missing >= self.NUMERIC_PARSE_RATIO:
                df[col] = converted

        unnamed = [c for c in df.columns if 'UNNAMED' in c]
        to_drop = [c for c in unnamed if df[c].isnull().mean() > 0.9]
        df.drop(columns=to_drop, inplace=True)
        return df

    def get_dataframe(self, name):
        """Return the cleaned frame stored under ``name``, or None if absent."""
        return self.dataframes.get(name)

    def get_all_dataframes(self):
        """Return the internal name -> DataFrame dict (not a copy)."""
        return self.dataframes

# =============================================================================
# CLASS 2: Geography & Location Manager
# =============================================================================

class LocationManager:
    """
    Handles Geocoding, LGA Status merging, and Clustering.

    Operates on the shared name -> DataFrame dict passed to the constructor;
    merged frames replace the entries of that dict.
    """

    # Accepted spellings of the location columns, canonical name first.
    REGION_ALIASES = ('REGION', 'REGON')
    COUNCIL_ALIASES = ('COUNCIL', 'DISTRICT', 'LGA NAME')

    def __init__(self, dataframes, geodata_path):
        self.dataframes = dataframes
        self.geodata_path = geodata_path
        self.geo_data = None
        self.lga_status_df = None

    def standardize_location_columns(self):
        """
        Rename location columns to REGION / COUNCIL and upper-case their values.

        Only frames that have both a region and a council column are touched
        (in place). When a frame carries several aliases, the canonical name
        wins, so the rename can never create two columns with the same name.
        """
        for name, df in self.dataframes.items():
            reg_col = next((a for a in self.REGION_ALIASES if a in df.columns), None)
            cou_col = next((a for a in self.COUNCIL_ALIASES if a in df.columns), None)

            if reg_col and cou_col:
                df.rename(columns={reg_col: 'REGION', cou_col: 'COUNCIL'}, inplace=True)
                df['REGION'] = df['REGION'].astype(str).str.upper()
                df['COUNCIL'] = df['COUNCIL'].astype(str).str.upper()

    def merge_lga_status(self, lga_df_name='LGAs Urban and Rural Status'):
        """
        Left-merge the LGA status table into every frame with REGION and COUNCIL.

        The status table drops REMARKS, renames CLASSIFICATION to LGA_STATUS and
        is reduced to one row per (REGION, COUNCIL) so the merge cannot multiply
        rows. Columns a frame already has are not merged again. On success the
        status table is removed from ``self.dataframes``.
        """
        if lga_df_name not in self.dataframes:
            print("LGA Status DataFrame not found.")
            return

        keys = ['REGION', 'COUNCIL']
        status = self.dataframes[lga_df_name].copy()
        if 'REMARKS' in status.columns:
            status.drop(columns=['REMARKS'], inplace=True)
        status.rename(columns={'CLASSIFICATION': 'LGA_STATUS'}, inplace=True)

        if any(k not in status.columns for k in keys):
            print("LGA Status DataFrame has no REGION/COUNCIL columns; merge skipped.")
            return

        # First row wins when a council is listed more than once.
        status = status.drop_duplicates(subset=keys).reset_index(drop=True)
        self.lga_status_df = status

        for name, df in self.dataframes.items():
            if name == lga_df_name:
                continue
            if 'REGION' in df.columns and 'COUNCIL' in df.columns:
                new_cols = [c for c in status.columns if c not in df.columns]
                if not new_cols:
                    continue
                merged = pd.merge(df, status[keys + new_cols], on=keys, how='left')
                self.dataframes[name] = merged
                print(f"Merged LGA Status into {name}")
        del self.dataframes[lga_df_name]

    def process_geocoding(self):
        """
        Load (or fetch) council coordinates, cluster them and merge GEO_CLUSTER.

        Coordinates come from the CSV at ``geodata_path`` when it exists,
        otherwise they are fetched and written there. GEO_CLUSTER is then
        left-merged into every frame with REGION and COUNCIL, replacing an
        existing GEO_CLUSTER column.
        """
        if os.path.exists(self.geodata_path):
            print("Loading existing geodata...")
            self.geo_data = pd.read_csv(self.geodata_path)
        else:
            print("Generating new geodata...")
            self._fetch_geodata()

        self._apply_clustering()

        keys = ['REGION', 'COUNCIL']
        # One cluster per council, otherwise the left merge would duplicate rows.
        lookup = self.geo_data[keys + ['GEO_CLUSTER']].drop_duplicates(subset=keys)

        for name, df in self.dataframes.items():
            if 'REGION' in df.columns and 'COUNCIL' in df.columns:
                if 'GEO_CLUSTER' in df.columns:
                    df = df.drop(columns=['GEO_CLUSTER'])
                merged = pd.merge(df, lookup, on=keys, how='left')
                self.dataframes[name] = merged
                print(f"Merged Geo Cluster into {name}")

    def _fetch_geodata(self):
        """
        Geocode every unique (REGION, COUNCIL) pair via Nominatim and cache the result.

        Pairs that cannot be geocoded are dropped. A failed lookup is reported
        and treated as "not found" instead of aborting the whole run.
        """
        keys = ['REGION', 'COUNCIL']
        locs = [df[keys] for df in self.dataframes.values()
                if 'REGION' in df.columns and 'COUNCIL' in df.columns]
        if not locs:
            # pd.concat([]) raises; return an empty, correctly shaped table instead.
            print("No frame has REGION/COUNCIL columns; nothing to geocode.")
            self.geo_data = pd.DataFrame(columns=keys + ['LATITUDE', 'LONGITUDE'])
            return
        unique_locs = pd.concat(locs).drop_duplicates().reset_index(drop=True)

        geolocator = Nominatim(user_agent="moest_geo_mapper_v3")
        geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1.0)

        lats, lons = [], []
        for _, row in unique_locs.iterrows():
            query = f"{row['COUNCIL']}, {row['REGION']}, Tanzania"
            try:
                loc = geocode(query)
            except Exception as exc:  # not a bare except: Ctrl-C must still interrupt
                print(f"Geocoding failed for '{query}': {exc}")
                loc = None
            lats.append(loc.latitude if loc else None)
            lons.append(loc.longitude if loc else None)

        unique_locs['LATITUDE'] = lats
        unique_locs['LONGITUDE'] = lons
        # .copy() so later column assignments do not target a slice of unique_locs.
        self.geo_data = unique_locs.dropna(subset=['LATITUDE', 'LONGITUDE']).copy()

        cache_dir = os.path.dirname(self.geodata_path)
        if cache_dir:
            os.makedirs(cache_dir, exist_ok=True)
        self.geo_data.to_csv(self.geodata_path, index=False)

    def _apply_clustering(self):
        """
        Assign a KMeans GEO_CLUSTER label to every row of ``self.geo_data``.

        Rows without coordinates are dropped first (a cached CSV may contain
        them). At most 5 clusters are used, fewer when there are fewer rows.
        """
        coords = ['LATITUDE', 'LONGITUDE']
        self.geo_data = self.geo_data.dropna(subset=coords).copy()

        n_clusters = min(5, len(self.geo_data))
        if n_clusters == 0:
            self.geo_data['GEO_CLUSTER'] = pd.Series(dtype=int)
            return

        kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        self.geo_data['GEO_CLUSTER'] = kmeans.fit_predict(self.geo_data[coords])

# =============================================================================
# CLASS 3: Feature Engineer
# =============================================================================

class FeatureEngineer:
    """
    Handles specific transformation logic for Secondary School Subjects.

    All methods are static. The pipeline applies them in the order
    melt_subjects -> merge_infrastructure -> create_lag_features ->
    create_cohort_features.
    """

    @staticmethod
    def melt_subjects(df):
        """
        Reshape wide 'FORM <n> - <SUBJECT>' columns into one row per form and subject.

        Returns a frame with the id columns present in ``df`` (YEAR, REGION,
        COUNCIL) plus ENROLLMENT (numeric), FORM_NUM (int) and SUBJECT.
        Columns that look like subject columns but carry no form digit are
        ignored instead of failing the integer conversion.
        """
        id_vars = [c for c in ['YEAR', 'REGION', 'COUNCIL'] if c in df.columns]
        form_pattern = re.compile(r'FORM (\d)')
        subject_cols = [c for c in df.columns
                        if 'FORM ' in c and ' - ' in c and form_pattern.search(c)]

        print(f"Melting {len(subject_cols)} subject columns...")
        long_df = df.melt(id_vars=id_vars, value_vars=subject_cols, var_name='RAW', value_name='ENROLLMENT')

        # Mixed-type subject columns melt into an object column; lags and growth
        # need real numbers, so anything unparseable becomes NaN.
        long_df['ENROLLMENT'] = pd.to_numeric(long_df['ENROLLMENT'], errors='coerce')
        long_df['FORM_NUM'] = long_df['RAW'].str.extract(r'FORM (\d)', expand=False).astype(int)
        long_df['SUBJECT'] = long_df['RAW'].str.split(' - ').str[1].str.strip()
        long_df.drop(columns=['RAW'], inplace=True)
        return long_df

    @staticmethod
    def _merge_council_total(df, source, value_cols, out_col, keys):
        """
        Left-merge the per-key sum of ``value_cols`` from ``source`` into ``df`` as ``out_col``.

        ``source`` is aggregated to one row per key combination before merging,
        so repeated keys cannot multiply the rows of ``df``, and ``source``
        itself is never modified. Keys missing from ``source`` yield 0.
        """
        missing = [k for k in keys if k not in source.columns]
        if missing:
            print(f"Skipping {out_col}: source table lacks {missing}.")
            return df

        values = source[value_cols].apply(pd.to_numeric, errors='coerce')
        totals = source[keys].copy()
        totals[out_col] = values.sum(axis=1)
        totals = totals.groupby(keys, as_index=False)[out_col].sum()

        if out_col in df.columns:
            df = df.drop(columns=[out_col])
        return df.merge(totals, on=keys, how='left').fillna({out_col: 0})

    @staticmethod
    def merge_infrastructure(main_df, infrastructure_dfs):
        """
        Add AVAILABLE_TABLES and TOTAL_LABS to a copy of ``main_df``.

        ``infrastructure_dfs`` may hold frames under 'tables' and 'labs';
        missing entries are skipped. TOTAL_LABS is the sum of every column whose
        name contains 'LABORATORY'. Both are matched on YEAR, REGION, COUNCIL
        and default to 0 where no match exists.
        """
        keys = ['YEAR', 'REGION', 'COUNCIL']
        df = main_df.copy()

        # Tables
        tables = infrastructure_dfs.get('tables')
        if tables is not None and 'AVAILABLE_TABLES' in tables.columns:
            df = FeatureEngineer._merge_council_total(
                df, tables, ['AVAILABLE_TABLES'], 'AVAILABLE_TABLES', keys)

        # Labs
        labs = infrastructure_dfs.get('labs')
        if labs is not None:
            lab_cols = [c for c in labs.columns if 'LABORATORY' in c]
            if lab_cols:
                df = FeatureEngineer._merge_council_total(df, labs, lab_cols, 'TOTAL_LABS', keys)

        return df

    @staticmethod
    def create_lag_features(df):
        """
        Add LAG_1, LAG_2, YOY_GROWTH and IS_ELECTION_YEAR per council/subject/form series.

        YOY_GROWTH is the growth from LAG_2 to LAG_1, i.e. it only uses values
        known before the row's own year. Deriving it from the row's ENROLLMENT
        would leak the prediction target into the features and would not match
        the growth computed at forecast time from the two lags.

        Missing numeric values (including lags without history) are set to -1.
        Text columns are left untouched so they never mix strings with the
        numeric sentinel. Returns a new, sorted frame.
        """
        df = df.sort_values(['REGION', 'COUNCIL', 'SUBJECT', 'FORM_NUM', 'YEAR'])
        g = df.groupby(['REGION', 'COUNCIL', 'SUBJECT', 'FORM_NUM'])

        df['LAG_1'] = g['ENROLLMENT'].shift(1)
        df['LAG_2'] = g['ENROLLMENT'].shift(2)
        # 1e-5 avoids division by zero when the older lag is 0.
        df['YOY_GROWTH'] = (df['LAG_1'] - df['LAG_2']) / (df['LAG_2'] + 1e-5)
        df['IS_ELECTION_YEAR'] = df['YEAR'].isin([2015, 2020, 2025, 2030]).astype(int)

        numeric_cols = df.select_dtypes(include='number').columns
        df[numeric_cols] = df[numeric_cols].fillna(-1)
        return df

    @staticmethod
    def create_cohort_features(df):
        """
        Add COHORT_LAG: enrollment of the previous form in the previous year.

        For a row (YEAR, REGION, COUNCIL, SUBJECT, FORM_NUM) the value is the
        summed ENROLLMENT of (YEAR-1, same location and subject, FORM_NUM-1),
        or -1 when no such rows exist. The column is added to ``df`` in place
        and ``df`` is returned.
        """
        group_cols = ['YEAR', 'REGION', 'COUNCIL', 'SUBJECT', 'FORM_NUM']
        totals = df.groupby(group_cols)['ENROLLMENT'].sum()

        # Tuple keys instead of '_'-joined strings: no ambiguity when a name
        # itself contains '_' and no string building per row.
        feeder_index = pd.MultiIndex.from_arrays([
            df['YEAR'] - 1, df['REGION'], df['COUNCIL'], df['SUBJECT'], df['FORM_NUM'] - 1,
        ])
        cohort = pd.Series(totals.reindex(feeder_index).to_numpy(), index=df.index)
        df['COHORT_LAG'] = cohort.fillna(-1)
        return df

# =============================================================================
# CLASS 4: Model Engine (Conditional Ensemble)
# =============================================================================

class EnrollmentModelEngine:
    """
    Manages Training, Conditional Selection, and Recursive Forecasting.

    Selection rule: use the single best model (lowest hold-out RMSE) unless
    other models have an RMSE of at most ``max_rmse_ratio`` times the best
    (default 1.4, i.e. no more than 40% worse); in that case the predictions
    of all qualifying models are averaged.
    """

    # The election flag used for training marks 2015, 2020, 2025 and 2030,
    # i.e. every fifth year; forecasts continue the same cycle.
    ELECTION_CYCLE = 5

    def __init__(self, df):
        self.df = df
        self.models = {}
        self.encoders = {}
        self.features = [
            'YEAR', 'REGION_ENC', 'COUNCIL_ENC', 'SUBJECT_ENC', 'FORM_NUM',
            'AVAILABLE_TABLES', 'TOTAL_LABS', 'LAG_1', 'LAG_2',
            'YOY_GROWTH', 'COHORT_LAG', 'IS_ELECTION_YEAR'
        ]
        self.selected_model_keys = []  # Stores keys of models to be used for inference

    def preprocess(self):
        """
        Label-encode REGION, COUNCIL and SUBJECT into *_ENC columns of ``self.df``.

        Afterwards ``self.features`` is reduced to the columns that actually
        exist, so an optional input (e.g. an infrastructure table that was not
        loaded) does not cause a KeyError during training.
        """
        self.encoders['REG'] = LabelEncoder()
        self.encoders['COU'] = LabelEncoder()
        self.encoders['SUB'] = LabelEncoder()

        self.df['REGION_ENC'] = self.encoders['REG'].fit_transform(self.df['REGION'].astype(str))
        self.df['COUNCIL_ENC'] = self.encoders['COU'].fit_transform(self.df['COUNCIL'].astype(str))
        self.df['SUBJECT_ENC'] = self.encoders['SUB'].fit_transform(self.df['SUBJECT'].astype(str))

        absent = [f for f in self.features if f not in self.df.columns]
        if absent:
            print(f"Features not present in the data and skipped: {absent}")
            self.features = [f for f in self.features if f in self.df.columns]

    def train_all_models(self, cutoff_year=2023):
        """
        Trains all 5 candidate models on rows with YEAR <= ``cutoff_year``.

        Raises:
            ValueError: if no rows fall inside the training window.
        """
        print(f"\nTraining all candidate models on Data <= {cutoff_year}...")
        train_df = self.df[self.df['YEAR'] <= cutoff_year]
        if train_df.empty:
            raise ValueError(f"No training rows with YEAR <= {cutoff_year}.")
        X = train_df[self.features]
        y = train_df['ENROLLMENT']

        # (key, display name, factory). Factories keep construction lazy so each
        # model is only built right before it is fitted.
        candidates = [
            ('XGB', "XGBoost",
             lambda: xgb.XGBRegressor(n_estimators=300, max_depth=9, learning_rate=0.05, n_jobs=-1)),
            ('LGB', "LightGBM",
             lambda: lgb.LGBMRegressor(n_estimators=500, num_leaves=50, min_child_samples=10,
                                       learning_rate=0.1, verbose=-1)),
            ('RF', "Random Forest",
             lambda: RandomForestRegressor(n_estimators=300, max_depth=12, n_jobs=-1, random_state=42)),
            ('HGB', "HistGradientBoosting",
             lambda: HistGradientBoostingRegressor(max_iter=500, learning_rate=0.1, max_depth=10)),
            # GradientBoosting is sequential and slow, hence the moderate estimator count.
            ('GB', "GradientBoosting",
             lambda: GradientBoostingRegressor(n_estimators=300, max_depth=9, learning_rate=0.05)),
        ]
        for key, label, build in candidates:
            print(f"Training {label}...")
            self.models[key] = build()
            self.models[key].fit(X, y)

        print("All models trained successfully.")

    def calculate_metrics(self, y_true, y_pred, model_name):
        """
        Print R2, MAE, RMSE, MAPE and accuracy (100 - MAPE) and return the RMSE.

        MAPE only uses rows whose true value is non-zero; it is NaN when there
        are none. Inputs may be any array-like of equal length.
        """
        # Plain arrays make the boolean mask work for Series, lists and arrays alike.
        y_true = np.asarray(y_true, dtype=float)
        y_pred = np.asarray(y_pred, dtype=float)

        mae = mean_absolute_error(y_true, y_pred)
        rmse = np.sqrt(mean_squared_error(y_true, y_pred))
        r2 = r2_score(y_true, y_pred)

        # Handle division by zero for MAPE
        mask = y_true != 0
        if mask.sum() > 0:
            mape = np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])) * 100
            accuracy = 100 - mape
        else:
            mape = np.nan
            accuracy = np.nan

        print(f"--- {model_name} Results ---")
        print(f"   > {model_name}: R2={r2:.4f}, MAE={mae:,.2f}, RMSE={rmse:,.2f}, MAPE={mape:,.1f}%, Acc={accuracy:.1f}%")
        print("-" * 30)
        return rmse

    @staticmethod
    def _select_candidates(performance, max_rmse_ratio):
        """Return (best name, RMSE threshold, names with RMSE <= threshold) for a name -> RMSE dict."""
        best_name = min(performance, key=performance.get)
        threshold = performance[best_name] * max_rmse_ratio
        qualifying = [name for name, score in performance.items() if score <= threshold]
        return best_name, threshold, qualifying

    def evaluate_and_select_strategy(self, test_year_start=2024, max_rmse_ratio=1.4):
        """
        Evaluate every trained model on YEAR >= ``test_year_start`` and pick the strategy.

        1. Find the best model (lowest RMSE).
        2. Identify models with RMSE <= best RMSE * ``max_rmse_ratio``.
        3. If only the best qualifies -> use it alone.
        4. If others qualify too -> ensemble (average of the qualifying models).

        The result is stored in ``self.selected_model_keys``. Without test rows
        the selection defaults to ['XGB'].

        Raises:
            ValueError: if test rows exist but no model has been trained.
        """
        test_df = self.df[self.df['YEAR'] >= test_year_start]
        if test_df.empty:
            print("No test data available. Defaulting to XGB only.")
            self.selected_model_keys = ['XGB']
            return

        if not self.models:
            raise ValueError("No trained models to evaluate; call train_all_models() first.")

        X_test = test_df[self.features]
        y_true = test_df['ENROLLMENT']

        performance = {}
        print(f"\n--- Evaluation Results (Test Data {test_year_start}+) ---")

        # Evaluate all
        print("Evaluating all models...")
        for name, model in self.models.items():
            preds = model.predict(X_test)
            rmse = self.calculate_metrics(y_true, preds, name)
            performance[name] = rmse
            print(f"Model: {name} | RMSE: {rmse:,.2f}")

        best_model_name, threshold, candidates = self._select_candidates(performance, max_rmse_ratio)
        print(f"\nBEST MODEL: {best_model_name} (RMSE: {performance[best_model_name]:,.2f})")
        print(f"Selection Threshold (RMSE <= {threshold:,.2f})")
        print(f"Qualifying Models: {candidates}")

        # Logic: If others exist besides best -> Ensemble. Else -> Single.
        if len(candidates) > 1:
            self.selected_model_keys = candidates
            print(f"Strategy: ENSEMBLE (Average of {', '.join(candidates)})")
        else:
            self.selected_model_keys = [best_model_name]
            print(f"Strategy: SINGLE BEST MODEL ({best_model_name})")

    def _predict(self, X):
        """
        Predict with the selected strategy (mean over the selected models).

        Raises:
            ValueError: if no model has been selected; averaging nothing would
                otherwise return NaN and silently poison the forecast.
        """
        if not self.selected_model_keys:
            raise ValueError("No model selected; call evaluate_and_select_strategy() first.")
        preds = [self.models[key].predict(X) for key in self.selected_model_keys]
        return np.mean(preds, axis=0)

    def recursive_forecast(self, start_year, end_year):
        """
        Forecast ``start_year``..``end_year`` one year at a time.

        Each year's predictions (clipped at 0) become the lag inputs of the
        next year. Rows of ``start_year - 1`` in ``self.df`` are the base.
        Returns all forecast years concatenated.

        Raises:
            ValueError: if the year range is empty or the base year has no rows.
        """
        if end_year < start_year:
            raise ValueError(f"end_year ({end_year}) is before start_year ({start_year}).")

        print(f"\nStarting Recursive Forecast ({start_year}-{end_year}) using {self.selected_model_keys}...")
        current_data = self.df[self.df['YEAR'] == (start_year - 1)].copy()
        if current_data.empty:
            raise ValueError(f"No rows for base year {start_year - 1}; cannot start the forecast.")

        future_data = []
        for year in range(start_year, end_year + 1):
            next_df = self._prepare_next_step(current_data, year)

            X_future = next_df[self.features]
            preds = self._predict(X_future)

            next_df['ENROLLMENT'] = np.maximum(0, preds)

            future_data.append(next_df)
            current_data = next_df.copy()
            print(f" > Forecasted {year}")

        return pd.concat(future_data, ignore_index=True)

    def _prepare_next_step(self, prev_df, target_year):
        """
        Build the feature rows for ``target_year`` from the previous year's rows.

        Lags are shifted by one year, YOY_GROWTH is recomputed from the two
        lags, COHORT_LAG is the previous year's enrollment of the form below
        (-1 for Form 1 or when absent) and the election flag is refreshed.
        """
        next_df = prev_df.copy()
        next_df['YEAR'] = target_year

        # Shift Lags
        next_df['LAG_2'] = next_df['LAG_1']
        next_df['LAG_1'] = next_df['ENROLLMENT']

        # Update Growth. A negative lag is the -1 "no history" sentinel; growth
        # computed from it would be meaningless, so it stays at the sentinel.
        growth = (next_df['LAG_1'] - next_df['LAG_2']) / (next_df['LAG_2'] + 1e-5)
        undefined = (next_df['LAG_1'] < 0) | (next_df['LAG_2'] < 0)
        next_df['YOY_GROWTH'] = growth.mask(undefined, -1)

        # Update Cohort Logic: look up (same location and subject, form - 1) in
        # the previous year. The last row wins when a key is repeated.
        keys = ['REGION', 'COUNCIL', 'SUBJECT', 'FORM_NUM']
        lookup = prev_df.drop_duplicates(subset=keys, keep='last').set_index(keys)['ENROLLMENT']
        feeder_form = next_df['FORM_NUM'] - 1
        feeder_index = pd.MultiIndex.from_arrays([
            next_df['REGION'], next_df['COUNCIL'], next_df['SUBJECT'], feeder_form,
        ])
        cohort = pd.Series(lookup.reindex(feeder_index).to_numpy(), index=next_df.index)
        next_df['COHORT_LAG'] = cohort.where(feeder_form >= 1).fillna(-1)

        next_df['IS_ELECTION_YEAR'] = int(target_year % self.ELECTION_CYCLE == 0)

        return next_df


# =============================================================================
# MAIN PIPELINE
# =============================================================================

def main():
    """Run the whole pipeline: load CSVs, engineer features, train, select a strategy, forecast."""
    # --- Configuration ---------------------------------------------------
    csv_dir = '/content/drive/MyDrive/GUIDELINES_TSC_JAN2026/Data Set/csvs/'
    geodata_csv = '/content/drive/MyDrive/MOEST/tanzania_council_geodata.csv'
    skip_keywords = ['Primary', 'Textbooks', 'Population', 'Teacher', 'COBET', 'Vocational']

    # --- Load ------------------------------------------------------------
    data_loader = MOESTDataLoader(csv_dir, skip_keywords)
    data_loader.mount_drive()
    data_loader.load_data()
    frames = data_loader.get_all_dataframes()

    # --- Location clean-up (works on the same dict of frames) ------------
    locations = LocationManager(frames, geodata_csv)
    locations.standardize_location_columns()
    locations.merge_lga_status()

    # --- Feature engineering ---------------------------------------------
    print("\n--- Starting Feature Engineering ---")
    subject_frame = frames.get("Secondary_students_per_subject")
    tables_frame = frames.get("Data-Secondary Tables and chairs 2016-2025")
    labs_frame = frames.get("Combined_Secondary_Laboratories_All_G_NG")

    # The subject table is the only input the pipeline cannot do without.
    if subject_frame is None:
        raise ValueError("Critical DataFrame 'Secondary_students_per_subject' not found.")

    engineer = FeatureEngineer()
    infrastructure = {'tables': tables_frame, 'labs': labs_frame}
    features_df = engineer.melt_subjects(subject_frame)
    features_df = engineer.merge_infrastructure(features_df, infrastructure)
    features_df = engineer.create_lag_features(features_df)
    features_df = engineer.create_cohort_features(features_df)

    # --- Modelling ---------------------------------------------------------
    print("\n--- Starting Model Engine ---")
    engine = EnrollmentModelEngine(features_df)
    engine.preprocess()

    # Fit on years up to 2023; 2024 onwards is held out to choose the strategy.
    engine.train_all_models(cutoff_year=2023)
    engine.evaluate_and_select_strategy(test_year_start=2024)

    # --- Forecast: 2025 rows are the base history for 2026-2030 -------------
    forecast_df = engine.recursive_forecast(2026, 2030)

    # --- Output --------------------------------------------------------------
    print("\n--- Final Forecast Sample ---")
    shown_columns = ['YEAR', 'REGION', 'COUNCIL', 'SUBJECT', 'FORM_NUM', 'ENROLLMENT']
    preview = forecast_df[shown_columns].copy()
    preview['ENROLLMENT'] = preview['ENROLLMENT'].round(0).astype(int)
    print(preview.head(10))

# Run the pipeline only when this module is executed as the main program,
# not when it is imported.
if __name__ == "__main__":
    main()

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
Found 14 files to load.
Loaded: Data-Secondary Enrollment 2016-2025
Loaded: Dropout-Secondary  2017-2024
Loaded: Data-Secondary Tables and chairs 2016-2025
Loaded: Secondary-Re_entry
Loaded: Secondary - DISABALITY 2020-2025
Loaded: LGAs Urban and Rural Status
Loaded: Combined_Secondary_Laboratories_Govt
Loaded: Combined_Secondary_Laboratories_All_G_NG
Loaded: Combined_Secondary_ICT_All_G_NG
Loaded: Combined_Secondary_ICT_Govt
Loaded: Combined_Secondary_Electricity_All_G_NG
Loaded: Combined_Secondary_Electricity_Govt
Loaded: Secondary_students_per_subject
Loaded: Secondary_enrollment_Gov_2016_2025
Merged LGA Status into Data-Secondary Enrollment 2016-2025
Merged LGA Status into Dropout-Secondary  2017-2024
Merged LGA Status into Data-Secondary Tables and chairs 2016-2025
Merged LGA Status into Secondary-Re_entry
Merged LGA Status into Secondary - DISABALITY 

In [None]:
# -*- coding: utf-8 -*-
"""Refactored MoEST Modeling for Secondary School Enrollment
"""

import os
import re
import numpy as np
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import (
    RandomForestRegressor,
    HistGradientBoostingRegressor,
    GradientBoostingRegressor
)
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
from google.colab import drive

# =============================================================================
# CLASS 1: Data Loader & Cleaner
# =============================================================================

class MOESTDataLoader:
    """
    Loads the CSV exports found in one directory and applies first-pass cleaning.

    Cleaned frames are stored in ``self.dataframes`` keyed by the file name
    without its extension. Files whose name contains any of
    ``exclude_keywords`` (case-insensitive) are skipped.
    """

    # A text column holding comma-formatted values is converted to numbers only
    # when at least this share of its non-missing values parse. Converting on
    # "any value parses" would blank out free-text columns that merely contain
    # a comma and the odd numeric-looking cell.
    NUMERIC_PARSE_RATIO = 0.5

    def __init__(self, base_directory, exclude_keywords=None):
        self.base_directory = base_directory
        self.exclude_keywords = exclude_keywords if exclude_keywords else []
        self.dataframes = {}

    def mount_drive(self):
        """Mount Google Drive at /content/drive/ (requires google.colab)."""
        drive.mount('/content/drive/')

    def get_file_list(self):
        """
        Return the CSV file names in ``base_directory`` that pass the keyword filter.

        The extension match is case-insensitive and the result is sorted so the
        load order does not depend on the order returned by ``os.listdir``.
        """
        excluded = [keyword.lower() for keyword in self.exclude_keywords]
        selected = []
        for file_name in sorted(os.listdir(self.base_directory)):
            lowered = file_name.lower()
            if not lowered.endswith('.csv'):
                continue
            if any(keyword in lowered for keyword in excluded):
                continue
            selected.append(file_name)
        return selected

    def load_data(self):
        """
        Read every selected CSV, clean it and store it in ``self.dataframes``.

        A file that fails to load is reported and skipped (best effort); the
        remaining files are still loaded.
        """
        files = self.get_file_list()
        print(f"Found {len(files)} files to load.")
        for file_name in files:
            file_path = os.path.join(self.base_directory, file_name)
            # splitext removes only the trailing extension; str.replace would
            # also strip a '.csv' occurring in the middle of the name.
            df_name = os.path.splitext(file_name)[0]
            try:
                df = pd.read_csv(file_path)
                self.dataframes[df_name] = self._initial_clean(df)
                print(f"Loaded: {df_name}")
            except Exception as e:
                print(f"Error loading {file_name}: {e}")

    def _initial_clean(self, df):
        """
        Standardize headers and clean numeric columns (modifies ``df`` in place).

        - Headers are stripped and upper-cased.
        - Text columns containing commas are converted to numbers (commas
          removed) when enough of their values parse; see NUMERIC_PARSE_RATIO.
        - 'UNNAMED' columns that are more than 90% null are dropped.

        Returns the same DataFrame object.
        """
        df.columns = [str(col).strip().upper() for col in df.columns]

        # 'string' is included so columns stored with pandas' dedicated string
        # dtype are treated the same as plain object columns.
        for col in df.select_dtypes(include=['object', 'string']).columns:
            as_text = df[col].astype(str)
            if not as_text.str.contains(',', regex=False).any():
                continue
            converted = pd.to_numeric(as_text.str.replace(',', '', regex=False), errors='coerce')
            non_missing = df[col].notna().sum()
            if non_missing and converted.notna().sum() / non_missing >= self.NUMERIC_PARSE_RATIO:
                df[col] = converted

        unnamed = [c for c in df.columns if 'UNNAMED' in c]
        to_drop = [c for c in unnamed if df[c].isnull().mean() > 0.9]
        df.drop(columns=to_drop, inplace=True)
        return df

    def get_dataframe(self, name):
        """Return the cleaned frame stored under ``name``, or None if absent."""
        return self.dataframes.get(name)

    def get_all_dataframes(self):
        """Return the internal name -> DataFrame dict (not a copy)."""
        return self.dataframes

    def inject_dataframe(self, name, df):
        """Clean ``df`` (in place) like a loaded CSV and store it under ``name``."""
        self.dataframes[name] = self._initial_clean(df)

# =============================================================================
# CLASS 2: Geography & Location Manager
# =============================================================================

class LocationManager:
    """
    Handles Geocoding, LGA Status merging, and Clustering.

    Operates on the shared name -> DataFrame dict passed to the constructor;
    merged frames replace the entries of that dict.
    """

    # Accepted spellings of the location columns, canonical name first.
    REGION_ALIASES = ('REGION', 'REGON')
    COUNCIL_ALIASES = ('COUNCIL', 'DISTRICT', 'LGA NAME')

    def __init__(self, dataframes, geodata_path):
        self.dataframes = dataframes
        self.geodata_path = geodata_path
        self.geo_data = None
        self.lga_status_df = None

    def standardize_location_columns(self):
        """
        Rename location columns to REGION / COUNCIL and upper-case their values.

        Only frames that have both a region and a council column are touched
        (in place). When a frame carries several aliases, the canonical name
        wins, so the rename can never create two columns with the same name.
        """
        for name, df in self.dataframes.items():
            reg_col = next((a for a in self.REGION_ALIASES if a in df.columns), None)
            cou_col = next((a for a in self.COUNCIL_ALIASES if a in df.columns), None)

            if reg_col and cou_col:
                df.rename(columns={reg_col: 'REGION', cou_col: 'COUNCIL'}, inplace=True)
                df['REGION'] = df['REGION'].astype(str).str.upper()
                df['COUNCIL'] = df['COUNCIL'].astype(str).str.upper()

    def merge_lga_status(self, lga_df_name='LGAs Urban and Rural Status'):
        """
        Left-merge the LGA status table into every frame with REGION and COUNCIL.

        The status table drops REMARKS, renames CLASSIFICATION to LGA_STATUS and
        is reduced to one row per (REGION, COUNCIL) so the merge cannot multiply
        rows. Columns a frame already has are not merged again. On success the
        status table is removed from ``self.dataframes``.
        """
        if lga_df_name not in self.dataframes:
            print("LGA Status DataFrame not found.")
            return

        keys = ['REGION', 'COUNCIL']
        status = self.dataframes[lga_df_name].copy()
        if 'REMARKS' in status.columns:
            status.drop(columns=['REMARKS'], inplace=True)
        status.rename(columns={'CLASSIFICATION': 'LGA_STATUS'}, inplace=True)

        if any(k not in status.columns for k in keys):
            print("LGA Status DataFrame has no REGION/COUNCIL columns; merge skipped.")
            return

        # First row wins when a council is listed more than once.
        status = status.drop_duplicates(subset=keys).reset_index(drop=True)
        self.lga_status_df = status

        for name, df in self.dataframes.items():
            if name == lga_df_name:
                continue
            if 'REGION' in df.columns and 'COUNCIL' in df.columns:
                new_cols = [c for c in status.columns if c not in df.columns]
                if not new_cols:
                    continue
                merged = pd.merge(df, status[keys + new_cols], on=keys, how='left')
                self.dataframes[name] = merged
                print(f"Merged LGA Status into {name}")
        del self.dataframes[lga_df_name]

    def process_geocoding(self):
        """
        Load (or fetch) council coordinates, cluster them and merge GEO_CLUSTER.

        Coordinates come from the CSV at ``geodata_path`` when it exists,
        otherwise they are fetched and written there. GEO_CLUSTER is then
        left-merged into every frame with REGION and COUNCIL, replacing an
        existing GEO_CLUSTER column.
        """
        if os.path.exists(self.geodata_path):
            print("Loading existing geodata...")
            self.geo_data = pd.read_csv(self.geodata_path)
        else:
            print("Generating new geodata...")
            self._fetch_geodata()

        self._apply_clustering()

        keys = ['REGION', 'COUNCIL']
        # One cluster per council, otherwise the left merge would duplicate rows.
        lookup = self.geo_data[keys + ['GEO_CLUSTER']].drop_duplicates(subset=keys)

        for name, df in self.dataframes.items():
            if 'REGION' in df.columns and 'COUNCIL' in df.columns:
                if 'GEO_CLUSTER' in df.columns:
                    df = df.drop(columns=['GEO_CLUSTER'])
                merged = pd.merge(df, lookup, on=keys, how='left')
                self.dataframes[name] = merged
                print(f"Merged Geo Cluster into {name}")

    def _fetch_geodata(self):
        """
        Geocode every unique (REGION, COUNCIL) pair via Nominatim and cache the result.

        Pairs that cannot be geocoded are dropped. A failed lookup is reported
        and treated as "not found" instead of aborting the whole run.
        """
        keys = ['REGION', 'COUNCIL']
        locs = [df[keys] for df in self.dataframes.values()
                if 'REGION' in df.columns and 'COUNCIL' in df.columns]
        if not locs:
            # pd.concat([]) raises; return an empty, correctly shaped table instead.
            print("No frame has REGION/COUNCIL columns; nothing to geocode.")
            self.geo_data = pd.DataFrame(columns=keys + ['LATITUDE', 'LONGITUDE'])
            return
        unique_locs = pd.concat(locs).drop_duplicates().reset_index(drop=True)

        geolocator = Nominatim(user_agent="moest_geo_mapper_v3")
        geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1.0)

        lats, lons = [], []
        for _, row in unique_locs.iterrows():
            query = f"{row['COUNCIL']}, {row['REGION']}, Tanzania"
            try:
                loc = geocode(query)
            except Exception as exc:  # not a bare except: Ctrl-C must still interrupt
                print(f"Geocoding failed for '{query}': {exc}")
                loc = None
            lats.append(loc.latitude if loc else None)
            lons.append(loc.longitude if loc else None)

        unique_locs['LATITUDE'] = lats
        unique_locs['LONGITUDE'] = lons
        # .copy() so later column assignments do not target a slice of unique_locs.
        self.geo_data = unique_locs.dropna(subset=['LATITUDE', 'LONGITUDE']).copy()

        cache_dir = os.path.dirname(self.geodata_path)
        if cache_dir:
            os.makedirs(cache_dir, exist_ok=True)
        self.geo_data.to_csv(self.geodata_path, index=False)

    def _apply_clustering(self):
        """
        Assign a KMeans GEO_CLUSTER label to every row of ``self.geo_data``.

        Rows without coordinates are dropped first (a cached CSV may contain
        them). At most 5 clusters are used, fewer when there are fewer rows.
        """
        coords = ['LATITUDE', 'LONGITUDE']
        self.geo_data = self.geo_data.dropna(subset=coords).copy()

        n_clusters = min(5, len(self.geo_data))
        if n_clusters == 0:
            self.geo_data['GEO_CLUSTER'] = pd.Series(dtype=int)
            return

        kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        self.geo_data['GEO_CLUSTER'] = kmeans.fit_predict(self.geo_data[coords])

# =============================================================================
# CLASS 3: Feature Engineer
# =============================================================================

class FeatureEngineer:
    """
    Handles specific transformation logic for Secondary School Subjects.

    All methods are static. The pipeline applies them in the order
    melt_subjects -> merge_infrastructure -> create_lag_features ->
    create_cohort_features.
    """

    @staticmethod
    def melt_subjects(df):
        """
        Reshape wide 'FORM <n> - <SUBJECT>' columns into one row per form and subject.

        Returns a frame with the id columns present in ``df`` (YEAR, REGION,
        COUNCIL) plus ENROLLMENT (numeric), FORM_NUM (int) and SUBJECT.
        Columns that look like subject columns but carry no form digit are
        ignored instead of failing the integer conversion.
        """
        id_vars = [c for c in ['YEAR', 'REGION', 'COUNCIL'] if c in df.columns]
        form_pattern = re.compile(r'FORM (\d)')
        subject_cols = [c for c in df.columns
                        if 'FORM ' in c and ' - ' in c and form_pattern.search(c)]

        print(f"Melting {len(subject_cols)} subject columns...")
        long_df = df.melt(id_vars=id_vars, value_vars=subject_cols, var_name='RAW', value_name='ENROLLMENT')

        # Mixed-type subject columns melt into an object column; lags and growth
        # need real numbers, so anything unparseable becomes NaN.
        long_df['ENROLLMENT'] = pd.to_numeric(long_df['ENROLLMENT'], errors='coerce')
        long_df['FORM_NUM'] = long_df['RAW'].str.extract(r'FORM (\d)', expand=False).astype(int)
        long_df['SUBJECT'] = long_df['RAW'].str.split(' - ').str[1].str.strip()
        long_df.drop(columns=['RAW'], inplace=True)
        return long_df

    @staticmethod
    def _merge_council_total(df, source, value_cols, out_col, keys):
        """
        Left-merge the per-key sum of ``value_cols`` from ``source`` into ``df`` as ``out_col``.

        ``source`` is aggregated to one row per key combination before merging,
        so repeated keys cannot multiply the rows of ``df``, and ``source``
        itself is never modified. Keys missing from ``source`` yield 0.
        """
        missing = [k for k in keys if k not in source.columns]
        if missing:
            print(f"Skipping {out_col}: source table lacks {missing}.")
            return df

        values = source[value_cols].apply(pd.to_numeric, errors='coerce')
        totals = source[keys].copy()
        totals[out_col] = values.sum(axis=1)
        totals = totals.groupby(keys, as_index=False)[out_col].sum()

        if out_col in df.columns:
            df = df.drop(columns=[out_col])
        return df.merge(totals, on=keys, how='left').fillna({out_col: 0})

    @staticmethod
    def _numeric_measure_columns(source, keys, out_col):
        """
        Return the numeric columns of ``source`` that are real measures.

        Excluded are the merge keys, leftover index columns ('UNNAMED...';
        headers are upper-cased when frames are loaded), the GEO_CLUSTER label
        that geocoding adds to frames, and ``out_col`` itself.
        """
        skipped = set(keys) | {out_col, 'GEO_CLUSTER'}
        return [c for c in source.columns
                if c not in skipped
                and 'UNNAMED' not in str(c).upper()
                and pd.api.types.is_numeric_dtype(source[c])]

    @staticmethod
    def merge_infrastructure(main_df, auxiliary_dfs):
        """
        Merges Tables, Labs, ICT, Electricity, Dropout, Reentry, Disability.

        ``auxiliary_dfs`` maps 'tables', 'labs', 'ict', 'electricity',
        'reentry', 'disability' and 'dropout' to frames; missing entries are
        skipped. Each source is matched on YEAR, REGION, COUNCIL and adds one
        column to a copy of ``main_df`` (0 where no match exists). The source
        frames are not modified.
        """
        keys = ['YEAR', 'REGION', 'COUNCIL']
        df = main_df.copy()
        merge_total = FeatureEngineer._merge_council_total

        # 1. Tables
        tables = auxiliary_dfs.get('tables')
        if tables is not None and 'AVAILABLE_TABLES' in tables.columns:
            df = merge_total(df, tables, ['AVAILABLE_TABLES'], 'AVAILABLE_TABLES', keys)

        # 2. Labs (Total)
        labs = auxiliary_dfs.get('labs')
        if labs is not None:
            lab_cols = [c for c in labs.columns if 'LABORATORY' in c]
            if lab_cols:
                df = merge_total(df, labs, lab_cols, 'TOTAL_LABS', keys)

        # 3. ICT (Computers): every column whose name contains 'COMPUTER'
        ict = auxiliary_dfs.get('ict')
        if ict is not None:
            comp_cols = [c for c in ict.columns if 'COMPUTER' in c]
            if comp_cols:
                df = merge_total(df, ict, comp_cols, 'TOTAL_COMPUTERS', keys)

        # 4. Electricity: first column naming TANESCO or GRID
        elec = auxiliary_dfs.get('electricity')
        if elec is not None:
            grid_col = next((c for c in elec.columns if 'TANESCO' in c or 'GRID' in c), None)
            if grid_col and all(k in elec.columns for k in keys):
                # Keep the first row per key rather than summing.
                # NOTE(review): the target name suggests a percentage - confirm.
                grid = elec[keys + [grid_col]].drop_duplicates(subset=keys)
                if 'ELEC_GRID_PCT' in df.columns:
                    df = df.drop(columns=['ELEC_GRID_PCT'])
                df = df.merge(grid, on=keys, how='left').fillna({grid_col: 0})
                df.rename(columns={grid_col: 'ELEC_GRID_PCT'}, inplace=True)

        # 5. Re-entry
        reentry = auxiliary_dfs.get('reentry')
        if reentry is not None:
            re_cols = [c for c in reentry.columns if 'RE-ENROLLED' in c]
            if re_cols:
                df = merge_total(df, reentry, re_cols, 'TOTAL_REENTRY', keys)

        # 6. Disability: sum of all numeric measure columns
        disability = auxiliary_dfs.get('disability')
        if disability is not None:
            dis_cols = FeatureEngineer._numeric_measure_columns(disability, keys, 'TOTAL_DISABLED')
            if dis_cols:
                df = merge_total(df, disability, dis_cols, 'TOTAL_DISABLED', keys)

        # 7. Dropout (Aggregated): sum of all numeric measure columns
        dropout = auxiliary_dfs.get('dropout')
        if dropout is not None:
            drop_cols = FeatureEngineer._numeric_measure_columns(dropout, keys, 'TOTAL_DROPOUT')
            if drop_cols:
                df = merge_total(df, dropout, drop_cols, 'TOTAL_DROPOUT', keys)

        return df

    @staticmethod
    def create_lag_features(df):
        """
        Add LAG_1, LAG_2, YOY_GROWTH and IS_ELECTION_YEAR per council/subject/form series.

        YOY_GROWTH is the growth from LAG_2 to LAG_1, i.e. it only uses values
        known before the row's own year. Deriving it from the row's ENROLLMENT
        would leak the prediction target into the features.

        Missing numeric values (including lags without history) are set to -1.
        Text columns are left untouched so they never mix strings with the
        numeric sentinel. Returns a new, sorted frame.
        """
        df = df.sort_values(['REGION', 'COUNCIL', 'SUBJECT', 'FORM_NUM', 'YEAR'])
        g = df.groupby(['REGION', 'COUNCIL', 'SUBJECT', 'FORM_NUM'])

        df['LAG_1'] = g['ENROLLMENT'].shift(1)
        df['LAG_2'] = g['ENROLLMENT'].shift(2)
        # 1e-5 avoids division by zero when the older lag is 0.
        df['YOY_GROWTH'] = (df['LAG_1'] - df['LAG_2']) / (df['LAG_2'] + 1e-5)
        df['IS_ELECTION_YEAR'] = df['YEAR'].isin([2015, 2020, 2025, 2030]).astype(int)

        numeric_cols = df.select_dtypes(include='number').columns
        df[numeric_cols] = df[numeric_cols].fillna(-1)
        return df

    @staticmethod
    def create_cohort_features(df):
        """
        Add COHORT_LAG: enrollment of the previous form in the previous year.

        For a row (YEAR, REGION, COUNCIL, SUBJECT, FORM_NUM) the value is the
        summed ENROLLMENT of (YEAR-1, same location and subject, FORM_NUM-1),
        or -1 when no such rows exist. The column is added to ``df`` in place
        and ``df`` is returned.
        """
        group_cols = ['YEAR', 'REGION', 'COUNCIL', 'SUBJECT', 'FORM_NUM']
        totals = df.groupby(group_cols)['ENROLLMENT'].sum()

        # Tuple keys instead of '_'-joined strings: no ambiguity when a name
        # itself contains '_' and no string building per row.
        feeder_index = pd.MultiIndex.from_arrays([
            df['YEAR'] - 1, df['REGION'], df['COUNCIL'], df['SUBJECT'], df['FORM_NUM'] - 1,
        ])
        cohort = pd.Series(totals.reindex(feeder_index).to_numpy(), index=df.index)
        df['COHORT_LAG'] = cohort.fillna(-1)
        return df

# =============================================================================
# CLASS 4: Model Engine (Conditional Ensemble)
# =============================================================================

class EnrollmentModelEngine:
    """
    Manages Training, Conditional Selection, and Recursive Forecasting.
    Implements logic: Use Best Model ONLY unless others are within 60% performance
    (test RMSE <= 1.6 x the best RMSE), in which case the qualifying models are averaged.
    """

    # Candidate model inputs in training order; only the ones present in the
    # DataFrame are actually used (see _available_features).
    CANDIDATE_FEATURES = (
        'YEAR', 'REGION_ENC', 'COUNCIL_ENC', 'SUBJECT_ENC', 'FORM_NUM',
        'AVAILABLE_TABLES', 'TOTAL_LABS', 'TOTAL_COMPUTERS', 'ELEC_GRID_PCT',
        'TOTAL_REENTRY', 'TOTAL_DISABLED', 'TOTAL_DROPOUT',
        'LAG_1', 'LAG_2', 'YOY_GROWTH', 'COHORT_LAG', 'IS_ELECTION_YEAR'
    )
    # Same years the feature-engineering step flags, so training and
    # forecasting agree on IS_ELECTION_YEAR.
    ELECTION_YEARS = (2015, 2020, 2025, 2030)
    # Sentinel the feature-engineering step writes where a value is unavailable.
    MISSING = -1

    def __init__(self, df):
        """
        Args:
            df: Engineered long-format DataFrame; it is stored by reference and
                preprocess() adds the encoded columns to it.
        """
        self.df = df
        self.models = {}
        self.encoders = {}
        self.features = self._available_features()
        self.selected_model_keys = []  # Stores keys of models to be used for inference

    def _available_features(self):
        """Returns the candidate features that currently exist in self.df."""
        return [f for f in self.CANDIDATE_FEATURES if f in self.df.columns]

    def preprocess(self):
        """Label-encodes REGION, COUNCIL and SUBJECT and refreshes the feature list."""
        for key, source, target in (
            ('REG', 'REGION', 'REGION_ENC'),
            ('COU', 'COUNCIL', 'COUNCIL_ENC'),
            ('SUB', 'SUBJECT', 'SUBJECT_ENC'),
        ):
            encoder = LabelEncoder()
            self.df[target] = encoder.fit_transform(self.df[source].astype(str))
            self.encoders[key] = encoder

        # The *_ENC columns did not exist when __init__ filtered the feature
        # list, so without this refresh they would never reach the models.
        self.features = self._available_features()

    def train_all_models(self, cutoff_year=2023):
        """
        Trains all 5 candidate models on rows with YEAR <= cutoff_year.

        Raises:
            ValueError: if no row satisfies YEAR <= cutoff_year.
        """
        print(f"\nTraining all candidate models on Data <= {cutoff_year}...")
        train_df = self.df[self.df['YEAR'] <= cutoff_year]
        if train_df.empty:
            raise ValueError(f"No training rows with YEAR <= {cutoff_year}.")
        X = train_df[self.features]
        y = train_df['ENROLLMENT']

        # (model key, label used in the progress message, factory)
        candidates = (
            ('XGB', 'XGBoost',
             lambda: xgb.XGBRegressor(n_estimators=300, max_depth=9, learning_rate=0.05, n_jobs=-1)),
            ('LGB', 'LightGBM',
             lambda: lgb.LGBMRegressor(n_estimators=500, num_leaves=50, min_child_samples=10,
                                       learning_rate=0.1, verbose=-1)),
            ('RF', 'Random Forest',
             lambda: RandomForestRegressor(n_estimators=300, max_depth=12, n_jobs=-1, random_state=42)),
            ('HGB', 'HistGradientBoosting',
             lambda: HistGradientBoostingRegressor(max_iter=500, learning_rate=0.1, max_depth=10)),
            ('GB', 'GradientBoosting',
             lambda: GradientBoostingRegressor(n_estimators=300, max_depth=9, learning_rate=0.05)),
        )
        for key, label, build in candidates:
            print(f"Training {label}...")
            self.models[key] = build()
            self.models[key].fit(X, y)

        print("All models trained successfully.")

    def calculate_metrics(self, y_true, y_pred, model_name):
        """
        Prints R2, MAE, RMSE, MAPE and accuracy (100 - MAPE) for one model.

        MAPE is computed over rows whose actual value is non-zero; it is NaN
        when there are none.

        Returns:
            The RMSE, which is the score used for model selection.
        """
        # Plain arrays avoid mixing a pandas index with positional predictions.
        y_true = np.asarray(y_true, dtype=float)
        y_pred = np.asarray(y_pred, dtype=float)

        mae = mean_absolute_error(y_true, y_pred)
        rmse = np.sqrt(mean_squared_error(y_true, y_pred))
        r2 = r2_score(y_true, y_pred)

        nonzero = y_true != 0
        if nonzero.any():
            mape = np.mean(np.abs((y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero])) * 100
            accuracy = 100 - mape
        else:
            mape = np.nan
            accuracy = np.nan

        print(f"--- {model_name} Results ---")
        print(f"   > {model_name}: R2={r2:.4f}, MAE={mae:,.2f}, RMSE={rmse:,.2f}, MAPE={mape:,.1f}%, Acc={accuracy:.1f}%")
        print("-" * 30)
        return rmse

    def evaluate_and_select_strategy(self, test_year_start=2024, tolerance=0.6):
        """
        Scores every trained model on rows with YEAR >= test_year_start and
        fills self.selected_model_keys.

        Args:
            test_year_start: First year of the hold-out period.
            tolerance: A model qualifies for the ensemble when its RMSE is at
                most (1 + tolerance) x the best RMSE. Default 0.6 (60%).

        Raises:
            ValueError: if test data exists but no model has been trained.
        """
        test_df = self.df[self.df['YEAR'] >= test_year_start]
        if test_df.empty:
            print("No test data available. Defaulting to XGB only.")
            self.selected_model_keys = ['XGB']
            return
        if not self.models:
            raise ValueError("No trained models to evaluate; call train_all_models() first.")

        X_test = test_df[self.features]
        y_true = test_df['ENROLLMENT']

        performance = {}
        print(f"\n--- Evaluation Results (Test Data {test_year_start}+) ---")

        print("Evaluating all models...")
        for name, model in self.models.items():
            rmse = self.calculate_metrics(y_true, model.predict(X_test), name)
            performance[name] = rmse
            print(f"Model: {name} | RMSE: {rmse:,.2f}")

        best_model_name = min(performance, key=performance.get)
        best_rmse = performance[best_model_name]
        print(f"\nBEST MODEL: {best_model_name} (RMSE: {best_rmse:,.2f})")

        # Threshold: Best RMSE + tolerance (default 60%) of Best RMSE
        threshold = best_rmse * (1 + tolerance)
        candidates = [name for name, score in performance.items() if score <= threshold]

        print(f"Selection Threshold (RMSE <= {threshold:,.2f})")
        print(f"Qualifying Models: {candidates}")

        if len(candidates) > 1:
            self.selected_model_keys = candidates
            print(f"Strategy: ENSEMBLE (Average of {', '.join(candidates)})")
        else:
            self.selected_model_keys = [best_model_name]
            print(f"Strategy: SINGLE BEST MODEL ({best_model_name})")

    def _predict(self, X):
        """
        Returns the element-wise mean prediction of the selected models.

        Raises:
            ValueError: if no model has been selected yet (averaging an empty
                list would silently yield NaN).
        """
        if not self.selected_model_keys:
            raise ValueError("No model selected; call evaluate_and_select_strategy() first.")
        stacked = [self.models[key].predict(X) for key in self.selected_model_keys]
        return np.mean(stacked, axis=0)

    def recursive_forecast(self, start_year, end_year):
        """
        Forecasts start_year..end_year one year at a time, feeding each year's
        predictions back in as the next year's lag features.

        Returns:
            One DataFrame with all forecast years concatenated; ENROLLMENT holds
            the predictions, floored at 0.

        Raises:
            ValueError: if self.df has no rows for start_year - 1 to start from.
        """
        print(f"\nStarting Recursive Forecast ({start_year}-{end_year}) using {self.selected_model_keys}...")
        future_data = []
        current_data = self.df[self.df['YEAR'] == (start_year - 1)].copy()
        if current_data.empty:
            raise ValueError(f"No rows for base year {start_year - 1}; cannot start the forecast.")

        for year in range(start_year, end_year + 1):
            next_df = self._prepare_next_step(current_data, year)

            preds = self._predict(next_df[self.features])
            next_df['ENROLLMENT'] = np.maximum(0, preds)

            future_data.append(next_df)
            current_data = next_df.copy()
            print(f" > Forecasted {year}")

        return pd.concat(future_data, ignore_index=True)

    def _prepare_next_step(self, prev_df, target_year):
        """
        Builds the feature rows for target_year from the previous year's rows.

        All columns not rewritten here are carried forward unchanged from
        prev_df, which itself is left untouched.
        """
        next_df = prev_df.copy()
        next_df['YEAR'] = target_year

        next_df['LAG_2'] = next_df['LAG_1']
        next_df['LAG_1'] = next_df['ENROLLMENT']

        # A negative lag is the missing-value sentinel, not an enrollment count;
        # dividing by it would yield a huge bogus growth figure, so such rows get
        # the sentinel as well.
        growth = (next_df['LAG_1'] - next_df['LAG_2']) / (next_df['LAG_2'] + 1e-5)
        has_history = (next_df['LAG_1'] >= 0) & (next_df['LAG_2'] >= 0)
        next_df['YOY_GROWTH'] = growth.where(has_history, self.MISSING)

        # Cohort feeder: previous year's enrollment one form lower. Duplicated
        # keys are summed, as in the training-time cohort feature.
        key_cols = ['REGION', 'COUNCIL', 'SUBJECT', 'FORM_NUM']
        cohort_totals = prev_df.groupby(key_cols)['ENROLLMENT'].sum().to_dict()
        feeder_keys = zip(next_df['REGION'], next_df['COUNCIL'],
                          next_df['SUBJECT'], next_df['FORM_NUM'] - 1)
        next_df['COHORT_LAG'] = np.asarray(
            [self.MISSING if key[3] < 1 else cohort_totals.get(key, self.MISSING)
             for key in feeder_keys],
            dtype=float,
        )
        next_df['IS_ELECTION_YEAR'] = int(target_year in self.ELECTION_YEARS)

        return next_df


# =============================================================================
# MAIN PIPELINE
# =============================================================================

def main():
    """
    Runs the whole pipeline: load the CSVs, standardise locations, engineer
    features, train and select models, forecast 2026-2030 and print a sample.
    """
    # 1. Configuration
    base_dir = '/content/drive/MyDrive/GUIDELINES_TSC_JAN2026/Data Set/csvs/'
    geo_file = '/content/drive/MyDrive/MOEST/tanzania_council_geodata.csv'
    exclude_keywords = ['Primary', 'Textbooks', 'Population', 'Teacher', 'COBET', 'Vocational']

    # Role in merge_infrastructure -> name of the loaded dataset that fills it
    auxiliary_sources = {
        'tables': "Data-Secondary Tables and chairs 2016-2025",
        'labs': "Combined_Secondary_Laboratories_All_G_NG",
        'dropout': "Dropout-Secondary  2017-2024",
        'reentry': "Secondary-Re_entry",
        'disability': "Secondary - DISABALITY 2020-2025",
        'ict': "Combined_Secondary_ICT_All_G_NG",
        'electricity': "Combined_Secondary_Electricity_All_G_NG",
    }

    # 2. Load Data
    data_loader = MOESTDataLoader(base_dir, exclude_keywords)
    data_loader.mount_drive()
    data_loader.load_data()
    frames = data_loader.get_all_dataframes()

    # 3. Location & Clean Up
    locations = LocationManager(frames, geo_file)
    locations.standardize_location_columns()
    locations.merge_lga_status()

    # 4. Feature Engineering
    print("\n--- Starting Feature Engineering ---")

    subject_frame = frames.get("Secondary_students_per_subject")
    # Missing auxiliary datasets stay None; only the subject table is mandatory.
    auxiliary_frames = {
        role: frames.get(dataset) for role, dataset in auxiliary_sources.items()
    }

    if subject_frame is None:
        raise ValueError("Critical DataFrame 'Secondary_students_per_subject' not found.")

    feature_builder = FeatureEngineer()
    features_df = feature_builder.melt_subjects(subject_frame)
    features_df = feature_builder.merge_infrastructure(features_df, auxiliary_frames)
    features_df = feature_builder.create_lag_features(features_df)
    features_df = feature_builder.create_cohort_features(features_df)

    # 5. Modeling & Forecasting (Updated Strategy)
    print("\n--- Starting Model Engine ---")
    model_engine = EnrollmentModelEngine(features_df)
    model_engine.preprocess()

    # Train on data up to 2023 so that 2024 and 2025 remain for evaluation
    model_engine.train_all_models(cutoff_year=2023)

    # Evaluate & Select Strategy (Best or Ensemble)
    model_engine.evaluate_and_select_strategy(test_year_start=2024)

    # 6. Generate Forecast
    # The recursive forecast starts at 2026 and uses the 2025 rows as its base
    forecast = model_engine.recursive_forecast(2026, 2030)

    # 7. Output
    print("\n--- Final Forecast Sample ---")
    display_columns = ['YEAR', 'REGION', 'COUNCIL', 'SUBJECT', 'FORM_NUM', 'ENROLLMENT']
    sample = forecast[display_columns].assign(
        ENROLLMENT=lambda view: view['ENROLLMENT'].round(0).astype(int)
    )
    print(sample.head(10))

# Run the pipeline only when this file is executed as a script (or notebook
# cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
Found 15 files to load.
Loaded: Data-Secondary Enrollment 2016-2025
Loaded: Dropout-Secondary  2017-2024
Loaded: Data-Secondary Tables and chairs 2016-2025
Loaded: Secondary-Re_entry
Loaded: Secondary - DISABALITY 2020-2025
Loaded: LGAs Urban and Rural Status
Loaded: Combined_Secondary_Laboratories_Govt
Loaded: Combined_Secondary_Laboratories_All_G_NG
Loaded: Combined_Secondary_ICT_All_G_NG
Loaded: Combined_Secondary_ICT_Govt
Loaded: Combined_Secondary_Electricity_All_G_NG
Loaded: Combined_Secondary_Electricity_Govt
Loaded: Secondary_students_per_subject
Loaded: Secondary_enrollment_Gov_2016_2025
Loaded: Data-Secondary Enrollment 2016-2025 (1)
Merged LGA Status into Data-Secondary Enrollment 2016-2025
Merged LGA Status into Dropout-Secondary  2017-2024
Merged LGA Status into Data-Secondary Tables and chairs 2016-2025
Merged LGA Status into Secondary-Re_entr

In [1]:
# -*- coding: utf-8 -*-
"""Refactored MoEST Modeling for Secondary School Enrollment
"""

import os
import re
import numpy as np
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import (
    RandomForestRegressor,
    HistGradientBoostingRegressor,
    GradientBoostingRegressor
)
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
from google.colab import drive

# =============================================================================
# CLASS 1: Data Loader & Cleaner
# =============================================================================

class MOESTDataLoader:
    """
    Handles mounting drive, loading CSVs, cleaning headers, handling types,
    and managing initial data quality checks.

    Loaded frames are kept in self.dataframes, keyed by file name without the
    '.csv' extension.
    """
    def __init__(self, base_directory, exclude_keywords=None):
        """
        Args:
            base_directory: Directory that is scanned for '.csv' files.
            exclude_keywords: File names containing any of these keywords
                (case-insensitive) are skipped.
        """
        self.base_directory = base_directory
        self.exclude_keywords = exclude_keywords if exclude_keywords else []
        self.dataframes = {}

    def mount_drive(self):
        """Mounts Google Drive at /content/drive/ (Colab only)."""
        drive.mount('/content/drive/')

    def get_file_list(self):
        """
        Returns the '.csv' file names in base_directory that contain none of
        the exclude keywords, sorted by name.
        """
        blocked = [keyword.lower() for keyword in self.exclude_keywords]
        # os.listdir() order is arbitrary; sorting keeps the load order, and
        # everything derived from it, reproducible between runs.
        return sorted(
            file_name for file_name in os.listdir(self.base_directory)
            if file_name.endswith('.csv')
            and not any(keyword in file_name.lower() for keyword in blocked)
        )

    def load_data(self):
        """
        Reads every selected CSV, cleans it and stores it in self.dataframes.

        A file that cannot be read or cleaned is reported and skipped so that
        one bad file does not abort the whole load.
        """
        files = self.get_file_list()
        print(f"Found {len(files)} files to load.")
        for file_name in files:
            file_path = os.path.join(self.base_directory, file_name)
            # splitext removes only the extension; str.replace would also strip
            # a '.csv' occurring in the middle of the name.
            df_name = os.path.splitext(file_name)[0]
            try:
                df = pd.read_csv(file_path)
                self.dataframes[df_name] = self._initial_clean(df)
                print(f"Loaded: {df_name}")
            except Exception as e:
                print(f"Error loading {file_name}: {e}")

    def _initial_clean(self, df, min_numeric_ratio=0.5):
        """
        Standardizes headers and cleans numeric columns.

        Headers are stripped and upper-cased. Object columns in which some
        value contains a comma are parsed as numbers with the commas removed.
        'UNNAMED' columns that are more than 90% null are dropped.

        Args:
            df: Frame to clean; it is modified in place.
            min_numeric_ratio: A comma-containing column is converted only when
                at least this share of its non-null values parses as a number.
                This keeps free-text columns (e.g. "urban, coastal") from being
                wiped out because a single value happens to be numeric.

        Returns:
            The same DataFrame object.
        """
        df.columns = [str(col).strip().upper() for col in df.columns]

        for col in df.select_dtypes(include='object').columns:
            as_text = df[col].astype(str)
            if not as_text.str.contains(',', regex=False).any():
                continue
            converted = pd.to_numeric(as_text.str.replace(',', '', regex=False), errors='coerce')
            parsed = converted.notna().sum()
            present = df[col].notna().sum()
            if parsed > 0 and parsed >= min_numeric_ratio * present:
                df[col] = converted

        unnamed = [c for c in df.columns if 'UNNAMED' in c]
        to_drop = [c for c in unnamed if df[c].isnull().mean() > 0.9]
        df.drop(columns=to_drop, inplace=True)
        return df

    def get_dataframe(self, name):
        """Returns the loaded frame called `name`, or None if it was not loaded."""
        return self.dataframes.get(name)

    def get_all_dataframes(self):
        """Returns the name -> DataFrame dict itself (not a copy)."""
        return self.dataframes

# =============================================================================
# CLASS 2: Geography & Location Manager
# =============================================================================

class LocationManager:
    """
    Handles Geocoding, LGA Status merging, and Clustering.

    Works directly on the name -> DataFrame dict it is given: frames are
    renamed in place and merged frames replace the originals in that dict.
    """
    # Columns every location-aware frame is joined on.
    LOCATION_KEYS = ['REGION', 'COUNCIL']

    def __init__(self, dataframes, geodata_path):
        """
        Args:
            dataframes: Dict of name -> DataFrame, shared with the caller.
            geodata_path: CSV path used to cache geocoding results.
        """
        self.dataframes = dataframes
        self.geodata_path = geodata_path
        self.geo_data = None
        self.lga_status_df = None

    def _has_location_keys(self, df):
        """Tells whether `df` carries both REGION and COUNCIL columns."""
        return all(key in df.columns for key in self.LOCATION_KEYS)

    def standardize_location_columns(self):
        """
        Renames the region/council column variants to REGION / COUNCIL and
        normalises their values to stripped, upper-case strings.

        Frames lacking either column are left untouched.
        """
        for df in self.dataframes.values():
            reg_col = next((c for c in df.columns if c in ['REGION', 'REGON']), None)
            cou_col = next((c for c in df.columns if c in ['COUNCIL', 'DISTRICT', 'LGA NAME']), None)
            if not (reg_col and cou_col):
                continue

            df.rename(columns={reg_col: 'REGION', cou_col: 'COUNCIL'}, inplace=True)
            for key in self.LOCATION_KEYS:
                # Strip as well as upper-case: names that differ only by
                # padding would otherwise never match in the joins below.
                df[key] = df[key].astype(str).str.strip().str.upper()

    def merge_lga_status(self, lga_df_name='LGAs Urban and Rural Status'):
        """
        Left-joins the LGA status table onto every frame that has REGION and
        COUNCIL, then removes the status table from the dict.

        Args:
            lga_df_name: Key of the LGA status frame in self.dataframes.
        """
        if lga_df_name not in self.dataframes:
            print("LGA Status DataFrame not found.")
            return

        status = self.dataframes[lga_df_name].copy()
        if 'REMARKS' in status.columns:
            status.drop(columns=['REMARKS'], inplace=True)
        status.rename(columns={'CLASSIFICATION': 'LGA_STATUS'}, inplace=True)
        self.lga_status_df = status

        if not self._has_location_keys(status):
            print("LGA Status DataFrame has no REGION/COUNCIL columns; nothing merged.")
            return

        # One row per council: a duplicated council would multiply the rows of
        # every dataset in the left join below.
        status = status.drop_duplicates(subset=self.LOCATION_KEYS)
        self.lga_status_df = status

        for name, df in list(self.dataframes.items()):
            if name == lga_df_name:
                continue
            if self._has_location_keys(df):
                self.dataframes[name] = pd.merge(df, status, on=self.LOCATION_KEYS, how='left')
                print(f"Merged LGA Status into {name}")
        del self.dataframes[lga_df_name]

    def process_geocoding(self):
        """
        Loads cached geodata (or geocodes the councils), clusters the
        coordinates and left-joins GEO_CLUSTER onto every location-aware frame.
        """
        if os.path.exists(self.geodata_path):
            print("Loading existing geodata...")
            self.geo_data = pd.read_csv(self.geodata_path)
        else:
            print("Generating new geodata...")
            self._fetch_geodata()

        self._apply_clustering()

        # De-duplicated for the same reason as the LGA status table.
        cluster_lookup = (
            self.geo_data[self.LOCATION_KEYS + ['GEO_CLUSTER']]
            .drop_duplicates(subset=self.LOCATION_KEYS)
        )
        for name, df in list(self.dataframes.items()):
            if self._has_location_keys(df):
                self.dataframes[name] = pd.merge(df, cluster_lookup, on=self.LOCATION_KEYS, how='left')
                print(f"Merged Geo Cluster into {name}")

    def _fetch_geodata(self):
        """
        Geocodes every distinct (REGION, COUNCIL) pair with Nominatim, keeps
        the pairs that resolved and writes them to self.geodata_path.

        Raises:
            ValueError: if no frame has REGION and COUNCIL columns.
        """
        locs = [df[self.LOCATION_KEYS] for df in self.dataframes.values()
                if self._has_location_keys(df)]
        if not locs:
            raise ValueError("No DataFrame with REGION and COUNCIL columns; cannot geocode.")
        unique_locs = pd.concat(locs).drop_duplicates().reset_index(drop=True)

        geolocator = Nominatim(user_agent="moest_geo_mapper_v3")
        geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1.0)

        lats, lons = [], []
        for region, council in zip(unique_locs['REGION'], unique_locs['COUNCIL']):
            query = f"{council}, {region}, Tanzania"
            try:
                loc = geocode(query)
            except Exception:
                # Best effort: a failed lookup only leaves this council without
                # coordinates. `Exception` (not a bare except) so that Ctrl-C
                # still stops the slow, rate-limited loop.
                loc = None
            lats.append(loc.latitude if loc else None)
            lons.append(loc.longitude if loc else None)

        unique_locs['LATITUDE'] = lats
        unique_locs['LONGITUDE'] = lons
        # .copy(): the clustering step adds a column to this frame.
        self.geo_data = unique_locs.dropna(subset=['LATITUDE', 'LONGITUDE']).copy()
        self.geo_data.to_csv(self.geodata_path, index=False)

    def _apply_clustering(self):
        """
        Adds GEO_CLUSTER to self.geo_data using KMeans on latitude/longitude.

        Uses 5 clusters, or one per location when fewer than 5 are available.
        """
        coords = self.geo_data[['LATITUDE', 'LONGITUDE']]
        if coords.empty:
            self.geo_data['GEO_CLUSTER'] = []
            return
        kmeans = KMeans(n_clusters=min(5, len(coords)), random_state=42, n_init=10)
        self.geo_data['GEO_CLUSTER'] = kmeans.fit_predict(coords)

# =============================================================================
# CLASS 3: Feature Engineer
# =============================================================================

class FeatureEngineer:
    """
    Handles specific transformation logic for Secondary School Subjects.

    All methods are static; the class only groups the pipeline steps.
    """
    @staticmethod
    def melt_subjects(df):
        """
        Reshapes the wide subject table into one row per
        (id columns, form, subject).

        Columns whose name contains both 'FORM ' and ' - ' are treated as
        subject columns; the digit after 'FORM ' becomes FORM_NUM and the text
        after the first ' - ' becomes SUBJECT.

        Returns:
            Long DataFrame with the id columns, ENROLLMENT, FORM_NUM, SUBJECT.
        """
        id_vars = [c for c in ['YEAR', 'REGION', 'COUNCIL'] if c in df.columns]
        subject_cols = [c for c in df.columns if 'FORM ' in c and ' - ' in c]

        print(f"Melting {len(subject_cols)} subject columns...")
        long_df = df.melt(id_vars=id_vars, value_vars=subject_cols, var_name='RAW', value_name='ENROLLMENT')

        # expand=False yields a Series rather than a one-column DataFrame.
        long_df['FORM_NUM'] = long_df['RAW'].str.extract(r'FORM (\d)', expand=False).astype(int)
        long_df['SUBJECT'] = long_df['RAW'].str.split(' - ').str[1].str.strip()
        long_df.drop(columns=['RAW'], inplace=True)
        return long_df

    @staticmethod
    def _merge_zero_filled(df, aux, keys, columns):
        """Left-joins `columns` of `aux` onto `df` by `keys`; unmatched rows get 0."""
        merged = df.merge(aux[keys + columns], on=keys, how='left')
        return merged.fillna({col: 0 for col in columns})

    @staticmethod
    def _with_row_total(aux, source_cols, total_name):
        """Returns a copy of `aux` with `total_name` = row-wise sum of `source_cols`."""
        return aux.assign(**{total_name: aux[source_cols].sum(axis=1)})

    @staticmethod
    def merge_infrastructure(main_df, auxiliary_dfs):
        """
        Merges Tables, Labs, ICT, Electricity, Dropout, Reentry, Disability.

        Every auxiliary frame is left-joined on (YEAR, REGION, COUNCIL); rows
        without a match get 0 in the added column(s). A missing (None) frame,
        or one without the expected source columns, is skipped.

        Args:
            main_df: Long enrollment table; not modified.
            auxiliary_dfs: Dict with optional keys 'tables', 'labs', 'ict',
                'electricity', 'reentry', 'disability', 'dropout'. The frames
                are not modified.

        Returns:
            A new DataFrame with the available feature columns appended.
        """
        keys = ['YEAR', 'REGION', 'COUNCIL']
        df = main_df.copy()
        merge = FeatureEngineer._merge_zero_filled
        with_total = FeatureEngineer._with_row_total

        # 1. Tables
        tables = auxiliary_dfs.get('tables')
        if tables is not None and 'AVAILABLE_TABLES' in tables.columns:
            df = merge(df, tables, keys, ['AVAILABLE_TABLES'])

        # 2. Labs (Total)
        labs = auxiliary_dfs.get('labs')
        if labs is not None:
            lab_cols = [c for c in labs.columns if 'LABORATORY' in c]
            if lab_cols:
                df = merge(df, with_total(labs, lab_cols, 'TOTAL_LABS'), keys, ['TOTAL_LABS'])

        # 3. ICT (Computers)
        ict = auxiliary_dfs.get('ict')
        if ict is not None:
            comp_cols = [c for c in ict.columns if 'COMPUTER' in c]
            if comp_cols:
                df = merge(df, with_total(ict, comp_cols, 'TOTAL_COMPUTERS'), keys, ['TOTAL_COMPUTERS'])

        # 4. Electricity
        elec = auxiliary_dfs.get('electricity')
        if elec is not None:
            grid_col = next((c for c in elec.columns if 'TANESCO' in c or 'GRID' in c), None)
            if grid_col:
                df = merge(df, elec, keys, [grid_col])
                df.rename(columns={grid_col: 'ELEC_GRID_PCT'}, inplace=True)

        # 5. Re-entry
        reentry = auxiliary_dfs.get('reentry')
        if reentry is not None:
            re_cols = [c for c in reentry.columns if 'RE-ENROLLED' in c]
            if re_cols:
                df = merge(df, with_total(reentry, re_cols, 'TOTAL_REENTRY'), keys, ['TOTAL_REENTRY'])

        # 6. Disability
        disability = auxiliary_dfs.get('disability')
        if disability is not None:
            # Headers were upper-cased by the loader, so a leftover CSV index
            # column is 'UNNAMED: 0', not 'Unnamed: 0'; it must not be counted.
            # A TOTAL_DISABLED column from an earlier run is skipped as well.
            dis_cols = [
                c for c in disability.columns
                if c not in keys
                and c != 'TOTAL_DISABLED'
                and not str(c).upper().startswith('UNNAMED')
                and pd.api.types.is_numeric_dtype(disability[c])
            ]
            if dis_cols:
                df = merge(df, with_total(disability, dis_cols, 'TOTAL_DISABLED'), keys, ['TOTAL_DISABLED'])

        # 7. Dropout (Specific Columns: Truancy, Pregnancy, Indiscipline)
        dropout = auxiliary_dfs.get('dropout')
        if dropout is not None:
            target_cols = ['TRUANCY', 'PREGNANCY', 'INDISCIPLINE']
            existing_cols = [c for c in target_cols if c in dropout.columns]
            if existing_cols:
                df = merge(df, dropout, keys, existing_cols)

        return df

    @staticmethod
    def create_lag_features(df, election_years=(2015, 2020, 2025, 2030)):
        """
        Adds lag, growth and election-year features to the long enrollment table.

        Each (REGION, COUNCIL, SUBJECT, FORM_NUM) combination is treated as one
        series ordered by YEAR.

        Args:
            df: DataFrame with REGION, COUNCIL, SUBJECT, FORM_NUM, YEAR and
                ENROLLMENT columns. It is not modified; a sorted copy is returned.
            election_years: YEAR values flagged with IS_ELECTION_YEAR = 1.

        Returns:
            The sorted copy with LAG_1, LAG_2, YOY_GROWTH and IS_ELECTION_YEAR
            added. Every NaN left in the frame (in any column) is replaced by -1.
        """
        series_keys = ['REGION', 'COUNCIL', 'SUBJECT', 'FORM_NUM']
        df = df.sort_values(series_keys + ['YEAR'])
        enrollment_by_series = df.groupby(series_keys)['ENROLLMENT']

        # shift() is positional: LAG_n is the n-th previous *row* of the series,
        # which is the n-th previous year only when the series has no gaps.
        df['LAG_1'] = enrollment_by_series.shift(1)
        df['LAG_2'] = enrollment_by_series.shift(2)

        # Growth is measured between the two previous observations. Using the
        # current ENROLLMENT here would leak the prediction target into the
        # feature, and would not match the recursive forecast step, which can
        # only derive growth from LAG_1 and LAG_2.
        df['YOY_GROWTH'] = (df['LAG_1'] - df['LAG_2']) / (df['LAG_2'] + 1e-5)
        df['IS_ELECTION_YEAR'] = df['YEAR'].isin(list(election_years)).astype(int)

        # -1 is the "no history" sentinel consumed by the tree models downstream.
        return df.fillna(-1)

    @staticmethod
    def create_cohort_features(df):
        """
        Adds COHORT_LAG: last year's enrollment of the same cohort one form lower.

        For a row (YEAR, REGION, COUNCIL, SUBJECT, FORM_NUM) the value is the
        summed ENROLLMENT of (YEAR - 1, REGION, COUNCIL, SUBJECT, FORM_NUM - 1),
        or -1 when no such rows exist.

        Args:
            df: DataFrame with YEAR, REGION, COUNCIL, SUBJECT, FORM_NUM and
                ENROLLMENT columns. COHORT_LAG is added to it in place.

        Returns:
            The same DataFrame object, with the float COHORT_LAG column added.
        """
        cohort_keys = ['YEAR', 'REGION', 'COUNCIL', 'SUBJECT', 'FORM_NUM']

        # Tuple keys instead of '_'-joined strings: joined strings are ambiguous
        # when a name itself contains '_' ("A_B" + "C" collides with "A" + "B_C").
        totals = df.groupby(cohort_keys)['ENROLLMENT'].sum().to_dict()

        feeder_keys = zip(df['YEAR'] - 1, df['REGION'], df['COUNCIL'],
                          df['SUBJECT'], df['FORM_NUM'] - 1)
        df['COHORT_LAG'] = np.asarray(
            [totals.get(key, -1) for key in feeder_keys], dtype=float
        )
        return df

# =============================================================================
# CLASS 4: Model Engine (Conditional Ensemble)
# =============================================================================

class EnrollmentModelEngine:
    """
    Manages Training, Conditional Selection, and Recursive Forecasting.
    Implements logic: Use Best Model ONLY unless others are within 60% performance
    (test RMSE <= 1.6 x the best RMSE), in which case the qualifying models are averaged.
    """

    # --- FULL FEATURE LIST AS REQUESTED ---
    # Candidate model inputs in training order; only the ones present in the
    # DataFrame are actually used (see _available_features).
    CANDIDATE_FEATURES = (
        'YEAR', 'REGION_ENC', 'COUNCIL_ENC', 'SUBJECT_ENC', 'FORM_NUM',
        'AVAILABLE_TABLES', 'TOTAL_REENTRY', 'TOTAL_DISABLED',
        'TOTAL_COMPUTERS', 'ELEC_GRID_PCT', 'TOTAL_LABS',
        'TRUANCY', 'PREGNANCY', 'INDISCIPLINE',
        'LAG_1', 'LAG_2', 'YOY_GROWTH', 'COHORT_LAG', 'IS_ELECTION_YEAR'
    )
    # Same years the feature-engineering step flags, so training and
    # forecasting agree on IS_ELECTION_YEAR.
    ELECTION_YEARS = (2015, 2020, 2025, 2030)
    # Sentinel the feature-engineering step writes where a value is unavailable.
    MISSING = -1

    def __init__(self, df):
        """
        Args:
            df: Engineered long-format DataFrame; it is stored by reference and
                preprocess() adds the encoded columns to it.
        """
        self.df = df
        self.models = {}
        self.encoders = {}
        self.features = self._available_features()
        self.selected_model_keys = []

    def _available_features(self):
        """Returns the candidate features that currently exist in self.df."""
        return [f for f in self.CANDIDATE_FEATURES if f in self.df.columns]

    def preprocess(self):
        """Label-encodes REGION, COUNCIL and SUBJECT and refreshes the feature list."""
        for key, source, target in (
            ('REG', 'REGION', 'REGION_ENC'),
            ('COU', 'COUNCIL', 'COUNCIL_ENC'),
            ('SUB', 'SUBJECT', 'SUBJECT_ENC'),
        ):
            encoder = LabelEncoder()
            self.df[target] = encoder.fit_transform(self.df[source].astype(str))
            self.encoders[key] = encoder

        # The *_ENC columns did not exist when __init__ filtered the feature
        # list, so without this refresh they would never reach the models.
        self.features = self._available_features()

    def train_all_models(self, cutoff_year=2023, include_sklearn_models=False):
        """
        Trains the candidate models on rows with YEAR <= cutoff_year.

        Args:
            cutoff_year: Last year included in the training data.
            include_sklearn_models: When True, Random Forest,
                HistGradientBoosting and GradientBoosting are trained in
                addition to XGBoost and LightGBM. They are off by default,
                as they were disabled in this revision of the notebook.

        Raises:
            ValueError: if no row satisfies YEAR <= cutoff_year.
        """
        print(f"\nTraining all candidate models on Data <= {cutoff_year}...")
        train_df = self.df[self.df['YEAR'] <= cutoff_year]
        if train_df.empty:
            raise ValueError(f"No training rows with YEAR <= {cutoff_year}.")
        X = train_df[self.features]
        y = train_df['ENROLLMENT']

        # (model key, label used in the progress message, factory)
        candidates = [
            ('XGB', 'XGBoost',
             lambda: xgb.XGBRegressor(n_estimators=300, max_depth=9, learning_rate=0.05, n_jobs=-1)),
            ('LGB', 'LightGBM',
             lambda: lgb.LGBMRegressor(n_estimators=500, num_leaves=50, min_child_samples=10,
                                       learning_rate=0.1, verbose=-1)),
        ]
        if include_sklearn_models:
            candidates += [
                ('RF', 'Random Forest',
                 lambda: RandomForestRegressor(n_estimators=300, max_depth=12, n_jobs=-1, random_state=42)),
                ('HGB', 'HistGradientBoosting',
                 lambda: HistGradientBoostingRegressor(max_iter=500, learning_rate=0.1, max_depth=10)),
                ('GB', 'GradientBoosting',
                 lambda: GradientBoostingRegressor(n_estimators=300, max_depth=9, learning_rate=0.05)),
            ]

        # A progress line is printed only for models that are really trained.
        for key, label, build in candidates:
            print(f"Training {label}...")
            self.models[key] = build()
            self.models[key].fit(X, y)

        print("All models trained successfully.")

    def calculate_metrics(self, y_true, y_pred, model_name):
        """
        Prints R2, MAE, RMSE, WMAPE and accuracy (100 - WMAPE) for one model.

        WMAPE (Weighted Mean Absolute Percentage Error) is
        sum(|actual - predicted|) / sum(actual). Standard MAPE explodes with
        zero/small actual values. Both WMAPE and accuracy are NaN when
        sum(actual) is not positive.

        Returns:
            The RMSE, which is the score used for model selection.
        """
        # Plain arrays avoid mixing a pandas index with positional predictions.
        y_true = np.asarray(y_true, dtype=float)
        y_pred = np.asarray(y_pred, dtype=float)

        mae = mean_absolute_error(y_true, y_pred)
        rmse = np.sqrt(mean_squared_error(y_true, y_pred))
        r2 = r2_score(y_true, y_pred)

        total_actual = y_true.sum()
        if total_actual > 0:
            wmape = np.abs(y_true - y_pred).sum() / total_actual * 100
            accuracy = 100 - wmape
        else:
            # Undefined, not "0% accurate".
            wmape = np.nan
            accuracy = np.nan

        print(f"--- {model_name} Results ---")
        print(f"   > {model_name}: R2={r2:.4f}, MAE={mae:,.2f}, RMSE={rmse:,.2f}, WMAPE={wmape:.2f}%, Acc={accuracy:.2f}%")
        print("-" * 30)
        return rmse

    def evaluate_and_select_strategy(self, test_year_start=2024, tolerance=0.6):
        """
        Scores every trained model on rows with YEAR >= test_year_start and
        fills self.selected_model_keys.

        Args:
            test_year_start: First year of the hold-out period.
            tolerance: A model qualifies for the ensemble when its RMSE is at
                most (1 + tolerance) x the best RMSE. Default 0.6 (60%).

        Raises:
            ValueError: if test data exists but no model has been trained.
        """
        test_df = self.df[self.df['YEAR'] >= test_year_start]
        if test_df.empty:
            print("No test data available. Defaulting to XGB only.")
            self.selected_model_keys = ['XGB']
            return
        if not self.models:
            raise ValueError("No trained models to evaluate; call train_all_models() first.")

        X_test = test_df[self.features]
        y_true = test_df['ENROLLMENT']

        performance = {}
        print(f"\n--- Evaluation Results (Test Data {test_year_start}+) ---")

        print("Evaluating all models...")
        for name, model in self.models.items():
            performance[name] = self.calculate_metrics(y_true, model.predict(X_test), name)

        best_model_name = min(performance, key=performance.get)
        best_rmse = performance[best_model_name]
        print(f"\nBEST MODEL: {best_model_name} (RMSE: {best_rmse:,.2f})")

        # Threshold: Best RMSE + tolerance (default 60%) of Best RMSE
        threshold = best_rmse * (1 + tolerance)
        candidates = [name for name, score in performance.items() if score <= threshold]

        print(f"Selection Threshold (RMSE <= {threshold:,.2f})")
        print(f"Qualifying Models: {candidates}")

        if len(candidates) > 1:
            self.selected_model_keys = candidates
            print(f"Strategy: ENSEMBLE (Average of {', '.join(candidates)})")
        else:
            self.selected_model_keys = [best_model_name]
            print(f"Strategy: SINGLE BEST MODEL ({best_model_name})")

    def _predict(self, X):
        """
        Returns the element-wise mean prediction of the selected models.

        Raises:
            ValueError: if no model has been selected yet (averaging an empty
                list would silently yield NaN).
        """
        if not self.selected_model_keys:
            raise ValueError("No model selected; call evaluate_and_select_strategy() first.")
        stacked = [self.models[key].predict(X) for key in self.selected_model_keys]
        return np.mean(stacked, axis=0)

    def recursive_forecast(self, start_year, end_year):
        """
        Forecasts start_year..end_year one year at a time, feeding each year's
        predictions back in as the next year's lag features.

        Returns:
            One DataFrame with all forecast years concatenated; ENROLLMENT holds
            the predictions, floored at 0.

        Raises:
            ValueError: if self.df has no rows for start_year - 1 to start from.
        """
        print(f"\nStarting Recursive Forecast ({start_year}-{end_year}) using {self.selected_model_keys}...")
        future_data = []
        current_data = self.df[self.df['YEAR'] == (start_year - 1)].copy()
        if current_data.empty:
            raise ValueError(f"No rows for base year {start_year - 1}; cannot start the forecast.")

        for year in range(start_year, end_year + 1):
            next_df = self._prepare_next_step(current_data, year)

            preds = self._predict(next_df[self.features])
            next_df['ENROLLMENT'] = np.maximum(0, preds)

            future_data.append(next_df)
            current_data = next_df.copy()
            print(f" > Forecasted {year}")

        return pd.concat(future_data, ignore_index=True)

    def _prepare_next_step(self, prev_df, target_year):
        """
        Builds the feature rows for target_year from the previous year's rows.

        By copying prev_df, all infrastructure/demographic columns (REENTRY,
        DISABLED, TRUANCY, PREGNANCY, INDISCIPLINE etc.) are carried forward
        unchanged from the previous year. prev_df itself is left untouched.
        """
        next_df = prev_df.copy()
        next_df['YEAR'] = target_year

        # Shift Time Series Lags
        next_df['LAG_2'] = next_df['LAG_1']
        next_df['LAG_1'] = next_df['ENROLLMENT']

        # Update Growth. A negative lag is the missing-value sentinel, not an
        # enrollment count; dividing by it would yield a huge bogus growth
        # figure, so such rows get the sentinel as well.
        growth = (next_df['LAG_1'] - next_df['LAG_2']) / (next_df['LAG_2'] + 1e-5)
        has_history = (next_df['LAG_1'] >= 0) & (next_df['LAG_2'] >= 0)
        next_df['YOY_GROWTH'] = growth.where(has_history, self.MISSING)

        # Update Cohort Logic: previous year's enrollment one form lower.
        # Duplicated keys are summed, as in the training-time cohort feature.
        key_cols = ['REGION', 'COUNCIL', 'SUBJECT', 'FORM_NUM']
        cohort_totals = prev_df.groupby(key_cols)['ENROLLMENT'].sum().to_dict()
        feeder_keys = zip(next_df['REGION'], next_df['COUNCIL'],
                          next_df['SUBJECT'], next_df['FORM_NUM'] - 1)
        next_df['COHORT_LAG'] = np.asarray(
            [self.MISSING if key[3] < 1 else cohort_totals.get(key, self.MISSING)
             for key in feeder_keys],
            dtype=float,
        )
        next_df['IS_ELECTION_YEAR'] = int(target_year in self.ELECTION_YEARS)

        return next_df


# =============================================================================
# MAIN PIPELINE
# =============================================================================

def main():
    """Run the whole pipeline: load CSVs, clean locations, build features, train, forecast, print."""
    # 1. Configuration
    BASE_DIR = '/content/drive/MyDrive/GUIDELINES_TSC_JAN2026/Data Set/csvs/'
    GEO_FILE = '/content/drive/MyDrive/MOEST/tanzania_council_geodata.csv'
    EXCLUDE_KEYWORDS = ['Primary', 'Textbooks', 'Population', 'Teacher', 'COBET', 'Vocational']

    # Key expected by FeatureEngineer.merge_infrastructure -> name of the loaded
    # frame (the CSV file name without its extension).
    aux_sources = {
        'tables': "Data-Secondary Tables and chairs 2016-2025",
        'labs': "Combined_Secondary_Laboratories_All_G_NG",
        'dropout': "Dropout-Secondary  2017-2024",
        'reentry': "Secondary-Re_entry",
        'disability': "Secondary - DISABALITY 2020-2025",
        'ict': "Combined_Secondary_ICT_All_G_NG",
        'electricity': "Combined_Secondary_Electricity_All_G_NG",
    }

    # 2. Load Data
    data_loader = MOESTDataLoader(BASE_DIR, EXCLUDE_KEYWORDS)
    data_loader.mount_drive()
    data_loader.load_data()
    frames = data_loader.get_all_dataframes()

    # 3. Location & Clean Up (works in place on the shared `frames` dict)
    locations = LocationManager(frames, GEO_FILE)
    locations.standardize_location_columns()
    locations.merge_lga_status()

    # 4. Feature Engineering
    print("\n--- Starting Feature Engineering ---")

    subject_df = frames.get("Secondary_students_per_subject")
    if subject_df is None:
        raise ValueError("Critical DataFrame 'Secondary_students_per_subject' not found.")

    # Missing auxiliary frames stay None; merge_infrastructure skips those.
    aux_dfs = {label: frames.get(dataset) for label, dataset in aux_sources.items()}

    engineer = FeatureEngineer()
    features_df = engineer.create_cohort_features(
        engineer.create_lag_features(
            engineer.merge_infrastructure(engineer.melt_subjects(subject_df), aux_dfs)
        )
    )

    # 5. Modeling & Forecasting: train up to 2023, evaluate on 2024 onwards
    print("\n--- Starting Model Engine ---")
    engine = EnrollmentModelEngine(features_df)
    engine.preprocess()
    engine.train_all_models(cutoff_year=2023)
    engine.evaluate_and_select_strategy(test_year_start=2024)

    # 6. Generate Forecast
    forecast_df = engine.recursive_forecast(2026, 2030)

    # 7. Output
    print("\n--- Final Forecast Sample ---")
    report = forecast_df[['YEAR', 'REGION', 'COUNCIL', 'SUBJECT', 'FORM_NUM', 'ENROLLMENT']].copy()
    report['ENROLLMENT'] = report['ENROLLMENT'].round(0).astype(int)
    print(report.head(10))

# Run the pipeline only when this code is executed directly, not when it is imported.
if __name__ == "__main__":
    main()

Mounted at /content/drive/
Found 15 files to load.
Loaded: Data-Secondary Enrollment 2016-2025
Loaded: Dropout-Secondary  2017-2024
Loaded: Data-Secondary Tables and chairs 2016-2025
Loaded: Secondary-Re_entry
Loaded: Secondary - DISABALITY 2020-2025
Loaded: LGAs Urban and Rural Status
Loaded: Combined_Secondary_Laboratories_Govt
Loaded: Combined_Secondary_Laboratories_All_G_NG
Loaded: Combined_Secondary_ICT_All_G_NG
Loaded: Combined_Secondary_ICT_Govt
Loaded: Combined_Secondary_Electricity_All_G_NG
Loaded: Combined_Secondary_Electricity_Govt
Loaded: Secondary_students_per_subject
Loaded: Secondary_enrollment_Gov_2016_2025
Loaded: Data-Secondary Enrollment 2016-2025 (1)
Merged LGA Status into Data-Secondary Enrollment 2016-2025
Merged LGA Status into Dropout-Secondary  2017-2024
Merged LGA Status into Data-Secondary Tables and chairs 2016-2025
Merged LGA Status into Secondary-Re_entry
Merged LGA Status into Secondary - DISABALITY 2020-2025
Merged LGA Status into Combined_Secondary_Labo

In [2]:
# -*- coding: utf-8 -*-
"""Refactored MoEST Modeling for Secondary School Enrollment
"""

import os
import re
import numpy as np
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import (
    RandomForestRegressor,
    HistGradientBoostingRegressor,
    GradientBoostingRegressor
)
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
from google.colab import drive

# =============================================================================
# CLASS 1: Data Loader & Cleaner
# =============================================================================

class MOESTDataLoader:
    """
    Reads the CSV files of one directory into a dict of DataFrames.

    Responsibilities: mounting Google Drive, choosing which CSV files to read
    (file names containing an excluded keyword are skipped), reading them, and
    a first cleaning pass over headers and comma-formatted numbers.
    """

    def __init__(self, base_directory, exclude_keywords=None):
        self.base_directory = base_directory
        self.exclude_keywords = exclude_keywords or []
        # Frame name (file name without '.csv') -> cleaned DataFrame.
        self.dataframes = {}

    def mount_drive(self):
        """Mount Google Drive at /content/drive/ (Colab only)."""
        drive.mount('/content/drive/')

    def get_file_list(self):
        """Return the '.csv' file names in the base directory that match no excluded keyword.

        Keyword matching is a case-insensitive substring test on the file name.
        """
        selected = []
        for entry in os.listdir(self.base_directory):
            if not entry.endswith('.csv'):
                continue
            lowered = entry.lower()
            if any(keyword.lower() in lowered for keyword in self.exclude_keywords):
                continue
            selected.append(entry)
        return selected

    def load_data(self):
        """Read every selected CSV into ``self.dataframes``.

        A file that fails to load is reported on stdout and skipped; the
        remaining files are still processed.
        """
        csv_names = self.get_file_list()
        print(f"Found {len(csv_names)} files to load.")
        for csv_name in csv_names:
            csv_path = os.path.join(self.base_directory, csv_name)
            frame_key = csv_name.replace('.csv', '')
            try:
                raw = pd.read_csv(csv_path)
                self.dataframes[frame_key] = self._initial_clean(raw)
                print(f"Loaded: {frame_key}")
            except Exception as e:
                print(f"Error loading {csv_name}: {e}")

    def _initial_clean(self, df):
        """Normalise headers and parse comma-formatted numbers; modifies and returns ``df``."""
        df.columns = [str(header).strip().upper() for header in df.columns]

        for col in df.select_dtypes(include='object').columns:
            as_text = df[col].astype(str)
            if not as_text.str.contains(',').any():
                continue
            # Strip thousands separators; values that still are not numbers become NaN.
            numeric = pd.to_numeric(as_text.str.replace(',', '', regex=False), errors='coerce')
            # Keep the conversion as soon as at least one value parsed.
            if numeric.notna().sum() > 0:
                df[col] = numeric

        # Drop 'UNNAMED' columns that are more than 90% null.
        mostly_empty_unnamed = [
            c for c in df.columns
            if 'UNNAMED' in c and df[c].isnull().mean() > 0.9
        ]
        df.drop(columns=mostly_empty_unnamed, inplace=True)
        return df

    def get_dataframe(self, name):
        """Return the frame loaded under ``name``, or None when there is none."""
        return self.dataframes.get(name)

    def get_all_dataframes(self):
        """Return the internal name -> DataFrame dict itself (not a copy)."""
        return self.dataframes

# =============================================================================
# CLASS 2: Geography & Location Manager
# =============================================================================

class LocationManager:
    """
    Handles location-column standardisation, LGA Status merging, Geocoding and Clustering.

    Works in place on the ``dataframes`` dict it receives: frames are renamed or
    replaced under their existing keys, so the caller's dict sees every change.
    """
    def __init__(self, dataframes, geodata_path):
        self.dataframes = dataframes      # name -> DataFrame, shared with the caller
        self.geodata_path = geodata_path  # CSV file used to cache geocoding results
        self.geo_data = None
        self.lga_status_df = None

    def standardize_location_columns(self):
        """
        Rename the region/council columns to 'REGION' and 'COUNCIL' and upper-case their values.

        A frame is only touched when it has both a region column ('REGION' or
        'REGON') and a council column ('COUNCIL', 'DISTRICT' or 'LGA NAME').
        """
        for name, df in self.dataframes.items():
            reg_col = next((c for c in df.columns if c in ['REGION', 'REGON']), None)
            cou_col = next((c for c in df.columns if c in ['COUNCIL', 'DISTRICT', 'LGA NAME']), None)

            if reg_col and cou_col:
                df.rename(columns={reg_col: 'REGION', cou_col: 'COUNCIL'}, inplace=True)
                df['REGION'] = df['REGION'].astype(str).str.upper()
                df['COUNCIL'] = df['COUNCIL'].astype(str).str.upper()

    def merge_lga_status(self, lga_df_name='LGAs Urban and Rural Status'):
        """
        Left-join the LGA status table onto every other frame that has REGION and COUNCIL.

        The status table is removed from ``self.dataframes`` afterwards. Its
        'CLASSIFICATION' column is exposed as 'LGA_STATUS'; 'REMARKS' is dropped.
        If the table is missing, or lacks the REGION/COUNCIL key columns, a
        message is printed and nothing is merged.
        """
        keys = ['REGION', 'COUNCIL']
        if lga_df_name not in self.dataframes:
            print("LGA Status DataFrame not found.")
            return

        status = self.dataframes[lga_df_name].copy()
        if 'REMARKS' in status.columns:
            status.drop(columns=['REMARKS'], inplace=True)
        status.rename(columns={'CLASSIFICATION': 'LGA_STATUS'}, inplace=True)

        if not set(keys).issubset(status.columns):
            print(f"LGA Status DataFrame has no {keys} columns; skipping merge.")
            return

        # One status row per council: a repeated (REGION, COUNCIL) pair would
        # otherwise duplicate every matching row of the frames merged below.
        # The first occurrence wins.
        status = status.drop_duplicates(subset=keys, keep='first')
        self.lga_status_df = status

        # Iterate over a snapshot because entries are replaced inside the loop.
        for name, df in list(self.dataframes.items()):
            if name == lga_df_name:
                continue
            if 'REGION' in df.columns and 'COUNCIL' in df.columns:
                self.dataframes[name] = pd.merge(df, status, on=keys, how='left')
                print(f"Merged LGA Status into {name}")
        del self.dataframes[lga_df_name]

    def process_geocoding(self):
        """
        Attach a 'GEO_CLUSTER' column to every frame that has REGION and COUNCIL.

        Coordinates are read from ``geodata_path`` when that file exists and
        fetched from the geocoding service (then cached there) otherwise.
        """
        keys = ['REGION', 'COUNCIL']
        if os.path.exists(self.geodata_path):
            print("Loading existing geodata...")
            self.geo_data = pd.read_csv(self.geodata_path)
        else:
            print("Generating new geodata...")
            self._fetch_geodata()

        self._apply_clustering()

        # Same reasoning as for the LGA table: keep the lookup unique per council
        # so the left-join cannot multiply rows.
        clusters = self.geo_data[keys + ['GEO_CLUSTER']].drop_duplicates(subset=keys, keep='first')

        for name, df in list(self.dataframes.items()):
            if 'REGION' in df.columns and 'COUNCIL' in df.columns:
                self.dataframes[name] = pd.merge(df, clusters, on=keys, how='left')
                print(f"Merged Geo Cluster into {name}")

    def _fetch_geodata(self):
        """
        Geocode every distinct (REGION, COUNCIL) pair and cache the result as CSV.

        Lookups go through a rate limiter with a one-second minimum delay.
        Locations that cannot be resolved are left out of ``self.geo_data``.
        """
        locs = []
        for df in self.dataframes.values():
            if 'REGION' in df.columns and 'COUNCIL' in df.columns:
                locs.append(df[['REGION', 'COUNCIL']])
        unique_locs = pd.concat(locs).drop_duplicates().reset_index(drop=True)

        geolocator = Nominatim(user_agent="moest_geo_mapper_v3")
        geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1.0)

        lats, lons = [], []
        for idx, row in unique_locs.iterrows():
            query = f"{row['COUNCIL']}, {row['REGION']}, Tanzania"
            try:
                loc = geocode(query)
            except Exception:
                # Best effort: a failed lookup is treated like "not found".
                # `Exception` (not a bare except) keeps Ctrl-C working during
                # this long, rate-limited loop.
                loc = None
            lats.append(loc.latitude if loc else None)
            lons.append(loc.longitude if loc else None)

        unique_locs['LATITUDE'] = lats
        unique_locs['LONGITUDE'] = lons
        # .copy() so that adding GEO_CLUSTER later writes to an independent frame
        # rather than to a slice of unique_locs.
        self.geo_data = unique_locs.dropna(subset=['LATITUDE', 'LONGITUDE']).copy()
        self.geo_data.to_csv(self.geodata_path, index=False)

    def _apply_clustering(self):
        """Assign each geocoded location to one of 5 KMeans clusters on (LATITUDE, LONGITUDE)."""
        kmeans = KMeans(n_clusters=5, random_state=42, n_init=10)
        self.geo_data['GEO_CLUSTER'] = kmeans.fit_predict(self.geo_data[['LATITUDE', 'LONGITUDE']])

# =============================================================================
# CLASS 3: Feature Engineer
# =============================================================================

class FeatureEngineer:
    """
    Handles specific transformation logic for Secondary School Subjects.
    """
    @staticmethod
    def melt_subjects(df):
        id_vars = [c for c in ['YEAR', 'REGION', 'COUNCIL'] if c in df.columns]
        subject_cols = [c for c in df.columns if 'FORM ' in c and ' - ' in c]

        print(f"Melting {len(subject_cols)} subject columns...")
        long_df = df.melt(id_vars=id_vars, value_vars=subject_cols, var_name='RAW', value_name='ENROLLMENT')

        long_df['FORM_NUM'] = long_df['RAW'].str.extract(r'FORM (\d)').astype(int)
        long_df['SUBJECT'] = long_df['RAW'].str.split(' - ').str[1].str.strip()
        long_df.drop(columns=['RAW'], inplace=True)
        return long_df

    @staticmethod
    def merge_infrastructure(main_df, auxiliary_dfs):
        """
        Merges Tables, Labs, ICT, Electricity, Dropout, Reentry, Disability.
        """
        keys = ['YEAR', 'REGION', 'COUNCIL']
        df = main_df.copy()

        # 1. Tables
        tables = auxiliary_dfs.get('tables')
        if tables is not None and 'AVAILABLE_TABLES' in tables.columns:
            df = df.merge(tables[keys + ['AVAILABLE_TABLES']], on=keys, how='left').fillna({'AVAILABLE_TABLES': 0})

        # 2. Labs (Total)
        labs = auxiliary_dfs.get('labs')
        if labs is not None:
            lab_cols = [c for c in labs.columns if 'LABORATORY' in c]
            if lab_cols:
                labs['TOTAL_LABS'] = labs[lab_cols].sum(axis=1)
                df = df.merge(labs[keys + ['TOTAL_LABS']], on=keys, how='left').fillna({'TOTAL_LABS': 0})

        # 3. ICT (Computers)
        ict = auxiliary_dfs.get('ict')
        if ict is not None:
             comp_cols = [c for c in ict.columns if 'COMPUTER' in c]
             if comp_cols:
                 ict['TOTAL_COMPUTERS'] = ict[comp_cols].sum(axis=1)
                 df = df.merge(ict[keys + ['TOTAL_COMPUTERS']], on=keys, how='left').fillna({'TOTAL_COMPUTERS': 0})

        # 4. Electricity
        elec = auxiliary_dfs.get('electricity')
        if elec is not None:
            grid_col = next((c for c in elec.columns if 'TANESCO' in c or 'GRID' in c), None)
            if grid_col:
                df = df.merge(elec[keys + [grid_col]], on=keys, how='left').fillna({grid_col: 0})
                df.rename(columns={grid_col: 'ELEC_GRID_PCT'}, inplace=True)

        # 5. Re-entry
        reentry = auxiliary_dfs.get('reentry')
        if reentry is not None:
            re_cols = [c for c in reentry.columns if 'RE-ENROLLED' in c]
            if re_cols:
                reentry['TOTAL_REENTRY'] = reentry[re_cols].sum(axis=1)
                df = df.merge(reentry[keys + ['TOTAL_REENTRY']], on=keys, how='left').fillna({'TOTAL_REENTRY': 0})

        # 6. Disability
        disability = auxiliary_dfs.get('disability')
        if disability is not None:
            dis_cols = [c for c in disability.columns if c not in keys + ['Unnamed: 0']]
            dis_cols = [c for c in dis_cols if pd.api.types.is_numeric_dtype(disability[c])]
            if dis_cols:
                disability['TOTAL_DISABLED'] = disability[dis_cols].sum(axis=1)
                df = df.merge(disability[keys + ['TOTAL_DISABLED']], on=keys, how='left').fillna({'TOTAL_DISABLED': 0})

        # 7. Dropout (Specific Columns: Truancy, Pregnancy, Indiscipline)
        dropout = auxiliary_dfs.get('dropout')
        if dropout is not None:
            # Check for specific columns
            target_cols = ['TRUANCY', 'PREGNANCY', 'INDISCIPLINE']
            existing_cols = [c for c in target_cols if c in dropout.columns]

            if existing_cols:
                df = df.merge(dropout[keys + existing_cols], on=keys, how='left')
                # Fill NaNs for these columns
                for c in existing_cols:
                    df[c] = df[c].fillna(0)

        return df

    @staticmethod
    def create_lag_features(df):
        df = df.sort_values(['REGION', 'COUNCIL', 'SUBJECT', 'FORM_NUM', 'YEAR'])
        g = df.groupby(['REGION', 'COUNCIL', 'SUBJECT', 'FORM_NUM'])

        df['LAG_1'] = g['ENROLLMENT'].shift(1)
        df['LAG_2'] = g['ENROLLMENT'].shift(2)
        df['YOY_GROWTH'] = (df['ENROLLMENT'] - df['LAG_1']) / (df['LAG_1'] + 1e-5)
        df['IS_ELECTION_YEAR'] = df['YEAR'].isin([2015, 2020, 2025, 2030]).astype(int)

        return df.fillna(-1)

    @staticmethod
    def create_cohort_features(df):
        df['PREV_YEAR'] = df['YEAR'] - 1
        df['PREV_FORM'] = df['FORM_NUM'] - 1

        df['LOOKUP_KEY'] = (df['YEAR'].astype(str) + '_' + df['REGION'] + '_' +
                            df['COUNCIL'] + '_' + df['SUBJECT'] + '_' + df['FORM_NUM'].astype(str))

        lookup_dict = df.groupby('LOOKUP_KEY')['ENROLLMENT'].sum().to_dict()

        df['SEARCH_KEY'] = (df['PREV_YEAR'].astype(str) + '_' + df['REGION'] + '_' +
                            df['COUNCIL'] + '_' + df['SUBJECT'] + '_' + df['PREV_FORM'].astype(str))

        df['COHORT_LAG'] = df['SEARCH_KEY'].map(lookup_dict).fillna(-1)
        df.drop(columns=['PREV_YEAR', 'PREV_FORM', 'LOOKUP_KEY', 'SEARCH_KEY'], inplace=True)
        return df

# =============================================================================
# CLASS 4: Model Engine (Conditional Ensemble)
# =============================================================================

class EnrollmentModelEngine:
    """
    Manages Training, Conditional Selection, and Recursive Forecasting.

    Selection rule: the model with the lowest test RMSE is used alone unless
    other models have an RMSE of at most 1.6x the best one, in which case the
    predictions of all qualifying models are averaged.

    Expected call order: ``preprocess()`` -> ``train_all_models()`` ->
    ``evaluate_and_select_strategy()`` -> ``recursive_forecast()``.
    """
    # Encoded feature -> source column that preprocess() encodes into it.
    _ENCODED_SOURCES = {
        'REGION_ENC': 'REGION',
        'COUNCIL_ENC': 'COUNCIL',
        'SUBJECT_ENC': 'SUBJECT',
    }
    # Must match the calendar used when the training features were built.
    _ELECTION_YEARS = (2015, 2020, 2025, 2030)

    def __init__(self, df):
        self.df = df
        self.models = {}
        self.encoders = {}

        # --- FULL FEATURE LIST AS REQUESTED ---
        candidate_features = [
            'YEAR', 'REGION_ENC', 'COUNCIL_ENC', 'SUBJECT_ENC', 'FORM_NUM',
            'AVAILABLE_TABLES', 'TOTAL_REENTRY', 'TOTAL_DISABLED',
            'TOTAL_COMPUTERS', 'ELEC_GRID_PCT', 'TOTAL_LABS',
            'TRUANCY', 'PREGNANCY', 'INDISCIPLINE',
            'LAG_1', 'LAG_2', 'YOY_GROWTH', 'COHORT_LAG', 'IS_ELECTION_YEAR'
        ]

        # Keep only features that exist in the frame. The *_ENC columns do not
        # exist yet at this point (preprocess() creates them), so they are kept
        # whenever their source column is present; otherwise the categorical
        # features would silently never reach the models.
        self.features = [
            f for f in candidate_features
            if f in df.columns
            or (f in self._ENCODED_SOURCES and self._ENCODED_SOURCES[f] in df.columns)
        ]
        self.selected_model_keys = []

    def preprocess(self):
        """Label-encode REGION, COUNCIL and SUBJECT into the *_ENC columns of ``self.df``."""
        self.encoders['REG'] = LabelEncoder()
        self.encoders['COU'] = LabelEncoder()
        self.encoders['SUB'] = LabelEncoder()

        self.df['REGION_ENC'] = self.encoders['REG'].fit_transform(self.df['REGION'].astype(str))
        self.df['COUNCIL_ENC'] = self.encoders['COU'].fit_transform(self.df['COUNCIL'].astype(str))
        self.df['SUBJECT_ENC'] = self.encoders['SUB'].fit_transform(self.df['SUBJECT'].astype(str))

    def train_all_models(self, cutoff_year=2023):
        """Fit every candidate model on the rows with YEAR <= ``cutoff_year``."""
        print(f"\nTraining all candidate models on Data <= {cutoff_year}...")
        train_df = self.df[self.df['YEAR'] <= cutoff_year]
        X = train_df[self.features]
        y = train_df['ENROLLMENT']

        # 1. XGBoost
        print("Training XGBoost...")
        self.models['XGB'] = xgb.XGBRegressor(n_estimators=300, max_depth=9, learning_rate=0.05, n_jobs=-1)
        self.models['XGB'].fit(X, y)

        # 2. LightGBM
        print("Training LightGBM...")
        self.models['LGB'] = lgb.LGBMRegressor(n_estimators=500, num_leaves=50, min_child_samples=10, learning_rate=0.1, verbose=-1)
        self.models['LGB'].fit(X, y)

        # 3. Random Forest
        print("Training Random Forest...")
        self.models['RF'] = RandomForestRegressor(n_estimators=300, max_depth=12, n_jobs=-1, random_state=42)
        self.models['RF'].fit(X, y)

        # 4. HistGradientBoosting
        print("Training HistGradientBoosting...")
        self.models['HGB'] = HistGradientBoostingRegressor(max_iter=500, learning_rate=0.1, max_depth=10)
        self.models['HGB'].fit(X, y)

        # 5. GradientBoosting
        print("Training GradientBoosting...")
        self.models['GB'] = GradientBoostingRegressor(n_estimators=300, max_depth=9, learning_rate=0.05)
        self.models['GB'].fit(X, y)

        print("All models trained successfully.")

    def calculate_metrics(self, y_true, y_pred, model_name):
        """Print R2, MAE, RMSE, WMAPE and accuracy for one model and return its RMSE."""
        mae = mean_absolute_error(y_true, y_pred)
        rmse = np.sqrt(mean_squared_error(y_true, y_pred))
        r2 = r2_score(y_true, y_pred)

        # --- IMPROVED METRIC: WMAPE (Weighted Mean Absolute Percentage Error) ---
        # Standard MAPE explodes with zero/small values. WMAPE is sum(abs_error)/sum(actual).
        total_actual = np.sum(y_true)
        if total_actual > 0:
            wmape = np.sum(np.abs(y_true - y_pred)) / total_actual * 100
            accuracy = 100 - wmape
        else:
            # WMAPE is undefined without positive actuals.
            wmape = np.nan
            accuracy = 0.0

        print(f"--- {model_name} Results ---")
        print(f"   > {model_name}: R2={r2:.4f}, MAE={mae:,.2f}, RMSE={rmse:,.2f}, WMAPE={wmape:.2f}%, Acc={accuracy:.2f}%")
        print("-" * 30)
        return rmse

    def evaluate_and_select_strategy(self, test_year_start=2024):
        """
        Score every trained model on rows with YEAR >= ``test_year_start`` and pick the strategy.

        Sets ``self.selected_model_keys`` to all models whose RMSE is within
        1.6x of the best RMSE (an ensemble), or to the best model alone when no
        other model qualifies. Without test rows it falls back to ['XGB'].
        """
        test_df = self.df[self.df['YEAR'] >= test_year_start]
        if test_df.empty:
            print("No test data available. Defaulting to XGB only.")
            self.selected_model_keys = ['XGB']
            return

        X_test = test_df[self.features]
        y_true = test_df['ENROLLMENT']

        performance = {}
        print(f"\n--- Evaluation Results (Test Data {test_year_start}+) ---")

        print("Evaluating all models...")
        for name, model in self.models.items():
            preds = model.predict(X_test)
            performance[name] = self.calculate_metrics(y_true, preds, name)

        best_model_name = min(performance, key=performance.get)
        best_rmse = performance[best_model_name]
        print(f"\nBEST MODEL: {best_model_name} (RMSE: {best_rmse:,.2f})")

        # Threshold: Best RMSE + 60% of Best RMSE
        threshold = best_rmse * 1.6
        candidates = [name for name, score in performance.items() if score <= threshold]

        print(f"Selection Threshold (RMSE <= {threshold:,.2f})")
        print(f"Qualifying Models: {candidates}")

        if len(candidates) > 1:
            self.selected_model_keys = candidates
            print(f"Strategy: ENSEMBLE (Average of {', '.join(candidates)})")
        else:
            self.selected_model_keys = [best_model_name]
            print(f"Strategy: SINGLE BEST MODEL ({best_model_name})")

    def _predict(self, X):
        """
        Return the element-wise mean of the selected models' predictions for ``X``.

        Raises:
            RuntimeError: If no model has been selected yet; averaging an empty
                list would otherwise yield NaN forecasts without any error.
        """
        if not self.selected_model_keys:
            raise RuntimeError("No models selected; call evaluate_and_select_strategy() first.")
        preds = [self.models[key].predict(X) for key in self.selected_model_keys]
        return np.mean(preds, axis=0)

    def recursive_forecast(self, start_year, end_year):
        """
        Forecast enrollment year by year from ``start_year`` to ``end_year``.

        The rows of ``start_year - 1`` seed the loop; each year's clamped
        prediction becomes the input of the following year.

        Returns:
            One DataFrame holding the forecast rows of all years.
        """
        print(f"\nStarting Recursive Forecast ({start_year}-{end_year}) using {self.selected_model_keys}...")
        future_data = []
        current_data = self.df[self.df['YEAR'] == (start_year - 1)].copy()

        for year in range(start_year, end_year + 1):
            next_df = self._prepare_next_step(current_data, year)

            X_future = next_df[self.features]
            preds = self._predict(X_future)

            # Enrollment counts cannot be negative.
            next_df['ENROLLMENT'] = np.maximum(0, preds)

            future_data.append(next_df)
            current_data = next_df.copy()
            print(f" > Forecasted {year}")

        return pd.concat(future_data, ignore_index=True)

    def _prepare_next_step(self, prev_df, target_year):
        """
        Build the feature frame for ``target_year`` from the previous year's rows.

        Copying ``prev_df`` forward-fills every column that is not rewritten
        here (REENTRY, DISABLED, TRUANCY, PREGNANCY, INDISCIPLINE etc. keep the
        previous year's values). ``prev_df`` itself is not modified.
        """
        id_cols = ['REGION', 'COUNCIL', 'SUBJECT', 'FORM_NUM']
        next_df = prev_df.copy()
        next_df['YEAR'] = target_year

        # Shift Time Series Lags (LAG_2 must be taken before LAG_1 is overwritten)
        next_df['LAG_2'] = next_df['LAG_1']
        next_df['LAG_1'] = next_df['ENROLLMENT']

        # Update Growth; the epsilon avoids division by zero
        next_df['YOY_GROWTH'] = (next_df['LAG_1'] - next_df['LAG_2']) / (next_df['LAG_2'] + 1e-5)

        # Update Cohort Logic: last year's enrollment of the form below, same
        # region/council/subject. -1 marks "no cohort" (Form 1, or no match).
        cohort_lookup = prev_df.set_index(id_cols)['ENROLLMENT'].to_dict()
        cohort_keys = zip(
            next_df['REGION'], next_df['COUNCIL'], next_df['SUBJECT'], next_df['FORM_NUM'] - 1
        )
        # A plain list (instead of a row-wise DataFrame.apply) is faster and
        # also works when prev_df has no rows.
        next_df['COHORT_LAG'] = [
            cohort_lookup.get(key, -1) if key[3] >= 1 else -1
            for key in cohort_keys
        ]
        next_df['IS_ELECTION_YEAR'] = 1 if target_year in self._ELECTION_YEARS else 0

        return next_df


# =============================================================================
# MAIN PIPELINE
# =============================================================================

def main():
    """Run the whole pipeline: load CSVs, clean locations, build features, train, forecast, print."""
    # 1. Configuration
    BASE_DIR = '/content/drive/MyDrive/GUIDELINES_TSC_JAN2026/Data Set/csvs/'
    GEO_FILE = '/content/drive/MyDrive/MOEST/tanzania_council_geodata.csv'
    EXCLUDE_KEYWORDS = ['Primary', 'Textbooks', 'Population', 'Teacher', 'COBET', 'Vocational']

    # Key expected by FeatureEngineer.merge_infrastructure -> name of the loaded
    # frame (the CSV file name without its extension).
    aux_sources = {
        'tables': "Data-Secondary Tables and chairs 2016-2025",
        'labs': "Combined_Secondary_Laboratories_All_G_NG",
        'dropout': "Dropout-Secondary  2017-2024",
        'reentry': "Secondary-Re_entry",
        'disability': "Secondary - DISABALITY 2020-2025",
        'ict': "Combined_Secondary_ICT_All_G_NG",
        'electricity': "Combined_Secondary_Electricity_All_G_NG",
    }

    # 2. Load Data
    data_loader = MOESTDataLoader(BASE_DIR, EXCLUDE_KEYWORDS)
    data_loader.mount_drive()
    data_loader.load_data()
    frames = data_loader.get_all_dataframes()

    # 3. Location & Clean Up (works in place on the shared `frames` dict)
    locations = LocationManager(frames, GEO_FILE)
    locations.standardize_location_columns()
    locations.merge_lga_status()

    # 4. Feature Engineering
    print("\n--- Starting Feature Engineering ---")

    subject_df = frames.get("Secondary_students_per_subject")
    if subject_df is None:
        raise ValueError("Critical DataFrame 'Secondary_students_per_subject' not found.")

    # Missing auxiliary frames stay None; merge_infrastructure skips those.
    aux_dfs = {label: frames.get(dataset) for label, dataset in aux_sources.items()}

    engineer = FeatureEngineer()
    features_df = engineer.create_cohort_features(
        engineer.create_lag_features(
            engineer.merge_infrastructure(engineer.melt_subjects(subject_df), aux_dfs)
        )
    )

    # 5. Modeling & Forecasting: train up to 2023, evaluate on 2024 onwards
    print("\n--- Starting Model Engine ---")
    engine = EnrollmentModelEngine(features_df)
    engine.preprocess()
    engine.train_all_models(cutoff_year=2023)
    engine.evaluate_and_select_strategy(test_year_start=2024)

    # 6. Generate Forecast
    forecast_df = engine.recursive_forecast(2026, 2030)

    # 7. Output
    print("\n--- Final Forecast Sample ---")
    report = forecast_df[['YEAR', 'REGION', 'COUNCIL', 'SUBJECT', 'FORM_NUM', 'ENROLLMENT']].copy()
    report['ENROLLMENT'] = report['ENROLLMENT'].round(0).astype(int)
    print(report.head(10))

# Run the pipeline only when this code is executed directly, not when it is imported.
if __name__ == "__main__":
    main()

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
Found 15 files to load.
Loaded: Data-Secondary Enrollment 2016-2025
Loaded: Dropout-Secondary  2017-2024
Loaded: Data-Secondary Tables and chairs 2016-2025
Loaded: Secondary-Re_entry
Loaded: Secondary - DISABALITY 2020-2025
Loaded: LGAs Urban and Rural Status
Loaded: Combined_Secondary_Laboratories_Govt
Loaded: Combined_Secondary_Laboratories_All_G_NG
Loaded: Combined_Secondary_ICT_All_G_NG
Loaded: Combined_Secondary_ICT_Govt
Loaded: Combined_Secondary_Electricity_All_G_NG
Loaded: Combined_Secondary_Electricity_Govt
Loaded: Secondary_students_per_subject
Loaded: Secondary_enrollment_Gov_2016_2025
Loaded: Data-Secondary Enrollment 2016-2025 (1)
Merged LGA Status into Data-Secondary Enrollment 2016-2025
Merged LGA Status into Dropout-Secondary  2017-2024
Merged LGA Status into Data-Secondary Tables and chairs 2016-2025
Merged LGA Status into Secondary-Re_entr

In [3]:
# -*- coding: utf-8 -*-
"""
Refactored MoEST Modeling for Secondary School Per Subjects
Object-Oriented Implementation
"""

import os
import re
import time
import numpy as np
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
from sklearn.cluster import KMeans
from sklearn.ensemble import (
    AdaBoostRegressor,
    HistGradientBoostingRegressor,
    GradientBoostingRegressor,
    RandomForestRegressor
)
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

# Note: In a pure script environment, Colab specific commands
# like drive.mount or !pip should ideally be external setup steps.
# They are included here in comments or wrapped methods to preserve logic.

class Config:
    """Central home for the pipeline's paths and naming constants."""

    # Input locations on the mounted Google Drive.
    BASE_DIRECTORY = '/content/drive/MyDrive/GUIDELINES_TSC_JAN2026/Data Set/csvs/'
    GEODATA_FILENAME = '/content/drive/MyDrive/MOEST/tanzania_council_geodata.csv'

    # Canonical column name -> header spellings accepted for that column.
    EXPECTED_COLS = {
        'Year': [
            'Year',
            'YEAR',
            'Academic Year',
        ],
        'Region': [
            'Region',
            'REGION',
            'REGON',
        ],
        'Council': [
            'Council',
            'COUNCIL',
            'DISTRICT',
            'LGA NAME',
        ],
    }

    # A CSV is skipped during loading when its file name contains any of these
    # keywords (the loader compares case-insensitively).
    EXCLUDE_KEYWORDS = [
        'Primary',
        'Textbooks',
        'Population',
        'COBET',
        'Vocational',
        'Enrollment',
    ]

class DataHandler:
    """Handles loading, initial cleaning, and merging of datasets.

    Every loaded CSV is kept in ``self.dataframes`` under its file name
    without the extension.
    """

    # Key under which the urban/rural classification table is loaded.
    LGA_STATUS_KEY = 'LGAs Urban and Rural Status'

    def __init__(self, base_dir):
        """Remember the directory to scan; nothing is read until ``load_data``."""
        self.base_dir = base_dir
        self.dataframes = {}

    @staticmethod
    def _find_column(df, candidates):
        """Return the first name in *candidates* that is a column of *df*, else None."""
        return next((name for name in candidates if name in df.columns), None)

    @staticmethod
    def _upper_preserving_nulls(series):
        """Upper-case *series* as text while leaving missing values missing.

        A plain ``astype(str)`` would turn NaN into the literal text ``'nan'``,
        which hides null locations from any later ``dropna``.
        """
        as_object = series.astype(object)
        return as_object.where(series.isna(), series.astype(str).str.upper())

    def load_data(self):
        """Scans directory and loads CSVs not matching exclude keywords."""
        try:
            all_files_in_dir = os.listdir(self.base_dir)
        except FileNotFoundError:
            print(f"Directory not found: {self.base_dir}")
            return

        excluded = [keyword.lower() for keyword in Config.EXCLUDE_KEYWORDS]
        filtered_files = [
            f for f in all_files_in_dir
            if f.endswith('.csv') and not any(keyword in f.lower() for keyword in excluded)
        ]
        print("Filtered files:", filtered_files)

        for file_name in filtered_files:
            file_path = os.path.join(self.base_dir, file_name)
            # splitext removes only the trailing extension; str.replace would
            # also strip a '.csv' that appears in the middle of a name.
            df_name = os.path.splitext(file_name)[0]
            try:
                self.dataframes[df_name] = pd.read_csv(file_path)
                print(f"Loaded {df_name} successfully.")
            except Exception as e:
                # Best effort: one unreadable file must not stop the others.
                print(f"Error loading {file_name}: {e}")

    def clean_data(self):
        """Performs numeric conversion and drops empty columns.

        NOTE: a text column is converted as soon as at least one of its values
        parses as a number; every value that does not parse becomes NaN.
        """
        for df_name, df in self.dataframes.items():
            print(f"\n--- Cleaning DataFrame: {df_name} ---")

            # 1. Convert 'object' columns that look numeric (digits, or
            #    thousands separators) to a numeric dtype.
            for col in df.select_dtypes(include='object').columns:
                as_text = df[col].astype(str)
                looks_numeric = (
                    as_text.str.contains(',').any()
                    or as_text.str.fullmatch(r'\d+\.?\d*').any()
                )
                if not looks_numeric:
                    continue
                cleaned_col = as_text.str.replace(',', '', regex=False).str.strip()
                converted_col = pd.to_numeric(cleaned_col, errors='coerce')
                if converted_col.notna().any():
                    df[col] = converted_col
                    print(f"  Converted column '{col}' to numeric type.")

            # 2. Drop 'Unnamed: N' columns that are more than 90% null.
            unnamed_cols = [col for col in df.columns if re.match(r'Unnamed: \d+', str(col))]
            cols_to_drop = [col for col in unnamed_cols if df[col].isnull().mean() * 100 > 90]

            if cols_to_drop:
                df.drop(columns=cols_to_drop, inplace=True)
                print(f"  Dropped columns: {', '.join(cols_to_drop)}")

    def check_discrepancies(self):
        """Checks for missing core columns or data type mismatches.

        Findings are only printed; no frame is modified.
        """
        all_discrepancies = []
        for df_name, df in self.dataframes.items():
            for conceptual_col, possible_names in Config.EXPECTED_COLS.items():
                found_col_name = self._find_column(df, possible_names)

                if not found_col_name:
                    all_discrepancies.append(
                        {'dataframe': df_name, 'column': conceptual_col, 'issue': 'Missing Column'})
                    continue

                current_dtype = df[found_col_name].dtype
                null_count = df[found_col_name].isnull().sum()

                if conceptual_col == 'Year' and not pd.api.types.is_numeric_dtype(current_dtype):
                    all_discrepancies.append(
                        {'dataframe': df_name, 'column': found_col_name, 'issue': 'Inconsistent Data Type'})
                elif conceptual_col in ['Region', 'Council'] and not pd.api.types.is_object_dtype(current_dtype):
                    all_discrepancies.append(
                        {'dataframe': df_name, 'column': found_col_name, 'issue': 'Inconsistent Data Type'})

                if null_count > 0:
                    all_discrepancies.append(
                        {'dataframe': df_name, 'column': found_col_name, 'issue': 'Null Values', 'count': null_count})

        if all_discrepancies:
            print("--- Discrepancies Found ---")
            for d in all_discrepancies:
                print(d)
        else:
            print("No discrepancies found for core columns.")

    def merge_lga_status(self):
        """Merges LGA Status into all other dataframes.

        Region and council names are upper-cased on both sides before the
        left join. The status table is removed from ``self.dataframes``
        afterwards, as is 'Pre-primary GER NA NER 2017-2025' when present.
        """
        if self.LGA_STATUS_KEY not in self.dataframes:
            print("LGA Status dataframe not found.")
            return

        df_lga_status = self.dataframes[self.LGA_STATUS_KEY].copy()
        if 'Remarks' in df_lga_status.columns:
            df_lga_status.drop(columns=['Remarks'], inplace=True)

        df_lga_status.rename(columns={'Classification': 'LGA_Status'}, inplace=True)
        df_lga_status['Region'] = df_lga_status['Region'].str.upper()
        df_lga_status['Council'] = df_lga_status['Council'].str.upper()
        # One status row per council: duplicate keys would multiply rows of the
        # left frame, and null keys would pair up with null locations.
        df_lga_status = (
            df_lga_status
            .dropna(subset=['Region', 'Council'])
            .drop_duplicates(subset=['Region', 'Council'])
        )

        for df_name, df in list(self.dataframes.items()):
            if df_name == self.LGA_STATUS_KEY:
                continue

            actual_region = self._find_column(df, Config.EXPECTED_COLS['Region'])
            actual_council = self._find_column(df, Config.EXPECTED_COLS['Council'])
            if not (actual_region and actual_council):
                continue

            # Keep NaN as NaN so drop_null_locations can still find it.
            df[actual_region] = self._upper_preserving_nulls(df[actual_region])
            df[actual_council] = self._upper_preserving_nulls(df[actual_council])

            # Give the status keys this frame's own header spelling so the
            # join adds no second pair of key columns (e.g. 'Region' next to
            # 'REGION').
            status = df_lga_status.rename(
                columns={'Region': actual_region, 'Council': actual_council})
            try:
                merged_df = pd.merge(df, status,
                                     on=[actual_region, actual_council],
                                     how='left', suffixes=('', '_LGA'))
            except Exception as e:
                print(f"  Error merging {df_name}: {e}")
                continue

            self.dataframes[df_name] = merged_df
            print(f"  Merged LGA_Status into {df_name}")

        # Cleanup
        del self.dataframes[self.LGA_STATUS_KEY]
        self.dataframes.pop('Pre-primary GER NA NER 2017-2025', None)

    def drop_null_locations(self):
        """Drops rows where Region or Council is null (in place)."""
        for df in self.dataframes.values():
            actual_region = self._find_column(df, Config.EXPECTED_COLS['Region'])
            actual_council = self._find_column(df, Config.EXPECTED_COLS['Council'])

            if actual_region and actual_council:
                df.dropna(subset=[actual_region, actual_council], inplace=True)

    def get_dataframe(self, name):
        """Return the frame stored under *name*, or None when it is not loaded."""
        return self.dataframes.get(name)


class GeoProcessor:
    """Handles Geocoding and Clustering."""

    # Council-type abbreviations expanded before a name is sent to the geocoder.
    _COUNCIL_SUFFIXES = {
        'MC': 'Municipal Council',
        'DC': 'District Council',
        'TC': 'Town Council',
        'CC': 'City Council',
    }
    # An abbreviation counts only as a whole word that follows a space.
    _COUNCIL_SUFFIX_RE = re.compile(r'(?<= )(MC|DC|TC|CC)\b')

    def __init__(self, data_handler):
        """Keep the data handler whose frames receive the ``Geo_Cluster`` column."""
        self.data_handler = data_handler
        self.geo_data = None

    @classmethod
    def _clean_council_name(cls, name):
        """Expand council-type abbreviations (MC, DC, TC, CC) in *name*.

        Only whole words are expanded, so a word that merely starts with one
        of the abbreviations is left untouched.
        """
        return cls._COUNCIL_SUFFIX_RE.sub(
            lambda match: cls._COUNCIL_SUFFIXES[match.group(1)], str(name))

    def process_geodata(self):
        """Loads or generates geodata, then performs clustering."""
        if os.path.exists(Config.GEODATA_FILENAME):
            print(f"Loading geodata from {Config.GEODATA_FILENAME}")
            self.geo_data = pd.read_csv(Config.GEODATA_FILENAME)
        else:
            self._generate_geodata()

        self._cluster_geodata()
        self._merge_geodata_to_dataframes()

    def _generate_geodata(self):
        """Geocode every distinct (Region, Council) pair and cache the result.

        The pairs are collected from all frames of the data handler, geocoded
        through a rate-limited Nominatim client, stored in ``self.geo_data``
        and written to ``Config.GEODATA_FILENAME``.
        """
        print("Generating new Geodata...")
        location_list = []
        for df in self.data_handler.dataframes.values():
            region_col = next(
                (col for col in df.columns if str(col).lower() in ('region', 'regon')), None)
            council_col = next(
                (col for col in df.columns if str(col).lower() in ('council', 'district', 'lga name')), None)

            if not (region_col and council_col):
                continue

            # Drop null locations first: astype(str) would turn them into the
            # text 'nan' and they would be geocoded as if they were places.
            subset = df[[region_col, council_col]].dropna().astype(str)
            subset = subset.rename(columns={region_col: 'Region', council_col: 'Council'})
            subset['Region'] = subset['Region'].str.upper()
            subset['Council'] = subset['Council'].str.upper()
            location_list.append(subset.drop_duplicates())

        if location_list:
            self.geo_data = pd.concat(location_list).drop_duplicates().reset_index(drop=True)
        else:
            # pd.concat raises on an empty list; continue with no locations.
            self.geo_data = pd.DataFrame(columns=['Region', 'Council'])

        self.geo_data['Search_Query'] = (
            self.geo_data['Council'].apply(self._clean_council_name) + ", " +
            self.geo_data['Region'] + ", Tanzania"
        )

        geolocator = Nominatim(user_agent="tanzania_education_mapping_project_refactored")
        geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1.5)

        print("Starting geocoding (this may take time)...")
        lats, longs = [], []
        for query in self.geo_data['Search_Query']:
            try:
                location = geocode(query)
            except Exception as e:
                # Best effort: a failed lookup leaves that council without coordinates.
                print(f"Error {query}: {e}")
                location = None
            lats.append(location.latitude if location else None)
            longs.append(location.longitude if location else None)

        self.geo_data['Latitude'] = lats
        self.geo_data['Longitude'] = longs
        # .copy() so the later in-place edits do not act on a slice.
        self.geo_data = self.geo_data[['Region', 'Council', 'Latitude', 'Longitude']].copy()
        try:
            self.geo_data.to_csv(Config.GEODATA_FILENAME, index=False)
        except OSError as e:
            # Keep the geocoding results in memory even if they cannot be cached.
            print(f"Could not save geodata to {Config.GEODATA_FILENAME}: {e}")

    def _cluster_geodata(self, n_clusters=5):
        """Assign a K-Means ``Geo_Cluster`` label to every geocoded council.

        Rows without usable coordinates are dropped from ``self.geo_data``.

        Args:
            n_clusters: Upper bound on the number of clusters; fewer are used
                when fewer councils have coordinates.
        """
        print("Clustering Geodata...")
        for axis_col in ('Latitude', 'Longitude'):
            self.geo_data[axis_col] = pd.to_numeric(self.geo_data[axis_col], errors='coerce')
        self.geo_data.dropna(subset=['Latitude', 'Longitude'], inplace=True)

        coords = self.geo_data[['Latitude', 'Longitude']]
        if coords.empty:
            # Nothing to cluster; still provide the column the merge step selects.
            self.geo_data['Geo_Cluster'] = pd.Series(dtype='int64')
            print("No geocoded councils available; skipping clustering.")
            return

        # KMeans refuses more clusters than samples.
        kmeans = KMeans(n_clusters=min(n_clusters, len(coords)), random_state=42, n_init=10)
        self.geo_data['Geo_Cluster'] = kmeans.fit_predict(coords)
        print("Clustering complete.")

    def _merge_geodata_to_dataframes(self):
        """Left-join ``Geo_Cluster`` onto every frame that has region and council columns."""
        print("Merging Geo_Cluster into main dataframes...")
        try:
            clusters = self.geo_data[['Region', 'Council', 'Geo_Cluster']]
        except KeyError as e:
            print(f"Error merging Geo: {e}")
            return
        # One cluster per council so the join cannot multiply rows.
        clusters = clusters.drop_duplicates(subset=['Region', 'Council'])

        for df_name, df in list(self.data_handler.dataframes.items()):
            actual_region = next((n for n in Config.EXPECTED_COLS['Region'] if n in df.columns), None)
            actual_council = next((n for n in Config.EXPECTED_COLS['Council'] if n in df.columns), None)
            if not (actual_region and actual_council):
                continue

            df[actual_region] = df[actual_region].astype(str).str.upper()
            df[actual_council] = df[actual_council].astype(str).str.upper()

            # Join on the frame's own header spelling so no second pair of
            # key columns is added to it.
            right = clusters.rename(columns={'Region': actual_region, 'Council': actual_council})
            try:
                merged_df = pd.merge(df, right,
                                     on=[actual_region, actual_council],
                                     how='left', suffixes=('', '_geo'))
            except Exception as e:
                print(f"Error merging Geo for {df_name}: {e}")
                continue

            self.data_handler.dataframes[df_name] = merged_df


class SubjectModeler:
    """Handles Feature Engineering, Training, and Forecasting for Subjects."""

    # Years flagged by the IS_ELECTION_YEAR feature; shared by the training
    # and the forecasting feature builders so both encode it the same way.
    ELECTION_YEARS = (2020, 2025, 2030)

    def __init__(self, data_handler):
        """Keep the data handler; the models stay None until ``train_models``."""
        self.dh = data_handler
        self.xgb_model = None
        self.lgb_model = None

    @staticmethod
    def standard_cols(df):
        """Return a copy of *df* with stripped, upper-cased, de-duplicated headers.

        ``None`` (a dataset that was not loaded) yields an empty frame. When
        several columns end up with the same name the first one is kept.
        """
        if df is None:
            return pd.DataFrame()
        df = df.copy()
        df.columns = [str(c).strip().upper() for c in df.columns]
        return df.loc[:, ~df.columns.duplicated()]

    def prepare_data(self):
        """Standardizes and reshapes subject data to long format.

        Returns:
            One row per (YEAR, REGION, COUNCIL, SUBJECT, FORM_NUM) with the
            columns ENROLLMENT, AVAILABLE_TABLES and TOTAL_LABS, sorted so
            that each series is in chronological order.

        Raises:
            ValueError: If the subject table is missing or has no
                'FORM <n> - <subject>' columns.
        """
        enroll = self.standard_cols(self.dh.get_dataframe("Secondary_students_per_subject"))
        tables = self.standard_cols(self.dh.get_dataframe("Data-Secondary Tables and chairs 2016-2025"))
        labs = self.standard_cols(self.dh.get_dataframe("Combined_Secondary_Laboratories_All_G_NG"))

        # Melt Subject Columns
        id_vars = [c for c in ['YEAR', 'REGION', 'COUNCIL'] if c in enroll.columns]
        subject_cols = [c for c in enroll.columns if 'FORM ' in c and ' - ' in c]
        if not subject_cols:
            raise ValueError(
                "Secondary_students_per_subject is missing or has no 'FORM <n> - <subject>' columns.")

        print(f"Melting {len(subject_cols)} subject columns...")
        long_df = enroll.melt(id_vars=id_vars, value_vars=subject_cols,
                              var_name='RAW', value_name='ENROLLMENT')

        # Header layout is 'FORM <digit> - <subject>'.
        long_df['FORM_NUM'] = long_df['RAW'].str.extract(r'FORM (\d)', expand=False).astype(int)
        long_df['SUBJECT'] = long_df['RAW'].str.split(' - ').str[1].str.strip()
        long_df.drop(columns=['RAW'], inplace=True)

        # Merge Council Features
        keys = ['YEAR', 'REGION', 'COUNCIL']

        # NOTE: fillna(0) is applied to the whole frame, so a missing
        # ENROLLMENT value also becomes 0, not only the merged feature.
        if 'AVAILABLE_TABLES' in tables.columns:
            long_df = long_df.merge(tables[keys + ['AVAILABLE_TABLES']], on=keys, how='left').fillna(0)
        else:
            long_df['AVAILABLE_TABLES'] = 0

        lab_cols = [c for c in labs.columns if 'LABORATORY' in c]
        if lab_cols:
            labs['TOTAL_LABS'] = labs[lab_cols].sum(axis=1)
            long_df = long_df.merge(labs[keys + ['TOTAL_LABS']], on=keys, how='left').fillna(0)
        else:
            long_df['TOTAL_LABS'] = 0

        return long_df.sort_values(['REGION', 'COUNCIL', 'SUBJECT', 'FORM_NUM', 'YEAR'])

    def engineer_features(self, df):
        """Adds Lags, Growth, and Cohort logic.

        *df* must already be sorted by year within each
        (REGION, COUNCIL, SUBJECT, FORM_NUM) series, because the lags are
        positional shifts. The feature columns are added to *df* in place;
        the returned frame additionally has every missing value set to -1.
        """
        g = df.groupby(['REGION', 'COUNCIL', 'SUBJECT', 'FORM_NUM'])
        df['LAG_1'] = g['ENROLLMENT'].shift(1)
        df['LAG_2'] = g['ENROLLMENT'].shift(2)
        # The small constant avoids a division by zero when LAG_1 is 0.
        df['YOY_GROWTH'] = (df['ENROLLMENT'] - df['LAG_1']) / (df['LAG_1'] + 1e-5)

        # Cohort Flow: enrollment of the previous form in the previous year
        # for the same council and subject.
        df['PREV_YEAR'] = df['YEAR'] - 1
        df['PREV_FORM'] = df['FORM_NUM'] - 1

        df['LOOKUP_KEY'] = (df['YEAR'].astype(str) + '_' + df['REGION'] + '_' +
                            df['COUNCIL'] + '_' + df['SUBJECT'] + '_' + df['FORM_NUM'].astype(str))

        lookup = df.groupby('LOOKUP_KEY')['ENROLLMENT'].sum().to_dict()

        df['SEARCH_KEY'] = (df['PREV_YEAR'].astype(str) + '_' + df['REGION'] + '_' +
                            df['COUNCIL'] + '_' + df['SUBJECT'] + '_' + df['PREV_FORM'].astype(str))

        # -1 marks "no earlier cohort found".
        df['COHORT_LAG'] = df['SEARCH_KEY'].map(lookup).fillna(-1)

        df.drop(columns=['PREV_YEAR', 'PREV_FORM', 'LOOKUP_KEY', 'SEARCH_KEY'], inplace=True)
        df['IS_ELECTION_YEAR'] = df['YEAR'].isin(self.ELECTION_YEARS).astype(int)

        return df.fillna(-1)

    # ==========================================
    # 3. Model Training (Updated for Subject)
    # ==========================================
    @staticmethod
    def calculate_metrics(y_true, y_pred, model_name):
        """Print and return MAE, RMSE, R^2 and MAPE for one model.

        Args:
            y_true: Observed values.
            y_pred: Predicted values, aligned with *y_true* by position.
            model_name: Label used in the printed report.

        Returns:
            Dict with the keys 'MAE', 'RMSE', 'R2' and 'MAPE'. MAPE is computed
            over the non-zero targets only and is NaN when there are none.
        """
        # Plain arrays make the boolean masking below positional.
        y_true = np.asarray(y_true, dtype=float)
        y_pred = np.asarray(y_pred, dtype=float)

        mae = mean_absolute_error(y_true, y_pred)
        rmse = np.sqrt(mean_squared_error(y_true, y_pred))
        r2 = r2_score(y_true, y_pred)

        # Handle division by zero for MAPE
        mask = y_true != 0
        if mask.sum() > 0:
            mape = np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])) * 100
        else:
            mape = np.nan

        print(f"--- {model_name} Results ---")
        print(f"MAE  : {mae:,.2f}")
        print(f"RMSE : {rmse:,.2f}")
        print(f"R^2  : {r2:.4f}")
        print(f"MAPE : {mape:.2f}% Accuracy :{(100-mape) if not np.isnan(mape) else 0:.2f}%")
        print("-" * 30)
        return {'MAE': mae, 'RMSE': rmse, 'R2': r2, 'MAPE': mape}

    def train_models(self, df):
        """Trains XGBoost and LightGBM models.

        Adds the label-encoded columns REGION_ENC, COUNCIL_ENC and SUBJECT_ENC
        to *df* in place and fits both models on every row with YEAR <= 2025.

        Returns:
            Tuple ``(df, feats)``: the frame with the encoded columns and the
            list of feature names the models were fitted on.
        """
        for source_col in ('REGION', 'COUNCIL', 'SUBJECT'):
            df[f'{source_col}_ENC'] = LabelEncoder().fit_transform(df[source_col].astype(str))

        feats = [
            'YEAR', 'REGION_ENC', 'COUNCIL_ENC', 'SUBJECT_ENC', 'FORM_NUM',
            'AVAILABLE_TABLES', 'TOTAL_LABS', 'LAG_1', 'LAG_2',
            'YOY_GROWTH', 'COHORT_LAG', 'IS_ELECTION_YEAR'
        ]

        train_df = df[df['YEAR'] <= 2025].copy()
        X = train_df[feats]
        y = train_df['ENROLLMENT']

        print(f"Training Models on {len(train_df)} rows...")

        self.xgb_model = xgb.XGBRegressor(n_estimators=300, max_depth=9, learning_rate=0.05, n_jobs=-1)
        self.xgb_model.fit(X, y)

        self.lgb_model = lgb.LGBMRegressor(n_estimators=200, num_leaves=60, verbose=-1)
        self.lgb_model.fit(X, y)

        return df, feats

    def _create_next_year_features(self, last_year_df, current_year):
        """Build the feature rows for *current_year* from the previous year's rows.

        Lags are rolled forward by one year, COHORT_LAG is taken from the
        previous form's enrollment in *last_year_df* (-1 when there is none),
        and ENROLLMENT is reset to NaN for the caller to fill in.
        """
        next_df = last_year_df.copy()
        next_df['YEAR'] = current_year

        next_df['LAG_2'] = next_df['LAG_1']
        next_df['LAG_1'] = next_df['ENROLLMENT']
        next_df['YOY_GROWTH'] = (next_df['LAG_1'] - next_df['LAG_2']) / (next_df['LAG_2'] + 1e-5)

        cohort_lookup = last_year_df.set_index(
            ['REGION', 'COUNCIL', 'SUBJECT', 'FORM_NUM'])['ENROLLMENT'].to_dict()

        # Plain iteration instead of a row-wise DataFrame.apply: it also works
        # on an empty frame and avoids building a Series per row.
        previous_form_keys = zip(next_df['REGION'], next_df['COUNCIL'],
                                 next_df['SUBJECT'], next_df['FORM_NUM'] - 1)
        next_df['COHORT_LAG'] = [
            cohort_lookup.get(key, -1) if key[3] >= 1 else -1
            for key in previous_form_keys
        ]
        next_df['IS_ELECTION_YEAR'] = 1 if current_year in self.ELECTION_YEARS else 0
        next_df['ENROLLMENT'] = np.nan
        return next_df

    def forecast(self, df, feats, start_year=2026, end_year=2030):
        """Recursive forecasting from *start_year* to *end_year* (default 2026-2030).

        Each year is predicted from the previous year's rows (observed for the
        first step, predicted afterwards) as the mean of both models, floored
        at zero.

        Args:
            df: Frame returned by ``train_models``.
            feats: Feature names returned by ``train_models``.
            start_year: First year to forecast; ``start_year - 1`` must be in *df*.
            end_year: Last year to forecast (inclusive).

        Returns:
            Frame with YEAR, REGION, COUNCIL, SUBJECT, FORM_NUM and the integer
            ENROLLMENT forecast.

        Raises:
            RuntimeError: If the models have not been trained.
            ValueError: If *df* has no rows for the base year or the year
                range is empty.
        """
        if self.xgb_model is None or self.lgb_model is None:
            raise RuntimeError("train_models must be called before forecast.")
        if end_year < start_year:
            raise ValueError("end_year must not be earlier than start_year.")

        print(f"Recursive Forecasting ({start_year}-{end_year})...")
        base_year = start_year - 1
        current_sim = df[df['YEAR'] == base_year].copy()
        if current_sim.empty:
            raise ValueError(f"No rows for base year {base_year}; cannot forecast.")

        future_preds = []
        for year in range(start_year, end_year + 1):
            next_step = self._create_next_year_features(current_sim, year)
            X_future = next_step[feats]

            pred_xgb = self.xgb_model.predict(X_future)
            pred_lgb = self.lgb_model.predict(X_future)

            # Average the two models; enrollment cannot be negative.
            next_step['ENROLLMENT'] = ((pred_xgb + pred_lgb) / 2).clip(min=0)

            future_preds.append(next_step)
            current_sim = next_step.copy()
            print(f" > {year} Forecast Complete.")

        forecast_df = pd.concat(future_preds, ignore_index=True)
        output_cols = ['YEAR', 'REGION', 'COUNCIL', 'SUBJECT', 'FORM_NUM', 'ENROLLMENT']
        final_output = forecast_df[output_cols].copy()
        final_output['ENROLLMENT'] = final_output['ENROLLMENT'].round(0).astype(int)
        return final_output

class Pipeline:
    """Main Orchestrator: wires the handler, geo processor and modeler together."""

    def __init__(self):
        # Google Drive is expected to be mounted already; the Colab-specific
        # drive.mount('/content/drive/') call is deliberately not made here.
        handler = DataHandler(Config.BASE_DIRECTORY)
        self.data_handler = handler
        self.geo_processor = GeoProcessor(handler)
        self.modeler = SubjectModeler(handler)

    def run(self):
        """Execute every stage in order and return the subject forecast frame."""
        # 1. Data Loading & Cleaning
        handler = self.data_handler
        preparation_steps = (
            handler.load_data,
            handler.clean_data,
            handler.check_discrepancies,
            handler.merge_lga_status,
            handler.drop_null_locations,
        )
        for step in preparation_steps:
            step()

        # 2. Geo Processing
        self.geo_processor.process_geodata()

        # 3. Subject Modeling
        print("\n=== Starting Subject Modeling Pipeline ===")
        modeler = self.modeler
        long_frame = modeler.prepare_data()
        feature_frame = modeler.engineer_features(long_frame)
        encoded_frame, feature_names = modeler.train_models(feature_frame)
        forecast_results = modeler.forecast(encoded_frame, feature_names)

        print("\n--- Sample Subject Forecast (2026-2030) ---")
        print(forecast_results.head())
        return forecast_results

# Entry point: build the pipeline and run it only when this code is executed
# directly, not when it is imported. Both results stay bound at module level.
if __name__ == "__main__":
    pipeline = Pipeline()
    forecast = pipeline.run()

Filtered files: ['Dropout-Secondary  2017-2024.csv', 'Data-Secondary Tables and chairs 2016-2025.csv', 'Secondary-Re_entry.csv', 'Secondary - DISABALITY 2020-2025.csv', 'LGAs Urban and Rural Status.csv', 'Combined_Secondary_Laboratories_Govt.csv', 'Combined_Secondary_Laboratories_All_G_NG.csv', 'Combined_Secondary_ICT_All_G_NG.csv', 'Combined_Secondary_ICT_Govt.csv', 'Combined_Secondary_Electricity_All_G_NG.csv', 'Combined_Secondary_Electricity_Govt.csv', 'Secondary_students_per_subject.csv']
Loaded Dropout-Secondary  2017-2024 successfully.
Loaded Data-Secondary Tables and chairs 2016-2025 successfully.
Loaded Secondary-Re_entry successfully.
Loaded Secondary - DISABALITY 2020-2025 successfully.
Loaded LGAs Urban and Rural Status successfully.
Loaded Combined_Secondary_Laboratories_Govt successfully.
Loaded Combined_Secondary_Laboratories_All_G_NG successfully.
Loaded Combined_Secondary_ICT_All_G_NG successfully.
Loaded Combined_Secondary_ICT_Govt successfully.
Loaded Combined_Seconda

In [None]:
# -*- coding: utf-8 -*-
"""
Refactored MoEST Modeling: OO Implementation with Individual Model Evaluation
"""

import os
import re
import numpy as np
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
from sklearn.cluster import KMeans
from sklearn.ensemble import (
    GradientBoostingRegressor,
    RandomForestRegressor,
    HistGradientBoostingRegressor
)
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import LabelEncoder
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

class Config:
    """Shared paths and column-name constants used by the pipeline classes."""

    # Directory that is scanned for the input CSV files.
    BASE_DIRECTORY = '/content/drive/MyDrive/GUIDELINES_TSC_JAN2026/Data Set/csvs/'
    # CSV from which the council geodata is loaded.
    GEODATA_FILENAME = '/content/drive/MyDrive/MOEST/tanzania_council_geodata.csv'

    # Conceptual column -> header spellings accepted for it, in lookup order.
    EXPECTED_COLS = dict(
        Year=['Year', 'YEAR', 'Academic Year'],
        Region=['Region', 'REGION', 'REGON'],
        Council=['Council', 'COUNCIL', 'DISTRICT', 'LGA NAME'],
    )

    # A CSV whose file name contains any of these (case-insensitively) is skipped.
    EXCLUDE_KEYWORDS = [
        'Primary', 'Textbooks', 'Population',
        'COBET', 'Vocational', 'Enrollment',
    ]

class DataHandler:
    """Loads the input CSVs, cleans them and merges the LGA status into them.

    Every loaded CSV is kept in ``self.dataframes`` under its file name
    without the extension.
    """

    # Key under which the urban/rural classification table is loaded.
    LGA_STATUS_KEY = 'LGAs Urban and Rural Status'

    def __init__(self, base_dir):
        """Remember the directory to scan; nothing is read until ``load_data``."""
        self.base_dir = base_dir
        self.dataframes = {}

    @staticmethod
    def _find_column(df, candidates):
        """Return the first name in *candidates* that is a column of *df*, else None."""
        return next((name for name in candidates if name in df.columns), None)

    @staticmethod
    def _upper_preserving_nulls(series):
        """Upper-case *series* as text while leaving missing values missing.

        A plain ``astype(str)`` would turn NaN into the literal text ``'nan'``,
        which hides null locations from any later ``dropna``.
        """
        as_object = series.astype(object)
        return as_object.where(series.isna(), series.astype(str).str.upper())

    def load_data(self):
        """Load every CSV in the base directory whose name has no excluded keyword."""
        try:
            all_files_in_dir = os.listdir(self.base_dir)
        except FileNotFoundError:
            print(f"Directory not found: {self.base_dir}")
            return

        excluded = [keyword.lower() for keyword in Config.EXCLUDE_KEYWORDS]
        filtered_files = [
            f for f in all_files_in_dir
            if f.endswith('.csv') and not any(keyword in f.lower() for keyword in excluded)
        ]

        for file_name in filtered_files:
            file_path = os.path.join(self.base_dir, file_name)
            # splitext removes only the trailing extension; str.replace would
            # also strip a '.csv' that appears in the middle of a name.
            df_name = os.path.splitext(file_name)[0]
            try:
                self.dataframes[df_name] = pd.read_csv(file_path)
                print(f"Loaded {df_name}.")
            except Exception as e:
                # Best effort: one unreadable file must not stop the others.
                print(f"Error loading {file_name}: {e}")

    def clean_data(self):
        """Convert numeric-looking text columns and drop mostly-empty 'Unnamed' columns.

        NOTE: a text column is converted as soon as at least one of its values
        parses as a number; every value that does not parse becomes NaN.
        """
        for df in self.dataframes.values():
            for col in df.select_dtypes(include='object').columns:
                as_text = df[col].astype(str)
                looks_numeric = (
                    as_text.str.contains(',').any()
                    or as_text.str.fullmatch(r'\d+\.?\d*').any()
                )
                if not looks_numeric:
                    continue
                cleaned_col = as_text.str.replace(',', '', regex=False).str.strip()
                converted_col = pd.to_numeric(cleaned_col, errors='coerce')
                if converted_col.notna().any():
                    df[col] = converted_col

            # 'Unnamed: N' columns that are more than 90% null are dropped.
            unnamed_cols = [col for col in df.columns if re.match(r'Unnamed: \d+', str(col))]
            cols_to_drop = [col for col in unnamed_cols if df[col].isnull().mean() * 100 > 90]
            if cols_to_drop:
                df.drop(columns=cols_to_drop, inplace=True)

    def merge_lga_status(self):
        """Left-join the LGA status table onto every other frame.

        Region and council names are upper-cased on both sides before the
        join. The status table is removed from ``self.dataframes`` afterwards,
        as is 'Pre-primary GER NA NER 2017-2025' when present. Does nothing
        when the status table was not loaded.
        """
        if self.LGA_STATUS_KEY not in self.dataframes:
            return

        df_lga_status = self.dataframes[self.LGA_STATUS_KEY].copy()
        if 'Remarks' in df_lga_status.columns:
            df_lga_status.drop(columns=['Remarks'], inplace=True)

        df_lga_status.rename(columns={'Classification': 'LGA_Status'}, inplace=True)
        df_lga_status['Region'] = df_lga_status['Region'].str.upper()
        df_lga_status['Council'] = df_lga_status['Council'].str.upper()
        # One status row per council: duplicate keys would multiply rows of the
        # left frame, and null keys would pair up with null locations.
        df_lga_status = (
            df_lga_status
            .dropna(subset=['Region', 'Council'])
            .drop_duplicates(subset=['Region', 'Council'])
        )

        for df_name, df in list(self.dataframes.items()):
            if df_name == self.LGA_STATUS_KEY:
                continue

            actual_region = self._find_column(df, Config.EXPECTED_COLS['Region'])
            actual_council = self._find_column(df, Config.EXPECTED_COLS['Council'])
            if not (actual_region and actual_council):
                continue

            # Keep NaN as NaN so drop_null_locations can still find it.
            df[actual_region] = self._upper_preserving_nulls(df[actual_region])
            df[actual_council] = self._upper_preserving_nulls(df[actual_council])

            # Give the status keys this frame's own header spelling so the
            # join adds no second pair of key columns (e.g. 'Region' next to
            # 'REGION').
            status = df_lga_status.rename(
                columns={'Region': actual_region, 'Council': actual_council})
            try:
                merged_df = pd.merge(df, status,
                                     on=[actual_region, actual_council],
                                     how='left', suffixes=('', '_LGA'))
            except Exception as e:
                # Still best effort, but a frame left without LGA_Status is reported.
                print(f"Error merging LGA status into {df_name}: {e}")
                continue

            self.dataframes[df_name] = merged_df

        del self.dataframes[self.LGA_STATUS_KEY]
        self.dataframes.pop('Pre-primary GER NA NER 2017-2025', None)

    def drop_null_locations(self):
        """Drop, in place, the rows whose region or council is null."""
        for df in self.dataframes.values():
            actual_region = self._find_column(df, Config.EXPECTED_COLS['Region'])
            actual_council = self._find_column(df, Config.EXPECTED_COLS['Council'])
            if actual_region and actual_council:
                df.dropna(subset=[actual_region, actual_council], inplace=True)

    def get_dataframe(self, name):
        """Return the frame stored under *name*, or None when it is not loaded."""
        return self.dataframes.get(name)

class GeoProcessor:
    """Clusters cached council coordinates and merges the cluster into every frame."""

    def __init__(self, data_handler):
        """Keep the data handler whose frames receive the ``Geo_Cluster`` column."""
        self.data_handler = data_handler
        self.geo_data = None

    def process_geodata(self):
        """Load the cached geodata if it exists, then cluster and merge it."""
        if os.path.exists(Config.GEODATA_FILENAME):
            print(f"Loading geodata from {Config.GEODATA_FILENAME}")
            self.geo_data = pd.read_csv(Config.GEODATA_FILENAME)
        else:
            self._generate_geodata()

        self._cluster_geodata()
        self._merge_geodata_to_dataframes()

    def _generate_geodata(self):
        """Placeholder: geodata is not generated here, so ``geo_data`` stays None."""
        # Say so instead of silently producing frames without Geo_Cluster.
        print(f"Geodata file not found: {Config.GEODATA_FILENAME}; skipping geo features.")

    def _cluster_geodata(self, n_clusters=5):
        """Assign a K-Means ``Geo_Cluster`` label to every council with coordinates.

        Rows without usable coordinates are dropped from ``self.geo_data``.
        Does nothing when no geodata is loaded.

        Args:
            n_clusters: Upper bound on the number of clusters; fewer are used
                when fewer councils have coordinates.
        """
        if self.geo_data is None:
            return
        for axis_col in ('Latitude', 'Longitude'):
            self.geo_data[axis_col] = pd.to_numeric(self.geo_data[axis_col], errors='coerce')
        self.geo_data.dropna(subset=['Latitude', 'Longitude'], inplace=True)

        coords = self.geo_data[['Latitude', 'Longitude']]
        if coords.empty:
            # Nothing to cluster; still provide the column the merge step selects.
            self.geo_data['Geo_Cluster'] = pd.Series(dtype='int64')
            return

        # KMeans refuses more clusters than samples.
        kmeans = KMeans(n_clusters=min(n_clusters, len(coords)), random_state=42, n_init=10)
        self.geo_data['Geo_Cluster'] = kmeans.fit_predict(coords)

    def _merge_geodata_to_dataframes(self):
        """Left-join ``Geo_Cluster`` onto every frame that has region and council columns."""
        if self.geo_data is None:
            return
        try:
            clusters = self.geo_data[['Region', 'Council', 'Geo_Cluster']]
        except KeyError as e:
            print(f"Error merging Geo: {e}")
            return
        # One cluster per council so the join cannot multiply rows.
        clusters = clusters.drop_duplicates(subset=['Region', 'Council'])

        for df_name, df in list(self.data_handler.dataframes.items()):
            actual_region = next((n for n in Config.EXPECTED_COLS['Region'] if n in df.columns), None)
            actual_council = next((n for n in Config.EXPECTED_COLS['Council'] if n in df.columns), None)
            if not (actual_region and actual_council):
                continue

            df[actual_region] = df[actual_region].astype(str).str.upper()
            df[actual_council] = df[actual_council].astype(str).str.upper()

            # Join on the frame's own header spelling so no second pair of
            # key columns is added to it.
            right = clusters.rename(columns={'Region': actual_region, 'Council': actual_council})
            try:
                merged_df = pd.merge(df, right,
                                     on=[actual_region, actual_council],
                                     how='left', suffixes=('', '_geo'))
            except Exception as e:
                # Still best effort, but a frame left without Geo_Cluster is reported.
                print(f"Error merging Geo for {df_name}: {e}")
                continue

            self.data_handler.dataframes[df_name] = merged_df

class SubjectModeler:
    """Build, enrich and back-test a per-subject secondary enrollment dataset.

    The workflow is ``prepare_data`` (wide enrollment table to long format
    plus council-level context features), ``engineer_features`` (lags and
    cohort features) and ``evaluate_years`` (train on early years, score on
    held-out years).

    The data handler must expose ``get_dataframe(name)``; a ``None`` result is
    treated as an empty table.
    """

    # Join keys shared by the enrollment table and every auxiliary table.
    _KEYS = ['YEAR', 'REGION', 'COUNCIL']
    # One enrollment time series exists per combination of these columns.
    _SERIES_KEYS = ['REGION', 'COUNCIL', 'SUBJECT', 'FORM_NUM']
    _DROPOUT_COLS = ['TRUANCY', 'PREGNANCY', 'INDISCIPLINE']
    _FEATURES = [
        'YEAR', 'REGION_ENC', 'COUNCIL_ENC', 'SUBJECT_ENC', 'FORM_NUM',
        'AVAILABLE_TABLES', 'TOTAL_REENTRY', 'TOTAL_DISABLED',
        'TOTAL_COMPUTERS', 'ELEC_GRID_PCT', 'TOTAL_LABS',
        'TRUANCY', 'PREGNANCY', 'INDISCIPLINE',
        'LAG_1', 'LAG_2', 'YOY_GROWTH', 'COHORT_LAG', 'IS_ELECTION_YEAR'
    ]

    def __init__(self, data_handler):
        self.dh = data_handler

    @staticmethod
    def standard_cols(df):
        """Return a copy of ``df`` with stripped, upper-cased, de-duplicated column names.

        ``None`` yields an empty DataFrame. When several columns collapse to
        the same upper-case name, only the first one is kept.
        """
        if df is None:
            return pd.DataFrame()
        df = df.copy()
        df.columns = [str(c).strip().upper() for c in df.columns]
        return df.loc[:, ~df.columns.duplicated()]

    @classmethod
    def _attach_features(cls, long_df, source, columns, agg='sum'):
        """Left-merge ``columns`` from ``source`` onto ``long_df`` by YEAR/REGION/COUNCIL.

        ``source`` is first reduced to one row per key with ``agg`` so that
        repeated keys cannot multiply the enrollment rows. Columns that are
        absent from ``source`` (or cannot be merged because ``source`` lacks
        the key columns) are created as 0, and unmatched rows are filled
        with 0. Only the attached columns are filled.
        """
        keys = cls._KEYS
        usable = [c for c in columns if c in source.columns]
        missing_keys = [k for k in keys if k not in source.columns]
        if usable and missing_keys:
            print(f"Warning: cannot merge {usable}; source table lacks key columns {missing_keys}.")
            usable = []

        if usable:
            subset = source[keys + usable].copy()
            for col in usable:
                # The regressors need numbers; unparseable cells become NaN -> 0.
                subset[col] = pd.to_numeric(subset[col], errors='coerce')
            per_key = subset.groupby(keys, as_index=False)[usable].agg(agg)
            long_df = long_df.merge(per_key, on=keys, how='left')

        for col in columns:
            if col in long_df.columns:
                long_df[col] = long_df[col].fillna(0)
            else:
                long_df[col] = 0
        return long_df

    def prepare_data(self):
        """Assemble the long-format modeling table.

        The enrollment table is melted so that every ``FORM <n> - <SUBJECT>``
        column becomes rows with ``FORM_NUM``, ``SUBJECT`` and ``ENROLLMENT``.
        Council-level features (tables, labs, re-entry, disability, computers,
        grid electricity share, dropout reasons) are then merged on
        YEAR/REGION/COUNCIL; a feature whose source column is unavailable is 0.

        Returns:
            DataFrame sorted by REGION, COUNCIL, SUBJECT, FORM_NUM, YEAR.

        Raises:
            KeyError: the enrollment table lacks YEAR, REGION or COUNCIL.
            ValueError: the enrollment table has no subject columns.
        """
        enroll = self.standard_cols(self.dh.get_dataframe("Secondary_students_per_subject"))
        tables = self.standard_cols(self.dh.get_dataframe("Data-Secondary Tables and chairs 2016-2025"))
        drops = self.standard_cols(self.dh.get_dataframe("Dropout-Secondary  2017-2024"))
        reentry = self.standard_cols(self.dh.get_dataframe("Secondary-Re_entry"))
        disability = self.standard_cols(self.dh.get_dataframe("Secondary - DISABALITY 2020-2025"))
        ict = self.standard_cols(self.dh.get_dataframe("Combined_Secondary_ICT_All_G_NG"))
        elec = self.standard_cols(self.dh.get_dataframe("Combined_Secondary_Electricity_All_G_NG"))
        labs = self.standard_cols(self.dh.get_dataframe("Combined_Secondary_Laboratories_All_G_NG"))

        keys = self._KEYS
        missing_keys = [k for k in keys if k not in enroll.columns]
        if missing_keys:
            raise KeyError(f"Enrollment table is missing key columns: {missing_keys}")

        # Require a digit after "FORM " so the form number can always be parsed.
        subject_cols = [c for c in enroll.columns if ' - ' in c and re.search(r'FORM \d', c)]
        if not subject_cols:
            raise ValueError("Enrollment table has no 'FORM <n> - <SUBJECT>' columns.")

        long_df = enroll.melt(id_vars=keys, value_vars=subject_cols, var_name='RAW', value_name='ENROLLMENT')
        long_df['FORM_NUM'] = long_df['RAW'].str.extract(r'FORM (\d)', expand=False).astype(int)
        long_df['SUBJECT'] = long_df['RAW'].str.split(' - ').str[1].str.strip()
        long_df = long_df.drop(columns=['RAW'])
        # Missing enrollment cells are counted as zero students.
        long_df['ENROLLMENT'] = long_df['ENROLLMENT'].fillna(0)

        long_df = self._attach_features(long_df, tables, ['AVAILABLE_TABLES'])

        # (source table, feature name, columns summed row-wise into the feature)
        row_totals = [
            (labs, 'TOTAL_LABS', [c for c in labs.columns if 'LABORATORY' in c]),
            (reentry, 'TOTAL_REENTRY', [c for c in reentry.columns if 'RE-ENROLLED' in c]),
            # Only the BLIND and LOW VISION columns are counted here.
            (disability, 'TOTAL_DISABLED', [c for c in disability.columns if c in ('BLIND', 'LOW VISION')]),
            (ict, 'TOTAL_COMPUTERS', [c for c in ict.columns if 'COMPUTERS' in c]),
        ]
        for frame, total_name, parts in row_totals:
            if parts:
                frame[total_name] = frame[parts].sum(axis=1)
                long_df = self._attach_features(long_df, frame, [total_name])
            else:
                long_df[total_name] = 0

        elec_col = 'NATIONAL GRID (TANESCO) %'
        if elec_col in elec.columns:
            elec = elec.rename(columns={elec_col: 'ELEC_GRID_PCT'})
            # A percentage must be averaged, not summed, if a key repeats.
            long_df = self._attach_features(long_df, elec, ['ELEC_GRID_PCT'], agg='mean')
        else:
            long_df['ELEC_GRID_PCT'] = 0

        long_df = self._attach_features(long_df, drops, self._DROPOUT_COLS)

        return long_df.sort_values(['REGION', 'COUNCIL', 'SUBJECT', 'FORM_NUM', 'YEAR'])

    def engineer_features(self, df):
        """Add lag, growth, cohort and election-year features.

        Works on a sorted copy, so the caller's frame is not modified.

        Added columns:
            LAG_1, LAG_2: enrollment of the same region/council/subject/form
                one and two rows earlier in year order.
            YOY_GROWTH: relative change from LAG_2 to LAG_1. It is built only
                from past values; deriving it from the current ENROLLMENT would
                hand the target to the model.
            COHORT_LAG: total enrollment of the previous form in the previous
                year for the same region/council/subject.
            IS_ELECTION_YEAR: 1 for 2020, 2025 and 2030, else 0.

        Returns:
            The enriched frame with every remaining NaN replaced by -1.
        """
        # shift() is positional, so the rows must be in year order per series.
        df = df.sort_values(self._SERIES_KEYS + ['YEAR'])

        history = df.groupby(self._SERIES_KEYS)['ENROLLMENT']
        df['LAG_1'] = history.shift(1)
        df['LAG_2'] = history.shift(2)
        df['YOY_GROWTH'] = (df['LAG_1'] - df['LAG_2']) / (df['LAG_2'] + 1e-5)

        def cohort_key(year, form):
            # String key "<year>_<region>_<council>_<subject>_<form>".
            return (year.astype(str) + '_' + df['REGION'].astype(str) + '_' +
                    df['COUNCIL'].astype(str) + '_' + df['SUBJECT'].astype(str) + '_' +
                    form.astype(str))

        totals = df['ENROLLMENT'].groupby(cohort_key(df['YEAR'], df['FORM_NUM'])).sum()
        df['COHORT_LAG'] = cohort_key(df['YEAR'] - 1, df['FORM_NUM'] - 1).map(totals)

        df['IS_ELECTION_YEAR'] = df['YEAR'].isin([2020, 2025, 2030]).astype(int)

        return df.fillna(-1)

    def calculate_metrics(self, y_true, y_pred, model_name, year_label):
        """Print and return RMSE, MAE, MAPE and ``100 - MAPE`` for one model and year.

        MAPE is computed over the rows whose true value is non-zero; when there
        are none it is NaN and the accuracy is reported as 0.

        Returns:
            dict with keys ``rmse``, ``mae``, ``mape`` and ``accuracy``.
        """
        # Plain arrays keep the masking positional regardless of any index.
        y_true = np.asarray(y_true, dtype=float)
        y_pred = np.asarray(y_pred, dtype=float)

        mae = mean_absolute_error(y_true, y_pred)
        rmse = np.sqrt(mean_squared_error(y_true, y_pred))

        nonzero = y_true != 0
        if nonzero.any():
            mape = np.mean(np.abs((y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero])) * 100
        else:
            mape = np.nan

        accuracy = 100 - mape if not np.isnan(mape) else 0

        print(f" > {model_name} [{year_label}]: RMSE={rmse:,.0f} | MAE={mae:,.0f} | MAPE={mape:.2f}% | Acc={accuracy:.2f}%")
        return {'rmse': rmse, 'mae': mae, 'mape': mape, 'accuracy': accuracy}

    @staticmethod
    def _build_models():
        """Return the name -> unfitted regressor mapping used for the back-test."""
        return {
            "XGBoost": xgb.XGBRegressor(n_estimators=300, max_depth=9, learning_rate=0.05, n_jobs=-1),
            "LightGBM": lgb.LGBMRegressor(n_estimators=500, num_leaves=50, min_child_samples=10, learning_rate=0.1, verbose=-1),
            "RandomForest": RandomForestRegressor(n_estimators=300, max_depth=12, n_jobs=-1, random_state=42),
            "RandomForest 2": RandomForestRegressor(n_estimators=500, max_depth=12, n_jobs=-1, random_state=42),
        }

    def evaluate_years(self, df, train_until=2023, test_years=(2024, 2025)):
        """Train every model on early years and score it on each held-out year.

        Args:
            df: Output of ``engineer_features``; it is not modified.
            train_until: Last year (inclusive) used for training.
            test_years: Years scored one by one; a year without rows is
                reported and skipped.

        Returns:
            dict mapping ``(model_name, year)`` to the metrics dict from
            ``calculate_metrics``. Empty when there is no training data.
        """
        df = df.copy()
        for source_col, encoded_col in (('REGION', 'REGION_ENC'),
                                        ('COUNCIL', 'COUNCIL_ENC'),
                                        ('SUBJECT', 'SUBJECT_ENC')):
            df[encoded_col] = LabelEncoder().fit_transform(df[source_col].astype(str))

        available_feats = [f for f in self._FEATURES if f in df.columns]

        # 1. TRAIN
        print(f"\nTraining Models on Data (Years <= {train_until})...")
        train_df = df[df['YEAR'] <= train_until]
        if train_df.empty:
            print(f"No training data found for years <= {train_until}.")
            return {}
        X_train = train_df[available_feats]
        y_train = train_df['ENROLLMENT']

        models = self._build_models()
        for name, model in models.items():
            print(f" > Training {name}...")
            model.fit(X_train, y_train)

        # 2. TEST LOOP
        results = {}
        for year in test_years:
            print(f"\n--- Results for Year {year} ---")
            test_df = df[df['YEAR'] == year]
            if test_df.empty:
                print(f"No data found for {year}.")
                continue

            X_test = test_df[available_feats]
            y_test = test_df['ENROLLMENT']
            for name, model in models.items():
                print(f" > Testing {name}...")
                preds = model.predict(X_test)
                results[(name, year)] = self.calculate_metrics(y_test, preds, name, str(year))
        return results

class Pipeline:
    """End-to-end driver: load and clean the tables, add geodata, then model.

    One ``DataHandler`` built from ``Config.BASE_DIRECTORY`` is shared by the
    ``GeoProcessor`` and the ``SubjectModeler``.
    """

    def __init__(self):
        self.data_handler = DataHandler(Config.BASE_DIRECTORY)
        shared_handler = self.data_handler
        self.geo_processor = GeoProcessor(shared_handler)
        self.modeler = SubjectModeler(shared_handler)

    def run(self):
        """Run every stage in order: data preparation, geodata, subject modeling."""
        handler = self.data_handler
        handler.load_data()
        handler.clean_data()
        handler.merge_lga_status()
        handler.drop_null_locations()

        self.geo_processor.process_geodata()

        print("\n=== Starting Subject Modeling Pipeline ===")
        modeler = self.modeler
        prepared = modeler.prepare_data()
        with_features = modeler.engineer_features(prepared)
        modeler.evaluate_years(with_features)

# Script entry point: build the pipeline and run every stage.
# `pipeline` is bound at module level, so it remains available after the run.
if __name__ == "__main__":
    pipeline = Pipeline()
    pipeline.run()

Loaded Dropout-Secondary  2017-2024.
Loaded Data-Secondary Tables and chairs 2016-2025.
Loaded Secondary-Re_entry.
Loaded Secondary - DISABALITY 2020-2025.
Loaded LGAs Urban and Rural Status.
Loaded Combined_Secondary_Laboratories_Govt.
Loaded Combined_Secondary_Laboratories_All_G_NG.
Loaded Combined_Secondary_ICT_All_G_NG.
Loaded Combined_Secondary_ICT_Govt.
Loaded Combined_Secondary_Electricity_All_G_NG.
Loaded Combined_Secondary_Electricity_Govt.
Loaded Secondary_students_per_subject.
Loading geodata from /content/drive/MyDrive/MOEST/tanzania_council_geodata.csv

=== Starting Subject Modeling Pipeline ===

Training Models on Data (Years <= 2023)...
 > Training XGBoost...
 > Training LightGBM...
 > Training RandomForest...
