In [22]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
import joblib

In [24]:
# Previously fitted models, loaded from disk (they must be trained and saved before this cell runs).
# NOTE: sm.load restores pickled objects, so only point these paths at files you created yourself.
basic_model_path = "ethical_model"      # Insert file path
advanced_model_path = "advanced_model"  # Insert file path

basic_model = sm.load(basic_model_path)
advanced_model = sm.load(advanced_model_path)

In [26]:
# Columns dropped from the engineered frame before prediction, per model.
# The advanced model keeps BMI and the one-hot region columns.
columns_to_remove_advanced = set([
    'Country',
    'Alcohol_consumption',
    'Economy_status_Developing',
    'Polio',
    'Diphtheria',
    'Population_mln',
    'Thinness_five_nine_years',
    'Thinness_ten_nineteen_years',
    'Measles',
    'Hepatitis_B',
])

# The basic model drops everything the advanced model drops, plus BMI and every region indicator.
columns_to_remove_basic = columns_to_remove_advanced | set(
    ['BMI']
    + ['Region_' + region_name for region_name in (
        'Middle East',
        'North America',
        'Rest of Europe',
        'Asia',
        'Central America and Caribbean',
        'European Union',
        'Oceania',
        'South America',
    )]
)

In [28]:
# Feature engineering function
def feature_eng(train_df, test_df, save_metadata=False, include_regions=True):
    """
    Encode, align and scale the datasets, using joblib to save or reload the fitted metadata.

    In training mode (``save_metadata=True``) the ``Region`` column of ``train_df`` is one-hot
    encoded (first level dropped), a ``StandardScaler`` is fitted on the numeric columns, and
    the scaler plus the resulting column index are written to the files ``scaler`` and
    ``feature_columns``. In prediction mode (``save_metadata=False``) those two files are
    loaded instead and ``train_df`` is left unscaled (it may be an empty placeholder).

    In both modes ``test_df`` is one-hot encoded (or has ``Region`` dropped), reindexed to the
    training columns (missing columns are filled with 0, unknown columns are discarded) and
    scaled with the fitted scaler. Finally a constant column is added to both frames with
    ``sm.add_constant``.

    Args:
        train_df (pd.DataFrame): Training dataset. Must contain ``Region`` and every scaled
            column when ``save_metadata`` is True.
        test_df (pd.DataFrame): Test dataset or user input.
        save_metadata (bool): Whether to fit and save the scaler and feature columns.
            Only set to True during training.
        include_regions (bool): If True and ``test_df`` has a ``Region`` column, one-hot encode
            it; otherwise drop ``Region`` so every region indicator ends up as 0.

    Returns:
        train_df (pd.DataFrame): Training dataset (encoded and scaled only in training mode).
        test_df (pd.DataFrame): Processed test dataset aligned to the training columns.
    """
    train_df = train_df.copy()  # Work on copies so the caller's frames are never modified
    test_df = test_df.copy()

    # Numeric columns that are standardised
    scale_columns = ['Year', 'Infant_deaths', 'Under_five_deaths', 'Adult_mortality',
                     'Alcohol_consumption', 'Hepatitis_B', 'Measles', 'BMI',
                     'Polio', 'Diphtheria', 'Incidents_HIV', 'GDP_per_capita',
                     'Population_mln', 'Thinness_ten_nineteen_years',
                     'Thinness_five_nine_years', 'Schooling']

    if save_metadata:
        # Training phase: fit the scaler on the training data and record the column layout
        train_df = pd.get_dummies(train_df, columns=['Region'], drop_first=True, prefix='Region', dtype=int)
        scaler = StandardScaler()
        train_df[scale_columns] = scaler.fit_transform(train_df[scale_columns])
        feature_columns = train_df.columns

        # Save scaler and feature columns for later prediction runs
        joblib.dump(scaler, 'scaler')
        joblib.dump(feature_columns, 'feature_columns')
    else:
        # Prediction phase: reuse the scaler and column layout saved during training.
        # NOTE: joblib.load unpickles the files, so only load metadata you created yourself.
        scaler = joblib.load("scaler")                   # Insert file path
        feature_columns = joblib.load("feature_columns") # Insert file path

    # The test data gets the same treatment in both phases, so a training-time call also
    # returns a processed test set (previously it came back unencoded and unscaled).
    if include_regions and 'Region' in test_df.columns:
        # drop_first=False here: the baseline level is removed by the reindex below instead,
        # which keeps single-row inputs (only one region present) correct.
        test_df = pd.get_dummies(test_df, columns=['Region'], drop_first=False, prefix='Region', dtype=int)
    else:
        test_df = test_df.drop(columns=['Region'], errors='ignore')

    # Align test_df with the training columns before scaling
    test_df = test_df.reindex(columns=feature_columns, fill_value=0)
    common_columns = [col for col in scale_columns if col in test_df.columns]
    if len(test_df) > 0:
        # The scaler rejects zero-row input, so an empty frame is passed through aligned but unscaled
        test_df[common_columns] = scaler.transform(test_df[common_columns])

    # Add the constant (intercept) column expected by the statsmodels models
    train_df = sm.add_constant(train_df, has_constant='add')
    test_df = sm.add_constant(test_df, has_constant='add')

    return train_df, test_df

In [30]:
# Feature names for basic and advanced models, in the order the user is prompted for them.
# Names must match the training column names exactly (case-sensitive): feature_eng reindexes
# the user input to the saved training columns, so a miscased name is silently replaced by 0.
basic_features = ['Year', 'Infant_deaths', 'Under_five_deaths', 'Adult_mortality',
                  'Incidents_HIV', 'GDP_per_capita', 'Schooling', 'Economy_status_Developed']
advanced_features = ['Year', 'Infant_deaths', 'Under_five_deaths', 'Adult_mortality', 'BMI', 'Incidents_HIV',
                     'GDP_per_capita', 'Schooling', 'Economy_status_Developed',
                     'Region_Asia', 'Region_Central America and Caribbean',
                     'Region_European Union', 'Region_Middle East', 'Region_North America',
                     'Region_Oceania', 'Region_Rest of Europe', 'Region_South America']

In [None]:
def _prompt_region(regions):
    """Ask for a region until the answer exactly matches one of `regions`; return it."""
    while True:
        print(f"Available regions: {', '.join(regions)}")
        region_value = input("Select your region: ").strip()
        if region_value in regions:
            return region_value
        print(f"Invalid region (check spelling/capitalisation). Please choose from: {', '.join(regions)}")


def _prompt_numeric(feature, feature_ranges):
    """Ask for `feature` until the answer is a finite number that passes its validation; return it."""
    while True:
        try:
            value = float(input(f"Enter {feature}: "))
        except ValueError:
            print(f"Invalid input for {feature}. Please enter a valid number.")
            continue

        # float() happily parses "nan" and "inf"; neither is a usable model input
        if not np.isfinite(value):
            print(f"Invalid input for {feature}. Please enter a valid number.")
            continue

        if feature in feature_ranges:
            min_value, max_value = feature_ranges[feature]
            if not (min_value <= value <= max_value):
                print(f"Invalid input for {feature}. Please enter a value between {min_value} and {max_value}.")
                continue
        elif feature.lower() == 'economy_status_developed':
            # Case-insensitive so the flag is validated whichever spelling the feature list uses
            if value not in (0, 1):
                print(f"Invalid input for {feature}. Please enter 0 or 1.")
                continue

        return value


def predict_life_expectancy():
    """
    Interactively collect inputs from the user and print a life-expectancy prediction.

    The user first chooses between the advanced model (extra features, including region) and
    the basic model. The chosen model's features are then prompted for one by one, validated,
    passed through `feature_eng` in prediction mode, and fed to the model.

    Relies on the module-level `basic_model`, `advanced_model`, `basic_features`,
    `advanced_features`, `columns_to_remove_basic`, `columns_to_remove_advanced` and
    `feature_eng`. Returns None; the result, or any error that occurred, is printed.
    """
    try:
        # Step 1: Ask for consent to use advanced data
        consent = input("Do you consent to using advanced population data, which may include protected information, for better accuracy? (Y/N): ").strip().upper()

        if consent not in ['Y', 'N']:
            raise ValueError("Invalid input. Please enter 'Y' or 'N'.")

        # Step 2: Select the model and features
        if consent == 'Y':
            print("Using the advanced model for prediction.")
            model = advanced_model
            features = advanced_features
            cols_to_remove = columns_to_remove_advanced
            include_regions = True
        else:
            print("Using the basic model for prediction.")
            model = basic_model
            features = basic_features
            cols_to_remove = columns_to_remove_basic
            include_regions = False

        # Step 3: Prompt for input data
        print("Please provide the following inputs:")
        feature_inputs = {}

        # Region names are recovered from the one-hot feature names of the selected model
        regions = [feature[len("Region_"):] for feature in features if feature.startswith("Region_")]

        # If the model uses region-based features, prompt the user once for a single region
        if regions:
            feature_inputs['Region'] = _prompt_region(regions)

        # Allowed (min, max) range per numeric feature, both ends inclusive
        feature_ranges = {
            'Year': (2000, float('inf')),
            'Infant_deaths': (0, 1000),
            'Under_five_deaths': (0, 1000),
            'Adult_mortality': (0, 1000),
            'BMI': (0, 100),
            'Incidents_HIV': (0, 1000),
            'GDP_per_capita': (0, float('inf')),
            'Thinness_ten_nineteen_years': (0, 100),
            'Schooling': (0, 100),
        }

        for feature in features:
            # Skip one-hot region features and Region since it's already been handled
            if feature.startswith("Region_") or feature == 'Region':
                continue
            feature_inputs[feature] = _prompt_numeric(feature, feature_ranges)

        # Step 4: Feature engineering
        input_df = pd.DataFrame([feature_inputs])  # Single-row frame built from the user input
        train_df = pd.DataFrame(columns=features)  # Empty placeholder; prediction mode does not fit on it
        _, test_df = feature_eng(train_df, input_df, include_regions=include_regions)

        # Step 5: Predict life expectancy
        # Remove unwanted columns depending on the model
        feature_cols = [col for col in test_df.columns if col not in cols_to_remove]

        prediction = model.predict(test_df[feature_cols])
        # Take the first value positionally so a Series or an ndarray result both work
        predicted_value = np.asarray(prediction).ravel()[0]
        print(f"Predicted life expectancy: {predicted_value:.2f} years")

    except Exception as e:
        # Deliberate best-effort: report the problem to the interactive user instead of raising
        print(f"An error occurred: {e}")

# Example usage (after training models and scalers on the dataset).
# Running this cell starts the interactive prompts (consent, then each feature) and prints
# either the predicted life expectancy or an error message.
predict_life_expectancy()

- region: Asia
- year: 2007
- inf_deaths: 51.5
- u5_deaths: 67.9
- ad_mort: 201.0765
- BMI: 21.2
- HIV: 0.13
- GDP: 1076
- schooling: 5.0
- econ_status_developed: 0
- life_exp: 65.4