# Ethical Model
> Removing Unethical Features, whilst maintaining predictive power and robustness.

## Preliminary Steps (Train-test splitting, Feature Engineering)
* Same train-test split (80/20, random state 23) as advanced model.
* Same feature engineering functions as advanced model.

In [51]:
# import all modules/libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler

import statsmodels.api as sm
import statsmodels.tools
import joblib
from sklearn import metrics

In [6]:
# Load the life-expectancy dataset from CSV into a DataFrame (nothing is written to disk here)
WHO = pd.read_csv("Life Expectancy Data.csv")

In [7]:
# Prepare for the train-test split: the target (y) is life expectancy,
# and every remaining column is a candidate feature (X).
target_col = 'Life_expectancy'

feature_cols = WHO.columns.tolist()
feature_cols.remove(target_col)

y = WHO[target_col]
X = WHO[feature_cols]

In [8]:
# Display the feature column names (the target column has already been removed)
feature_cols

['Country',
 'Region',
 'Year',
 'Infant_deaths',
 'Under_five_deaths',
 'Adult_mortality',
 'Alcohol_consumption',
 'Hepatitis_B',
 'Measles',
 'BMI',
 'Polio',
 'Diphtheria',
 'Incidents_HIV',
 'GDP_per_capita',
 'Population_mln',
 'Thinness_ten_nineteen_years',
 'Thinness_five_nine_years',
 'Schooling',
 'Economy_status_Developed',
 'Economy_status_Developing']

In [9]:
# TRAIN-TEST SPLIT
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
split = train_test_split(X, y, test_size=0.2, random_state=23)
X_train, X_test, y_train, y_test = split

In [102]:
# FEATURE ENGINEERING FUNCTION

def feature_eng(train_df, test_df, save_metadata=True, include_regions=False):
    """
    One-hot encode ``Region``, standardise the numeric features and add a constant.

    Training mode (``save_metadata=True``) one-hot encodes and scales
    ``train_df``, fitting a ``StandardScaler`` on it, and saves the scaler and
    the resulting column layout with joblib (files ``'scaler'`` and
    ``'feature_columns'``). Prediction mode (``save_metadata=False``) loads
    those two artefacts instead and leaves ``train_df`` unencoded and unscaled.

    In both modes ``test_df`` is encoded, aligned to the training column
    layout and scaled with the training scaler, so the returned test frame has
    the same columns as a processed training frame. The scaler is never fitted
    on ``test_df``.

    Args:
        train_df (pd.DataFrame): Training dataset.
        test_df (pd.DataFrame): Test dataset or user input. In training mode a
            placeholder (empty frame, or a frame without a ``Region`` column)
            is accepted and passed through with only the constant added.
        save_metadata (bool): Whether to fit and save the scaler and feature
            columns. Only set to True during training.
        include_regions (bool): Currently unused by this function.

    Returns:
        train_df (pd.DataFrame): Training dataset with a ``const`` column
            added (and encoded/scaled when ``save_metadata`` is True).
        test_df (pd.DataFrame): Processed test dataset with a ``const`` column.
    """
    train_df = train_df.copy()  # Never mutate the caller's frames
    test_df = test_df.copy()

    # Numeric columns that are standardised
    scale_columns = ['Year', 'Infant_deaths', 'Under_five_deaths', 'Adult_mortality',
                     'Alcohol_consumption', 'Hepatitis_B', 'Measles', 'BMI',
                     'Polio', 'Diphtheria', 'Incidents_HIV', 'GDP_per_capita',
                     'Population_mln', 'Thinness_ten_nineteen_years',
                     'Thinness_five_nine_years', 'Schooling']

    if save_metadata:
        # Training phase: encode, fit the scaler on the training data only,
        # and remember the final column layout.
        train_df = pd.get_dummies(train_df, columns=['Region'], drop_first=True, prefix='Region', dtype=int)
        scaler = StandardScaler()
        train_df[scale_columns] = scaler.fit_transform(train_df[scale_columns])
        feature_columns = train_df.columns

        # Save scaler and feature columns for later prediction-mode calls
        joblib.dump(scaler, 'scaler')
        joblib.dump(feature_columns, 'feature_columns')
    else:
        # Prediction phase: reuse the scaler and column layout saved during training
        scaler = joblib.load('scaler')
        feature_columns = joblib.load('feature_columns')

    # In training mode the test frame used to be passed through untouched, so
    # callers may hand in a placeholder; only process frames that actually
    # carry data and the Region column. Prediction mode always processes.
    has_test_data = not test_df.empty and 'Region' in test_df.columns
    if not save_metadata or has_test_data:
        # Encode every region, then reindex to the training layout: this drops
        # the reference category (and any unseen ones) and fills dummies that
        # are absent from this frame with 0.
        test_df = pd.get_dummies(test_df, columns=['Region'], drop_first=False, prefix='Region', dtype=int)
        test_df = test_df.reindex(columns=feature_columns, fill_value=0)

        # Scale with the training scaler (transform only, no refit)
        common_columns = [col for col in scale_columns if col in test_df.columns]
        test_df[common_columns] = scaler.transform(test_df[common_columns])

    # Add the intercept column expected by statsmodels OLS
    train_df = sm.add_constant(train_df, has_constant='add')
    test_df = sm.add_constant(test_df, has_constant='add')

    return train_df, test_df


In [104]:
# Feature-engineer the training split in training mode (fits and saves the scaler).
# save_metadata=True is passed explicitly: feature_eng is redefined further down
# with save_metadata=False as its default, so re-running this cell after that one
# would otherwise silently skip fitting. The X_test_fe returned here is not used
# for evaluation; it is rebuilt in prediction mode before the test metrics.
X_train_fe, X_test_fe = feature_eng(X_train, X_test, save_metadata=True)

In [106]:
# Display the feature-engineered training data (scaled numeric columns, Region dummies, const)
X_train_fe

Unnamed: 0,const,Country,Year,Infant_deaths,Under_five_deaths,Adult_mortality,Alcohol_consumption,Hepatitis_B,Measles,BMI,...,Economy_status_Developed,Economy_status_Developing,Region_Asia,Region_Central America and Caribbean,Region_European Union,Region_Middle East,Region_North America,Region_Oceania,Region_Rest of Europe,Region_South America
2676,1.0,Singapore,0.952270,-1.028415,-0.907783,-1.196121,-0.723955,0.799286,0.950472,-0.644257,...,0,1,1,0,0,0,0,0,0,0
369,1.0,"Yemen, Rep.",0.515765,0.541674,0.352960,0.370161,-1.200723,-0.529570,0.950472,-0.780855,...,0,1,0,0,0,1,0,0,0,0
466,1.0,Austria,-1.012001,-0.952267,-0.853751,-0.913923,1.905876,-0.086618,-1.656268,0.038736,...,1,0,0,0,1,0,0,0,0,0
1739,1.0,Lesotho,-1.448505,1.448192,1.478623,3.597649,-0.493179,-0.339734,-1.656268,-0.416593,...,0,1,0,0,0,0,0,0,0,0
649,1.0,South Africa,-0.793748,0.683091,0.778461,2.690922,0.609981,-0.529570,-1.390274,0.676195,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1512,1.0,Cambodia,0.952270,0.063033,-0.122070,-0.052038,-0.576867,0.103218,-1.922262,-1.463847,...,0,1,1,0,0,0,0,0,0,0
1993,1.0,Norway,-0.138992,-0.999406,-0.887521,-1.093126,0.457821,0.229776,0.844075,0.539597,...,1,0,0,0,0,0,0,0,1,0
1064,1.0,Czechia,-1.012001,-0.966771,-0.862756,-0.630140,2.301492,0.482891,1.110069,0.721728,...,1,0,0,0,1,0,0,0,0,0
742,1.0,Hungary,0.079261,-0.908754,-0.824484,-0.225231,1.763860,0.229776,1.163268,0.721728,...,1,0,0,0,1,0,0,0,0,0


## Which Features pose Ethical Concerns?
When considering features to drop, we chose to drop:
* `BMI` - uses medical records of existing population
* `Region_`- dummies can introduce bias by grouping countries

We are aware that there are other features derived from medical records, for example infant deaths and adult mortality. After careful consideration, we have chosen to keep these variables because, according to the latest GDPR and UK Data Protection legislation, data privacy does not extend to deceased data subjects. In addition, this is aggregate data, with no specific identification that can be traced back to these individuals. These features were deemed too significant to exclude. Statistics that use data on children are used under the assumption that there is parental consent, which is required under current legislation. References to the relevant legislation are included at the end of the workbook.

Therefore, the features used are:
* `Year`
* `Infant_deaths`
* `Under_five_deaths`
* `Adult_mortality`
* `Incidents_HIV`
* `GDP_per_capita`
* `Schooling`
* `Economy_status_Developed`

In [109]:
# Columns excluded from the ethical model: the columns the advanced model already
# left out, plus BMI, which is dropped on ethical grounds.
cols_to_remove = {'Country', 'Alcohol_consumption', 'Economy_status_Developing',
                  'Polio', 'Diphtheria', 'Population_mln', 'Thinness_five_nine_years',
                  'Thinness_ten_nineteen_years', 'Measles', 'Hepatitis_B', 'BMI'}

# Every Region dummy is excluded as well. They are collected by prefix rather than
# hard-coded, so no Region_ column can slip into the model if a different split or
# dataset produces a different set of dummy columns.
cols_to_remove |= {col for col in X_train_fe.columns if col.startswith('Region_')}

# Keep the remaining columns of X_train_fe, in their original order
feature_cols = [col for col in X_train_fe.columns if col not in cols_to_remove]

## Modelling and Testing
* Summary
    * `R-squared` = 0.978
    * `Condition Number` = 18.4
    * `P_value` under 0.05 for all features except `Incidents_HIV` (p = 0.126)

In [111]:
# Fit an ordinary least squares regression on the selected features,
# persist the fitted results to 'ethical_model', and show the summary table.
design_matrix = X_train_fe[feature_cols]
lin_reg = sm.OLS(endog=y_train, exog=design_matrix)
results = lin_reg.fit()
results.save('ethical_model')
results.summary()

0,1,2,3
Dep. Variable:,Life_expectancy,R-squared:,0.978
Model:,OLS,Adj. R-squared:,0.978
Method:,Least Squares,F-statistic:,12730.0
Date:,"Mon, 09 Dec 2024",Prob (F-statistic):,0.0
Time:,11:11:26,Log-Likelihood:,-4006.2
No. Observations:,2291,AIC:,8030.0
Df Residuals:,2282,BIC:,8082.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,68.5824,0.036,1880.114,0.000,68.511,68.654
Year,0.0669,0.030,2.242,0.025,0.008,0.125
Infant_deaths,-1.5364,0.189,-8.125,0.000,-1.907,-1.166
Under_five_deaths,-2.2213,0.191,-11.658,0.000,-2.595,-1.848
Adult_mortality,-5.2757,0.078,-67.633,0.000,-5.429,-5.123
Incidents_HIV,0.0733,0.048,1.531,0.126,-0.021,0.167
GDP_per_capita,0.4654,0.044,10.583,0.000,0.379,0.552
Schooling,0.3382,0.055,6.157,0.000,0.230,0.446
Economy_status_Developed,1.1793,0.106,11.099,0.000,0.971,1.388

0,1,2,3
Omnibus:,13.98,Durbin-Watson:,2.044
Prob(Omnibus):,0.001,Jarque-Bera (JB):,18.729
Skew:,0.06,Prob(JB):,8.57e-05
Kurtosis:,3.427,Cond. No.,18.4


## Testing
* On average, our ethical model is off by **1.36 years**
* This is only around **1.63%**

In [None]:
## Model performance on the training data: RMSE and MAPE.

train_design = X_train_fe[feature_cols]
y_pred = results.predict(train_design)

# RMSE is in the target's units; MAPE is a fraction (multiplied by 100 when printed)
mape = metrics.mean_absolute_percentage_error(y_train, y_pred)
rmse = statsmodels.tools.eval_measures.rmse(y_train, y_pred)

In [None]:
# Report the most recently computed RMSE, and the MAPE as a percentage
print(rmse)
print(f"{mape*100}%")

In [None]:
def feature_eng(train_df, test_df, save_metadata=False, include_regions=False):
    """
    One-hot encode ``Region``, standardise the numeric features and add a constant.

    This definition defaults to prediction mode (``save_metadata=False``):
    the scaler and column layout saved during training are loaded with joblib
    (files ``'scaler'`` and ``'feature_columns'``) and ``train_df`` is left
    unencoded and unscaled. Training mode (``save_metadata=True``) instead
    one-hot encodes and scales ``train_df``, fitting a ``StandardScaler`` on
    it, and saves both artefacts.

    In both modes ``test_df`` is encoded, aligned to the training column
    layout and scaled with the training scaler, so the returned test frame has
    the same columns as a processed training frame. The scaler is never fitted
    on ``test_df``.

    Args:
        train_df (pd.DataFrame): Training dataset.
        test_df (pd.DataFrame): Test dataset or user input. In training mode a
            placeholder (empty frame, or a frame without a ``Region`` column)
            is accepted and passed through with only the constant added.
        save_metadata (bool): Whether to fit and save the scaler and feature
            columns. Only set to True during training.
        include_regions (bool): Currently unused by this function.

    Returns:
        train_df (pd.DataFrame): Training dataset with a ``const`` column
            added (and encoded/scaled when ``save_metadata`` is True).
        test_df (pd.DataFrame): Processed test dataset with a ``const`` column.
    """
    train_df = train_df.copy()  # Never mutate the caller's frames
    test_df = test_df.copy()

    # Numeric columns that are standardised
    scale_columns = ['Year', 'Infant_deaths', 'Under_five_deaths', 'Adult_mortality',
                     'Alcohol_consumption', 'Hepatitis_B', 'Measles', 'BMI',
                     'Polio', 'Diphtheria', 'Incidents_HIV', 'GDP_per_capita',
                     'Population_mln', 'Thinness_ten_nineteen_years',
                     'Thinness_five_nine_years', 'Schooling']

    if save_metadata:
        # Training phase: encode, fit the scaler on the training data only,
        # and remember the final column layout.
        train_df = pd.get_dummies(train_df, columns=['Region'], drop_first=True, prefix='Region', dtype=int)
        scaler = StandardScaler()
        train_df[scale_columns] = scaler.fit_transform(train_df[scale_columns])
        feature_columns = train_df.columns

        # Save scaler and feature columns for later prediction-mode calls
        joblib.dump(scaler, 'scaler')
        joblib.dump(feature_columns, 'feature_columns')
    else:
        # Prediction phase: reuse the scaler and column layout saved during training
        scaler = joblib.load('scaler')
        feature_columns = joblib.load('feature_columns')

    # In training mode the test frame used to be passed through untouched, so
    # callers may hand in a placeholder; only process frames that actually
    # carry data and the Region column. Prediction mode always processes.
    has_test_data = not test_df.empty and 'Region' in test_df.columns
    if not save_metadata or has_test_data:
        # Encode every region, then reindex to the training layout: this drops
        # the reference category (and any unseen ones) and fills dummies that
        # are absent from this frame with 0.
        test_df = pd.get_dummies(test_df, columns=['Region'], drop_first=False, prefix='Region', dtype=int)
        test_df = test_df.reindex(columns=feature_columns, fill_value=0)

        # Scale with the training scaler (transform only, no refit)
        common_columns = [col for col in scale_columns if col in test_df.columns]
        test_df[common_columns] = scaler.transform(test_df[common_columns])

    # Add the intercept column expected by statsmodels OLS
    train_df = sm.add_constant(train_df, has_constant='add')
    test_df = sm.add_constant(test_df, has_constant='add')

    return train_df, test_df


In [None]:
# Feature-engineer the test split in prediction mode and calculate RMSE and MAPE on it.
# save_metadata=False is passed explicitly so the scaler and column layout saved
# during training are loaded (never refitted on the test split), regardless of
# which feature_eng definition is currently bound in the notebook session.
# Note: rmse and mape below overwrite the training-set values computed earlier.
_, X_test_fe = feature_eng(X_train, X_test, save_metadata=False)
y_test_pred = results.predict(X_test_fe[feature_cols])
rmse = statsmodels.tools.eval_measures.rmse(y_test, y_test_pred)
mape = metrics.mean_absolute_percentage_error(y_test, y_test_pred)

In [None]:
# Report the most recently computed RMSE, and the MAPE as a percentage
print(rmse)
print(f"{mape*100}%")

## REFERENCES
* [Data protection under GDPR](https://europa.eu/youreurope/business/dealing-with-customers/data-protection/data-protection-gdpr/index_en.htm)
* [Data Protection Act 2018](https://www.legislation.gov.uk/ukpga/2018/12/contents/enacted)
* [Does GDPR Apply To Data Relating To A Deceased Person?](https://www.nelsonslaw.co.uk/data-deceased-person/#:~:text=Information%20relating%20to%20a%20deceased%20person%20does%20not,usual%20method%20of%20making%20a%20Subject%20Access%20Request.)
* [What should our general approach to processing children’s personal data be?](https://ico.org.uk/for-organisations/uk-gdpr-guidance-and-resources/childrens-information/children-and-the-uk-gdpr/what-should-our-general-approach-to-processing-children-s-personal-data-be/)