In [101]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
 
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [102]:
def load_and_clean_data(file):
    df = pd.read_csv(file)
   
   
    df = df.dropna(subset=["Life_expectancy"])
 
 
    df = df.drop(columns=["Country", "Region", "Year", 'Economy_status_Developed', 'Economy_status_Developing', 'Measles'], errors='ignore')
 
    for col in df.select_dtypes(include=[np.number]):
        df[col] = df[col].fillna(df[col].median())
 
    return df

In [103]:
def split_scale(data, fulldata):
    scaler = StandardScaler()
    data_scaled = scaler.fit_transform(data.select_dtypes(include=[np.number]))

    df = pd.DataFrame(data_scaled, columns=data.select_dtypes(include=[np.number]).columns)

    X = df.drop(columns=['Life_expectancy'])
    y = df['Life_expectancy']
 
    if fulldata:
        return X, y
    else:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        return X_train, X_test, y_train, y_test

In [104]:
def feature_engineeringV2(X):

    '''
    This function inputs a dataframe df and returns a feature engineered version of the dataframe.
    It performs the following steps:
    
    1. Drops the columns 'Country', 'Region', 'Life_expectancy', 'Economy_status_Developed', 'Economy_status_Developing', and 'Measles'.
    2. Creates a dataframe X and series y for feature engineering.
    3. Removes highly correlated features with a correlation greater than 0.9.
    4. Scales the features using StandardScaler.
    5. Calculates the Variance Inflation Factor (VIF) for each feature and drops features with a VIF greater than 10.
    6. Returns the feature engineered version of X and y.
    '''
    
    transform_cols = ['GDP_per_capita', 'Incidents_HIV']  # Columns to be transformed
    X[transform_cols] = np.log1p(X[transform_cols])  

    corr_matrix = X.corr().abs() # Creates a correlation matrix for all the values in the X dataframe, making sure its absolute values (this prevents a division by zero error).
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k = 1).astype(bool)) # Takes the upper triangle of the correlation matrix, k = 1, is the upper diagonal, and changes the type to boolean.
    to_drop = [column for column in upper.columns if any(upper[column] > 0.9)] # Takes all the columns with correlation greater than 0.9 in one list.
    X = X.drop(columns=to_drop) # Drops all of the columns with the correlations > 0.9 - this is the feature selection step.


    vif_data = pd.DataFrame()
    vif_data["feature"] = X.columns # Adds a column called feature and fills it with the columns from X.
    vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])] # Iterates through and calculates the VIF for each feature.
    X = X.drop(columns = vif_data[vif_data["VIF"] > 10]["feature"]) # Drops the columns with a VIF value > 10, as this suggests high multicollinearity.

    print(X.columns)

    # Returns the feature engineered version of X
    return X


In [105]:
def feature_engineeringV2_sensitive(X):

    '''
    This function inputs a dataframe df and returns a feature engineered version of the dataframe.
    It performs the following steps:
    
    1. Drops the columns 'Country', 'Region', 'Life_expectancy', 'Economy_status_Developed', 'Economy_status_Developing', and 'Measles'.
    2. Creates a dataframe X and series y for feature engineering.
    3. Removes highly correlated features with a correlation greater than 0.9.
    4. Scales the features using StandardScaler.
    5. Calculates the Variance Inflation Factor (VIF) for each feature and drops features with a VIF greater than 10.
    6. Returns the feature engineered version of X and y.
    '''

    transform_cols = ['GDP_per_capita', 'Incidents_HIV']  # Columns to be transformed
    X[transform_cols] = np.log1p(X[transform_cols])  

    # Drop sensitive columns
    X = X.drop(columns=['Infant_deaths', 'Under_five_deaths', 'Adult_mortality', 'Hepatitis_B', 'Polio', 'Diphtheria', 'Incidents_HIV']) # Drop sensitive columns that may not be appropriate for all audiences.

    corr_matrix = X.corr().abs() # Creates a correlation matrix for all the values in the X dataframe, making sure its absolute values (this prevents a division by zero error).
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k = 1).astype(bool)) # Takes the upper triangle of the correlation matrix, k = 1, is the upper diagonal, and changes the type to boolean.
    to_drop = [column for column in upper.columns if any(upper[column] > 0.9)] # Takes all the columns with correlation greater than 0.9 in one list.
    X = X.drop(columns=to_drop) # Drops all of the columns with the correlations > 0.9 - this is the feature selection step.


    vif_data = pd.DataFrame()
    vif_data["feature"] = X.columns # Adds a column called feature and fills it with the columns from X.
    vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])] # Iterates through and calculates the VIF for each feature.
    X = X.drop(columns = vif_data[vif_data["VIF"] > 10]["feature"]) # Drops the columns with a VIF value > 10, as this suggests high multicollinearity.

    print(X.columns)
    # Returns the feature engineered version of X
    return X

In [106]:
def train_model(df, sensitive, fulldata):
    
    if fulldata:
        X, y = split_scale(df, fulldata)
        if sensitive:
            X_fe = feature_engineeringV2_sensitive(X)
        else:
            X_fe = feature_engineeringV2(X)
        
        X_fe_const = sm.add_constant(X_fe)

        X_fe_const = X_fe_const.reset_index(drop=True)
        y = y.reset_index(drop=True)

        model = sm.OLS(y, X_fe_const).fit()
        print(model.summary())
        y_pred = model.predict(X_fe_const)
        rmse = np.sqrt(mean_squared_error(y, y_pred))
        condition_number = np.linalg.cond(X_fe_const)
    else:
        X_train, X_test, y_train, y_test = split_scale(df, fulldata)

        if sensitive:
            X_train_fe = feature_engineeringV2_sensitive(X_train)
            X_test_fe = feature_engineeringV2_sensitive(X_test)
        else:
            X_train_fe = feature_engineeringV2(X_train)
            X_test_fe = feature_engineeringV2(X_test)
    
        X_train_fe_const = sm.add_constant(X_train_fe)
        X_test_fe_const = sm.add_constant(X_test_fe)

        X_train_fe_const = X_train_fe_const.reset_index(drop=True)
        y_train = y_train.reset_index(drop=True)

        X_test_fe_const = X_test_fe_const.reset_index(drop=True)
        y_test = y_test.reset_index(drop=True)

    

        model = sm.OLS(y_train, X_train_fe_const).fit()
    
        print(model.summary())
    
        y_pred = model.predict(X_test_fe_const)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        condition_number = np.linalg.cond(X_train_fe_const)
 
    print(f"\nTest RMSE: {rmse}")
    print(f"Condition Number: {condition_number}")
    
    return model

In [107]:
def full_pipeline(file, sensitive, fulldata):
    '''
    This function orchestrates the full pipeline of loading, cleaning, feature engineering, and training a model.
    '''
    data = load_and_clean_data(file)

    model = train_model(data, sensitive, fulldata)
    return model

In [108]:
model_sensitive = full_pipeline('Life Expectancy Data.csv', True, fulldata=False)

Index(['Alcohol_consumption', 'BMI', 'GDP_per_capita', 'Population_mln',
       'Thinness_ten_nineteen_years', 'Schooling'],
      dtype='object')
Index(['Alcohol_consumption', 'BMI', 'GDP_per_capita', 'Population_mln',
       'Thinness_ten_nineteen_years', 'Schooling'],
      dtype='object')
                            OLS Regression Results                            
Dep. Variable:        Life_expectancy   R-squared:                       0.664
Model:                            OLS   Adj. R-squared:                  0.663
Method:                 Least Squares   F-statistic:                     751.2
Date:                Mon, 14 Jul 2025   Prob (F-statistic):               0.00
Time:                        16:28:35   Log-Likelihood:                -2019.8
No. Observations:                2291   AIC:                             4054.
Df Residuals:                    2284   BIC:                             4094.
Df Model:                           6                                     

In [109]:
model_non_sensitive = full_pipeline('Life Expectancy Data.csv', False, fulldata=False)

Index(['Infant_deaths', 'Adult_mortality', 'Alcohol_consumption',
       'Hepatitis_B', 'BMI', 'Polio', 'Incidents_HIV', 'GDP_per_capita',
       'Population_mln', 'Thinness_ten_nineteen_years', 'Schooling'],
      dtype='object')
Index(['Infant_deaths', 'Adult_mortality', 'Alcohol_consumption',
       'Hepatitis_B', 'BMI', 'Polio', 'Incidents_HIV', 'GDP_per_capita',
       'Population_mln', 'Thinness_ten_nineteen_years', 'Schooling'],
      dtype='object')
                            OLS Regression Results                            
Dep. Variable:        Life_expectancy   R-squared:                       0.978
Model:                            OLS   Adj. R-squared:                  0.978
Method:                 Least Squares   F-statistic:                     9365.
Date:                Mon, 14 Jul 2025   Prob (F-statistic):               0.00
Time:                        16:28:36   Log-Likelihood:                 1122.6
No. Observations:                2291   AIC:                    

# Now training Model on entire dataset to feed into function

In [110]:
model_sensitive_full = full_pipeline('Life Expectancy Data.csv', True, fulldata=True)

Index(['Alcohol_consumption', 'BMI', 'GDP_per_capita', 'Population_mln',
       'Thinness_ten_nineteen_years', 'Schooling'],
      dtype='object')
                            OLS Regression Results                            
Dep. Variable:        Life_expectancy   R-squared:                       0.658
Model:                            OLS   Adj. R-squared:                  0.657
Method:                 Least Squares   F-statistic:                     914.8
Date:                Mon, 14 Jul 2025   Prob (F-statistic):               0.00
Time:                        16:29:40   Log-Likelihood:                -2528.8
No. Observations:                2864   AIC:                             5072.
Df Residuals:                    2857   BIC:                             5113.
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                                  coef    std err          t   

In [111]:
model_non_sensitive_full = full_pipeline('Life Expectancy Data.csv', False, fulldata=True)

Index(['Infant_deaths', 'Adult_mortality', 'Alcohol_consumption',
       'Hepatitis_B', 'BMI', 'Polio', 'Incidents_HIV', 'GDP_per_capita',
       'Population_mln', 'Thinness_ten_nineteen_years', 'Schooling'],
      dtype='object')
                            OLS Regression Results                            
Dep. Variable:        Life_expectancy   R-squared:                       0.978
Model:                            OLS   Adj. R-squared:                  0.978
Method:                 Least Squares   F-statistic:                 1.141e+04
Date:                Mon, 14 Jul 2025   Prob (F-statistic):               0.00
Time:                        16:29:40   Log-Likelihood:                 1387.8
No. Observations:                2864   AIC:                            -2752.
Df Residuals:                    2852   BIC:                            -2680.
Df Model:                          11                                         
Covariance Type:            nonrobust                     

In [112]:
model_sensitive_full.save("non_sensitive_model.pkl")

In [113]:
model_non_sensitive_full.save("sensitive_model.pkl")
'''
user_info_specs_Sensitive = [
    {'prompt': "Alcohol consumption per person per year in litres?", 'type': float},
    {'prompt': "average BMI", 'type': float},
    {'prompt': "GDP per capita", 'type': int},
    {'prompt': "Population in million", 'type': float},
    {'prompt': "what percentage of the population between1 1-19 years are thin(%)", 'type': float},
    {'prompt': "Number of years of Schooling(years)",'type':float}
    '''

'\nuser_info_specs_Sensitive = [\n    {\'prompt\': "Alcohol consumption per person per year in litres?", \'type\': float},\n    {\'prompt\': "average BMI", \'type\': float},\n    {\'prompt\': "GDP per capita", \'type\': int},\n    {\'prompt\': "Population in million", \'type\': float},\n    {\'prompt\': "what percentage of the population between1 1-19 years are thin(%)", \'type\': float},\n    {\'prompt\': "Number of years of Schooling(years)",\'type\':float}\n    '