In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the data
data = pd.read_csv(r'Life Expectancy Data.csv')

### Data Exploration

In [3]:
data.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
Country,2938.0,193.0,Afghanistan,16.0,,,,,,,
Year,2938.0,,,,2007.51872,4.613841,2000.0,2004.0,2008.0,2012.0,2015.0
Status,2938.0,2.0,Developing,2426.0,,,,,,,
Adult Mortality,2928.0,,,,164.796448,124.292079,1.0,74.0,144.0,228.0,723.0
infant deaths,2938.0,,,,30.303948,117.926501,0.0,0.0,3.0,22.0,1800.0
Alcohol,2744.0,,,,4.602861,4.052413,0.01,0.8775,3.755,7.7025,17.87
percentage expenditure,2938.0,,,,738.251295,1987.914858,0.0,4.685343,64.912906,441.534144,19479.91161
Hepatitis B,2385.0,,,,80.940461,25.070016,1.0,77.0,92.0,97.0,99.0
Measles,2938.0,,,,2419.59224,11467.272489,0.0,0.0,17.0,360.25,212183.0
BMI,2904.0,,,,38.321247,20.044034,1.0,19.3,43.5,56.2,87.3


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2938 entries, 0 to 2937
Data columns (total 22 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Country                          2938 non-null   object 
 1   Year                             2938 non-null   int64  
 2   Status                           2938 non-null   object 
 3   Adult Mortality                  2928 non-null   float64
 4   infant deaths                    2938 non-null   int64  
 5   Alcohol                          2744 non-null   float64
 6   percentage expenditure           2938 non-null   float64
 7   Hepatitis B                      2385 non-null   float64
 8   Measles                          2938 non-null   int64  
 9    BMI                             2904 non-null   float64
 10  under-five deaths                2938 non-null   int64  
 11  Polio                            2919 non-null   float64
 12  Total expenditure   

In [5]:
data.isnull().sum()

Country                              0
Year                                 0
Status                               0
Adult Mortality                     10
infant deaths                        0
Alcohol                            194
percentage expenditure               0
Hepatitis B                        553
Measles                              0
 BMI                                34
under-five deaths                    0
Polio                               19
Total expenditure                  226
Diphtheria                          19
 HIV/AIDS                            0
GDP                                448
Population                         652
 thinness  1-19 years               34
 thinness 5-9 years                 34
Income composition of resources    167
Schooling                          163
Life expectancy                     10
dtype: int64

In [6]:
data.shape

(2938, 22)

### Data Preprocessing

In [7]:
data.rename(columns={'Life expectancy ': 'Life_Expectancy', 'Adult Mortality': 'Adult_Mortality',
                     'infant deaths': 'Infant_Deaths', 'percentage expenditure': 'Percentage_Exp',
                     'Hepatitis B': 'Hepatitis_B', 'Measles ': 'Measles', ' BMI ': 'BMI',
                     'under-five deaths ': 'under_five_deaths', 'Total expenditure': 'Total_Exp',
                     'Diphtheria ': 'Diphtheria', ' HIV/AIDS': 'HIV/AIDS',
                     ' thinness  1-19 years': 'thinness_1_to_19', ' thinness 5-9 years': 'thinness_5_to_9',
                     'Income composition of resources': 'Income_Composition'}, inplace=True)

In [8]:
data = data.dropna(subset=['Life_Expectancy'], axis=0)
data.drop(['Country'], axis=1, inplace=True)

In [9]:
data.shape

(2928, 21)

In [10]:
new_data = data.copy()
new_data['Status'] = pd.get_dummies(new_data['Status'], drop_first=True).astype(int)
data_linear = data.copy()

### Create a train and test for xgboost, lightgbm, catboost, random forest and svr

In [11]:
X_train, X_test, y_train, y_test = train_test_split(new_data.drop('Life_Expectancy', axis=1),
                                                    new_data['Life_Expectancy'], test_size=0.3, random_state=42)

In [12]:
def outlier_treatment(df):
    # Copy the input data to avoid modifying the original DataFrame

    # Calculate the first quartile (Q1) and third quartile (Q3) for each column
    q1 = df.quantile(0.25, numeric_only=True)
    q3 = df.quantile(0.75, numeric_only=True)

    # Calculate the interquartile range (IQR)
    iqr = q3 - q1

    # Define the lower and upper bounds for outlier detection
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    # Replace outliers with the lower or upper bound
    for column in df.columns:
        if np.issubdtype(df[column].dtype, np.number):  # Check if the column contains numerical data
            df[column] = np.where(df[column] < lower_bound[column], lower_bound[column], df[column])
            df[column] = np.where(df[column] > upper_bound[column], upper_bound[column], df[column])

    return df

In [13]:
def filling_missing_value(data):
    for i in data.columns:
        if data[i].dtypes == object:
            data[i].fillna(data[i].mode()[0], inplace=True)
        else:
            data[i].fillna(data[i].mean(), inplace=True)
    return data

### Check the distribution of the data

In [14]:
data_linear = filling_missing_value(data_linear)
data_linear = outlier_treatment(data_linear)

from scipy import stats

for i in data_linear.columns:

    if data_linear[i].dtype in ['int64', 'float64']:

        kstest_statistic, kstest_p_value = stats.kstest(data_linear[i], 'norm')

        # 'norm' inside indicates that the test is comparing the distribution of the data against a normal distribution

        print(f'Column: {i}')
        print(f'Kolmogorov-Smirnov Test:')
        print(f'Test Statistic: {kstest_statistic}')
        print(f'p-value: {kstest_p_value}')

        if kstest_p_value > 0.05:
            print('Data looks normally distributed')
            print()
        else:
            print('Data does not look normally distributed')
            print()

Column: Year
Kolmogorov-Smirnov Test:
Test Statistic: 1.0
p-value: 0.0
Data does not look normally distributed

Column: Adult_Mortality
Kolmogorov-Smirnov Test:
Test Statistic: 0.9918195008754738
p-value: 0.0
Data does not look normally distributed

Column: Infant_Deaths
Kolmogorov-Smirnov Test:
Test Statistic: 0.5742444035709464
p-value: 0.0
Data does not look normally distributed

Column: Alcohol
Kolmogorov-Smirnov Test:
Test Statistic: 0.6414334841157909
p-value: 0.0
Data does not look normally distributed

Column: Percentage_Exp
Kolmogorov-Smirnov Test:
Test Statistic: 0.7668396579846813
p-value: 0.0
Data does not look normally distributed

Column: Hepatitis_B
Kolmogorov-Smirnov Test:
Test Statistic: 1.0
p-value: 0.0
Data does not look normally distributed

Column: Measles
Kolmogorov-Smirnov Test:
Test Statistic: 0.6094219991993618
p-value: 0.0
Data does not look normally distributed

Column: BMI
Kolmogorov-Smirnov Test:
Test Statistic: 0.9811537226523659
p-value: 0.0
Data does not

### Feature Selection

In [15]:
def target_correlation(data_for_logic, target, threshold=0.1):
    corr_matrix = data_for_logic.corr(method='spearman', numeric_only=True)

    target_correlations = corr_matrix[target].abs() > threshold

    variables_explaining_target = target_correlations[target_correlations].index.tolist()

    variables_explaining_target.remove(target)

    result_explaining_target = pd.DataFrame(
        {'Variable': variables_explaining_target,
         'Correlation with Target': corr_matrix.loc[variables_explaining_target, target]})

    result_explaining_target.reset_index(drop=True, inplace=True)

    return result_explaining_target


In [16]:
target_result = target_correlation(data_linear, 'Life_Expectancy')

target_result

Unnamed: 0,Variable,Correlation with Target
0,Year,0.157168
1,Adult_Mortality,-0.649946
2,Infant_Deaths,-0.602416
3,Alcohol,0.429921
4,Percentage_Exp,0.423159
5,Hepatitis_B,0.316868
6,Measles,-0.280746
7,BMI,0.583869
8,under_five_deaths,-0.6219
9,Polio,0.538488


In [17]:
def intercorrelation(data, target, threshold=0.8):
    corr_matrix = data.drop(columns=[target]).corr(method='spearman', numeric_only=True)

    highly_correlated_variables = (np.abs(corr_matrix) > threshold) & (corr_matrix != 1.0)

    print(highly_correlated_variables)

    independent_variable_pairs = np.where(highly_correlated_variables)

    print(independent_variable_pairs)

    result_intercorrelated_independent = pd.DataFrame({
        'Variable 1': corr_matrix.index[independent_variable_pairs[0]],
        'Variable 2': corr_matrix.columns[independent_variable_pairs[1]]
    })
    return result_intercorrelated_independent


In [18]:
intercorrelated_result = intercorrelation(data_linear, 'Life_Expectancy')

intercorrelated_result

                     Year  Adult_Mortality  Infant_Deaths  Alcohol  \
Year                False            False          False    False   
Adult_Mortality     False            False          False    False   
Infant_Deaths       False            False          False    False   
Alcohol             False            False          False    False   
Percentage_Exp      False            False          False    False   
Hepatitis_B         False            False          False    False   
Measles             False            False          False    False   
BMI                 False            False          False    False   
under_five_deaths   False            False           True    False   
Polio               False            False          False    False   
Total_Exp           False            False          False    False   
Diphtheria          False            False          False    False   
HIV/AIDS            False            False          False    False   
GDP                 

Unnamed: 0,Variable 1,Variable 2
0,Infant_Deaths,under_five_deaths
1,under_five_deaths,Infant_Deaths
2,Polio,Diphtheria
3,Diphtheria,Polio
4,thinness_1_to_19,thinness_5_to_9
5,thinness_5_to_9,thinness_1_to_19
6,Income_Composition,Schooling
7,Schooling,Income_Composition


In [19]:
target_result.Variable.tolist()

['Year',
 'Adult_Mortality',
 'Infant_Deaths',
 'Alcohol',
 'Percentage_Exp',
 'Hepatitis_B',
 'Measles',
 'BMI',
 'under_five_deaths',
 'Polio',
 'Total_Exp',
 'Diphtheria',
 'HIV/AIDS',
 'GDP',
 'thinness_1_to_19',
 'thinness_5_to_9',
 'Income_Composition',
 'Schooling']

### VIF Calculation

In [20]:
from statsmodels.stats.outliers_influence import variance_inflation_factor


def calculate_vif(data, variables):
    vif_data = data[variables]
    vif_values = [variance_inflation_factor(vif_data.values, i) for i in range(vif_data.shape[1])]
    vif_result = pd.DataFrame({'VIF': vif_values, 'Features': variables})
    return vif_result


vif_result = calculate_vif(data_linear,
                           [
                               # 'Year',
                            'Adult_Mortality',
                            'Infant_Deaths',
                            'Alcohol',
                            'Percentage_Exp',
                            # 'Hepatitis_B',
                            'Measles',
                            'BMI',
                            # 'under_five_deaths',
                            # 'Polio',
                            'Total_Exp',
                            # 'Diphtheria',
                            'HIV/AIDS',
                            'GDP',
                            # 'thinness_1_to_19',
                            'thinness_5_to_9',
                            'Income_Composition',
                            'Schooling'])

vif_result


Unnamed: 0,VIF,Features
0,4.778056,Adult_Mortality
1,2.946648,Infant_Deaths
2,3.737087,Alcohol
3,3.535329,Percentage_Exp
4,2.244313,Measles
5,7.930145,BMI
6,8.14538,Total_Exp
7,2.929098,HIV/AIDS
8,4.359218,GDP
9,3.63065,thinness_5_to_9


### Split the data for linear regression

In [21]:
data_linear = pd.concat([data_linear[vif_result.Features.tolist()], data_linear.Life_Expectancy,pd.get_dummies(data.Status ,drop_first=True).astype(int)], axis=1)
X_train_linear, X_test_linear, y_train_linear, y_test_linear = train_test_split(
    data_linear.drop('Life_Expectancy', axis=1), data_linear['Life_Expectancy'], test_size=0.3, random_state=42)

### Model Building

In [22]:
sc = StandardScaler()
models = []

xgb_model_def = XGBRegressor()
lgb_model_def = LGBMRegressor()
catboost_model_def = CatBoostRegressor()
random_forest_def = RandomForestRegressor()
linear_regression_def = LinearRegression()
svr_def = SVR()
models.extend([
    ('XGBoost', xgb_model_def),
    ('LightGBM', lgb_model_def),
    ('CatBoost', catboost_model_def),
    ('Random Forest', random_forest_def),
    ('Linear Regression', linear_regression_def),
    ('SVR', svr_def)
])

In [23]:
def train_and_evaluate_model(model_name, model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    mae = metrics.mean_absolute_error(y_test, y_pred)
    mse = metrics.mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = metrics.r2_score(y_test, y_pred)

    print(f'Model Performance for {model_name}')
    print('mae', mae)
    print('mse', mse)
    print('r2', r2)

    return r2

### Model Evaluation

In [24]:
r2_df = pd.DataFrame(columns=['Model', 'R2'])
for model_name, model in models:
    if model_name == 'Random Forest':
        X_train_filled = X_train.copy()
        X_test_filled = X_test.copy()
        
        X_train_filled = filling_missing_value(X_train_filled)
        X_test_filled = filling_missing_value(X_test_filled)
        
        gini_prob = train_and_evaluate_model(model_name, model, X_train_filled, y_train, X_test_filled, y_test)
    elif model_name == 'SVR':
        X_train_filled_svr = X_train.copy()
        X_test_filled_svr = X_test.copy()
        
        X_train_filled_svr = outlier_treatment(X_train_filled_svr)
        X_test_filled_svr = outlier_treatment(X_test_filled_svr)
        
        X_train_filled_svr = filling_missing_value(X_train_filled_svr)
        X_test_filled_svr = filling_missing_value(X_test_filled_svr)

        X_train_filled_svr = sc.fit_transform(X_train_filled_svr)
        X_test_filled_svr = sc.fit_transform(X_test_filled_svr)
        
        r2 = train_and_evaluate_model(model_name, model, X_train_filled_svr, y_train, X_test_filled_svr, y_test)
    elif model_name == 'Linear Regression':
        X_train_filled_linear = X_train_linear.copy()
        X_test_filled_linear = X_test_linear.copy()

        X_train_filled_linear = sc.fit_transform(X_train_filled_linear)
        X_test_filled_linear = sc.fit_transform(X_test_filled_linear)
        
        r2 = train_and_evaluate_model(model_name, model, X_train_filled_linear, y_train_linear, X_test_filled_linear,
                                      y_test_linear)
    else:
        r2 = train_and_evaluate_model(model_name, model, X_train, y_train, X_test, y_test)

    if r2 is not None:
        r2_df = pd.concat([r2_df, pd.DataFrame({'Model': [model_name], 'R2': [r2]})], ignore_index=True)

r2_df_sorted = r2_df.sort_values(by='R2', ascending=False)

r2_df_sorted


Model Performance for XGBoost
mae 1.2739142026239423
mse 3.903783291267915
r2 0.9581993349653307
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000406 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3317
[LightGBM] [Info] Number of data points in the train set: 2049, number of used features: 20
[LightGBM] [Info] Start training from score 69.324549
Model Performance for LightGBM
mae 1.2031057774209917
mse 3.537576904453295
r2 0.9621205747900502
Learning rate set to 0.045857
0:	learn: 9.1376888	total: 170ms	remaining: 2m 49s
1:	learn: 8.8436535	total: 172ms	remaining: 1m 25s
2:	learn: 8.5319951	total: 174ms	remaining: 57.8s
3:	learn: 8.2546549	total: 176ms	remaining: 43.7s
4:	learn: 7.9819945	total: 177ms	remaining: 35.3s
5:	learn: 7.7198074	total: 179ms	remaining: 29.6s
6:	learn: 7.4913605	total: 181ms	remaining: 25.7s
7:	learn: 7.2455454	total: 183ms	remaining: 22.7s
8:	learn: 7.0192714	total: 185m

Unnamed: 0,Model,R2
2,CatBoost,0.965991
3,Random Forest,0.965991
1,LightGBM,0.962121
0,XGBoost,0.958199
5,SVR,0.880852
4,Linear Regression,0.840204


### Hyperparameter Tuning

In [25]:
from sklearn.model_selection import cross_val_score
import optuna


def best_params_for_model(trial):
    param = {
        'C': trial.suggest_loguniform('C', 0.1, 10),  # Regularization parameter
        'kernel': trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf', 'sigmoid']),  # Kernel type
        'degree': trial.suggest_int('degree', 2, 5),  # Degree for polynomial kernel
        'gamma': trial.suggest_categorical('gamma', ['auto', 'scale']),
        # Kernel coefficient for 'rbf', 'poly', and 'sigmoid'
        'epsilon': trial.suggest_loguniform('epsilon', 0.01, 1.0)  # Epsilon in the epsilon-SVR model
    }

    svr = SVR(**param)

    score = cross_val_score(svr, X_train_filled_svr, y_train, cv=3, scoring='r2', n_jobs=-1).mean()

    return score


study = optuna.create_study(direction='maximize')
study.optimize(best_params_for_model, n_trials=1)

print('Best trial:')
best_params = study.best_params
print('  Value: {:.3f}'.format(study.best_value))
print('  Params: ', best_params)
best_svr_model = SVR(**best_params)

[I 2024-05-19 18:57:57,298] A new study created in memory with name: no-name-b930156f-760d-498d-baf7-39be2a1aa3ec
  'C': trial.suggest_loguniform('C', 0.1, 10),  # Regularization parameter
  'epsilon': trial.suggest_loguniform('epsilon', 0.01, 1.0)  # Epsilon in the epsilon-SVR model
[I 2024-05-19 18:58:01,215] Trial 0 finished with value: 0.8721462257129334 and parameters: {'C': 1.694602948034969, 'kernel': 'rbf', 'degree': 2, 'gamma': 'scale', 'epsilon': 0.012296052612772893}. Best is trial 0 with value: 0.8721462257129334.


Best trial:
  Value: 0.872
  Params:  {'C': 1.694602948034969, 'kernel': 'rbf', 'degree': 2, 'gamma': 'scale', 'epsilon': 0.012296052612772893}


In [26]:
from sklearn.model_selection import cross_val_score
import optuna


def best_params_for_model(trial):
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 1000),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1.0),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_uniform('subsample', 0.5, 1),
        #  fraction of the training data used to train each tree.
        # Using a fraction of the data can help to prevent overfitting and improve generalization.
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1),
        'gamma': trial.suggest_int('gamma', 0, 5)
        # Increasing the gamma value can help to prevent overfitting
        # Should be used only when you are using high depth
    }
    xgb_reg = XGBRegressor(**param)

    auc = cross_val_score(xgb_reg, X_train, y_train, cv=3, scoring='r2', n_jobs=-1).mean()
    return auc


study = optuna.create_study(direction='maximize')
study.optimize(best_params_for_model, n_trials=1)

print('Best trial:')
best_params = study.best_params
print('  Value: {:.3f}'.format(study.best_value))
print('  Params: ', best_params)

best_xgb_model = XGBRegressor(**best_params)

[I 2024-05-19 18:58:01,227] A new study created in memory with name: no-name-2dc8e97e-ee54-46ab-8c3d-eb6766ac5741
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1.0),
  'subsample': trial.suggest_uniform('subsample', 0.5, 1),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1),
[I 2024-05-19 18:58:03,274] Trial 0 finished with value: 0.9414702733992337 and parameters: {'n_estimators': 827, 'learning_rate': 0.4390243105728273, 'max_depth': 9, 'subsample': 0.6971297656494345, 'colsample_bytree': 0.9879500832485928, 'gamma': 1}. Best is trial 0 with value: 0.9414702733992337.


Best trial:
  Value: 0.941
  Params:  {'n_estimators': 827, 'learning_rate': 0.4390243105728273, 'max_depth': 9, 'subsample': 0.6971297656494345, 'colsample_bytree': 0.9879500832485928, 'gamma': 1}


In [27]:
from sklearn.model_selection import cross_val_score
import optuna

def best_params_for_model(trial):
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 1000),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1.0),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'num_leaves': trial.suggest_int('num_leaves', 10, 100)
    }

    lgb_reg = LGBMRegressor(**param)

    auc = cross_val_score(lgb_reg, X_train, y_train, cv=3, scoring='r2', n_jobs=-1).mean()
    return auc

study = optuna.create_study(direction='maximize')
study.optimize(best_params_for_model, n_trials=1)

print('Best trial:')
best_params = study.best_params
print('  Value: {:.3f}'.format(study.best_value))
print('  Params: ', best_params)

best_lgb_model = LGBMRegressor(**best_params)

[I 2024-05-19 18:58:03,285] A new study created in memory with name: no-name-2a56fbc3-5508-4fd8-b15d-70d330516c81
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1.0),
[I 2024-05-19 18:58:06,481] Trial 0 finished with value: 0.9374443875603312 and parameters: {'n_estimators': 78, 'learning_rate': 0.0437563402536392, 'max_depth': 4, 'num_leaves': 20}. Best is trial 0 with value: 0.9374443875603312.


Best trial:
  Value: 0.937
  Params:  {'n_estimators': 78, 'learning_rate': 0.0437563402536392, 'max_depth': 4, 'num_leaves': 20}


In [28]:
def best_params_for_model(trial):

    param = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1.0),
        'depth': trial.suggest_int('depth', 3, 10),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 0.1, 10), # Regularization term that prevents overfitting by penalizing large parameter values.
        # 'cat_features': [],  # Handle categorical features separately
        'loss_function': trial.suggest_categorical('loss_function', ['RMSE']) # For regression tasks, use ‘RMSE,’ while for classification, use ‘Logloss’.
    }

    cb_reg = CatBoostRegressor(**param)

    auc = cross_val_score(cb_reg, X_train, y_train, cv=3, scoring='r2', n_jobs=-1).mean()
    return auc



study = optuna.create_study(direction='maximize')
study.optimize(best_params_for_model, n_trials=1)

print('Best trial:')
best_params = study.best_params
print('  Value: {:.3f}'.format(study.best_value))
print('  Params: ', best_params)

best_cb_model = CatBoostRegressor(**best_params)

[I 2024-05-19 18:58:06,494] A new study created in memory with name: no-name-61c6b994-4e2a-40cf-8b97-2df8386bf09d
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1.0),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 0.1, 10), # Regularization term that prevents overfitting by penalizing large parameter values.
[I 2024-05-19 18:58:12,713] Trial 0 finished with value: 0.9598020110382652 and parameters: {'iterations': 922, 'learning_rate': 0.05332780078851239, 'depth': 6, 'l2_leaf_reg': 0.3218723981891117, 'loss_function': 'RMSE'}. Best is trial 0 with value: 0.9598020110382652.


Best trial:
  Value: 0.960
  Params:  {'iterations': 922, 'learning_rate': 0.05332780078851239, 'depth': 6, 'l2_leaf_reg': 0.3218723981891117, 'loss_function': 'RMSE'}


In [29]:

def best_params_for_linear_model(trial):
    param = {
        'fit_intercept': trial.suggest_categorical('fit_intercept', [True, False]),
        'copy_X': trial.suggest_categorical('copy_X', [True, False])
    }

    lr_reg = LinearRegression(**param)

    auc = cross_val_score(lr_reg, X_train_filled_linear, y_train, cv=3, scoring='r2', n_jobs=-1).mean()
    return auc


study = optuna.create_study(direction='maximize')
study.optimize(best_params_for_linear_model, n_trials=1)

print('Best trial:')
best_params = study.best_params
print('  Value: {:.3f}'.format(study.best_value))
print('  Params: ', best_params)

best_lr_model = LinearRegression(**best_params)

[I 2024-05-19 18:58:12,725] A new study created in memory with name: no-name-f52af866-af29-45ae-8333-a53d0492859f
[I 2024-05-19 18:58:14,494] Trial 0 finished with value: 0.8339330366799849 and parameters: {'fit_intercept': True, 'copy_X': False}. Best is trial 0 with value: 0.8339330366799849.


Best trial:
  Value: 0.834
  Params:  {'fit_intercept': True, 'copy_X': False}


In [30]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
import optuna

def best_params_for_rf(trial):
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
        'max_features': trial.suggest_categorical('max_features', [ 'sqrt', 'log2']),
    }

    rf_reg = RandomForestRegressor(**param)

    auc = cross_val_score(rf_reg, X_train_filled, y_train, cv=3, scoring='r2', n_jobs=-1).mean()
    return auc


study_rf = optuna.create_study(direction='maximize')
study_rf.optimize(best_params_for_rf, n_trials=100)

print('Best trial for RandomForest:')
best_params_rf = study_rf.best_params
print('  Value: {:.3f}'.format(study_rf.best_value))
print('  Params: ', best_params_rf)

best_rf_model = RandomForestRegressor(**best_params_rf)


[I 2024-05-19 18:58:14,503] A new study created in memory with name: no-name-7e8af564-4f4c-4579-8c0a-cdf4567a57c2
[I 2024-05-19 18:58:17,421] Trial 0 finished with value: 0.8505644442722126 and parameters: {'n_estimators': 568, 'max_depth': 3, 'min_samples_split': 6, 'min_samples_leaf': 16, 'max_features': 'log2'}. Best is trial 0 with value: 0.8505644442722126.
[I 2024-05-19 18:58:18,060] Trial 1 finished with value: 0.8824931269961641 and parameters: {'n_estimators': 141, 'max_depth': 4, 'min_samples_split': 6, 'min_samples_leaf': 10, 'max_features': 'log2'}. Best is trial 1 with value: 0.8824931269961641.
[I 2024-05-19 18:58:19,175] Trial 2 finished with value: 0.8842083651454381 and parameters: {'n_estimators': 291, 'max_depth': 4, 'min_samples_split': 13, 'min_samples_leaf': 8, 'max_features': 'sqrt'}. Best is trial 2 with value: 0.8842083651454381.
[I 2024-05-19 18:58:20,202] Trial 3 finished with value: 0.8800821472938055 and parameters: {'n_estimators': 278, 'max_depth': 4, 'mi

Best trial for RandomForest:
  Value: 0.947
  Params:  {'n_estimators': 830, 'max_depth': 10, 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_features': 'log2'}


### Model Evaluation with Optimized Hyperparameters

In [31]:
models_optimized = []

models_optimized.extend([
    ('XGBoost Optuna', best_xgb_model),
    ('LightGBM Optuna', best_lgb_model),
    ('CatBoost Optuna', best_cb_model),
    ('Random Forest Optuna', best_rf_model),
    ('Linear Regression Optuna', best_lr_model),
    ('SVR Optuna', best_svr_model)
])

In [32]:
r2_df_optuna = pd.DataFrame(columns=['Model', 'R2'])

for model_name, model in models_optimized:
    if model_name == 'Random Forest Optuna':
        gini_prob = train_and_evaluate_model(model_name, model, X_train_filled, y_train, X_test_filled, y_test)
    elif model_name == 'SVR Optuna':
        r2 = train_and_evaluate_model(model_name, model, X_train_filled_svr, y_train, X_test_filled_svr, y_test)
    elif model_name == 'Linear Regression Optuna':
        r2 = train_and_evaluate_model(model_name, model, X_train_filled_linear, y_train_linear, X_test_filled_linear,
                                      y_test_linear)
    if r2 is not None:
        r2_df_optuna = pd.concat([r2_df_optuna, pd.DataFrame({'Model': [model_name], 'R2': [r2]})], ignore_index=True)

r2_df_sorted_optuna = r2_df_optuna.sort_values(by='R2', ascending=False)


Model Performance for Random Forest Optuna
mae 1.3925362880532575
mse 4.418260492573227
r2 0.9526904510045223
Model Performance for Linear Regression Optuna
mae 2.897383223170628
mse 14.819406783804851
r2 0.8402041804175702
Model Performance for SVR Optuna
mae 2.0750974079379896
mse 9.279025379004164
r2 0.9006426835773531


In [33]:

r2_df_sorted_optuna


Unnamed: 0,Model,R2
5,SVR Optuna,0.900643
0,XGBoost Optuna,0.880852
1,LightGBM Optuna,0.880852
2,CatBoost Optuna,0.880852
3,Random Forest Optuna,0.880852
4,Linear Regression Optuna,0.840204


### Final Review

In [34]:
final_review = pd.concat([r2_df_sorted,r2_df_sorted_optuna], axis=0)

final_review_sorted = final_review.sort_values(by='R2', ascending=False)
final_review_sorted.reset_index(drop=True, inplace=True)

final_review_sorted

Unnamed: 0,Model,R2
0,CatBoost,0.965991
1,Random Forest,0.965991
2,LightGBM,0.962121
3,XGBoost,0.958199
4,SVR Optuna,0.900643
5,SVR,0.880852
6,XGBoost Optuna,0.880852
7,LightGBM Optuna,0.880852
8,CatBoost Optuna,0.880852
9,Random Forest Optuna,0.880852


### Stacking

In [35]:
from mlxtend.regressor import StackingCVRegressor

In [36]:
reg1 = xgb_model_def
reg2 = lgb_model_def

In [37]:
meta_reg = catboost_model_def

In [38]:
stacking_reg = StackingCVRegressor(regressors=[reg1, reg2],
                                   meta_regressor=meta_reg,
                                   cv=5,
                                   use_features_in_secondary=True,
                                   verbose=1,
                                   random_state=42)

In [39]:
train_and_evaluate_model('Stacking Regressor with Cat, Xgb and LGBM', stacking_reg, X_train_filled, y_train, X_test_filled, y_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    1.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000284 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3243
[LightGBM] [Info] Number of data points in the train set: 1639, number of used features: 20
[LightGBM] [Info] Start training from score 69.232581
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000219 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3250
[LightGBM] [Info] Number of data points in the train set: 1639, number of used features: 20
[LightGBM] [Info] Start training from score 69.262782
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000291 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3238
[LightGBM] [Info] Number of data points in the train set: 1639, number of used features: 20
[LightGBM] [Info] Start trai

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.2s finished


13:	learn: 5.6197582	total: 32.9ms	remaining: 2.32s
14:	learn: 5.4230785	total: 35.5ms	remaining: 2.33s
15:	learn: 5.2387250	total: 37.8ms	remaining: 2.33s
16:	learn: 5.0577505	total: 40.3ms	remaining: 2.33s
17:	learn: 4.8860023	total: 42.6ms	remaining: 2.32s
18:	learn: 4.7225766	total: 44.9ms	remaining: 2.32s
19:	learn: 4.5648624	total: 47.5ms	remaining: 2.33s
20:	learn: 4.4193143	total: 50ms	remaining: 2.33s
21:	learn: 4.2771845	total: 52.4ms	remaining: 2.33s
22:	learn: 4.1394623	total: 54.9ms	remaining: 2.33s
23:	learn: 4.0086891	total: 57.5ms	remaining: 2.34s
24:	learn: 3.8802715	total: 60.2ms	remaining: 2.35s
25:	learn: 3.7644050	total: 62.9ms	remaining: 2.36s
26:	learn: 3.6528220	total: 65.6ms	remaining: 2.36s
27:	learn: 3.5543981	total: 68ms	remaining: 2.36s
28:	learn: 3.4501999	total: 70.6ms	remaining: 2.36s
29:	learn: 3.3607114	total: 73.1ms	remaining: 2.36s
30:	learn: 3.2706974	total: 75.7ms	remaining: 2.37s
31:	learn: 3.1858520	total: 78.6ms	remaining: 2.38s
32:	learn: 3.113

0.9654412800800484

### Univariate Feature Selection

In [42]:
variables = []
train_r2_scores = []
test_r2_scores = []

for i in X_train.columns:
    X_train_single_var = X_train_filled[[i]]
    X_test_single_var = X_test_filled[[i]]

    # Fit a regression model using the single variable on the training set
    catboost_model_def.fit(X_train_single_var, y_train)
    y_pred_train_single_var = catboost_model_def.predict(X_train_single_var)

    # Calculate R2 for the single variable model on the training set
    train_r2 = metrics.r2_score(y_train, y_pred_train_single_var)

    # Fit a regression model using the single variable on the test set
    y_pred_test_single_var = catboost_model_def.predict(X_test_single_var)

    # Calculate R2 for the single variable model on the test set
    test_r2 = metrics.r2_score(y_test, y_pred_test_single_var)

    # Append the results to the lists
    variables.append(i)
    train_r2_scores.append(train_r2)
    test_r2_scores.append(test_r2)

# Create a DataFrame from the results

results_df = pd.DataFrame({'Variable': variables, 'Train R2': train_r2_scores, 'Test R2': test_r2_scores})

results_df_sorted = results_df.sort_values(by='Test R2', ascending=False)

pd.options.display.float_format = '{:.4f}'.format

results_df_sorted


Learning rate set to 0.045857
0:	learn: 9.4458386	total: 588us	remaining: 588ms
1:	learn: 9.4337572	total: 1.06ms	remaining: 531ms
2:	learn: 9.4224713	total: 1.65ms	remaining: 549ms
3:	learn: 9.4136607	total: 1.99ms	remaining: 495ms
4:	learn: 9.4050576	total: 2.59ms	remaining: 515ms
5:	learn: 9.3970692	total: 3.04ms	remaining: 504ms
6:	learn: 9.3894581	total: 3.63ms	remaining: 516ms
7:	learn: 9.3821692	total: 4.14ms	remaining: 513ms
8:	learn: 9.3760599	total: 4.52ms	remaining: 497ms
9:	learn: 9.3702140	total: 5.1ms	remaining: 505ms
10:	learn: 9.3647172	total: 5.68ms	remaining: 511ms
11:	learn: 9.3595027	total: 6.25ms	remaining: 515ms
12:	learn: 9.3547816	total: 6.75ms	remaining: 513ms
13:	learn: 9.3507368	total: 7.35ms	remaining: 518ms
14:	learn: 9.3472900	total: 7.91ms	remaining: 520ms
15:	learn: 9.3434488	total: 8.55ms	remaining: 526ms
16:	learn: 9.3400104	total: 9.14ms	remaining: 529ms
17:	learn: 9.3367563	total: 9.72ms	remaining: 531ms
18:	learn: 9.3341648	total: 10.3ms	remaining: 

Unnamed: 0,Variable,Train R2,Test R2
2,Adult_Mortality,0.8153,0.8506
13,HIV/AIDS,0.6789,0.6698
18,Income_Composition,0.735,0.654
19,Schooling,0.6125,0.5556
8,BMI,0.6054,0.4809
17,thinness_5_to_9,0.5228,0.4038
10,Polio,0.4319,0.3968
9,under_five_deaths,0.5276,0.3941
3,Infant_Deaths,0.4654,0.3899
16,thinness_1_to_19,0.5165,0.3893


In [45]:
results_df[results_df['Test R2']>0.2].Variable.tolist()

['Status',
 'Adult_Mortality',
 'Infant_Deaths',
 'Percentage_Exp',
 'BMI',
 'under_five_deaths',
 'Polio',
 'Diphtheria',
 'HIV/AIDS',
 'GDP',
 'thinness_1_to_19',
 'thinness_5_to_9',
 'Income_Composition',
 'Schooling']

### Final Model

In [48]:
fin_input = new_data[results_df[results_df['Test R2']>0.2].Variable.tolist()]
fin_output =data['Life_Expectancy']
X_train_uni, X_test_uni, y_train_uni, y_test_uni = train_test_split(fin_input, fin_output, test_size=0.3, random_state=42)
catboost_model_def = catboost_model_def.fit(X_train_uni, y_train_uni)

Learning rate set to 0.045857
0:	learn: 9.1118040	total: 1.9ms	remaining: 1.89s
1:	learn: 8.8172222	total: 3.2ms	remaining: 1.6s
2:	learn: 8.5266300	total: 4.49ms	remaining: 1.49s
3:	learn: 8.2395840	total: 5.94ms	remaining: 1.48s
4:	learn: 7.9800010	total: 7.26ms	remaining: 1.45s
5:	learn: 7.7160629	total: 8.74ms	remaining: 1.45s
6:	learn: 7.4624616	total: 9.92ms	remaining: 1.41s
7:	learn: 7.2254020	total: 11.3ms	remaining: 1.4s
8:	learn: 6.9816552	total: 12.6ms	remaining: 1.39s
9:	learn: 6.7556149	total: 13.9ms	remaining: 1.38s
10:	learn: 6.5445763	total: 15.2ms	remaining: 1.36s
11:	learn: 6.3419108	total: 16.5ms	remaining: 1.36s
12:	learn: 6.1481759	total: 17.9ms	remaining: 1.36s
13:	learn: 5.9741676	total: 19.5ms	remaining: 1.37s
14:	learn: 5.8089099	total: 21ms	remaining: 1.38s
15:	learn: 5.6463647	total: 22.4ms	remaining: 1.38s
16:	learn: 5.4778938	total: 23.8ms	remaining: 1.38s
17:	learn: 5.3322923	total: 25.4ms	remaining: 1.39s
18:	learn: 5.1902949	total: 27ms	remaining: 1.39s


In [49]:
train_and_evaluate_model('CatBoost for selected features', catboost_model_def, X_train_uni, y_train_uni, X_test_uni, y_test_uni)

Learning rate set to 0.045857
0:	learn: 9.1118040	total: 1.85ms	remaining: 1.85s
1:	learn: 8.8172222	total: 3.2ms	remaining: 1.6s
2:	learn: 8.5266300	total: 4.51ms	remaining: 1.5s
3:	learn: 8.2395840	total: 6.14ms	remaining: 1.53s
4:	learn: 7.9800010	total: 7.58ms	remaining: 1.51s
5:	learn: 7.7160629	total: 8.91ms	remaining: 1.48s
6:	learn: 7.4624616	total: 10.1ms	remaining: 1.44s
7:	learn: 7.2254020	total: 11.4ms	remaining: 1.42s
8:	learn: 6.9816552	total: 12.7ms	remaining: 1.4s
9:	learn: 6.7556149	total: 14ms	remaining: 1.39s
10:	learn: 6.5445763	total: 15.7ms	remaining: 1.41s
11:	learn: 6.3419108	total: 17.1ms	remaining: 1.41s
12:	learn: 6.1481759	total: 18.8ms	remaining: 1.43s
13:	learn: 5.9741676	total: 20.3ms	remaining: 1.43s
14:	learn: 5.8089099	total: 21.7ms	remaining: 1.43s
15:	learn: 5.6463647	total: 23ms	remaining: 1.42s
16:	learn: 5.4778938	total: 24.5ms	remaining: 1.42s
17:	learn: 5.3322923	total: 25.7ms	remaining: 1.4s
18:	learn: 5.1902949	total: 27ms	remaining: 1.39s
19:

0.9654482490541535