# Scikit-learn Pipeline - Regression

## 1. Import libraries and data
Dataset: [Life Expectancy](https://www.kaggle.com/datasets/kumarajarshi/life-expectancy-who?select=Life+Expectancy+Data.csv)

In [24]:
# import libraries
import joblib
import pandas as pd
import seaborn as sns

from ydata_profiling import ProfileReport

from sklearn.model_selection import train_test_split

from sklearn.ensemble import ExtraTreesRegressor

from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error

from yellowbrick.regressor import prediction_error

In [11]:
# Read the life-expectancy CSV from the notebook's working directory,
# report its (rows, columns) shape, then preview the first five records.
df = pd.read_csv(filepath_or_buffer='./Life Expectancy Data.csv')
print(df.shape)
df.head(n=5)

(2938, 22)


Unnamed: 0,Country,Year,Status,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,...,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.279624,65.0,1154,...,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Developing,59.9,271.0,64,0.01,73.523582,62.0,492,...,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,Afghanistan,2013,Developing,59.9,268.0,66,0.01,73.219243,64.0,430,...,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9
3,Afghanistan,2012,Developing,59.5,272.0,69,0.01,78.184215,67.0,2787,...,67.0,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8
4,Afghanistan,2011,Developing,59.2,275.0,71,0.01,7.097109,68.0,3013,...,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5


## 2. EDA and Data Cleansing

In [5]:
# Show the column labels exactly as they are stored on the DataFrame.
df.columns

Index(['Country', 'Year', 'Status', 'Life expectancy ', 'Adult Mortality',
       'infant deaths', 'Alcohol', 'percentage expenditure', 'Hepatitis B',
       'Measles ', ' BMI ', 'under-five deaths ', 'Polio', 'Total expenditure',
       'Diphtheria ', ' HIV/AIDS', 'GDP', 'Population',
       ' thinness  1-19 years', ' thinness 5-9 years',
       'Income composition of resources', 'Schooling'],
      dtype='object')

In [12]:
# Fix the target column name: the raw header is 'Life expectancy ' (with a
# trailing space). Renaming by label instead of re-typing every header means
# a change in the CSV's column order or count can never silently mislabel the
# other columns. All remaining labels are left exactly as they are so that
# downstream column names stay unchanged.
df = df.rename(columns={'Life expectancy ': 'LifeExpectancy'})

In [7]:
# Missing-value count per column, keeping only the columns that have any NaN
df.isna().sum().loc[lambda null_counts: null_counts > 0]

LifeExpectancy                      10
Adult Mortality                     10
Alcohol                            194
Hepatitis B                        553
 BMI                                34
Polio                               19
Total expenditure                  226
Diphtheria                          19
GDP                                448
Population                         652
 thinness  1-19 years               34
 thinness 5-9 years                 34
Income composition of resources    167
Schooling                          163
dtype: int64

In [8]:
# Median life expectancy per country, highest first
(
    df.groupby('Country')['LifeExpectancy']
    .median()
    .sort_values(ascending=False)
)

Country
Japan                    82.55
Switzerland              82.20
Iceland                  81.95
Singapore                81.85
Sweden                   81.80
                         ...  
Niue                       NaN
Palau                      NaN
Saint Kitts and Nevis      NaN
San Marino                 NaN
Tuvalu                     NaN
Name: LifeExpectancy, Length: 193, dtype: float64

In [13]:
# Drop Country since it can have a strong correlation with Life Expectancy
df = df.drop(columns='Country')

# Encode Status as 0 (Developing) / 1 (Developed).
# The result is assigned back rather than calling
# `df['Status'].replace(..., inplace=True)`: that chained in-place form
# mutates an intermediate Series, triggers a FutureWarning on pandas 2.x and
# leaves `df` unchanged once Copy-on-Write is enabled.
# NOTE: any value other than the two listed labels would become NaN here.
df['Status'] = df['Status'].map({'Developing': 0, 'Developed': 1})
df.head()

Unnamed: 0,Year,Status,LifeExpectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,BMI,...,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,2015,0,65.0,263.0,62,0.01,71.279624,65.0,1154,19.1,...,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1
1,2014,0,59.9,271.0,64,0.01,73.523582,62.0,492,18.6,...,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,2013,0,59.9,268.0,66,0.01,73.219243,64.0,430,18.1,...,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9
3,2012,0,59.5,272.0,69,0.01,78.184215,67.0,2787,17.6,...,67.0,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8
4,2011,0,59.2,275.0,71,0.01,7.097109,68.0,3013,17.2,...,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5


In [15]:
# Rows without a target value cannot be used for fitting or scoring, so they
# are removed (10 rows). NaN in the other columns is kept and will be imputed
# later inside the pipeline.
df = df.dropna(subset=['LifeExpectancy'])
df.shape

(2928, 21)

In [16]:
# The 15 numeric columns with the largest (signed) correlation to the target;
# strongly negative correlations are not surfaced by this view.
(
    df.corr(numeric_only=True)
    .loc[:, 'LifeExpectancy']
    .nlargest(n=15)
)

LifeExpectancy                     1.000000
Schooling                          0.751975
Income composition of resources    0.724776
 BMI                               0.567694
Status                             0.482136
Diphtheria                         0.479495
Polio                              0.465556
GDP                                0.461455
Alcohol                            0.404877
percentage expenditure             0.381864
Hepatitis B                        0.256762
Total expenditure                  0.218086
Year                               0.170033
Population                        -0.021538
Measles                           -0.157586
Name: LifeExpectancy, dtype: float64

## 3. Set aside unseen data and build pipeline

In [17]:
# Set aside 100 rows as an unseen data set that takes no part in training or
# testing. The fixed random_state makes the same rows get held out on every
# Restart & Run All, so results are comparable between runs.
data_unseen = df.sample(n=100, random_state=42)
data        = df.drop(data_unseen.index)
print(f'Data for model: {data.shape},\nData for unseen predictions: {data_unseen.shape}')

Data for model: (2828, 21),
Data for unseen predictions: (100, 21)


In [18]:
# Separate the target (y) from the feature matrix (X)
y = data['LifeExpectancy']
X = data.drop(columns=['LifeExpectancy'])

In [19]:
# Split the data into a training set (70%) and a test set (30%).
# The fixed random_state keeps the split identical across runs; without it
# every execution evaluates on a different test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [20]:
# Print dtypes and non-null counts for every column of the modelling data.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2828 entries, 0 to 2936
Data columns (total 21 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Year                             2828 non-null   int64  
 1   Status                           2828 non-null   int64  
 2   LifeExpectancy                   2828 non-null   float64
 3   Adult Mortality                  2828 non-null   float64
 4   infant deaths                    2828 non-null   int64  
 5   Alcohol                          2643 non-null   float64
 6   percentage expenditure           2828 non-null   float64
 7   Hepatitis B                      2297 non-null   float64
 8   Measles                          2828 non-null   int64  
 9    BMI                             2798 non-null   float64
 10  under-five deaths                2828 non-null   int64  
 11  Polio                            2811 non-null   float64
 12  Total expenditure   

In [21]:
# Partition the feature names by dtype: object columns are treated as
# categorical, every other column as numeric.
cat_cols = list(X_train.select_dtypes(include='object').columns)
num_cols = list(X_train.select_dtypes(exclude='object').columns)
print(f'Numeric columns: \n {num_cols}\n\n Catrgory columns: {cat_cols}')

Numeric columns: 
 ['Year', 'Status', 'Adult Mortality', 'infant deaths', 'Alcohol', 'percentage expenditure', 'Hepatitis B', 'Measles ', ' BMI ', 'under-five deaths ', 'Polio', 'Total expenditure', 'Diphtheria ', ' HIV/AIDS', 'GDP', 'Population', ' thinness  1-19 years', ' thinness 5-9 years', 'Income composition of resources', 'Schooling']

 Catrgory columns: []


In [22]:
# Numeric preprocessing: fill missing values with the column median, then
# standardise. The step names are the ones make_pipeline would generate.
num_pipe = Pipeline(steps=[
    ('simpleimputer', SimpleImputer(strategy='median')),
    ('standardscaler', StandardScaler()),
])
num_pipe

In [25]:
# Pipeline for categorical columns: replace NaN with the most frequent
# category, then one-hot encode.
# - KNNImputer is distance-based and only accepts numeric input, so it would
#   fail on string categories; SimpleImputer(strategy='most_frequent') works
#   for object columns.
# - OneHotEncoder's `sparse` argument was deprecated in scikit-learn 1.2 and
#   removed in 1.4; `sparse_output` is its replacement (requires >= 1.2).
cat_pipe = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(handle_unknown='ignore', sparse_output=False)
)
cat_pipe

In [26]:
# Combine both pipelines into one transformer: each column group is routed
# through its own preprocessing pipeline.
full_pipe = ColumnTransformer(
    transformers=[
        ('num', num_pipe, num_cols),
        ('cat', cat_pipe, cat_cols),
    ]
)
full_pipe

In [28]:
# Build the model: preprocessing followed by an Extra-Trees regressor.
# ExtraTreesRegressor is randomised, so a fixed random_state is needed for the
# fitted model (and the metrics computed from it) to be repeatable.
et_life_expec = make_pipeline(full_pipe, ExtraTreesRegressor(random_state=42))
et_life_expec

In [30]:
# Fit the preprocessing steps and the regressor on the training split only
et_life_expec.fit(X=X_train, y=y_train)

In [36]:
# Predict life expectancy for the test-split rows
y_pred = et_life_expec.predict(X=X_test)

## 5. Measure the model performance

In [33]:
# Regression metrics on the test split, plus the actual and predicted means
# as a quick check for systematic over- or under-prediction
mean_act = y_test.mean()
mean_pred = y_pred.mean()
r2 = r2_score(y_true=y_test, y_pred=y_pred)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
mape = mean_absolute_percentage_error(y_true=y_test, y_pred=y_pred)
print(f'r2: {r2}\nmae: {mae}\nact_mean: {mean_act}\npred_mean: {mean_pred}\nmape: {mape}')

r2: 0.969666607692539
mae: 0.9755641931684358
act_mean: 69.23757361601884
pred_mean: 69.26019787985867
mape: 0.014576917739932476


## 6. Save model

In [37]:
# Persist the fitted pipeline (preprocessing + regressor) to disk and print
# its structure. joblib is imported in the notebook's import cell so that all
# dependencies are declared in one place.
# NOTE: joblib files are pickle-based; only load them from trusted sources.
joblib.dump(et_life_expec, './et_life_expec_sklearn.pkl')
print(et_life_expec)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('standardscaler',
                                                                   StandardScaler())]),
                                                  ['Year', 'Status',
                                                   'Adult Mortality',
                                                   'infant deaths', 'Alcohol',
                                                   'percentage expenditure',
                                                   'Hepatitis B', 'Measles ',
                                                   ' BMI ',
                                                   'under-five deaths ',
                                        