In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [2]:
# download the dataset from Kaggle
!kaggle datasets download -d kumarajarshi/life-expectancy-who -p ../Data

Dataset URL: https://www.kaggle.com/datasets/kumarajarshi/life-expectancy-who
License(s): other
Downloading life-expectancy-who.zip to ../Data
100% 119k/119k [00:00<00:00, 554kB/s]
100% 119k/119k [00:00<00:00, 553kB/s]


In [3]:
# unzip the dataset
!unzip ../Data/life-expectancy-who.zip -d ../Data

Archive:  ../Data/life-expectancy-who.zip
  inflating: ../Data/Life Expectancy Data.csv  


In [128]:
life_expectancy = pd.read_csv("../Data/Life Expectancy Data.csv")
life_expectancy

Unnamed: 0,Country,Year,Status,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,...,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.279624,65.0,1154,...,6.0,8.16,65.0,0.1,584.259210,33736494.0,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Developing,59.9,271.0,64,0.01,73.523582,62.0,492,...,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,Afghanistan,2013,Developing,59.9,268.0,66,0.01,73.219243,64.0,430,...,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.470,9.9
3,Afghanistan,2012,Developing,59.5,272.0,69,0.01,78.184215,67.0,2787,...,67.0,8.52,67.0,0.1,669.959000,3696958.0,17.9,18.0,0.463,9.8
4,Afghanistan,2011,Developing,59.2,275.0,71,0.01,7.097109,68.0,3013,...,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2933,Zimbabwe,2004,Developing,44.3,723.0,27,4.36,0.000000,68.0,31,...,67.0,7.13,65.0,33.6,454.366654,12777511.0,9.4,9.4,0.407,9.2
2934,Zimbabwe,2003,Developing,44.5,715.0,26,4.06,0.000000,7.0,998,...,7.0,6.52,68.0,36.7,453.351155,12633897.0,9.8,9.9,0.418,9.5
2935,Zimbabwe,2002,Developing,44.8,73.0,25,4.43,0.000000,73.0,304,...,73.0,6.53,71.0,39.8,57.348340,125525.0,1.2,1.3,0.427,10.0
2936,Zimbabwe,2001,Developing,45.3,686.0,25,1.72,0.000000,76.0,529,...,76.0,6.16,75.0,42.1,548.587312,12366165.0,1.6,1.7,0.427,9.8


In [129]:
# deleting extra spaces
life_expectancy.rename(columns = {'Life expectancy ': 'Life expectancy',
                                  'Measles ': 'Measles',
                                  ' BMI ': 'BMI',
                                  'under-five deaths ': 'under-five deaths',
                                  'Diphtheria ': 'Diphtheria',
                                  ' thinness  1-19 years':  'thinness  1-19 years',
                                  ' thinness 5-9 years': 'thinness 5-9 years'}, inplace = True)
life_expectancy.columns

Index(['Country', 'Year', 'Status', 'Life expectancy', 'Adult Mortality',
       'infant deaths', 'Alcohol', 'percentage expenditure', 'Hepatitis B',
       'Measles', 'BMI', 'under-five deaths', 'Polio', 'Total expenditure',
       'Diphtheria', ' HIV/AIDS', 'GDP', 'Population', 'thinness  1-19 years',
       'thinness 5-9 years', 'Income composition of resources', 'Schooling'],
      dtype='object')

In [130]:
# splitting the dataset
train, test = train_test_split(life_expectancy, test_size = 0.2)

In [131]:
# checking for missing values
trainset_missing_values = train.isna().sum()
print('train set missing values:\n', trainset_missing_values)

testset_missing_values = test.isna().sum()
print('test set missing values:\n', testset_missing_values)

train set missing values:
 Country                              0
Year                                 0
Status                               0
Life expectancy                     10
Adult Mortality                     10
infant deaths                        0
Alcohol                            164
percentage expenditure               0
Hepatitis B                        422
Measles                              0
BMI                                 29
under-five deaths                    0
Polio                               18
Total expenditure                  190
Diphtheria                          18
 HIV/AIDS                            0
GDP                                362
Population                         516
thinness  1-19 years                29
thinness 5-9 years                  29
Income composition of resources    138
Schooling                          134
dtype: int64
test set missing values:
 Country                              0
Year                                 

In [132]:
# handelling missing values for train set
cols_with_na = train.isna().sum()[train.isna().sum() > 0].index.tolist()

for column in cols_with_na:
    mean_values = train.groupby('Country')[column].transform('mean')
    train.loc[:, column] = train[column].fillna(mean_value)

In [133]:
# handelling missing values for test set
for column in cols_with_na:
    test.loc[:, column] = test[column].fillna(mean_value)

In [134]:
# checking for missing values
trainset_missing_values = train.isna().sum()
print('train set missing values:\n', trainset_missing_values)

testset_missing_values = test.isna().sum()
print('test set missing values:\n', testset_missing_values)

train set missing values:
 Country                              0
Year                                 0
Status                               0
Life expectancy                     10
Adult Mortality                     10
infant deaths                        0
Alcohol                             37
percentage expenditure               0
Hepatitis B                        109
Measles                              0
BMI                                  6
under-five deaths                    0
Polio                                4
Total expenditure                   62
Diphtheria                           4
 HIV/AIDS                            0
GDP                                192
Population                         228
thinness  1-19 years                 6
thinness 5-9 years                   6
Income composition of resources    138
Schooling                          134
dtype: int64
test set missing values:
 Country                             0
Year                                0


In [135]:
# handling missing values for columns that still have NA entries for train set
cols_with_na = train.isna().sum()[train.isna().sum() > 0].index.tolist()

mode_values = train[column].mode()[0]

for column in cols_with_na:
    train.loc[:, column] = train[column].fillna(mode_values)

In [136]:
# handelling missing values for test set
for column in cols_with_na:
    test.loc[:, column] = test[column].fillna(mode_values)

In [137]:
# checking for missing values
trainset_missing_values = train.isna().sum()
print('train set missing values:\n', trainset_missing_values)

testset_missing_values = test.isna().sum()
print('test set missing values:\n', testset_missing_values)

train set missing values:
 Country                            0
Year                               0
Status                             0
Life expectancy                    0
Adult Mortality                    0
infant deaths                      0
Alcohol                            0
percentage expenditure             0
Hepatitis B                        0
Measles                            0
BMI                                0
under-five deaths                  0
Polio                              0
Total expenditure                  0
Diphtheria                         0
 HIV/AIDS                          0
GDP                                0
Population                         0
thinness  1-19 years               0
thinness 5-9 years                 0
Income composition of resources    0
Schooling                          0
dtype: int64
test set missing values:
 Country                            0
Year                               0
Status                             0
Life expe

In [138]:
# mapping for ordinal variables for train set
label_encoder = LabelEncoder()
train.loc[:, 'Status'] = label_encoder.fit_transform(train['Status']).astype('int32')

train

Unnamed: 0,Country,Year,Status,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,...,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
1979,Papua New Guinea,2011,1,62.0,285.0,10,0.880000,23.407313,66.000,0,...,69.0,4.940000,72.0,1.0,177.867119,7.269348e+06,1.3,1.3,0.494,9.9
2077,Qatar,2009,1,77.0,79.0,0,1.110000,3688.694288,99.000,112,...,98.0,2.630000,99.0,0.1,61478.238130,1.279333e+01,5.0,4.6,0.828,11.8
2259,Senegal,2005,1,65.0,242.0,23,0.330000,96.037251,84.000,0,...,84.0,5.350000,84.0,0.7,773.869871,1.125127e+07,11.4,11.3,0.415,6.4
2765,United Kingdom of Great Britain and Northern I...,2012,0,86.0,72.0,3,10.420000,0.000000,12.900,2092,...,95.0,9.410000,95.0,0.1,12.900000,1.290000e+01,0.8,0.5,12.900,12.9
802,Egypt,2015,1,79.0,159.0,51,12.900000,0.000000,93.000,5432,...,93.0,12.900000,93.0,0.1,12.900000,1.290000e+01,2.8,2.8,0.688,13.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1064,Guatemala,2009,1,76.0,198.0,12,2.140000,445.442337,92.000,0,...,92.0,6.790000,92.0,0.4,2635.753474,1.431628e+06,1.3,1.3,0.596,10.4
1225,Iran (Islamic Republic of),2008,1,72.7,135.0,24,0.020000,0.000000,99.000,127,...,99.0,6.280000,99.0,0.1,12.691667,1.269167e+01,7.6,7.8,0.716,12.1
1838,Netherlands,2006,0,79.8,71.0,1,9.790000,8344.010392,12.900,1,...,96.0,9.360000,96.0,0.1,44453.971190,1.634611e+06,1.0,1.0,0.893,16.5
1201,India,2000,1,62.5,224.0,1800,0.930000,19.266157,9.875,38835,...,57.0,4.260000,58.0,0.3,438.864634,1.535912e+06,27.7,28.6,0.489,8.3


In [139]:
# mapping for ordinal variables for test set
test.loc[:, 'Status'] = label_encoder.transform(test['Status']).astype('int32')

test

Unnamed: 0,Country,Year,Status,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,...,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
509,Canada,2002,1,79.5,79.0,2,7.700000,3895.856308,15.872727,6,...,88.0,9.370000,88.0,0.1,24167.843100,3.136200e+04,0.5,0.4,0.872,15.8
663,Cuba,2009,1,78.1,11.0,1,4.010000,818.877102,96.000000,0,...,99.0,11.780000,96.0,0.1,5484.776300,1.478182e+01,3.4,3.2,0.782,17.7
589,Colombia,2002,1,71.8,163.0,17,4.450000,393.877363,78.000000,139,...,83.0,5.670000,81.0,0.1,2355.725857,4.157249e+07,2.5,2.3,0.656,11.5
1162,Hungary,2007,0,73.5,176.0,1,12.550000,163.343328,12.900000,0,...,99.0,7.510000,99.0,0.1,13842.654900,1.557800e+04,1.9,1.9,0.809,15.3
581,Colombia,2010,1,73.6,15.0,12,4.280000,113.243635,88.000000,0,...,88.0,6.760000,88.0,0.1,625.655440,4.591897e+06,2.2,2.0,0.695,12.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1424,Kyrgyzstan,2001,1,67.2,217.0,4,2.620000,0.000000,57.000000,17,...,99.0,4.790000,99.0,0.1,12.353846,1.235385e+01,3.6,3.6,0.593,11.8
1166,Hungary,2003,0,72.5,184.0,1,13.240000,1030.220271,15.269231,0,...,99.0,8.420000,99.0,0.1,8396.253230,1.129552e+06,2.1,2.1,0.784,14.9
673,Cyprus,2015,0,85.0,52.0,0,13.509091,0.000000,97.000000,0,...,97.0,13.509091,97.0,0.1,2375.112700,1.169850e+05,1.0,1.0,0.854,14.3
1164,Hungary,2005,0,72.9,182.0,1,12.940000,1317.083480,15.269231,2,...,99.0,8.280000,99.0,0.1,11161.724410,1.876500e+04,2.0,2.0,0.795,15.0


In [140]:
# dropping the country column
train = train.drop(columns = ['Country'])

test = test.drop(columns = ['Country'])

In [141]:
#splitting the dataset
target_column = 'Life expectancy'

X_train = train.drop(columns=[target_column])
y_train = train[target_column]

X_test = test.drop(columns=[target_column])
y_test = test[target_column]

In [142]:
# standard scaling
features = X_train.columns.tolist()

std_scaler = StandardScaler()
X_train[features] = X_train[features].astype('float64')
X_test[features] = X_test[features].astype('float64')

X_train[features] = std_scaler.fit_transform(X_train[features])
X_test[features]  = std_scaler.transform(X_test[features])

In [143]:
# generating polynomial features
poly_transformer = PolynomialFeatures(degree = 2)
poly_transformer.fit(X_train)

X_train_poly = poly_transformer.transform(X_train)
X_test_poly = poly_transformer.transform(X_test)

In [144]:
# training the model
model = LinearRegression()
model.fit(X_train_poly, y_train)

In [145]:
# predicting on the train and test sets
y_pred_train = model.predict(X_train_poly)
y_pred_test = model.predict(X_test_poly)

In [147]:
# evaluate your model
from sklearn.metrics import r2_score

r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

print("R^2 Score on Training Set:", r2_train)
print("R^2 Score on Test Set:", r2_test)

R^2 Score on Training Set: 0.8665325243524031
R^2 Score on Test Set: 0.8479118382819252
