In [2]:
# Import necessary packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import seaborn as sns
import statsmodels.api as sm

In [3]:
# Load the HDI dataset from the local input folder and preview the first rows.
hdi_csv_path = './input/HDI.csv'
df = pd.read_csv(hdi_csv_path)
df.head(5)

Unnamed: 0.1,Unnamed: 0,Id,Country,HDI Rank,HDI,Life expectancy,Mean years of schooling,Gross national income (GNI) per capita,GNI per capita rank minus HDI rank,Change in HDI rank 2010-2015,...,Coefficient of human inequality,Inequality in life expectancy (%) 2010-2015,Inequality-adjusted life expectancy index,Inequality in education(%),Inequality-adjusted education index,Inequality in income (%),Inequality-adjusted income index,Income inequality (Quintile ratio) 2010-2015,Income inequality (Palma ratio) 2010-2015,Income inequality (Gini coefficient) 2010-2015
0,0,1,Norway,1.0,0.949,81.7,12.7,67614.0,5.0,0.0,...,5.4,3.3,0.918,2.4,0.894,10.4,0.882,3.8,0.9,25.9
1,1,2,Australia,2.0,0.939,82.5,13.2,42822.0,19.0,1.0,...,8.0,4.3,0.921,1.9,0.921,17.7,0.753,6.0,1.4,34.9
2,2,3,Switzerland,2.0,0.939,83.1,13.4,56364.0,7.0,0.0,...,8.4,3.8,0.934,5.7,0.84,15.7,0.806,4.9,1.2,31.6
3,3,4,Germany,4.0,0.926,81.1,13.2,45000.0,13.0,0.0,...,7.0,3.7,0.905,2.6,0.891,14.8,0.787,4.6,1.1,30.1
4,4,5,Denmark,5.0,0.925,80.4,12.7,44519.0,13.0,2.0,...,7.0,3.8,0.894,3.0,0.896,14.3,0.789,4.5,1.0,29.1


In [4]:
# Check that every country has the target field (infant mortality).
# Rows without a target value are listed and then dropped from `df`.
target_col = 'Mortality rates Infant (per 1,000 live births) 2015'
columns = df.columns.values.tolist()
# list.index raises ValueError if the target column is not in the CSV.
target_index = columns.index(target_col)

target_is_missing = df[target_col].isna()
missing_target = df[target_is_missing]
if missing_target.empty:
    print('All countries in the dataset have information on the target value.')
else:
    print('The following countries are missing infant mortality rates and will be removed from our dataset:')
    print(missing_target[['Country']])
    df = df.dropna(subset=[target_col])

The following countries are missing infant mortality rates and will be removed from our dataset:
                   Country
11  Hong Kong, China (SAR)
14           Liechtenstein


In [5]:
# Candidate predictors, chosen by inspecting the correlation matrix.
# The strings are used verbatim as column labels, so trailing spaces, the
# double space in "Physicians  (" and the "Malria" spelling are intentional.
reg_vars = [
    # education
    'Mean years of schooling',
    'Mean years of schooling Female',
    'Mean years of schooling Male',
    # labour
    'Labour force participation rate (% ages 15 and older) Female ',
    # child health
    'Infants lacking immunization DTP (% of one-year-olds)',
    'Infants lacking immunization Measles (% of one-year-olds)',
    'Child malnutrition Stunting (moderate or severe) 2010-2015',
    # disease burden and health care
    'Deaths due to Malria (per 100,000 people) ',
    'Deaths due to Tuberculosis (per 100,000 people) ',
    'Physicians  (per 10,000 people) 2001-2014',
]

# Regression target: infant mortality.
target_col = 'Mortality rates Infant (per 1,000 live births) 2015'


In [6]:
# Split the frame into the predictor matrix and the target vector, then
# report how many values each predictor is missing before any imputation.
df_x, df_y = df.loc[:, reg_vars], df.loc[:, target_col]
null_counts = df_x.isnull().sum()
print(null_counts)
df_x.describe()

Mean years of schooling                                           7
Mean years of schooling Female                                   25
Mean years of schooling Male                                     25
Labour force participation rate (% ages 15 and older) Female     14
Infants lacking immunization DTP (% of one-year-olds)             0
Infants lacking immunization Measles (% of one-year-olds)         0
Child malnutrition Stunting (moderate or severe) 2010-2015       51
Deaths due to Malria (per 100,000 people)                        97
Deaths due to Tuberculosis (per 100,000 people)                   1
Physicians  (per 10,000 people) 2001-2014                         3
dtype: int64


Unnamed: 0,Mean years of schooling,Mean years of schooling Female,Mean years of schooling Male,Labour force participation rate (% ages 15 and older) Female,Infants lacking immunization DTP (% of one-year-olds),Infants lacking immunization Measles (% of one-year-olds),Child malnutrition Stunting (moderate or severe) 2010-2015,"Deaths due to Malria (per 100,000 people)","Deaths due to Tuberculosis (per 100,000 people)","Physicians (per 10,000 people) 2001-2014"
count,186.0,168.0,168.0,179.0,193.0,193.0,142.0,96.0,192.0,190.0
mean,8.332258,8.075595,8.814881,52.559218,6.818653,12.38342,22.152113,32.289583,13.966146,16.516842
std,3.091266,3.459786,2.854068,16.188148,9.052121,13.728701,13.45086,40.121496,21.272098,15.940184
min,1.4,1.0,2.0,12.2,1.0,1.0,1.3,0.0,0.0,0.1
25%,6.025,4.975,6.475,43.9,1.0,3.0,10.025,0.1,0.8,2.225
50%,8.6,8.55,8.95,52.8,3.0,7.0,20.9,7.5,3.55,11.75
75%,10.9,10.925,11.4,62.25,9.0,15.0,32.7,65.275,17.25,27.3
max,13.4,13.4,13.6,86.4,51.0,78.0,57.5,152.6,120.0,77.4


In [7]:
# Fill missing values with the mean of that column.
# NOTE(review): the column means are computed over all rows, including rows
# that the later train_test_split cell puts in the test set, so the held-out
# score is slightly optimistic. The null counts printed earlier also show that
# some predictors are heavily imputed (malaria deaths: 97 missing values).
print("Filling null values with mean value of their respective variable")
# Reassign rather than use inplace=True: the result is the same, but the cell
# no longer mutates an object derived from `df`.
df_x = df_x.fillna(df_x.mean())
# print() already stringifies the Series, so no explicit str() is needed.
print(df_x.isnull().sum())

Filling null values with mean value of their respective variable
Mean years of schooling                                          0
Mean years of schooling Female                                   0
Mean years of schooling Male                                     0
Labour force participation rate (% ages 15 and older) Female     0
Infants lacking immunization DTP (% of one-year-olds)            0
Infants lacking immunization Measles (% of one-year-olds)        0
Child malnutrition Stunting (moderate or severe) 2010-2015       0
Deaths due to Malria (per 100,000 people)                        0
Deaths due to Tuberculosis (per 100,000 people)                  0
Physicians  (per 10,000 people) 2001-2014                        0
dtype: int64


In [8]:
# Baseline: ordinary least squares on all 10 predictors, fitted on every
# record. The score below is therefore an in-sample R^2, not a held-out one.
reg = LinearRegression().fit(df_x, df_y)  # fit() returns the estimator itself
train_slopes = {name: slope for name, slope in zip(reg_vars, reg.coef_)}
print("score: ", reg.score(df_x, df_y))
print("Model slope: ", train_slopes)
print("Model intercept: ", reg.intercept_)


score:  0.808136505376418
Model slope:  {'Mean years of schooling': -1.9308952952820524, 'Mean years of schooling Female': -2.2920094544818888, 'Mean years of schooling Male': 2.449771736125112, 'Labour force participation rate (% ages 15 and older) Female ': 0.032918898658537595, 'Infants lacking immunization DTP (% of one-year-olds)': 0.6354716070976296, 'Infants lacking immunization Measles (% of one-year-olds)': -0.17675815507451006, 'Child malnutrition Stunting (moderate or severe) 2010-2015': 0.1703728973992519, 'Deaths due to Malria (per 100,000 people) ': 0.17716508419242688, 'Deaths due to Tuberculosis (per 100,000 people) ': 0.26292442433095253, 'Physicians  (per 10,000 people) 2001-2014': -0.2639617840613093}
Model intercept:  24.31763555736603


In [9]:
# Refit the same model with statsmodels to get its regression summary.
# sm.OLS does not add an intercept by itself, so a constant column is added first.
df_x2 = sm.add_constant(df_x)
est = sm.OLS(endog=df_y, exog=df_x2)
est2 = est.fit()
ols_summary_all = est2.summary()
print(ols_summary_all)

                                             OLS Regression Results                                            
Dep. Variable:     Mortality rates Infant (per 1,000 live births) 2015   R-squared:                       0.808
Model:                                                             OLS   Adj. R-squared:                  0.798
Method:                                                  Least Squares   F-statistic:                     76.66
Date:                                                 Sat, 03 Jun 2023   Prob (F-statistic):           7.76e-60
Time:                                                         21:11:56   Log-Likelihood:                -710.59
No. Observations:                                                  193   AIC:                             1443.
Df Residuals:                                                      182   BIC:                             1479.
Df Model:                                                           10                                  

In [10]:
# Remove the predictors whose p-value exceeded 0.05.
# The list is hand-maintained, not computed from the fitted model.
vars_to_drop = [
    'Labour force participation rate (% ages 15 and older) Female ',
    'Infants lacking immunization Measles (% of one-year-olds)',
    'Child malnutrition Stunting (moderate or severe) 2010-2015',
]

# Keep the remaining predictors in their original order.
new_vars = [name for name in reg_vars if name not in vars_to_drop]
df_x = df_x.loc[:, new_vars]
df_x.head(5)

Unnamed: 0,Mean years of schooling,Mean years of schooling Female,Mean years of schooling Male,Infants lacking immunization DTP (% of one-year-olds),"Deaths due to Malria (per 100,000 people)","Deaths due to Tuberculosis (per 100,000 people)","Physicians (per 10,000 people) 2001-2014"
0,12.7,12.8,12.7,1.0,32.289583,0.2,42.8
1,13.2,13.4,13.0,8.0,32.289583,0.2,32.7
2,13.4,13.3,13.5,2.0,32.289583,0.1,40.5
3,13.2,12.9,13.6,2.0,32.289583,0.4,38.9
4,12.7,12.6,12.9,4.0,32.289583,0.4,34.9


In [11]:
# Hold out 30% of the records (fixed seed for reproducibility) and fit the
# reduced model on the remaining 70%.
X_train, X_test, y_train, y_test = train_test_split(
    df_x,
    df_y,
    test_size=0.3,
    random_state=0,
)
model = LinearRegression().fit(X_train, y_train)  # fit() returns the estimator
train_slopes = {name: slope for name, slope in zip(new_vars, model.coef_)}

# First score: R^2 on the training split.
print("score: ", model.score(X_train, y_train))
print("Model slope: ", train_slopes)
print("Model intercept: ", model.intercept_)

# Second score: R^2 on the held-out test split.
y_pred = model.predict(X_test)
print("score: ", model.score(X_test, y_test))

score:  0.8100718969549778
Model slope:  {'Mean years of schooling': -2.463970131593848, 'Mean years of schooling Female': -2.6679215123032587, 'Mean years of schooling Male': 3.5128850973177137, 'Infants lacking immunization DTP (% of one-year-olds)': 0.6370479974729782, 'Deaths due to Malria (per 100,000 people) ': 0.17372389589953327, 'Deaths due to Tuberculosis (per 100,000 people) ': 0.23482737816102128, 'Physicians  (per 10,000 people) 2001-2014': -0.2931104967961959}
Model intercept:  27.23525022036633
score:  0.7458172386631148


NameError: name 'new_est' is not defined

In [12]:

# Statsmodels summary for the reduced predictor set, fitted on all records.
# A constant column is added because sm.OLS does not include an intercept itself.
df_x2 = sm.add_constant(df_x)
est = sm.OLS(endog=df_y, exog=df_x2)
est2 = est.fit()
ols_summary_reduced = est2.summary()
print(ols_summary_reduced)


                                             OLS Regression Results                                            
Dep. Variable:     Mortality rates Infant (per 1,000 live births) 2015   R-squared:                       0.801
Model:                                                             OLS   Adj. R-squared:                  0.794
Method:                                                  Least Squares   F-statistic:                     106.6
Date:                                                 Sat, 03 Jun 2023   Prob (F-statistic):           1.77e-61
Time:                                                         21:12:35   Log-Likelihood:                -713.94
No. Observations:                                                  193   AIC:                             1444.
Df Residuals:                                                      185   BIC:                             1470.
Df Model:                                                            7                                  