In [101]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 

life = pd.read_csv("../data/life_expectancy_data.csv")
life.columns = life.columns.str.strip() # remove white space of data header
life.head()

Unnamed: 0,Country,Year,Status,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,...,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.279624,65.0,1154,...,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Developing,59.9,271.0,64,0.01,73.523582,62.0,492,...,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,Afghanistan,2013,Developing,59.9,268.0,66,0.01,73.219243,64.0,430,...,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9
3,Afghanistan,2012,Developing,59.5,272.0,69,0.01,78.184215,67.0,2787,...,67.0,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8
4,Afghanistan,2011,Developing,59.2,275.0,71,0.01,7.097109,68.0,3013,...,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5


In [102]:
# check of missing values
life.isnull().sum()

Country                              0
Year                                 0
Status                               0
Life expectancy                     10
Adult Mortality                     10
infant deaths                        0
Alcohol                            194
percentage expenditure               0
Hepatitis B                        553
Measles                              0
BMI                                 34
under-five deaths                    0
Polio                               19
Total expenditure                  226
Diphtheria                          19
HIV/AIDS                             0
GDP                                448
Population                         652
thinness  1-19 years                34
thinness 5-9 years                  34
Income composition of resources    167
Schooling                          163
dtype: int64

All predicting variables was then divided into several broad categories:​
    - Immunization related factors: Hepatitis B, Measles, Polio, Diphtheria
    - Mortality factors: Alcohol, BMI, thinness 1-19 years, thinness 5-9 years, HIV/AIDS
    - Economical factors (country effort):Status, Percentage expenditure, total expenditure GDP, population
    - Social factors(personal effort/surroundings): Schooling, Country, Income composition of resources 
Outcome: Life Expectancy, Infant Deaths, Under-five deaths

1. We would do the prediction based on immunization related factors. 
    - Hepatitis B in % 
    - Polio in % 
    - Diphtheria in % 

2. Prediction based on social factors 
    - Schooling 
    - Country
    - Income composition of resources 
        

## Immunization Factors

In [103]:
new_life = life.copy()
new_columns = ["Hepatitis B" , "Polio" , "Diphtheria", "Measles", "Life expectancy" , "Country" , "Year", "Population"]
new_life = new_life.drop([c for c in new_life.columns if c not in new_columns], axis='columns')
new_life.head()

Unnamed: 0,Country,Year,Life expectancy,Hepatitis B,Measles,Polio,Diphtheria,Population
0,Afghanistan,2015,65.0,65.0,1154,6.0,65.0,33736494.0
1,Afghanistan,2014,59.9,62.0,492,58.0,62.0,327582.0
2,Afghanistan,2013,59.9,64.0,430,62.0,64.0,31731688.0
3,Afghanistan,2012,59.5,67.0,2787,67.0,67.0,3696958.0
4,Afghanistan,2011,59.2,68.0,3013,68.0,68.0,2978599.0


In [104]:
new_life.isnull().sum()

Country              0
Year                 0
Life expectancy     10
Hepatitis B        553
Measles              0
Polio               19
Diphtheria          19
Population         652
dtype: int64

In [105]:
# check null data for life expectancy 
null_data = new_life[new_life["Life expectancy"].isnull()]
null_data


Unnamed: 0,Country,Year,Life expectancy,Hepatitis B,Measles,Polio,Diphtheria,Population
624,Cook Islands,2013,,98.0,0,98.0,98.0,
769,Dominica,2013,,96.0,0,96.0,96.0,
1650,Marshall Islands,2013,,8.0,0,79.0,79.0,
1715,Monaco,2013,,99.0,0,99.0,99.0,
1812,Nauru,2013,,87.0,0,87.0,87.0,
1909,Niue,2013,,99.0,0,99.0,99.0,
1958,Palau,2013,,99.0,0,99.0,99.0,292.0
2167,Saint Kitts and Nevis,2013,,97.0,0,96.0,96.0,
2216,San Marino,2013,,69.0,0,69.0,69.0,
2713,Tuvalu,2013,,9.0,0,9.0,9.0,1819.0


In [106]:
# check null data fro Polio
null_data = new_life[new_life["Polio"].isnull()]
null_data

Unnamed: 0,Country,Year,Life expectancy,Hepatitis B,Measles,Polio,Diphtheria,Population
1742,Montenegro,2005,73.6,,0,,,614261.0
1743,Montenegro,2004,73.5,,0,,,613353.0
1744,Montenegro,2003,73.5,,0,,,612267.0
1745,Montenegro,2002,73.4,,0,,,69828.0
1746,Montenegro,2001,73.3,,0,,,67389.0
1747,Montenegro,2000,73.0,,0,,,6495.0
2414,South Sudan,2010,55.0,,0,,,167192.0
2415,South Sudan,2009,54.3,,0,,,967667.0
2416,South Sudan,2008,53.6,,0,,,9263136.0
2417,South Sudan,2007,53.1,,0,,,88568.0


In [107]:
# check null data for Diphtheria
null_data = new_life[new_life["Diphtheria"].isnull()]
null_data

Unnamed: 0,Country,Year,Life expectancy,Hepatitis B,Measles,Polio,Diphtheria,Population
1742,Montenegro,2005,73.6,,0,,,614261.0
1743,Montenegro,2004,73.5,,0,,,613353.0
1744,Montenegro,2003,73.5,,0,,,612267.0
1745,Montenegro,2002,73.4,,0,,,69828.0
1746,Montenegro,2001,73.3,,0,,,67389.0
1747,Montenegro,2000,73.0,,0,,,6495.0
2414,South Sudan,2010,55.0,,0,,,167192.0
2415,South Sudan,2009,54.3,,0,,,967667.0
2416,South Sudan,2008,53.6,,0,,,9263136.0
2417,South Sudan,2007,53.1,,0,,,88568.0


In [108]:
# only get specific year data 
new_life_2015 = new_life[new_life["Year"] == 2015]
new_life_2015

Unnamed: 0,Country,Year,Life expectancy,Hepatitis B,Measles,Polio,Diphtheria,Population
0,Afghanistan,2015,65.0,65.0,1154,6.0,65.0,33736494.0
16,Albania,2015,77.8,99.0,0,99.0,99.0,28873.0
32,Algeria,2015,75.6,95.0,63,95.0,95.0,39871528.0
48,Angola,2015,52.4,64.0,118,7.0,64.0,2785935.0
64,Antigua and Barbuda,2015,76.4,99.0,0,86.0,99.0,
...,...,...,...,...,...,...,...,...
2858,Venezuela (Bolivarian Republic of),2015,74.1,87.0,0,87.0,87.0,
2874,Viet Nam,2015,76.0,97.0,256,97.0,97.0,
2890,Yemen,2015,65.7,69.0,468,63.0,69.0,
2906,Zambia,2015,61.8,9.0,9,9.0,9.0,161587.0


In [109]:
new_life_2015.isnull().sum()

Country             0
Year                0
Life expectancy     0
Hepatitis B         9
Measles             0
Polio               0
Diphtheria          0
Population         41
dtype: int64

In [110]:
# replacing NaN in Hep3 to 0 
# new_life_2015 = null_data[new_life_2015["Hepatitis B"].isnull()]
new_life_2015["Hepatitis B"] = new_life_2015["Hepatitis B"].fillna(0)
print(new_life_2015.isnull().sum())
new_life_2015

Country             0
Year                0
Life expectancy     0
Hepatitis B         0
Measles             0
Polio               0
Diphtheria          0
Population         41
dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Country,Year,Life expectancy,Hepatitis B,Measles,Polio,Diphtheria,Population
0,Afghanistan,2015,65.0,65.0,1154,6.0,65.0,33736494.0
16,Albania,2015,77.8,99.0,0,99.0,99.0,28873.0
32,Algeria,2015,75.6,95.0,63,95.0,95.0,39871528.0
48,Angola,2015,52.4,64.0,118,7.0,64.0,2785935.0
64,Antigua and Barbuda,2015,76.4,99.0,0,86.0,99.0,
...,...,...,...,...,...,...,...,...
2858,Venezuela (Bolivarian Republic of),2015,74.1,87.0,0,87.0,87.0,
2874,Viet Nam,2015,76.0,97.0,256,97.0,97.0,
2890,Yemen,2015,65.7,69.0,468,63.0,69.0,
2906,Zambia,2015,61.8,9.0,9,9.0,9.0,161587.0


In [111]:
null_data = new_life_2015[new_life_2015["Population"].isnull()]
null_data

Unnamed: 0,Country,Year,Life expectancy,Hepatitis B,Measles,Polio,Diphtheria,Population
64,Antigua and Barbuda,2015,76.4,99.0,0,86.0,99.0,
160,Bahamas,2015,76.1,95.0,0,95.0,95.0,
176,Bahrain,2015,76.9,98.0,0,98.0,98.0,
208,Barbados,2015,75.5,97.0,0,97.0,97.0,
304,Bolivia (Plurinational State of),2015,77.0,99.0,0,99.0,99.0,
368,Brunei Darussalam,2015,77.7,99.0,4,99.0,99.0,
432,Côte d'Ivoire,2015,53.3,83.0,65,81.0,83.0,
608,Congo,2015,64.7,8.0,1359,8.0,8.0,
657,Cuba,2015,79.1,99.0,0,99.0,99.0,
689,Czechia,2015,78.8,97.0,9,97.0,97.0,


In [112]:
pop_2015 = pd.read_csv("../data/pop-2015.csv")
pop_2015 = pop_2015.rename(columns={'2015 [YR2015]':'Population' , 'Country Name': 'Country'})
pop_2015.head()

Unnamed: 0,Series Name,Series Code,Country,Country Code,Population
0,"Population, total",SP.POP.TOTL,Afghanistan,AFG,34413603
1,"Population, total",SP.POP.TOTL,Albania,ALB,2880703
2,"Population, total",SP.POP.TOTL,Algeria,DZA,39728020
3,"Population, total",SP.POP.TOTL,American Samoa,ASM,55806
4,"Population, total",SP.POP.TOTL,Andorra,AND,77993


In [113]:
merged = pd.merge(new_life_2015, pop_2015, on="Country")
merged.isnull().sum()

Country             0
Year                0
Life expectancy     0
Hepatitis B         0
Measles             0
Polio               0
Diphtheria          0
Population_x       16
Series Name         0
Series Code         0
Country Code        0
Population_y        0
dtype: int64

In [114]:
null_data = merged[merged["Population_x"].isnull()]
null_data

Unnamed: 0,Country,Year,Life expectancy,Hepatitis B,Measles,Polio,Diphtheria,Population_x,Series Name,Series Code,Country Code,Population_y
4,Antigua and Barbuda,2015,76.4,99.0,0,86.0,99.0,,"Population, total",SP.POP.TOTL,ATG,93571
10,Bahrain,2015,76.9,98.0,0,98.0,98.0,,"Population, total",SP.POP.TOTL,BHR,1371853
12,Barbados,2015,75.5,97.0,0,97.0,97.0,,"Population, total",SP.POP.TOTL,BRB,285327
21,Brunei Darussalam,2015,77.7,99.0,4,99.0,99.0,,"Population, total",SP.POP.TOTL,BRN,414914
37,Cuba,2015,79.1,99.0,0,99.0,99.0,,"Population, total",SP.POP.TOTL,CUB,11324777
45,Eritrea,2015,64.7,95.0,198,95.0,95.0,,"Population, total",SP.POP.TOTL,ERI,..
56,Grenada,2015,73.6,92.0,0,99.0,92.0,,"Population, total",SP.POP.TOTL,GRD,109603
77,Kuwait,2015,74.7,99.0,18,99.0,99.0,,"Population, total",SP.POP.TOTL,KWT,3835588
82,Libya,2015,72.7,97.0,82,97.0,97.0,,"Population, total",SP.POP.TOTL,LBY,6418315
102,New Zealand,2015,81.6,92.0,10,92.0,92.0,,"Population, total",SP.POP.TOTL,NZL,4609400


In [115]:
# drop columns that we do not need from pop_2015
new_life_2015 = merged.drop(['Population_x', 'Series Name', 'Series Code'], 1)
new_life_2015 = new_life_2015.rename(columns={'Population_y':'Population'})
# new_life_2015['Population'] = new_life_2015['Population'].apply(lambda x : float(x))
new_life_2015.head()

Unnamed: 0,Country,Year,Life expectancy,Hepatitis B,Measles,Polio,Diphtheria,Country Code,Population
0,Afghanistan,2015,65.0,65.0,1154,6.0,65.0,AFG,34413603
1,Albania,2015,77.8,99.0,0,99.0,99.0,ALB,2880703
2,Algeria,2015,75.6,95.0,63,95.0,95.0,DZA,39728020
3,Angola,2015,52.4,64.0,118,7.0,64.0,AGO,27884380
4,Antigua and Barbuda,2015,76.4,99.0,0,86.0,99.0,ATG,93571


In [116]:
problem_data = new_life_2015[new_life_2015['Population'] == '..']
new_life_2015[new_life_2015['Population'] == '..'].replace('..',3214000 )

Unnamed: 0,Country,Year,Life expectancy,Hepatitis B,Measles,Polio,Diphtheria,Country Code,Population
45,Eritrea,2015,64.7,95.0,198,95.0,95.0,ERI,3214000


In [117]:
new_life_2015.loc[new_life_2015['Population'] == "..", 'Population'] = 3214000 
problem_data = new_life_2015[new_life_2015['Population'] == '..']
problem_data

Unnamed: 0,Country,Year,Life expectancy,Hepatitis B,Measles,Polio,Diphtheria,Country Code,Population


In [118]:
# adding measles immunization to the dataset ( dataset from WorldBank )
measles_2015 = pd.read_csv("../data/measles_2015.csv")
measles_2015 = measles_2015.rename(columns={'2015 [YR2015]':'Measles Immunity' , 'Country Name': 'Country'})
merged = pd.merge(new_life_2015, measles_2015, on="Country")
merged.isnull().sum()
merged.head()

Unnamed: 0,Country,Year,Life expectancy,Hepatitis B,Measles,Polio,Diphtheria,Country Code_x,Population,Series Name,Series Code,Country Code_y,Measles Immunity
0,Afghanistan,2015,65.0,65.0,1154,6.0,65.0,AFG,34413603,"Immunization, measles (% of children ages 12-2...",SH.IMM.MEAS,AFG,63
1,Albania,2015,77.8,99.0,0,99.0,99.0,ALB,2880703,"Immunization, measles (% of children ages 12-2...",SH.IMM.MEAS,ALB,97
2,Algeria,2015,75.6,95.0,63,95.0,95.0,DZA,39728020,"Immunization, measles (% of children ages 12-2...",SH.IMM.MEAS,DZA,95
3,Angola,2015,52.4,64.0,118,7.0,64.0,AGO,27884380,"Immunization, measles (% of children ages 12-2...",SH.IMM.MEAS,AGO,51
4,Antigua and Barbuda,2015,76.4,99.0,0,86.0,99.0,ATG,93571,"Immunization, measles (% of children ages 12-2...",SH.IMM.MEAS,ATG,91


In [119]:
# drop columns that we do not need from pop_2015
new_life_2015 = merged.drop(['Series Name', 'Series Code', 'Country Code_x' , 'Measles', 'Year' , 'Population'], 1)
new_life_2015.head()
# new_life.dtypes

Unnamed: 0,Country,Life expectancy,Hepatitis B,Polio,Diphtheria,Country Code_y,Measles Immunity
0,Afghanistan,65.0,65.0,6.0,65.0,AFG,63
1,Albania,77.8,99.0,99.0,99.0,ALB,97
2,Algeria,75.6,95.0,95.0,95.0,DZA,95
3,Angola,52.4,64.0,7.0,64.0,AGO,51
4,Antigua and Barbuda,76.4,99.0,86.0,99.0,ATG,91


In [120]:
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder, StandardScaler

new_life_2015['Country'] = LabelEncoder().fit_transform(new_life_2015['Country'])
new_life_2015.head()



Unnamed: 0,Country,Life expectancy,Hepatitis B,Polio,Diphtheria,Country Code_y,Measles Immunity
0,0,65.0,65.0,6.0,65.0,AFG,63
1,1,77.8,99.0,99.0,99.0,ALB,97
2,2,75.6,95.0,95.0,95.0,DZA,95
3,3,52.4,64.0,7.0,64.0,AGO,51
4,4,76.4,99.0,86.0,99.0,ATG,91


In [121]:
# building model
selected_features = ['Country', 'Hepatitis B' , 'Polio' , 'Diphtheria', 'Measles Immunity']

X = new_life_2015[selected_features]
y = new_life_2015['Life expectancy']

scaler = StandardScaler()
scaled_X = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(scaled_X, y, test_size=0.2, random_state=0)


In [122]:
# training model (Immunizing factors)
from sklearn.metrics import accuracy_score

model = LinearRegression()
model.fit(X_train, y_train)
predictions = model.predict(X_test)

accuracy_score = model.score(X_train, y_train)
print('Model Accuracy(Immunizing factors): ', accuracy_score )



Model Accuracy(Immunizing factors):  0.5701757851753699


## Social Factor 

In [123]:
new_life = life.copy()
new_columns = ["Alcohol" , "Schooling", "Income composition of resources", "Life expectancy" , "Country" , "Year", "Population"]
new_life = new_life.drop([c for c in new_life.columns if c not in new_columns], axis='columns')
new_life_2015 = new_life[new_life["Year"] == 2015]
new_life_2015

Unnamed: 0,Country,Year,Life expectancy,Alcohol,Population,Income composition of resources,Schooling
0,Afghanistan,2015,65.0,0.01,33736494.0,0.479,10.1
16,Albania,2015,77.8,4.60,28873.0,0.762,14.2
32,Algeria,2015,75.6,,39871528.0,0.743,14.4
48,Angola,2015,52.4,,2785935.0,0.531,11.4
64,Antigua and Barbuda,2015,76.4,,,0.784,13.9
...,...,...,...,...,...,...,...
2858,Venezuela (Bolivarian Republic of),2015,74.1,,,0.769,14.3
2874,Viet Nam,2015,76.0,,,0.678,12.6
2890,Yemen,2015,65.7,,,0.499,9.0
2906,Zambia,2015,61.8,,161587.0,0.576,12.5


In [124]:
merged = pd.merge(new_life_2015, pop_2015, on="Country")
merged

Unnamed: 0,Country,Year,Life expectancy,Alcohol,Population_x,Income composition of resources,Schooling,Series Name,Series Code,Country Code,Population_y
0,Afghanistan,2015,65.0,0.01,33736494.0,0.479,10.1,"Population, total",SP.POP.TOTL,AFG,34413603
1,Albania,2015,77.8,4.60,28873.0,0.762,14.2,"Population, total",SP.POP.TOTL,ALB,2880703
2,Algeria,2015,75.6,,39871528.0,0.743,14.4,"Population, total",SP.POP.TOTL,DZA,39728020
3,Angola,2015,52.4,,2785935.0,0.531,11.4,"Population, total",SP.POP.TOTL,AGO,27884380
4,Antigua and Barbuda,2015,76.4,,,0.784,13.9,"Population, total",SP.POP.TOTL,ATG,93571
...,...,...,...,...,...,...,...,...,...,...,...
152,Uruguay,2015,77.0,,3431552.0,0.794,15.5,"Population, total",SP.POP.TOTL,URY,3412013
153,Uzbekistan,2015,69.4,,312989.0,0.697,12.1,"Population, total",SP.POP.TOTL,UZB,31298900
154,Vanuatu,2015,72.0,,26463.0,0.598,10.8,"Population, total",SP.POP.TOTL,VUT,271128
155,Zambia,2015,61.8,,161587.0,0.576,12.5,"Population, total",SP.POP.TOTL,ZMB,15879370


In [125]:
new_life_2015 = merged.drop(['Population_x', 'Series Name', 'Series Code', 'Population_y'], 1)
new_life_2015.head()

Unnamed: 0,Country,Year,Life expectancy,Alcohol,Income composition of resources,Schooling,Country Code
0,Afghanistan,2015,65.0,0.01,0.479,10.1,AFG
1,Albania,2015,77.8,4.6,0.762,14.2,ALB
2,Algeria,2015,75.6,,0.743,14.4,DZA
3,Angola,2015,52.4,,0.531,11.4,AGO
4,Antigua and Barbuda,2015,76.4,,0.784,13.9,ATG


In [126]:
new_life_2015.isnull().sum()

Country                              0
Year                                 0
Life expectancy                      0
Alcohol                            152
Income composition of resources      1
Schooling                            1
Country Code                         0
dtype: int64

In [127]:
alcohol_consumption = pd.read_csv('../data/alcohol_consumption.csv')
alcohol_consumption.head()
alcohol_2015 = alcohol_consumption.rename(columns={'2015 [YR2015]':'Alcohol Consumption' , 'Country Name': 'Country'})
merged = pd.merge(new_life_2015, alcohol_2015, on="Country Code")
merged.isnull().sum()
new_life_2015 = merged.drop(['Series Name', 'Series Code', 'Year'], 1)
new_life_2015.head()

Unnamed: 0,Country_x,Life expectancy,Alcohol,Income composition of resources,Schooling,Country Code,Country_y,Alcohol Consumption
0,Afghanistan,65.0,0.01,0.479,10.1,AFG,Afghanistan,0.21
1,Albania,77.8,4.6,0.762,14.2,ALB,Albania,6.74
2,Algeria,75.6,,0.743,14.4,DZA,Algeria,0.93
3,Angola,52.4,,0.531,11.4,AGO,Angola,7.96
4,Antigua and Barbuda,76.4,,0.784,13.9,ATG,Antigua and Barbuda,5.89


In [128]:
new_life_2015.isnull().sum()

Country_x                            0
Life expectancy                      0
Alcohol                            152
Income composition of resources      1
Schooling                            1
Country Code                         0
Country_y                            0
Alcohol Consumption                  0
dtype: int64

In [129]:
new_life_2015 = new_life_2015.drop(['Alcohol', 'Country_y'],1)
new_life_2015.isnull().sum()

Country_x                          0
Life expectancy                    0
Income composition of resources    1
Schooling                          1
Country Code                       0
Alcohol Consumption                0
dtype: int64

In [130]:
# not able to find Somalia data on Schooling and Income composition of resources so removed 
missing_data = new_life_2015[new_life_2015['Schooling'].isnull()]
missing_data
new_life_2015 = new_life_2015.drop(index=130)


In [131]:
missing_data = new_life_2015[new_life_2015['Alcohol Consumption'] == ".."]
missing_data
new_life_2015 = new_life_2015.drop(index=132)


In [132]:
# building model
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

selected_features = ['Income composition of resources', 'Schooling', 'Alcohol Consumption']

X = new_life_2015[selected_features]
y = new_life_2015['Life expectancy']

scaler = StandardScaler()
scaled_X = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(scaled_X, y, test_size=0.2, random_state=0)


In [133]:
# training model (Social Factor)
from sklearn.metrics import accuracy_score

model = LinearRegression()
model.fit(X_train, y_train)
predictions = model.predict(X_test)

accuracy_score = model.score(X_train, y_train)
print('Model Accuracy(Social Factor): ', accuracy_score )

Model Accuracy(Social Factor):  0.8166200517428115
