In-Class Practice

In [1]:
# Some basic package imports
import os
import numpy as np
import pandas as pd

# Visualization packages
import matplotlib.pyplot as plt
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.io as pio
# Select the default plotly renderer. The attribute is `default`; the previous
# spelling `defaule` just set an unused attribute, so no renderer was configured.
# NOTE(review): 'colab' targets Google Colab — confirm it is the right choice
# when this notebook is run in a local Jupyter session.
pio.renderers.default = 'colab'
import seaborn as sns

# ML packages
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor

from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score 
from sklearn.metrics import confusion_matrix, classification_report

# Additional packages
from collections import Counter

In [2]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("kumarajarshi/life-expectancy-who")

print("Path to dataset files:", path)

# os.listdir() returns entries in arbitrary order, so choose the file explicitly:
# keep only the CSV files and sort them so the same file is loaded on every run.
# NOTE(review): the earlier comment said the *second* of three downloaded files
# is used, but the code has always taken the first entry — confirm which file
# is intended if the download ever contains more than one CSV.
csv_files = sorted(f for f in os.listdir(path) if f.lower().endswith('.csv'))
if not csv_files:
    raise FileNotFoundError(f'No CSV file found in {path}')

# os.path.join builds the path with the platform's own separator
file = os.path.join(path, csv_files[0])
df = pd.read_csv(file)
df

Path to dataset files: C:\Users\josee\.cache\kagglehub\datasets\kumarajarshi\life-expectancy-who\versions\1


Unnamed: 0,Country,Year,Status,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,...,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.279624,65.0,1154,...,6.0,8.16,65.0,0.1,584.259210,33736494.0,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Developing,59.9,271.0,64,0.01,73.523582,62.0,492,...,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,Afghanistan,2013,Developing,59.9,268.0,66,0.01,73.219243,64.0,430,...,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.470,9.9
3,Afghanistan,2012,Developing,59.5,272.0,69,0.01,78.184215,67.0,2787,...,67.0,8.52,67.0,0.1,669.959000,3696958.0,17.9,18.0,0.463,9.8
4,Afghanistan,2011,Developing,59.2,275.0,71,0.01,7.097109,68.0,3013,...,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2933,Zimbabwe,2004,Developing,44.3,723.0,27,4.36,0.000000,68.0,31,...,67.0,7.13,65.0,33.6,454.366654,12777511.0,9.4,9.4,0.407,9.2
2934,Zimbabwe,2003,Developing,44.5,715.0,26,4.06,0.000000,7.0,998,...,7.0,6.52,68.0,36.7,453.351155,12633897.0,9.8,9.9,0.418,9.5
2935,Zimbabwe,2002,Developing,44.8,73.0,25,4.43,0.000000,73.0,304,...,73.0,6.53,71.0,39.8,57.348340,125525.0,1.2,1.3,0.427,10.0
2936,Zimbabwe,2001,Developing,45.3,686.0,25,1.72,0.000000,76.0,529,...,76.0,6.16,75.0,42.1,548.587312,12366165.0,1.6,1.7,0.427,9.8


In [7]:
# Tidy the raw column headers: remove stray leading/trailing spaces and
# capitalise the first word of each name.
# Note: the raw ' thinness  1-19 years' header is relabelled 'Thinness 10-19 years'
# (presumably correcting the age range — confirm against the dataset description).
column_renames = {
    'Life expectancy ': 'Life expectancy',
    'Adult Mortality': 'Adult mortality',
    'infant deaths': 'Infant deaths',
    'percentage expenditure': 'Percentage expenditure',
    'Measles ': 'Measles',
    ' BMI ': 'BMI',
    'under-five deaths ': 'Under-five deaths',
    'Diphtheria ': 'Diphtheria',
    ' HIV/AIDS': 'HIV/AIDS',
    ' thinness  1-19 years': 'Thinness 10-19 years',
    ' thinness 5-9 years': 'Thinness 5-9 years',
}
df.rename(columns=column_renames, inplace=True)

In [8]:
# Summarise the columns: names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2938 entries, 0 to 2937
Data columns (total 22 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Country                          2938 non-null   object 
 1   Year                             2938 non-null   int64  
 2   Status                           2938 non-null   object 
 3   Life expectancy                  2928 non-null   float64
 4   Adult mortality                  2928 non-null   float64
 5   Infant deaths                    2938 non-null   int64  
 6   Alcohol                          2744 non-null   float64
 7   Percentage expenditure           2938 non-null   float64
 8   Hepatitis B                      2385 non-null   float64
 9   Measles                          2938 non-null   int64  
 10  BMI                              2904 non-null   float64
 11  Under-five deaths                2938 non-null   int64  
 12  Polio               

In [55]:
def nan_counts(df):
    '''
    Print a one-line NaN summary for every column of ``df`` that has missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    None
        The summary is written to stdout; columns without NaNs are skipped.
    '''
    num_rows = len(df)
    # Count NaNs for all columns in one vectorised pass instead of summing each
    # Series element by element with the Python builtin.
    na_per_column = df.isna().sum()
    for c, num_na in na_per_column.items():
        # Only report columns that actually contain missing values
        if num_na:
            print(f'{c} has {num_na} NaNs, which is {num_na/num_rows*100:.4f}% of the data.')

In [56]:
# Report which columns of the loaded dataset contain NaNs, and how many
nan_counts(df)

Life expectancy has 10 NaNs, which is 0.3404% of the data.
Adult mortality has 10 NaNs, which is 0.3404% of the data.
Alcohol has 194 NaNs, which is 6.6031% of the data.
Hepatitis B has 553 NaNs, which is 18.8223% of the data.
BMI has 34 NaNs, which is 1.1572% of the data.
Polio has 19 NaNs, which is 0.6467% of the data.
Total expenditure has 226 NaNs, which is 7.6923% of the data.
Diphtheria has 19 NaNs, which is 0.6467% of the data.
GDP has 448 NaNs, which is 15.2485% of the data.
Population has 652 NaNs, which is 22.1920% of the data.
Thinness 10-19 years has 34 NaNs, which is 1.1572% of the data.
Thinness 5-9 years has 34 NaNs, which is 1.1572% of the data.
Income composition of resources has 167 NaNs, which is 5.6841% of the data.
Schooling has 163 NaNs, which is 5.5480% of the data.


In [37]:
# Pull out every row whose value in the chosen column is missing
col = 'Alcohol'
mask = pd.isna(df[col])
df.loc[mask]

Unnamed: 0,Country,Year,Status,Life expectancy,Adult mortality,Infant deaths,Alcohol,Percentage expenditure,Hepatitis B,Measles,...,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,Thinness 10-19 years,Thinness 5-9 years,Income composition of resources,Schooling
32,Algeria,2015,Developing,75.6,19.0,21,,0.0,95.0,63,...,95.0,,95.0,0.1,4132.762920,39871528.0,6.0,5.8,0.743,14.4
48,Angola,2015,Developing,52.4,335.0,66,,0.0,64.0,118,...,7.0,,64.0,1.9,3695.793748,2785935.0,8.3,8.2,0.531,11.4
64,Antigua and Barbuda,2015,Developing,76.4,13.0,0,,0.0,99.0,0,...,86.0,,99.0,0.2,13566.954100,,3.3,3.3,0.784,13.9
80,Argentina,2015,Developing,76.3,116.0,8,,0.0,94.0,0,...,93.0,,94.0,0.1,13467.123600,43417765.0,1.0,0.9,0.826,17.3
96,Armenia,2015,Developing,74.8,118.0,1,,0.0,94.0,33,...,96.0,,94.0,0.1,369.654776,291695.0,2.1,2.2,0.741,12.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2858,Venezuela (Bolivarian Republic of),2015,Developing,74.1,157.0,9,,0.0,87.0,0,...,87.0,,87.0,0.1,,,1.6,1.5,0.769,14.3
2874,Viet Nam,2015,Developing,76.0,127.0,28,,0.0,97.0,256,...,97.0,,97.0,0.1,,,14.2,14.5,0.678,12.6
2890,Yemen,2015,Developing,65.7,224.0,37,,0.0,69.0,468,...,63.0,,69.0,0.1,,,13.6,13.4,0.499,9.0
2906,Zambia,2015,Developing,61.8,33.0,27,,0.0,9.0,9,...,9.0,,9.0,4.1,1313.889646,161587.0,6.3,6.1,0.576,12.5


In [38]:
# Rebuild the missing-value mask here instead of reusing the notebook-wide
# `mask` variable: later cells overwrite `mask` with unrelated filters, so an
# out-of-order re-run of this cell would otherwise compare the wrong rows.
missing_mask = df[col].isna()
my_data = df[missing_mask].copy()

# For each categorical column, show the value counts among the rows with a
# missing value next to the value counts over the whole dataset.
cols = ['Country', 'Year', 'Status']
for c in cols:
    for label, frame in (('Masked Data', my_data), ('All Data', df)):
        print(label)
        display(frame[c].value_counts().reset_index().sort_values(by=c))

Masked Data


Unnamed: 0,Country,count
16,Algeria,1
17,Angola,1
2,Antigua and Barbuda,1
3,Argentina,1
4,Armenia,1
...,...,...
173,Venezuela (Bolivarian Republic of),1
174,Viet Nam,1
175,Yemen,1
176,Zambia,1


All Data


Unnamed: 0,Country,count
0,Afghanistan,16
1,Albania,16
2,Algeria,16
3,Angola,16
4,Antigua and Barbuda,16
...,...,...
180,Venezuela (Bolivarian Republic of),16
129,Viet Nam,16
162,Yemen,16
161,Zambia,16


Masked Data


Unnamed: 0,Year,count
15,2000,1
14,2001,1
13,2002,1
12,2003,1
11,2004,1
1,2005,2
10,2006,1
9,2007,1
8,2008,1
7,2009,1


All Data


Unnamed: 0,Year,count
15,2000,183
14,2001,183
13,2002,183
12,2003,183
11,2004,183
10,2005,183
9,2006,183
8,2007,183
7,2008,183
6,2009,183


Masked Data


Unnamed: 0,Status,count
1,Developed,28
0,Developing,166


All Data


Unnamed: 0,Status,count
1,Developed,512
0,Developing,2426


In [44]:
# Show only the rows recorded for one particular year
year = 2015
mask = df['Year'].eq(year)
df.loc[mask]

Unnamed: 0,Country,Year,Status,Life expectancy,Adult mortality,Infant deaths,Alcohol,Percentage expenditure,Hepatitis B,Measles,...,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,Thinness 10-19 years,Thinness 5-9 years,Income composition of resources,Schooling
0,Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.279624,65.0,1154,...,6.0,8.16,65.0,0.1,584.259210,33736494.0,17.2,17.3,0.479,10.1
16,Albania,2015,Developing,77.8,74.0,0,4.60,364.975229,99.0,0,...,99.0,6.00,99.0,0.1,3954.227830,28873.0,1.2,1.3,0.762,14.2
32,Algeria,2015,Developing,75.6,19.0,21,,0.000000,95.0,63,...,95.0,,95.0,0.1,4132.762920,39871528.0,6.0,5.8,0.743,14.4
48,Angola,2015,Developing,52.4,335.0,66,,0.000000,64.0,118,...,7.0,,64.0,1.9,3695.793748,2785935.0,8.3,8.2,0.531,11.4
64,Antigua and Barbuda,2015,Developing,76.4,13.0,0,,0.000000,99.0,0,...,86.0,,99.0,0.2,13566.954100,,3.3,3.3,0.784,13.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2858,Venezuela (Bolivarian Republic of),2015,Developing,74.1,157.0,9,,0.000000,87.0,0,...,87.0,,87.0,0.1,,,1.6,1.5,0.769,14.3
2874,Viet Nam,2015,Developing,76.0,127.0,28,,0.000000,97.0,256,...,97.0,,97.0,0.1,,,14.2,14.5,0.678,12.6
2890,Yemen,2015,Developing,65.7,224.0,37,,0.000000,69.0,468,...,63.0,,69.0,0.1,,,13.6,13.4,0.499,9.0
2906,Zambia,2015,Developing,61.8,33.0,27,,0.000000,9.0,9,...,9.0,,9.0,4.1,1313.889646,161587.0,6.3,6.1,0.576,12.5


In [53]:
# Preprocessing

# Mask out 2015 because of many NaNs in Alcohol col
mask = df['Year'] != 2015
df_no15 = df[mask].copy()

# Drop the remaining rows whose Alcohol value is missing. The explicit .copy()
# makes the result an independent frame, so later column assignments on it do
# not trigger SettingWithCopyWarning.
mask = df_no15['Alcohol'].notna()
df_no15_nona = df_no15[mask].copy()

# Sanity check: the number of NaNs left in Alcohol should display as 0
df_no15_nona['Alcohol'].isna().sum()

np.int64(0)

In [57]:
# Re-check the remaining NaNs after dropping 2015 and the rows with missing Alcohol
nan_counts(df_no15_nona)

Life expectancy has 9 NaNs, which is 0.3287% of the data.
Adult mortality has 9 NaNs, which is 0.3287% of the data.
Hepatitis B has 528 NaNs, which is 19.2841% of the data.
BMI has 17 NaNs, which is 0.6209% of the data.
Polio has 7 NaNs, which is 0.2557% of the data.
Total expenditure has 33 NaNs, which is 1.2053% of the data.
Diphtheria has 7 NaNs, which is 0.2557% of the data.
GDP has 411 NaNs, which is 15.0110% of the data.
Population has 611 NaNs, which is 22.3156% of the data.
Thinness 10-19 years has 17 NaNs, which is 0.6209% of the data.
Thinness 5-9 years has 17 NaNs, which is 0.6209% of the data.
Income composition of resources has 157 NaNs, which is 5.7341% of the data.
Schooling has 153 NaNs, which is 5.5880% of the data.
