In [105]:
import pandas as pd
import numpy as  np
from matplotlib import pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, LabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression


In [106]:
# Load the raw life-expectancy table from disk.
rawdata = pd.read_csv("data/raw/LifeExpectancyData.csv")

# Missing values can be handled with dropna(), drop(), or fillna().
# Rows that have no target value ('Life expectancy') are dropped here;
# rawdata itself is left untouched.
data = rawdata.dropna(subset=["Life expectancy"], axis="index")

# Per-column count of the missing values that remain.
data.isna().sum()

Country                              0
Year                                 0
Status                               0
Life expectancy                      0
Adult Mortality                      0
infant deaths                        0
Alcohol                            193
percentage expenditure               0
Hepatitis B                        553
Measles                              0
BMI                                 32
under-five deaths                    0
Polio                               19
Total expenditure                  226
Diphtheria                          19
HIV/AIDS                             0
GDP                                443
Population                         644
thinness  1-19 years                32
thinness 5-9 years                  32
Income composition of resources    160
Schooling                          160
dtype: int64

In [107]:
# Fill the gaps in the numeric columns with each column's mean
# (strategy="median" would be the other common choice).
# TODO: "Population" is missing in 644 rows (roughly a fifth of the data) —
# decide later whether mean imputation is acceptable for it.

# The mean strategy only works on numeric data, so the two text columns are
# set aside first.
num_only_df = data.drop(columns=["Country", "Status"])
imputer = SimpleImputer(strategy="mean")
imputer.fit(num_only_df)
X = imputer.transform(num_only_df)
# Wrap the imputed values back into a DataFrame with the original column names.
# Note: this frame gets a fresh 0..n-1 index, not the index of `data`.
imputed_df = pd.DataFrame(data=X, columns=num_only_df.columns)


In [141]:
# Encode the two categorical columns, "Country" and "Status", as numbers using
# sklearn's encoders rather than doing it by hand.

# "Country", step 1: map each country name to an integer code.
country = data["Country"]
lbl_encoder = LabelEncoder()
country_encoded = lbl_encoder.fit_transform(country)

# "Country", step 2: expand the integer codes into one indicator column per country.
# OneHotEncoder expects 2-D input, so the 1-D code vector is turned into a single
# column of shape (n, 1).
oh_encoder = OneHotEncoder()
country_codes_2d = country_encoded[:, np.newaxis]
# fit_transform() yields a sparse matrix (only the positions of the non-zero
# entries are stored); toarray() expands it into a regular dense NumPy array,
# zeros included.
country_oh = oh_encoder.fit_transform(country_codes_2d).toarray()

# "Status": a plain integer label per row (the output below shows two codes, 0 and 1).
# LabelBinarizer would do the labelling and binarising in a single step instead.
status = data["Status"]
status_encoder = LabelEncoder()
status_encoded = status_encoder.fit_transform(status)
np.unique(status_encoded)


array([0, 1])

In [142]:
# Label each one-hot column with the country it represents. One-hot column j
# corresponds to integer code j, and lbl_encoder.classes_[j] is the country name
# LabelEncoder assigned to that code. (Using country_oh[0, :] as the labels would
# name the columns after the first row's 0.0/1.0 values, producing duplicate,
# meaningless headers.)
country_df = pd.DataFrame(data=country_oh, columns=lbl_encoder.classes_)

# All three pieces are aligned purely by position (fresh 0..n-1 index / plain
# array), so they must have the same number of rows for the join below to be valid.
assert len(imputed_df) == len(country_df) == len(status_encoded)

# axis=1 places country_df's columns to the RIGHT of imputed_df's columns.
final_df = pd.concat([imputed_df, country_df], axis=1)
final_df['status'] = status_encoded


Unnamed: 0,status
239,1
240,0
241,0
242,0
243,0
244,0
245,0
246,0
247,0
248,0


In [110]:
# Unfinished sketch (not executed): a transformation pipeline that would run the
# preprocessing steps in a fixed order.
# NOTE(review): DataFrameSelector is not defined or imported in the cells shown
# here, and the last pipeline step is still an empty placeholder.

# num_attribs = list(imputed_df)
# categorical_attribs = ["Status", "Country"]
#
#
# num_Pipeline = Pipeline([
#     ('selector', DataFrameSelector(num_attribs)),
#     ('imputer', SimpleImputer(strategy="mean")),
#     ()
#
# ])

