In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

In [3]:
# Load the income-evaluation CSV into `df`.
# The column labels shown by this notebook carry a leading space (e.g. ' workclass'),
# and unknown entries are written as ' ?'; na_values turns those into NaN.
# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
csv_path = 'C:\\Users\\koriv\\Desktop\\MachineLearning_DataScience\\Hands_On_Machine_Learning\\missing_value_handeling\\income_evaluation.csv'
df = pd.read_csv(csv_path, na_values=' ?')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
# Per-column count of missing (NaN) entries in the loaded frame.
df.isna().sum(axis=0)

age                   0
 workclass         1836
 fnlwgt               0
 education            0
 education-num        0
 marital-status       0
 occupation        1843
 relationship         0
 race                 0
 sex                  0
 capital-gain         0
 capital-loss         0
 hours-per-week       0
 native-country     583
 income               0
dtype: int64

In [5]:
# All missing values observed so far are in categorical columns; the numerical
# columns are complete. Inject some NaNs into two numerical columns so that
# numeric imputation can be demonstrated as well.
#
# Work on an independent copy: a plain `dft = df` would only bind a second name
# to the same DataFrame, so the assignments below would also overwrite `df`.
dft = df.copy()

# Blank out 20 randomly chosen rows of ' hours-per-week' (seeded so the choice is repeatable).
np.random.seed(seed=0)
h = np.random.choice(a=df.index, replace=False, size=20)
dft.loc[h, ' hours-per-week'] = np.nan

# Blank out 28 randomly chosen rows of 'age' (seeded so the choice is repeatable).
np.random.seed(seed=10)
a = np.random.choice(a=df.index, replace=False, size=28)
dft.loc[a, 'age'] = np.nan

In [8]:
# Per-column count of missing (NaN) entries after NaNs were injected into the numeric columns.
dft.isna().sum(axis=0)

age                  28
 workclass         1836
 fnlwgt               0
 education            0
 education-num        0
 marital-status       0
 occupation        1843
 relationship         0
 race                 0
 sex                  0
 capital-gain         0
 capital-loss         0
 hours-per-week      20
 native-country     583
 income               0
dtype: int64

In [17]:
# Hold out 20% of the rows as a test set so generalization error can be checked.
# ' income' is the target; every other column is kept as a feature.
# random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    dft.drop(columns=[' income']),
    dft[' income'],
    test_size=0.2,
    random_state=30,
)

In [20]:
# Per-column count of missing (NaN) entries in the training features.
X_train.isna().sum(axis=0)

age                  23
 workclass         1452
 fnlwgt               0
 education            0
 education-num        0
 marital-status       0
 occupation        1456
 relationship         0
 race                 0
 sex                  0
 capital-gain         0
 capital-loss         0
 hours-per-week      14
 native-country     467
dtype: int64

In [18]:
# Numerical column: impute 'age' with the column mean learned from the training split.
# add_indicator=True appends a second output column that flags the rows that were missing.
si_age = SimpleImputer(strategy='mean', add_indicator=True)
age_imputed = si_age.fit_transform(X_train[['age']])
# NOTE(review): no index is passed here, so `a` is labelled by position
# (assuming the imputer returns a plain array, the scikit-learn default),
# not by X_train's original row labels.
a = pd.DataFrame(age_imputed)
print(si_age.statistics_)  # the value used to fill missing ages ==> the training mean
a.loc[a[1] == 1]  # rows where 'age' was missing

[38.54201729]


Unnamed: 0,0,1
2969,38.542017,1.0
3219,38.542017,1.0
3522,38.542017,1.0
4925,38.542017,1.0
5543,38.542017,1.0
5754,38.542017,1.0
6305,38.542017,1.0
7237,38.542017,1.0
8587,38.542017,1.0
11314,38.542017,1.0


In [19]:
# Categorical column: fill missing ' occupation' entries with a constant placeholder.
# add_indicator=True appends a second output column that flags the rows that were missing.
si_occ = SimpleImputer(strategy='constant', add_indicator=True, fill_value='not available')
# A single fit_transform both fits the imputer and returns the filled data.
b = pd.DataFrame(si_occ.fit_transform(X_train[[' occupation']]))
print(si_occ.statistics_)  # the value used to fill missing entries ==> the constant fill_value
b[b[1] == 1]  # rows where ' occupation' was missing

['not available']


Unnamed: 0,0,1
26,not available,True
29,not available,True
51,not available,True
61,not available,True
73,not available,True
...,...,...
25968,not available,True
25974,not available,True
25998,not available,True
26004,not available,True


In [21]:
# Per-column count of missing (NaN) entries in the training features.
# The imputer results above were stored in separate frames, not written back into X_train.
X_train.isna().sum(axis=0)

age                  23
 workclass         1452
 fnlwgt               0
 education            0
 education-num        0
 marital-status       0
 occupation        1456
 relationship         0
 race                 0
 sex                  0
 capital-gain         0
 capital-loss         0
 hours-per-week      14
 native-country     467
dtype: int64

The imputation strategies available in SimpleImputer:  
If “mean”, then replace missing values using the mean along each column. Can only be used with numeric data.  
If “median”, then replace missing values using the median along each column. Can only be used with numeric data.  
If “most_frequent”, then replace missing using the most frequent value along each column. Can be used with strings or numeric data. If there is more than one such value, only the smallest is returned.  
If “constant”, then replace missing values with fill_value. Can be used with strings or numeric data.  

In [23]:
# Propagate the imputer fitted on the training split to the test split.
# transform (not fit_transform) is used so that missing test ages are filled with
# the mean learned from the training data, not a statistic re-estimated on the test set.
## Age column
a_test = pd.DataFrame(si_age.transform(X_test[['age']]))
a_test[a_test[1] == 1]  # test rows where 'age' was missing

Unnamed: 0,0,1
2526,38.542017,1.0
4068,38.542017,1.0
4111,38.542017,1.0
5324,38.542017,1.0
5930,38.542017,1.0
