In [20]:
from fancyimpute import KNN
from sklearn.preprocessing import OrdinalEncoder
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [21]:
# Load the training data; the first CSV column becomes the row index.
df = pd.read_csv('train.csv',index_col=0)

In [22]:
# Separate the target column from the remaining feature columns.
y = df['Survived']
X = df.drop(columns=['Survived'])

In [23]:
# Inspect the feature frame before any encoding is applied.
X

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,3,"Baund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...
870,3,"Johnson, Master. Harold Theodor",male,4.0,1,1,347742,11.1333,,S
871,3,"Balkic, Mr. Cerin",male,26.0,0,0,349248,7.8958,,S
872,1,"Beckwith, Mrs. Richard Leonard (Sallie Monypeny)",female,47.0,1,1,11751,52.5542,D35,S
873,1,"Carlsson, Mr. Frans Olof",male,33.0,0,0,695,5.0000,B51 B53 B55,S


An ordinal encoder is needed to convert the categorical columns to numbers. The ordinal encoder does not accept NaN, so only the non-null values are encoded.

In [24]:
# Module-level encoder used by encode(); it is re-fitted on every column that encode() processes.
encoder = OrdinalEncoder()

In [25]:
# Columns that are passed through encode() to obtain ordinal codes.
categorical_features = [
    'Embarked',
    'Sex',
    'Pclass',
]

In [26]:
def encode(data):
    '''Ordinal-encode the non-null entries of ``data`` in place.

    Missing values are left untouched so that they can be imputed later.

    Parameters
    ----------
    data : pandas.Series
        Column to encode. It is modified in place.

    Returns
    -------
    pandas.Series
        The same ``data`` object, with every non-null entry replaced by the
        code produced by the module-level ``encoder``. An empty or all-null
        column is returned unchanged.

    Notes
    -----
    ``encoder`` is re-fitted on each call, so afterwards it only holds the
    categories of the most recently encoded column.
    '''
    # Mask of the entries that actually carry a category.
    present = data.notnull()
    # Nothing to encode (empty or all-null column): fitting the encoder on
    # zero samples would raise, so hand the column back unchanged.
    if not present.any():
        return data
    # The encoder expects a 2-D array with one column.
    observed = np.asarray(data[present]).reshape(-1, 1)
    # Encode the data.
    codes = encoder.fit_transform(observed)
    # Flatten back to 1-D before assigning; unlike squeeze(), ravel() keeps a
    # single encoded value as a length-1 array instead of a 0-d array.
    data.loc[present] = codes.ravel()
    return data

# Encode every categorical column and write the result back explicitly.
# Relying on encode() mutating the Series returned by X[columns] only works
# while that Series is a view of X; with pandas Copy-on-Write it is not, and
# X would silently stay unencoded.
for columns in categorical_features:
    X[columns] = encode(X[columns].copy())

In [27]:
# Inspect the frame after encoding; encode() only rewrites non-null entries, so missing values remain.
X

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,2.0,"Baund, Mr. Owen Harris",1.0,22.0,1,0,A/5 21171,7.2500,,2
2,0.0,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0.0,38.0,1,0,PC 17599,71.2833,C85,0
3,2.0,"Heikkinen, Miss. Laina",0.0,26.0,0,0,STON/O2. 3101282,7.9250,,2
4,0.0,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0.0,35.0,1,0,113803,53.1000,C123,2
5,2.0,"Allen, Mr. William Henry",1.0,35.0,0,0,373450,8.0500,,2
...,...,...,...,...,...,...,...,...,...,...
870,2.0,"Johnson, Master. Harold Theodor",1.0,4.0,1,1,347742,11.1333,,2
871,2.0,"Balkic, Mr. Cerin",1.0,26.0,0,0,349248,7.8958,,2
872,0.0,"Beckwith, Mrs. Richard Leonard (Sallie Monypeny)",0.0,47.0,1,1,11751,52.5542,D35,2
873,0.0,"Carlsson, Mr. Frans Olof",1.0,33.0,0,0,695,5.0000,B51 B53 B55,2


In [30]:
# List the column labels of X.
X.columns

Index(['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare',
       'Cabin', 'Embarked'],
      dtype='object')

In [32]:
# Keep only the columns that are handed to the imputer.
X = X.loc[:, ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked']]

In [33]:
imputer = KNN()
# Impute the missing values, then round the result (presumably so imputed
# categorical codes fall on whole-number codes again).
# NOTE(review): the rounding is applied to every column, Age and Fare
# included -- confirm that this loss of precision is intended.
# X's index is carried over along with its columns; without it the new frame
# gets a fresh 0..n-1 index and no longer lines up with y.
encode_data = pd.DataFrame(
    np.round(imputer.fit_transform(X)),
    columns=X.columns,
    index=X.index,
)

Imputing row 1/874 with 0 missing, elapsed time: 0.097
Imputing row 101/874 with 0 missing, elapsed time: 0.098
Imputing row 201/874 with 0 missing, elapsed time: 0.099
Imputing row 301/874 with 1 missing, elapsed time: 0.100
Imputing row 401/874 with 0 missing, elapsed time: 0.101
Imputing row 501/874 with 0 missing, elapsed time: 0.102
Imputing row 601/874 with 0 missing, elapsed time: 0.104
Imputing row 701/874 with 0 missing, elapsed time: 0.105
Imputing row 801/874 with 0 missing, elapsed time: 0.106


# Example of using KNN imputation on purely numeric data

### This also keeps track of column names and data types

In [97]:
# Toy frame: columns A-C contain missing values, column D is complete.
df = pd.DataFrame(
    [
        [np.nan, 2, np.nan, 0],
        [3, 4, np.nan, 1],
        [np.nan, np.nan, np.nan, 5],
        [np.nan, 3, np.nan, 4],
        [5, 7, 8, 2],
        [2, 5, 7, 9],
    ],
    columns=['A', 'B', 'C', 'D'],
)

# Remember the column labels and the dtype name of each column so that both
# can be re-applied once the imputer has processed the frame.
cols = df.columns
dictTypes = {label: dtype.name for label, dtype in df.dtypes.items()}

In [98]:
# Create a KNN imputer with its default settings.
knn_imputer = KNN() 
# Fill in the missing values. The result is bound back to `df`; the next
# cell rebuilds a DataFrame from it using the saved column labels.
df = knn_imputer.fit_transform(df) 

Imputing row 1/6 with 2 missing, elapsed time: 0.001


In [104]:
# Rebuild a labelled frame from the imputed values, cast every column back to
# the dtype recorded before imputation, then show the frame and its dtypes.
df = pd.DataFrame(df, columns=cols).astype(dictTypes)
print(df)
print(df.dtypes)

          A         B         C  D
0  3.235569  2.000000  7.756303  0
1  3.000000  4.000000  7.825000  1
2  3.676471  3.463866  7.640000  5
3  3.355140  3.000000  7.591837  4
4  5.000000  7.000000  8.000000  2
5  2.000000  5.000000  7.000000  9
A    float64
B    float64
C    float64
D      int64
dtype: object
