In [1]:
import numpy as np
import pandas as pd

## Basic preprocessing

In [20]:
# Column names applied to the feature file. The file's own first row
# (presumably its header) is skipped and these names are used instead.
names = [
    'ID', 'Expense', 'Income', 'Loan_type', 'Occupation_type',
    'Age', 'Score1', 'Score2', 'Score3', 'Score4', 'Score5',
]

# Features, indexed by ID.
X = pd.read_csv(
    'dataset/train_x.csv',
    names = names,
    skiprows = 1,
    index_col = 'ID',
)
# Labels, indexed by ID; column names come from the file itself.
y = pd.read_csv('dataset/train_y.csv', index_col = 'ID')

In [21]:
# Remove every row whose label is missing.
# The rows are selected by their actual ID labels instead of "row position + 1",
# which is only correct when the IDs are exactly 1..N in file order.
drop_indices = y.index[y.isna().any(axis = 1)]
# Drop the same IDs from both frames, then switch to a plain 0..n-1 index so
# X and y stay aligned row by row.
X = X.drop(drop_indices, axis = 0).reset_index(drop = True)
y = y.drop(drop_indices, axis = 0).reset_index(drop = True)

In [22]:
# Row labels belonging to each class (Label == 1 / Label == 0)
pos_idx = y.index[y['Label'].eq(1)]
neg_idx = y.index[y['Label'].eq(0)]

In [23]:
# Most frequent loan type within each class; mode() can return several values
# on a tie, in which case the first one is used.
pos_loan = X.loc[pos_idx, 'Loan_type'].mode()[0]
neg_loan = X.loc[neg_idx, 'Loan_type'].mode()[0]

In [24]:
# Fill missing loan types with the most frequent loan type of the row's own class.
X_pos = X.loc[pos_idx].assign(
    Loan_type = lambda part: part['Loan_type'].fillna(pos_loan)
)
X_neg = X.loc[neg_idx].assign(
    Loan_type = lambda part: part['Loan_type'].fillna(neg_loan)
)

# Recombine both classes and restore the original row order.
X = pd.concat([X_pos, X_neg], sort = False).sort_index()

# Encode the loan type numerically: 'A' -> 1, 'B' -> 0 (NaN entries are skipped).
X['Loan_type'] = X['Loan_type'].map({'A': 1, 'B': 0}, na_action = 'ignore')

In [25]:
# Occupation is imputed with the class-conditional mode: the most frequent
# occupation among the rows of each class (first value on a tie).
pos_occ = X.loc[pos_idx, 'Occupation_type'].mode()[0]
neg_occ = X.loc[neg_idx, 'Occupation_type'].mode()[0]

In [26]:
# Fill missing occupations with the most frequent occupation of the row's own class.
X_pos = X.loc[pos_idx].assign(
    Occupation_type = lambda part: part['Occupation_type'].fillna(pos_occ)
)
X_neg = X.loc[neg_idx].assign(
    Occupation_type = lambda part: part['Occupation_type'].fillna(neg_occ)
)

# Recombine both classes and restore the original row order.
X = pd.concat([X_pos, X_neg], sort = False).sort_index()

# One-hot encode the occupation and swap the indicator columns in for the
# original categorical column.
occ_type = pd.get_dummies(X['Occupation_type'])
X = pd.concat([X.drop('Occupation_type', axis = 1), occ_type], axis = 1)

In [27]:
from sklearn.preprocessing import StandardScaler
# Remember the column names, standardise every column, and wrap the scaled
# values back into a DataFrame carrying those names.
col_names = X.columns
scaler = StandardScaler()
X = pd.DataFrame(data = scaler.fit_transform(X), columns = col_names)

## Class conditional wrapper

In [28]:
def class_conditional(imputer, X, y):
    """
    Apply `imputer` separately to the positive-class and negative-class rows of X.

    1. imputer (callable) : takes a DataFrame and returns the imputed data,
       either as a DataFrame or as an array-like of the same shape
    2. X (DataFrame) : features, sharing its index with y
    3. y (DataFrame) : must contain a 'Label' column holding 1 / 0
    4. returns a DataFrame with both imputed subsets merged and sorted by index;
       rows whose label is neither 1 nor 0 are not part of the result
    """

    def impute_subset(subset):
        imputed = imputer(subset)
        if isinstance(imputed, pd.DataFrame):
            return imputed
        # Array-returning imputers lose the index and column names; put them
        # back so the two halves can be concatenated and re-sorted.
        return pd.DataFrame(imputed, index = subset.index, columns = subset.columns)

    pos_idx = y.loc[y['Label'] == 1].index
    neg_idx = y.loc[y['Label'] == 0].index

    X_pos = impute_subset(X.loc[pos_idx])
    X_neg = impute_subset(X.loc[neg_idx])
    return pd.concat([X_pos, X_neg], sort = False).sort_index()

## Single Imputation

In [29]:
from sklearn.impute import SimpleImputer
# https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html

def num_imputer(X, strategy):
    """
    Fill the NaN entries of X using scikit-learn's SimpleImputer.

    1. strategy (string) : forwarded to SimpleImputer, e.g. 'mean' or 'median'
       (see the SimpleImputer documentation for the available strategies)
    2. returns whatever SimpleImputer.fit_transform(X) returns
    """
    return SimpleImputer(strategy = strategy, missing_values = np.nan).fit_transform(X)
    

In [30]:
# Mean-impute X and re-attach the column names to the imputer's output.
X_mean = pd.DataFrame(num_imputer(X, 'mean'), columns = X.columns)

In [31]:
# Inspect non-null counts and dtypes to confirm the mean imputation left no gaps.
X_mean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76097 entries, 0 to 76096
Data columns (total 12 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Expense    76097 non-null  float64
 1   Income     76097 non-null  float64
 2   Loan_type  76097 non-null  float64
 3   Age        76097 non-null  float64
 4   Score1     76097 non-null  float64
 5   Score2     76097 non-null  float64
 6   Score3     76097 non-null  float64
 7   Score4     76097 non-null  float64
 8   Score5     76097 non-null  float64
 9   X          76097 non-null  float64
 10  Y          76097 non-null  float64
 11  Z          76097 non-null  float64
dtypes: float64(12)
memory usage: 7.0 MB


## Simple Random Imputation

In [50]:
def random_imputation(X, random_state = 42):
    """
    Fill each missing entry with a value drawn at random (with replacement)
    from the observed entries of the same column.

    1. X (DataFrame) : left untouched; a filled copy is returned
    2. random_state : passed to np.random.seed, i.e. NumPy's global RNG is
       reseeded on every call
    """
    np.random.seed(random_state)
    filled = X.copy()

    # Walk the columns in order; only columns with gaps consume random draws.
    for feature in X.columns:
        is_missing = filled[feature].isnull()
        n_missing = is_missing.sum()
        if n_missing == 0:
            continue
        pool = filled.loc[~is_missing, feature]
        filled.loc[is_missing, feature] = np.random.choice(pool, n_missing, replace = True)

    return filled

In [41]:
# Fill the gaps in X with random draws from each column (default random_state).
X_rand = random_imputation(X)

In [42]:
# Inspect non-null counts and dtypes to confirm the random imputation left no gaps.
X_rand.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76097 entries, 0 to 76096
Data columns (total 12 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Expense    76097 non-null  float64
 1   Income     76097 non-null  float64
 2   Loan_type  76097 non-null  float64
 3   Age        76097 non-null  float64
 4   Score1     76097 non-null  float64
 5   Score2     76097 non-null  float64
 6   Score3     76097 non-null  float64
 7   Score4     76097 non-null  float64
 8   Score5     76097 non-null  float64
 9   X          76097 non-null  float64
 10  Y          76097 non-null  float64
 11  Z          76097 non-null  float64
dtypes: float64(12)
memory usage: 7.0 MB


## MICE

In [None]:
from impyute.imputation.cs import mice
# https://impyute.readthedocs.io/en/latest/_modules/impyute/imputation/cs/mice.html

def mice_imputer(X):
    """
    Run impyute's MICE imputation on the values of X.

    1. this will take 5-10 minutes to finish imputing
    2. returns the output of mice() as-is (column names are not re-attached)
    """
    raw_values = X.values
    return mice(raw_values)

# Impute with MICE and re-attach the saved column names to the result.
X_mice = pd.DataFrame(mice_imputer(X), columns = col_names)


In [None]:
# Inspect non-null counts and dtypes of the MICE-imputed frame.
X_mice.info()

## Stochastic Regression Imputation

In [46]:
# Preview the first rows of X before imputation; missing entries are still present.
X.head()

Unnamed: 0,Expense,Income,Loan_type,Age,Score1,Score2,Score3,Score4,Score5,X,Y,Z
0,0.727616,-0.820197,-1.084838,1.125771,-1.380902,0.459216,1.500287,0.134148,0.727577,-0.520531,1.034376,-0.660433
1,-0.665859,-0.346448,-1.084838,-0.88828,0.426981,0.076698,-0.458134,0.128304,-0.665198,-0.520531,1.034376,-0.660433
2,-1.343496,1.727059,0.921796,-0.88828,0.212984,-0.299051,-1.188483,-0.469986,-1.342495,-0.520531,1.034376,-0.660433
3,,0.577993,0.921796,-0.88828,0.940929,-0.029602,-0.273081,0.535119,-0.395711,-0.520531,1.034376,-0.660433
4,0.32757,0.633001,-1.084838,1.125771,,1.119919,0.211237,1.448825,0.327732,1.921116,-0.966766,-0.660433


In [53]:
from sklearn.linear_model import LinearRegression

def stochastic_regr_imputer(X, random_state = 42):
    """
    Impute each incomplete column with a linear-regression prediction plus
    Gaussian noise.

    1. X (DataFrame) : left untouched; a filled copy is returned
    2. random_state : forwarded to random_imputation, which reseeds NumPy's
       global RNG; the noise below is drawn from that same global RNG
    3. the regressions are fitted on a randomly imputed copy of X, so that the
       predictors passed to LinearRegression contain no NaNs
    """
    df = X.copy()
    X_rand = random_imputation(X, random_state = random_state)
    missing_columns = df.columns[df.isnull().any()]

    for feature in missing_columns:
        # Keep the predictors in column order. A set difference would make the
        # order depend on string hashing, which varies between interpreter sessions.
        parameters = [column for column in df.columns if column != feature]

        # Masks are taken from the original data, before this column is filled.
        observed = df[feature].notnull().to_numpy()
        missing = ~observed

        model = LinearRegression()
        model.fit(X = X_rand[parameters], y = X_rand[feature])
        predict = model.predict(X_rand[parameters])

        # Standard error of the regression: spread of the residuals on the rows
        # where the target was actually observed.
        residuals = predict[observed] - X_rand.loc[observed, feature]
        std_error = residuals.std()

        # One noisy prediction per row; only the rows that were missing are used.
        random_predict = np.random.normal(size = df[feature].shape[0],
                                          loc = predict,
                                          scale = std_error)
        df.loc[missing, feature] = random_predict[missing]

    return df


In [54]:
# Impute X with regression predictions plus Gaussian noise.
X_stoch = stochastic_regr_imputer(X)

In [55]:
# Inspect non-null counts and dtypes to confirm the stochastic regression imputation left no gaps.
X_stoch.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76097 entries, 0 to 76096
Data columns (total 12 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Expense    76097 non-null  float64
 1   Income     76097 non-null  float64
 2   Loan_type  76097 non-null  float64
 3   Age        76097 non-null  float64
 4   Score1     76097 non-null  float64
 5   Score2     76097 non-null  float64
 6   Score3     76097 non-null  float64
 7   Score4     76097 non-null  float64
 8   Score5     76097 non-null  float64
 9   X          76097 non-null  float64
 10  Y          76097 non-null  float64
 11  Z          76097 non-null  float64
dtypes: float64(12)
memory usage: 7.0 MB
