In [1]:
import numpy as np
import pandas as pd
from impyute.imputation.cs import mice              # https://impyute.readthedocs.io/en/latest/_modules/impyute/imputation/cs/mice.html
from sklearn.impute import SimpleImputer            # https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression


## Basic preprocessing

In [2]:
def basic_clean(X, y):
    """
    Drop unlabeled rows, impute and encode the categorical columns, then standardize X.

    Steps
    -----
    1. Rows whose label in ``y`` is NaN are removed from both ``X`` and ``y``;
       both indices are then reset to 0..n-1.
    2. Missing ``Loan_type`` values are filled with the mode of the row's class
       and the column is encoded as ``'A' -> 1``, ``'B' -> 0``.
    3. Missing ``Occupation_type`` values are filled with the mode of the row's
       class and the column is one-hot encoded.
    4. Every column is passed through ``StandardScaler``.

    Parameters
    ----------
    X : pd.DataFrame
        Features; must contain 'Loan_type' and 'Occupation_type'. Row i of X
        is taken to belong to row i of y.
    y : pd.DataFrame
        Must contain a 'Label' column (1 = positive class, 0 = negative class).

    Returns
    -------
    (X, y) : tuple of pd.DataFrame
        The scaled feature frame (fresh 0..n-1 index) and the filtered labels.

    Notes
    -----
    Rows whose label is neither 0 nor 1 are not carried into the returned X
    (they remain in y), because X is rebuilt from the two class subsets only.
    """
    # Select labeled rows by POSITION. The previous `np.where(...)[0] + 1`
    # turned positions into labels by adding one, which only drops the right
    # rows when the index is exactly 1..n.
    labelled = ~y.isna().any(axis = 1).to_numpy()
    X = X.loc[labelled].reset_index(drop = True)
    y = y.loc[labelled].reset_index(drop = True)

    # get class indices
    pos_idx = y.index[y['Label'] == 1]
    neg_idx = y.index[y['Label'] == 0]

    def fill_with_class_mode(frame, column):
        """Fill NaNs in `column` with the mode observed inside each class."""
        parts = []
        for class_idx in (pos_idx, neg_idx):
            # explicit copy: we assign into this subset below
            part = frame.loc[class_idx].copy()
            modes = part[column].mode()
            # an empty class / all-NaN column has no mode; leave it untouched
            if not modes.empty:
                part[column] = part[column].fillna(modes.iloc[0])
            parts.append(part)
        return pd.concat(parts, sort = False).sort_index()

    # loan type is imputed with class conditional mode, then binary encoded
    X = fill_with_class_mode(X, 'Loan_type')
    X['Loan_type'] = X['Loan_type'].map({'A': 1, 'B': 0}, na_action = 'ignore')

    # occupation is imputed with class conditional mode, then one-hot encoded
    X = fill_with_class_mode(X, 'Occupation_type')
    occ_type = pd.get_dummies(X['Occupation_type'])
    X = pd.concat([X.drop('Occupation_type', axis = 1), occ_type], axis = 1)

    scaler = StandardScaler()
    X = pd.DataFrame(scaler.fit_transform(X), columns = X.columns)

    return X, y


## Class conditional wrapper

In [5]:
def class_conditional(imputer, X, y):
    """
    Apply `imputer` separately to the positive and the negative class of X.

    Parameters
    ----------
    imputer : callable
        Takes a DataFrame and returns the imputed data, either as a DataFrame
        or as an array-like with one row per input row.
    X : pd.DataFrame
        Features to impute.
    y : pd.DataFrame
        Must contain a 'Label' column (1 = positive, 0 = negative) and share
        its index with X.

    Returns
    -------
    pd.DataFrame
        Imputed rows of both classes, carrying their original index labels and
        sorted by index. Rows whose label is neither 0 nor 1 are not included.
    """
    assert isinstance(X, pd.DataFrame)
    col_names = X.columns

    def impute_class(label):
        class_idx = y.index[y['Label'] == label]
        subset = X.loc[class_idx]
        filled = imputer(subset)

        if isinstance(filled, pd.DataFrame):
            # Some imputers return a frame with a reset index; put the original
            # labels back so the final sort_index restores the row order.
            if len(filled) == len(subset):
                filled = filled.copy()
                filled.index = subset.index
            return filled

        # Array output: without an explicit index both classes would be
        # numbered 0..k-1 and collide after concatenation.
        return pd.DataFrame(filled, columns = col_names, index = subset.index)

    # positive class first, then negative, same call order as before
    return pd.concat([impute_class(1), impute_class(0)], sort = False).sort_index()

## Single Imputation

In [6]:
def num_imputer(X, strategy = 'mean'):
    """
    Single imputation of NaNs with scikit-learn's SimpleImputer.

    X        (pd.DataFrame) : data to impute; anything else fails the assertion
    strategy (string)       : forwarded to SimpleImputer, e.g. 'mean' or 'median'
                              (see the SimpleImputer documentation for others)

    Returns the output of SimpleImputer.fit_transform unchanged, i.e. not
    re-wrapped into a DataFrame here.
    """
    assert isinstance(X, pd.DataFrame)

    simple_imputer = SimpleImputer(strategy = strategy, missing_values = np.nan)
    filled = simple_imputer.fit_transform(X)
    return filled
    

## Simple Random Imputation

In [7]:
def random_imputation(X, random_state = 42):
    """
    Fill every NaN with a value drawn at random (with replacement) from the
    observed entries of the same column.

    X            (pd.DataFrame) : data to impute; it is copied, not modified
    random_state (int)          : seed handed to np.random.seed

    NOTE: this seeds NumPy's *global* random generator, so any np.random call
    made after this function continues from that seeded state.
    """
    assert isinstance(X, pd.DataFrame)
    columns_with_gaps = X.columns[X.isnull().any()]
    np.random.seed(random_state)

    imputed = X.copy()
    for col in columns_with_gaps:
        gaps = imputed[col].isnull()
        donors = imputed.loc[~gaps, col]
        # one draw per missing cell, sampled from the observed values
        imputed.loc[gaps, col] = np.random.choice(donors, gaps.sum(), replace = True)

    return imputed

## MICE

In [9]:
def mice_imputer(X):
    """
    Impute missing values with impyute's `mice`.

    1. this can take 5-10 minutes to finish imputing
    2. X (pd.DataFrame) : data to impute; its values are passed to `mice`
       as a NumPy array
    3. returns a DataFrame with the same columns AND the same index as X, so
       the result stays row-aligned when X is a subset of a larger frame
    """
    assert isinstance(X, pd.DataFrame)
    # Keep the caller's index: a fresh 0..n-1 index would break alignment when
    # the imputed class subsets are concatenated and sorted again.
    return pd.DataFrame(mice(X.values), columns = X.columns, index = X.index)


## Stochastic Regression Imputation

In [10]:
def stochastic_regr_imputer(X):
    """
    Stochastic regression imputation.

    For every column that has missing values:
      1. a LinearRegression is fitted on a randomly pre-imputed copy of X
         (from `random_imputation`), using all other columns as predictors;
      2. the spread of the prediction errors on the rows where the column was
         actually observed is used as the noise scale;
      3. each missing cell receives its regression prediction plus Gaussian
         noise of that scale.

    X (pd.DataFrame) : data to impute; it is copied, not modified.

    Returns a DataFrame shaped like X with the missing cells filled in.

    NOTE: the Gaussian noise comes from NumPy's global random generator. That
    generator is seeded inside `random_imputation`, so repeated calls on the
    same input give the same result.
    """
    assert isinstance(X, pd.DataFrame)

    df = X.copy()
    X_rand = random_imputation(X)
    missing_columns = df.columns[df.isnull().any()]

    for feature in missing_columns:

        # Keep the predictors in column order. Building this list from a set
        # made the order (and so the low-order bits of the fit) depend on
        # string hashing, which changes between interpreter runs.
        parameters = [column for column in df.columns if column != feature]
        is_missing = df[feature].isnull().to_numpy()

        model = LinearRegression()
        model.fit(X = X_rand[parameters], y = X_rand[feature])
        predict = model.predict(X_rand[parameters])

        # Standard error of the regression estimate = std() of the prediction
        # errors, measured only on rows where the feature was really observed
        residuals = predict[~is_missing] - X_rand.loc[~is_missing, feature]
        std_error = residuals.std()

        # one noisy prediction per row; only the missing rows are used
        random_predict = np.random.normal(size = df[feature].shape[0],
                                          loc = predict,
                                          scale = std_error)
        df.loc[is_missing, feature] = random_predict[is_missing]

    return df
