In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import copy
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PowerTransformer, OrdinalEncoder

from fancyimpute import KNN


# Load Data

In [2]:
# Train dataset
data_train = pd.read_csv('../data/Exercise_train (1).csv', sep=',')
data_train.head()

Unnamed: 0.1,Unnamed: 0,id,kjkrfgld,bpowgknt,raksnhjf,vwpsxrgk,omtioxzz,yfmzwkru,tiwrsloh,weioazcf,poor
0,2225,29252,KfoTG,zPfZR,DtMvg,,12.0,-3.0,-1.0,0.5,False
1,1598,98286,ljBjd,THHLT,DtMvg,esAQH,21.0,-2.0,-5.0,-9.5,True
2,7896,49040,Lsuai,zPfZR,zeYAm,ZCIYy,12.0,-3.0,-5.0,-9.5,True
3,1458,35261,KfoTG,mDadf,zeYAm,ZCIYy,12.0,-1.0,-5.0,-9.5,False
4,1817,98833,KfoTG,THHLT,DtMvg,ARuYG,21.0,-4.0,-5.0,-9.5,True


In [3]:
# Test dataset
data_test = pd.read_csv('../data/Exercise_test (1).csv', sep=',')
data_test.head()

Unnamed: 0.1,Unnamed: 0,id,kjkrfgld,bpowgknt,raksnhjf,vwpsxrgk,omtioxzz,yfmzwkru,tiwrsloh,weioazcf,poor
0,2197,42706,qzGkS,zPfZR,rXCdD,IJnCs,21.0,-3.0,-5.0,-9.5,False
1,4823,65531,KfoTG,THHLT,DtMvg,XHmQd,21.0,-2.0,-9.0,-19.5,True
2,4165,64452,tnDpM,THHLT,qTmDg,yygvO,30.0,-4.0,-9.0,-19.5,True
3,4962,78022,,WXYiE,DtMvg,XAmOF,21.0,-2.0,3.0,10.5,False
4,4671,45674,Lsuai,zPfZR,DtMvg,hmAUm,30.0,-4.0,-5.0,-9.5,True


In [4]:
data_train = data_train.set_index('id')
data_test = data_test.set_index('id')

# Data Preprocessing
<div style = "float:right"><a style="text-decoration:none" href = "#inicio">Inicio</a></div>

## Binary target variable

Let's transform our target variable into a binary (0, 1) variable.

In [5]:
data_train['poor'] = data_train['poor'].apply(lambda x: 1 if x == True else 0)

In [6]:
data_test['poor'] = data_test['poor'].apply(lambda x: 1 if x == True else 0)

## Numerical features

Two-step approach:
<ol>
    <li> adopt an <b>imputation</b> method for missing values based on the distribution's median,</li>
    <li> <b>standardize</b> data to avoid degradation of the predictive performance of many machine learning algorithms. Unscaled data can also slow down or even prevent the convergence of many gradient-based estimators.<br>
        We first use PowerTransformer method since it accounts for skewed distributions (and outliers?). However, we shall perform other methods and visualize plots in order to make the most accurate data standardization.</li>
</ul>
    
[Documentation](https://scikit-learn.org/stable/auto_examples/preprocessing/plot_all_scaling.html#sphx-glr-auto-examples-preprocessing-plot-all-scaling-py)

In [7]:
def num_proc(data):
    # Numerical features
    data = data.drop('Unnamed: 0', axis=1)
    num_data = data.select_dtypes(include=['float64']).copy()
    
    # Imputation
    median_imputer = SimpleImputer(strategy='median')
    num_data.iloc[:,:] = median_imputer.fit_transform(num_data)
    
    # Standardization
    scaler = PowerTransformer()
    num_data.iloc[:, 0:4] = scaler.fit_transform(num_data)
    
    return num_data

### Train data

In [8]:
num_data_train = num_proc(data_train)
num_data_train.head()

Unnamed: 0_level_0,omtioxzz,yfmzwkru,tiwrsloh,weioazcf
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
29252,-0.480203,0.102811,0.638589,0.628295
98286,0.350762,1.159184,-0.279685,-0.280701
49040,-0.480203,0.102811,-0.279685,-0.280701
35261,-0.480203,1.907488,-0.279685,-0.280701
98833,0.350762,-1.263886,-0.279685,-0.280701


In [9]:
print(num_data_train.isnull().values.sum())

0


### Test data

In [10]:
num_data_test = num_proc(data_test)
num_data_test.head()

Unnamed: 0_level_0,omtioxzz,yfmzwkru,tiwrsloh,weioazcf
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
42706,0.404017,0.101349,-0.278625,-0.27874
65531,0.404017,1.177563,-1.074606,-1.071908
64452,1.174058,-1.248141,-1.074606,-1.071908
78022,0.404017,1.177563,1.970861,1.973978
45674,1.174058,-1.248141,-0.278625,-0.27874


In [12]:
print(num_data_test.isnull().values.sum())

0


## Categorical features

Typically, any standard work-flow in feature engineering involves some form of transformation of these categorical values into numeric labels and then applying some encoding scheme on these values.<br>

As suggested by this [DataCamp course](https://learn.datacamp.com/courses/dealing-with-missing-data-in-python), we may follow this three step-approach:<br>
<ol>
    <li>Convert non-missing categorical columns to ordinal values</li>
    <li>Impute the missing values in the ordinal DataFrame</li>
    <li>Convert back from ordinal values to categorical values</li>
</ol>

In [13]:
def cat_proc(data):
    # Select object types
    cat_data = data.select_dtypes(include=['object']).copy()
    
    # NAs imputation
    # Ordinal Encoding of cat_data_train DataFrame
    ordinal_enc_dict = {}

    for col_name in cat_data:
        # Create an Ordinal Encoder for col
        ordinal_enc_dict[col_name] = OrdinalEncoder()
        col = cat_data[col_name]
    
        # Select non-null values of columns (Ordinal Encoder does not work with NAs)
        col_not_null = col[col.notnull()]
        reshaped_vals = col_not_null.values.reshape(-1, 1)
        encoded_vals = ordinal_enc_dict[col_name].fit_transform(reshaped_vals)
    
        # Store the values to non-null values of the column in data
        cat_data.loc[col.notnull(), col_name] = np.squeeze(encoded_vals)
        
        
    # Create KNN imputer
    KNN_imputer = KNN()

    # Impute and round the users DataFrame
    cat_data.iloc[:, :] = np.round(KNN_imputer.fit_transform(cat_data))
    
    # Loop over the columns
    for col_name in cat_data:
    
        # Reshape the data
        reshaped = cat_data[col_name].values.reshape(-1, 1)
    
        # Perform inverse transform of the ordinally encoded columns
        cat_data[col_name] = ordinal_enc_dict[col_name].inverse_transform(reshaped)
        
    return cat_data      

### Train data

In [14]:
cat_data_train = cat_proc(data_train)
cat_data_train.head()

Imputing row 1/6562 with 1 missing, elapsed time: 3.496
Imputing row 101/6562 with 0 missing, elapsed time: 3.506
Imputing row 201/6562 with 0 missing, elapsed time: 3.507
Imputing row 301/6562 with 0 missing, elapsed time: 3.507
Imputing row 401/6562 with 0 missing, elapsed time: 3.508
Imputing row 501/6562 with 0 missing, elapsed time: 3.509
Imputing row 601/6562 with 0 missing, elapsed time: 3.509
Imputing row 701/6562 with 0 missing, elapsed time: 3.510
Imputing row 801/6562 with 0 missing, elapsed time: 3.510
Imputing row 901/6562 with 0 missing, elapsed time: 3.511
Imputing row 1001/6562 with 0 missing, elapsed time: 3.511
Imputing row 1101/6562 with 0 missing, elapsed time: 3.512
Imputing row 1201/6562 with 0 missing, elapsed time: 3.512
Imputing row 1301/6562 with 1 missing, elapsed time: 3.513
Imputing row 1401/6562 with 1 missing, elapsed time: 3.513
Imputing row 1501/6562 with 1 missing, elapsed time: 3.514
Imputing row 1601/6562 with 0 missing, elapsed time: 3.514
Imputing 

Unnamed: 0_level_0,kjkrfgld,bpowgknt,raksnhjf,vwpsxrgk
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
29252,KfoTG,zPfZR,DtMvg,ZCIYy
98286,ljBjd,THHLT,DtMvg,esAQH
49040,Lsuai,zPfZR,zeYAm,ZCIYy
35261,KfoTG,mDadf,zeYAm,ZCIYy
98833,KfoTG,THHLT,DtMvg,ARuYG


In [15]:
print(cat_data_train.isnull().values.sum())

0


### Test data

In [16]:
cat_data_test = cat_proc(data_test)
cat_data_test.head()

Imputing row 1/1641 with 0 missing, elapsed time: 0.245
Imputing row 101/1641 with 0 missing, elapsed time: 0.246
Imputing row 201/1641 with 0 missing, elapsed time: 0.246
Imputing row 301/1641 with 0 missing, elapsed time: 0.247
Imputing row 401/1641 with 0 missing, elapsed time: 0.247
Imputing row 501/1641 with 0 missing, elapsed time: 0.247
Imputing row 601/1641 with 0 missing, elapsed time: 0.248
Imputing row 701/1641 with 1 missing, elapsed time: 0.248
Imputing row 801/1641 with 1 missing, elapsed time: 0.249
Imputing row 901/1641 with 0 missing, elapsed time: 0.249
Imputing row 1001/1641 with 0 missing, elapsed time: 0.249
Imputing row 1101/1641 with 0 missing, elapsed time: 0.250
Imputing row 1201/1641 with 0 missing, elapsed time: 0.250
Imputing row 1301/1641 with 0 missing, elapsed time: 0.251
Imputing row 1401/1641 with 1 missing, elapsed time: 0.251
Imputing row 1501/1641 with 0 missing, elapsed time: 0.252
Imputing row 1601/1641 with 0 missing, elapsed time: 0.252


Unnamed: 0_level_0,kjkrfgld,bpowgknt,raksnhjf,vwpsxrgk
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
42706,qzGkS,zPfZR,rXCdD,IJnCs
65531,KfoTG,THHLT,DtMvg,XHmQd
64452,tnDpM,THHLT,qTmDg,yygvO
78022,Lsuai,WXYiE,DtMvg,XAmOF
45674,Lsuai,zPfZR,DtMvg,hmAUm


In [17]:
print(cat_data_test.isnull().values.sum())

0


In [19]:
# cat_data_train = cat_data_train.astype('category')
cat_data_train.dtypes

kjkrfgld    object
bpowgknt    object
raksnhjf    object
vwpsxrgk    object
dtype: object

In [65]:
# Label Encoder
# cat_data_train = cat_data_train.apply(lambda col: cat_data_train[col].cat.codes, axis=1)
# cat_data_train.head()

## Features and targets

In [18]:
train = pd.concat([num_data_train, cat_data_train], axis=1)
train = train.join(data_train['poor'])

In [19]:
test = pd.concat([num_data_test, cat_data_test], axis=1)
test = test.join(data_test['poor'])

In [20]:
train.head()

Unnamed: 0_level_0,omtioxzz,yfmzwkru,tiwrsloh,weioazcf,kjkrfgld,bpowgknt,raksnhjf,vwpsxrgk,poor
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
29252,-0.480203,0.102811,0.638589,0.628295,KfoTG,zPfZR,DtMvg,ZCIYy,0
98286,0.350762,1.159184,-0.279685,-0.280701,ljBjd,THHLT,DtMvg,esAQH,1
49040,-0.480203,0.102811,-0.279685,-0.280701,Lsuai,zPfZR,zeYAm,ZCIYy,1
35261,-0.480203,1.907488,-0.279685,-0.280701,KfoTG,mDadf,zeYAm,ZCIYy,0
98833,0.350762,-1.263886,-0.279685,-0.280701,KfoTG,THHLT,DtMvg,ARuYG,1


In [21]:
test.head()

Unnamed: 0_level_0,omtioxzz,yfmzwkru,tiwrsloh,weioazcf,kjkrfgld,bpowgknt,raksnhjf,vwpsxrgk,poor
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
42706,0.404017,0.101349,-0.278625,-0.27874,qzGkS,zPfZR,rXCdD,IJnCs,0
65531,0.404017,1.177563,-1.074606,-1.071908,KfoTG,THHLT,DtMvg,XHmQd,1
64452,1.174058,-1.248141,-1.074606,-1.071908,tnDpM,THHLT,qTmDg,yygvO,1
78022,0.404017,1.177563,1.970861,1.973978,Lsuai,WXYiE,DtMvg,XAmOF,0
45674,1.174058,-1.248141,-0.278625,-0.27874,Lsuai,zPfZR,DtMvg,hmAUm,1


In [22]:
y_train = train.iloc[:, -1]

In [23]:
X_train = train.iloc[:, 0:-1]

In [24]:
y_test = test.iloc[:, -1]

In [25]:
X_test = test.iloc[:, 0:-1]