In [93]:
# preprocessing for modeling data
import pandas as pd

dtypes = {
    'Gender': object,
    'Married': object,
    'Dependents': object,
    'Education': object,
    'Self_Employed': object,
    'Property_Area': object,
}
df = pd.read_csv('./av_loan_u6lujuX_CVtuZ9i.csv', header=0, dtype=dtypes)

x = df.iloc[:, 1:-1]
loan_id = df.iloc[:, [0]]
y = df.iloc[:, [-1]]
y_new = y.copy()
class_mapping = {'N':1, 'Y':0}
y_new.loc[:, 'Loan_Status'] = y_new['Loan_Status'].map(class_mapping)

# one-hot encoding
ohe_columns = []
for i in dtypes:
    ohe_columns.append(i)
    
x_new = pd.get_dummies(x, dummy_na=True, columns=ohe_columns)
x_new_columns = x_new.columns.values
x_new.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Female,Gender_Male,Gender_nan,Married_No,Married_Yes,...,Education_Graduate,Education_Not Graduate,Education_nan,Self_Employed_No,Self_Employed_Yes,Self_Employed_nan,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Property_Area_nan
0,5849,0.0,,360.0,1.0,0,1,0,1,0,...,1,0,0,1,0,0,0,0,1,0
1,4583,1508.0,128.0,360.0,1.0,0,1,0,0,1,...,1,0,0,1,0,0,1,0,0,0
2,3000,0.0,66.0,360.0,1.0,0,1,0,0,1,...,1,0,0,0,1,0,0,0,1,0
3,2583,2358.0,120.0,360.0,1.0,0,1,0,0,1,...,0,1,0,1,0,0,0,0,1,0
4,6000,0.0,141.0,360.0,1.0,0,1,0,1,0,...,1,0,0,1,0,0,0,0,1,0


In [86]:
# imputation missing values
from sklearn.preprocessing import Imputer

imp = Imputer()
imp.fit(x_new)

x_new = pd.DataFrame(imp.transform(x_new), columns=x_new_columns)
x_new.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Female,Gender_Male,Gender_nan,Married_No,Married_Yes,...,Education_Graduate,Education_Not Graduate,Education_nan,Self_Employed_No,Self_Employed_Yes,Self_Employed_nan,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Property_Area_nan
0,5849.0,0.0,146.412162,360.0,1.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,4583.0,1508.0,128.0,360.0,1.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2,3000.0,0.0,66.0,360.0,1.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,2583.0,2358.0,120.0,360.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,6000.0,0.0,141.0,360.0,1.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [87]:
# feature selection
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

selector = RFE(RandomForestClassifier(random_state=1), n_features_to_select=10, step=0.05)
selector.fit(x_new, y_new.as_matrix().ravel())

RFE(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=1, verbose=0, warm_start=False),
  n_features_to_select=10, step=0.05, verbose=0)

In [89]:
# feature extraction
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

pipe_rf = Pipeline([('scl', StandardScaler()), ('pca', PCA(n_components=10)), ('est', RandomForestClassifier(random_state=1))])
pipe_rf.fit(x_new, y_new.as_matrix().ravel())

print('Normally done')

Normally done


In [90]:
# preprocessing for scoring data
df_s = pd.read_csv('./av_loan_test_Y3wMUE5_7gLdaTN.csv', header=0, dtype=dtypes)

x_s = df_s.iloc[:, 1:]

# one-hot encoding
x_new_s = pd.get_dummies(x_s, dummy_na=True, columns=ohe_columns)
x_new_columns_s = x_new_s.columns.values

cols_model = set(x_new_columns)
cols_score = set(x_new_columns_s)
cols_only_model = cols_model - cols_score
cols_only_score = cols_score - cols_model
print('cols_only_model:', cols_only_model)
print('cols_only_score:', cols_only_score)
x_new_s.head()

cols_only_model: {'Dependents_3+'}
cols_only_score: {'Gender_Unknown'}


Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Female,Gender_Male,Gender_Unknown,Gender_nan,Married_No,...,Education_Graduate,Education_Not Graduate,Education_nan,Self_Employed_No,Self_Employed_Yes,Self_Employed_nan,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Property_Area_nan
0,5720,0,110.0,360.0,1.0,0,1,0,0,0,...,1,0,0,1,0,0,0,0,1,0
1,3076,1500,126.0,360.0,1.0,0,1,0,0,0,...,1,0,0,1,0,0,0,0,1,0
2,5000,1800,208.0,360.0,1.0,0,1,0,0,0,...,1,0,0,1,0,0,0,0,1,0
3,2340,2546,100.0,360.0,,0,1,0,0,0,...,1,0,0,1,0,0,0,0,1,0
4,3276,0,78.0,360.0,1.0,0,1,0,0,1,...,0,1,0,1,0,0,0,0,1,0


In [94]:
cols_m = pd.DataFrame(None, columns=x_new_columns)

x_new_s2 = pd.concat([cols_m, x_new_s])

# fill NaN of columns which existing only model data
x_new_s2.loc[:, cols_only_model] = x_new_s2.loc[:, cols_only_model].fillna(0, axis=1)

# remove columns which existing only score data
x_new_s2 = x_new_s2.drop(cols_only_score, axis=1)
x_new_s2.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,Credit_History,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Dependents_nan,Education_Graduate,Education_Not Graduate,...,Married_No,Married_Yes,Married_nan,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Property_Area_nan,Self_Employed_No,Self_Employed_Yes,Self_Employed_nan
0,5720,0,1.0,1,0,0,0,0,1,0,...,0,1,0,0,0,1,0,1,0,0
1,3076,1500,1.0,0,1,0,0,0,1,0,...,0,1,0,0,0,1,0,1,0,0
2,5000,1800,1.0,0,0,1,0,0,1,0,...,0,1,0,0,0,1,0,1,0,0
3,2340,2546,,0,0,1,0,0,1,0,...,0,1,0,0,0,1,0,1,0,0
4,3276,0,1.0,1,0,0,0,0,0,1,...,1,0,0,0,0,1,0,1,0,0


In [95]:
# reindex columns 
x_new_s2 = x_new_s2.reindex(x_new_columns, axis=1)
x_new_s2.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Female,Gender_Male,Gender_nan,Married_No,Married_Yes,...,Education_Graduate,Education_Not Graduate,Education_nan,Self_Employed_No,Self_Employed_Yes,Self_Employed_nan,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Property_Area_nan
0,5720,0,110.0,360.0,1.0,0,1,0,0,1,...,1,0,0,1,0,0,0,0,1,0
1,3076,1500,126.0,360.0,1.0,0,1,0,0,1,...,1,0,0,1,0,0,0,0,1,0
2,5000,1800,208.0,360.0,1.0,0,1,0,0,1,...,1,0,0,1,0,0,0,0,1,0
3,2340,2546,100.0,360.0,,0,1,0,0,1,...,1,0,0,1,0,0,0,0,1,0
4,3276,0,78.0,360.0,1.0,0,1,0,1,0,...,0,1,0,1,0,0,0,0,1,0


In [96]:
# impution missing vlaues
x_new_s2 = pd.DataFrame(imp.transform(x_new_s2), columns=x_new_columns)

x_new_s2.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Female,Gender_Male,Gender_nan,Married_No,Married_Yes,...,Education_Graduate,Education_Not Graduate,Education_nan,Self_Employed_No,Self_Employed_Yes,Self_Employed_nan,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Property_Area_nan
0,5720.0,0.0,110.0,360.0,1.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,3076.0,1500.0,126.0,360.0,1.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,5000.0,1800.0,208.0,360.0,1.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,2340.0,2546.0,100.0,360.0,0.842199,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,3276.0,0.0,78.0,360.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [98]:
# feature selection
x_new_selected_s = x_new_s2.loc[:, x_new_columns[selector.support_]]
x_new_selected_s.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Married_No,Dependents_0,Self_Employed_No,Property_Area_Rural,Property_Area_Semiurban
0,5720.0,0.0,110.0,360.0,1.0,0.0,1.0,1.0,0.0,0.0
1,3076.0,1500.0,126.0,360.0,1.0,0.0,0.0,1.0,0.0,0.0
2,5000.0,1800.0,208.0,360.0,1.0,0.0,0.0,1.0,0.0,0.0
3,2340.0,2546.0,100.0,360.0,0.842199,0.0,0.0,1.0,0.0,0.0
4,3276.0,0.0,78.0,360.0,1.0,1.0,1.0,1.0,0.0,0.0


In [47]:
x_new_selected.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Married_No,Dependents_0,Self_Employed_No,Property_Area_Rural,Property_Area_Semiurban
0,5849.0,0.0,146.412162,360.0,1.0,1.0,1.0,1.0,0.0,0.0
1,4583.0,1508.0,128.0,360.0,1.0,0.0,0.0,1.0,1.0,0.0
2,3000.0,0.0,66.0,360.0,1.0,0.0,1.0,0.0,0.0,0.0
3,2583.0,2358.0,120.0,360.0,1.0,0.0,1.0,1.0,0.0,0.0
4,6000.0,0.0,141.0,360.0,1.0,1.0,1.0,1.0,0.0,0.0
