In [24]:
# Clear the interactive namespace before running the notebook.
# -f: do not ask for confirmation; -s: soft reset (history is kept).
# The explicit % prefix does not depend on automagic being enabled or on
# no variable called `reset` existing in the namespace.
%reset -fs

In [25]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.preprocessing import OneHotEncoder
from sklearn import linear_model, tree
from sklearn.model_selection import train_test_split,cross_val_score, cross_val_predict,GridSearchCV,RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
 
from sklearn.metrics import roc_curve, confusion_matrix,precision_recall_curve,roc_auc_score
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score 

import xgboost as xgb # note for installation use py-xgboost in conda

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_selection import SelectFromModel

%matplotlib inline
# The bundled 'seaborn' style sheet was renamed to 'seaborn-v0_8' in
# matplotlib 3.6 and the old name was removed in 3.8. Use the old name when
# the installed matplotlib still ships it, otherwise fall back to the new one.
plt.style.use('seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8')

# Seed NumPy's global random generator.
np.random.seed(42)

In [26]:
# Read the two application tables from the working directory.
train = pd.read_csv('application_train.csv')
test = pd.read_csv('application_test.csv')

# Add a placeholder `target` column (sentinel -999) to the test frame so it
# carries that column when stacked with the training frame.
test['target'] = -999

# Lower-case all column names and tag every row with the frame it came from.
for frame, origin in ((train, 'train'), (test, 'test')):
    frame.columns = frame.columns.str.lower()
    frame['dataset'] = origin

# Stack train on top of test; sort=True orders the columns alphabetically.
# The original row indices of both frames are kept as they are.
X = pd.concat([train, test], sort=True)

In [27]:
# Recover the train / test partitions from the combined frame using the
# `dataset` marker column.
train_rows = X.dataset == 'train'
test_rows = X.dataset == 'test'
helper_cols = ['target', 'dataset']

# Feature matrices: everything except the label and the marker column.
X_train = X.loc[train_rows].drop(columns=helper_cols)
X_test = X.loc[test_rows].drop(columns=helper_cols)

# Label vectors, taken from the `target` column of each partition.
y_train = X.loc[train_rows, 'target']
y_test = X.loc[test_rows, 'target']

# The marker column is not needed on the combined frame any more.
X.drop(columns='dataset', inplace=True)

In [28]:
# Drop every column whose share of missing values exceeds `drop_threshold`.
# The share is measured on the combined frame X, and the same columns are
# then removed from X, X_train and X_test so all three stay aligned.
drop_threshold = 20  # percent of rows with a missing value

# Per-column missing-value count and percentage (rounded to 2 decimals).
missing = pd.DataFrame(X.isnull().sum(), columns=['Number'])
missing['Percentage'] = round(missing.Number / X.shape[0] * 100, 2)

# Use the threshold variable as the single source of truth for the cut-off.
to_remove = missing[missing.Percentage > drop_threshold].index.tolist()

X.drop(to_remove, axis=1, inplace=True)
X_train.drop(to_remove, axis=1, inplace=True)
X_test.drop(to_remove, axis=1, inplace=True)

In [29]:
# Report the dimensions of each partition and of the combined frame.
for caption, obj in (
    ('X_train shape:', X_train),
    ('X_test shape:', X_test),
    ('y_train shape:', y_train),
    ('y_test shape:', y_test),
    ('X shape:', X),
):
    print(caption, obj.shape)

X_train shape: (307511, 71)
X_test shape: (48744, 71)
y_train shape: (307511,)
y_test shape: (48744,)
X shape: (356255, 72)


In [30]:
# Split the column names by dtype: object columns are treated as categorical,
# everything else as numerical.
object_mask = X.dtypes == object

num_features = X.columns[~object_mask].tolist()
# `target` is the label, not a feature, so take it out of the numerical list.
num_features.remove('target')

cat_features = X.columns[object_mask].tolist()

In [31]:
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Pipeline for numerical features: fill missing values with the column
# median, then standardise with StandardScaler.
# Disabled alternative imputer: KNNImputer(n_neighbors=2, weights='uniform')
num_pipeline = Pipeline(steps=[
    ('imputer_num', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler()),
])

# Pipeline for categorical features: replace missing values with the literal
# string 'missing', then one-hot encode. handle_unknown='ignore' makes the
# encoder tolerate categories at transform time that were not seen during fit.
cat_pipeline = Pipeline(steps=[
    ('imputer_cat', SimpleImputer(strategy='constant', fill_value='missing')),
    ('1hot', OneHotEncoder(handle_unknown='ignore')),
])

In [32]:
# Combined preprocessing step: route the numerical columns through
# `num_pipeline` and the categorical columns through `cat_pipeline`.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_features),
        ('cat', cat_pipeline, cat_features),
    ]
)

In [33]:
# End-to-end model: the preprocessor followed by a logistic regression
# classifier (solver iterations capped at 1000).
pipe_logreg = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('logreg', LogisticRegression(max_iter=1000)),
    ]
)

In [34]:
# Out-of-fold predictions for every training row using 5-fold cross
# validation. With the default `method`, cross_val_predict returns predicted
# class labels (not probabilities).
y_train_predicted = cross_val_predict(
    estimator=pipe_logreg,
    X=X_train,
    y=y_train,
    cv=5,
    verbose=5,
)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   53.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.6min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  2.2min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  3.1min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  4.0min finished


In [35]:
# Accuracy, recall and precision of the cross-validated predictions of the
# LogisticRegression pipeline, each printed with two decimals.
# NOTE(review): the recorded run shows high accuracy with near-zero recall -
# presumably the classes are heavily imbalanced; confirm before relying on
# accuracy.
print('Cross validation scores:')
print('-------------------------')
for template, metric in (
    ("Accuracy: {:.2f}", accuracy_score),
    ("Recall: {:.2f}", recall_score),
    ("Precision: {:.2f}", precision_score),
):
    print(template.format(metric(y_train, y_train_predicted)))

Cross validation scores:
-------------------------
Accuracy: 0.92
Recall: 0.01
Precision: 0.49


In [38]:
# Summarise the categorical columns: non-null counts, dtypes and memory use.
X.loc[:, cat_features].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 356255 entries, 0 to 48743
Data columns (total 11 columns):
code_gender                   356255 non-null object
flag_own_car                  356255 non-null object
flag_own_realty               356255 non-null object
name_contract_type            356255 non-null object
name_education_type           356255 non-null object
name_family_status            356255 non-null object
name_housing_type             356255 non-null object
name_income_type              356255 non-null object
name_type_suite               354052 non-null object
organization_type             356255 non-null object
weekday_appr_process_start    356255 non-null object
dtypes: object(11)
memory usage: 32.6+ MB


In [40]:
# List the distinct values (NaN included) of the `name_type_suite` column.
X[cat_features]['name_type_suite'].unique()

array(['Unaccompanied', 'Family', 'Spouse, partner', 'Children',
       'Other_A', nan, 'Other_B', 'Group of people'], dtype=object)