# Exploratory Analysis 

In [None]:
import pandas as pd
from pandas import Series, DataFrame
import os
import sys
import numpy as np


# Allow up to 500 columns in DataFrame displays.
pd.options.display.max_columns = 500

## Load Data

In [None]:
# Read data/application_train.csv (relative to the working directory).
applicationDF = pd.read_csv(os.path.join(os.curdir, "data", "application_train.csv"))

## Basic Exploration

In [None]:
# Dimensions of the loaded table as a (rows, columns) tuple.
applicationDF.shape

In [None]:
# Print the per-column summary (dtype and non-null count) plus memory usage.
applicationDF.info()

In [None]:
# Preview the first five rows.
applicationDF.head()

In [None]:
# Number of rows per distinct TARGET value.
applicationDF['TARGET'].value_counts()

In [None]:
# Share of rows per distinct TARGET value (count divided by the total row count).
applicationDF["TARGET"].value_counts() / len(applicationDF)

## Clean Data

In [None]:
from tools.modelTools import categoricalColumns, numericalColumns

In [None]:
# Resolve the column-list helpers through the module instead of the bare
# imported names. This cell rebinds `categoricalColumns` / `numericalColumns`
# to the helpers' return values, so calling the bare names would fail as soon
# as the cell is executed a second time.
import tools.modelTools as _model_tools

categoricalColumns = _model_tools.categoricalColumns()
numericalColumns = _model_tools.numericalColumns()

In [None]:
# Missing-value indicators for the numerical columns.
# Only columns that actually contain a missing value are kept.

# 0/1 table: 1 where the original value is null
numericalColumnsNan = applicationDF[numericalColumns].isnull() * 1
# per-column flag: does the column contain any null?
numericalColumnsNanList = numericalColumnsNan.sum(axis=0).gt(0)
# labels of the flagged columns
numericalColumnsNanListFilter = numericalColumnsNanList[numericalColumnsNanList].index
# indicator table restricted to the flagged columns
numericalColumnsNanListFINAL = numericalColumnsNan[numericalColumnsNanListFilter]

In [None]:
# Missing-value indicators for the categorical columns.
# Only columns that actually contain a missing value are kept.

# 0/1 table: 1 where the original value is null
categoricalColumnsNan = applicationDF[categoricalColumns].isnull() * 1
# per-column flag: does the column contain any null?
categoricalColumnsNanList = categoricalColumnsNan.sum(axis=0).gt(0)
# labels of the flagged columns
categoricalColumnsNanListFilter = categoricalColumnsNanList[categoricalColumnsNanList].index
# indicator table restricted to the flagged columns
categoricalColumnsNanListFINAL = categoricalColumnsNan[categoricalColumnsNanListFilter]

In [None]:
# Shapes of the three frames that get concatenated next, one per line.
print(
    applicationDF[categoricalColumns].shape,
    numericalColumnsNanListFINAL.shape,
    categoricalColumnsNanListFINAL.shape,
    sep="\n",
)


In [None]:
# Assemble the frame that is passed to the one-hot encoder: the categorical
# columns plus the 0/1 missing-value indicators.
# The indicator frames carry the labels of the columns they were derived from,
# so the categorical indicators would repeat labels already present in
# applicationDF[categoricalColumns]. The "_NAN" suffix keeps every label in
# the combined frame unique.
categoricalDF = pd.concat(
    [
        applicationDF[categoricalColumns],
        numericalColumnsNanListFINAL.add_suffix("_NAN"),
        categoricalColumnsNanListFINAL.add_suffix("_NAN"),
    ],
    axis=1,
)

In [None]:
# Dimensions of the combined categorical + indicator frame.
categoricalDF.shape


In [None]:
# One-hot encode the combined categorical frame.
# The original call went through an alias `m` that is never defined in this
# notebook. The encoder is imported from tools.modelTools, the module the
# later pipeline cell takes its OneHotEncoder from.
# NOTE(review): confirm this is the encoder `m.` was meant to refer to.
from tools.modelTools import OneHotEncoder

# if you do not want to return a sparse matrix
# cat_encoder = OneHotEncoder(sparse=False)

cat_encoder = OneHotEncoder()
cat_1hot = cat_encoder.fit_transform(categoricalDF)

In [None]:
# Median-impute missing numerical values, then standardise each feature.
# The imports are repeated in this cell because it runs before the later cell
# that imports the same names; without them a top-to-bottom run stops here
# with a NameError.
# NOTE(review): sklearn.preprocessing.Imputer was removed in scikit-learn 0.22
# in favour of sklearn.impute.SimpleImputer; confirm the pinned version.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer, StandardScaler

num_pipeline = Pipeline([
    ("imputer", Imputer(strategy="median")),
    ("std_scaler", StandardScaler()),
])

In [None]:
# Fit the numerical pipeline on the numerical columns of the full table and
# keep the transformed result.
housing_num_tr = num_pipeline.fit_transform(applicationDF[numericalColumns])

## Prepare Data for Modeling

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 split with a fixed seed so the partition is the same on every run.
train_set, test_set = train_test_split(
    applicationDF,
    test_size=0.2,
    random_state=42,
)

# Features: every column except SK_ID_CURR and the TARGET label.
X_train = train_set.drop(["SK_ID_CURR", "TARGET"], axis="columns")
X_test = test_set.drop(["SK_ID_CURR", "TARGET"], axis="columns")

# Labels: independent copies of the TARGET column.
y_train = train_set["TARGET"].copy()
y_test = test_set["TARGET"].copy()

In [None]:
# Load IPython's autoreload extension so the %aimport / %autoreload magics are available.
%load_ext autoreload

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Imputer

# Import tools.modelTools and mark it for autoreload; mode 1 reloads only
# modules registered with %aimport before each cell executes.
%aimport tools.modelTools
%autoreload 1

In [None]:
# Numerical branch: DataFrameSelector over the numerical columns, median
# imputation, then standard scaling.
num_pipeline = Pipeline(steps=[
    ("selector", tools.modelTools.DataFrameSelector(numericalColumns)),
    ("imputer", Imputer(strategy="median")),
    ("std_scaler", StandardScaler()),
])

# Categorical branch: DataFrameSelector over the categorical columns (called
# with fillna=True), then the project's one-hot encoder with sparse=False.
cat_pipeline = Pipeline(steps=[
    ("selector", tools.modelTools.DataFrameSelector(categoricalColumns, fillna=True)),
    ("cat_encoder", tools.modelTools.OneHotEncoder(sparse=False)),
])

In [None]:
from sklearn.pipeline import FeatureUnion

# Run both branches on the same input and concatenate their outputs side by side.
full_pipeline = FeatureUnion([
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])

In [None]:
# Fit the combined pipeline on the training features and transform them in one step.
home_credit_prepared = full_pipeline.fit_transform(X_train)
# Trailing expression so the notebook displays the result.
home_credit_prepared

In [None]:
# Dimensions of the prepared training data.
home_credit_prepared.shape

## Train Model

In [None]:
from sklearn.linear_model import SGDClassifier

# Linear classifier trained with SGD: fixed seed, at most five passes over the data.
sgd_clf = SGDClassifier(random_state=42, max_iter=5)
sgd_clf.fit(X=home_credit_prepared, y=y_train)

In [None]:
from sklearn.model_selection import cross_val_score

# Accuracy of the SGD classifier on each of three cross-validation folds.
cross_val_score(
    estimator=sgd_clf,
    X=home_credit_prepared,
    y=y_train,
    scoring="accuracy",
    cv=3,
)

In [None]:
from sklearn.model_selection import cross_val_predict

# Out-of-fold class predictions for the training rows (3-fold cross-validation).
y_train_pred = cross_val_predict(
    estimator=sgd_clf,
    X=home_credit_prepared,
    y=y_train,
    cv=3,
)

In [None]:
from sklearn.metrics import confusion_matrix

# Rows are the true classes, columns the predicted classes.
confusion_matrix(y_true=y_train, y_pred=y_train_pred)

In [None]:
from sklearn.metrics import precision_score, recall_score

# Precision of the cross-validated predictions (recall_score is imported but not called here).
precision_score(y_true=y_train, y_pred=y_train_pred)

In [None]:
from sklearn.metrics import f1_score

# F1 score of the cross-validated predictions.
f1_score(y_true=y_train, y_pred=y_train_pred)

In [None]:
# Out-of-fold decision-function scores, used as ranking scores for the ROC curve.
y_scores = cross_val_predict(
    estimator=sgd_clf,
    X=home_credit_prepared,
    y=y_train,
    cv=3,
    method="decision_function",
)

In [None]:
from sklearn.metrics import roc_curve

# False-positive rate, true-positive rate and the thresholds they were computed at.
fpr, tpr, thresholds = roc_curve(y_true=y_train, y_score=y_scores)

In [None]:
import matplotlib.pyplot as plt

In [None]:
def plot_roc_curve(fpr, tpr, label=None):
    """Draw a ROC curve with pyplot's state-based interface.

    Plots ``tpr`` against ``fpr`` as a line carrying ``label``, adds a dashed
    black diagonal from (0, 0) to (1, 1), fixes both axes to the [0, 1] range
    and sets the axis titles. It does not create a figure, draw a legend or
    call ``plt.show()``; the caller does that.
    """
    plt.plot(fpr, tpr, label=label, linewidth=2)
    plt.plot([0, 1], [0, 1], "k--")
    plt.axis([0, 1, 0, 1])
    axis_titles = (
        (plt.xlabel, "False Positive Rate"),
        (plt.ylabel, "True Positive Rate"),
    )
    for set_title, text in axis_titles:
        set_title(text, fontsize=16)

# Plot the ROC curve computed from the SGD classifier's cross-validated scores.
plt.figure(figsize=(8, 6))
plot_roc_curve(fpr, tpr)
plt.show()

In [None]:
from sklearn.metrics import roc_auc_score

# Area under the ROC curve for the SGD scores.
roc_auc_score(y_true=y_train, y_score=y_scores)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Out-of-fold class probabilities from a seeded random forest (3-fold cross-validation).
forest_clf = RandomForestClassifier(random_state=42)
y_probas_forest = cross_val_predict(
    estimator=forest_clf,
    X=home_credit_prepared,
    y=y_train,
    cv=3,
    method="predict_proba",
)

In [None]:
# The second probability column (positive class) serves as the ranking score.
y_scores_forest = y_probas_forest[:, 1]
fpr_forest, tpr_forest, thresholds_forest = roc_curve(
    y_true=y_train,
    y_score=y_scores_forest,
)

In [None]:
# Overlay both ROC curves: SGD as a dotted blue line, Random Forest through
# plot_roc_curve (which also adds the diagonal, axis limits and axis labels).
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, "b:", linewidth=2, label="SGD")
plot_roc_curve(fpr_forest, tpr_forest, "Random Forest")
plt.legend(loc="lower right", fontsize=16)
plt.show()

In [None]:
# Area under the ROC curve for the random-forest scores.
roc_auc_score(y_true=y_train, y_score=y_scores_forest)

### Questions

1. How should categorical data be fed to a neural network?

### Playground

In [None]:
# List data/ in long format, sorted by modification time with the oldest entries first.
!ls -ltr data/

In [None]:
# Load data/bureau_balance.csv and preview its first rows.
bureauBalancePath = os.path.join(os.curdir, "data", "bureau_balance.csv")
df = pd.read_csv(bureauBalancePath)
df.head()

In [None]:
# Number of rows per distinct STATUS value.
df.STATUS.value_counts()

In [None]:
# Load data/bureau.csv and preview its first rows.
# NOTE(review): `bureauBalancePath` and `df` are reused from the bureau_balance
# cells, so after this cell both refer to bureau.csv.
bureauBalancePath = os.path.join(os.curdir, "data", "bureau.csv")
df = pd.read_csv(bureauBalancePath)
df.head()

### Ideas
1. 