In [1]:
# Import pandas and numpy
import pandas as pd
import numpy as np

# Import seaborn and matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# Import OneHotEncoder, MinMaxScaler, and FunctionTransformer from sklearn.preprocessing
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, FunctionTransformer

# Import Pipeline and FeatureUnion
from sklearn.pipeline import Pipeline, FeatureUnion

# Import train_test_split from sklearn.model_selection
from sklearn.model_selection import train_test_split

# Import LogisticRegression from sklearn.linear_model
from sklearn.linear_model import LogisticRegression

# Import LinearSVC from sklearn.svm
from sklearn.svm import LinearSVC

# Import DecisionTreeClassifier from sklearn.tree
from sklearn.tree import DecisionTreeClassifier

# Import RandomForestClassifier from sklearn.ensemble
from sklearn.ensemble import RandomForestClassifier

# Import roc_auc_score from sklearn.metrics
from sklearn.metrics import roc_auc_score

In [2]:
# Import the final working data (encoded)
train_df = pd.read_csv('./Intermediate_Data/encoded_train.csv', encoding='utf-8', index_col=0)

# Import the test data
test_df = pd.read_csv('./Intermediate_Data/encoded_test.csv', encoding='utf-8', index_col=0)

### Model Approach
For the development of this model, I will be using pipelines with an encoding step (OneHotEncoder) and a modeling step (LogisticRegression/LinearSVC/DecisionTreeClassifier/RandomForestClassifier)). I will attempt a combination of different variables (exclude some of the variables that were determined as poor predictors of the target variable). Additionally, I may choose to apply feature importance to the non-binary categorical data to do some feature selection if the model is having difficulty training.

Additionally, I may choose to performing some preprocessing steps on the numerical data depending on the performance of the model.

### Metrics
For the Kaggle competition, the scoring is done via area under the ROC curve

In [3]:
train_df.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,...,ONGOING_CREDIT,TOTAL_CLOSED,CLOSED_CREDIT,MONTHS_BALANCE,SK_DPD,NET_PAID,PREV_CREDIT,DAYS_TERMINATION,NET_PAYMENT,PAYMENT_TIME
0,100002,1,0,1,0,202500.0,406597.5,0,7,4,...,2.0,383067.0,6.0,0.0,0.0,0.0,3402045.0,-17.0,0.0,-388.0
1,100003,0,0,0,0,270000.0,1293502.5,1,4,1,...,1.0,207400.5,3.0,0.0,0.0,0.0,10159641.0,-527.0,0.0,-179.0
2,100004,0,1,1,0,67500.0,135000.0,0,7,4,...,0.0,189037.8,2.0,0.0,0.0,0.0,60318.0,-714.0,0.0,-23.0
3,100006,0,0,1,0,135000.0,312682.5,0,7,4,...,0.0,0.0,0.0,-1.0,0.0,0.0,3745395.0,0.0,0.0,-310.0
4,100007,0,0,1,0,121500.0,513000.0,0,7,4,...,0.0,146250.0,1.0,0.0,0.0,0.0,9733149.0,0.0,-29857.365,-240.0


In [4]:
# Define the categorical and numerical column

# Define the binary categories
BINARY_CAT = ['NAME_CONTRACT_TYPE', 'FLAG_OWN_REALTY', 'NAME_TYPE_SUITE', 'REG_REGION_NOT_LIVE_REGION', 'VALID_MOBILE']

# Define the non-binary categories
NON_BIN_CAT = ['NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE']

# Define all the categorical columns
CATEGORICAL = BINARY_CAT + NON_BIN_CAT

# Define the numerical columns. All the dtype float64s are numerical columns
NUMERICAL = [col for col in train_df if train_df[col].dtype == 'float64']

# I'll define the additional numerical columns here
ADD_NUMERICAL = ['CNT_CHILDREN', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_ID_PUBLISH', 'DOCUMENTS']

# Define the complete numerical list
NUMERICAL += ADD_NUMERICAL

In [5]:
# Define the columns to remove that were deemed as poor predictors
TO_REMOVE = ['OWN_CAR_AGE', 'ENQUIRIES', 'SK_DPD', 'DAYS_TERMINATION', 'VALID_MOBILE']

# Remove the columns from the training data
train_df.drop(TO_REMOVE, axis=1, inplace=True)

# Remove the columns from the test data
test_df.drop(TO_REMOVE, axis=1, inplace=True)

In [6]:
# Remove poor predictors from CATEGORICAL list
CATEGORICAL.remove('VALID_MOBILE')

# Remove poor predictors from NUMERICAL list
for col in ['OWN_CAR_AGE', 'ENQUIRIES', 'SK_DPD', 'DAYS_TERMINATION']:
    NUMERICAL.remove(col)

In [7]:
# Remove SK_ID_cURR??? since it's supposed to be the index not a training variable

# Split the training data

# Split the data into X(variables) and y(target)
X = train_df.drop(['SK_ID_CURR', 'TARGET'], axis=1)
y = train_df['TARGET']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [8]:
# # Split the training data

# # Split the data into X(variables) and y(target)
# X = train_df.drop('TARGET', axis=1)
# y = train_df['TARGET']

# # Split the data into training and test sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [9]:
# Define a function to match the sample submission format
def submission(prediction):
    '''Creates the dataframe for the predictions and proper formatting'''
    submit = test_df[['SK_ID_CURR']].copy()
    submit['TARGET'] = prediction
    return submit

In [10]:
# Use FunctionTransformer to retrieve the categorical data and numerical data
get_categorical = FunctionTransformer(lambda x: x[CATEGORICAL], validate=False)

get_numerical = FunctionTransformer(lambda x: x[NUMERICAL], validate=False)

In [11]:
# Let's see if the function transformers work
assert len(CATEGORICAL) == len(get_categorical.fit_transform(train_df).columns)
assert len(NUMERICAL) == len(get_numerical.fit_transform(train_df).columns)

In [12]:
# Create a FeatureUnion with a nested pipeline
process_join_features = FeatureUnion(
        transformer_list=[
            ('numeric', Pipeline([
                ('selector', get_numerical),
                ('scaler', MinMaxScaler())
            ])),
            ('categorical', Pipeline([
                ('selector', get_categorical),
                ('encoder', OneHotEncoder())
            ]))
        ])

#### LogisticRegression

In [13]:
# Create the Pipeline with LogisticRegression
lrPipeline = Pipeline([
    ('union', process_join_features),
    ('clf', LogisticRegression())
])

# Fit the Pipeline with the training data
lrPipeline.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('union', FeatureUnion(n_jobs=1,
       transformer_list=[('numeric', Pipeline(memory=None,
     steps=[('selector', FunctionTransformer(accept_sparse=False,
          func=<function <lambda> at 0x00000177328EAC80>, inv_kw_args=None,
          inverse_func=None, kw_args=None, pass_y='deprecat...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [14]:
predicted_train_values = lrPipeline.predict_proba(X_train)[:, -1]
predicted_test_values = lrPipeline.predict_proba(X_test)[:, -1]

In [15]:
print (roc_auc_score(y_test, predicted_test_values))

0.6816438927778861


In [16]:
# Generate the predictions from the pipeline
lrPredicted = lrPipeline.predict_proba(test_df)[:, -1]

# Create the submission dataframe
lrPredictions = submission(lrPredicted)

# Write the predictions to a csv file
lrPredictions.to_csv('./Submissions//lrPredictions.csv', encoding='utf-8', index=False)

#### Decision Tree

In [17]:
# Create the Pipeline with DecisionTreeClassifier
dtPipeline = Pipeline([
    ('union', process_join_features),
    ('clf', DecisionTreeClassifier())
])

# Fit the Pipeline with the training data
dtPipeline.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('union', FeatureUnion(n_jobs=1,
       transformer_list=[('numeric', Pipeline(memory=None,
     steps=[('selector', FunctionTransformer(accept_sparse=False,
          func=<function <lambda> at 0x00000177328EAC80>, inv_kw_args=None,
          inverse_func=None, kw_args=None, pass_y='deprecat...      min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'))])

In [18]:
predicted_train_values = dtPipeline.predict_proba(X_train)[:, -1]
predicted_test_values = dtPipeline.predict_proba(X_test)[:, -1]

In [19]:
print (roc_auc_score(y_test, predicted_test_values))

0.525174506019423


In [20]:
# Generate the predictions from the pipeline
dtPredicted = dtPipeline.predict_proba(test_df)[:, -1]

# Create the submission dataframe
dtPredictions = submission(dtPredicted)

# Write the predictions to a csv file
dtPredictions.to_csv('./Submissions//dtPredictions.csv', encoding='utf-8', index=False)

#### RandomForestClassifier

In [21]:
# Create the Pipeline with RandomForestClassifier
rfPipeline = Pipeline([
    ('union', process_join_features),
    ('clf', RandomForestClassifier())
])

# Fit the Pipeline with the training data
rfPipeline.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('union', FeatureUnion(n_jobs=1,
       transformer_list=[('numeric', Pipeline(memory=None,
     steps=[('selector', FunctionTransformer(accept_sparse=False,
          func=<function <lambda> at 0x00000177328EAC80>, inv_kw_args=None,
          inverse_func=None, kw_args=None, pass_y='deprecat...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [22]:
predicted_train_values = rfPipeline.predict_proba(X_train)[:, -1]
predicted_test_values = rfPipeline.predict_proba(X_test)[:, -1]

In [23]:
print (roc_auc_score(y_test, predicted_test_values))

0.6007047529465207


In [24]:
# Generate the predictions from the pipeline
rfPredicted = rfPipeline.predict_proba(test_df)[:, -1]

# Create the submission dataframe
rfPredictions = submission(rfPredicted)

# Write the predictions to a csv file
rfPredictions.to_csv('./Submissions//rfPredictions.csv', encoding='utf-8', index=False)

 #### LinearSVC

In [25]:
# Create the Pipeline with LinearSVC
svcPipeline = Pipeline([
    ('union', process_join_features),
    ('clf', LinearSVC())
])

# Fit the Pipeline with the training data
svcPipeline.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('union', FeatureUnion(n_jobs=1,
       transformer_list=[('numeric', Pipeline(memory=None,
     steps=[('selector', FunctionTransformer(accept_sparse=False,
          func=<function <lambda> at 0x00000177328EAC80>, inv_kw_args=None,
          inverse_func=None, kw_args=None, pass_y='deprecat...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [26]:
predicted_train_values = svcPipeline.decision_function(X_train)#[:, -1]
predicted_test_values = svcPipeline.decision_function(X_test)#[:, -1]

In [27]:
predicted_train_values

array([-0.80051466, -0.91324142, -0.65885526, ..., -0.80803819,
       -0.91029166, -0.82523187])

In [28]:
print (roc_auc_score(y_test, predicted_test_values))

0.6814275258498907


In [29]:
# Generate the predictions from the pipeline
svcPredicted = svcPipeline.decision_function(test_df)#[:, -1]

# Create the submission dataframe
svcPredictions = submission(svcPredicted)

In [30]:
# Normalize the predictions
svcPredictions['TARGET'] =(svcPredictions['TARGET']-svcPredictions['TARGET'].
                min())/(svcPredictions['TARGET'].max()-svcPredictions['TARGET'].min())

In [31]:
svcPredictions.describe()

Unnamed: 0,SK_ID_CURR,TARGET
count,48744.0,48744.0
mean,277796.67635,0.387691
std,103169.547296,0.104054
min,100001.0,0.0
25%,188557.75,0.283527
50%,277549.0,0.423152
75%,367555.5,0.465282
max,456250.0,1.0


In [32]:
# Write the predictions to a csv file
svcPredictions.to_csv('./Submissions//svcPredictions.csv', encoding='utf-8', index=False)