# Description
This notebook builds two baseline models (logistic regression and random forest) using only the application table.

In [1]:
import numpy as np
import pandas as pd

import os
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Show the entries of the input directory
print(os.listdir(path="../input/"))

In [3]:
# Read the application training table and preview its first rows
app_train = pd.read_csv(
    filepath_or_buffer='../input/home-credit-default-risk/application_train.csv'
)
print('Training data shape: ', app_train.shape)
app_train.head(n=5)

In [4]:
# Read the application testing table and preview its first rows
app_test = pd.read_csv(
    filepath_or_buffer='../input/home-credit-default-risk/application_test.csv'
)
print('Testing data shape: ', app_test.shape)
app_test.head(n=5)

In [5]:
# Number of training rows for each distinct TARGET value
app_train.loc[:, 'TARGET'].value_counts()

In [6]:
# Plot the TARGET values (cast to int) as a histogram
app_train['TARGET'].astype(int).plot(kind='hist')

In [7]:
# Calculate missing values by column
def missing_values_table(df):
    """Summarize the missing values of each column of a dataframe.

    Prints the number of columns in ``df`` and how many of them contain at
    least one missing value.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column that has missing values, indexed by column name,
        with the columns 'Missing Values' (count) and '% of Total Values'
        (percentage of rows, rounded to one decimal), sorted by percentage
        in descending order.
    """
    # Total missing values of each column
    mis_val = df.isnull().sum()

    # Percentage of missing values (NaN when the dataframe has zero rows)
    mis_val_percent = 100 * mis_val / len(df)

    # Build the table with its final column names in one step
    mis_val_table = pd.concat(
        [mis_val, mis_val_percent],
        axis=1,
        keys=['Missing Values', '% of Total Values'],
    )

    # Keep only columns that actually have missing entries. Filtering on the
    # count rather than the percentage keeps a zero-row dataframe (0/0 -> NaN
    # percentages, and NaN != 0) from reporting every column as incomplete.
    mis_val_table = mis_val_table[mis_val_table['Missing Values'] != 0]

    # Sort the table by percentage of missing descending
    mis_val_table = mis_val_table.sort_values(
        '% of Total Values', ascending=False).round(1)

    # Print some summary information
    print(
        f"Your selected dataframe has {df.shape[1]} columns.\n"
        f"There are {mis_val_table.shape[0]} columns that have missing values."
    )

    # Return the dataframe with missing information
    return mis_val_table

In [8]:
# Build the missing-value summary for the training table and show its top 20 rows
missing_values = missing_values_table(df=app_train)
missing_values.head(n=20)

In [9]:
# Distribution of the per-column missing percentages
missing_values['% of Total Values'].plot(kind='hist')

In [10]:
# Number of columns of each dtype in the training table
app_train.dtypes.value_counts()

In [11]:
# Count the distinct (non-missing) values in every object-typed column
app_train.select_dtypes(include='object').nunique(axis=0)

In [12]:
from sklearn.preprocessing import LabelEncoder

# One encoder instance, refit for every qualifying column
le = LabelEncoder()
le_count = 0

for col in app_train:
    # Only object-typed columns are candidates
    if app_train[col].dtype != 'object':
        continue
    # Columns with more than two distinct values are not label encoded here
    # (a 0/1 code is the same as one-hot encoding only for binary columns)
    if len(app_train[col].unique()) > 2:
        continue

    # Learn the mapping on the training column, then apply it to both sets
    le.fit(app_train[col])
    app_train[col] = le.transform(app_train[col])
    app_test[col] = le.transform(app_test[col])

    # Running total of encoded columns
    le_count += 1

print('%d columns were label encoded.' % le_count)

In [13]:
# Expand the remaining categorical columns into indicator columns
app_train = pd.get_dummies(data=app_train)
app_test = pd.get_dummies(data=app_test)

# Even leaving the target column aside, the two tables end up with a
# different number of columns, since some category values occur only in
# the training data
print('Training Features shape: ', app_train.shape)
print('Testing Features shape: ', app_test.shape)

In [14]:
# Set the labels aside: the inner join below keeps only columns shared by
# both tables, so TARGET has to be re-attached afterwards
train_labels = app_train['TARGET']

# Restrict both tables to their common columns
app_train, app_test = app_train.align(other=app_test, join='inner', axis=1)

# Re-attach the labels to the training table
app_train['TARGET'] = train_labels

print('Training Features shape: ', app_train.shape)
print('Testing Features shape: ', app_test.shape)

In [15]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer

# Feature matrix for training: everything except the label column.
# errors='ignore' returns a plain copy when TARGET is already absent,
# so this cell stays re-runnable.
train = app_train.drop(columns=['TARGET'], errors='ignore')

# NOTE(review): SK_ID_CURR is still among these columns (it is selected from
# app_test for the submission later on), so the ID is fed to the models as a
# feature - confirm this is intended.

# Feature names
features = list(train.columns)

# Median imputation of missing values
imputer = SimpleImputer(strategy='median')

# Scale each feature to 0-1
scaler = MinMaxScaler(feature_range=(0, 1))

# Learn the medians on the training data only, then apply them to both sets.
# transform() returns a new array, so no defensive copy of app_test is needed.
train = imputer.fit_transform(train)
test = imputer.transform(app_test)

# Same pattern for the scaler: fit on train, apply to train and test
train = scaler.fit_transform(train)
test = scaler.transform(test)

print('Training data shape: ', train.shape)
print('Testing data shape: ', test.shape)

In [16]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with the regularization parameter C set to 1e-4
log_reg = LogisticRegression(C=1e-4)

# Fit on the preprocessed training matrix and its labels
log_reg.fit(X=train, y=train_labels)

In [17]:
# Predict class probabilities for the test matrix and keep only the
# second column (index 1)
log_reg_pred = log_reg.predict_proba(X=test)[:, 1]

In [18]:
# Submission dataframe. Take an explicit copy of the ID column so that adding
# TARGET is not a chained assignment on a selection of app_test (pandas'
# SettingWithCopyWarning, which the global warnings filter would hide).
submit = app_test[['SK_ID_CURR']].copy()
submit['TARGET'] = log_reg_pred

submit.head()

In [19]:
# Write the logistic regression submission to disk without the index column
submit.to_csv(path_or_buf='log_reg_baseline.csv', index=False)

Private Score: 0.68426, Public Score: 0.68156

In [20]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 100 trees, a fixed seed, progress output and all cores
random_forest = RandomForestClassifier(
    n_estimators=100,
    random_state=233,
    verbose=1,
    n_jobs=-1,
)

# Fit on the preprocessed training matrix and its labels
random_forest.fit(X=train, y=train_labels)

# Predict class probabilities for the test matrix, keeping the second column
rf_pred = random_forest.predict_proba(X=test)[:, 1]

In [21]:
# Submission dataframe. Take an explicit copy of the ID column so that adding
# TARGET is not a chained assignment on a selection of app_test (pandas'
# SettingWithCopyWarning, which the global warnings filter would hide).
submit = app_test[['SK_ID_CURR']].copy()
submit['TARGET'] = rf_pred

submit.head()

In [22]:
# Write the random forest submission to disk without the index column
submit.to_csv(path_or_buf='rf_baseline.csv', index=False)

Private Score: 0.69141, Public Score: 0.68358