In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import xgboost as xgb
import warnings

from sklearn.datasets import dump_svmlight_file
from sklearn.model_selection import KFold
from sklearn.metrics import precision_score, recall_score, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import Imputer

warnings.filterwarnings('ignore')
plt.style.use('fivethirtyeight')

In [2]:
# Read training and test data
train = pd.read_csv('./input/application_train.csv')
test = pd.read_csv('./input/application_test.csv')

print('Training data shape: ', train.shape)
print('Testing data shape: ', test.shape)

Training data shape:  (307511, 122)
Testing data shape:  (48744, 121)


In [3]:
# Move the target to the last
train = train[[c for c in train if c not in ['TARGET']] + ['TARGET']]

# Split data to features and target
X_train, y_train = train.iloc[:, :-1], train.iloc[:, -1]
X_test = test.iloc[:, :]

In [4]:
# Check the target counts
y_train.value_counts()

0    282686
1     24825
Name: TARGET, dtype: int64

In [5]:
print(type(X_train))

<class 'pandas.core.frame.DataFrame'>


## Data Processing

### Label Encoding

In [6]:
# Create a label encoder object
le = LabelEncoder()
le_count = 0

# Iterate through the columns
for col in X_train:
    if X_train[col].dtype == 'object':
        # If 2 or fewer unique categories
        if len(list(X_train[col].unique())) <= 2:
            # Train on the training data
            le.fit(X_train[col])
            # Transform both training and testing data
            X_train[col] = le.transform(X_train[col])
            X_test[col] = le.transform(X_test[col])
            
            # Keep track of how many columns were label encoded
            le_count += 1
            
print('%d columns were label encoded.' % le_count)

3 columns were label encoded.


In [7]:
# One-hot encoding of categorical variables
X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test)

print('Training Features shape: ', X_train.shape)
print('Testing Features shape: ', X_test.shape)

Training Features shape:  (307511, 242)
Testing Features shape:  (48744, 239)


In [8]:
# Align the training and testing data, keep only columns present in both dataframes
X_train, X_test = X_train.align(X_test, join='inner', axis=1)

print('Training Features shape: ', X_train.shape)
print('Testing Features shape: ', X_test.shape)
print(type(X_train))

Training Features shape:  (307511, 239)
Testing Features shape:  (48744, 239)
<class 'pandas.core.frame.DataFrame'>


### Imputing

In [9]:
# Median imputation of missing values
imputer = Imputer(strategy='median')

In [10]:
# Fit on the training data
imputer.fit(X_train)

# Transform both training and testing data
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)

print(type(X_train), type(y_train))

<class 'numpy.ndarray'> <class 'pandas.core.series.Series'>


In [11]:
# Converting y_test to a numpy array
y_train = y_train.as_matrix()

print(type(y_train))

<class 'numpy.ndarray'>


## Training

In [14]:
if not os.path.exists('dtrain_sample.svm'):
    dump_svmlight_file(X_train, y_train, 'dtrain_sample.svm', zero_based=True)

dtrain = xgb.DMatrix('dtrain_sample.svm')

In [15]:
# Training parameters (Random)
num_round = 30
param = {
    'max_depth': 3,
    'eta': 0.05,
    'silent': 1}

In [16]:
# Make a model
model = xgb.train(param, dtrain, num_round)
model.save_model('sample_model')

## Prediction

In [17]:
dtest = xgb.DMatrix(X_test)

In [18]:
pred = model.predict(dtest)
print(pred.shape)

(48744,)


## Submission

In [19]:
# Make a submission dataframe
submission = test[['SK_ID_CURR']]
submission['TARGET'] = pred

0.136136    1483
0.134515    1252
0.147259    1196
0.130352     874
0.145638     861
0.131974     845
0.162887     766
0.146857     761
0.137001     741
0.155549     682
0.148124     677
0.142117     645
0.143097     595
0.135380     595
0.141475     578
0.145235     541
0.195533     480
0.140496     480
0.161265     477
0.146503     449
0.131217     426
0.153928     410
0.175558     409
0.132839     367
0.161302     339
0.149151     337
0.152590     330
0.136333     328
0.171565     317
0.142695     315
            ... 
0.208335       1
0.196043       1
0.208343       1
0.260451       1
0.392236       1
0.242999       1
0.243007       1
0.291438       1
0.196197       1
0.211818       1
0.231985       1
0.176892       1
0.199061       1
0.200730       1
0.304375       1
0.260070       1
0.291337       1
0.260098       1
0.169478       1
0.216345       1
0.145686       1
0.338926       1
0.183438       1
0.208193       1
0.239445       1
0.192571       1
0.216326       1
0.273132      

In [20]:
# Save the submission file
submission.to_csv("submission.csv", index=False) # Scored 0.702