## Imports

In [1]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing.imputation import Imputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.cross_validation import StratifiedKFold



## Read CSVs

In [3]:
import os

# Free-text columns that are discarded right after loading.
to_drop = ["job_name", "reason"]

# The training file has to be uploaded by hand because it is not available here.
df_train = (
    pd.read_csv("train_data.csv")
    .set_index("ids")
    .drop(to_drop, axis=1)
    # Rows without a label cannot be used for supervised training.
    .dropna(subset=["default"])
    # Casting inside the chain (instead of assigning a column on a filtered
    # slice) avoids pandas' SettingWithCopyWarning.
    .astype({"default": "int"})
)
df_test = pd.read_csv("teste_data.csv").set_index("ids").drop(to_drop, axis=1)
print((df_train.shape, df_test.shape))

((47952, 24), (12014, 23))


## Missing values, dtypes and unique counts

In [4]:
# Columns stored as `object` are the ones that get count-encoded later on.
encode_cols = [
    name for name, dtype in zip(df_train.columns, df_train.dtypes) if dtype == object
]

# stats: per column -> share of missing values, dtype, number of distinct values
missing_share = df_train.isnull().mean()
column_dtypes = df_train.dtypes
distinct_counts = df_train.apply(pd.Series.nunique)
print(pd.concat([missing_share, column_dtypes, distinct_counts], axis=1))

                           0        1      2
default             0.000000    int64      2
score_1             0.000000   object      7
score_2             0.000000   object     35
score_3             0.000000  float64     87
score_4             0.000000  float64  47952
score_5             0.000000  float64  47952
score_6             0.000000  float64  47952
risk_rate           0.000000  float64     79
amount_borrowed     0.000000  float64  41588
borrowed_in_months  0.000000  float64      2
credit_limit        0.312813  float64  21804
income              0.000000  float64  44196
sign                0.316546   object     12
gender              0.049570   object      2
facebook_profile    0.099954   object      2
state               0.000000   object     50
zip                 0.000000   object    823
channel             0.000000   object      1
real_state          0.000000   object      5
ok_since            0.586044  float64    101
n_bankruptcies      0.003441  float64      7
n_defaulte

## Count Encoder

In [5]:
def get_encoder(df, col):
    """Build a count encoder for a single column.

    Values are cast to ``str`` before counting, so the keys of the returned
    mapping are always strings.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to encode. It is not modified.
    col : str
        Name of the column whose values are counted.

    Returns
    -------
    dict
        Maps each distinct stringified value of ``df[col]`` to the number of
        rows in which it occurs.
    """
    # Counting directly on the Series avoids adding a helper "count" column,
    # which overwrote the data (and ended in a KeyError) whenever `col` itself
    # was named "count".
    return df[col].astype(str).value_counts().to_dict()
    
def encode_all(df_train, df_test, cols):
    """Count-encode `cols` in both frames using counts from the training frame.

    Every value in a listed column is replaced by the number of times its
    string form occurs in ``df_train``. Values that never occur in the
    training frame are encoded as ``-1``.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Frame the counts are computed from.
    df_test : pandas.DataFrame
        Frame encoded with the training counts.
    cols : iterable of str
        Names of the columns to encode; each must exist in both frames.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(encoded_train, encoded_test)``.
    """
    # Work on copies so the caller's frames are left untouched; writing into a
    # frame that is itself a filtered slice also triggers SettingWithCopyWarning.
    df_train, df_test = df_train.copy(), df_test.copy()
    for col in cols:
        # Counts come from the (still un-encoded) training column only.
        enc = get_encoder(df_train, col)
        df_train[col] = df_train[col].astype(str).apply(lambda x: enc.get(x, -1))
        df_test[col] = df_test[col].astype(str).apply(lambda x: enc.get(x, -1))
    return df_train, df_test

In [6]:
# Count-encode the object columns, then mark the remaining missing values with -1.
# NOTE: this cell rebinds df_train / df_test, so running it a second time would
# re-encode the already encoded columns -- re-run from the read cell instead.
df_train, df_test = encode_all(df_train, df_test, encode_cols)
df_train = df_train.fillna(-1)
df_test = df_test.fillna(-1)
df_train.head()

Unnamed: 0_level_0,default,score_1,score_2,score_3,score_4,score_5,score_6,risk_rate,amount_borrowed,borrowed_in_months,...,facebook_profile,state,zip,channel,real_state,ok_since,n_bankruptcies,n_defaulted_loans,n_accounts,n_issues
ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
c017198e-1d86-04e5-4639-5883c12f34af,0,6829,1221,300.0,98.943966,0.666203,84.552862,0.57,5269.87,60.0,...,27252,418,75,47952,23674,70.0,0.0,0.0,10.0,-1.0
ebd7fb8a-9413-4c09-e387-2781cb1a04aa,0,6829,1546,280.0,98.801402,0.158028,111.156002,0.44,35021.99,36.0,...,27252,3633,398,47952,23674,45.0,0.0,0.0,11.0,11.0
4192b843-bb6b-3e8d-c0fc-791ca9b69894,0,16338,4008,420.0,107.506083,0.952076,93.556293,0.37,20027.94,60.0,...,27252,543,61,47952,23674,34.0,0.0,0.0,14.0,14.0
63b5db01-9c03-28e9-a968-fc2709533e3f,0,16338,2610,400.0,104.015198,0.269695,98.517418,0.31,7216.22,36.0,...,15907,1913,427,47952,20284,64.0,1.0,0.0,4.0,4.0
e89fd51d-796b-a12e-92ba-de710f5a10e0,0,1240,376,390.0,99.872055,0.753891,95.456606,0.51,16019.44,60.0,...,27252,153,37,47952,20284,-1.0,0.0,0.0,11.0,11.0


## Random Forest

In [7]:
# Target is the `default` column; every other column is used as a feature.
y_train = df_train["default"]
X_train = df_train.drop("default", axis=1)
X_test = df_test

clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=7,
    max_features=10,
    oob_score=True,
    random_state=42,
)

In [8]:
# `sklearn.cross_validation` (and its StratifiedKFold(y, n_folds) signature) was
# deprecated in scikit-learn 0.18 and removed in 0.20; use the model_selection
# splitter instead. Imported here so this cell works on its own.
from sklearn.model_selection import StratifiedKFold

# Materialise the (train_idx, test_idx) pairs as a list so the folds can be
# consumed with `enumerate(skf)` and iterated more than once.
# NOTE(review): fold membership may differ from the legacy splitter, so the
# per-fold AUCs can shift slightly compared with earlier runs.
skf = list(
    StratifiedKFold(n_splits=3, shuffle=True, random_state=42).split(X_train, y_train)
)

## Cross Validation

In [9]:
# Fit on each training fold and score the held-out fold with ROC AUC.
aucs = []
for fold, (i_train, i_test) in enumerate(skf):
    X_fit, y_fit = X_train.iloc[i_train], y_train.iloc[i_train]
    X_val, y_val = X_train.iloc[i_test], y_train.iloc[i_test]

    clf.fit(X_fit, y_fit)
    i_pred_proba = clf.predict_proba(X_val)
    print(i_pred_proba.shape)

    # Column 1 of predict_proba is scored against the held-out labels.
    auc = roc_auc_score(y_val, i_pred_proba[:, 1])
    aucs.append(auc)
    print("AUC score on fold %i: %2.3f" % (fold, auc))

print("AUC: %2.3f +- %2.4f" % (np.mean(aucs), np.std(aucs)))

(15985, 2)
AUC score on fold 0: 0.755
(15984, 2)
AUC score on fold 1: 0.754
(15983, 2)
AUC score on fold 2: 0.761
AUC: 0.757 +- 0.0033


## Final Fit

In [10]:
# Refit on the complete training set: after the CV loop `clf` only holds the
# model fitted on the last fold's training split.
clf.fit(X=X_train, y=y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=7, max_features=10, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=True, random_state=42, verbose=0, warm_start=False)

## Predict and save submission file

In [11]:
# Keep column 1 of predict_proba as the submitted score, one row per test id.
test_proba = clf.predict_proba(X_test)[:, 1]
sub = pd.DataFrame({"prob": test_proba}, index=X_test.index)
sub.to_csv("submission101.csv")
sub

Unnamed: 0_level_0,prob
ids,Unnamed: 1_level_1
5e78f987-3e90-55a9-12f8-423c4d63d254,0.083568
f4f28e96-099f-0ffa-6956-d5befffb7191,0.192798
b93c6a30-ce80-546c-7ee6-6fc7d27320b4,0.114423
54f1cfe1-8239-ba99-e926-61dba5a4b119,0.117629
5be46214-a131-1b23-673f-43aa906fa5d6,0.505498
ed7197c9-d440-9688-7c50-e7ee4538b8db,0.105907
6e4fbdcb-1cc7-0d53-f86a-c6523866431b,0.244716
f1818b94-3fd7-bce1-d227-06ca2b9c932d,0.060245
3238dad5-c7d4-060e-868d-f8de4770a1e8,0.093243
d83383b5-a481-2e3a-49d7-fcf1027a257d,0.116262
