In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn import linear_model, metrics
from sklearn.feature_selection import RFE
from sklearn.metrics import roc_auc_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier 
from sklearn.feature_selection import SelectFromModel

import xgboost as xgb
import lightgbm as lgb

In [3]:
train = pd.read_csv("../data/train.csv")
test = pd.read_csv("../data/test.csv")

# clean and split data
print ("dimention of the traing data"+ str(train.shape))
print ("dimention of the test data"+ str(train.shape))

dimention of the traing data(76020, 371)
dimention of the test data(76020, 371)


In [4]:
# remove constant columns (std = 0)
remove = []
for col in train.columns:
    if train[col].std() == 0:
        remove.append(col)


train.drop(remove, axis=1, inplace=True)
test.drop(remove, axis=1, inplace=True)
print ("removing " + str(len(remove))+ "vars")
print ("dimention of the traing removing 0 sd"+ str(train.shape))
print ("dimention of the test removing 0 sd"+ str(train.shape))

removing 34vars
dimention of the traing removing 0 sd(76020, 337)
dimention of the test removing 0 sd(76020, 337)


In [5]:
# remove duplicated columns
remove_dups = []
cols = train.columns
for i in range(len(cols)-1):
    v = train[cols[i]].values
    for j in range(i+1,len(cols)):
        if np.array_equal(v,train[cols[j]].values):
            remove_dups.append(cols[j])

print ("removing " + str(len(remove_dups))+ "vars")
train.drop(remove_dups, 
           axis=1, inplace=True)
test.drop(remove_dups, axis=1, inplace=True)


print ("dimention of the traing data after duplicated "+ str(train.shape))
print ("dimention of the test data after duplicated "+ str(train.shape))

removing 29vars
dimention of the traing data after duplicated (76020, 308)
dimention of the test data after duplicated (76020, 308)


In [6]:
# split data into train and test
test_id = test.ID
test = test.drop(["ID"],axis=1)

X = train.drop(["TARGET","ID"],axis=1)
y = train.TARGET.values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1729)

print(X_train.shape, X_test.shape, test.shape)

(60816, 306) (15204, 306) (75818, 306)


In [7]:
## # Feature selection
from sklearn.feature_selection import SelectFromModel


clf=lgb.LGBMClassifier()
selector = clf.fit(X_train, y_train)
fs = SelectFromModel(selector, prefit=True)
X_train = fs.transform(X_train)
X_test = fs.transform(X_test)
test = fs.transform(test)

print(X_train.shape, X_test.shape, test.shape)

# embeded_lgb_support = embeded_lgb_selector.get_support()
# embeded_lgb_feature = X.loc[:,embeded_lgb_support].columns.tolist()
# print(str(len(embeded_lgb_feature)), 'selected features')

# clf = RandomForestClassifier(random_state=1729)
# selector = clf.fit(X_train, y_train)
# # clf.feature_importances_ 
# fs = SelectFromModel(selector, prefit=True)

# X_train = fs.transform(X_train)
# X_test = fs.transform(X_test)
# test = fs.transform(test)

# print(X_train.shape, X_test.shape, test.shape)

(60816, 54) (15204, 54) (75818, 54)


In [8]:
## # Train Model
# classifier from xgboost
m2_xgb = xgb.XGBClassifier(n_estimators=110, nthread=-1, seed=1729)
m2_xgb.fit(X_train, y_train, eval_metric="auc",
           eval_set=[(X_test, y_test)])

[0]	validation_0-auc:0.787596
[1]	validation_0-auc:0.791669
[2]	validation_0-auc:0.791738
[3]	validation_0-auc:0.795287
[4]	validation_0-auc:0.799052
[5]	validation_0-auc:0.798715
[6]	validation_0-auc:0.798495
[7]	validation_0-auc:0.798494
[8]	validation_0-auc:0.798624
[9]	validation_0-auc:0.798001
[10]	validation_0-auc:0.799355
[11]	validation_0-auc:0.800553
[12]	validation_0-auc:0.801058
[13]	validation_0-auc:0.801677
[14]	validation_0-auc:0.802055
[15]	validation_0-auc:0.80229
[16]	validation_0-auc:0.803853
[17]	validation_0-auc:0.804323
[18]	validation_0-auc:0.80482
[19]	validation_0-auc:0.804942
[20]	validation_0-auc:0.805213
[21]	validation_0-auc:0.806797
[22]	validation_0-auc:0.811364
[23]	validation_0-auc:0.812214
[24]	validation_0-auc:0.81724
[25]	validation_0-auc:0.814581
[26]	validation_0-auc:0.814684
[27]	validation_0-auc:0.815549
[28]	validation_0-auc:0.817289
[29]	validation_0-auc:0.817601
[30]	validation_0-auc:0.819446
[31]	validation_0-auc:0.819024
[32]	validation_0-auc

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=110,
       n_jobs=1, nthread=-1, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=1729,
       silent=True, subsample=1)

In [9]:
# calculate the auc score
print("Roc AUC: ", roc_auc_score(y_test, m2_xgb.predict_proba(X_test)[:,1],
              average='macro'))

Roc AUC:  0.8327948531163114


In [10]:
d_train = lgb.Dataset(X_train, label=y_train)   # (60816, 306) (15204, 306)
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
}
clf = lgb.train(train_set=d_train, params=params)

In [11]:
# calculate the auc score
print("Roc AUC: ", roc_auc_score(y_test, clf.predict(X_test), average='macro'))

Roc AUC:  0.8298656488269767


In [12]:
## # Submission
probs = clf.predict(test)
print(probs)
submission = pd.DataFrame({"ID":test_id, "TARGET": probs})
submission.to_csv("submission.csv", index=False)

[0.03945041 0.06405031 0.00082802 ... 0.00374245 0.06930675 0.00083459]
