In [9]:
import pandas as pd
import numpy as np
import datetime

import xgboost as xgb
import sklearn
from sklearn.metrics import roc_auc_score

from sklearn.model_selection import train_test_split

%matplotlib inline

In [2]:
train = pd.read_csv('train.csv', '\t', index_col='Unnamed: 0')
test = pd.read_csv('test.csv', '\t', index_col='Unnamed: 0')
X = train.drop('0', axis=1).as_matrix()
y = train['0']
X_test = test.drop('0', axis=1).as_matrix()
X.shape

(30500, 345)

In [30]:
train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,336,337,338,339,340,341,342,343,344,345
0,1,1,0,0,0,0,0,0.090909,0,0,...,0.221395,0,1,0,0,0.222222,1,1,1,1
1,1,1,0,0,1,0,0,0.090909,0,0,...,0.241508,0,1,0,0,0.111111,1,1,1,0
2,0,1,0,0,1,0,0,0.090909,0,0,...,0.123067,0,1,0,0,0.444444,1,1,1,1
3,0,1,0,0,1,0,0,0.136364,0,0,...,0.296065,0,0,1,0,0.222222,1,1,1,0
4,0,1,0,0,1,0,0,0.136364,0,0,...,0.178956,0,0,1,0,0.111111,1,1,1,1


In [3]:
from sklearn.model_selection import train_test_split
X_train, X_validate, y_train, y_validate = train_test_split(X, y, test_size=0.33, random_state=42)

### Logistic regression 

In [31]:
from sklearn.linear_model import LogisticRegression


In [39]:
print ('%8s %8s %4s' % ('Train','Validate','C'))
for s in ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']:
#for C in np.arange(1, 30, 1):
    LR = LogisticRegression(C=100,penalty='l2', solver = s)
    LR.fit(X_train, y_train)
    AUC_train = roc_auc_score(y_train, LR.predict_proba(X_train)[:,1])
    AUC_validate = roc_auc_score(y_validate, LR.predict_proba(X_validate)[:,1])
    print('%.6f %.6f %s' %  (AUC_train,AUC_validate,s))

   Train Validate    C
0.795459 0.728902 newton-cg
0.795459 0.728901 lbfgs
0.795459 0.728901 liblinear
0.795459 0.728901 sag
0.795459 0.728901 saga


In [40]:
LR_best = LogisticRegression(C=100,penalty='l2')
LR_best.fit(X, y)
ans = LR.predict_proba(X_test)[:,1]

In [42]:
test_pd = pd.DataFrame(ans,columns=['_VAL_'])
test_pd.index.names = ['_ID_']
test_pd.to_csv('logreg.csv')
print(test_pd.shape)
test_pd.head(5)

(4166, 1)


Unnamed: 0_level_0,_VAL_
_ID_,Unnamed: 1_level_1
0,0.086063
1,0.531248
2,0.138917
3,0.379277
4,0.640003


### XGBoost 

In [4]:
import xgboost as xgb

In [5]:
bst = xgb.XGBClassifier()

In [6]:
dtrain = xgb.DMatrix(X_train, y_train)
xgtrain = xgb.DMatrix(X_train)
dvalidate = xgb.DMatrix(X_validate)

D = xgb.DMatrix(X, y)
X_vals = xgb.DMatrix(X)
dtest = xgb.DMatrix(X_test)

In [10]:
learning_rate = np.arange(0.06, 0.15, 0.01)
for s in np.arange(0.1, 1.4, 0.1):
    param = {'max_depth':3, 'min_child_weight':3, 'subsample':0.7,
                 'colsample_bytree':0.07, 'eta':0.08, 'silent':1, 'objective':'binary:logistic' }
    num_rounds = 180
    bst = xgb.train(param, dtrain, num_rounds)
    AUC_train = roc_auc_score(y_train, bst.predict(xgtrain))
    AUC_validate = roc_auc_score(y_validate, bst.predict(dvalidate))
    print('%.6f %.6f m = %f' %  (AUC_train, AUC_validate, s))

0.774616 0.728245 m = 0.100000
0.774616 0.728245 m = 0.200000
0.774616 0.728245 m = 0.300000
0.774616 0.728245 m = 0.400000
0.774616 0.728245 m = 0.500000
0.774616 0.728245 m = 0.600000
0.774616 0.728245 m = 0.700000
0.774616 0.728245 m = 0.800000
0.774616 0.728245 m = 0.900000
0.774616 0.728245 m = 1.000000
0.774616 0.728245 m = 1.100000
0.774616 0.728245 m = 1.200000
0.774616 0.728245 m = 1.300000


In [11]:
param = {'max_depth':3, 'min_child_weight':3, 'subsample':0.7, 'colsample_bytree':0.7,
         'eta':0.08, 'silent':1, 'objective':'binary:logistic', 'seed': 666 }
num_round = 180
xgb_best = xgb.train(param, dtrain, num_round)

In [12]:
import numpy as np
from sklearn.metrics import roc_auc_score
y_scores = xgb_best.predict(dvalidate)
y_train_scores = xgb_best.predict(xgtrain)
roc_auc_score(y_validate, y_scores)

0.7290121360427332

In [13]:
xgb_best = xgb.train(param, D, num_round)
y_res = xgb_best.predict(dtest)
y_p = xgb_best.predict(X_vals)

In [14]:
#Сохраняем результат в csv
test_pd = pd.DataFrame(y_res,columns=['_VAL_'])
test_pd.index.names = ['_ID_']
test_pd.to_csv('xgb.csv')
print(test_pd.shape)
test_pd.head(5)

(4166, 1)


Unnamed: 0_level_0,_VAL_
_ID_,Unnamed: 1_level_1
0,0.108647
1,0.412091
2,0.173746
3,0.336561
4,0.467857


In [15]:
y_res

array([0.10864674, 0.4120909 , 0.17374647, ..., 0.06989857, 0.09565405,
       0.15100183], dtype=float32)

In [16]:
range(1, 300, 25)

[1, 26, 51, 76, 101, 126, 151, 176, 201, 226, 251, 276]

### CatBoost

In [18]:
from catboost import CatBoostClassifier

In [23]:
model = CatBoostClassifier(iterations=6000, learning_rate=0.1, depth=6, loss_function='Logloss', logging_level='Silent')
#model = CatBoostClassifier(iterations=60, depth=8, learning_rate=0.3, loss_function='Logloss', logging_level='Silent')
#train the model
model.fit(X_train, y_train)
# make the prediction using the resulting model
preds_class = model.predict(X_validate)
preds_proba = model.predict_proba(X_validate)
print roc_auc_score(y_validate, preds_class), roc_auc_score(y_validate, preds_class)

0.5732862534499384 0.5732862534499384


### LightGBM

In [24]:
import lightgbm as lgb

In [25]:
X_train_l = X_train
X_validate_l = X_validate
X_l = X
X_test_l = X_test
#add xgboost outputs
X_train_l[:,:-1] = y_train_scores.reshape(X_train_l.shape[0], 1)
X_validate_l[:,:-1] = y_scores.reshape(X_validate_l.shape[0], 1)
X_l[:,:-1] = y_p.reshape(X_l.shape[0], 1)
X_test_l[:,:-1] = y_res.reshape(X_test_l.shape[0], 1)
#add logistic regression outputs
X_train_l[:,:-1] = y_train_scores.reshape(X_train_l.shape[0], 1)
X_validate_l[:,:-1] = y_scores.reshape(X_validate_l.shape[0], 1)
X_l[:,:-1] = y_p.reshape(X_l.shape[0], 1)
X_test_l[:,:-1] = y_res.reshape(X_test_l.shape[0], 1)

In [26]:
for d in np.arange(0.1, 1.1, 0.1):
    gbm = lgb.LGBMRegressor(objective='binary', max_depth = 4, bagging_fraction = 1,
                        num_leaves=8, feature_fraction = 0.5, lambda_l2 = 0.8,
                        learning_rate=0.05,
                        n_estimators=70, verbose = 0)
    gbm.fit(X_train_l, y_train)
    y_pred = gbm.predict(X_validate_l, num_iteration=gbm.best_iteration_)
    print roc_auc_score(y_validate, y_pred), d

0.7283492681469628 0.1
0.7283492681469628 0.2
0.7283492681469628 0.30000000000000004
0.7283492681469628 0.4
0.7283492681469628 0.5
0.7283492681469628 0.6
0.7283492681469628 0.7000000000000001
0.7283492681469628 0.8
0.7283492681469628 0.9
0.7283492681469628 1.0


In [27]:
gbm = lgb.LGBMRegressor(objective='binary', max_depth = 4, bagging_fraction = 1,
                        num_leaves=8, feature_fraction = 0.5, lambda_l2 = 0.8,
                        learning_rate=0.05,
                        n_estimators=70, verbose = 0)
gbm.fit(X_l, y)
y_pred = gbm.predict(X_test_l, num_iteration=gbm.best_iteration_)

In [28]:
#Сохраняем результат в csv
test_pd = pd.DataFrame(y_pred, columns=['_VAL_'])
test_pd.index.names = ['_ID_']
test_pd.to_csv('lgb.csv')
print(test_pd.shape)
test_pd.head(5)

(4166, 1)


Unnamed: 0_level_0,_VAL_
_ID_,Unnamed: 1_level_1
0,0.103342
1,0.469832
2,0.176692
3,0.393345
4,0.603982


In [24]:
y_train_scores.reshape(20435, 1)

array([[ 0.03102162],
       [ 0.21062025],
       [ 0.121241  ],
       ..., 
       [ 0.13546936],
       [ 0.08220468],
       [ 0.08715096]], dtype=float32)

In [37]:
range(1, 60)

[1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59]

In [52]:
X_l.shape[0]

30500