In [3]:
import pandas as pd 
import numpy as np 
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from xgboost import XGBClassifier 
from xgboost import XGBRegressor
from xgboost import plot_importance
import matplotlib.pylab as plt
from matplotlib import pyplot
import mlxtend
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [4]:
dataset = pd.read_csv("train_final.csv")
test_dataset = pd.read_csv("test_final.csv")

In [5]:
# feature_cols = ['f1','f2','f3','f4','f5','f6','f7','f8','f9','f10','f11','f12','f13','f14','f15','f16','f17','f18','f19','f20','f21','f22','f23','f24']
feature_cols = ['f1','f3','f4','f7','f8','f10','f12','f13','f14','f15','f16','f17','f19','f23',]
# remove_feature_cols = ['f2','f5','f6','f9','f11','f18','f20','f21','f22','f24']
features_X = dataset[feature_cols]
target = 'Y'
IDcol = 'Id'
target_Y = dataset['Y']
test_feature = test_dataset[feature_cols]
test_Id = test_dataset['Id']
predictors = [x for x in features_X.columns if x not in [target,IDcol]]


In [64]:
param_test1 = {
    'max_depth':range(16,21,1),
    'min_child_weight':range(1,9,1)
}
gsearch1 = GridSearchCV(estimator = XGBClassifier( 
    learning_rate =0.1, 
    n_estimators=140, 
    max_depth=10,
    min_child_weight=1, 
    gamma=0, 
    subsample=0.8, 
    colsample_bytree=0.15,
    objective= 'binary:logistic', 
    nthread=16, 
    scale_pos_weight=1, 
    seed=42
), 
param_grid = param_test1, scoring='roc_auc',n_jobs=-1,iid=False, cv=10)
gsearch1.fit(dataset[predictors],dataset['Y'])
gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_

({'mean_fit_time': array([5.64948053, 5.35711918, 5.37172909, 5.39833953, 5.01855092,
         4.90761263, 4.68376215, 4.48391681, 5.71319602, 5.49033248,
         5.54104035, 5.18351879, 4.87834377, 4.75687873, 4.69316409,
         4.54423113, 5.90123873, 5.56072419, 5.57137966, 5.32502451,
         4.95246227, 4.89431026, 4.59964323, 4.30537655, 6.39495802,
         6.03297737, 5.59768214, 5.39354417, 5.1019758 , 5.14756773,
         4.77478437, 4.47522273, 6.15508661, 5.84278414, 5.71398163,
         5.37753654, 4.97422814, 4.73057296, 4.79509561, 4.64316566]),
  'std_fit_time': array([0.13692952, 0.23180267, 0.11202144, 0.1671651 , 0.13628227,
         0.15671069, 0.26353349, 0.13143853, 0.12003626, 0.47401828,
         0.15888558, 0.15771079, 0.14346734, 0.1264004 , 0.36670113,
         0.13659731, 0.10616206, 0.35955994, 0.13080742, 0.25173015,
         0.18215507, 0.15822532, 0.14014271, 0.13387317, 0.16124955,
         0.264017  , 0.1038196 , 0.24877785, 0.29140366, 0.15509689,

In [45]:
param_test2 = {
    'gamma':[i/10.0 for i in range(0,5)]
}
gsearch1 = GridSearchCV(estimator = XGBClassifier( 
    learning_rate =0.1, 
    n_estimators=140, 
    max_depth=17,
    min_child_weight=1, 
    gamma=0, 
    subsample=0.7, 
    colsample_bytree=0.7,
    objective= 'binary:logistic', 
    nthread=16, 
    scale_pos_weight=1, 
    seed=42
), 
param_grid = param_test2, scoring='roc_auc',n_jobs=-1,iid=False, cv=10)
gsearch1.fit(dataset[predictors],dataset['Y'])
gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_

({'mean_fit_time': array([11.50625992, 10.75112817,  8.0273206 ,  7.96761632,  6.92252333]),
  'std_fit_time': array([1.33006689, 2.17717757, 0.08949047, 0.17115506, 1.61623065]),
  'mean_score_time': array([0.01250317, 0.01230316, 0.01080267, 0.00960228, 0.00800183]),
  'std_score_time': array([0.00412998, 0.00419749, 0.00477188, 0.00135677, 0.00062908]),
  'param_gamma': masked_array(data=[0.0, 0.1, 0.2, 0.3, 0.4],
               mask=[False, False, False, False, False],
         fill_value='?',
              dtype=object),
  'params': [{'gamma': 0.0},
   {'gamma': 0.1},
   {'gamma': 0.2},
   {'gamma': 0.3},
   {'gamma': 0.4}],
  'split0_test_score': array([0.88391055, 0.89079629, 0.87356831, 0.89267112, 0.87928143]),
  'split1_test_score': array([0.84839446, 0.84795473, 0.85159531, 0.84916826, 0.85672212]),
  'split2_test_score': array([0.85410417, 0.87350014, 0.85372239, 0.86503954, 0.86043087]),
  'split3_test_score': array([0.89040087, 0.88482411, 0.8852468 , 0.88706027, 0.876956

In [11]:
param_test3 = {
    'subsample':[i/100.0 for i in range(75,95,5)],
    'colsample_bytree':[i/100.0 for i in range(10,45,5)]
}
gsearch1 = GridSearchCV(estimator = XGBClassifier( 
    learning_rate =0.1, 
    n_estimators=140, 
    max_depth=17,
    min_child_weight=1, 
    gamma=0, 
    subsample=0.8, 
    colsample_bytree=0.15,
    objective= 'binary:logistic', 
    nthread=16, 
    scale_pos_weight=1, 
    seed=42
), 
param_grid = param_test3, scoring='roc_auc',n_jobs=-1,iid=False, cv=10)
gsearch1.fit(dataset[predictors],dataset['Y'])
gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_

({'mean_fit_time': array([3.94478722, 4.06181378, 4.17323906, 4.12852824, 5.5412467 ,
         5.75819514, 5.76109536, 5.8908253 , 5.71798587, 5.66367416,
         5.75409472, 5.94053667, 6.74001644, 6.81473312, 6.9185565 ,
         7.0376837 , 7.69893217, 7.79365301, 7.86937068, 7.89397573,
         7.87117062, 7.8932761 , 7.80675611, 7.85496719, 8.48891001,
         8.57152836, 8.52901888, 6.38363612]),
  'std_fit_time': array([0.17380919, 0.16998477, 0.1428412 , 0.09532295, 0.15522917,
         0.08544607, 0.05279955, 0.03159273, 0.06403039, 0.07729908,
         0.09245365, 0.10228775, 0.10966059, 0.06697595, 0.06031439,
         0.09059374, 0.07047523, 0.0891834 , 0.07798953, 0.12685682,
         0.09060051, 0.12496447, 0.09752192, 0.09781885, 0.07514334,
         0.11875772, 0.08236583, 1.02841416]),
  'mean_score_time': array([0.01150291, 0.01580396, 0.01100268, 0.01140313, 0.0109026 ,
         0.01390338, 0.01610382, 0.01420343, 0.01390336, 0.01430347,
         0.01580348, 0.010

In [54]:
param_test4 = {
#     'reg_alpha':[1e-9, 1e-8, 1e-7,1e-6, 1e-5, 1e-4,1e-3,0.01,0.1,1,1.1]
    'reg_alpha':[.001, .002, , .0001, .0002, .0003]
}
gsearch1 = GridSearchCV(estimator = XGBClassifier( 
    learning_rate =0.1, 
    n_estimators=140, 
    max_depth=17,
    min_child_weight=1, 
    gamma=0, 
    subsample=0.8, 
    colsample_bytree=0.15,
    objective= 'binary:logistic', 
    nthread=16, 
    scale_pos_weight=1, 
    seed=42
), 
param_grid = param_test4, scoring='roc_auc',n_jobs=-1,iid=False, cv=10)
gsearch1.fit(dataset[predictors],dataset['Y'])
gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_

({'mean_fit_time': array([8.10884545, 6.61340675, 4.35168667, 4.43640628, 4.42040198,
         4.38069348, 4.35828855, 4.33968441, 4.40320387, 4.21961386,
         3.33099105]),
  'std_fit_time': array([0.88455849, 1.93765493, 0.09508018, 0.04841371, 0.07981504,
         0.06716394, 0.07067708, 0.06734391, 0.06605729, 0.15742719,
         0.11689687]),
  'mean_score_time': array([0.0148035 , 0.01060252, 0.01150315, 0.01110244, 0.01070309,
         0.00920236, 0.01130297, 0.01100252, 0.01060233, 0.00840235,
         0.01200283]),
  'std_score_time': array([0.00701313, 0.0040059 , 0.00525925, 0.00570146, 0.00447416,
         0.00097991, 0.00558781, 0.00346524, 0.00415284, 0.00066369,
         0.00591723]),
  'param_reg_alpha': masked_array(data=[1e-09, 1e-08, 1e-07, 1e-06, 1e-05, 0.0001, 0.001, 0.01,
                     0.1, 1, 1.1],
               mask=[False, False, False, False, False, False, False, False,
                     False, False, False],
         fill_value='?',
          

In [7]:
def modelfit(alg, dtrain, predictors,useTrainCV=True, cv_folds=10, early_stopping_rounds=200):
    
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
            metrics='auc', early_stopping_rounds=early_stopping_rounds)
        alg.set_params(n_estimators=cvresult.shape[0])
    
    #Fit the algorithm on the data
    alg.fit(dtrain[predictors], dtrain['Y'],eval_metric='auc')
    print("Model Fitted")
        

In [13]:
# X_train,X_test,y_train,y_test=train_test_split(features_X,target_Y,test_size=0.25,random_state=42)

# logreg = LogisticRegression()

# logreg.fit(X_train, y_train)

# y_pred = logreg.predict(X_test)

model = XGBClassifier(
    learning_rate =0.01, 
    n_estimators=5000, 
    max_depth=17,
    min_child_weight=1, 
    gamma=0, 
    subsample=0.85, 
    colsample_bytree=0.25,
    objective= 'binary:logistic', 
    nthread=16, 
    scale_pos_weight=1, 
    seed=42,
    reg_alpha = .0001
)

# model_forest = RandomForestClassifier()
# model_forest.fit(predictors, target_Y)


modelfit(model,dataset,predictors)



# plot_importance(model)
# pyplot.show()
print("Beginning K-Fold")
kfold = KFold(n_splits=10, random_state=42)
results = cross_val_score(model, features_X, target_Y, cv=kfold, scoring='roc_auc')
print(results)
print(results.mean())
print("Accuracy: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Model Fitted
Beginning K-Fold
[0.90457024 0.88526278 0.88864222 0.92269423 0.90795291 0.92151812
 0.8942643  0.90137995 0.92090496 0.92040506]
0.9067594768228127
Accuracy: 90.68% (1.36%)


In [12]:
   learning_rate =0.01, 
    n_estimators=5000, 
    max_depth=17,
    min_child_weight=1, 
    gamma=0, 
    subsample=0.85, 
    colsample_bytree=0.25,
    objective= 'binary:logistic', 
    nthread=16, 
    scale_pos_weight=1, 
    seed=42,
    
    0.9067594768228127
    Accuracy: 90.68% (1.36%)

IndentationError: unexpected indent (<ipython-input-12-a5da258d8ef1>, line 2)

In [15]:
print(results)
print(results.mean())
print("Accuracy: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

[0.90457024 0.88526278 0.88864222 0.92269423 0.90795291 0.92151812
 0.8942643  0.90137995 0.92090496 0.92040506]
0.9067594768228127
Accuracy: 90.68% (1.36%)


In [16]:
submitModelPredict = model.predict_proba(test_feature)
submission = pd.DataFrame({'Id': test_Id, 'Y':submitModelPredict[:,1]})
submission.head()

Unnamed: 0,Id,Y
0,16384,0.954937
1,16385,0.887154
2,16386,0.999852
3,16387,0.99986
4,16388,0.995572


In [17]:
filename = 'submit.csv'

submission.to_csv(filename,index=False)

print('Saved file: ' + filename)

Saved file: submit.csv
