# Particle prediction: Kaggle competition
---

In [8]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from time import time
from sklearn.preprocessing import scale
from sklearn.metrics import roc_auc_score
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from joblib import dump, load
from sklearn.model_selection import train_test_split

In [9]:
# Reading data
X = pd.read_csv("./X_train.dat",index_col=0)
Y = pd.read_csv("./y_train.dat",names =["Y"],index_col=0)
X_test =   pd.read_csv("./X_test.dat",index_col=0)
index_test = pd.read_csv("./sample.dat",index_col=0)

# Split data into validation and training data
X_train, X_val, Y_train, y_val = train_test_split(scale(X.values), Y.values, test_size=0.15, random_state=42)

In [10]:
# Function to generate submission file in the correct format
def generate_submission_file (model,X_test,file_name):
    # Computing probs for Y=1
    probs = model.predict_proba(X_test.values)[:,1]
    Ids = X_test.index.values
    prediction =  pd.DataFrame({'Id':Ids, 'Prediction':probs})
    prediction.to_csv("./"+file_name,index= None, header = True)

**Algorithms below may take long time to output a result depending on the grid search "size"**

### Logistic regression with regularization

In [None]:
from sklearn.linear_model import LogisticRegressionCV
print("Fitting the classifier to LogReg")

lr_l2 = LogisticRegressionCV(cv=5,
                             scoring='roc_auc', 
                             penalty='l2',
                             random_state=0,
                             max_iter=500).fit(X_train, Y_train)

lr_l1 = LogisticRegressionCV(cv=5,
                             scoring='roc_auc',
                             penalty='l1',
                             solver='liblinear',
                             random_state=0,
                             max_iter=500).fit(X_train, Y_train)

dump(lr_l1, 'lr_l1.joblib')
dump(lr_l2, 'lr_l2.joblib') 

In [None]:
print(f"The AUC on val set for Logistic Regression with L2 is: \n{roc_auc_score(y_val,lr_l2.predict_proba(X_val)[:,1])}")
print(f"The AUC on val for Logistic Regression with L1 is: \n{roc_auc_score(y_val,lr_l1.predict_proba(X_val)[:,1])}")

In [None]:
generate_submission_file(lr_l1,X_test,"test")

### SVM

In [None]:
from sklearn.svm import SVC

print("Fitting the classifier to SVM")
t0 = time()
param_grid = {'C': [1e-1]}
svm = GridSearchCV(SVC(kernel='rbf',
                       gamma = 'auto',
                       probability = True),
                   scoring = 'roc_auc',
                   param_grid = param_grid,
                   n_jobs = -1,
                   cv=5)
svm.fit(X_train, Y_train.Y)

#dump(svm, 'svm.joblib')
print("done in %0.3fs" % (time() - t0))
print("Best estimator found by grid search:")
print(svm.best_estimator_)
print(f"The AUC for SVM: \n{svm.best_score_}")

In [None]:
print(f"The AUC on val for SVM is: \n{roc_auc_score(y_val,svm.predict_proba(X_val)[:,1])}")

In [None]:
generate_submission_file(svm,X_test,"test")

### KNN

In [None]:
### TAKES LONG TIME

from sklearn.neighbors import KNeighborsClassifier

print("Fitting the classifier to KNN")
t0 = time()
param_grid = {'n_neighbors': np.arange(1,10,5)}
knn = GridSearchCV(KNeighborsClassifier(),
                   param_grid,
                   cv=5,
                   scoring = 'roc_auc',
                   n_jobs=-1)
knn = knn.fit(X_train, Y_train.Y)

dump(knn, 'knn.joblib')
print("done in %0.3fs" % (time() - t0))
print("Best estimator found by grid search:")
print(knn.best_estimator_)
print(f"The AUC for KNN: \n{knn.best_score_}")

In [None]:
print(f"The AUC on val for KNN is: \n{roc_auc_score(y_val,knn.predict_proba(X_val)[:,1])}")

### Random forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

print("Fitting the classifier to Random Forest")
t0 = time()
param_grid = { 
    'n_estimators': [50, 100,500],
    'max_features': ['auto'],
    'max_depth' : [4,6,10,14],
    'criterion' :['entropy']
}
rfc = GridSearchCV(RandomForestClassifier(),
                   param_grid,
                   cv=5,
                   scoring = 'roc_auc',
                   n_jobs=-1)
rfc = rfc.fit(X_train, Y_train.Y)

dump(rfc, 'rfc.joblib')
print("done in %0.3fs" % (time() - t0))
print("Best estimator found by grid search:")
print(rfc.best_estimator_)
print(f"The AUC for RF: \n{rfc.best_score_}")


In [None]:
print(f"The AUC on val for RF is: \n{roc_auc_score(y_val,rfc.predict_proba(X_val)[:,1])}")

### Gradient boost classifiers

In [29]:
import xgboost as xgb

print("Fitting the classifier to XGBoost")

# Param grid already tuned
t0 = time()
param_grid = {'max_depth':[6],
              'gamma': [0],
              'min_child_weight':[4],
              'reg_lambda':[3],
              'reg_alpha':[0.1],
              'silent': [1],
              'objective': ['binary:logistic']}

xgb_cl = GridSearchCV(xgb.XGBClassifier(),
                   param_grid,
                   cv=5,
                   scoring = 'roc_auc',
                   n_jobs=-1,
                    verbose = 0)

xgb_cl.fit(X_train,Y_train)

#dump(xgb_cl, 'xgb_cl5.joblib')
print("done in %0.3fs" % (time() - t0))
print("Best estimator found by grid search:")
print(xgb_cl.best_estimator_)
print(f"The AUC for RF: \n{xgb_cl.best_score_}")

Fitting the classifier to XGBoost
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   12.0s remaining:   18.0s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:   12.4s remaining:    8.3s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   16.6s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   16.6s finished
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


done in 29.341s
Best estimator found by grid search:
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=6, min_child_weight=4, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0.1, reg_lambda=3, scale_pos_weight=1, seed=None,
       silent=1, subsample=1)
The AUC for RF: 
0.8244410775090415


In [25]:
print(f"The AUC on val for XGBoost is: \n{roc_auc_score(y_val,xgb_cl.predict_proba(X_val)[:,1])}")

The AUC on val for XGBoost is: 
0.825128284340518


In [30]:
best_model = xgb.XGBClassifier(**xgb_cl.best_params_)
best_model.fit(X.values,Y.values)
print(f"The AUC on val for XGBoost is: \n{roc_auc_score(y_val,xgb_cl.predict_proba(X_val)[:,1])}")

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


The AUC on val for XGBoost is: 
0.8253722582638244


In [31]:
generate_submission_file(best_model,X_test,"XGB_Submission_6")