## About

In this notebook we prepare a simple solution for the [Kaggle challenge on Higgs detection](https://inclass.kaggle.com/c/mlhep-2016-higgs-detection).

In [1]:
%matplotlib inline

In [None]:
import matplotlib.pyplot as plt

import pandas
import numpy

from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score

### Read the smallest part of the training file and the test file

In [None]:
import root_numpy

# Read each ROOT file with root_numpy and wrap the resulting array in a DataFrame:
# `data` comes from the public training file, `test` from the public test file.
data, test = (
    pandas.DataFrame(root_numpy.root2array(root_file))
    for root_file in ('datasets/public_train_100000.root', 'datasets/public_test.root')
)

### Define training features

Exclude `event_id`, `target` from the features set

In [None]:
# Every column except the event identifier and the label is a candidate feature.
# Iterating over data.columns (instead of taking a set difference) keeps the
# features in file order, so the list is identical on every run rather than
# depending on set iteration order.
features = [column for column in data.columns if column not in ('event_id', 'target')]
#features

### Prepare high-level features for training

In [None]:
#high_level_features = ['m_jj', 'm_jjj', 'm_jlv', 'm_wwbb', 'm_bb', 'm_wbb', 'm_lv']
# Columns used to train and evaluate the classifier below.
# NOTE(review): the commented-out list above is presumably an earlier selection.
high_level_features = [
    'm_jj',
    'm_jlv',
    'm_wwbb',
    'm_bb',
    'm_wbb',
    'm_lv',
    'lepton_pt',
    'mem_phi',
]

### Plot histograms for each high-level feature

In [None]:
# Disabled cell: overlaid histograms of each high-level feature for background
# (target == 0) and signal (target == 1), with the x-range clipped to the
# 1st..99th percentile of the feature.
# NOTE(review): if this is re-enabled on current libraries, `normed` (matplotlib)
# and `.ix` (pandas) are no longer available -- use `density=True` and `.loc`.
#hist_params = {'normed': True, 'bins': 60, 'alpha': 0.4}
## create the figure
#plt.figure(figsize=(16, 25))
#for n, feature in enumerate(high_level_features):
#    # add sub plot on our figure
#    plt.subplot(len(features) // 5 + 1, 3, n+1)
#    # define range for histograms by cutting 1% of data from both ends
#    min_value, max_value = numpy.percentile(data[feature], [1, 99])
#    plt.hist(data.ix[data.target.values == 0, feature].values, range=(min_value, max_value), 
#             label='background', **hist_params)
#    plt.hist(data.ix[data.target.values == 1, feature].values, range=(min_value, max_value), 
#             label='signal', **hist_params)
#    plt.legend(loc='best')
#    plt.title(feature)

### Divide training data into 2 parts 
`train_test_split` is used to split the data into a training part and a held-out validation part, so that model quality is not overestimated.

In [None]:
# Hold out part of the sample for validation: 66% of the rows are used for
# training; the seed is fixed so the split is the same on every run.
training_data, validation_data = train_test_split(
    data,
    train_size=0.66,
    random_state=11,
)

### Train a bagged kNN classifier from `sklearn`

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.cross_validation import cross_val_score

# Earlier experiments, kept for reference (plain kNN and default bagging,
# both scored with 4-fold cross-validated ROC AUC):
#knn = KNeighborsClassifier(n_neighbors=100, metric='manhattan')
#knn.fit(training_data[high_level_features], training_data.target)
#knn_cv = cross_val_score(KNeighborsClassifier(),
#                training_data[high_level_features],
#                training_data.target,
#                cv=4, n_jobs=4, scoring="roc_auc")
#print knn_cv.mean()
#bagging_cv = cross_val_score(BaggingClassifier(base_estimator=KNeighborsClassifier(), n_jobs=4),
#                             training_data[high_level_features],
#                             training_data.target, scoring='roc_auc', cv=4)
#print bagging_cv.mean()

# Bagging ensemble of 20 kNN classifiers (70 neighbours, Manhattan distance),
# trained on the high-level features only.
# random_state is fixed (same seed as the train/validation split) so that the
# ensemble's random sampling -- and therefore the validation ROC AUC -- is
# reproducible from a fresh kernel.
# NOTE(review): n_jobs is set both on the ensemble (10) and on every kNN (20);
# confirm that this nested parallelism is intended for the target machine.
b = BaggingClassifier(
    base_estimator=KNeighborsClassifier(n_neighbors=70, metric='manhattan', n_jobs=20),
    n_estimators=20,
    n_jobs=10,
    random_state=11,
)
b.fit(training_data[high_level_features], training_data.target)



In [None]:
# Predict class probabilities for every event of the validation sample
# with the bagged kNN model.
#proba = knn.predict_proba(validation_data[high_level_features])
probb = b.predict_proba(
    validation_data[high_level_features]
)

In [None]:
#probb
# Validation ROC AUC of the bagged kNN; column 1 of the predict_proba output
# is used as the score.
roc_auc_score(y_true=validation_data.target, y_score=probb[:, 1])
# Scores recorded by hand for different settings
# (presumably the value of n_neighbors -- TODO confirm):
#0.74161 with 100
#0.7262 with 10
#0.74974 with 50
#0.75104 with 70

### Compute quality (ROC AUC) on the validation set (to prevent overestimating quality)

In [None]:
# take the probability of class 1 to compute ROC AUC
# (disabled: `proba` is only produced by the plain kNN prediction, which is
# itself commented out in this notebook)
#roc_auc_score(validation_data.target, proba[:, 1])

## Prepare submission to kaggle

In [None]:
# predict test sample
kaggle_probb = b.predict_proba(test[high_level_features])[:, 1]
kaggle_ids = test.event_id

In [None]:
from IPython.display import FileLink
def create_solution(ids, proba, filename='bagging_kNN.csv'):
    """Write a submission CSV and return a notebook download link to it.

    ids: values written to the `event_id` column.
    proba: values written to the `prediction` column.
    filename: name of the CSV file created inside the `datasets/` directory.

    Returns an IPython `FileLink` pointing at the written file.
    """
    # Build the target path once so the file written and the link returned
    # always refer to the same location.
    path = 'datasets/{}'.format(filename)
    submission = pandas.DataFrame({'event_id': ids, 'prediction': proba})
    submission.to_csv(path, index=False)
    return FileLink(path)
    
# Write the submission file from the bagged-kNN test predictions.
# The predictions are stored in `kaggle_probb`; the previous argument
# `kaggle_proba` is never defined in this notebook and raised a NameError.
create_solution(kaggle_ids, kaggle_probb)