In [1]:
import pandas as pd

import numpy as np

import matplotlib
import matplotlib.pylab as plt
%matplotlib inline

import seaborn as sns

from __future__ import division

## Loading data

In [2]:
from sklearn import cross_validation

### Train

In [3]:
train_data_filepath = './data/training.csv'
train_data_raw = pd.read_csv(train_data_filepath, index_col='EventId')

In [4]:
train_labels_raw = train_data_raw['Label'].apply(lambda label: 1 if label == 's' else 0)
train_weights_raw = train_data_raw['Weight']
train_data_raw = train_data_raw.drop(['Label', 'Weight'], axis=1)

# TODO:

In [5]:
train_data, hold_out_data, train_labels, hold_out_labels = cross_validation.train_test_split(train_data_raw, train_labels_raw,
                                                                                            test_size=0.2, stratify=train_labels_raw,
                                                                                            random_state=42)

### Test

In [6]:
test_data_filepath = './data/test.csv'
test_data_raw = pd.read_csv(test_data_filepath, index_col='EventId')

---

## Creating classes for Higgs Boson data tranformation

It is divided into modules, so that it is easy to carry out experiments with features like adding new ones, transforming etc.

In [7]:
from sklearn import base, preprocessing

In [8]:
class MissingValuesTransformer(base.BaseEstimator, base.TransformerMixin):

    def transform(self, X, **transform_params):
        X_new = X.copy()
        X_new.replace(-999.0, np.nan, inplace=True)
        
        for column in X_new.columns:
            nan_ratio = len(X_new[X_new[column].isnull()]) / len(X_new)
            if nan_ratio >= 0.5:
                X_new.drop(column, axis=1, inplace=True)
        self.lol=2
        X_new.fillna(X_new.median(), inplace=True)

        return X_new.values

    def fit(self, X, y=None, **fit_params):
        return self
    
    def fit_transform(self, X, y=None, **fit_params):
        self.fit(X, y, **fit_params)
        return self.transform(X)

---

## Making AMS score function

In [9]:
def ams_score(y, y_pred, weights):
    weights = weights.loc[y.index].copy().values
    
    y = np.array(y)
    y_pred = np.array(y_pred)
    
    s = sum(weights * (y == 1) * (y_pred == 1))
    b = sum(weights * (y == 0) * (y_pred == 1))
    b_r = 10.0
    
    return np.sqrt(2 * ((s + b + b_r) * np.log(1 + s / (b + b_r))) - s)

---

## Logistic regression

In [10]:
from sklearn import linear_model, pipeline, preprocessing

In [11]:
logistic_regression = linear_model.LogisticRegression(random_state=42, n_jobs=-1, verbose=True)

In [12]:
logistic_regression_estimator = pipeline.Pipeline([
    ('missing_values', MissingValuesTransformer()),
    ('scale', preprocessing.StandardScaler()),
    ('classifier', logistic_regression)
])

### Choosing optimal parameters by grid search and cross validation

In [13]:
from sklearn import cross_validation, grid_search, metrics

In [14]:
logistic_regression_cv = cross_validation.KFold(len(train_labels), n_folds=5, shuffle=True, random_state=42)

In [15]:
logistic_regression_estimator.get_params().keys()

['scale__with_std',
 'classifier__dual',
 'scale',
 'classifier__max_iter',
 'classifier__class_weight',
 'classifier__tol',
 'classifier__solver',
 'classifier__multi_class',
 'classifier__intercept_scaling',
 'classifier__C',
 'classifier__random_state',
 'scale__with_mean',
 'classifier__warm_start',
 'steps',
 'classifier__fit_intercept',
 'classifier__n_jobs',
 'classifier__penalty',
 'scale__copy',
 'classifier',
 'classifier__verbose',
 'missing_values']

In [176]:
logistic_regression_estimator_grid_params = {
    'classifier__C': [10 ** x for x in xrange(-1, 1 + 1)],
#     'classifier__penalty': ['l1', 'l2'],
#     'classifier__class_weight': [None, 'balanced'] # Will help with class inequality
}

In [177]:
def ams_score_with_train_weights(y, y_pred):
    return ams_score(y, y_pred, train_weights)

grid_cv = grid_search.GridSearchCV(logistic_regression_estimator, logistic_regression_estimator_grid_params,
                                   scoring=metrics.make_scorer(ams_score_with_train_weights, greater_is_better=True),
                                   cv=logistic_regression_cv,
                                   n_jobs=-1, verbose=1)

In [178]:
grid_cv.fit(train_data, train_labels)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]

[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:   21.3s finished


GridSearchCV(cv=sklearn.cross_validation.KFold(n=250000, n_folds=5, shuffle=True, random_state=42),
       error_score='raise',
       estimator=Pipeline(steps=[('missing_values', MissingValuesTransformer()), ('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('classifier', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=True, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'classifier__C': [0.1, 1, 10]}, pre_dispatch='2*n_jobs',
       refit=True, scoring=make_scorer(ams_score_with_train_weights),
       verbose=1)

In [180]:
print grid_cv.best_score_
print grid_cv.best_params_

8.47279266003
{'classifier__C': 10}


In [144]:
grid_cv.grid_scores_

[mean: 8.46732, std: 0.03708, params: {'classifier__C': 0.1},
 mean: 8.46970, std: 0.03708, params: {'classifier__C': 1},
 mean: 8.47032, std: 0.03715, params: {'classifier__C': 10}]

In [145]:
preds = grid_cv.best_estimator_.predict(test_data)

In [152]:
random_submission = pd.read_csv('./data/random_submission.csv')

In [153]:
random_submission.head()

Unnamed: 0,EventId,RankOrder,Class
0,350000,416957,b
1,350001,89624,b
2,350002,519845,b
3,350003,510885,s
4,350004,455944,s


In [169]:
!head submission.csv

EventId,RankOrder,Class
350000,1,b
350001,2,b
350002,3,b
350003,4,b
350004,5,b
350005,6,b
350006,7,b
350007,8,b
350008,9,b


In [167]:
!head ./data/random_submission.csv

EventId,RankOrder,Class
350000,416957,b
350001,89624,b
350002,519845,b
350003,510885,s
350004,455944,s
350005,505711,b
350006,108993,b
350007,134597,b
350008,194267,b


## Gradient boosting

In [182]:
from sklearn import ensemble

In [183]:
gradient_boosting = ensemble.GradientBoostingClassifier()

In [184]:
feature_transformer = pipeline.Pipeline([
    ('missing_values', MissingValuesTransformer()),
    ('scale', preprocessing.StandardScaler())
])

In [186]:
new_train_data = feature_transformer.fit_transform(train_data)

In [190]:
gradient_boosting.fit(new_train_data, train_labels)

GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [191]:
new_test_data = feature_transformer.transform(test_data)

In [193]:
preds = gradient_boosting.predict(new_test_data)

In [200]:
def make_submission(preds, file_path):
    submission_preds = map(lambda label: 's' if label == 1 else 'b', preds)
    submission = pd.DataFrame({'EventId': test_data.index, 
                               'RankOrder': range(1, len(test_data) + 1), 
                               'Class': submission_preds
                             })
    submission = submission[['EventId', 'RankOrder', 'Class']] # Column reorder
    
    with open(file_path, 'w') as f:
        submission.to_csv(f)

In [195]:
submission_preds = map(lambda label: 's' if label == 1 else 'b', preds)

In [196]:
submission = pd.DataFrame({'EventId': test_data.index, 
                           'RankOrder': range(1, len(test_data) + 1), 
                           'Class': submission_preds
                          })
submission = submission[['EventId', 'RankOrder', 'Class']] # Column reorder

In [197]:
submission.to_csv('./submission.csv', index=False)

In [199]:
len(submission)

550000