In [None]:
import pandas as pd

import numpy as np

import matplotlib
import matplotlib.pylab as plt
%matplotlib inline

import seaborn as sns

from __future__ import division

## Loading data

### Train

In [None]:
train_data_filepath = './data/training.csv'
train_data = pd.read_csv(train_data_filepath, index_col='EventId')

In [None]:
# Encode the target: 1 where the label is 's', 0 for any other value.
train_labels = train_data['Label'].eq('s').astype('int64')
# Per-event weights are kept separately from the features.
train_weights = train_data['Weight']
# Keep only the feature columns in the training frame.
train_data = train_data.drop(['Label', 'Weight'], axis=1)

### Test

In [None]:
test_data_filepath = './data/test.csv'
test_data = pd.read_csv(test_data_filepath, index_col='EventId')

---

## Creating a class for Higgs Boson data transformation

The class is divided into modules, which makes it easy to experiment with features, e.g. adding new ones or transforming existing ones.

In [None]:
from sklearn import preprocessing

In [None]:
class HiggsBosonTransformer:
    """Preprocessing pipeline for the Higgs Boson data.

    Two optional stages are applied in this order:

    1. Missing values: the sentinel ``-999.0`` is replaced by NaN, columns
       in which at least half of the values are missing are dropped, and
       the remaining gaps are filled with per-column medians.
    2. Scaling: the columns are standardised with
       ``preprocessing.StandardScaler``.

    Everything that is learned from the data (dropped columns, medians,
    scaler) is computed by ``fit_transform`` and reused by ``transform``,
    so train and test data go through exactly the same transformation.
    """

    def __init__(self, with_missing_values=True, with_scaling=True):
        """Choose which stages are enabled.

        with_missing_values -- run the missing-value stage.
        with_scaling        -- run the scaling stage.
        """
        self.with_missing_values = with_missing_values
        self.with_scaling = with_scaling

    def _transform_missing_values(self, df, with_fitting=False):
        """Return a copy of ``df`` with missing values handled.

        With ``with_fitting=True`` the dropped columns and the medians are
        computed from ``df`` and remembered on the instance.  Otherwise the
        remembered values are applied; if nothing has been fitted yet the
        statistics are computed from ``df`` itself (without storing them),
        which is what this method always did before.
        """
        df = df.replace(-999.0, np.nan)

        is_fitted = hasattr(self, 'dropped_columns_') and hasattr(self, 'medians_')
        if with_fitting or not is_fitted:
            # isnull().mean() is the share of NaNs per column; it is a float
            # regardless of the interpreter's integer-division rules.
            nan_ratios = df.isnull().mean()
            dropped_columns = list(nan_ratios[nan_ratios >= 0.5].index)
            medians = df.drop(dropped_columns, axis=1).median()
            if with_fitting:
                self.dropped_columns_ = dropped_columns
                self.medians_ = medians
        else:
            # Ignore fitted columns that this frame does not contain, so the
            # drop below cannot fail on them.
            dropped_columns = [column for column in self.dropped_columns_
                               if column in df.columns]
            medians = self.medians_

        # fillna with a Series fills each column with the value stored under
        # that column's name.
        return df.drop(dropped_columns, axis=1).fillna(medians)

    def _transform_scale(self, df, with_fitting=False):
        """Return a standardised copy of ``df``.

        With ``with_fitting=True`` a new scaler is fitted on ``df`` first;
        otherwise the previously fitted ``self.scaler`` is used (it does not
        exist, and an AttributeError is raised, if nothing was fitted).
        """
        if with_fitting:
            self.scaler = preprocessing.StandardScaler()
            self.scaler.fit(df.values)

        scaled_data = self.scaler.transform(df.values)

        # Keep the original index so that rows stay aligned with labels and
        # weights that share it.
        return pd.DataFrame(scaled_data, index=df.index, columns=df.columns)

    def transform(self, df, with_fitting=False):
        """Apply the enabled stages to a copy of ``df`` and return the result.

        ``df`` itself is never modified.  ``with_fitting=True`` (re)learns
        the statistics from ``df`` before applying them.
        """
        new_df = df.copy()

        # Missing values
        if self.with_missing_values:
            new_df = self._transform_missing_values(new_df, with_fitting=with_fitting)

        # Scaling
        if self.with_scaling:
            new_df = self._transform_scale(new_df, with_fitting=with_fitting)

        return new_df

    def fit_transform(self, df):
        """Learn the statistics from ``df`` and return its transformed copy."""
        return self.transform(df, with_fitting=True)

In [None]:
# Both stages of the transformer are switched on.
higgs_boson_transformer_params = dict(with_missing_values=True, with_scaling=True)

higgs_boson_transformer = HiggsBosonTransformer(**higgs_boson_transformer_params)

# The training frame goes through fit_transform, the test frame through transform.
new_train_data = higgs_boson_transformer.fit_transform(train_data)
new_test_data = higgs_boson_transformer.transform(test_data)

In [None]:
# NOTE(review): repeats the fit_transform call already made in the previous cell.
new_train_data = higgs_boson_transformer.fit_transform(train_data)

In [None]:
# NOTE(review): repeats the transform call already made two cells above.
new_test_data = higgs_boson_transformer.transform(test_data)

---

## Making AMS score function

In [None]:
def ams_score(y, y_pred, weights, b_r=10.0):
    """Approximate median significance (AMS) of a set of predictions.

    AMS = sqrt(2 * ((s + b + b_r) * ln(1 + s / (b + b_r)) - s))

    where ``s`` is the total weight of true positives (y == 1 and
    y_pred == 1) and ``b`` the total weight of false positives (y == 0 and
    y_pred == 1).

    y       -- true labels (1 or 0), array-like.
    y_pred  -- predicted labels (1 or 0), array-like of the same length.
    weights -- per-event weights, array-like of the same length; matched to
               ``y`` by position.
    b_r     -- regularisation term added to ``b`` (default 10.0).

    Returns the AMS as a float.
    """
    y = np.asarray(y)
    y_pred = np.asarray(y_pred)
    weights = np.asarray(weights, dtype=float)

    selected = (y_pred == 1)
    s = np.sum(weights[selected & (y == 1)])
    b = np.sum(weights[selected & (y == 0)])

    # The "- s" term belongs inside the factor of 2.
    return np.sqrt(2 * ((s + b + b_r) * np.log(1 + s / (b + b_r)) - s))

---

## Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
logistic_regression = LogisticRegression(random_state=42, n_jobs=-1, verbose=True)

In [None]:
logistic_regression.fit(new_train_data, train_labels)

In [None]:
preds = logistic_regression.predict(new_train_data)

In [None]:
from sklearn import metrics

In [None]:
metrics.accuracy_score(train_labels.values, preds)

### Choosing optimal parameters by grid search

In [None]:
from sklearn import cross_validation, grid_search, metrics

In [None]:
# Candidate hyper-parameters for logistic regression.
# `range` is used instead of `xrange` so the cell runs on Python 2 and 3 alike.
logistic_regression_grid_params = {'C': [10 ** x for x in range(-1, 1 + 1)],  # 0.1, 1, 10
                                   'penalty': ['l2'],
                                   'class_weight': [None, 'balanced']  # 'balanced' is meant to help with class imbalance
                                  }

In [None]:
cv = cross_validation.StratifiedKFold(train_labels, n_folds=5, shuffle=True, random_state=42)

In [None]:
# Accuracy-scored grid search over the folds in `cv`.
# NOTE(review): the grid passed here is written inline ('l1' penalty, C in {0.1, 10});
# it is not `logistic_regression_grid_params` from the earlier cell - confirm which
# grid is intended.
grid_cv = grid_search.GridSearchCV(logistic_regression, {'C': [0.1, 10], 'penalty': ['l1']},
                                   scoring='accuracy', cv=cv,
                                   verbose=True, n_jobs=-1)

In [None]:
%%time
grid_cv.fit(new_train_data, train_labels)

In [None]:
grid_cv.grid_scores_

In [None]:
# Parenthesised single-argument form prints the same text on Python 2 and 3.
print(grid_cv.best_score_)
print(grid_cv.best_params_)

In [None]:
grid_cv.grid_scores_

In [None]:
param_grid = grid_search.ParameterGrid(logistic_regression_grid_params)

In [None]:
# Call the method so the cell displays the parameters rather than the bound method.
logistic_regression.get_params()

In [None]:
def higgs_boson_grid_search(X, y, estimator, param_grid, kf, weights=None):
    """Grid search that scores every parameter set by cross-validated AMS.

    For each fold a fresh ``HiggsBosonTransformer`` is fitted on the
    training part and applied to the held-out part; the estimator is then
    fitted and scored with ``ams_score`` for every parameter set.

    X          -- feature DataFrame (untransformed).
    y          -- label Series, positionally aligned with ``X``.
    estimator  -- object with ``set_params``/``fit``/``predict``; it is
                  modified in place.
    param_grid -- grid accepted by ``grid_search.ParameterGrid``.
    kf         -- iterable of ``(train_index, test_index)`` positional index
                  pairs; it is consumed exactly once.
    weights    -- per-event weights, positionally aligned with ``X``.  When
                  omitted, the notebook-level ``train_weights`` is used,
                  which is what this function always read before.

    Returns a dict with
      'grid_scores' -- list of {'mean', 'std', 'params'} dicts, one per
                       parameter set, in ``ParameterGrid`` order;
      'best_score'  -- the highest mean AMS;
      'best_params' -- the parameter set that achieved it.
    """
    if weights is None:
        weights = train_weights

    param_sets = list(grid_search.ParameterGrid(param_grid))
    scores_per_params = [[] for _ in param_sets]

    # Folds form the outer loop: the transformer depends only on the fold,
    # so it is fitted once per fold instead of once per (params, fold) pair,
    # and `kf` may be a one-shot iterable.
    for train_index, test_index in kf:
        X_train, y_train = X.iloc[train_index], y.iloc[train_index]
        X_test, y_test = X.iloc[test_index], y.iloc[test_index]
        test_weights = weights.iloc[test_index]

        higgs_boson_transformer = HiggsBosonTransformer()
        X_train_transformed = higgs_boson_transformer.fit_transform(X_train)
        X_test_transformed = higgs_boson_transformer.transform(X_test)

        for params, cv_scores in zip(param_sets, scores_per_params):
            estimator.set_params(**params)
            estimator.fit(X_train_transformed, y_train)
            preds = estimator.predict(X_test_transformed)

            cv_scores.append(ams_score(y_test, preds, test_weights))

    grid_scores = [{'mean': np.mean(cv_scores), 'std': np.std(cv_scores), 'params': params}
                   for params, cv_scores in zip(param_sets, scores_per_params)]

    best = max(grid_scores, key=lambda entry: entry['mean'])

    return {'grid_scores': grid_scores,
            'best_score': best['mean'],
            'best_params': best['params']}

In [None]:
result = higgs_boson_grid_search(train_data, train_labels, 
                                 logistic_regression,
                                 {'C': [0.1, 10], 'penalty': ['l1']}, cv)

In [None]:
result