In [164]:
import pandas as pd

import numpy as np

import matplotlib
import matplotlib.pylab as plt
%matplotlib inline

import seaborn as sns

from __future__ import division

## Loading data

### Train

In [181]:
# Labelled training events, keyed by the EventId column.
train_data_filepath = './data/training.csv'
train_data = pd.read_csv(
    train_data_filepath,
    index_col='EventId',
)

In [182]:
# Split the target and the per-event weights off from the feature columns.
train_labels = train_data['Label']
# Weights come from the 'Weight' column; reading 'Label' here (as before)
# silently made `weights` a second copy of the labels.
weights = train_data['Weight']
train_data = train_data.drop(['Label', 'Weight'], axis=1)

### Test

In [167]:
# Test events, keyed by the EventId column.
test_data_filepath = './data/test.csv'
test_data = pd.read_csv(
    test_data_filepath,
    index_col='EventId',
)

---

## Creating class for Higgs Boson data transformation

The class is split into independent steps (missing-value handling, scaling), so that it is easy to experiment with the features, e.g. by adding new ones or transforming existing ones.

In [168]:
from sklearn import preprocessing

In [227]:
class HiggsBosonTransformer(object):
    """Preprocessing pipeline for Higgs Boson data frames.

    Two optional stages are applied in this order:

    1. Missing values: the sentinel ``missing_value`` is replaced by NaN,
       columns whose NaN ratio is at least ``max_nan_ratio`` are dropped and
       the remaining NaNs are filled with per-column medians.
    2. Scaling: the columns are standardised with
       ``preprocessing.StandardScaler``.

    Everything learned from the data (dropped columns, medians, scaler) is
    stored by ``fit_transform`` and re-used by ``transform``, so that a frame
    transformed later gets exactly the same columns and statistics as the
    frame the transformer was fitted on.

    Parameters
    ----------
    with_missing_values : bool
        Enable the missing-value stage.
    with_scaling : bool
        Enable the scaling stage.
    missing_value : float
        Sentinel that marks a missing entry in the raw data.
    max_nan_ratio : float
        Columns with a NaN ratio greater than or equal to this are dropped.
    """

    def __init__(self, with_missing_values=True, with_scaling=True,
                 missing_value=-999.0, max_nan_ratio=0.5):
        self.with_missing_values = with_missing_values
        self.with_scaling = with_scaling
        self.missing_value = missing_value
        self.max_nan_ratio = max_nan_ratio

        # Fitted state; stays None until a fitting call has been made.
        self.dropped_columns_ = None
        self.medians_ = None
        self.scaler = None

    def _missing_value_stats(self, df):
        """Return (columns to drop, medians of the kept columns) for ``df``.

        ``df`` must already have its sentinels replaced by NaN.
        """
        # Mean of the boolean NaN mask == share of missing rows per column.
        nan_ratios = df.isnull().mean()
        dropped_columns = list(nan_ratios[nan_ratios >= self.max_nan_ratio].index)
        medians = df.drop(dropped_columns, axis=1).median()
        return dropped_columns, medians

    def transform_missing_values(self, df, with_fitting=False):
        """Drop sparse columns and impute the remaining gaps with medians.

        ``df`` is modified in place and also returned.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame to clean.
        with_fitting : bool
            When True, learn the columns to drop and the medians from ``df``
            and store them. When False, re-use the stored values; if nothing
            has been stored yet, the statistics of ``df`` itself are used
            (without storing them).

        Returns
        -------
        pandas.DataFrame
            The same ``df`` object, cleaned.
        """
        df.replace(self.missing_value, np.nan, inplace=True)

        if with_fitting:
            self.dropped_columns_, self.medians_ = self._missing_value_stats(df)
            dropped_columns, medians = self.dropped_columns_, self.medians_
        elif self.medians_ is not None:
            dropped_columns, medians = self.dropped_columns_, self.medians_
        else:
            # Never fitted: fall back to the frame's own statistics so that a
            # stand-alone call keeps working.
            dropped_columns, medians = self._missing_value_stats(df)

        # Only drop what is actually there; a later frame may lack a column.
        present = [column for column in dropped_columns if column in df.columns]
        df.drop(present, axis=1, inplace=True)

        df.fillna(medians, inplace=True)

        return df

    def transform_scale(self, df, with_fitting=False):
        """Standardise the columns of ``df``.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame to scale.
        with_fitting : bool
            When True, fit a new ``StandardScaler`` on ``df`` first;
            otherwise the previously fitted scaler is used.

        Returns
        -------
        pandas.DataFrame
            New frame with the scaled values and the index and columns of
            ``df``.

        Raises
        ------
        AttributeError
            If ``with_fitting`` is False and no scaler has been fitted yet.
        """
        if with_fitting:
            self.scaler = preprocessing.StandardScaler()
            self.scaler.fit(df.values)
        elif self.scaler is None:
            raise AttributeError(
                "scaler is not fitted yet; call fit_transform() first")

        scaled_data = self.scaler.transform(df.values)

        # Keep the original index so rows can still be matched to their ids.
        return pd.DataFrame(scaled_data, index=df.index, columns=df.columns)

    def transform(self, df, with_fitting=False):
        """Run the enabled stages on a copy of ``df`` and return the result.

        Parameters
        ----------
        df : pandas.DataFrame
            Input frame; it is left untouched.
        with_fitting : bool
            When True, (re)fit the stored state on ``df`` while transforming.

        Returns
        -------
        pandas.DataFrame
            The transformed copy.
        """
        new_df = df.copy()

        # Missing values
        if self.with_missing_values:
            new_df = self.transform_missing_values(new_df, with_fitting=with_fitting)

        # Scaling
        if self.with_scaling:
            new_df = self.transform_scale(new_df, with_fitting=with_fitting)

        return new_df

    def fit_transform(self, df):
        """Fit all enabled stages on ``df`` and return the transformed copy."""
        return self.transform(df, with_fitting=True)

In [238]:
# Switches selecting which preprocessing stages the transformer runs.
higgs_boson_transformer_params = dict(
    with_missing_values=True,
    with_scaling=True,
)

higgs_boson_transformer = HiggsBosonTransformer(**higgs_boson_transformer_params)

In [239]:
# Fit the transformer on the training features and transform them.
new_train_data = higgs_boson_transformer.fit_transform(df=train_data)

In [240]:
# Transform the test features without fitting.
new_test_data = higgs_boson_transformer.transform(df=test_data)