# Support Vector Machine
## HiggsML Challenge

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Loading the data

In [2]:
dirName = '../../data/'
fileName = dirName + 'training.csv'
data = pd.read_csv(fileName)

## Data Preprocessing

In [3]:
# replace missing values with NaN
data = data.replace(-999.0, np.nan)

# learning data
X = data.copy()
del X['EventId']
del X['Weight']
del X['Label']
y = data['Label']
w = data['Weight']

In [6]:
# handle missing values
from sklearn.preprocessing import Imputer

# replace missing values with NaN
X = X.replace(-999.0, np.nan)

# missing_values is the value of your placeholder, strategy is if you'd like mean, median or mode, and axis=0 means it calculates the imputation based on the other feature values for that sample
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
imp.fit(X)
Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)
missingX = imp.transform(X)

In [10]:
# scale the data for SVM
from sklearn import preprocessing
scaler = preprocessing.StandardScaler()
missingX_scaled = scaler.fit_transform(missingX)

In [11]:
# testing SVM
missingX_scaled

array([[  3.14910656e-01,   6.83319669e-02,   4.07680272e-01, ...,
          1.14381874e+00,  -2.52714288e+00,   4.12510497e-01],
       [  7.40827026e-01,   5.52504823e-01,   5.40136414e-01, ...,
         -1.58502753e-17,  -1.59516311e-17,  -2.73819964e-01],
       [  0.00000000e+00,   3.19515553e+00,   1.09655998e+00, ...,
         -1.58502753e-17,  -1.59516311e-17,  -2.93969845e-01],
       ..., 
       [ -3.10930673e-01,   3.19316447e-01,  -1.30863670e-01, ...,
         -1.58502753e-17,  -1.59516311e-17,  -3.17017229e-01],
       [ -5.10097335e-01,  -8.45323970e-01,  -3.02973380e-01, ...,
         -1.58502753e-17,  -1.59516311e-17,  -7.45439413e-01],
       [  0.00000000e+00,   6.65336083e-01,  -2.53522760e-01, ...,
         -1.58502753e-17,  -1.59516311e-17,  -7.45439413e-01]])

## Splitting Training set to real training set and test set

In [12]:
Xtrain = missingX_scaled[1:225000]
Xtest = missingX_scaled[-25000:]
ytrain = y[1:225000]
ytest = y[-25000:]
wtrain = w[1:225000]
wtest = w[-25000:]

In [13]:
# minimal subset for faster learning of SVM.fit
Xtrain = missingX_scaled[1:25000]
ytrain = y[1:25000]
wtrain = w[1:25000]
ytrainVals = ytrain.replace(to_replace=['s','b'],value=[1,0])

## SVM

In [14]:
# import
from sklearn import svm
from sklearn.ensemble import GradientBoostingClassifier as GBC

In [15]:
C = 1.0 

In [23]:
# SVM linear
clf = svm.SVC(kernel='linear', probability=True, verbose=1)
clf.fit(Xtrain, ytrainVals)

[LibSVM]

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=1)

In [30]:
# SVM polynomial
clf = svm.SVC(kernel='poly', degree=3, probability=True, verbose=1)
clf.fit(Xtrain, ytrainVals)

[LibSVM]

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='poly',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=1)

In [16]:
# SVM RBF
clf = svm.SVC(probability=True, verbose=1)
clf.fit(Xtrain, ytrainVals)

[LibSVM]

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=1)

In [34]:
# SVM sigmoid
clf = svm.SVC(kernel='sigmoid', probability=True, verbose=1)
clf.fit(Xtrain, ytrainVals)

[LibSVM]

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='sigmoid',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=1)

In [88]:
# Gradint Boosting to test WTF is wrong
clf = GBC(n_estimators=50, max_depth=5,min_samples_leaf=200,max_features=10,verbose=1)
clf.fit(Xtrain,ytrainVals) 

      Iter       Train Loss   Remaining Time 
         1           1.2128           31.48s
         2           1.1659           30.14s
         3           1.1184           29.40s
         4           1.0764           28.68s
         5           1.0441           28.32s
         6           1.0134           27.54s
         7           0.9870           26.84s
         8           0.9666           26.09s
         9           0.9436           25.47s
        10           0.9248           24.79s
        20           0.8184           18.46s
        30           0.7757           12.11s
        40           0.7536            5.99s
        50           0.7396            0.00s


GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=5, max_features=10, max_leaf_nodes=None,
              min_samples_leaf=200, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=50,
              presort='auto', random_state=None, subsample=1.0, verbose=1,
              warm_start=False)

## Validation

In [17]:
# compute AMS
def ams(s, b):
    from math import sqrt,log
    if b==0:
        return 0

    return sqrt(2*((s+b+10)*log(1+float(s)/(b+10))-s))

In [18]:
# compute all measures
def validate(predicted, real, weights):
    sumsig = 0.
    sumbkg = 0.
    tp = 0.
    tn = 0.
    fp = 0.
    fn = 0.
    precision = 0.
    recall = 0.
    acc = 0.
    
    if (predicted.shape[0] != real.shape[0]):
        raise Exception
    
    for i in range(predicted.shape[0]):
        if predicted[i] == "s":
            if real[i] == "s":
                sumsig += weights[i]
                tp += 1
            else:
                sumbkg += weights[i]
                fp += 1
        else:
            if real[i] == "s":
                fn += 1
            else:
                tn += 1
    
    print(tp, fp, fn, tn)
    
    # calculate scores
    amsscore = ams(sumsig * 10, sumbkg * 10)
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    acc = (tp + tn) / (tp + fp + tn + fn)
    f1score = (2 * precision * recall)/(precision + recall)

    printScores(tp, tn, fp, fn, precision, recall, acc, f1score, amsscore)
    
    return amsscore

In [19]:
def printScores(tp, tn, fp, fn, precision, recall, acc, f1score, amsscore):
    all = tp + tn + fp + fn
    print("TP: ", tp/all)
    print("TN: ", tn/all)
    print("FP: ", fp/all)
    print("FN: ", fn/all)
    print("Precision: ", precision)
    print("Recall: ", recall)
    print("Acc: ", acc)
    print("F1: ", f1score)
    print("AMS: ", amsscore)

In [35]:
# validation
predicted = clf.predict(Xtest)

In [38]:
predictedV = pd.Series(predicted).map({1: 's', 0: 'b'})

In [39]:
validate(predictedV, np.array(ytest), np.array(wtest))

0.0 0.0 8609.0 16391.0


ZeroDivisionError: float division by zero

In [69]:
vars = clf.predict_proba(Xtest)[:,1]

In [71]:
res = vars - vars[0]

In [75]:
vars[1:20]

array([ 0.3321973 ,  0.3321982 ,  0.3321982 ,  0.3321982 ,  0.3321982 ,
        0.3321982 ,  0.3321982 ,  0.3321982 ,  0.33220401,  0.3321982 ,
        0.33219092,  0.3321982 ,  0.3321982 ,  0.33218499,  0.3290364 ,
        0.3321982 ,  0.3321982 ,  0.3321982 ,  0.3321982 ])