# Support Vector Machine
## HiggsML Challenge

In [59]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Loading the data

In [60]:
dirName = '../../data/'
fileName = dirName + 'training.csv'
data = pd.read_csv(fileName)

## Data Preprocessing

In [61]:
# replace missing values with NaN
# data = data.replace(-999.0, np.nan)

# learning data
X = data.copy()
del X['EventId']
del X['Weight']
del X['Label']
y = data['Label']
w = data['Weight']

In [55]:
# handle missing values
from sklearn.preprocessing import Imputer

# replace missing values with NaN
# X = X.replace(-999.0, np.nan)

# missing_values is the value of your placeholder, strategy is if you'd like mean, median or mode, and axis=0 means it calculates the imputation based on the other feature values for that sample
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
imp.fit(X)
Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)
missingX = imp.transform(X)

In [62]:
# testing SVM
missingX = X

## Splitting Training set to real training set and test set

In [63]:
Xtrain = missingX[1:225000]
Xtest = missingX[-25000:]
ytrain = y[1:225000]
ytest = y[-25000:]
wtrain = w[1:225000]
wtest = w[-25000:]

In [87]:
# minimal subset for faster learning of SVM.fit
Xtrain = missingX[1:225000]
ytrain = y[1:225000]
wtrain = w[1:225000]
ytrainVals = ytrain.replace(to_replace=['s','b'],value=[1,0])

## SVM

In [77]:
# import
from sklearn import svm
from sklearn.ensemble import GradientBoostingClassifier as GBC

In [None]:
C = 1.0 

In [None]:
clf = svm.LinearSVC(verbose=1)
clf.fit(Xtrain, ytrainVals, sample_weight=wtrain)

In [65]:
clf = svm.SVC(probability=True, verbose=1)
clf.fit(Xtrain, ytrainVals)

[LibSVM]

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=1)

In [88]:
# Gradint Boosting to test WTF is wrong
clf = GBC(n_estimators=50, max_depth=5,min_samples_leaf=200,max_features=10,verbose=1)
clf.fit(Xtrain,ytrainVals) 

      Iter       Train Loss   Remaining Time 
         1           1.2128           31.48s
         2           1.1659           30.14s
         3           1.1184           29.40s
         4           1.0764           28.68s
         5           1.0441           28.32s
         6           1.0134           27.54s
         7           0.9870           26.84s
         8           0.9666           26.09s
         9           0.9436           25.47s
        10           0.9248           24.79s
        20           0.8184           18.46s
        30           0.7757           12.11s
        40           0.7536            5.99s
        50           0.7396            0.00s


GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=5, max_features=10, max_leaf_nodes=None,
              min_samples_leaf=200, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=50,
              presort='auto', random_state=None, subsample=1.0, verbose=1,
              warm_start=False)

## Validation

In [22]:
# compute AMS
def ams(s, b):
    from math import sqrt,log
    if b==0:
        return 0

    return sqrt(2*((s+b+10)*log(1+float(s)/(b+10))-s))

In [21]:
# compute all measures
def validate(predicted, real, weights):
    sumsig = 0.
    sumbkg = 0.
    tp = 0.
    tn = 0.
    fp = 0.
    fn = 0.
    precision = 0.
    recall = 0.
    acc = 0.
    
    if (predicted.shape[0] != real.shape[0]):
        raise Exception
    
    for i in range(predicted.shape[0]):
        if predicted[i] == "s":
            if real[i] == "s":
                sumsig += weights[i]
                tp += 1
            else:
                sumbkg += weights[i]
                fp += 1
        else:
            if real[i] == "s":
                fn += 1
            else:
                tn += 1
    
    print(tp, fp, fn, tn)
    
    # calculate scores
    amsscore = ams(sumsig * 10, sumbkg * 10)
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    acc = (tp + tn) / (tp + fp + tn + fn)
    f1score = (2 * precision * recall)/(precision + recall)

    printScores(tp, tn, fp, fn, precision, recall, acc, f1score, amsscore)
    
    return amsscore

In [23]:
def printScores(tp, tn, fp, fn, precision, recall, acc, f1score, amsscore):
    all = tp + tn + fp + fn
    print("TP: ", tp/all)
    print("TN: ", tn/all)
    print("FP: ", fp/all)
    print("FN: ", fn/all)
    print("Precision: ", precision)
    print("Recall: ", recall)
    print("Acc: ", acc)
    print("F1: ", f1score)
    print("AMS: ", amsscore)

In [89]:
# validation
predicted = clf.predict(Xtest)

In [90]:
predictedV = pd.Series(predicted).map({1: 's', 0: 'b'})

In [91]:
validate(predictedV, np.array(ytest), np.array(wtest))

6136.0 1707.0 2473.0 14684.0
TP:  0.24544
TN:  0.58736
FP:  0.06828
FN:  0.09892
Precision:  0.7823536911895959
Recall:  0.7127424788012545
Acc:  0.8328
F1:  0.7459275468028203
AMS:  2.8057784356183713


2.8057784356183713

In [69]:
vars = clf.predict_proba(Xtest)[:,1]

In [71]:
res = vars - vars[0]

In [75]:
vars[1:20]

array([ 0.3321973 ,  0.3321982 ,  0.3321982 ,  0.3321982 ,  0.3321982 ,
        0.3321982 ,  0.3321982 ,  0.3321982 ,  0.33220401,  0.3321982 ,
        0.33219092,  0.3321982 ,  0.3321982 ,  0.33218499,  0.3290364 ,
        0.3321982 ,  0.3321982 ,  0.3321982 ,  0.3321982 ])