### Cross Validation

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import time
from sklearn import svm
from sklearn import tree

from imports.matrix_helper import MatrixHelper
from imports.prediction_helper import PredictionHelper
from sklearn.metrics import precision_recall_curve
from sklearn.model_selection import cross_val_score

# Shared helper instance used by stratified_sampling and for loading the pickle below.
matrix_helper = MatrixHelper()
# Factor handed to matrix_helper.split_training_test for every class-wise split (2/3).
sampling_factor = 2.0 / 3

def stratified_sampling(matrix):
    """Build one stratified training/test split of ``matrix``.

    The vulnerable and the not-vulnerable entries are extracted and split
    separately with the module-level ``sampling_factor``, so both classes go
    through the same split. The per-class parts are then stacked along
    axis 0 (not-vulnerable part first) and separated into data and target
    via ``matrix_helper.create_data_target``.

    NOTE(review): whether the split is randomized depends on
    ``matrix_helper.split_training_test``, which is not visible here.

    Args:
        matrix: Feature matrix accepted by the ``matrix_helper`` methods.

    Returns:
        Tuple ``(training_data, training_target, test_data, test_target)``.
    """
    # Extract each class on its own; vulnerable entries are handled first.
    per_class = (
        matrix_helper.get_vulnerable_components(matrix),
        matrix_helper.get_not_vulnerable_components(matrix),
    )

    # Split every class with the same factor (intended as 2/3 training, 1/3 test).
    (vuln_train, vuln_test), (clean_train, clean_test) = [
        matrix_helper.split_training_test(part, sampling_factor)
        for part in per_class
    ]

    # Re-combine the classes, not-vulnerable part first.
    train_part = np.concatenate((clean_train, vuln_train), axis=0)
    test_part = np.concatenate((clean_test, vuln_test), axis=0)

    # Separate each combined matrix into data and target.
    train_x, train_y = matrix_helper.create_data_target(train_part)
    test_x, test_y = matrix_helper.create_data_target(test_part)

    return train_x, train_y, test_x, test_y
    

# Load the parsed matrices from the regression pickle; the first entry is
# used as the feature matrix.
# Alternative (classification) input:
#matrices = matrix_helper.load_from_parse('data/matrices/matrix_cla_incl_current.pickle')
matrices = matrix_helper.load_from_parse('data/matrices/matrix_reg_incl_current.pickle')
feature_matrix = matrices[0]
# Uncomment to limit the run to the first 100 rows:
#feature_matrix = feature_matrix[:100, :]

# Candidate values for the penalty parameter C.
penalties = [0.05, 0.1, 0.5, 1.0, 5.0, 10.0]
# Mean score per penalty, in the same order as ``penalties``.
total_score_list = []
for penalty in penalties:
    # Alternative (classification) estimator:
    #clf = svm.LinearSVC(C=penalty)
    clf = svm.SVR(C=penalty)

    # Fit and score on five separate calls to stratified_sampling.
    score_list = []
    for i in range(5):
        training_data, training_target, test_data, test_target = stratified_sampling(feature_matrix)
        clf.fit(training_data, training_target)
        score = clf.score(test_data, test_target)
        score_list.append(score)

    # Average once and reuse the value for the log line and the plot data.
    average_score = sum(score_list) / float(len(score_list))
    print('Penalty: {}, Average: {}'.format(penalty, average_score))
    total_score_list.append(average_score)


# Plot the mean score per penalty on a logarithmic x axis and store it as PDF.
figure = plt.figure()
figure.suptitle('Cross Validation Scores')

plt.semilogx(penalties, total_score_list)
plt.xlabel('Penalty Parameter')
plt.ylabel('Score')

figure.savefig('outputs/penalty_parameter.pdf')

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

from sklearn.model_selection import cross_val_score
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import svm
# GridSearchCV is taken from sklearn.model_selection: the former
# sklearn.grid_search module was deprecated in scikit-learn 0.18 and removed
# in 0.20, and this cell already requires sklearn.model_selection anyway.
from sklearn.model_selection import GridSearchCV
from imports.matrix_helper import MatrixHelper

matrix_helper = MatrixHelper()

# Load the parsed matrices from the regression pickle; the first entry is
# used as the feature matrix.
matrices = matrix_helper.load_from_parse('data/matrices/matrix_reg_incl_current.pickle')
feature_matrix = matrices[0]
# Uncomment to limit the run to the first 100 rows:
#feature_matrix = feature_matrix[:100, :]

# The last column serves as the target; every column before it is a feature.
features_count = feature_matrix.shape[1] - 1
print('go')

# Search the penalty C of a linear-kernel SVR with 5-fold cross-validation.
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10], 'kernel': ['linear']}
grid_search = GridSearchCV(svm.SVR(), param_grid, cv=5)
# A plain slice selects the leading feature columns without building an
# index list first.
grid_search.fit(feature_matrix[:, :features_count], feature_matrix[:, features_count])
print(grid_search.best_params_)


go
