In [1]:
import numpy as np
import matplotlib.pyplot as plt
import copy
import collections

from sklearn.svm import LinearSVR
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import precision_recall_curve

from imports.matrix_helper import MatrixHelper

# Loader object used to read the stored feature matrices.
matrix_helper = MatrixHelper()

# Files holding the training data and the validation data.
training_path = 'data/feature_matrix_sparse_rev150000.pickle'
validation_path = 'data/feature_matrix_sparse_v3.pickle'

matrices = matrix_helper.load_from_parse(training_path)
validation_matrices = matrix_helper.load_from_parse(validation_path)

# Each loaded result is indexed as: [0] feature matrix, [1] row names
# (component names).
feature_matrix, rows = matrices[0], matrices[1]
validation_feature_matrix, validation_rows = validation_matrices[0], validation_matrices[1]

# Number of samples (rows) and of features; the last column is not counted as
# a feature because it is used as the target further down.
samples_count, features_count = feature_matrix.shape[0], feature_matrix.shape[1] - 1

# Names of all vulnerable components: rows whose last column (the column used
# as the regression target below) is greater than zero.
vulnerable_indices = np.where(feature_matrix[:, -1] > 0)
vulnerable_rows = [rows[i] for i in vulnerable_indices[0]]

# Split off the components that are NOT vulnerable: their names and the
# matching rows of the feature matrix.
# Membership is tested by name against a set. This keeps the original
# semantics (a name that is vulnerable in any row is excluded everywhere) but
# avoids a linear scan of vulnerable_rows for every single component.
# NOTE(review): assumes the row names are hashable (e.g. strings) -- confirm.
vulnerable_names = set(vulnerable_rows)
not_vulnerable_indices = [
    i for i, name in enumerate(rows) if name not in vulnerable_names
]
not_vulnerable_rows = [rows[i] for i in not_vulnerable_indices]

# Select the rows by index in one step. Unlike stacking a Python list of rows,
# this stays two-dimensional (0 x columns) even when no component qualifies,
# so the column slicing done later does not fail on an empty 1-D array.
not_vulnerable_matrix = np.asarray(
    feature_matrix[np.asarray(not_vulnerable_indices, dtype=int), :]
)

# Column positions of the features; the column right after them holds the target.
feature_columns = range(features_count)

# Separate the model inputs from the values to be learned.
training_data = feature_matrix[:, feature_columns]
training_target = feature_matrix[:, features_count]

# Build the linear support vector regressor and train it in one step
# (fit() hands back the estimator itself).
svr = LinearSVR(C=0.2).fit(training_data, training_target)

# Estimate the target for every component without any known vulnerabilities.
target_prediction = svr.predict(not_vulnerable_matrix[:, feature_columns])

# Build a matrix with, per component: its name, the predicted number of
# vulnerabilities and the actual number of vulnerabilities in the validation
# revision. Components missing from the validation revision are skipped.

# Map every validation row name to its first position once, instead of
# scanning validation_rows twice (membership test + index()) per component.
# setdefault keeps the first occurrence, which is what list.index() returns.
# NOTE(review): assumes the row names are hashable (e.g. strings) -- confirm.
validation_index_by_name = {}
for position, name in enumerate(validation_rows):
    validation_index_by_name.setdefault(name, position)

compare_matrix = []
for name, predicted in zip(not_vulnerable_rows, target_prediction):
    validation_index = validation_index_by_name.get(name)
    if validation_index is None:
        # Component does not exist in the validation revision.
        continue
    compare_matrix.append(
        [name, predicted, validation_feature_matrix[validation_index, -1]]
    )

# NOTE: when the names are strings, NumPy stores all three columns as strings;
# that is why the numeric columns are parsed back with dtype='f' before use.
compare_matrix = np.array(compare_matrix)

# Order the rows by predicted value, highest prediction first.
sorted_indeces = np.array(compare_matrix[:, 1], dtype='f').argsort()[::-1]
compare_matrix_sorted = compare_matrix[sorted_indeces]

# Mean squared error over all compared components: actual values (column 2)
# against predictions (column 1), both parsed as single-precision floats.
actual_values = np.array(compare_matrix_sorted[:, 2], dtype='f')
predicted_values = np.array(compare_matrix_sorted[:, 1], dtype='f')
error = mean_squared_error(actual_values, predicted_values)
print('Mean square error: {}'.format(error))


# First, drop the "all zero" rows from the compare matrix: a component is kept
# only if its prediction has a magnitude of at least 0.5 or its actual value
# is positive.
compare_matrix_filtered = np.array([
    component.copy()
    for component in compare_matrix_sorted
    if abs(float(component[1])) >= 0.5 or float(component[2]) > 0
])

# Same error measure as before, restricted to the rows that were kept.
filtered_actual = np.array(compare_matrix_filtered[:, 2], dtype='f')
filtered_predicted = np.array(compare_matrix_filtered[:, 1], dtype='f')
error = mean_squared_error(filtered_actual, filtered_predicted)
print('Mean square error with filtered matrix: {}'.format(error))


# Then convert the predicted and actual values to binary 0/1 labels and
# determine precision and recall for the positive (label 1) class.
# Values >= 0.5 become 1, values < 0.5 become 0.
compare_matrix_binary = copy.copy(compare_matrix_sorted)
compare_matrix_binary[:, 1] = [1 if float(x) >= 0.5 else 0 for x in compare_matrix_binary[:, 1]]
compare_matrix_binary[:, 2] = [1 if float(x) >= 0.5 else 0 for x in compare_matrix_binary[:, 2]]

precision, recall, thresholds = precision_recall_curve(
    np.array(compare_matrix_binary[:, 2], dtype='f'),
    np.array(compare_matrix_binary[:, 1], dtype='f'),
)

# Select the curve point for "predicted label is 1" by its threshold value
# rather than by a fixed position. Position 1 is that point only when the
# curve contains exactly the two thresholds 0 and 1; if every prediction falls
# into one class (or the library truncates the curve once full recall is
# reached) position 1 is the curve's padding point (precision 1, recall 0).
positive_points = np.flatnonzero(thresholds >= 0.5)
if positive_points.size:
    positive_precision = precision[positive_points[0]]
    positive_recall = recall[positive_points[0]]
else:
    # Nothing was predicted as positive, so there are no true positives;
    # report 0 for both instead of the padding values.
    positive_precision = 0.0
    positive_recall = 0.0

print('Precision: {}, Recall: {}'.format(positive_precision, positive_recall))

#print('--------------')
#for component in compare_matrix_sorted:
#    print("{:25} Predicted: {:>3} \t Actual in new Revision: {:>3}".format(component[0], round(float(component[1])), component[2]))

Mean square error: 0.193909600377
Mean square error with filtered matrix: 3.98732352257
Precision: 0.147959183673, Recall: 0.126637554585
