In [7]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error
from sklearn.metrics import precision_recall_curve
from scipy.stats import rankdata
from scipy.stats import spearmanr

from imports.matrix_helper import MatrixHelper
from imports.svr_helper import SVRHelper

# Load the feature matrices and row names (component names) of the two
# snapshots that are handed to the SVR helper below.
matrix_helper = MatrixHelper()
matrices = matrix_helper.load_from_parse('data/semiannual/matrix_regression_2011-09-15.pickle')
validation_matrices = matrix_helper.load_from_parse('data/semiannual/matrix_regression_2017-03-09.pickle')

# Instantiate the SVR helper and let it predict the values for the compare
# matrix. As used below, column 1 holds the predicted value and column 2 the
# actual value of each row.
svr_helper = SVRHelper()
svr_helper.calculate_semiannual_compare_matrix(matrices, validation_matrices)

# Mean squared error between the actual vulnerabilities in the validation
# matrix and the predicted values, over the whole (sorted) compare matrix.
compare_matrix_sorted = svr_helper.get_compare_matrix_sorted()
mse = mean_squared_error(np.array(compare_matrix_sorted[:, 2], dtype='f'),
                         np.array(compare_matrix_sorted[:, 1], dtype='f'))
print('Mean square error: {}'.format(mse))

# Same metric, restricted to the top 1% returned by the helper.
compare_matrix_top = svr_helper.get_compare_matrix_top()
mse_top = mean_squared_error(np.array(compare_matrix_top[:, 2], dtype='f'),
                             np.array(compare_matrix_top[:, 1], dtype='f'))
print('Mean square error with top 1%: {}'.format(mse_top))

# Rank the top 1% by predicted and by actual value (rank 1 = largest value).
# The columns are cast to float first, exactly like for the MSE above:
# ranking the raw columns would order string-typed values lexicographically
# ('10' < '9') and distort the correlation.
predicted_top = np.asarray(compare_matrix_top[:, 1], dtype=float)
actual_top = np.asarray(compare_matrix_top[:, 2], dtype=float)
predicted_ranking = len(predicted_top) - rankdata(predicted_top, method='max') + 1
actual_ranking = len(actual_top) - rankdata(actual_top, method='max') + 1

# Spearman's correlation coefficient between the two rankings
# (nan when one of the rankings is constant).
coefficient = spearmanr(predicted_ranking, actual_ranking)
print('Spearman correlation coefficient: {}'.format(coefficient[0]))


Mean square error: 0.955091059208
Mean square error with top 1%: 3.3125
Spearman correlation coefficient: 0.32377738857


## Regression with multiple matrices

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import os

from sklearn.metrics import mean_squared_error
from sklearn.metrics import precision_recall_curve
from scipy.stats import rankdata
from scipy.stats import spearmanr

from imports.matrix_helper import MatrixHelper
from imports.svr_helper import SVRHelper

# Helper used further down in this cell to load the '.pickle' matrix files
# through its load_from_parse() method.
matrix_helper = MatrixHelper()

def validate(matrices, validation_matrices):
    """Score the SVR prediction for one pair of matrices.

    A fresh ``SVRHelper`` builds the compare matrix from the two inputs. In
    that matrix column 1 is read as the predicted value and column 2 as the
    actual value of each row.

    Args:
        matrices: passed as first argument to
            ``SVRHelper.calculate_semiannual_compare_matrix``.
        validation_matrices: passed as second argument to the same method.

    Returns:
        tuple: ``(mse, mse_top, spearman)`` where ``mse`` is the mean squared
        error over the sorted compare matrix, ``mse_top`` the mean squared
        error over the top 1% rows returned by the helper, and ``spearman``
        Spearman's correlation coefficient between the predicted and the
        actual ranking of those top rows. ``spearman`` can be ``nan``, e.g.
        when one of the two rankings is constant.
    """
    # Instantiate the SVR helper and predict the values for the compare matrix.
    svr_helper = SVRHelper()
    svr_helper.calculate_semiannual_compare_matrix(matrices, validation_matrices)

    # Mean squared error between the actual vulnerabilities in the validation
    # matrix and the predicted values.
    compare_matrix_sorted = svr_helper.get_compare_matrix_sorted()
    mse = mean_squared_error(np.array(compare_matrix_sorted[:, 2], dtype='f'),
                             np.array(compare_matrix_sorted[:, 1], dtype='f'))

    # Same metric for the top 1% only.
    compare_matrix_top = svr_helper.get_compare_matrix_top()
    mse_top = mean_squared_error(np.array(compare_matrix_top[:, 2], dtype='f'),
                                 np.array(compare_matrix_top[:, 1], dtype='f'))

    # Rank the top 1% (rank 1 = largest value). Cast to float before ranking,
    # as is done for the MSE: on string-typed columns rankdata would sort
    # lexicographically ('10' < '9') and yield a wrong ranking.
    predicted_top = np.asarray(compare_matrix_top[:, 1], dtype=float)
    actual_top = np.asarray(compare_matrix_top[:, 2], dtype=float)
    predicted_ranking = len(predicted_top) - rankdata(predicted_top, method='max') + 1
    actual_ranking = len(actual_top) - rankdata(actual_top, method='max') + 1

    # Spearman's correlation coefficient between the two rankings.
    spearman = spearmanr(predicted_ranking, actual_ranking)[0]

    return mse, mse_top, spearman

# Date of the snapshot every other regression matrix is validated against.
validation_date = '2017-03-09'
# Number of regression matrices evaluated so far.
counter = 0

validation_path = 'data/semiannual/matrix_regression_' + validation_date + '.pickle'

# os.listdir() returns entries in arbitrary order; sorting makes the report
# deterministic (files sharing a prefix are then listed by ascending date).
for filename in sorted(os.listdir("data/semiannual")):
    # Expected layout: <prefix>_regression_<date>.<extension>. Anything that
    # does not split into these three parts (e.g. '.DS_Store') is skipped
    # instead of raising an IndexError.
    name_parts = filename.split('_', 2)
    if len(name_parts) < 3 or name_parts[1] != 'regression':
        continue

    date = name_parts[2].split('.', 1)[0]
    if date == validation_date:
        # The validation snapshot is not evaluated against itself.
        continue

    counter += 1
    matrices = matrix_helper.load_from_parse('data/semiannual/' + filename)
    # Reloaded for every run on purpose: it is not visible here whether the
    # helper modifies the matrices it is given - TODO confirm, then hoist.
    validation_matrices = matrix_helper.load_from_parse(validation_path)
    mse, mse_top, spearman = validate(matrices, validation_matrices)
    print('{} - MSE: {}    MSE Top 1%: {}    Spearman coefficient: {}'.format(date, mse, mse_top, spearman))



2007-03-22 - MSE: 16.6169605255    MSE Top 1%: 0.571428596973    Spearman coefficient: nan
2007-06-21 - MSE: 6.14189815521    MSE Top 1%: 63.7142868042    Spearman coefficient: -0.269283887663
2007-09-23 - MSE: 4.49358654022    MSE Top 1%: 1.66666662693    Spearman coefficient: -0.304870559568
2007-12-18 - MSE: 3.33031463623    MSE Top 1%: 0.904761910439    Spearman coefficient: -0.227616367
2008-03-18 - MSE: 2.58407902718    MSE Top 1%: 0.523809552193    Spearman coefficient: 0.340127571764
2008-06-08 - MSE: 1.81991398335    MSE Top 1%: 1.18181812763    Spearman coefficient: 0.516734155486
2008-09-21 - MSE: 1.73864996433    MSE Top 1%: 8.30303001404    Spearman coefficient: 0.446529865668
2008-12-16 - MSE: 0.851306259632    MSE Top 1%: 6.80555534363    Spearman coefficient: 0.256149697401
2009-03-21 - MSE: 0.779922783375    MSE Top 1%: 2.27777767181    Spearman coefficient: 0.324083349486
2009-06-18 - MSE: 0.659047603607    MSE Top 1%: 3.35135126114    Spearman coefficient: 0.34871327