In [11]:
import numpy as np
import matplotlib.pyplot as plt
import os

from sklearn.metrics import mean_squared_error
from sklearn.metrics import precision_recall_curve
from scipy.stats import rankdata
from scipy.stats import spearmanr

from imports.matrix_helper import MatrixHelper
from imports.prediction_helper import PredictionHelper

matrix_helper = MatrixHelper()

def validate(matrices, validation_matrices):
    """Compare the top 1% of predicted samples with the top 1% of actual ones.

    A ``PredictionHelper`` builds a compare matrix from ``matrices`` and
    ``validation_matrices``. That matrix is fetched twice: once in the
    helper's default order (by the PREDICTED number of vulnerabilities) and
    once ordered by column 2 (the ACTUAL number of vulnerabilities). Only the
    first 1% of rows of each ordering is kept, and the actual vulnerability
    counts (column 2) of those rows are summed up.

    Returns the prediction factor: the sum obtained from the prediction-ordered
    rows divided by the sum obtained from the actual-ordered rows.

    NOTE(review): presumably both orderings are descending, so a factor of
    1.0 would mean a perfect ranking -- confirm against PredictionHelper.
    """
    helper = PredictionHelper()
    helper.calculate_semiannual_compare_matrix(matrices, validation_matrices)

    ranked_by_prediction = helper.get_compare_matrix_sorted()
    ranked_by_actual = helper.get_compare_matrix_sorted(reference_column=2)

    # Number of rows that make up the first 1% of the compare matrix.
    top_count = int(round(0.01 * len(ranked_by_prediction)))

    def crop(ranked):
        # Keep only the leading ``top_count`` rows of an ordering.
        return ranked[range(top_count)]

    def actual_total(rows):
        # Column 2 holds the actual number of vulnerabilities; the values
        # are converted to single-precision floats before being added up.
        return sum(np.array(rows[:, 2], dtype='f'))

    top_predicted = crop(ranked_by_prediction)
    top_actual = crop(ranked_by_actual)

    hits_in_predicted = actual_total(top_predicted)
    hits_in_actual = actual_total(top_actual)

    return hits_in_predicted / hits_in_actual

# Driver script: pairs every regression matrix file in data/semiannual with the
# file `quarter_between_matrices` positions later in chronological order,
# and prints the prediction factor returned by validate() for each pair.

counter = 0  # number of file pairs processed so far
quarter_between_matrices = 12  # offset (in files) between a matrix and its validation matrix

import os


def _is_regression_file(file_name):
    """Return True for names shaped like '<prefix>_regression_<date>.<ext>'."""
    # Requiring three parts also skips entries such as '.DS_Store' and any
    # other name without enough underscores, which previously raised an
    # IndexError while filtering.
    parts = file_name.split('_', 2)
    return len(parts) == 3 and parts[1] == 'regression'


def _date_from_file_name(file_name):
    """Return the date portion (text after the second '_' up to the first '.')."""
    return file_name.split('_', 2)[2].split('.', 1)[0]


data_dir = "data/semiannual"
all_files = os.listdir(data_dir)

# os.listdir() returns entries in arbitrary order, but the pairing below only
# makes sense on a chronologically ordered list. The dates embedded in the
# file names are ISO formatted (YYYY-MM-DD), so sorting them as strings
# yields chronological order.
file_list = sorted(
    [name for name in all_files if _is_regression_file(name)],
    key=_date_from_file_name
)

for file_name, validation_file in zip(file_list, file_list[quarter_between_matrices:]):
    file_date = _date_from_file_name(file_name)
    validation_date = _date_from_file_name(validation_file)

    counter += 1
    matrices = matrix_helper.load_from_parse(os.path.join(data_dir, file_name))
    validation_matrices = matrix_helper.load_from_parse(os.path.join(data_dir, validation_file))

    # The last column of each sample row is its vulnerability count. A
    # dedicated name (`row`) is used so no outer loop variable is clobbered
    # when this runs under Python 2, where comprehension variables leak.
    vulnerable_samples_count = sum(1 for row in matrices[0] if row[-1] > 0)
    vulnerable_validation_samples_count = sum(1 for row in validation_matrices[0] if row[-1] > 0)

    # Without vulnerable samples on both sides there is nothing to compare.
    if vulnerable_samples_count > 0 and vulnerable_validation_samples_count > 0:
        factor = validate(matrices, validation_matrices)
        print('* {} ({}) vs {} ({}) Koeffizient: {}'.format(file_date, matrices[0].shape[0], validation_date, validation_matrices[0].shape[0], factor))




* 2007-06-21 (5471) vs 2010-06-19 (7198) Koeffizient: 0.0414201183432
* 2007-09-23 (5346) vs 2010-09-17 (7658) Koeffizient: 0.0277777777778
* 2007-12-18 (5378) vs 2010-12-16 (7723) Koeffizient: 0.0327868852459
* 2008-03-18 (5341) vs 2011-03-17 (7930) Koeffizient: 0.031746031746
* 2008-06-08 (6607) vs 2011-06-16 (7862) Koeffizient: 0.0382165605096
* 2008-09-21 (6740) vs 2011-09-15 (7919) Koeffizient: 0.031914893617
* 2008-12-16 (6920) vs 2011-12-15 (12478) Koeffizient: 0.043795620438
* 2009-03-21 (6982) vs 2012-03-15 (8820) Koeffizient: 0.0
* 2009-06-18 (6230) vs 2012-06-13 (10944) Koeffizient: 0.0238095238095
* 2009-09-20 (6592) vs 2012-09-10 (11459) Koeffizient: 0.12865497076
* 2009-12-19 (6947) vs 2012-12-13 (11532) Koeffizient: 0.14880952381
* 2010-03-21 (7149) vs 2013-03-14 (12441) Koeffizient: 0.0104712041885
* 2010-06-19 (7198) vs 2013-06-13 (12545) Koeffizient: 0.0126582278481
* 2010-09-17 (7658) vs 2013-09-15 (14196) Koeffizient: 0.0266666666667
* 2010-12-16 (7723) vs 2013-12-0