In [18]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error
from sklearn.metrics import precision_recall_curve
from scipy.stats import rankdata
from scipy.stats import spearmanr

from imports.matrix_helper import MatrixHelper
from imports.prediction_helper import PredictionHelper

# get feature matrices and row names (component names)
matrix_helper = MatrixHelper()
matrices = matrix_helper.load_from_parse('data/semiannual/matrix_regression_2009-03-21.pickle')
validation_matrices = matrix_helper.load_from_parse('data/semiannual/matrix_regression_2009-06-18.pickle')

# instantiate Prediction Helper Class and predict values for compare matrix with regression
prediction_helper = PredictionHelper()
prediction_helper.calculate_semiannual_compare_matrix(matrices, validation_matrices)

# Calculate the mean square error between the actual vulnerabilities in the validation matrix and the predicted values
compare_matrix_sorted = prediction_helper.get_compare_matrix_sorted()
mse = mean_squared_error(np.array(compare_matrix_sorted[:,2], dtype='f'), np.array(compare_matrix_sorted[:,1], dtype='f'))
print('Mean square error: {}'.format(mse))

# Get the Top 1% and calcualte again the mean square error
compare_matrix_top = prediction_helper.get_compare_matrix_top()
mse_top = mean_squared_error(np.array(compare_matrix_top[:,2], dtype='f'), np.array(compare_matrix_top[:,1], dtype='f'))
print('Mean square error with top 1%: {}'.format(mse_top))

# Get the Top 1% and calculate the spearman correlation coefficient
predicted_ranking = len(compare_matrix_top[:, 1]) - rankdata(compare_matrix_top[:, 1], method='max') + 1
actual_ranking = len(compare_matrix_top[:, 2]) - rankdata(compare_matrix_top[:, 2], method='max') + 1

# Calculate Spearman's correlation coefficient
coefficient = spearmanr(predicted_ranking, actual_ranking)
print('Spearman correlation coefficient: {}'.format(coefficient[0]))


Mean square error: 0.0312107615173
Mean square error with top 1%: 0.589285731316
Spearman correlation coefficient: nan


## Regression with multiple matrices

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import os

from sklearn.metrics import mean_squared_error
from sklearn.metrics import precision_recall_curve
from scipy.stats import rankdata
from scipy.stats import spearmanr

from imports.matrix_helper import MatrixHelper
from imports.prediction_helper import PredictionHelper

matrix_helper = MatrixHelper()

def validate(matrices, validation_matrices):
    # instantiate SVR Helper Class and predict values for compare matrix
    prediction_helper = PredictionHelper()
    prediction_helper.calculate_semiannual_compare_matrix(matrices, validation_matrices)

    # Calculate the mean square error between the actual vulnerabilities in the validation matrix and the predicted values
    compare_matrix_sorted = prediction_helper.get_compare_matrix_sorted()
    mse = mean_squared_error(np.array(compare_matrix_sorted[:,2], dtype='f'), np.array(compare_matrix_sorted[:,1], dtype='f'))

    # Get the Top 1% and calcualte again the mean square error
    compare_matrix_top = prediction_helper.get_compare_matrix_top()
    mse_top = mean_squared_error(np.array(compare_matrix_top[:,2], dtype='f'), np.array(compare_matrix_top[:,1], dtype='f'))


    # Get the Top 1% and calculate the spearman correlation coefficient
    predicted_ranking = len(compare_matrix_top[:, 1]) - rankdata(compare_matrix_top[:, 1], method='max') + 1
    actual_ranking = len(compare_matrix_top[:, 2]) - rankdata(compare_matrix_top[:, 2], method='max') + 1

    # Calculate Spearman's correlation coefficient
    spearman = spearmanr(predicted_ranking, actual_ranking)[0]

    return mse, mse_top, spearman

counter = 0
quarter_between_matrices = 12

import os
all_files = os.listdir("data/semiannual")
file_list = [file for file in all_files if file != '.DS_Store' and file.split('_', 2)[1] == 'regression']
for i in range(len(file_list) - quarter_between_matrices):
    file = file_list[i]
    file_date = file.split('_', 2)[2].split('.', 1)[0]
    validaiton_file = file_list[i+quarter_between_matrices]
    validation_date = validaiton_file.split('_', 2)[2].split('.', 1)[0]
    if (counter > -1):
        counter += 1
        matrices = matrix_helper.load_from_parse('data/semiannual/' + file)
        validation_matrices = matrix_helper.load_from_parse('data/semiannual/' + validaiton_file)
        mse, mse_top, spearman = validate(matrices, validation_matrices)
        print('* {} ({}) vs {} ({}) Spearman coefficient: {}'.format(file_date, matrices[0].shape[0], validation_date, validation_matrices[0].shape[0], spearman))



  c /= stddev[:, None]
  c /= stddev[None, :]
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


* 2007-03-22 (5551) vs 2010-03-21 (7149) Spearman coefficient: nan
* 2007-06-21 (5471) vs 2010-06-19 (7198) Spearman coefficient: -0.143822784148
* 2007-09-23 (5346) vs 2010-09-17 (7658) Spearman coefficient: 0.174336581738
* 2007-12-18 (5378) vs 2010-12-16 (7723) Spearman coefficient: -0.121566134771
* 2008-03-18 (5341) vs 2011-03-17 (7930) Spearman coefficient: 0.308307959237
* 2008-06-08 (6607) vs 2011-06-16 (7862) Spearman coefficient: 0.376537562425
* 2008-09-21 (6740) vs 2011-09-15 (7919) Spearman coefficient: 0.424232581842
* 2008-12-16 (6920) vs 2011-12-15 (12478) Spearman coefficient: 0.132177087552
* 2009-03-21 (6982) vs 2012-03-15 (8820) Spearman coefficient: 0.337923510331
* 2009-06-18 (6230) vs 2012-06-13 (10944) Spearman coefficient: 0.304035979924
* 2009-09-20 (6592) vs 2012-09-10 (11459) Spearman coefficient: 0.39663972685
* 2009-12-19 (6947) vs 2012-12-13 (11532) Spearman coefficient: 0.358093535641
* 2010-03-21 (7149) vs 2013-03-14 (12441) Spearman coefficient: 0.3682

#### Vergleichsmatrix: 2017-03-09
* 2007-03-22 - MSE: 16.6169605255    MSE Top 1%: 0.571428596973    Spearman coefficient: nan
* 2007-06-21 - MSE: 6.14189815521    MSE Top 1%: 63.7142868042    Spearman coefficient: -0.269283887663
* 2007-09-23 - MSE: 4.49358654022    MSE Top 1%: 1.66666662693    Spearman coefficient: -0.304870559568
* 2007-12-18 - MSE: 3.33031463623    MSE Top 1%: 0.904761910439    Spearman coefficient: -0.227616367
* 2008-03-18 - MSE: 2.58407902718    MSE Top 1%: 0.523809552193    Spearman coefficient: 0.340127571764
* 2008-06-08 - MSE: 1.81991398335    MSE Top 1%: 1.18181812763    Spearman coefficient: 0.516734155486
* 2008-09-21 - MSE: 1.73864996433    MSE Top 1%: 8.30303001404    Spearman coefficient: 0.446529865668
* 2008-12-16 - MSE: 0.851306259632    MSE Top 1%: 6.80555534363    Spearman coefficient: 0.256149697401
* 2009-03-21 - MSE: 0.779922783375    MSE Top 1%: 2.27777767181    Spearman coefficient: 0.324083349486
* 2009-06-18 - MSE: 0.659047603607    MSE Top 1%: 3.35135126114    Spearman coefficient: 0.348713273132
* 2009-09-20 - MSE: 1.3887155056    MSE Top 1%: 3.34210515022    Spearman coefficient: 0.400771139843
* 2009-12-19 - MSE: 1.30761599541    MSE Top 1%: 1.64999997616    Spearman coefficient: 0.317779115744
* 2010-03-21 - MSE: 1.24105238914    MSE Top 1%: 4.09523820877    Spearman coefficient: 0.50689649831
* 2010-06-19 - MSE: 0.626625657082    MSE Top 1%: 4.14285707474    Spearman coefficient: 0.441332688592
* 2010-09-17 - MSE: 0.827854275703    MSE Top 1%: 5.19999980927    Spearman coefficient: 0.427916403307
* 2010-12-16 - MSE: 1.10468947887    MSE Top 1%: 11.4489793777    Spearman coefficient: 0.29751387137
* 2011-03-17 - MSE: 0.865600705147    MSE Top 1%: 12.1086959839    Spearman coefficient: 0.403132544941
* 2011-06-16 - MSE: 1.01645922661    MSE Top 1%: 25.148935318    Spearman coefficient: 0.214136714542
* 2011-09-15 - MSE: 0.955091059208    MSE Top 1%: 3.3125    Spearman coefficient: 0.32377738857
* 2011-12-15 - MSE: 0.799456179142    MSE Top 1%: 1.63636362553    Spearman coefficient: 0.261762592967
* 2012-03-15 - MSE: 0.761833190918    MSE Top 1%: 3.13207554817    Spearman coefficient: 0.0946937460952
* 2012-06-13 - MSE: 0.635879218578    MSE Top 1%: 5.01470565796    Spearman coefficient: 0.200205752153
* 2012-09-10 - MSE: 0.622252941132    MSE Top 1%: 5.0    Spearman coefficient: 0.227661128493
* 2012-12-13 - MSE: 0.453928768635    MSE Top 1%: 3.81690144539    Spearman coefficient: 0.0818621627868
* 2013-03-14 - MSE: 0.29224255681    MSE Top 1%: 3.58750009537    Spearman coefficient: 0.235395693055
* 2013-06-13 - MSE: 0.25324434042    MSE Top 1%: 3.96296286583    Spearman coefficient: 0.255617222435
* 2013-09-15 - MSE: 0.270454078913    MSE Top 1%: 5.26041650772    Spearman coefficient: 0.344048389613
* 2013-12-08 - MSE: 0.190508171916    MSE Top 1%: 3.67415738106    Spearman coefficient: 0.214832711148
* 2014-03-17 - MSE: 0.212228044868    MSE Top 1%: 4.1414141655    Spearman coefficient: 0.143983125318
* 2014-06-12 - MSE: 0.172789111733    MSE Top 1%: 4.76363658905    Spearman coefficient: 0.132311001731
* 2014-12-14 - MSE: 0.119469024241    MSE Top 1%: 5.79411745071    Spearman coefficient: 0.210280158232
* 2015-03-01 - MSE: 0.113073058426    MSE Top 1%: 4.89430904388    Spearman coefficient: -0.0099652527642

#### Unterschied 4 Quartale
* 2007-03-22 (5551) vs 2008-03-18 (5341) Spearman coefficient: nan
* 2007-06-21 (5471) vs 2008-06-08 (6607) Spearman coefficient: nan
* 2007-09-23 (5346) vs 2008-09-21 (6740) Spearman coefficient: nan
* 2007-12-18 (5378) vs 2008-12-16 (6920) Spearman coefficient: nan
* 2008-03-18 (5341) vs 2009-03-21 (6982) Spearman coefficient: nan
* 2008-06-08 (6607) vs 2009-06-18 (6230) Spearman coefficient: -0.0237886551472
* 2008-09-21 (6740) vs 2009-09-20 (6592) Spearman coefficient: 0.341068023362
* 2008-12-16 (6920) vs 2009-12-19 (6947) Spearman coefficient: 0.176581069399
* 2009-03-21 (6982) vs 2010-03-21 (7149) Spearman coefficient: 0.321305487745
* 2009-06-18 (6230) vs 2010-06-19 (7198) Spearman coefficient: 0.371906740783
* 2009-09-20 (6592) vs 2010-09-17 (7658) Spearman coefficient: 0.246986191212
* 2009-12-19 (6947) vs 2010-12-16 (7723) Spearman coefficient: 0.264598047602
* 2010-03-21 (7149) vs 2011-03-17 (7930) Spearman coefficient: 0.0836827408756
* 2010-06-19 (7198) vs 2011-06-16 (7862) Spearman coefficient: 0.0233844693808
* 2010-09-17 (7658) vs 2011-09-15 (7919) Spearman coefficient: -0.0166406729168
* 2010-12-16 (7723) vs 2011-12-15 (12478) Spearman coefficient: -0.0162660757708
* 2011-03-17 (7930) vs 2012-03-15 (8820) Spearman coefficient: 0.112112238087
* 2011-06-16 (7862) vs 2012-06-13 (10944) Spearman coefficient: 0.0261534240017
* 2011-09-15 (7919) vs 2012-09-10 (11459) Spearman coefficient: 0.181289600146
* 2011-12-15 (12478) vs 2012-12-13 (11532) Spearman coefficient: -0.0930569703931
* 2012-03-15 (8820) vs 2013-03-14 (12441) Spearman coefficient: 0.0730079020485
* 2012-06-13 (10944) vs 2013-06-13 (12545) Spearman coefficient: 0.00309175242241
* 2012-09-10 (11459) vs 2013-09-15 (14196) Spearman coefficient: 0.12467663326
* 2012-12-13 (11532) vs 2013-12-08 (13791) Spearman coefficient: 0.047915889141
* 2013-03-14 (12441) vs 2014-03-17 (14508) Spearman coefficient: 0.161089876151
* 2013-06-13 (12545) vs 2014-06-12 (14885) Spearman coefficient: 0.118695931243
* 2013-09-15 (14196) vs 2014-12-14 (14409) Spearman coefficient: 0.378763855903
* 2013-12-08 (13791) vs 2015-03-01 (16259) Spearman coefficient: 0.217282584831
* 2014-03-17 (14508) vs 2015-05-30 (15408) Spearman coefficient: 0.208492045064
* 2014-06-12 (14885) vs 2015-09-07 (15484) Spearman coefficient: 0.248574596363
* 2014-12-14 (14409) vs 2016-03-13 (16623) Spearman coefficient: 0.137193705721
* 2015-03-01 (16259) vs 2016-05-29 (17098) Spearman coefficient: -0.088617968345
* 2015-05-30 (15408) vs 2016-09-12 (17564) Spearman coefficient: 0.0847080843604
* 2015-09-07 (15484) vs 2016-12-08 (17149) Spearman coefficient: 0.198274657187
* 2016-03-13 (16623) vs 2017-03-09 (17699) Spearman coefficient: 0.105757592967

#### Unterschied 8 Quartal
* 2007-03-22 (5551) vs 2009-03-21 (6982) Spearman coefficient: nan
* 2007-06-21 (5471) vs 2009-06-18 (6230) Spearman coefficient: 0.226455406829
* 2007-09-23 (5346) vs 2009-09-20 (6592) Spearman coefficient: 0.23586883112
* 2007-12-18 (5378) vs 2009-12-19 (6947) Spearman coefficient: -0.0426216963123
* 2008-03-18 (5341) vs 2010-03-21 (7149) Spearman coefficient: 0.229761500056
* 2008-06-08 (6607) vs 2010-06-19 (7198) Spearman coefficient: 0.283945919791
* 2008-09-21 (6740) vs 2010-09-17 (7658) Spearman coefficient: 0.550249955677
* 2008-12-16 (6920) vs 2010-12-16 (7723) Spearman coefficient: 0.302941930704
* 2009-03-21 (6982) vs 2011-03-17 (7930) Spearman coefficient: 0.225550714782
* 2009-06-18 (6230) vs 2011-06-16 (7862) Spearman coefficient: 0.24517104955
* 2009-09-20 (6592) vs 2011-09-15 (7919) Spearman coefficient: 0.367647058824
* 2009-12-19 (6947) vs 2011-12-15 (12478) Spearman coefficient: 0.200771047715
* 2010-03-21 (7149) vs 2012-03-15 (8820) Spearman coefficient: 0.229430529202
* 2010-06-19 (7198) vs 2012-06-13 (10944) Spearman coefficient: 0.25883743424
* 2010-09-17 (7658) vs 2012-09-10 (11459) Spearman coefficient: 0.00252601176301
* 2010-12-16 (7723) vs 2012-12-13 (11532) Spearman coefficient: 0.0614293553167
* 2011-03-17 (7930) vs 2013-03-14 (12441) Spearman coefficient: 0.230079962031
* 2011-06-16 (7862) vs 2013-06-13 (12545) Spearman coefficient: 0.187210379586
* 2011-09-15 (7919) vs 2013-09-15 (14196) Spearman coefficient: 0.319824548994
* 2011-12-15 (12478) vs 2013-12-08 (13791) Spearman coefficient: 0.21938323275
* 2012-03-15 (8820) vs 2014-03-17 (14508) Spearman coefficient: 0.166362566662
* 2012-06-13 (10944) vs 2014-06-12 (14885) Spearman coefficient: 0.146758936632
* 2012-09-10 (11459) vs 2014-12-14 (14409) Spearman coefficient: 0.261201065282
* 2012-12-13 (11532) vs 2015-03-01 (16259) Spearman coefficient: 0.0276948702547
* 2013-03-14 (12441) vs 2015-05-30 (15408) Spearman coefficient: 0.121260629582
* 2013-06-13 (12545) vs 2015-09-07 (15484) Spearman coefficient: 0.0873979970998
* 2013-09-15 (14196) vs 2016-03-13 (16623) Spearman coefficient: 0.265951180656
* 2013-12-08 (13791) vs 2016-05-29 (17098) Spearman coefficient: 0.126587665936
* 2014-03-17 (14508) vs 2016-09-12 (17564) Spearman coefficient: 0.0319245193656
* 2014-06-12 (14885) vs 2016-12-08 (17149) Spearman coefficient: 0.192377788064
* 2014-12-14 (14409) vs 2017-03-09 (17699) Spearman coefficient: 0.210280158232