# Bestimmung Top10 durch Regression mit SVR


In [None]:
import time
import matplotlib.pyplot as plt
import numpy as np
import cPickle as pickle

from sklearn import svm, datasets
from sklearn.metrics import precision_recall_curve

# Load the pickled sparse representation of the feature matrix.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only use this with a data file from a trusted source.
data = None
with open('data/feature_matrix_sparse.pickle', 'rb') as f:
    sparse = pickle.load(f)

# Layout as it is used below: sparse[0] yields (row, column) index pairs of
# the cells to set, sparse[1] is written into the last column, and the
# lengths of sparse[2] / sparse[3] give the number of rows / feature columns.
rows, columns = sparse[2], sparse[3]

# Dense matrix with one column per feature plus one trailing column that is
# later read as the target.
# NOTE(review): dtype is uint8 - confirm the values in sparse[1] fit 0..255.
data = np.zeros((len(rows), len(columns) + 1), dtype=np.uint8)
data[:, -1:] = sparse[1]
for i, j in sparse[0]:
    data[i, j] = 1

# Only the first 1000 rows are used from here on.
data = data[:1000]

samples = len(data)
features = data.shape[1] - 1

# Create separate matrices for vulnerable and not vulnerable components.
# The last column of `data` is the label: non-zero means vulnerable.
vulnerable = data[data[:, -1] != 0]
not_vulnerable = data[data[:, -1] == 0]

n_vulnerable = vulnerable.shape[0]
n_not_vulnerable = not_vulnerable.shape[0]

print("Total Anzahl Samples: {}".format(samples))
print("Anzahl verwundbare Samples: {}".format(n_vulnerable))
print("Anzahl nicht verwundbare Samples: {}".format(n_not_vulnerable))

# Split each class into a training part (2/3) and a test part (1/3).
# A single random permutation per class is cut into two disjoint slices, so
# no row can end up in both the training and the test set (drawing the two
# parts with independent random choices would let them overlap and leak
# training rows into the evaluation).
# Floor division keeps the sizes integral on both Python 2 and Python 3.
n_not_vulnerable_training = n_not_vulnerable * 2 // 3
n_not_vulnerable_test = n_not_vulnerable // 3
not_vulnerable_order = np.random.permutation(n_not_vulnerable)
not_vulnerable_training = not_vulnerable[
    not_vulnerable_order[:n_not_vulnerable_training], :]
not_vulnerable_test = not_vulnerable[
    not_vulnerable_order[n_not_vulnerable_training:
                         n_not_vulnerable_training + n_not_vulnerable_test], :]

n_vulnerable_training = n_vulnerable * 2 // 3
n_vulnerable_test = n_vulnerable // 3
vulnerable_order = np.random.permutation(n_vulnerable)
vulnerable_training = vulnerable[vulnerable_order[:n_vulnerable_training], :]
vulnerable_test = vulnerable[
    vulnerable_order[n_vulnerable_training:
                     n_vulnerable_training + n_vulnerable_test], :]

# Concatenate vulnerable and not vulnerable matrices
training_set = np.concatenate((not_vulnerable_training, vulnerable_training), axis=0)
test_set = np.concatenate((not_vulnerable_test, vulnerable_test), axis=0)

# Split sets into data (all feature columns) and target (last column).
# Labels greater than 1 are collapsed to 1 so the target is binary; the
# targets are views, so this also updates the last column of the sets.
data_train = training_set[:, :features]
target_train = training_set[:, features]
target_train[target_train > 1] = 1

data_test = test_set[:, :features]
target_test = test_set[:, features]
target_test[target_test > 1] = 1

# Time the training, prediction and metric computation.
start = time.time()

# Create a linear support vector classifier and fit it on the training set.
clf = svm.SVC(kernel='linear', C=0.2)
clf.fit(data_train, target_train)

# Predict the labels of the held-out test data.
target_prediction = clf.predict(data_test)

# Share (in percent) of positive labels in the test targets and in the
# predicted labels.
vulnerable_target_test = (target_test[target_test > 0]).size * 100.0 / target_test.size
vulnerable_target_prediction = (target_prediction[target_prediction > 0]).size * 100.0 / target_prediction.size
print("------------------------------")
print("Prozentsatz verwundbare Komponenten im Testset: {0:10.1f}%".format(vulnerable_target_test))
print("Prozentsatz verwundbare Komponenten im Prediction Vektor: {0:10.1f}%".format(vulnerable_target_prediction))
print("------------------------------")

# Compute Precision-Recall
# NOTE(review): target_prediction holds hard class labels, not scores, so
# the curve has only a few points; index 1 is presumably the point for the
# positive-label threshold - confirm, or pass clf.decision_function scores.
precision, recall, thresholds = precision_recall_curve(target_test, target_prediction)

end = time.time()
elapsed = (end - start) / 60  # seconds -> minutes

# Print Precision and Recall. The second placeholder must reference
# argument 1; using index 0 twice printed the precision value as recall.
print('Precision {0:10.3f}, Recall {1:10.3f}'.format(precision[1], recall[1]))
print('time: {0:10.1f}min'.format(elapsed))
# Draw the precision-recall curve on a cleared figure and display it.
plt.clf()
plt.plot(
    recall,
    precision,
    color='navy',
    lw=2,
    label='Precision-Recall curve',
)
plt.title('Precision-Recall')
plt.xlabel('Recall')
plt.ylabel('Precision')
# Fixed axis ranges; the y axis gets a little headroom above 1.0.
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.legend(loc="lower left")
plt.show()

