In [32]:
import pandas as pd
import numpy as np
from hazem_doc2vec.helper_functions import in_pickle, out_pickle
import data_analysis.preprocessor_end as pre
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import seaborn as sn


In [27]:
def precision(tp, fp):
    """Return the precision, tp / (tp + fp).

    Parameters
    ----------
    tp : number of true positives.
    fp : number of false positives.

    Returns
    -------
    The fraction of positive predictions that are correct, or 0.0 when no
    positive prediction was made (tp + fp == 0) instead of dividing by zero.
    """
    predicted_positive = tp + fp
    if predicted_positive == 0:
        return 0.0
    return tp / predicted_positive
def recall(tp, fn):
    """Return the recall, tp / (tp + fn).

    Parameters
    ----------
    tp : number of true positives.
    fn : number of false negatives.

    Returns
    -------
    The fraction of actual positives that were found, or 0.0 when there are
    no actual positives (tp + fn == 0) instead of dividing by zero.
    """
    actual_positive = tp + fn
    if actual_positive == 0:
        return 0.0
    return tp / actual_positive
def f1_score(precision, recall):
    """Return the F1 score, the harmonic mean of precision and recall.

    Parameters
    ----------
    precision : precision value.
    recall : recall value.

    Returns
    -------
    2 * (precision * recall) / (precision + recall), or 0.0 when both inputs
    are zero (precision + recall == 0) instead of dividing by zero.
    """
    total = precision + recall
    if total == 0:
        return 0.0
    return 2 * ((precision * recall) / total)
def accuracy(tp, fp, tn, fn):
    """Return the accuracy, (tp + tn) / (tp + fp + fn + tn).

    Parameters
    ----------
    tp : number of true positives.
    fp : number of false positives.
    tn : number of true negatives.
    fn : number of false negatives.

    Returns
    -------
    The fraction of all samples that were classified correctly, or 0.0 when
    there are no samples at all instead of dividing by zero.
    """
    total = tp + fp + fn + tn
    if total == 0:
        return 0.0
    return (tp + tn) / total

In [3]:
%%time
X = in_pickle('data/X')     # Corpus "numpy vectors"
Y = in_pickle('data/Y')     # Corpus "numpy vector"

# probability=True is deliberately not set: it makes every fit run an extra
# internal 5-fold cross-validation to calibrate probabilities, and neither
# cross_val_score nor cross_val_predict (with its default method='predict')
# uses probability estimates.
clf = SVC(C=10, gamma=0.001)

# 10-fold cross-validation on all available cores; report the mean accuracy
# with a +/- two-standard-deviation spread across the folds.
scores = cross_val_score(clf, X, Y, cv=10, n_jobs=-1)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))


Accuracy: 0.72 (+/- 0.04)
CPU times: user 93 ms, sys: 43.2 ms, total: 136 ms
Wall time: 7min 38s


In [5]:
%%time
# Out-of-fold predictions: each sample is predicted by the model of the fold
# in which that sample was held out.
y_pred = cross_val_predict(estimator=clf, X=X, y=Y, cv=10, n_jobs=-1)

# Rows are the actual classes, columns the predicted classes.
conf_mat = confusion_matrix(y_true=Y, y_pred=y_pred)
print(conf_mat)


[[4463  258]
 [1495   78]]
CPU times: user 75.7 ms, sys: 17.8 ms, total: 93.5 ms
Wall time: 7min 49s


In [29]:
%%time
tn, fp, fn, tp = confusion_matrix(Y, y_pred).ravel()
precision = precision(tp, fp)
recall = recall(tp, fn)
f1_score = f1_score(precision, recall)
accuracy = accuracy(tp, fp, tn, fn)

CPU times: user 10.9 ms, sys: 573 Âµs, total: 11.4 ms
Wall time: 9.85 ms


In [11]:
(tn, fp, fn, tp)

(4463, 258, 1495, 78)

In [12]:
(precision, recall, f1_score, accuracy)


(0.23214285714285715,
 0.049586776859504134,
 0.08171817705605029,
 0.7214807753415952)

In [30]:
# Per-class precision / recall / F1 computed from the out-of-fold predictions.
print(classification_report(y_true=Y, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.75      0.95      0.84      4721
           1       0.23      0.05      0.08      1573

   micro avg       0.72      0.72      0.72      6294
   macro avg       0.49      0.50      0.46      6294
weighted avg       0.62      0.72      0.65      6294



In [35]:
# Draw the confusion matrix as an annotated heatmap.
df_cm = pd.DataFrame(conf_mat)
sn.set(font_scale=1.4)  # label size; set before the axes are created
# fmt='d' prints the cell counts as plain integers. seaborn's default
# fmt='.2g' would show them in scientific notation (e.g. 4463 -> 4.5e+03).
# Assumes conf_mat holds integer counts, as returned by confusion_matrix.
ax = sn.heatmap(df_cm, annot=True, fmt='d', annot_kws={"size": 16})  # annotation font size
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()


In [37]:
# Save the plot
# Use one explicit figure/axes pair. Calling plt.figure() and then
# plt.matshow() would open two figures (matshow creates its own), leaving
# `fig` bound to an empty one.
fig, ax = plt.subplots()
im = ax.matshow(conf_mat)
fig.colorbar(im, ax=ax)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
fig.savefig('approach_1.png')