In [30]:
import pickle

import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix as multiclass_confusion_matrix
from sklearn.metrics import multilabel_confusion_matrix as confusion_matrix
from sklearn.metrics import precision_recall_fscore_support as score

In [31]:
# Load the full labelled dataset; later cells expect a 'label' column and
# treat every other column as a feature.
data = pd.read_csv(filepath_or_buffer='../data/train.csv')

In [32]:
# Hold out 10% of the rows for validation; the remaining 90% form the training set.
# A fixed random_state makes the split identical on every Restart & Run All, so the
# metrics computed further down are reproducible instead of changing on each run.
SPLIT_SEED = 42
train_set = data.sample(frac=0.9, random_state=SPLIT_SEED)
# Feature matrix = every column except the target.
train_set_features = train_set.drop(columns='label')

# Validation rows are exactly the rows that were not sampled into the training set.
validation_set = data.drop(train_set.index)
validation_set_features = validation_set.drop(columns='label')

# NOTE(review): the model used later in this notebook is loaded from a pickle that was
# fit in an earlier session. Unless it was trained on this exact split, some validation
# rows may have been seen during training and the reported scores would be optimistic
# -- confirm, or refit on train_set.

In [33]:
# One-off training step, kept for reference (disabled so the notebook does not refit
# the model on every run):
#
# logreg = LogisticRegression(max_iter=1000000000, multi_class='multinomial', solver='newton-cg').fit(train_set_features, train_set['label'])
#
# with open('model_serialized', 'wb') as ouf:
#     pickle.dump(logreg, ouf)
#
# NOTE(review): the snippet above writes to 'model_serialized', while the load below
# reads '../logreg/model_serialized.file' -- presumably the file was moved/renamed by
# hand; confirm before re-enabling the training step.

# Load the previously fitted classifier from disk.
# SECURITY: pickle.load can execute arbitrary code embedded in the file -- only load
# model files that come from a trusted source.
# No placeholder assignment beforehand: if the file is missing or unreadable the cell
# fails here, rather than leaving a dummy value that breaks later at .predict().
with open('../logreg/model_serialized.file', 'rb') as inpf:
    logreg = pickle.load(inpf)

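**Note:** the model above is restored with `pickle`; see the scikit-learn documentation on the security and maintainability limitations of this approach: https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations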


In [34]:
# Predict a label for every held-out row.
prediction = logreg.predict(validation_set_features)

# Boolean mask: True where the predicted label equals the true label.
result = (validation_set['label'] == prediction).to_numpy()

# Accuracy = number of correct predictions / number of validation rows.
count_total = result.size
count_true = np.count_nonzero(result)
accuracy = count_true / count_total
accuracy

0.9521428571428572

In [147]:
# Put predictions and true labels side by side (positional index, one row per
# validation sample) so specific confusions can be inspected.
combined = pd.DataFrame({
    'prediction': prediction,
    'label': validation_set['label'].values,
})

# Rows where the model predicted 5 but the true label is 0 ...
count1 = combined.loc[(combined['prediction'] == 5) & (combined['label'] == 0)]
# ... and the reverse: predicted 0 while the true label is 5.
count2 = combined.loc[(combined['prediction'] == 0) & (combined['label'] == 5)]

# Note: despite their names, count1/count2 are the matching rows (DataFrames),
# not integer counts.
print("Prediction - count", count1)
print("Count - prediction", count2)

Prediction - count       prediction  label
16             5      0
1137           5      0
1244           5      0
Count - prediction      prediction  label
540           0      5


In [90]:
# Per-class text report. NOTE(review): `report` is built here but never printed or
# displayed in this cell.
report = classification_report(validation_set['label'], prediction)

# Support-weighted averages over all classes. With an `average` argument set,
# scikit-learn documents the returned `support` as None.
precision, recall, fscore, support = score(
    validation_set['label'],
    prediction,
    average='weighted',
)

# One line per metric.
print(
    'Precision : {}'.format(precision),
    'Recall    : {}'.format(recall),
    'F-score   : {}'.format(fscore),
    sep='\n',
)

Precision : 0.9521465149457015
Recall    : 0.9521428571428572
F-score   : 0.9521130646329956


In [38]:
# Single multiclass confusion matrix: rows are true labels, columns are predicted labels.
# In this notebook the name `confusion_matrix` is an alias for
# sklearn.metrics.multilabel_confusion_matrix, which returns one 2x2 matrix per class
# (shape (n_classes, 2, 2)) rather than the n_classes x n_classes table wanted here,
# so the multiclass function is called through its own alias.
matrix = multiclass_confusion_matrix(validation_set['label'], prediction)
matrix

array([[414,   0,   0,   0,   0,   3,   1,   0,   3,   1],
       [  0, 522,   1,   1,   0,   0,   0,   1,   1,   0],
       [  1,   4, 417,   6,   5,   0,   1,   6,   6,   1],
       [  1,   0,   7, 376,   0,   7,   0,   3,   5,   5],
       [  0,   1,   2,   0, 404,   0,   1,   2,   3,   7],
       [  1,   1,   1,   6,   2, 331,   3,   0,   7,   4],
       [  1,   0,   4,   1,   1,   4, 365,   0,   0,   0],
       [  1,   1,   3,   3,   4,   0,   1, 414,   1,  11],
       [  1,   5,   2,  11,   0,   6,   1,   0, 363,   3],
       [  2,   0,   1,   2,   7,   0,   0,  12,   1, 393]])