In [1]:
import pandas as pd
import pylab as pl
import numpy as np
import scipy.optimize as opt
from sklearn import preprocessing
%matplotlib inline 
import matplotlib.pyplot as plt

In [2]:
# Load the simulator-generated metrics into a DataFrame and preview the first rows.
# NOTE(review): the preview shows 'Unnamed: 0' / 'Unnamed: 0.1' columns, which look
# like index columns written out by earlier CSV exports -- confirm and drop if unneeded.
data = pd.read_csv('./data/metrics.csv', sep=",")
data.head(5)

Unnamed: 0.1,Unnamed: 0,Timestamp,ID,Temperature(celsius),Target_Temperature(celsius),Power,PowerConsumption,ContentType,O2,CO2,Time_Door_Open,Maintenance_Required,Defrost_Cycle
0,0,2019-09-16 23:41:28.424611,101,1.4484,4,17.022871,20.790347,2,21.036842,2.311549,31.458286,0,6
1,1,2019-09-16 23:56:28.424611,101,3.429018,4,3.592812,24.38316,2,19.376279,8.208501,25.424173,0,4
2,2,2019-09-17 00:11:28.424611,101,2.128647,4,19.358125,43.741285,2,20.905647,12.769884,32.355962,0,1
3,3,2019-09-17 00:26:28.424611,101,7.606698,4,8.97703,52.718315,2,25.174083,3.584314,28.840481,0,4
4,4,2019-09-17 00:41:28.424611,101,3.870544,4,6.738919,59.457234,2,15.753765,-3.815855,30.820754,0,3


In [3]:
# Build the feature matrix (independent variables) as a NumPy array,
# one column per selected sensor/metric field.
X = data[[
    'Temperature(celsius)',
    'Target_Temperature(celsius)',
    'Power',
    'PowerConsumption',
    'ContentType',
    'O2',
    'CO2',
    'Time_Door_Open',
    'Defrost_Cycle',
]].to_numpy()
X[:5]

array([[ 1.44839974,  4.        , 17.02287069, 20.79034749,  2.        ,
        21.03684175,  2.3115489 , 31.45828622,  6.        ],
       [ 3.42901817,  4.        ,  3.59281214, 24.38315962,  2.        ,
        19.37627866,  8.20850067, 25.42417332,  4.        ],
       [ 2.1286465 ,  4.        , 19.35812534, 43.74128497,  2.        ,
        20.90564696, 12.76988397, 32.35596162,  1.        ],
       [ 7.60669751,  4.        ,  8.97703011, 52.71831508,  2.        ,
        25.17408283,  3.58431369, 28.84048139,  4.        ],
       [ 3.87054371,  4.        ,  6.73891933, 59.45723441,  2.        ,
        15.75376503, -3.81585455, 30.82075413,  3.        ]])

In [4]:
# Build the target vector (dependent variable): the maintenance-required flag
y = data['Maintenance_Required'].to_numpy()
y[:5]

array([0, 0, 0, 0, 0])

In [5]:
# Standardize every feature (mean = 0 and standard deviation = 1)
# before applying machine learning techniques.
# The fitted scaler is kept in `scaler` instead of being thrown away, so the
# exact same transformation can be applied to new samples at prediction time.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split performed later, so test-set statistics leak into training;
# consider fitting it on the training rows only.
from sklearn import preprocessing
scaler = preprocessing.StandardScaler().fit(X)
X = scaler.transform(X)
X[0:5]

array([[-0.78830883,  0.        ,  1.80536954, -0.791233  ,  0.        ,
         0.02944169, -0.56421539,  0.68968588,  1.52733065],
       [-0.44453391,  0.        , -0.3947635 , -0.78965296,  0.        ,
        -0.51282631,  1.39669983, -2.37935883,  0.5133352 ],
       [-0.67023875,  0.        ,  2.18793459, -0.78113966,  0.        ,
        -0.01340084,  2.91349804,  1.14625771, -1.00765798],
       [ 0.28058375,  0.        ,  0.48728745, -0.77719174,  0.        ,
         1.38048549, -0.14098252, -0.64177082,  0.5133352 ],
       [-0.36789855,  0.        ,  0.12063666, -0.77422811,  0.        ,
        -1.69578241, -2.60176278,  0.36542703,  0.00633747]])

In [6]:
## Hold out 20% of the samples as a test set to estimate model accuracy
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=4,  # fixed seed so the split is reproducible
)
print('Train set: {} {}'.format(X_train.shape, y_train.shape))
print('Test set: {} {}'.format(X_test.shape, y_test.shape))

Train set: (1600, 9) (1600,)
Test set: (400, 9) (400,)


In [7]:
## The target is binary, so a Logistic Regression classifier is used.
## C is the inverse regularization strength (small C = strong regularization).
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
LR = LogisticRegression(solver='liblinear', C=0.01)
LR.fit(X_train, y_train)
LR

LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [8]:
# Learned weights of the fitted model, one per column of X (in feature order).
# NOTE(review): the zero weights line up with the feature columns that show as 0
# in the standardized preview -- presumably constant columns; confirm.
LR.coef_

array([[ 0.15830492,  0.        ,  0.03690305,  0.77966287,  0.        ,
        -0.01955488,  0.44335135,  0.07408011,  0.05635899]])

In [9]:
## Predict class labels for the held-out test set using the trained LR model
yhat = LR.predict(X_test)
# NOTE(review): this displays every prediction; a slice such as yhat[0:5]
# would keep the output short, like the earlier previews of X and y.
yhat

array([1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,

In [10]:
# Per-sample class probabilities for the test set: one row per sample and one
# column per class (ordered as in LR.classes_).
yhat_prob = LR.predict_proba(X_test)
yhat_prob

array([[0.47766703, 0.52233297],
       [0.78247702, 0.21752298],
       [0.44081017, 0.55918983],
       [0.76834367, 0.23165633],
       [0.81540078, 0.18459922],
       [0.45814685, 0.54185315],
       [0.39827836, 0.60172164],
       [0.27863134, 0.72136866],
       [0.89814321, 0.10185679],
       [0.61161381, 0.38838619],
       [0.74505992, 0.25494008],
       [0.62753913, 0.37246087],
       [0.88749762, 0.11250238],
       [0.82621609, 0.17378391],
       [0.72557427, 0.27442573],
       [0.82591643, 0.17408357],
       [0.65140111, 0.34859889],
       [0.64633487, 0.35366513],
       [0.71378742, 0.28621258],
       [0.73960416, 0.26039584],
       [0.59583515, 0.40416485],
       [0.2869435 , 0.7130565 ],
       [0.80938731, 0.19061269],
       [0.22391735, 0.77608265],
       [0.75772979, 0.24227021],
       [0.62244757, 0.37755243],
       [0.67710582, 0.32289418],
       [0.67701687, 0.32298313],
       [0.76948842, 0.23051158],
       [0.11413944, 0.88586056],
       [0.

In [11]:
# Jaccard index between the true and predicted test labels; with the default
# arguments it is computed for the positive class (label 1).
from sklearn.metrics import jaccard_score
jaccard_score(y_test, yhat)

0.38461538461538464

In [12]:
from sklearn.metrics import classification_report, confusion_matrix
import itertools
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print and plot a confusion matrix on the current pyplot figure.

    Parameters
    ----------
    cm : array-like of shape (n_classes, n_classes)
        Confusion matrix to display. Rows are drawn along the 'True label'
        axis and columns along the 'Predicted label' axis.
    classes : sequence
        Tick labels, in the same order as the rows/columns of `cm`.
    normalize : bool, default False
        If True, every row is divided by its row total before display.
        Rows whose total is zero are shown as zeros instead of NaN.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap used for the heat map.

    Returns
    -------
    None. The (possibly normalized) matrix is printed and drawn; only the
    `cm` argument is used -- no notebook-level variables are read.
    """
    cm = np.asarray(cm)

    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Skip the division for empty rows so they stay 0 rather than becoming NaN.
        cm = np.divide(cm, row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Integer counts are printed as integers; anything else (normalized or
    # float input) uses two decimals, since the 'd' format rejects floats.
    fmt = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'
    # Cells darker than half the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
# Show floats with two decimals, then build the confusion matrix with the
# positive class (1) listed first
np.set_printoptions(precision=2)
cnf_matrix = confusion_matrix(y_test, yhat, labels=[1, 0])

# Draw the raw-count (non-normalized) confusion matrix on a new figure
plt.figure()
plot_confusion_matrix(
    cnf_matrix,
    classes=['MaintFlag=1', 'MaintFlag=0'],
    normalize=False,
    title='Confusion matrix',
)

In [14]:
# Per-class precision, recall and F1 on the test set, plus overall accuracy and averages
print (classification_report(y_test, yhat))


              precision    recall  f1-score   support

           0       0.82      0.85      0.84       286
           1       0.59      0.53      0.56       114

    accuracy                           0.76       400
   macro avg       0.70      0.69      0.70       400
weighted avg       0.75      0.76      0.76       400



In [15]:

# Log loss (cross-entropy) of the predicted probabilities against the true
# test labels; lower values mean better-calibrated predictions.
from sklearn.metrics import log_loss
log_loss(y_test, yhat_prob)


0.5206027539762208

In [16]:
import pickle

# Serialize the trained model to a file called model_logistic_regression.pkl.
# The `with` block guarantees the file is flushed and closed even if dumping fails
# (a bare open(...) passed to pickle.dump leaves the handle open).
# NOTE(review): only the classifier is saved; the standardization applied to X
# before training is not, so new inputs must be scaled identically before predicting.
with open("model_logistic_regression.pkl", "wb") as model_file:
    pickle.dump(LR, model_file)