### import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn import svm
from sklearn import metrics
import matplotlib.pyplot as plt
print('Done Importing')

Done Importing


#### Import the dataset

In [2]:
df=pd.read_csv('DSL-StrongPasswordData.csv')
df.head()

Unnamed: 0,subject,sessionIndex,rep,H.period,DD.period.t,UD.period.t,H.t,DD.t.i,UD.t.i,H.i,...,H.a,DD.a.n,UD.a.n,H.n,DD.n.l,UD.n.l,H.l,DD.l.Return,UD.l.Return,H.Return
0,s002,1,1,0.1491,0.3979,0.2488,0.1069,0.1674,0.0605,0.1169,...,0.1349,0.1484,0.0135,0.0932,0.3515,0.2583,0.1338,0.3509,0.2171,0.0742
1,s002,1,2,0.1111,0.3451,0.234,0.0694,0.1283,0.0589,0.0908,...,0.1412,0.2558,0.1146,0.1146,0.2642,0.1496,0.0839,0.2756,0.1917,0.0747
2,s002,1,3,0.1328,0.2072,0.0744,0.0731,0.1291,0.056,0.0821,...,0.1621,0.2332,0.0711,0.1172,0.2705,0.1533,0.1085,0.2847,0.1762,0.0945
3,s002,1,4,0.1291,0.2515,0.1224,0.1059,0.2495,0.1436,0.104,...,0.1457,0.1629,0.0172,0.0866,0.2341,0.1475,0.0845,0.3232,0.2387,0.0813
4,s002,1,5,0.1249,0.2317,0.1068,0.0895,0.1676,0.0781,0.0903,...,0.1312,0.1582,0.027,0.0884,0.2517,0.1633,0.0903,0.2517,0.1614,0.0818


In [3]:
#check the shape of the data
df.shape

(20400, 34)

In [4]:
#check for null values
df.isnull().sum()

subject            0
sessionIndex       0
rep                0
H.period           0
DD.period.t        0
UD.period.t        0
H.t                0
DD.t.i             0
UD.t.i             0
H.i                0
DD.i.e             0
UD.i.e             0
H.e                0
DD.e.five          0
UD.e.five          0
H.five             0
DD.five.Shift.r    0
UD.five.Shift.r    0
H.Shift.r          0
DD.Shift.r.o       0
UD.Shift.r.o       0
H.o                0
DD.o.a             0
UD.o.a             0
H.a                0
DD.a.n             0
UD.a.n             0
H.n                0
DD.n.l             0
UD.n.l             0
H.l                0
DD.l.Return        0
UD.l.Return        0
H.Return           0
dtype: int64

In [5]:
#check for duplicate rows
np.sum(df.duplicated())

0

In [6]:
#check the data types
df.dtypes

subject             object
sessionIndex         int64
rep                  int64
H.period           float64
DD.period.t        float64
UD.period.t        float64
H.t                float64
DD.t.i             float64
UD.t.i             float64
H.i                float64
DD.i.e             float64
UD.i.e             float64
H.e                float64
DD.e.five          float64
UD.e.five          float64
H.five             float64
DD.five.Shift.r    float64
UD.five.Shift.r    float64
H.Shift.r          float64
DD.Shift.r.o       float64
UD.Shift.r.o       float64
H.o                float64
DD.o.a             float64
UD.o.a             float64
H.a                float64
DD.a.n             float64
UD.a.n             float64
H.n                float64
DD.n.l             float64
UD.n.l             float64
H.l                float64
DD.l.Return        float64
UD.l.Return        float64
H.Return           float64
dtype: object

In [7]:
df.columns

Index(['subject', 'sessionIndex', 'rep', 'H.period', 'DD.period.t',
       'UD.period.t', 'H.t', 'DD.t.i', 'UD.t.i', 'H.i', 'DD.i.e', 'UD.i.e',
       'H.e', 'DD.e.five', 'UD.e.five', 'H.five', 'DD.five.Shift.r',
       'UD.five.Shift.r', 'H.Shift.r', 'DD.Shift.r.o', 'UD.Shift.r.o', 'H.o',
       'DD.o.a', 'UD.o.a', 'H.a', 'DD.a.n', 'UD.a.n', 'H.n', 'DD.n.l',
       'UD.n.l', 'H.l', 'DD.l.Return', 'UD.l.Return', 'H.Return'],
      dtype='object')

In [8]:
x=df[['sessionIndex', 'rep', 'H.period', 'DD.period.t',
       'UD.period.t', 'H.t', 'DD.t.i', 'UD.t.i', 'H.i', 'DD.i.e', 'UD.i.e',
       'H.e', 'DD.e.five', 'UD.e.five', 'H.five', 'DD.five.Shift.r',
       'UD.five.Shift.r', 'H.Shift.r', 'DD.Shift.r.o', 'UD.Shift.r.o', 'H.o',
       'DD.o.a', 'UD.o.a', 'H.a', 'DD.a.n', 'UD.a.n', 'H.n', 'DD.n.l',
       'UD.n.l', 'H.l', 'DD.l.Return', 'UD.l.Return', 'H.Return']].values
y=df[['subject']].values
y=y.ravel()

In [17]:
#scale your data
x=preprocessing.StandardScaler().fit(x).transform(x)

### Train split your dataset

In [18]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.20,random_state=4)

In [19]:
print('x_train',x_train.shape)
print('y_train',y_train.shape)

x_train (16320, 33)
y_train (16320,)


### Model your data

In [20]:
model1=DecisionTreeClassifier(criterion='entropy')
model2=KNeighborsClassifier(n_neighbors =5)
model3=RandomForestClassifier()
model4=GradientBoostingClassifier()
model5=svm.SVC(kernel='rbf')
model6=MLPClassifier()

In [21]:
#find the best k
Ks = 10
mean_acc = np.zeros((Ks-1))
std_acc = np.zeros((Ks-1))
ConfustionMx = [];
for n in range(1,Ks):
    
    #Train Model and Predict  
    neigh = KNeighborsClassifier(n_neighbors = n).fit(x_train,y_train)
    yhat=neigh.predict(x_test)
    mean_acc[n-1] = metrics.accuracy_score(y_test, yhat)

    
    std_acc[n-1]=np.std(yhat==y_test)/np.sqrt(yhat.shape[0])

mean_acc

array([0.86151961, 0.83382353, 0.85661765, 0.86151961, 0.86568627,
       0.86372549, 0.86446078, 0.86151961, 0.86495098])

In [22]:
print( "The best accuracy was with", mean_acc.max(), "with k=", mean_acc.argmax()+1)

The best accuracy was with 0.865686274509804 with k= 5


In [23]:
#Grid search for best parameters

In [24]:
model1.fit(x_train,y_train)
model2.fit(x_train,y_train)
model3.fit(x_train,y_train)
model4.fit(x_train,y_train)
model5.fit(x_train,y_train)
model6.fit(x_train,y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [25]:
yhat1=model1.predict(x_test)
yhat2=model2.predict(x_test)
yhat3=model3.predict(x_test)
yhat4=model4.predict(x_test)
yhat5=model5.predict(x_test)
yhat6=model6.predict(x_test)

In [26]:
#Evaluation Decision Tree
metrics.accuracy_score(y_test,yhat1)

0.7526960784313725

In [27]:
#Evaluation K Nearest
metrics.accuracy_score(y_test,yhat2)

0.865686274509804

In [28]:
#Evaluation  Random Forest
metrics.accuracy_score(y_test,yhat3)

0.8757352941176471

In [29]:
#Evaluation Gradient Boosting
metrics.accuracy_score(y_test,yhat4)

0.9313725490196079

In [30]:
#Evaluation SVM
metrics.accuracy_score(y_test,yhat5)

0.9053921568627451

In [31]:
#Evaluation MLP Classifier
metrics.accuracy_score(y_test,yhat6)

0.9262254901960785

### Classification report

In [32]:
print(metrics.classification_report(y_test,yhat4))

             precision    recall  f1-score   support

       s002       0.82      0.89      0.86        83
       s003       0.93      0.93      0.93        88
       s004       0.94      0.93      0.93        82
       s005       0.92      0.92      0.92        91
       s007       0.89      0.93      0.91        82
       s008       0.88      0.85      0.86        67
       s010       0.99      0.97      0.98        69
       s011       0.95      0.92      0.94        79
       s012       0.94      0.82      0.88        80
       s013       0.96      0.94      0.95        82
       s015       0.91      0.89      0.90        76
       s016       0.94      0.96      0.95        89
       s017       0.98      1.00      0.99        84
       s018       0.96      0.86      0.91        87
       s019       0.96      0.98      0.97        82
       s020       0.87      0.88      0.88        76
       s021       0.88      1.00      0.94        82
       s022       0.95      0.99      0.97   

In [33]:
print(metrics.classification_report(y_test,yhat6))

             precision    recall  f1-score   support

       s002       0.92      0.86      0.89        83
       s003       0.95      0.91      0.93        88
       s004       0.95      0.94      0.94        82
       s005       0.97      0.95      0.96        91
       s007       0.92      0.89      0.91        82
       s008       0.87      0.87      0.87        67
       s010       0.99      0.96      0.97        69
       s011       0.90      0.95      0.93        79
       s012       0.89      0.89      0.89        80
       s013       0.94      0.91      0.93        82
       s015       0.91      0.89      0.90        76
       s016       0.91      0.97      0.94        89
       s017       0.97      1.00      0.98        84
       s018       0.97      0.84      0.90        87
       s019       0.97      0.95      0.96        82
       s020       0.86      0.89      0.88        76
       s021       0.87      0.91      0.89        82
       s022       0.97      1.00      0.98   

# Confusion Matrix