**Importing Libraries & Loading Dataset**

In [3]:
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

In [4]:
#Load data from CSV

url = 'https://raw.githubusercontent.com/ihasanreza/data_science/master/Data_for_UCI_named.csv'
df = pd.read_csv(url)

**Data Pre-Processing**

In [5]:
df.head(2)

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable


In [6]:
df.shape
print('-'*28)
print(df.columns.to_list())

----------------------------
['tau1', 'tau2', 'tau3', 'tau4', 'p1', 'p2', 'p3', 'p4', 'g1', 'g2', 'g3', 'g4', 'stab', 'stabf']


In [7]:
def ret_int(str_label):
  if str_label == 'unstable':
    return 0
  else:
    return 1


In [8]:
#Add new columns (stabf_integer)

df['stabf_int'] = df['stabf'].apply(lambda str_lab: ret_int(str_lab))

In [9]:
df.head(3)

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf,stabf_int
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable,0
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable,1
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable,0


In [10]:
data_cols = ['tau1',	'tau2',	'tau3',	'tau4',	'p1',	'p2',	'p3',	'p4',	'g1',	'g2',	'g3',	'g4',	'stab']
lab_col = ['stabf_int']

df_X = df[data_cols]
df_Y = df[lab_col]

In [11]:
df_X.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,0.04986


In [12]:
df_Y.head()

Unnamed: 0,stabf_int
0,0
1,1
2,0
3,0
4,0


In [13]:
X = df_X.to_numpy(copy = True)
Y = df_Y.to_numpy(copy = True)

In [14]:
print(type(df_X))
print(type(X))

<class 'pandas.core.frame.DataFrame'>
<class 'numpy.ndarray'>


In [15]:
print(X.shape)
print(Y.shape)

(10000, 13)
(10000, 1)


In [16]:
print(X[0:2, :])

[[ 2.95906002e+00  3.07988520e+00  8.38102539e+00  9.78075443e+00
   3.76308477e+00 -7.82603631e-01 -1.25739483e+00 -1.72308631e+00
   6.50456461e-01  8.59578106e-01  8.87444921e-01  9.58033988e-01
   5.53474892e-02]
 [ 9.30409723e+00  4.90252411e+00  3.04754073e+00  1.36935736e+00
   5.06781210e+00 -1.94005843e+00 -1.87274169e+00 -1.25501199e+00
   4.13440568e-01  8.62414076e-01  5.62139051e-01  7.81759911e-01
  -5.95746433e-03]]


In [17]:
df.head(2)

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf,stabf_int
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable,0
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable,1


In [18]:
print(X[0:2, :])

[[ 2.95906002e+00  3.07988520e+00  8.38102539e+00  9.78075443e+00
   3.76308477e+00 -7.82603631e-01 -1.25739483e+00 -1.72308631e+00
   6.50456461e-01  8.59578106e-01  8.87444921e-01  9.58033988e-01
   5.53474892e-02]
 [ 9.30409723e+00  4.90252411e+00  3.04754073e+00  1.36935736e+00
   5.06781210e+00 -1.94005843e+00 -1.87274169e+00 -1.25501199e+00
   4.13440568e-01  8.62414076e-01  5.62139051e-01  7.81759911e-01
  -5.95746433e-03]]


In [19]:
print(Y[0:2, :])

[[0]
 [1]]


In [20]:
tot_rows, tot_cols = df_X.shape
print(tot_rows)
print(0.7 * tot_rows, 0.3 * tot_rows)

# X[0:7000, :] ----> Training X
# Y[0:7000, :] ----> Training Y

# X[7000:, :] ----> Testing X
# Y[7000:, :] ----> Testing Y

10000
7000.0 3000.0


In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3)

#RECOMMENDATION
#DO IT USING INDICES SLICING

In [22]:
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

(7000, 13) (3000, 13)
(7000, 1) (3000, 1)


In [23]:
X_train[0:2, :]
# X_test[0:2, :]

array([[ 5.83794467,  8.43695176,  1.43228098,  6.95328558,  3.04763533,
        -1.29628743, -0.93971628, -0.81163163,  0.22000689,  0.72385184,
         0.70076655,  0.93635346,  0.04019929],
       [ 2.41418562,  7.21915187,  3.970973  ,  8.24679011,  2.91574022,
        -0.76355824, -1.09754498, -1.054637  ,  0.28358951,  0.9056858 ,
         0.78448079,  0.80167097,  0.06609188]])

**Modeling**

**Naive Bayes Classifier**

In [24]:
# Applying Naive Bayes Model on our Train Set (X_train, y_train)

clf_nb = GaussianNB()
clf_nb.fit(X_train, y_train.ravel())


GaussianNB(priors=None, var_smoothing=1e-09)

In [25]:
print(clf_nb.class_count_)

[4469. 2531.]


In [26]:
#y_train.shape
#y_train.ravel().shape
# dir(clf_nb)
# print(clf_nb.class_count_)

predictions = clf_nb.predict(X_test)

In [27]:
# X_test[0,:]
print(type(predictions), predictions.shape)

<class 'numpy.ndarray'> (3000,)


In [28]:
index = 1220
print(y_test[index], predictions[index])

[0] 0


In [29]:
accuracy_score(y_test, predictions)

0.973

In [157]:
y_test.shape

(3000, 1)

In [158]:
sum(y_test)

array([1082])

In [159]:
1082/3000

0.3606666666666667

In [160]:
# Prediction on Single Datapoint

index = 0
x_single = X_test[index,:]
print(x_single)
print(y_test[index,:])

[ 7.07877833  0.61931187  9.90169994  7.2468625   2.36487237 -0.53314974
 -1.17383901 -0.65788362  0.5463623   0.45054152  0.65482283  0.77218521
  0.03965098]
[0]


In [161]:
x_single.reshape(-1,1).shape

(13, 1)

In [162]:
prediction_xsingle = clf_nb.predict(x_single.reshape(1,-1))
print(X_test[0,:])

[ 7.07877833  0.61931187  9.90169994  7.2468625   2.36487237 -0.53314974
 -1.17383901 -0.65788362  0.5463623   0.45054152  0.65482283  0.77218521
  0.03965098]


In [163]:
print(prediction_xsingle, y_test[index])

[0] [0]


In [164]:
tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
print((tn, fp, fn, tp))

(1877, 41, 33, 1049)


In [165]:
precision_recall_fscore_support(y_test, predictions, pos_label=1, average='binary')

(0.9623853211009175, 0.9695009242144177, 0.9659300184162063, None)

**Logistic Regression Classifier**

In [166]:
clf_lr = LogisticRegression(max_iter=500)

In [167]:
clf_lr.fit(X_train, y_train.ravel())

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [168]:
preditions = clf_lr.predict(X_test)

In [169]:
accuracy_score(y_test, predictions)

0.9753333333333334

In [170]:
tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
print((tn, fp, fn, tp))

(1877, 41, 33, 1049)


In [171]:
precision_recall_fscore_support(y_test, predictions, pos_label=1, average='binary')

(0.9623853211009175, 0.9695009242144177, 0.9659300184162063, None)

**XGBoost**

In [172]:
import xgboost as xgb

In [173]:
clf_xgboost = xgb.XGBClassifier(learning_rate=0.01)

In [174]:
clf_xgboost.fit(X_train, y_train.ravel())

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.01, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [175]:
predictions = clf_xgboost.predict(X_test)

In [176]:
accuracy_score(y_test, predictions)

1.0

In [177]:
tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
print((tn, fp, fn, tp))

(1918, 0, 0, 1082)


In [178]:
precision_recall_fscore_support(y_test, predictions, pos_label=1, average='binary')

(1.0, 1.0, 1.0, None)