#### Importing packages

In [336]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import plotly as py
import plotly.express as px
import missingno as msno
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import accuracy_score,recall_score, precision_score
# to avoid warnings
import warnings
warnings.filterwarnings('ignore')
warnings.warn("this will not show")

#### Functions

In [337]:
# Function to remove missing values above a given threshold
def missing_remove(dataframe, threshold):
    columns = dataframe.columns[(dataframe.isna().sum()/dataframe.shape[1])>threshold].tolist()
    print(f"# Features deleted with more than {threshold} % missing values", len(columns))
    return dataframe.drop(columns, axis=1,inplace=True)

In [338]:
# Function to remove features with unique values
def unique_remove(dataframe):
    unique_features = []
    for col in dataframe.columns:
        if(dataframe[col].unique().size == 2):
            if(len(unique_features) == 0):
                unique_features =  dataframe[col]
            else:
                 unique_features = pd.concat([unique_features, dataframe[col]], axis=1)
            dataframe.drop(col, axis=1, inplace=True)

In [339]:
# Function to remove outlier
def outlier_detection_zcore(dataframe):
    data_mean, data_std  = dataframe.mean(), dataframe.std()
    data_z_scores = ((dataframe - data_mean) / data_std).abs()
    m = data_z_scores > 3
    dataframe = dataframe.mask(m,inplace=True)
    return dataframe

In [340]:
def evaluate2(model, X_train,X_test,y_train,y_test):
  from matplotlib.pyplot import figure
  figure(figsize=(8, 5), dpi=80)

  scaler = StandardScaler()
  scaler.fit(X_train)
  X_train_std = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
  X_test_std = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

  model.fit(X_train_std, y_train)
  y_pred = model.predict(X_test_std)
  cm=confusion_matrix(y_test, y_pred)
  sns.heatmap(cm, annot=True, fmt="d",cmap="YlOrRd",yticklabels = ['Act. Pass', 'Act. Fail'], xticklabels = ['Pred. Pass' , 'Pred. Fail'])
  #print(classification_report(y_test, y_pred))
  ax = plt.gca()
  plot = plot_roc_curve(model, X_test_std, y_test, ax=ax, alpha=0.8)
  print("Model Name:", model)
  print("Train Accuracy Score:", accuracy_score(y_train, model.predict(X_train_std)))
  print("Test Accuracy Score:", accuracy_score(y_test, y_pred))
  f1score = f1_score(y_test, y_pred, average='micro')
  mccscore=matthews_corrcoef(y_test, y_pred)
  recall = recall_score(y_test,y_pred)
  precision = precision_score(y_test,y_pred)
  specificity = cm[1,1]/(cm[1,1]+cm[0,1])
  sensitivity = cm[0,0]/(cm[0,0]+cm[1,0])
  FNR = cm[1,0]/(cm[1,0]+cm[0,0])
  FPR = cm[0,1]/(cm[0,1]+cm[1,1])
  specificity1 = 1 - FPR
  sensitivity1 = 1- FNR
  print("F1 Score:", f1score) 
  print("MC Score:" , mccscore)
  print("Sensitivity:" , sensitivity)
  print("Specificity:" , specificity)
  print("Recall:" , recall)
  print("Precision:" , precision)
  print("FNR:" , FNR)
  print(f"FPR: {FPR}\n" )
  #print("Sensitivity1:" , sensitivity1)
  #print("Specificity1:" , specificity1)


#### Importing and basic cleaning

In [341]:
# Importing data from internet in runtime
data_url="https://archive.ics.uci.edu/ml/machine-learning-databases/secom/secom.data"

In [342]:
# Assigning data to a pandas dataframe
secom_data = pd.read_csv(data_url,sep=' ',header=None)

In [343]:
# Renaming the columns 
secom_data.columns = ["Feature_"+str(column+1) for column in range(len(secom_data.columns))]
secom_data.head()

Unnamed: 0,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,Feature_9,Feature_10,...,Feature_581,Feature_582,Feature_583,Feature_584,Feature_585,Feature_586,Feature_587,Feature_588,Feature_589,Feature_590
0,3030.93,2564.0,2187.7333,1411.1265,1.3602,100.0,97.6133,0.1242,1.5005,0.0162,...,,,0.5005,0.0118,0.0035,2.363,,,,
1,3095.78,2465.14,2230.4222,1463.6606,0.8294,100.0,102.3433,0.1247,1.4966,-0.0005,...,0.006,208.2045,0.5019,0.0223,0.0055,4.4447,0.0096,0.0201,0.006,208.2045
2,2932.61,2559.94,2186.4111,1698.0172,1.5102,100.0,95.4878,0.1241,1.4436,0.0041,...,0.0148,82.8602,0.4958,0.0157,0.0039,3.1745,0.0584,0.0484,0.0148,82.8602
3,2988.72,2479.9,2199.0333,909.7926,1.3204,100.0,104.2367,0.1217,1.4882,-0.0124,...,0.0044,73.8432,0.499,0.0103,0.0025,2.0544,0.0202,0.0149,0.0044,73.8432
4,3032.24,2502.87,2233.3667,1326.52,1.5334,100.0,100.3967,0.1235,1.5031,-0.0031,...,,,0.48,0.4766,0.1045,99.3032,0.0202,0.0149,0.0044,73.8432


In [344]:
# Importing target data from internet in runtime
label_url="https://archive.ics.uci.edu/ml/machine-learning-databases/secom/secom_labels.data"

In [345]:
# Assigning target data to a pandas dataframe
secom_labels = pd.read_csv(label_url, sep = " ",header=None)

In [346]:
# Renaming the columns 
secom_labels.columns = ["Classification","Timestamp"]

In [347]:
# Changing datatype for consitency
secom_labels['Timestamp'] = pd.to_datetime(secom_labels['Timestamp'],errors='raise')

In [348]:
# Merging the data
data= pd.concat([secom_labels,secom_data],axis=1)

In [349]:
# Dropping the timestamp column as it provides no insights
data.drop(columns="Timestamp", errors='raise',inplace=True)

In [350]:
target = data[['Classification']]
data.drop(['Classification'], axis=1,inplace=True)

#### Spliting the data into training set and test set

In [351]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data,target, test_size=0.20, random_state=42, stratify=target)
# We use the startify parameter as the data is imbalanced on the basis of pass and fail and this will ensure the same 
#is retained when we split - Random state will help us create a reproducible data - Test size is the split ration

In [352]:
X_test_original = X_test

#### Re-labeling the Target values


In [353]:
# # Relabeling target variables 1 is pass and 0 is fail
# y_train = y_train.replace(to_replace=[-1, 1], value=[1, 0])
# y_test = y_test.replace(to_replace=[-1, 1], value=[1, 0])

In [354]:
# # Scaling the data
# scaler = StandardScaler()
# scaler.fit(X_train)
# X_train = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
# X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

# Decision : Removing columns with more that 50% missing values

In [355]:
X_train.shape

(1253, 590)

In [356]:
X_train_missing = X_train
missing_remove(X_train_missing, 0.5)
X_train_missing.shape

# Features deleted with more than 0.5 % missing values 32


(1253, 558)

## Removing Unique cols

In [357]:
# X_train_unique = X_train_missing
# unique_remove(X_train_unique)
# X_train_unique.shape

In [358]:
# From test set deleting columns that are removed in training set
X_test_missing = X_test[np.array(X_train_missing.columns)]
X_test_missing.shape

(314, 558)

# Decision : Replacing outliers with 3s boundary

##### Other option of removing outliers and replacing them has been explored in file "Base file - removing outliers and replacing with knn.ipynb"

In [359]:
def replace_outlier(val, mean, std):
    if val > mean + 3*std:
        return mean + 3*std 
    elif val < mean - 3*std:
        return mean - 3*std
    return val

In [360]:
# replace with 3s in X_train
for col in X_train_missing.columns:
    mean = X_train_missing[col].mean()
    std_dev = X_train_missing[col].std(axis=0)
    X_train_missing[col] = X_train_missing[col].map(lambda x: replace_outlier(x, mean, std_dev))

In [361]:
# replace with 3s in X_test
for col in X_test_missing.columns:
    mean = X_test_missing[col].mean()
    std_dev = X_test_missing[col].std(axis=0)
    X_test_missing[col] = X_test_missing[col].map(lambda x: replace_outlier(x, mean, std_dev))

# KNN Imputation

In [362]:
from sklearn.impute import KNNImputer
knn = KNNImputer()
knn.fit(X_train_missing)
X_train_knn=pd.DataFrame(knn.transform(X_train_missing), columns=X_train_missing.columns)
X_train_knn.isna().sum().sum()

0

In [363]:
from sklearn.impute import KNNImputer
knn = KNNImputer()
knn.fit(X_test_missing)
X_test_knn=pd.DataFrame(knn.transform(X_test_missing), columns=X_test_missing.columns)
X_test_knn.isna().sum().sum()

0

# Feature Selection

## Boruta - 15 features

In [364]:
pip install boruta

Note: you may need to restart the kernel to use updated packages.


In [370]:
#Boruta function

def BorutaFeatureSelection (X, y) :
    feature_names = np.array(X.columns)

    model = model = RandomForestClassifier(n_estimators=200, n_jobs=-1, max_depth=8,class_weight='balanced_subsample')
    model.fit(X, y)
    
    feature_selector = BorutaPy(model, random_state=42, verbose=2,max_iter=250,n_estimators=200)

    feature_selector.fit(X.to_numpy(),y)

    feature_selector.support_

    feature_selector.ranking_

    feature_ranks = list(zip(feature_names, 
                             feature_selector.ranking_, 
                             feature_selector.support_))

    for feat in feature_ranks:
        print('Feature: {:<30} Rank: {},  Keep: {}'.format(feat[0], feat[1], feat[2]))
        
    final_features = list()
    indexes = np.where(feature_selector.ranking_ <= 1)
    for x in np.nditer(indexes):
        final_features.append(feature_names[x])
    print(final_features)
    
    return pd.DataFrame(X.filter(final_features)) , final_features

In [371]:
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy
X_train_boruta , final_features = BorutaFeatureSelection(X_train_knn,y_train.values.ravel())

Iteration: 	1 / 250
Confirmed: 	0
Tentative: 	558
Rejected: 	0
Iteration: 	2 / 250
Confirmed: 	0
Tentative: 	558
Rejected: 	0
Iteration: 	3 / 250
Confirmed: 	0
Tentative: 	558
Rejected: 	0
Iteration: 	4 / 250
Confirmed: 	0
Tentative: 	558
Rejected: 	0
Iteration: 	5 / 250
Confirmed: 	0
Tentative: 	558
Rejected: 	0
Iteration: 	6 / 250
Confirmed: 	0
Tentative: 	558
Rejected: 	0
Iteration: 	7 / 250
Confirmed: 	0
Tentative: 	558
Rejected: 	0
Iteration: 	8 / 250
Confirmed: 	0
Tentative: 	25
Rejected: 	533
Iteration: 	9 / 250
Confirmed: 	3
Tentative: 	22
Rejected: 	533
Iteration: 	10 / 250
Confirmed: 	3
Tentative: 	22
Rejected: 	533
Iteration: 	11 / 250
Confirmed: 	3
Tentative: 	22
Rejected: 	533
Iteration: 	12 / 250
Confirmed: 	3
Tentative: 	16
Rejected: 	539
Iteration: 	13 / 250
Confirmed: 	3
Tentative: 	16
Rejected: 	539
Iteration: 	14 / 250
Confirmed: 	3
Tentative: 	16
Rejected: 	539
Iteration: 	15 / 250
Confirmed: 	3
Tentative: 	16
Rejected: 	539
Iteration: 	16 / 250
Confirmed: 	3
Tentat

Iteration: 	127 / 250
Confirmed: 	13
Tentative: 	5
Rejected: 	540
Iteration: 	128 / 250
Confirmed: 	13
Tentative: 	5
Rejected: 	540
Iteration: 	129 / 250
Confirmed: 	13
Tentative: 	5
Rejected: 	540
Iteration: 	130 / 250
Confirmed: 	13
Tentative: 	5
Rejected: 	540
Iteration: 	131 / 250
Confirmed: 	13
Tentative: 	5
Rejected: 	540
Iteration: 	132 / 250
Confirmed: 	13
Tentative: 	5
Rejected: 	540
Iteration: 	133 / 250
Confirmed: 	13
Tentative: 	5
Rejected: 	540
Iteration: 	134 / 250
Confirmed: 	13
Tentative: 	5
Rejected: 	540
Iteration: 	135 / 250
Confirmed: 	13
Tentative: 	5
Rejected: 	540
Iteration: 	136 / 250
Confirmed: 	13
Tentative: 	5
Rejected: 	540
Iteration: 	137 / 250
Confirmed: 	13
Tentative: 	5
Rejected: 	540
Iteration: 	138 / 250
Confirmed: 	13
Tentative: 	5
Rejected: 	540
Iteration: 	139 / 250
Confirmed: 	13
Tentative: 	5
Rejected: 	540
Iteration: 	140 / 250
Confirmed: 	13
Tentative: 	5
Rejected: 	540
Iteration: 	141 / 250
Confirmed: 	13
Tentative: 	5
Rejected: 	540
Iteration:

NameError: name 'tentative' is not defined

In [None]:
# From test set deleting columns that are removed in training set
X_test_boruta = X_test_knn[np.array(X_train_boruta.columns)]
X_test_boruta.shape

## Balancing using SMOTE

In [None]:
import numpy as np
from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.decomposition import PCA

In [None]:
from imblearn.combine import SMOTETomek 
# Sampling technique

def Sampling(X_train, y_train, sampler):

    #SMOTE
    if sampler == 'SMOTE':
        sampler = SMOTE(random_state=100)    
    

    #SMOTTEENN
    if sampler == 'SMOTEENN' :
        sampler = SMOTEENN(random_state=100)
        
    # SMOTETOMEK
    
    if sampler == 'SMOTETOMEK' :
        sampler = SMOTETomek(random_state=100)

    #ADASYN
    if sampler == 'ADASYN':
        sampler = ADASYN(random_state=100)
        
    #ROSE
    if sampler == 'ROSE':
        sampler = RandomOverSampler(random_state=100, shrinkage=1)
        

    X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)
    counter = Counter(y_resampled)
    print(counter)
    return X_resampled, y_resampled

## Rose

In [None]:
# Imbalance treatment
from collections import Counter
from sklearn.metrics import f1_score, matthews_corrcoef
X_rose, y_rose = Sampling(X_train_boruta, y_train,'ROSE')

In [None]:
evaluate2(RandomForestClassifier(n_jobs=-1, class_weight='balanced_subsample', max_depth=6, random_state=100),X_rose,X_test_boruta,y_rose,y_test)


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [None]:
X_rose

In [None]:
# Random forest
params = {'criterion' : ["gini","entropy","log_loss"],
          'max_depth':[1,2, 3,4, 5,6,7,8],
         'max_features':["sqrt","log2", None]}
          

grid_searcher = GridSearchCV(RandomForestClassifier(), params,scoring='roc_auc', cv = 5, verbose=1)
grid_searcher.fit(X_rose, y_rose)
#plot_result_smoted_fe_tuned(grid_searcher, "dtc")

In [None]:
grid_searcher.best_params_

In [None]:
grid_searcher.best_estimator_


In [None]:
evaluate2(grid_searcher.best_estimator_,X_rose,X_test_boruta,y_rose,y_test)


In [None]:
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split # for separating train and test data, scaling, model building, ...

# Evaluation using Startified K Fold cross validation
cv = StratifiedKFold(n_splits=5)
scores = cross_val_score(grid_searcher.best_estimator_, X_rose, y_rose, cv=cv, scoring = 'roc_auc')
print(scores.mean())
# fit
grid_searcher.best_estimator_.fit(X_rose, y_rose)    
# predict
y_pred_test = grid_searcher.best_estimator_.predict(X_test_boruta)
# View accuracy score
accuracyFinal = accuracy_score(y_test, y_pred_test)
print('Test Accuracy : ', accuracyFinal)

In [374]:
from boruta import BorutaPy
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=500)

trans = BorutaPy(clf, verbose=2,max_iter=250)
sel = trans.fit_transform(X_train_knn.values, y_train.values.ravel())

Iteration: 	1 / 250
Confirmed: 	0
Tentative: 	558
Rejected: 	0
Iteration: 	2 / 250
Confirmed: 	0
Tentative: 	558
Rejected: 	0
Iteration: 	3 / 250
Confirmed: 	0
Tentative: 	558
Rejected: 	0
Iteration: 	4 / 250
Confirmed: 	0
Tentative: 	558
Rejected: 	0
Iteration: 	5 / 250
Confirmed: 	0
Tentative: 	558
Rejected: 	0
Iteration: 	6 / 250
Confirmed: 	0
Tentative: 	558
Rejected: 	0
Iteration: 	7 / 250
Confirmed: 	0
Tentative: 	558
Rejected: 	0
Iteration: 	8 / 250
Confirmed: 	0
Tentative: 	14
Rejected: 	544
Iteration: 	9 / 250
Confirmed: 	3
Tentative: 	11
Rejected: 	544
Iteration: 	10 / 250
Confirmed: 	3
Tentative: 	11
Rejected: 	544
Iteration: 	11 / 250
Confirmed: 	3
Tentative: 	11
Rejected: 	544
Iteration: 	12 / 250
Confirmed: 	3
Tentative: 	11
Rejected: 	544
Iteration: 	13 / 250
Confirmed: 	3
Tentative: 	11
Rejected: 	544


KeyboardInterrupt: 

In [None]:
trans