Importing packages

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import plotly as py
import plotly.express as px
import missingno as msno
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

## Functions

In [2]:
# Function to remove missing values above a given threshold
def missing_remove(dataframe, threshold):
    columns = dataframe.columns[(dataframe.isna().sum()/dataframe.shape[1])>threshold].tolist()
    print(f"# Features deleted with more than {threshold} % missing values", len(columns))
    return dataframe.drop(columns, axis=1,inplace=True)

In [3]:
# Function to remove features with unique values
def unique_remove(dataframe):
    unique_features = []
    for col in dataframe.columns:
        if(dataframe[col].unique().size == 2):
            if(len(unique_features) == 0):
                unique_features =  dataframe[col]
            else:
                 unique_features = pd.concat([unique_features, dataframe[col]], axis=1)
            dataframe.drop(col, axis=1, inplace=True)

In [4]:
# Function to remove outlier
def outlier_detection_zcore(dataframe):
    data_mean, data_std  = dataframe.mean(), dataframe.std()
    data_z_scores = ((dataframe - data_mean) / data_std).abs()
    m = data_z_scores > 3
    dataframe = dataframe.mask(m,inplace=True)
    return dataframe

## Importing and basic cleaning

In [5]:
# Importing data from internet in runtime
data_url="https://archive.ics.uci.edu/ml/machine-learning-databases/secom/secom.data"

In [6]:
# Assigning data to a pandas dataframe
secom_data = pd.read_csv(data_url,sep=' ',header=None)

In [7]:
# Renaming the columns 
secom_data.columns = ["Feature_"+str(column+1) for column in range(len(secom_data.columns))]
secom_data.head()

Unnamed: 0,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,Feature_9,Feature_10,...,Feature_581,Feature_582,Feature_583,Feature_584,Feature_585,Feature_586,Feature_587,Feature_588,Feature_589,Feature_590
0,3030.93,2564.0,2187.7333,1411.1265,1.3602,100.0,97.6133,0.1242,1.5005,0.0162,...,,,0.5005,0.0118,0.0035,2.363,,,,
1,3095.78,2465.14,2230.4222,1463.6606,0.8294,100.0,102.3433,0.1247,1.4966,-0.0005,...,0.006,208.2045,0.5019,0.0223,0.0055,4.4447,0.0096,0.0201,0.006,208.2045
2,2932.61,2559.94,2186.4111,1698.0172,1.5102,100.0,95.4878,0.1241,1.4436,0.0041,...,0.0148,82.8602,0.4958,0.0157,0.0039,3.1745,0.0584,0.0484,0.0148,82.8602
3,2988.72,2479.9,2199.0333,909.7926,1.3204,100.0,104.2367,0.1217,1.4882,-0.0124,...,0.0044,73.8432,0.499,0.0103,0.0025,2.0544,0.0202,0.0149,0.0044,73.8432
4,3032.24,2502.87,2233.3667,1326.52,1.5334,100.0,100.3967,0.1235,1.5031,-0.0031,...,,,0.48,0.4766,0.1045,99.3032,0.0202,0.0149,0.0044,73.8432


In [8]:
# Importing target data from internet in runtime
label_url="https://archive.ics.uci.edu/ml/machine-learning-databases/secom/secom_labels.data"

In [9]:
# Assigning target data to a pandas dataframe
secom_labels = pd.read_csv(label_url, sep = " ",header=None)

In [10]:
# Renaming the columns 
secom_labels.columns = ["Classification","Timestamp"]

In [11]:
# Changing datatype for consitency
secom_labels['Timestamp'] = pd.to_datetime(secom_labels['Timestamp'],errors='raise')

In [12]:
# Merging the data
data= pd.concat([secom_labels,secom_data],axis=1)

In [13]:
# Dropping the timestamp column as it provides no insights
data.drop(columns="Timestamp", errors='raise',inplace=True)

In [14]:
target = data[['Classification']]
data.drop(['Classification'], axis=1,inplace=True)

## Spliting the data into training set and test set

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data,target, test_size=0.25, random_state=42, stratify=target)
# We use the startify parameter as the data is imbalanced on the basis of pass and fail and this will ensure the same 
#is retained when we split - Random state will help us create a reproducible data - Test size is the split ration

## Re-labeling the Target values


In [16]:
# Relabeling target variables 1 is pass and 0 is fail
y_train = y_train.replace(to_replace=[-1, 1], value=[1, 0])
y_test = y_test.replace(to_replace=[-1, 1], value=[1, 0])

In [17]:
# # Scaling the data
# scaler = StandardScaler()
# scaler.fit(X_train)
# X_train = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
# X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

## Removing columns with more that 50% missing values

In [18]:
X_train.shape

(1175, 590)

In [19]:
missing_remove(X_train, 0.5)
X_train.shape

# Features deleted with more than 0.5 % missing values 32


(1175, 558)

In [20]:
## Removing Columns with non unique values

In [21]:
unique_remove(X_train)
X_train.shape

(1175, 442)

In [22]:
# From test set deleting columns that are removed in training set
X_test = X_test[np.array(X_train.columns)]
X_test.shape

(392, 442)

# MICE

In [23]:
# impute na values using MICE
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
MICE_imputer = IterativeImputer(random_state=100)
X_train_MICE = X_train.copy(deep=True)
X_train_MICE.iloc[:,:] = pd.DataFrame(MICE_imputer.fit_transform(X_train))
X_train = X_train_MICE



In [24]:
X_train

Unnamed: 0,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_7,Feature_8,Feature_9,Feature_10,Feature_11,...,Feature_577,Feature_578,Feature_583,Feature_584,Feature_585,Feature_586,Feature_587,Feature_588,Feature_589,Feature_590
160,3057.03,2468.41,2184.8778,960.8486,1.0160,102.5333,0.1214,1.4549,-0.0125,-0.0196,...,1.0653,15.0191,0.5067,0.0126,0.0034,2.4798,0.0195,0.0121,0.0048,62.1248
814,3012.09,2551.08,2216.7333,1748.0885,1.1127,97.5822,0.1242,1.5136,-0.0090,0.0129,...,2.4530,12.7241,0.4994,0.0073,0.0020,1.4634,0.0111,0.0069,0.0021,62.3602
124,3032.81,2534.74,2239.4223,1997.3782,1.5397,98.3356,0.1229,1.4974,-0.0046,-0.0002,...,1.5902,18.6118,0.4950,0.0149,0.0041,3.0137,0.0241,0.0086,0.0027,35.5550
501,2987.72,2550.52,2180.7000,1159.3838,1.0177,98.9367,0.1222,1.4207,0.0016,-0.0056,...,1.5141,11.3379,0.5042,0.0099,0.0030,1.9562,0.0048,0.0226,0.0079,474.0812
1362,3001.90,2465.51,2223.0444,1194.5986,1.2016,112.5811,0.1229,1.4201,-0.0182,0.0055,...,83.9190,67.3679,0.5026,0.0085,0.0026,1.6862,0.0182,0.0077,0.0025,42.5048
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1250,3185.69,2537.68,2173.2778,1116.2950,0.8525,103.8200,0.1237,1.4653,-0.0204,0.0038,...,1.0597,13.1807,0.5055,0.0104,0.0028,2.0588,0.0230,0.0095,0.0028,41.2178
365,2988.92,2460.91,2178.0778,941.9524,0.8039,104.0167,0.1229,1.5829,-0.0278,-0.0324,...,1.2816,10.4728,0.4976,0.0148,0.0032,2.9645,0.0291,0.0135,0.0045,46.4165
1420,2975.74,2517.35,2162.5556,1041.0369,1.4305,100.4111,0.1238,1.4968,-0.0201,-0.0060,...,1.0949,16.9589,0.4994,0.0115,0.0033,2.3077,0.0299,0.0071,0.0020,23.6431
113,2928.16,2523.21,2210.6111,1184.6481,1.2577,102.9356,0.1201,1.4453,-0.0126,0.0152,...,1.8020,7.1763,0.5016,0.0160,0.0035,3.1882,0.0049,0.0144,0.0047,293.2614


In [25]:
clfscore = RandomForestClassifier(n_estimators=100, max_depth=2)
clfscore.fit(X_train, y_train.values.ravel())

print ('RF accuracy: TRAINING', clfscore.score(X_train,y_train.values.ravel()))

RF accuracy: TRAINING 0.9336170212765957


# BORUTA

In [26]:
#Boruta function

def BorutaFeatureSelection (X, y) :
    feature_names = np.array(X.columns)

    model = model = RandomForestClassifier(n_jobs=-1, class_weight='balanced_subsample', max_depth=5, random_state=100)
    model.fit(X, y)
    
    feature_selector = BorutaPy(model, n_estimators='auto', verbose=2, random_state=100, max_iter=100)

    feature_selector.fit(X.to_numpy(),y)

    feature_selector.support_

    feature_selector.ranking_

    feature_ranks = list(zip(feature_names, 
                             feature_selector.ranking_, 
                             feature_selector.support_))

    for feat in feature_ranks:
        print('Feature: {:<30} Rank: {},  Keep: {}'.format(feat[0], feat[1], feat[2]))
        
    final_features = list()
    indexes = np.where(feature_selector.ranking_ <= 1)
    for x in np.nditer(indexes):
        final_features.append(feature_names[x])
    print(final_features)
    
    return pd.DataFrame(X.filter(final_features)) , final_features

In [27]:
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy
X_train , final_features = BorutaFeatureSelection(X_train,y_train.values.ravel())

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	442
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	442
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	442
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	442
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	442
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	442
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	442
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	0
Tentative: 	20
Rejected: 	422
Iteration: 	9 / 100
Confirmed: 	3
Tentative: 	17
Rejected: 	422
Iteration: 	10 / 100
Confirmed: 	3
Tentative: 	17
Rejected: 	422
Iteration: 	11 / 100
Confirmed: 	3
Tentative: 	17
Rejected: 	422
Iteration: 	12 / 100
Confirmed: 	5
Tentative: 	13
Rejected: 	424
Iteration: 	13 / 100
Confirmed: 	5
Tentative: 	13
Rejected: 	424
Iteration: 	14 / 100
Confirmed: 	5
Tentative: 	13
Rejected: 	424
Iteration: 	15 / 100
Confirmed: 	5
Tentative: 	13
Rejected: 	424
Iteration: 	16 / 100
Confirmed: 	6
Tentat

In [28]:
X_train

Unnamed: 0,Feature_22,Feature_32,Feature_34,Feature_60,Feature_65,Feature_66,Feature_104,Feature_122,Feature_131,Feature_214,Feature_320,Feature_352,Feature_478,Feature_511,Feature_578
160,-5645.25,3.3907,8.3754,14.645500,25.354500,34.515500,-0.0056,15.74,0.8300,0.1156,10.0971,0.0579,6.002000,97.7778,15.0191
814,-6616.00,4.6484,8.6706,5.572236,15.453244,25.628517,-0.0106,15.86,0.7402,0.0309,8.6690,0.0137,6.178896,50.8235,12.7241
124,-5653.75,3.3321,8.7947,29.191800,10.808200,17.468400,-0.0050,15.74,0.7429,0.0761,10.8187,0.0355,14.624700,88.6650,18.6118
501,-5697.50,3.3272,8.5459,0.161800,19.838200,25.126200,-0.0116,15.76,0.6645,0.0332,8.5869,0.0146,9.609100,39.7838,11.3379
1362,-5130.75,3.3744,8.7786,2.420900,17.579100,20.979000,-0.0122,15.80,0.7430,0.0657,8.6381,0.0275,4.298300,25.4882,67.3679
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1250,-5248.75,3.3631,8.7556,-3.090900,23.090900,30.517400,-0.0053,15.71,0.7304,0.0294,7.6805,0.0136,7.245400,81.3239,13.1807
365,-5415.00,3.4226,9.3960,4.099100,15.900900,18.324800,-0.0132,15.81,0.6469,0.0554,10.8728,0.0256,3.927300,27.9509,10.4728
1420,-6504.75,4.6809,8.1605,-2.099100,22.099100,28.287700,-0.0112,15.70,0.7048,0.0729,8.5762,0.0343,4.026200,40.8889,16.9589
113,-5401.75,3.4199,8.7495,16.790900,23.209100,29.535800,-0.0055,15.70,0.6925,0.0763,6.2614,0.0340,3.577400,78.0045,7.1763


In [29]:
#Test Data

In [30]:
X_test_MICE = X_test.copy(deep=True)
X_test_MICE.iloc[:,:] = pd.DataFrame(MICE_imputer.fit_transform(X_test))
X_test = X_test_MICE

LinAlgError: SVD did not converge

In [None]:
X_test = X_test.filter(final_features)

In [None]:
X_test

In [None]:
clfscore = RandomForestClassifier(n_estimators=100, max_depth=2)
clfscore.fit(X_train, y_train.values.ravel())

print ('RF accuracy: TRAINING', clfscore.score(X_train,y_train))
print ('RF accuracy: TESTING', clfscore.score(X_test,y_test))

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report
from sklearn import metrics
from scikitplot.metrics import plot_roc_curve
from sklearn.metrics import accuracy_score

model = RandomForestClassifier(n_estimators=100, criterion='entropy', max_depth=4)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt="d", yticklabels = ['Pred. Pass' , 'Pred. Fail'] , 
            xticklabels = ['Act. Pass', 'Act. Fail'])
print(classification_report(y_test, y_pred))
print("Test Accuracy Score", accuracy_score(y_test, y_pred))
metrics.RocCurveDisplay.from_estimator(model, X_test, y_test)
plt.show()

In [None]:
#BALANCING

In [None]:
import numpy as np
from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.decomposition import PCA

In [None]:
y_train.loc[0]

In [None]:
g.tail()

In [None]:
pca = PCA(n_components=2)
pca.fit(X_train)
g = pca.transform(X_train)
g = pd.DataFrame(g, columns=['a','b'])
g['Classification'] = y_train
sns.scatterplot(data = g , x='a' , y='b', hue = g['Classification'],legend= True )
print(pca.explained_variance_)
print('\n Total Variance Explained:', round(sum(list(pca.explained_variance_ratio_))*100, 2))

In [None]:
# Sampling technique

def Sampling(X_train, y_train, sampler):

    #SMOTE
    if sampler == 'SMOTE':
        sampler = SMOTE(random_state=100)    
    
    #ROSE
    if sampler == 'ROSE':
        sampler = RandomOverSampler(random_state=100, shrinkage=1)

    #ADASYN
    if sampler == 'ADASYN':
        sampler = ADASYN(random_state=100)
    

    #SMOTTEENN
    if sampler == 'SMOTEENN' :
        sampler = SMOTEENN(random_state=100)
        
        
    #Random under Sampling
    if sampler == "randomunder":
        sampler = RandomUnderSampler(random_state=100)

    X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)
    counter = Counter(y_resampled)
    print(counter)
    return X_resampled, y_resampled

In [None]:
# Imbalance treatment
from collections import Counter
X_resampled, y_resampled = Sampling(X_train, y_train,'SMOTE')

In [None]:
pca = PCA(n_components=2)
pca.fit(X_resampled)
g = pca.transform(X_resampled)
g = pd.DataFrame(g, columns=['a','b'])
g['Classification'] = y_resampled
sns.scatterplot(data = g , x='a' , y='b', hue= g['Classification'], legend= True )
print(pca.explained_variance_)
print('\n Total Variance Explained:', round(sum(list(pca.explained_variance_ratio_))*100, 2))

In [None]:
# Imbalance treatment
X_resampled1, y_resampled1 = Sampling(X_train, y_train,'ADASYN')

In [None]:
pca = PCA(n_components=2)
pca.fit(X_resampled1)
g = pca.transform(X_resampled1)
g = pd.DataFrame(g, columns=['a','b'])
g['Classification'] = y_resampled1
sns.scatterplot(data = g , x='a' , y='b', hue= g['Classification'], legend= True )
print(pca.explained_variance_)
print('\n Total Variance Explained:', round(sum(list(pca.explained_variance_ratio_))*100, 2))

In [None]:
# Imbalance treatment
X_resampled2, y_resampled2 = Sampling(X_train, y_train,'ROSE')

In [None]:
pca = PCA(n_components=2)
pca.fit(X_resampled2)
g = pca.transform(X_resampled2)
g = pd.DataFrame(g, columns=['a','b'])
g['Classification'] = y_resampled2
sns.scatterplot(data = g , x='a' , y='b', hue= g['Classification'], legend= True )
print(pca.explained_variance_)
print('\n Total Variance Explained:', round(sum(list(pca.explained_variance_ratio_))*100, 2))

In [None]:
# Imbalance treatment
from imblearn.combine import SMOTEENN
X_resampled3, y_resampled3 = Sampling(X_train, y_train,'SMOTEENN')

In [None]:
pca = PCA(n_components=2)
pca.fit(X_resampled3)
g = pca.transform(X_resampled3)
g = pd.DataFrame(g, columns=['a','b'])
g['Classification'] = y_resampled3
sns.scatterplot(data = g , x='a' , y='b', hue= g['Classification'], legend= True )
print(pca.explained_variance_)
print('\n Total Variance Explained:', round(sum(list(pca.explained_variance_ratio_))*100, 2))

In [None]:
# Imbalance treatment
from imblearn.under_sampling import RandomUnderSampler
X_resampled4, y_resampled4 = Sampling(X_train, y_train,'randomunder')

In [None]:
pca = PCA(n_components=2)
pca.fit(X_resampled4)
g = pca.transform(X_resampled4)
g = pd.DataFrame(g, columns=['a','b'])
g['Classification'] = y_resampled4
sns.scatterplot(data = g , x='a' , y='b', hue= g['Classification'], legend= True )
print(pca.explained_variance_)
print('\n Total Variance Explained:', round(sum(list(pca.explained_variance_ratio_))*100, 2))
#Percentage of variance explained by each of the selected components