#### Importing packages

In [18]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import plotly as py
import plotly.express as px
import missingno as msno
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler

#### Functions

In [19]:
# Function to remove missing values above a given threshold
def missing_remove(dataframe, threshold):
    columns = dataframe.columns[(dataframe.isna().sum()/dataframe.shape[1])>threshold].tolist()
    print(f"# Features deleted with more than {threshold} % missing values", len(columns))
    return dataframe.drop(columns, axis=1,inplace=True)

In [20]:
# Function to remove features with unique values
def unique_remove(dataframe):
    unique_features = []
    for col in dataframe.columns:
        if(dataframe[col].unique().size == 2):
            if(len(unique_features) == 0):
                unique_features =  dataframe[col]
            else:
                 unique_features = pd.concat([unique_features, dataframe[col]], axis=1)
            dataframe.drop(col, axis=1, inplace=True)

In [21]:
# Function to remove outlier
def outlier_detection_zcore(dataframe):
    data_mean, data_std  = dataframe.mean(), dataframe.std()
    data_z_scores = ((dataframe - data_mean) / data_std).abs()
    m = data_z_scores > 3
    dataframe = dataframe.mask(m,inplace=True)
    return dataframe

#### Importing and basic cleaning

In [22]:
# Importing data from internet in runtime
data_url="https://archive.ics.uci.edu/ml/machine-learning-databases/secom/secom.data"

In [23]:
# Assigning data to a pandas dataframe
secom_data = pd.read_csv(data_url,sep=' ',header=None)

In [24]:
# Renaming the columns 
secom_data.columns = ["Feature_"+str(column+1) for column in range(len(secom_data.columns))]
secom_data.head()

Unnamed: 0,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,Feature_9,Feature_10,...,Feature_581,Feature_582,Feature_583,Feature_584,Feature_585,Feature_586,Feature_587,Feature_588,Feature_589,Feature_590
0,3030.93,2564.0,2187.7333,1411.1265,1.3602,100.0,97.6133,0.1242,1.5005,0.0162,...,,,0.5005,0.0118,0.0035,2.363,,,,
1,3095.78,2465.14,2230.4222,1463.6606,0.8294,100.0,102.3433,0.1247,1.4966,-0.0005,...,0.006,208.2045,0.5019,0.0223,0.0055,4.4447,0.0096,0.0201,0.006,208.2045
2,2932.61,2559.94,2186.4111,1698.0172,1.5102,100.0,95.4878,0.1241,1.4436,0.0041,...,0.0148,82.8602,0.4958,0.0157,0.0039,3.1745,0.0584,0.0484,0.0148,82.8602
3,2988.72,2479.9,2199.0333,909.7926,1.3204,100.0,104.2367,0.1217,1.4882,-0.0124,...,0.0044,73.8432,0.499,0.0103,0.0025,2.0544,0.0202,0.0149,0.0044,73.8432
4,3032.24,2502.87,2233.3667,1326.52,1.5334,100.0,100.3967,0.1235,1.5031,-0.0031,...,,,0.48,0.4766,0.1045,99.3032,0.0202,0.0149,0.0044,73.8432


In [25]:
# Importing target data from internet in runtime
label_url="https://archive.ics.uci.edu/ml/machine-learning-databases/secom/secom_labels.data"

In [26]:
# Assigning target data to a pandas dataframe
secom_labels = pd.read_csv(label_url, sep = " ",header=None)

In [27]:
# Renaming the columns 
secom_labels.columns = ["Classification","Timestamp"]

In [28]:
# Changing datatype for consitency
secom_labels['Timestamp'] = pd.to_datetime(secom_labels['Timestamp'],errors='raise')

In [29]:
# Merging the data
data= pd.concat([secom_labels,secom_data],axis=1)

In [30]:
# Dropping the timestamp column as it provides no insights
data.drop(columns="Timestamp", errors='raise',inplace=True)

In [31]:
target = data[['Classification']]
data.drop(['Classification'], axis=1,inplace=True)

#### Spliting the data into training set and test set

In [32]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data,target, test_size=0.25, random_state=42, stratify=target)
# We use the startify parameter as the data is imbalanced on the basis of pass and fail and this will ensure the same 
#is retained when we split - Random state will help us create a reproducible data - Test size is the split ration

In [33]:
X_test_original = X_test

#### Re-labeling the Target values


In [17]:
# # Relabeling target variables 1 is pass and 0 is fail
# y_train = y_train.replace(to_replace=[-1, 1], value=[1, 0])
# y_test = y_test.replace(to_replace=[-1, 1], value=[1, 0])

In [None]:
# # Scaling the data
# scaler = StandardScaler()
# scaler.fit(X_train)
# X_train = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
# X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

#### Removing columns with more that 50% missing values

In [34]:
X_train.shape

(1175, 590)

In [35]:
X_train_missing = X_train
missing_remove(X_train_missing, 0.5)
X_train_missing.head()

# Features deleted with more than 0.5 % missing values 32


Unnamed: 0,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,Feature_9,Feature_10,...,Feature_577,Feature_578,Feature_583,Feature_584,Feature_585,Feature_586,Feature_587,Feature_588,Feature_589,Feature_590
160,3057.03,2468.41,2184.8778,960.8486,1.016,100.0,102.5333,0.1214,1.4549,-0.0125,...,1.0653,15.0191,0.5067,0.0126,0.0034,2.4798,0.0195,0.0121,0.0048,62.1248
814,3012.09,2551.08,2216.7333,1748.0885,1.1127,100.0,97.5822,0.1242,1.5136,-0.009,...,2.453,12.7241,0.4994,0.0073,0.002,1.4634,0.0111,0.0069,0.0021,62.3602
124,3032.81,2534.74,2239.4223,1997.3782,1.5397,100.0,98.3356,0.1229,1.4974,-0.0046,...,1.5902,18.6118,0.495,0.0149,0.0041,3.0137,0.0241,0.0086,0.0027,35.555
501,2987.72,2550.52,2180.7,1159.3838,1.0177,100.0,98.9367,0.1222,1.4207,0.0016,...,1.5141,11.3379,0.5042,0.0099,0.003,1.9562,0.0048,0.0226,0.0079,474.0812
1362,3001.9,2465.51,2223.0444,1194.5986,1.2016,100.0,112.5811,0.1229,1.4201,-0.0182,...,83.919,67.3679,0.5026,0.0085,0.0026,1.6862,0.0182,0.0077,0.0025,42.5048


#### Removing Columns with non unique values

In [36]:
X_train_unique = X_train_missing
unique_remove(X_train_unique)
X_train_unique.shape

(1175, 442)

### Removing outliers and using knn

In [37]:
X_train_unique.isnull().sum().sum()

6038

In [38]:
X_train_no_outlier = X_train_unique
outlier_detection_zcore(X_train_no_outlier)
X_train_no_outlier.isnull().sum().sum()

10506

In [39]:
# From test set deleting columns that are removed in training set
X_test = X_test[np.array(X_train_unique.columns)]
X_test.shape

(392, 442)

In [40]:
X_test.isnull().sum().sum()

1970

In [41]:
outlier_detection_zcore(X_test)
X_test.isnull().sum().sum()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._where(cond, other, inplace, axis, level, errors=errors)


3662