In [229]:
import pandas as pd 
import numpy as np 
from sklearn.impute import KNNImputer
import matplotlib.pyplot as plt 
from sklearn.preprocessing import MinMaxScaler

In [258]:
# Directory that holds the SECOM files. Kept in one place so the notebook can be
# pointed at another checkout by editing a single line.
DATA_DIR = '/Users/arasdirekoglu/Documents/Github/Fault-Detection-SECOM-2'

# Feature matrix: space-delimited file without a header row; 'NaN' marks missing values.
df_features = pd.read_csv(f'{DATA_DIR}/secom.data', delimiter=' ', header=None, na_values=['NaN'])
# Name the columns feature1 ... featureN (1-based).
df_features.columns = [f'feature{i}' for i in range(1, len(df_features.columns) + 1)]

# Labels file: read the same way, then named as a status column and a timestamp column.
df_target = pd.read_csv(f'{DATA_DIR}/secom_labels.data', delimiter=' ', header=None, na_values=['NaN'])
df_target.columns = ['status', 'timestamp']
# Timestamps are parsed with the day before the month.
df_target['timestamp'] = pd.to_datetime(df_target['timestamp'], dayfirst=True)

# Column-wise concatenation of features and labels into a single frame.
df_secom = pd.concat([df_features, df_target], axis=1)

Attempt KNN Imputation 

In [259]:
# Total count of missing cells in the raw feature frame, before any KNN imputation
df_features.isna().sum().sum()

41951

In [243]:
# KNN imputation with the imputer's default settings.
# fit_transform fits and transforms in one call, so no separate fit() is needed first.
knn = KNNImputer()
data_knn = pd.DataFrame(
    knn.fit_transform(df_features),
    columns=df_features.columns,
    index=df_features.index,  # keep row labels aligned with df_features
)

In [244]:
# Total count of missing cells that remain once KNN imputation has been applied
data_knn.isna().sum().sum()

0

Definition & Test of Function: KNN Imputation

In [252]:
#Definition of Function
def knnimputation_distance(x, n_neighbors=20):
    """Impute missing values with a distance-weighted KNN imputer.

    The data is min-max scaled before imputation and scaled back afterwards,
    so the returned values are in the original units of ``x``.

    Parameters
    ----------
    x : pandas.DataFrame
        Numeric frame that may contain NaN. It is not modified.
    n_neighbors : int, default 20
        Number of neighbours the imputer uses for each missing value.

    Returns
    -------
    pandas.DataFrame
        Imputed frame with the same columns and the same index as ``x``.
    """
    # NOTE(review): a column that is entirely NaN is presumably dropped by the
    # imputer, which would make the column count mismatch below - confirm for
    # the scikit-learn version in use.
    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(x)

    imputer = KNNImputer(n_neighbors=n_neighbors, weights='distance')
    imputed = imputer.fit_transform(scaled)

    # Undo the scaling and re-attach the caller's labels; keeping the index lets
    # the result be concatenated with other frames without misaligning rows.
    restored = scaler.inverse_transform(imputed)
    return pd.DataFrame(restored, columns=x.columns, index=x.index)

# n_neighbors is a tunable hyperparameter (e.g. n_neighbors=2); the KNNImputer default is 5.
# weights: the default is 'uniform'; 'distance' means that closer neighbors have a greater influence than neighbors which are further away.

In [248]:
#Definition of Function
def knnimputation_uniform(x, n_neighbors=20):
    """Impute missing values with a uniformly weighted KNN imputer.

    The data is min-max scaled before imputation and scaled back afterwards,
    so the returned values are in the original units of ``x``.

    Parameters
    ----------
    x : pandas.DataFrame
        Numeric frame that may contain NaN. It is not modified.
    n_neighbors : int, default 20
        Number of neighbours the imputer uses for each missing value.

    Returns
    -------
    pandas.DataFrame
        Imputed frame with the same columns and the same index as ``x``.
    """
    # NOTE(review): a column that is entirely NaN is presumably dropped by the
    # imputer, which would make the column count mismatch below - confirm for
    # the scikit-learn version in use.
    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(x)

    imputer = KNNImputer(n_neighbors=n_neighbors, weights='uniform')
    imputed = imputer.fit_transform(scaled)

    # Undo the scaling and re-attach the caller's labels; keeping the index lets
    # the result be concatenated with other frames without misaligning rows.
    restored = scaler.inverse_transform(imputed)
    return pd.DataFrame(restored, columns=x.columns, index=x.index)

In [249]:
# Missing-cell total in the feature frame before the imputation function is tried out
df_features.isna().sum().sum()

41951

In [253]:
# Smoke test: run the distance-weighted imputation and count the gaps that are left
df_test_function = knnimputation_distance(df_features)
df_test_function.isna().sum().sum()

0

Outlier Detection & Removal 

In [254]:
#more complex outlier removal
def outlier_complex(df, n_std=3):
    """Replace per-column outliers with NaN.

    A value counts as an outlier when it lies more than ``n_std`` standard
    deviations above or below the mean of its own column.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame to clean. It is not modified; a copy is returned.
    n_std : float, default 3
        Half-width of the accepted band, in column standard deviations.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which every outlier has been replaced by NaN.
    """
    # Work on a copy so the caller's frame keeps its original values.
    cleaned = df.copy()
    for col in cleaned.columns:
        values = cleaned[col]
        center = values.mean()
        spread = values.std()
        lower = center - n_std * spread
        upper = center + n_std * spread
        # np.nan (lowercase) is the spelling that exists in every NumPy release;
        # the np.NaN alias was removed in NumPy 2.0.
        cleaned[col] = np.where((values > upper) | (values < lower), np.nan, values)
    return cleaned

In [260]:
# Missing-cell total in the feature frame before outliers are removed
df_features.isna().sum().sum()

41951

In [256]:
#Number of Missing Values AFTER OUTLIER REMOVAL
# Hand over a copy so df_features itself keeps its original values and the
# "BEFORE" counts stay reproducible when cells are re-run.
df_test1 = outlier_complex(df_features.copy())
df_test1.isnull().sum().sum()

48067

OUTLIER REMOVAL & KNN IMPUTATION IN ONE STEP 

In [257]:
#Outlier AND KNN 
def outlierknn(df, n_std=3, n_neighbors=20):
    """Mask outliers as NaN, then fill every NaN by KNN imputation.

    Steps:
      1. Per column, values further than ``n_std`` standard deviations from the
         column mean are replaced by NaN.
      2. The data is min-max scaled.
      3. NaN values are imputed with a uniformly weighted KNN imputer.
      4. The scaling is reversed so the result is in the original units.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame that may contain NaN. It is not modified.
    n_std : float, default 3
        Half-width of the accepted band, in column standard deviations.
    n_neighbors : int, default 20
        Number of neighbours the imputer uses for each missing value.

    Returns
    -------
    pandas.DataFrame
        Cleaned and imputed frame with the same columns and index as ``df``.
    """
    # Outlier treatment first, on a copy so the caller's frame is left untouched.
    cleaned = df.copy()
    for col in cleaned.columns:
        values = cleaned[col]
        center = values.mean()
        spread = values.std()
        out_of_band = (values > center + n_std * spread) | (values < center - n_std * spread)
        # np.nan (lowercase) is the spelling that exists in every NumPy release;
        # the np.NaN alias was removed in NumPy 2.0.
        cleaned[col] = np.where(out_of_band, np.nan, values)

    # Scaling data
    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(cleaned)

    # KNN Imputation
    imputer = KNNImputer(n_neighbors=n_neighbors, weights='uniform')
    imputed = imputer.fit_transform(scaled)

    # Reverse scaling and re-attach the caller's column and row labels.
    restored = scaler.inverse_transform(imputed)
    return pd.DataFrame(restored, columns=cleaned.columns, index=cleaned.index)

In [261]:
# Missing-cell total in the feature frame before the combined outlier removal + KNN imputation
df_features.isna().sum().sum()

41951

In [262]:
#Number of Missing Values AFTER OUTLIER REMOVAL & KNN IMPUTATION
# Hand over a copy so df_features itself keeps its original values and the
# "BEFORE" counts stay reproducible when cells are re-run.
df_no_na_outlier = outlierknn(df_features.copy())
df_no_na_outlier.isnull().sum().sum()

0

APPENDIX

In [187]:
# Compact outlier removal based on z-scores
def outlier_simple(x):
    """Return a copy of ``x`` with values more than 3 standard deviations from the mean set to NaN.

    The mean and standard deviation come from ``x.mean()`` and ``x.std()``.
    The input itself is left unchanged.
    """
    z_scores = (x - x.mean()) / x.std()
    return x.mask(z_scores.abs() > 3)