In [168]:
import pandas as pd 
import numpy as np 
from sklearn.impute import KNNImputer
import matplotlib.pyplot as plt 
from sklearn.preprocessing import MinMaxScaler

In [169]:
#Location of the SECOM files; change this single constant when the data lives elsewhere
DATA_DIR = '/Users/arasdirekoglu/Documents/Github/Fault-Detection-SECOM-2'

#df features: space-separated file without a header row, 'NaN' marks missing values
df_features = pd.read_csv(f'{DATA_DIR}/secom.data', sep=' ', header=None, na_values=['NaN'])
df_features.columns = [f'feature{i}' for i in range(1, df_features.shape[1] + 1)]

#df target: same file layout, two columns (status and timestamp)
df_target = pd.read_csv(f'{DATA_DIR}/secom_labels.data', sep=' ', header=None, na_values=['NaN'])
df_target.columns = ['status', 'timestamp']
#dayfirst=True makes ambiguous dates parse as day/month/year
df_target['timestamp'] = pd.to_datetime(df_target['timestamp'], dayfirst=True)

#concat both to df (column-wise, rows are matched on the default row index)
df_secom = pd.concat([df_features, df_target], axis=1)

Attempt KNN Imputation 

In [170]:
#Count the rows that contain at least one missing value BEFORE KNN imputation
df_features.isnull().any(axis='columns').sum()

1567

In [6]:
#KNN Imputation with the default KNNImputer settings
knn = KNNImputer()
#fit_transform fits and imputes in one step, so a separate fit() call beforehand is not needed
data_knn = pd.DataFrame(knn.fit_transform(df_features), columns=df_features.columns, index=df_features.index)

In [7]:
#Count the rows that still contain a missing value AFTER KNN imputation
data_knn.isnull().any(axis='columns').sum()

0

Definition & Test of Function

In [6]:
#Definition of Function
def knnimputation_distance(x, n_neighbors=20):
    """Impute missing values with a distance-weighted KNN imputer.

    The frame is min-max scaled, imputed with ``KNNImputer(weights='distance')``
    and then scaled back, so the result is expressed in the original units.

    Parameters
    ----------
    x : pandas.DataFrame
        Data containing missing values. It is not modified.
    n_neighbors : int, default 20
        Number of neighbors used for each imputed value.

    Returns
    -------
    pandas.DataFrame
        Imputed data with the same columns and the same row index as ``x``.
    """
    def as_frame(values):
        # sklearn returns bare arrays; re-attach the labels of the input so
        # the result stays aligned with ``x`` even for a non-default index.
        return pd.DataFrame(values, columns=x.columns, index=x.index)

    # NOTE(review): KNNImputer may drop columns without any observed value,
    # which would break the relabelling above -- confirm inputs have none.
    scaler = MinMaxScaler()
    scaled = as_frame(scaler.fit_transform(x))
    knn = KNNImputer(n_neighbors=n_neighbors, weights='distance')
    imputed = as_frame(knn.fit_transform(scaled))
    return as_frame(scaler.inverse_transform(imputed))

#n_neighbors is a tunable hyperparameter (e.g. 2 is possible); the scikit-learn default is 5, the value used here is 20
#weights: the default is 'uniform'; 'distance' means that closer neighbors have a greater influence than neighbors which are further away

In [7]:
#Definition of Function
def knnimputation_uniform(x, n_neighbors=20):
    """Impute missing values with a uniformly weighted KNN imputer.

    The frame is min-max scaled, imputed with ``KNNImputer(weights='uniform')``
    and then scaled back, so the result is expressed in the original units.

    Parameters
    ----------
    x : pandas.DataFrame
        Data containing missing values. It is not modified.
    n_neighbors : int, default 20
        Number of neighbors used for each imputed value.

    Returns
    -------
    pandas.DataFrame
        Imputed data with the same columns and the same row index as ``x``.
    """
    def as_frame(values):
        # sklearn returns bare arrays; re-attach the labels of the input so
        # the result stays aligned with ``x`` even for a non-default index.
        return pd.DataFrame(values, columns=x.columns, index=x.index)

    scaler = MinMaxScaler()
    scaled = as_frame(scaler.fit_transform(x))
    knn = KNNImputer(n_neighbors=n_neighbors, weights='uniform')
    imputed = as_frame(knn.fit_transform(scaled))
    return as_frame(scaler.inverse_transform(imputed))

In [115]:
#Count the rows that contain at least one missing value BEFORE KNN imputation
df_features.isnull().any(axis='columns').sum()

1567

In [5]:
#Test of function
#No plain `knnimputation` is defined in this notebook; the uniform variant defined above is used
df_test_function = knnimputation_uniform(df_features)
df_test_function.isna().any(axis=1).sum()

0

Test of Function with a completely different Data Set

In [178]:
#Load the second data set and keep only the three columns used below
df_test = pd.read_csv('testdata.csv').loc[:, ['Age', 'Fare', 'Pclass']]
df_test.head()

Unnamed: 0,Age,Fare,Pclass
0,22.0,7.25,3
1,38.0,71.2833,1
2,26.0,7.925,3
3,35.0,53.1,1
4,35.0,8.05,3


In [172]:
#Count the rows of the test data that contain at least one missing value
df_test.isnull().any(axis='columns').sum()

177

In [13]:
#No plain `knnimputation` is defined in this notebook; the uniform variant defined above is used
df_test_knn = knnimputation_uniform(df_test)
df_test_knn.isna().any(axis=1).sum()

0

Outlier Treatment with KNN

In [153]:
#simple outlier removal with one line 
def outlier_simple(x, threshold=3):
    """Replace outliers with NaN using a per-column z-score rule.

    A value is treated as an outlier when it lies more than ``threshold``
    standard deviations away from the mean of its column.

    Parameters
    ----------
    x : pandas.DataFrame
        Numeric data. It is not modified.
    threshold : float, default 3
        Maximum allowed absolute z-score.

    Returns
    -------
    pandas.DataFrame
        Copy of ``x`` in which every outlier is replaced by NaN.
    """
    # Mean and std are computed per column and skip existing NaN values.
    z_scores = x.sub(x.mean()).div(x.std())
    dfnew = x.mask(z_scores.abs().gt(threshold))
    return dfnew

In [154]:
#Rows with at least one missing value before the outlier removal
df_test.isnull().any(axis='columns').sum()

177

In [155]:
#Rows with at least one missing value after the outliers were replaced by NaN
df_test_simple_nona = outlier_simple(df_test)
df_test_simple_nona.isnull().any(axis='columns').sum()

197

In [166]:
#more complex outlier removal
def outlier_complex(df, threshold=3):
    """Replace outliers with NaN column by column, modifying ``df`` in place.

    For every column the limits ``mean - threshold * std`` and
    ``mean + threshold * std`` are computed; values outside these limits are
    overwritten with NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric data. The columns of this very object are overwritten, so
        pass ``df.copy()`` when the original values are still needed.
    threshold : float, default 3
        Number of standard deviations that defines the limits.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in. Every column is float afterwards
        because it is rebuilt from an array that may hold NaN.
    """
    for col in df:
        values = df[col]
        center = values.mean()
        spread = values.std()
        ll_col = center - threshold * spread
        ul_col = center + threshold * spread
        # np.nan instead of np.NaN: the capitalised alias was removed in NumPy 2.0.
        df[col] = np.where((values > ul_col) | (values < ll_col), np.nan, values)
    return df

In [165]:
#outlier_complex overwrites the frame it receives, so it runs on a copy to keep df_test unchanged for the cells below
df_test_complex = outlier_complex(df_test.copy())
df_test_complex.isna().any(axis=1).sum()

488

In [174]:
#Outlier AND KNN 
def outlierknn(df, n_neighbors=20, threshold=3):
    """Mask outliers and fill all missing values by KNN imputation.

    Steps: values further than ``threshold`` standard deviations from their
    column mean are set to NaN, the data is min-max scaled, imputed with a
    uniformly weighted ``KNNImputer`` and finally scaled back.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric data, possibly with missing values. It is not modified.
    n_neighbors : int, default 20
        Number of neighbors used for each imputed value.
    threshold : float, default 3
        Maximum allowed absolute z-score before a value is masked.

    Returns
    -------
    pandas.DataFrame
        Treated data with the same columns and the same row index as ``df``.
    """
    def as_frame(values):
        # sklearn returns bare arrays; re-attach the labels of the input so
        # the result stays aligned with ``df`` even for a non-default index.
        return pd.DataFrame(values, columns=df.columns, index=df.index)

    #Outlier treatment first:
    z_scores = df.sub(df.mean()).div(df.std())
    masked = df.mask(z_scores.abs().gt(threshold))
    #Scaling data
    scaler = MinMaxScaler()
    scaled = as_frame(scaler.fit_transform(masked))
    #KNN Imputation
    knn = KNNImputer(n_neighbors=n_neighbors, weights='uniform')
    imputed = as_frame(knn.fit_transform(scaled))
    #Reverse Scaling
    return as_frame(scaler.inverse_transform(imputed))

In [179]:
#Run the combined outlier masking and KNN imputation on the test data
df_test_new = outlierknn(df_test)

In [180]:
#Summary statistics of the frame that was passed to outlierknn, for comparison with df_test_new
df_test.describe()

Unnamed: 0,Age,Fare,Pclass
count,714.0,891.0,891.0
mean,29.699118,32.204208,2.308642
std,14.526497,49.693429,0.836071
min,0.42,0.0,1.0
25%,20.125,7.9104,2.0
50%,28.0,14.4542,3.0
75%,38.0,31.0,3.0
max,80.0,512.3292,3.0


In [181]:
#Summary statistics of the frame returned by outlierknn
df_test_new.describe()

Unnamed: 0,Age,Fare,Pclass
count,891.0,891.0,891.0
mean,29.215839,27.098649,2.308642
std,12.991614,29.28063,0.836071
min,0.42,0.0,1.0
25%,21.0,7.9104,2.0
50%,28.0,14.4542,3.0
75%,35.0,31.0,3.0
max,71.0,164.8667,3.0
