In [34]:
#importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestClassifier

from boruta import BorutaPy
from BorutaShap import BorutaShap
from sklearn.feature_selection import RFE

from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN

from collections import Counter
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.svm import SVC

In [62]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Read df

In [56]:
#this function is to read, transform and join 2 data frame

def read_features():
    path = 'input/secom.data'
    df = pd.read_csv(path, delimiter=' ', header=None, na_values=['NaN'])
    df.columns = ['feature_'+str(x+1) for x in range(len(df.columns))]
    return df



def read_target():
    path = 'input/secom_labels.data'
    df = pd.read_csv(path, delimiter=' ', header=None, na_values=['NaN'])
    df.columns = ['status','timestamp']
    df['timestamp'] = pd.to_datetime(df['timestamp'],dayfirst=True)
    return df

#for the testing purporse, trim to remain first 100 rows only
X = read_features()
y = read_target().iloc[:,0]

# Remove duplicated columns

In [37]:
#find the duplicated features (columns)
def remove_duplicated_columns(df):
    dict_duplicate_pair = {}
    dict_duplicate_matches = {}
    list_duplicate = []
    to_remove = []
    for i in range(0, len(df.columns)):
        l = []
        for j in range(i+1,len(df.columns)):
            dict_duplicate_pair[str(i+1)+';'+str(j+1)] = df.iloc[:,i].equals(df.iloc[:,j])
            if df.iloc[:,i].equals(df.iloc[:,j]) == True:
                if j not in list_duplicate:
                    l.append(j)
                    to_remove.append('feature_'+str(j+1))
                list_duplicate.append(i)
                list_duplicate.append(j)
        if len(l)!=0:
            dict_duplicate_matches[i] = l


    df_duplicate_pair = pd.DataFrame.from_dict(dict_duplicate_pair, orient='index')
    df_duplicate_pair.columns=['duplicate']

    df_duplicate_matches = pd.DataFrame.from_dict(dict_duplicate_matches, orient='index')

    
    df = df.drop(columns=to_remove, axis = 1)

    return df

# X = remove_duplicated_columns(X)
# X.shape


# Remove columns with Constant volatility (std=0)

In [38]:
def remove_constant_volatility(df):
    df_EDA= df.describe().T
    df_EDA= df_EDA[df_EDA["std"] == 0]
    df = df.drop(axis=1, columns=df_EDA.index)
    return df

# X = remove_constant_volatility(X)
# X.shape

# Remove columns with high %Missing values

In [39]:
def remove_cols_with_high_pct_null(df, null_threshold):
    list_column_with_pct_null = pd.concat([df.isnull().sum(), df.isnull().sum()/df.shape[0]],axis=1).rename(columns={0:'Missing_Records', 1:'Percentage (%)'})
    list_column_with_pct_null= list_column_with_pct_null[list_column_with_pct_null["Percentage (%)"] >= null_threshold]
    df = df.drop(axis=1, columns=list_column_with_pct_null.index)
    return df

# X = remove_cols_with_high_pct_null(X, 0.8)
# X.shape

# Split data

In [40]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1,stratify=y)

# Outlier treatment

In [41]:
#how = ['NaN', '3s']
def replace_outlier(df, how):
    for col in df:
        ll_col = df[col].mean() - 3 * df[col].std()
        ul_col = df[col].mean() + 3 * df[col].std()
        if how == 'NaN':
            df[col] = np.where(df[col]>ul_col,np.NaN,np.where(df[col]<ll_col,np.NaN,df[col]))
        elif how == '3s':
            df[col] = np.where(df[col]>ul_col,ul_col,np.where(df[col]<ll_col,ll_col,df[col]))
    return df

# Missing value Imputation

In [42]:
#which_weights = ['distance','uniform']

def impute_null_with_knn(X_train, X_test, which_weights):
    #First scale the data 
    scaler = MinMaxScaler()
    X_train = pd.DataFrame(scaler.fit_transform(X_train), columns= X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), columns= X_test.columns)

    knn = KNNImputer(n_neighbors=5, weights=which_weights) #check this neighbors = 5

    X_train = pd.DataFrame(knn.fit_transform(X_train), columns=X_train.columns)
    X_test = pd.DataFrame(knn.transform(X_test), columns=X_test.columns)
    
    X_train = pd.DataFrame(scaler.inverse_transform(X_train), columns= X_train.columns)
    X_test = pd.DataFrame(scaler.inverse_transform(X_test), columns= X_test.columns)
    return X_train, X_test

#X_train = impute_null_with_knn(X_train)

In [43]:
def impute_null_with_mice(X_train, X_test): 
    imp = IterativeImputer(maX_train_iter=10, verbose=0, imputation_order='roman', random_state=0)
    X_train = pd.DataFrame(imp.fit_transform(X_train), columns=X_train.columns)
    X_test = pd.DataFrame(imp.transform(X_test), columns=X_test.columns)
    return X_train, X_test

# Feature Selection

In [44]:
#list_method=['shap','gini']

def BorutaShap_FS (X, y, method_option) :
    modelshap = RandomForestClassifier(n_jobs=-1,n_estimators=100, class_weight='balanced_subsample', max_depth=5, random_state=100)
    # define model for resp. classifier
    modelshap.fit(X,y)
    ##-- feature_names = np.array(X.columns)
    # define Boruta Sahp feature selection method
    feature_selector = BorutaShap(model=modelshap,
                              importance_measure=method_option,
                              classification=True)  # find all relevant features
    feature_selector.fit(X,y,n_trials=100, sample=False, verbose=False, random_state=100)  
    ##-- feature_selector.plot(which_features='accepted',figsize=(20,10))
    # call transform() on X to filter it down to selected features
    return  feature_selector.Subset()

In [45]:
#RFE function with random forest

def RFE_FS (X, y, classify) :
    feature_names = np.array(X.columns)
    if classify == 'RF':
    # define random forest classifier
        model = RandomForestClassifier(n_jobs=-1, class_weight='balanced_subsample', max_depth=5, random_state=100)
        #model.fit(X, y)
        #rfe = RFE(estimator = model,n_features_to_select = 15)
    if classify== 'SVM':
        model = SVC(kernel='linear',C=5)
        #model.fit(X, y)
        #rfe = RFECV(estimator = model,scoring='accuracy')
    # find all relevant features
    model.fit(X, y)
    rfe = RFE(estimator = model,n_features_to_select = 15)
    rfe.fit(X,y)

    # check selected features
    ##--rfe.support_

    # check ranking of features
    ##--rfe.ranking_

    # zip feature names, ranks, and decisions 
    # feature_ranks = list(zip(feature_names, 
    #                          rfe.ranking_, 
    #                          rfe.support_))

    # print the results
    ##--for feat in feature_ranks:
    ##--    print('Feature: {:<30} Rank: {},  Keep: {}'.format(feat[0], feat[1], feat[2]))
        
    final_features_rfe = list()
    indexes = np.where(rfe.ranking_ <= 1)
    for x in np.nditer(indexes):
        final_features_rfe.append(feature_names[x])
    ##-- print(final_features_rfe)
    
 # call transform() on X to filter it down to selected features
    return pd.DataFrame(X.filter(final_features_rfe))

In [46]:
#Boruta function with random forest

def BorutaPy_FS (X, y) :
    feature_names = np.array(X.columns)

    # define random forest classifier
    model = RandomForestClassifier(n_jobs=-1, class_weight='balanced_subsample', max_depth=5, random_state=100)
    model.fit(X, y)
    # define Boruta feature selection method
    
    feature_selector = BorutaPy(model, n_estimators='auto', verbose=0, random_state=100, max_iter=140)

    # find all relevant features
    feature_selector.fit(X.to_numpy(),y)

    # check selected features
    ##--feature_selector.support_

    # check ranking of features
    ##--feature_ranking=feature_selector.ranking_

    # zip feature names, ranks, and decisions 
    # feature_ranks = list(zip(feature_names, 
    #                          feature_selector.ranking_, 
    #                          feature_selector.support_))

    # print the results
    ##--for feat in feature_ranks:
    ##--    print('Feature: {:<30} Rank: {},  Keep: {}'.format(feat[0], feat[1], feat[2]))
        
    final_features = list()
    indexes = np.where(feature_selector.ranking_ <= 1)
    for x in np.nditer(indexes):
        final_features.append(feature_names[x])
    ##--print(final_features)
    
 # call transform() on X to filter it down to selected features
    return pd.DataFrame(X.filter(final_features))

# Balancing

In [52]:
def sampling(X_train, y_train, sampler):
    
    #SMOTE
    if sampler == 'SMOTE':
        sampler = SMOTE(random_state=100)    
    
    #ROSE
    if sampler == 'ROSE':
        sampler = RandomOverSampler(random_state=100, shrinkage=1)

    #ADASYN
    if sampler == 'ADASYN':
        sampler = ADASYN(random_state=100)
    

    #SMOTTEENN
    if sampler == 'SMOTEENN' :
        sampler = SMOTEENN(random_state=100)
        
        
    #Random under Sampling
    if sampler == "randomunder":
        sampler = RandomUnderSampler(random_state=100)

    X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)
    counter = Counter(y_resampled)
    print(counter)
    
    return X_resampled, y_resampled

# X_train, y_train = sampling(X_train, y_train,'SMOTE')
# X_train.shape

# Model

In [48]:
def run_model(X_train, y_train, X_test, y_test):
    # building model before balancing data
    model = RandomForestClassifier(random_state=1, n_estimators=1000, max_depth=5)
    model.fit(X_train,y_train)
    y_pred= model.predict(X_test)
    accuracy= accuracy_score(y_test, y_pred)
    cf_matrix = confusion_matrix(y_test, y_pred)
    specificity = cf_matrix[1][1] / ( cf_matrix[1][1] + cf_matrix[1][0] )
    #add type I/II error, F1
    #add loss/cost ROC/AUC 
    return accuracy, specificity, cf_matrix

#run_model(X_train, y_train, X_test, y_test)

## Combine

In [59]:
X = read_features()
y = read_target().iloc[:,0]

result = []
i = 0

#step 4:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1, stratify=y)


#step 1
X = remove_duplicated_columns(X)
#step 2:
X = remove_constant_volatility(X)
#step 3:
X = remove_cols_with_high_pct_null(X, 0.8) #this can be in the loop too, may be later






#step 5-9
replace_outlier_options = ['NaN', '3s']
impute_null_options = ['knn__distance', 'knn__uniform', 'MICE']
FS_options = ['BoS__shap', 'BoS__gini', 'RFE__RF', 'RFE__SVM', 'BoP']
sampling_options = ['SMOTE','ROSE','ADASYN','SMOTEENN']

for replace_with in replace_outlier_options:
    for knn_weight in impute_null_options:
        for classifier_model in FS_options:
            #<remove correlated columns, decide on the thresold 70%>
            for sampling_technique in sampling_options:
                X_train_temp = X_train
                X_test_temp = X_test
                y_train_temp = y_train
                y_test_temp = y_test

                #step 5: oulier treatement (on both TRAIN & TEST split)
                X_train_temp = replace_outlier(X_train_temp, replace_with)
                X_test_temp = replace_outlier(X_test_temp, replace_with)
                
                #step 6: missing value imputation (on both TRAIN & TEST split)
                if knn_weight == 'knn__distance' or knn_weight == 'knn__uniform':
                    X_train_temp, X_test_temp = impute_null_with_knn(X_train_temp, X_test_temp, knn_weight[-(len(knn_weight)-5):])
                elif knn_weight == 'MICE':
                    X_train_temp, X_test_temp = impute_null_with_mice(X_train_temp, X_test_temp)


                if classifier_model == 'BoS__shap' or classifier_model == 'BoS__gini':
                    X_train_temp = BorutaShap_FS(X_train_temp, y_train_temp, classifier_model[-(len(classifier_model)-5):])
                elif classifier_model == 'RFE__RF' or classifier_model == 'RFE__SVM':
                    X_train_temp = RFE_FS(X_train_temp, y_train_temp, classifier_model[-(len(classifier_model)-5):])
                elif classifier_model == 'BoP':
                    X_train_temp = BorutaPy_FS(X_train_temp, y_train_temp)
                

                #step 7: sampling only on TRAIN split
                X_train_temp, y_train_temp = sampling(X_train_temp, y_train_temp, sampling_technique)
                X_train_temp.to_csv('output/X_train_temp_'+str(i)+'.csv')
                X_test_temp.to_csv('output/X_test_temp_'+str(i)+'.csv')
                y_train_temp.to_csv('output/y_train_temp_'+str(i)+'.csv')
                y_test_temp.to_csv('output/y_test_temp_'+str(i)+'.csv')
                i+=1

                #step 8: remove unused features on TRAIN_TEST split
                X_test_temp = X_test_temp.loc[:,X_train_temp.columns]


                #step 9: train model, predict, and print scores
                accuracy, specificity, cf_matrix = run_model(X_train_temp, y_train_temp, X_test_temp, y_test_temp)
                result.append([i, accuracy, specificity, cf_matrix, X_train_temp.columns])
                
                combined_technique = replace_with +'&'+ knn_weight +'&'+ classifier_model +'&'+ sampling_technique
                print(i, combined_technique,'\n', accuracy, '\n', specificity,'\n', cf_matrix, '\n', X_train_temp.columns, '\n')


df_result = pd.DataFrame(result)
df_result.to_csv('output/result.csv')
print(result)

100%|██████████| 100/100 [01:45<00:00,  1.05s/it]


Counter({-1: 1170, 1: 1170})
1 NaN&knn__distance&BoS__shap&SMOTE 
 0.7197452229299363 
 0.47619047619047616 
 [[216  77]
 [ 11  10]] 
 Index(['feature_469', 'feature_130', 'feature_60', 'feature_34', 'feature_478',
       'feature_461'],
      dtype='object') 



100%|██████████| 100/100 [01:33<00:00,  1.07it/s]


Counter({-1: 1170, 1: 1170})
2 NaN&knn__distance&BoS__shap&ROSE 
 0.7834394904458599 
 0.42857142857142855 
 [[237  56]
 [ 12   9]] 
 Index(['feature_248', 'feature_469', 'feature_130', 'feature_60', 'feature_34',
       'feature_461'],
      dtype='object') 



100%|██████████| 100/100 [01:39<00:00,  1.01it/s]


Counter({1: 1202, -1: 1170})
3 NaN&knn__distance&BoS__shap&ADASYN 
 0.7484076433121019 
 0.6190476190476191 
 [[222  71]
 [  8  13]] 
 Index(['feature_469', 'feature_130', 'feature_60', 'feature_34',
       'feature_461'],
      dtype='object') 



100%|██████████| 100/100 [01:40<00:00,  1.00s/it]


Counter({1: 804, -1: 643})
4 NaN&knn__distance&BoS__shap&SMOTEENN 
 0.7611464968152867 
 0.6190476190476191 
 [[226  67]
 [  8  13]] 
 Index(['feature_248', 'feature_469', 'feature_60', 'feature_34',
       'feature_461'],
      dtype='object') 



100%|██████████| 100/100 [00:53<00:00,  1.87it/s]


Counter({-1: 1170, 1: 1170})
5 NaN&knn__distance&BoS__gini&SMOTE 
 0.7133757961783439 
 0.5714285714285714 
 [[212  81]
 [  9  12]] 
 Index(['feature_248', 'feature_469', 'feature_130', 'feature_206',
       'feature_60', 'feature_34', 'feature_461'],
      dtype='object') 



100%|██████████| 100/100 [00:55<00:00,  1.81it/s]


Counter({-1: 1170, 1: 1170})
6 NaN&knn__distance&BoS__gini&ROSE 
 0.8439490445859873 
 0.47619047619047616 
 [[255  38]
 [ 11  10]] 
 Index(['feature_469', 'feature_130', 'feature_60', 'feature_34',
       'feature_461'],
      dtype='object') 



100%|██████████| 100/100 [00:54<00:00,  1.83it/s]


Counter({1: 1205, -1: 1170})
7 NaN&knn__distance&BoS__gini&ADASYN 
 0.7133757961783439 
 0.5714285714285714 
 [[212  81]
 [  9  12]] 
 Index(['feature_469', 'feature_130', 'feature_60', 'feature_34', 'feature_478',
       'feature_461'],
      dtype='object') 



100%|██████████| 100/100 [00:56<00:00,  1.78it/s]


Counter({1: 898, -1: 618})
8 NaN&knn__distance&BoS__gini&SMOTEENN 
 0.6592356687898089 
 0.7142857142857143 
 [[192 101]
 [  6  15]] 
 Index(['feature_248', 'feature_469', 'feature_130', 'feature_60', 'feature_34',
       'feature_478', 'feature_461'],
      dtype='object') 

Counter({-1: 1170, 1: 1170})
9 NaN&knn__distance&RFE__RF&SMOTE 
 0.7802547770700637 
 0.47619047619047616 
 [[235  58]
 [ 11  10]] 
 Index(['feature_1', 'feature_20', 'feature_32', 'feature_34', 'feature_60',
       'feature_92', 'feature_103', 'feature_104', 'feature_122',
       'feature_130', 'feature_131', 'feature_154', 'feature_206',
       'feature_248', 'feature_478'],
      dtype='object') 

Counter({-1: 1170, 1: 1170})
10 NaN&knn__distance&RFE__RF&ROSE 
 0.8280254777070064 
 0.3333333333333333 
 [[253  40]
 [ 14   7]] 
 Index(['feature_1', 'feature_20', 'feature_22', 'feature_32', 'feature_34',
       'feature_60', 'feature_103', 'feature_122', 'feature_130',
       'feature_131', 'feature_248', 'feature

NameError: name 'SVC' is not defined

# --------------------Appendix--------------------

In [None]:
# #------------------------------------
# X_train = replace_outlier(X_train, how)
# replace_outlier_options = ['NaN', '3s']
# #--
# X_train = impute_null_with_knn(X_train, which_weights) #
# impute_null_options = ['knn__distance', 'knn__uniform', 'MICE']

# X_train = impute_null_with_mice(X_train)
# #--
# BorutaShap_FS(X_train, y_train, method_option)
# list_method=['shap','gini']

# RFE_FS(X_train, y_train, classify) 
# list_clf=['RF','SVM']

# BorutaPy_FS(X_train, y_train)

# FS_options = ['BoS__shap', 'BoS__gini', 'RFE__RF', 'RFE__SVM', 'BoP']
# #--
# sampling(X_train, y_train, sampler)
# sampling_options = ['SMOTE','ROSE','ADASYN','SMOTEENN']

# #--------------------------------------

In [9]:
# l = []
# a=1
# b=2
# c=3

# l.append([a,b,c])
# l.append([b,c,a])

# l



[[1, 2, 3], [2, 3, 1]]

In [None]:
# list_null_impute = [knnimputation_distance, MICEimputation_distance]
# list_null_outlier = [outlier_knn,outlier_3s]
# list_feat_selection = ['Boruta_RF', 'Boruta_shap', 'RFE']
# Boruta = ['RF', 'XGB']
# Boruta_shap = ['RF', 'XGB', 'kNN']
# RFE = ['RF', 'SVC']

In [None]:
# #TESTING
# df = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [1,1,1]]),
#                    columns=['a', 'b', 'c'])
# df

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9
3,1,1,1
