In [2]:
#importing libraries
from collections import Counter

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.experimental import enable_iterative_imputer  # noqa: F401 -- must run before importing IterativeImputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.svm import SVC
from boruta import BorutaPy
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay

In [190]:
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay

# Read df

In [162]:
#these functions read the SECOM feature file and label file into two separate data frames

def read_features(path='secom.data'):
    """Load the feature matrix from a space-delimited, header-less text file.

    Parameters
    ----------
    path : str or path-like, default 'secom.data'
        Location of the feature file. The default is the file name that was
        previously hard-coded, so calls without arguments behave as before.

    Returns
    -------
    pandas.DataFrame
        One column per field in the file, named 'feature_1' ... 'feature_N'
        by position. The token 'NaN' is parsed as a missing value.
    """
    df = pd.read_csv(path, delimiter=' ', header=None, na_values=['NaN'])
    # f-string instead of str(...) so the naming does not depend on the builtin `str`
    df.columns = [f'feature_{position}' for position in range(1, df.shape[1] + 1)]
    return df



def read_target(path='secom_labels.data'):
    """Load the label file: one status value and one timestamp per row.

    Parameters
    ----------
    path : str or path-like, default 'secom_labels.data'
        Location of the space-delimited, header-less label file. The default
        is the file name that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        Two columns: 'status' (first field) and 'timestamp' (second field,
        parsed to datetime with the day before the month).
    """
    df = pd.read_csv(path, delimiter=' ', header=None, na_values=['NaN'])
    df.columns = ['status', 'timestamp']
    # dayfirst=True: dates are written day/month/year
    df['timestamp'] = pd.to_datetime(df['timestamp'], dayfirst=True)
    return df

# Load the full feature matrix and the status labels (no row trimming is applied here).
X = read_features()
y = read_target()['status']

In [163]:
X.shape

(1567, 590)

# Remove duplicated columns

In [164]:
#find the duplicated features (columns)
def remove_duplicated_columns(df):
    """Drop every column that is an exact copy of an earlier column.

    Two columns count as duplicates when ``Series.equals`` is true for them
    (missing values in the same positions compare equal). The first
    occurrence is kept; later copies are dropped.

    Columns are selected by position, so the function works for any column
    labels. The previous version rebuilt the labels to drop as
    'feature_<position+1>', which only matched a frame that still had the
    original, complete 'feature_N' naming.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        A new frame without the duplicated columns; ``df`` is not modified.
    """
    n_cols = df.shape[1]
    is_duplicate = [False] * n_cols
    for i in range(n_cols):
        if is_duplicate[i]:
            # Already a copy of an earlier column: anything equal to it has
            # been flagged when that earlier column was the reference.
            continue
        reference = df.iloc[:, i]
        for j in range(i + 1, n_cols):
            if not is_duplicate[j] and reference.equals(df.iloc[:, j]):
                is_duplicate[j] = True

    keep_positions = [pos for pos, dup in enumerate(is_duplicate) if not dup]
    return df.iloc[:, keep_positions]

X = remove_duplicated_columns(X)
X.shape


(1567, 486)

# Remove columns with Constant volatility (std=0)

In [165]:
X.shape

(1567, 486)

In [166]:
def remove_constant_volatility(df):
    """Drop numeric columns whose observed values are all identical.

    The previous check compared the sample standard deviation to exactly 0.
    That can miss a constant column of floats, because the mean is computed
    with rounding and the resulting std may be a tiny non-zero number.
    Counting distinct values avoids the floating-point comparison.

    A column is dropped only when it has at least two non-missing values and
    exactly one distinct value. Columns with fewer than two observations
    (for which the sample std is undefined) and non-numeric columns are kept,
    as before.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        A new frame without the constant columns.
    """
    numeric = df.select_dtypes(include='number')
    is_constant = (numeric.nunique(dropna=True) == 1) & (numeric.count() >= 2)
    return df.drop(columns=is_constant.index[is_constant])

X = remove_constant_volatility(X)
X.shape

(1567, 474)

# Remove columns with high %Missing values

In [167]:
X.shape

(1567, 474)

In [168]:
def remove_cols_with_high_pct_null(df, null_threshold):
    """Drop columns whose share of missing values reaches ``null_threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
    null_threshold : float
        Fraction of rows (0-1 scale, e.g. 0.8). A column is dropped when
        its missing fraction is greater than or equal to this value.

    Returns
    -------
    pandas.DataFrame
        A new frame without the sparse columns.
    """
    missing_counts = df.isnull().sum()
    missing_share = missing_counts / df.shape[0]
    too_sparse = missing_share.index[missing_share >= null_threshold]
    return df.drop(columns=too_sparse)

X = remove_cols_with_high_pct_null(X, 0.8)
X.shape

(1567, 466)

# Split data

In [169]:
X

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_7,feature_8,feature_9,feature_10,feature_11,...,feature_581,feature_582,feature_583,feature_584,feature_585,feature_586,feature_587,feature_588,feature_589,feature_590
0,3030.93,2564.00,2187.7333,1411.1265,1.3602,97.6133,0.1242,1.5005,0.0162,-0.0034,...,,,0.5005,0.0118,0.0035,2.3630,,,,
1,3095.78,2465.14,2230.4222,1463.6606,0.8294,102.3433,0.1247,1.4966,-0.0005,-0.0148,...,0.0060,208.2045,0.5019,0.0223,0.0055,4.4447,0.0096,0.0201,0.0060,208.2045
2,2932.61,2559.94,2186.4111,1698.0172,1.5102,95.4878,0.1241,1.4436,0.0041,0.0013,...,0.0148,82.8602,0.4958,0.0157,0.0039,3.1745,0.0584,0.0484,0.0148,82.8602
3,2988.72,2479.90,2199.0333,909.7926,1.3204,104.2367,0.1217,1.4882,-0.0124,-0.0033,...,0.0044,73.8432,0.4990,0.0103,0.0025,2.0544,0.0202,0.0149,0.0044,73.8432
4,3032.24,2502.87,2233.3667,1326.5200,1.5334,100.3967,0.1235,1.5031,-0.0031,-0.0072,...,,,0.4800,0.4766,0.1045,99.3032,0.0202,0.0149,0.0044,73.8432
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1562,2899.41,2464.36,2179.7333,3085.3781,1.4843,82.2467,0.1248,1.3424,-0.0045,-0.0057,...,0.0047,203.1720,0.4988,0.0143,0.0039,2.8669,0.0068,0.0138,0.0047,203.1720
1563,3052.31,2522.55,2198.5667,1124.6595,0.8763,98.4689,0.1205,1.4333,-0.0061,-0.0093,...,,,0.4975,0.0131,0.0036,2.6238,0.0068,0.0138,0.0047,203.1720
1564,2978.81,2379.78,2206.3000,1110.4967,0.8236,99.4122,0.1208,,,,...,0.0025,43.5231,0.4987,0.0153,0.0041,3.0590,0.0197,0.0086,0.0025,43.5231
1565,2894.92,2532.01,2177.0333,1183.7287,1.5726,98.7978,0.1213,1.4622,-0.0072,0.0032,...,0.0075,93.4941,0.5004,0.0178,0.0038,3.5662,0.0262,0.0245,0.0075,93.4941


In [170]:
# Hold out 20% of the rows as the test split; stratify on y with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
    stratify=y,
)

# Outlier treatment

In [36]:
#how = ['NaN', '3s']
def replace_outlier(df, how, n_sigma=3):
    """Treat values lying more than ``n_sigma`` standard deviations from the column mean.

    The limits are computed per column from ``df`` itself:
    ``mean - n_sigma * std`` and ``mean + n_sigma * std``.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame. It is NOT modified; a treated copy is returned.
        (The previous version assigned into ``df``, so callers holding
        another reference to the same frame saw it change.)
    how : {'NaN', '3s'}
        'NaN' replaces out-of-range values with missing values;
        '3s'  clamps them to the nearest limit.
    n_sigma : float, default 3
        Width of the accepted band in standard deviations.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    ValueError
        If ``how`` is not one of the supported options (previously the frame
        was returned untouched without any warning).
    """
    if how not in ('NaN', '3s'):
        raise ValueError("how must be 'NaN' or '3s', got %r" % (how,))

    treated = df.copy()
    for col in treated:
        values = treated[col]
        center = values.mean()
        spread = n_sigma * values.std()
        lower, upper = center - spread, center + spread
        if how == 'NaN':
            # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0
            treated[col] = np.where(values > upper, np.nan,
                                    np.where(values < lower, np.nan, values))
        else:
            treated[col] = np.where(values > upper, upper,
                                    np.where(values < lower, lower, values))
    return treated

# Missing value Imputation

In [41]:
#TESTING
# Small 4x3 integer frame used to try the outlier logic by hand.
df = pd.DataFrame(
    data=np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [1, 1, 1]]),
    columns=list('abc'),
)
df

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9
3,1,1,1


In [42]:
#TESTING
#more complex outlier removal
how = '3s'
for col in df:
    # 1-sigma band (instead of 3) so the tiny test frame actually has values outside it
    col_mean, col_std = df[col].mean(), df[col].std()
    ll_col, ul_col = col_mean - 1 * col_std, col_mean + 1 * col_std
    too_high, too_low = df[col] > ul_col, df[col] < ll_col
    if how == 'NaN':
        df[col] = np.where(too_high, np.nan, np.where(too_low, np.nan, df[col]))
    elif how == '3s':
        df[col] = np.where(too_high, ul_col, np.where(too_low, ll_col, df[col]))

df

Unnamed: 0,a,b,c
0,1.0,2.0,3.0
1,4.0,5.0,6.0
2,6.122281,7.162278,8.25
3,1.0,1.0,1.25


In [171]:
which_weights = ['','']
def impute_null_with_knn(X_train, X_test, which_weights, n_neighbors=20):
    """Fill missing values with a KNN imputer fitted on the training split.

    Both splits are min-max scaled (scaler fitted on ``X_train`` only),
    imputed (imputer fitted on ``X_train`` only) and then scaled back to the
    original units.

    Fixes relative to the previous version:
    * ``which_weights`` is now actually passed to ``KNNImputer``; before, the
      literal string 'which_weights' was passed instead of the argument.
    * The returned frames keep the row index of their inputs instead of
      being renumbered from 0, so they stay aligned with the label series.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Frames with identical columns. Neither is modified.
    which_weights : str or callable
        Forwarded as ``KNNImputer(weights=...)``; the pipeline uses
        'uniform' and 'distance'.
    n_neighbors : int, default 20
        Forwarded as ``KNNImputer(n_neighbors=...)``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Imputed train and test frames, in the original units.
    """
    def _as_frame(values, like):
        # Re-attach the labels that the scikit-learn transformers strip off.
        return pd.DataFrame(values, columns=like.columns, index=like.index)

    # Scale first so that every feature contributes comparably to the distances.
    scaler = MinMaxScaler()
    train_scaled = scaler.fit_transform(X_train)
    test_scaled = scaler.transform(X_test)

    knn = KNNImputer(n_neighbors=n_neighbors, weights=which_weights)
    train_filled = knn.fit_transform(train_scaled)
    test_filled = knn.transform(test_scaled)

    X_train_imputed = _as_frame(scaler.inverse_transform(train_filled), X_train)
    X_test_imputed = _as_frame(scaler.inverse_transform(test_filled), X_test)
    return X_train_imputed, X_test_imputed

# `null_imputation_knn` does not exist in this notebook; the imputer defined above is
# `impute_null_with_knn`, which needs both splits and returns both.
# NOTE(review): 'uniform' is assumed here -- confirm which weighting is wanted.
X_train, X_test = impute_null_with_knn(X_train, X_test, 'uniform')

In [43]:
def impute_null_with_mice(X_train, X_test):
    """Fill missing values with scikit-learn's IterativeImputer, fitted on the training split.

    Fixes relative to the previous version:
    * the keyword ``maX_train_iter`` (not an IterativeImputer parameter) is
      restored to ``max_iter``;
    * the returned frames keep the row index of their inputs instead of
      being renumbered from 0.

    Requires ``IterativeImputer`` to be imported (it is experimental and
    needs ``from sklearn.experimental import enable_iterative_imputer``
    before ``from sklearn.impute import IterativeImputer``).

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Frames with identical columns. Neither is modified.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Imputed train and test frames.
    """
    imp = IterativeImputer(max_iter=10, verbose=2, imputation_order='roman', random_state=0)
    X_train_imputed = pd.DataFrame(imp.fit_transform(X_train),
                                   columns=X_train.columns, index=X_train.index)
    X_test_imputed = pd.DataFrame(imp.transform(X_test),
                                  columns=X_test.columns, index=X_test.index)
    return X_train_imputed, X_test_imputed

# Feature Selection

In [172]:
X_train.shape

(1253, 466)

In [None]:
#list_method=['shap','gini']

def BorutaShap_FS(X, y, method_option):
    """Select features with BorutaShap around a random-forest classifier.

    Parameters
    ----------
    X : feature frame passed to the forest and to BorutaShap.
    y : labels for the rows of X.
    method_option : forwarded as ``importance_measure`` (the notebook uses
        'shap' and 'gini').

    Returns
    -------
    Whatever ``BorutaShap.Subset()`` returns for the fitted selector.
    """
    # NOTE(review): `BorutaShap` is not imported in the visible import cell
    # (only `BorutaPy` is) -- confirm it is imported before this runs.
    base_forest = RandomForestClassifier(
        n_estimators=100,
        max_depth=5,
        class_weight='balanced_subsample',
        n_jobs=-1,
        random_state=100,
    )
    base_forest.fit(X, y)

    selector = BorutaShap(
        model=base_forest,
        importance_measure=method_option,
        classification=True,
    )
    selector.fit(X, y, n_trials=100, sample=False, verbose=True, random_state=100)
    return selector.Subset()

In [None]:
#RFE function with random forest

def RFE_FS(X, y, classify, n_features_to_select=15):
    """Select features by recursive feature elimination (RFE).

    Changes relative to the previous version:
    * an unsupported ``classify`` now raises ValueError instead of failing
      later on an unbound ``model`` variable;
    * the estimator is no longer fitted before being handed to RFE, since
      ``rfe.fit`` performs the fitting itself;
    * the selected columns are taken from ``rfe.support_`` directly (the
      features with ranking 1) instead of iterating over index arrays;
    * the number of features to keep is a parameter.

    Requires ``RFE`` (sklearn.feature_selection) and ``SVC`` (sklearn.svm)
    to be imported.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame without missing values.
    y : array-like
        Labels for the rows of X.
    classify : {'RF', 'SVM'}
        Estimator driving the elimination: a random forest or a linear SVC.
    n_features_to_select : int, default 15

    Returns
    -------
    pandas.DataFrame
        The columns of X that RFE kept, in their original order.

    Raises
    ------
    ValueError
        If ``classify`` is neither 'RF' nor 'SVM'.
    """
    if classify == 'RF':
        model = RandomForestClassifier(n_jobs=-1, class_weight='balanced_subsample',
                                       max_depth=5, random_state=100)
    elif classify == 'SVM':
        # RFE needs coefficients or importances; a linear kernel exposes coef_
        model = SVC(kernel='linear', C=5)
    else:
        raise ValueError("classify must be 'RF' or 'SVM', got %r" % (classify,))

    rfe = RFE(estimator=model, n_features_to_select=n_features_to_select)
    rfe.fit(X, y)

    return X.loc[:, rfe.support_]

In [None]:
#Boruta function with random forest

def BorutaPy_FS(X, y):
    """Select features with BorutaPy around a random-forest classifier.

    Changes relative to the previous version:
    * the forest is no longer fitted before being handed to BorutaPy, which
      fits it itself;
    * ``y`` is converted to a flat NumPy array, matching the ``X.to_numpy()``
      conversion already applied to the features;
    * the confirmed columns are taken from ``support_`` directly (the
      features with ranking 1) instead of iterating over index arrays, and
      the unused name/rank list is gone.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame without missing values.
    y : array-like
        Labels for the rows of X.

    Returns
    -------
    pandas.DataFrame
        The columns of X that BorutaPy confirmed, in their original order.
    """
    model = RandomForestClassifier(n_jobs=-1, class_weight='balanced_subsample',
                                   max_depth=5, random_state=100)

    feature_selector = BorutaPy(model, n_estimators='auto', verbose=2,
                                random_state=100, max_iter=140)
    feature_selector.fit(X.to_numpy(), np.asarray(y).ravel())

    return X.loc[:, feature_selector.support_]

In [174]:
# Persist the current training split. The target paths carry no '.csv' extension, and
# to_csv's default also writes the row index as the first column.
X_train.to_csv('output/X_train')
y_train.to_csv('output/y_train')

# Balancing

In [156]:
# X_train = pd.read_csv('output/X_train').iloc[:,1:]
# y_train = pd.read_csv('output/y_train').iloc[:,1:]

In [158]:
X_train.shape

(1253, 12)

In [175]:
def sampling(X_train, y_train, sampler, random_state=100):
    """Rebalance the training split with the chosen resampling technique.

    Changes relative to the previous version: the chain of ``if`` statements
    is a lookup table, an unknown technique name raises ValueError instead of
    failing on ``str.fit_resample``, and the seed is a parameter.
    'SMOTEENN' requires ``SMOTEENN`` (imblearn.combine) to be imported.

    Parameters
    ----------
    X_train, y_train : features and labels of the training split.
    sampler : str or sampler object
        One of 'SMOTE', 'ROSE', 'ADASYN', 'SMOTEENN', 'randomunder'.
        Anything that is not a string is used as-is and must provide
        ``fit_resample`` (this already worked before and is kept).
    random_state : int, default 100
        Seed given to the sampler built from a name.

    Returns
    -------
    (X_resampled, y_resampled)
        The class counts of ``y_resampled`` are printed as a side effect.

    Raises
    ------
    ValueError
        If ``sampler`` is a string that names no known technique.
    """
    if isinstance(sampler, str):
        # Lambdas so that only the requested sampler is constructed.
        factories = {
            'SMOTE': lambda: SMOTE(random_state=random_state),
            # 'ROSE' is realised as a random over-sampler with shrinkage=1
            'ROSE': lambda: RandomOverSampler(random_state=random_state, shrinkage=1),
            'ADASYN': lambda: ADASYN(random_state=random_state),
            'SMOTEENN': lambda: SMOTEENN(random_state=random_state),
            'randomunder': lambda: RandomUnderSampler(random_state=random_state),
        }
        if sampler not in factories:
            raise ValueError("unknown sampler %r; expected one of %s"
                             % (sampler, sorted(factories)))
        sampler = factories[sampler]()

    X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)
    counter = Counter(y_resampled)
    print(counter)
    return X_resampled, y_resampled

X_train, y_train = sampling(X_train, y_train,'SMOTE')
X_train.shape

Counter({-1: 1170, 1: 1170})


(2340, 12)

# Transform Test split

In [186]:
#Remove cols that do not exist in Train data
X_test = X_test.loc[:,X_train.columns]

In [187]:
# FIXME(review): `outlierknn` is not defined anywhere in the visible notebook, so this
# cell raises NameError on a fresh kernel. Presumably it applied the outlier treatment
# and KNN imputation to the test split -- confirm and replace with the current functions.
X_test = outlierknn(X_test)

# Model

In [188]:
X_test.shape

(314, 12)

In [199]:
def run_model(X_train, y_train, X_test, y_test, n_estimators=1000, max_depth=5, random_state=1):
    """Fit a random forest on the training split and score it on the test split.

    Changes relative to the previous version:
    * the forest hyper-parameters are keyword arguments (defaults unchanged);
    * the confusion matrix is built over the classes seen in training, so it
      keeps the same shape even when the test predictions miss a class;
    * the ratio is NaN instead of an IndexError or a 0/0 warning when there
      is no second class or no test sample of it.

    Parameters
    ----------
    X_train, y_train : training features and labels (as passed in, i.e.
        after whatever balancing the caller applied).
    X_test, y_test : held-out features and labels.
    n_estimators, max_depth, random_state : forwarded to RandomForestClassifier.

    Returns
    -------
    (accuracy, specificity, cf_matrix)
        ``accuracy`` is ``accuracy_score(y_test, y_pred)``.
        ``specificity`` is ``cf_matrix[1][1] / (cf_matrix[1][1] + cf_matrix[1][0])``:
        among test samples whose true label is the second class (in sorted
        order), the share predicted as that class.
        ``cf_matrix`` has true labels on rows and predictions on columns.
    """
    model = RandomForestClassifier(random_state=random_state, n_estimators=n_estimators,
                                   max_depth=max_depth)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    cf_matrix = confusion_matrix(y_test, y_pred, labels=model.classes_)

    specificity = float('nan')
    if cf_matrix.shape[0] > 1:
        second_class_total = cf_matrix[1][1] + cf_matrix[1][0]
        if second_class_total > 0:
            specificity = cf_matrix[1][1] / second_class_total
    return accuracy, specificity, cf_matrix

run_model(X_train, y_train, X_test, y_test)

## Combine

In [None]:
result = []
i = 0

# steps 1-3: clean the full feature matrix. These must run on X *before* the split:
# previously they were applied to X_train and then thrown away by the re-split below,
# and step 3 was called without its threshold argument.
X_clean = remove_duplicated_columns(X)
X_clean = remove_constant_volatility(X_clean)
X_clean = remove_cols_with_high_pct_null(X_clean, 0.8)

# step 4: split
X_train, X_test, y_train, y_test = train_test_split(X_clean, y, test_size = 0.2, random_state = 1,stratify=y)

# step 5-8: option grid
replace_outlier_options = ['NaN', '3s']
impute_null_options = ['knn__distance', 'knn__uniform', 'MICE']
FS_options = ['BoS__shap', 'BoS__gini', 'RFE__RF', 'RFE__SVM', 'BoP']
sampling_options = ['SMOTE','ROSE','ADASYN','SMOTEENN']

# Training labels with a 0..n-1 index so they line up row-by-row with the training
# features, whose index is reset after imputation below.
y_train_base = y_train.reset_index(drop=True)
y_test_temp = y_test

for replace_with in replace_outlier_options:
    # step 5: outlier treatment (on both TRAIN & TEST split).
    # Work on copies so the split itself is never modified between iterations.
    X_train_out = replace_outlier(X_train.copy(), replace_with)
    X_test_out = replace_outlier(X_test.copy(), replace_with)

    for impute_option in impute_null_options:
        # step 6: missing value imputation (on both TRAIN & TEST split).
        # Done once per (outlier, imputation) pair: it does not depend on the inner loops.
        if impute_option in ('knn__distance', 'knn__uniform'):
            knn_weight = impute_option.split('__', 1)[1]   # 'distance' / 'uniform'
            X_train_imp, X_test_imp = impute_null_with_knn(X_train_out, X_test_out, knn_weight)
        elif impute_option == 'MICE':
            X_train_imp, X_test_imp = impute_null_with_mice(X_train_out, X_test_out)
        X_train_imp = X_train_imp.reset_index(drop=True)

        for classifier_model in FS_options:
            # step 7: feature selection, fitted on the imputed TRAIN split and its own
            # labels (previously the full y, or the un-imputed X_train, was passed).
            if classifier_model in ('BoS__shap', 'BoS__gini'):
                X_train_fs = BorutaShap_FS(X_train_imp, y_train_base, classifier_model.split('__', 1)[1])
            elif classifier_model in ('RFE__RF', 'RFE__SVM'):
                X_train_fs = RFE_FS(X_train_imp, y_train_base, classifier_model.split('__', 1)[1])
            elif classifier_model == 'BoP':
                X_train_fs = BorutaPy_FS(X_train_imp, y_train_base)

            if X_train_fs.shape[1] == 0:
                print('no feature selected for', replace_with, impute_option, classifier_model, '- skipped')
                continue

            # The model can only score a test split with the same columns it was trained on.
            X_test_temp = X_test_imp.loc[:, X_train_fs.columns]

            for sampling_technique in sampling_options:
                # step 8: sampling only on TRAIN split
                X_train_temp, y_train_temp = sampling(X_train_fs, y_train_base, sampling_technique)
                X_train_temp.to_csv(f'output/X_train_temp_{i}.csv')
                X_test_temp.to_csv(f'output/X_test_temp_{i}.csv')
                y_train_temp.to_csv(f'output/y_train_temp_{i}.csv')
                y_test_temp.to_csv(f'output/y_test_temp_{i}.csv')

                # step 9: train model, predict, and record scores.
                # The run id is recorded before it is incremented so it matches the file names.
                accuracy, specificity, cf_matrix = run_model(X_train_temp, y_train_temp, X_test_temp, y_test_temp)
                result.append([i, accuracy, specificity, cf_matrix, list(X_train_temp.columns),
                               replace_with, impute_option, classifier_model, sampling_technique])
                i+=1


df_result = pd.DataFrame(result, columns=['run', 'accuracy', 'specificity', 'cf_matrix', 'features',
                                          'outlier', 'imputation', 'feature_selection', 'sampling'])
df_result.to_csv('output/result.csv')
print(result)

# --------------------Appendix--------------------

In [None]:
#------------------------------------
# Quick reference: the pipeline functions and the option values used for each.
# The calls are kept as comments (they used placeholder arguments and one line
# ended in a stray ':'); only the option lists are executable.

# replace_outlier(X_train, how)
replace_outlier_options = ['NaN', '3s']
#--
# impute_null_with_knn(X_train, X_test, which_weights)
# impute_null_with_mice(X_train, X_test)
impute_null_options = ['knn__distance', 'knn__uniform', 'MICE']
#--
# BorutaShap_FS(X_train, y_train, method_option)
list_method=['shap','gini']

# RFE_FS(X_train, y_train, classify)
list_clf=['RF','SVM']

# BorutaPy_FS(X_train, y_train)

FS_options = ['BoS__shap', 'BoS__gini', 'RFE__RF', 'RFE__SVM', 'BoP']
#--
# sampling(X_train, y_train, sampler)
sampling_options = ['SMOTE','ROSE','ADASYN','SMOTEENN']

#--------------------------------------

In [8]:
# Check that dropping the 5-character 'xxx__' prefix leaves the option suffix.
# The variable is not called `str`: that would shadow the builtin for every later cell.
option_name = 'knn__di'
option_name[5:]

'di'

In [9]:
# Scratch check: a list of rows, each row itself a list of values.
a, b, c = 1, 2, 3
l = [[a, b, c], [b, c, a]]

l



[[1, 2, 3], [2, 3, 1]]

In [None]:
# NOTE(review): these look like planning notes for the option grid. The names in the
# first two lists (knnimputation_distance, MICEimputation_distance, outlier_knn,
# outlier_3s) are not defined in the visible notebook, so this cell raises NameError on
# a fresh kernel -- confirm whether they should be strings or whether the cell can go.
list_null_impute = [knnimputation_distance, MICEimputation_distance]
list_null_outlier = [outlier_knn,outlier_3s]
list_feat_selection = ['Boruta_RF', 'Boruta_shap', 'RFE']
Boruta = ['RF', 'XGB']
Boruta_shap = ['RF', 'XGB', 'kNN']
RFE = ['RF', 'SVC']