# Imports & Setups

In [359]:
import pandas as pd
import numpy as np
import itertools

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,roc_auc_score
from lightgbm import LGBMClassifier
from imblearn.over_sampling import SMOTE

import warnings 

import lib.kotools as kt

In [360]:
%load_ext autoreload
%autoreload 2
warnings.simplefilter('ignore')
pd.set_option('display.max_columns', 7)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Load data

Data source is https://archive.ics.uci.edu/ml/datasets/SECOM

In [361]:
df_raw = pd.read_csv('./input/secom.data',sep=' ',header=None)
labels = pd.read_csv('./input/secom_labels.data',sep=' ',header=None,usecols=[0],squeeze = True)
df_raw['TARGET'] = labels
df_raw.shape

(1567, 591)

In [362]:
evo = kt.DataFrameEvolution(df_raw)

# Test on Raw data

In [363]:
feats = [f for f in df_raw.columns if f not in ['TARGET']]
score = evo.test(df_raw[feats],df_raw['TARGET'],step_group = 'Initial data',step_name = 'Full RAW data')
print('CrossValidation(3) ROC AUC on full raw data:',score)

CrossValidation(3) ROC AUC on full raw data: 0.550166155936


# Train / test split

In [364]:
step_group = 'Train / Test Split'

In [365]:
df = df_raw
X = df.copy()
y = df['TARGET'].copy()
X.drop(['TARGET'],axis=1,inplace=True)


for ts in range(60,100,10):
    step_name = 'X_train RAW (split:{}/{})'.format(ts,100-ts,score)
    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=1-ts/100, random_state=42)
    score = evo.test(X_train,y_train,step_group,step_name)
    print('CrossValidation(3) ROC AUC for step "{}":{}'.format(step_name,score))

CrossValidation(3) ROC AUC for step "X_train RAW (split:60/40)":0.601843255730768
CrossValidation(3) ROC AUC for step "X_train RAW (split:70/30)":0.6318002606712284
CrossValidation(3) ROC AUC for step "X_train RAW (split:80/20)":0.6714125759309049
CrossValidation(3) ROC AUC for step "X_train RAW (split:90/10)":0.7246004566210046


In [366]:
evo.add_step_comment('X_train RAW (split:90/10)','Best by score')
evo.add_step_comment('X_train RAW (split:80/20)','More confidence')

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)
evo.get_history()

Unnamed: 0,Step Group,Step name,LGBM CV3 ROC AUC,Comment
0,Train / Test Split,X_train RAW (split:90/10),0.7246,Best by score
1,Train / Test Split,X_train RAW (split:80/20),0.671413,More confidence
2,Train / Test Split,X_train RAW (split:70/30),0.6318,
3,Train / Test Split,X_train RAW (split:60/40),0.601843,
4,Initial data,Full RAW data,0.550166,


# Exploratory data analysis

## Missing Values

In [367]:
step_group = 'Missing Values'

### Missing Values Exploring

For X_train:

In [368]:
kt.df_missing_report(X_train).sort_values(by='ratio_missing',ascending=False).head(10)

Unnamed: 0,count_missing,ratio_missing
157,1138,0.90822
293,1138,0.90822
292,1138,0.90822
158,1138,0.90822
85,1070,0.853951
220,1070,0.853951
492,1070,0.853951
358,1070,0.853951
383,816,0.651237
110,816,0.651237


For X_test:

In [369]:
kt.df_missing_report(X_test).sort_values(by='ratio_missing',ascending=False).head(10)

Unnamed: 0,count_missing,ratio_missing
158,291,0.926752
157,291,0.926752
293,291,0.926752
292,291,0.926752
492,271,0.863057
85,271,0.863057
358,271,0.863057
220,271,0.863057
516,202,0.643312
245,202,0.643312


### Missing Values Columns Drop

In [370]:
for c in range(30,100,10):
    step_name = 'Drop Columns with {}% of NaNs'.format(c)
    c=c/100
    nan_cols_to_drop = X_train.columns[(((X_train.isnull().sum()/X_train.shape[0])>c) | \
                                        ((X_test.isnull().sum()/X_test.shape[0])>c))]
    print('{}; Count of columns to drop:{}'.format(step_name,len(nan_cols_to_drop)))
    
    step_df = X_train.drop(nan_cols_to_drop,axis=1)
    
    score = evo.test(step_df,y_train,step_group,step_name)
    print('CrossValidation(3) ROC AUC for step "{}":{}\n'.format(step_name,score))

Drop Columns with 30% of NaNs; Count of columns to drop:32
CrossValidation(3) ROC AUC for step "Drop Columns with 30% of NaNs":0.6602436589648866

Drop Columns with 40% of NaNs; Count of columns to drop:32
CrossValidation(3) ROC AUC for step "Drop Columns with 40% of NaNs":0.6602436589648866

Drop Columns with 50% of NaNs; Count of columns to drop:28
CrossValidation(3) ROC AUC for step "Drop Columns with 50% of NaNs":0.6484250333355193

Drop Columns with 60% of NaNs; Count of columns to drop:24
CrossValidation(3) ROC AUC for step "Drop Columns with 60% of NaNs":0.6639718937246644

Drop Columns with 70% of NaNs; Count of columns to drop:8
CrossValidation(3) ROC AUC for step "Drop Columns with 70% of NaNs":0.6701155874216402

Drop Columns with 80% of NaNs; Count of columns to drop:8
CrossValidation(3) ROC AUC for step "Drop Columns with 80% of NaNs":0.6701155874216402

Drop Columns with 90% of NaNs; Count of columns to drop:4
CrossValidation(3) ROC AUC for step "Drop Columns with 90% of 

In [371]:
evo.add_step_comment('Drop Columns with 70% of NaNs','Best cutoff by score/amount of data')

c=0.7
nan_cols_to_drop = X_train.columns[(((X_train.isnull().sum()/X_train.shape[0])>c) | \
                                    ((X_test.isnull().sum()/X_test.shape[0])>c))]
print('Count of columns to drop:{}'.format(len(nan_cols_to_drop)))

X_train = X_train.drop(nan_cols_to_drop,axis=1)
X_test = X_test.drop(nan_cols_to_drop,axis=1)
print('Actual X_train shape:{} \tActual X_test shape:{}'.format(X_train.shape,X_test.shape))

Count of columns to drop:8
Actual X_train shape:(1253, 582) 	Actual X_test shape:(314, 582)


In [372]:
evo.get_history()

Unnamed: 0,Step Group,Step name,LGBM CV3 ROC AUC,Comment
0,Missing Values,Drop Columns with 90% of NaNs,0.66233,
1,Missing Values,Drop Columns with 80% of NaNs,0.670116,
2,Missing Values,Drop Columns with 70% of NaNs,0.670116,Best cutoff by score/amount of data
3,Missing Values,Drop Columns with 60% of NaNs,0.663972,
4,Missing Values,Drop Columns with 50% of NaNs,0.648425,
5,Missing Values,Drop Columns with 40% of NaNs,0.660244,
6,Missing Values,Drop Columns with 30% of NaNs,0.660244,
7,Train / Test Split,X_train RAW (split:90/10),0.7246,Best by score
8,Train / Test Split,X_train RAW (split:80/20),0.671413,More confidence
9,Train / Test Split,X_train RAW (split:70/30),0.6318,


### Missing Values Imputation

In [373]:
nan_cols = X_train.columns[X_train.isna().any()]
if len(nan_cols)>0:
    for strategy in ['median','mean','mode']:
        step_name = 'Impute NaNs by {}'.format(strategy)
        step_df = kt.df_impute(X_train[nan_cols],strategy=strategy)    
        score = evo.test(step_df,y_train,step_group,step_name)
        print('CrossValidation(3) ROC AUC for step "{}":{}'.format(step_name,score))
else:
    print('DataFrame has no missed values')

CrossValidation(3) ROC AUC for step "Impute NaNs by median":0.6839889925993933
CrossValidation(3) ROC AUC for step "Impute NaNs by mean":0.6632420340860238
CrossValidation(3) ROC AUC for step "Impute NaNs by mode":0.6767086123437359


In [374]:
evo.add_step_comment('Impute NaNs by median','Best by score')

nan_cols = X_train.columns[X_train.isna().any()]
if len(nan_cols)>0:
    X_train[nan_cols] = kt.df_impute(X_train[nan_cols],strategy='median')

nan_cols = list(X_test.columns[X_test.isna().any()])
if len(nan_cols)>0:
    X_test[nan_cols] = kt.df_impute(X_test[nan_cols],strategy='median')
    
evo.get_history()

Unnamed: 0,Step Group,Step name,LGBM CV3 ROC AUC,Comment
0,Missing Values,Impute NaNs by mode,0.676709,
1,Missing Values,Impute NaNs by mean,0.663242,
2,Missing Values,Impute NaNs by median,0.683989,Best by score
3,Missing Values,Drop Columns with 90% of NaNs,0.66233,
4,Missing Values,Drop Columns with 80% of NaNs,0.670116,
5,Missing Values,Drop Columns with 70% of NaNs,0.670116,Best cutoff by score/amount of data
6,Missing Values,Drop Columns with 60% of NaNs,0.663972,
7,Missing Values,Drop Columns with 50% of NaNs,0.648425,
8,Missing Values,Drop Columns with 40% of NaNs,0.660244,
9,Missing Values,Drop Columns with 30% of NaNs,0.660244,


## Constant Values

In [375]:
step_group = 'Constant Values'

### Constant Values Exploring 

In [376]:
const_cols = X_train.columns[(((X_train == X_train.iloc[0]).all())|\
                              (X_test == X_test.iloc[0]).all())]
print('Count of columns with constant value:',len(const_cols))

Count of columns with constant value: 122


### Constant Values Columns Drop

In [377]:
step_name = 'Before drop columns with constant values'
score = evo.test(step_df,y_train,step_group,step_name = step_name)
print('CrossValidation(3) ROC AUC for step "{}":{}'.format(step_name,score))

step_df = X_train.drop(const_cols,axis=1)

step_name = 'After drop columns with constant values'
score = evo.test(step_df,y_train,step_group,step_name = 'X_train Drop constant values columns')
print('CrossValidation(3) ROC AUC for step "{}":{}'.format(step_name,score))

CrossValidation(3) ROC AUC for step "Before drop columns with constant values":0.6767086123437359
CrossValidation(3) ROC AUC for step "After drop columns with constant values":0.6673965263053071


No changes without constant columns -> Drop

In [378]:
X_train.drop(const_cols,axis=1,inplace=True)
X_test.drop(const_cols,axis=1,inplace=True)

## Correlation

In [379]:
# Find correlations with the target and sort
correlations = X_train.corrwith(y_train).sort_values(ascending = False)

# Display correlations
print('Most Positive Correlations:\n', correlations.head(5))
print('\nMost Negative Correlations:\n', correlations.tail(5))

Most Positive Correlations:
 103    0.166603
59     0.164126
510    0.147652
431    0.140725
434    0.128902
dtype: float64

Most Negative Correlations:
 122   -0.089176
26    -0.099154
28    -0.100750
316   -0.102978
125   -0.103150
dtype: float64


# Data Transformation

## Normalization

In [380]:
step_group = 'Normalization'

In [381]:
_, cat_cols,num_cols = kt.df_feats_summary(X_train,[],verbose=False)

In [382]:
for strategy in ['minmax','standart']:
    step_name = 'Normalization of numeric columns with strategy = {}'.format(strategy)
    step_df = kt.df_scale(X_train[num_cols],strategy=strategy)   
    score = evo.test(step_df,y_train,step_group,step_name)
    print('CrossValidation(3) ROC AUC for step "{}":{}'.format(step_name,score))

CrossValidation(3) ROC AUC for step "Normalization of numeric columns with strategy = minmax":0.6602132987469731
CrossValidation(3) ROC AUC for step "Normalization of numeric columns with strategy = standart":0.6576241793633097


In [383]:
evo.add_step_comment('Normalization of numeric columns with strategy = minmax','Best by score')

X_train[num_cols] = kt.df_scale(X_train[num_cols],strategy='minmax')
X_test[num_cols] = kt.df_scale(X_test[num_cols],strategy='minmax')

print('CrossValidation(3) ROC AUC on X_train:',score)

CrossValidation(3) ROC AUC on X_train: 0.657624179363


In [384]:
evo.get_history()

Unnamed: 0,Step Group,Step name,LGBM CV3 ROC AUC,Comment
0,Normalization,Normalization of numeric columns with strategy...,0.657624,
1,Normalization,Normalization of numeric columns with strategy...,0.660213,Best by score
2,Constant Values,X_train Drop constant values columns,0.667397,
3,Constant Values,Before drop columns with constant values,0.676709,
4,Missing Values,Impute NaNs by mode,0.676709,
5,Missing Values,Impute NaNs by mean,0.663242,
6,Missing Values,Impute NaNs by median,0.683989,Best by score
7,Missing Values,Drop Columns with 90% of NaNs,0.66233,
8,Missing Values,Drop Columns with 80% of NaNs,0.670116,
9,Missing Values,Drop Columns with 70% of NaNs,0.670116,Best cutoff by score/amount of data


# Over Sampling

In [385]:
step_group = 'Oversampling'

In [393]:
step_name = 'Oversampling with strategy SMOTE'

step_df_x, step_df_y = SMOTE(random_state = 0).fit_resample(X_train,y_train) 
step_df_x = pd.DataFrame(step_df_x)
step_df_y = pd.Series(step_df_y)
score = evo.test(step_df_x,step_df_y,step_group,step_name)
print('CrossValidation(3) ROC AUC for step "{}":{}'.format(step_name,score))

CrossValidation(3) ROC AUC for step "Oversampling with strategy SMOTE":0.9989577950606462


In [394]:
evo.add_step_comment('Oversampling with strategy SMOTE','Overfitting')

Since classificator was overfitted oversampling is not appliying to X_train dataset

# Progress Overview

In [395]:
evo.get_history()

Unnamed: 0,Step Group,Step name,LGBM CV3 ROC AUC,Comment
0,Oversampling,Oversampling with strategy SMOTE,0.998958,Overfitting
1,Normalization,Normalization of numeric columns with strategy...,0.657624,
2,Normalization,Normalization of numeric columns with strategy...,0.660213,Best by score
3,Constant Values,X_train Drop constant values columns,0.667397,
4,Constant Values,Before drop columns with constant values,0.676709,
5,Missing Values,Impute NaNs by mode,0.676709,
6,Missing Values,Impute NaNs by mean,0.663242,
7,Missing Values,Impute NaNs by median,0.683989,Best by score
8,Missing Values,Drop Columns with 90% of NaNs,0.66233,
9,Missing Values,Drop Columns with 80% of NaNs,0.670116,


# Training

In [396]:
lgbm = LGBMClassifier(random_state = 0)
lgbm.fit(X_train,y_train);

# Prediction

In [399]:
y_pred = lgbm.predict(X_test)
print('Confusion matrix:\n',confusion_matrix(y_pred,y_test))

Confusion matrix:
 [[290  24]
 [  0   0]]


In [401]:
#print('ROC AUC score on X_test:{}'.format(roc_auc_score(y_pred,y_test)))