# Imports & Setups

In [27]:
import pandas as pd
import numpy as np
import itertools

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,roc_auc_score
from lightgbm import LGBMClassifier
from imblearn.over_sampling import SMOTE

import warnings 

import lib.kotools as kt

In [28]:
%load_ext autoreload
%autoreload 2
sns.set()
warnings.simplefilter('ignore')
pd.set_option('display.max_columns', 7)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Load data

Data source is https://archive.ics.uci.edu/ml/datasets/SECOM

In [29]:
df_raw = pd.read_csv('./input/secom.data',sep=' ',header=None)
labels = pd.read_csv('./input/secom_labels.data',sep=' ',header=None,usecols=[0],squeeze = True)
df_raw['TARGET'] = labels
df_raw.shape

(1567, 591)

In [30]:
evo = kt.DataFrameEvolution(df_raw)

# Test on Raw data

In [31]:
feats = [f for f in df_raw.columns if f not in ['TARGET']]
score = evo.test(df_raw[feats],df_raw['TARGET'],comment = 'Full RAW data')
print('CrossValidation(3) ROC AUC on full raw data:',score)

CrossValidation(3) ROC AUC on full raw data: 0.550166155936


# Train / test split

In [32]:
df = df_raw
X = df.copy()
y = df['TARGET'].copy()
X.drop(['TARGET'],axis=1,inplace=True) # Exclude target variable from X dataframe

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)
print('X_train shape:{} \tX_test shape:{}'.format(X_train.shape,X_test.shape))

score = evo.test(X_train,y_train,comment = 'X_train RAW')
print('CrossValidation(3) ROC AUC on X_train raw:',score)

X_train shape:(1253, 590) 	X_test shape:(314, 590)
CrossValidation(3) ROC AUC on X_train raw: 0.671412575931


# Exploratory data analysis

## Missing Values

### Missing Values Exploring

For X_train:

In [33]:
kt.df_missing_report(X_train).sort_values(by='ratio_missing',ascending=False).head(10)

Unnamed: 0,count_missing,ratio_missing
157,1138,0.90822
293,1138,0.90822
292,1138,0.90822
158,1138,0.90822
85,1070,0.853951
220,1070,0.853951
492,1070,0.853951
358,1070,0.853951
383,816,0.651237
110,816,0.651237


For X_test:

In [34]:
kt.df_missing_report(X_test).sort_values(by='ratio_missing',ascending=False).head(10)

Unnamed: 0,count_missing,ratio_missing
158,291,0.926752
157,291,0.926752
293,291,0.926752
292,291,0.926752
492,271,0.863057
85,271,0.863057
358,271,0.863057
220,271,0.863057
516,202,0.643312
245,202,0.643312


### Missing Values Columns Drop

In [35]:
cutoff = 0.80
nan_cols_to_drop = X_train.columns[(((X_train.isnull().sum()/X_train.shape[0])>cutoff) | \
                                    ((X_test.isnull().sum()/X_test.shape[0])>cutoff))]

print('Count of columns to drop:',len(nan_cols_to_drop)) 

X_train.drop(nan_cols_to_drop,axis=1,inplace=True)
X_test.drop(nan_cols_to_drop,axis=1,inplace=True)

print('Actual X_train shape:{} \tActual X_test shape:{}'.format(X_train.shape,X_test.shape))

score = evo.test(X_train,y_train,comment = 'X_train NaN columns drop')
print('CrossValidation(3) ROC AUC on X_train:',score)

Count of columns to drop: 8
Actual X_train shape:(1253, 582) 	Actual X_test shape:(314, 582)
CrossValidation(3) ROC AUC on X_train: 0.670115587422


### Missing Values Imputation

In [36]:
nan_cols = X_train.columns[X_train.isna().any()]
if len(nan_cols)>0:
    X_train[nan_cols] = kt.df_impute(X_train[nan_cols],strategy='median')

nan_cols = list(X_test.columns[X_test.isna().any()])
if len(nan_cols)>0:
    X_test[nan_cols] = kt.df_impute(X_test[nan_cols],strategy='median')
    
score = evo.test(X_train,y_train,comment = 'X_train NaN imputation by median')
print('CrossValidation(3) ROC AUC on X_train:',score)

CrossValidation(3) ROC AUC on X_train: 0.667396526305


## Constant Values

### Constant Values Exploring 

In [37]:
const_cols = X_train.columns[(((X_train == X_train.iloc[0]).all())|\
                              (X_test == X_test.iloc[0]).all())]
print('Count of columns with constant value:',len(const_cols))

Count of columns with constant value: 122


### Constant Values Columns Drop

In [38]:
X_train.drop(const_cols,axis=1,inplace=True)
X_test.drop(const_cols,axis=1,inplace=True)

print('X_train shape:{} \tX_test shape:{}'.format(X_train.shape,X_test.shape))

score = evo.test(X_train,y_train,comment = 'X_train Drop constant values columns')
print('CrossValidation(3) ROC AUC on X_train:',score)

X_train shape:(1253, 460) 	X_test shape:(314, 460)
CrossValidation(3) ROC AUC on X_train: 0.667396526305


## Correlation

In [39]:
# Find correlations with the target and sort
correlations = X_train.corrwith(y_train).sort_values(ascending = False)

# Display correlations
print('Most Positive Correlations:\n', correlations.head(5))
print('\nMost Negative Correlations:\n', correlations.tail(5))

Most Positive Correlations:
 103    0.166603
59     0.164126
510    0.147652
431    0.140725
434    0.128902
dtype: float64

Most Negative Correlations:
 122   -0.089176
26    -0.099154
28    -0.100750
316   -0.102978
125   -0.103150
dtype: float64


# Data Transformation

## Normalization

In [40]:
_, cat_cols,num_cols = kt.df_feats_summary(X_train,[],verbose=False)
X_train[num_cols] = kt.df_scale(X_train[num_cols],strategy='minmax')
X_test[num_cols] = kt.df_scale(X_test[num_cols],strategy='minmax')

score = evo.test(X_train,y_train,comment = 'X_train Normalization')
print('CrossValidation(3) ROC AUC on X_train:',score)

CrossValidation(3) ROC AUC on X_train: 0.660213298747


# Over Sampling

In [41]:
X_train,y_train = SMOTE(random_state = 0).fit_resample(X_train,y_train)
X_train = pd.DataFrame(X_train)
y_train = pd.Series(y_train)
y_train.value_counts()

 1    1173
-1    1173
dtype: int64

In [42]:
score = evo.test(X_train,y_train,comment = 'X_train Oversampling')
print('CrossValidation(3) ROC AUC on X_train:',score)

CrossValidation(3) ROC AUC on X_train: 0.998957795061


# Progress Overview

In [43]:
evo.get_history()

Unnamed: 0,Comment,LGBM CV3 ROC AUC
0,X_train Oversampling,0.998958
1,X_train Normalization,0.660213
2,X_train Drop constant values columns,0.667397
3,X_train NaN imputation by median,0.667397
4,X_train NaN columns drop,0.670116
5,X_train RAW,0.671413
6,Full RAW data,0.550166


# Training

In [44]:
lgbm = LGBMClassifier(random_state = 0)
lgbm.fit(X_train,y_train);

# Prediction

In [45]:
y_pred = lgbm.predict(X_test)
print('ROC AUC score on X_test:{}'.format(roc_auc_score(y_pred,y_test)))
print('Confusion matrix:\n',confusion_matrix(y_pred,y_test))

ROC AUC score on X_test:0.46153846153846156
Confusion matrix:
 [[288  24]
 [  2   0]]


# Conclusion

After oversampling we achieved ROC AUC 0.99 on CV(3 folds) on X_train but on X_test our result is 0.46. Obviously classifier was **overfitted**.