In [2]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [3]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest
from sklearn.metrics import make_scorer
from xgboost import XGBClassifier
import featuretools as ft
from sklearn.decomposition import PCA

#from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
#from sklearn.utils import resample
import sklearn as sk

In [4]:
# Read the pipe-separated training set. `dataframe_org` stays as loaded;
# `dataframe` is an independent working copy that later cells add columns to.
dataframe_org = pd.read_csv('train.csv', sep='|')
dataframe = dataframe_org.copy()

### Manual Feature Generation

In [5]:
########### manual feature generation ##########

# totalScanned: items scanned per second multiplied by the scan duration.
total_scanned = dataframe['scannedLineItemsPerSecond'] * dataframe['totalScanTimeInSeconds']
dataframe['totalScanned'] = total_scanned

# avgTimePerScan: reciprocal of the scan rate.
# avgValuePerScan: that time multiplied by the value scanned per second.
seconds_per_scan = 1 / dataframe['scannedLineItemsPerSecond']
dataframe['avgTimePerScan'] = seconds_per_scan
dataframe['avgValuePerScan'] = seconds_per_scan * dataframe['valuePerSecond']

# Plain ratio features, listed as (new column, numerator, denominator).
# The list order is the order in which the columns are appended to the frame.
ratio_features = [
    # "totalScanned" ratios. withoutRegisPerPosition is the share of scans
    # without registration in all scans, analogous to the existing
    # lineItemVoidsPerPosition column. Original author's note: might indicate
    # how new or ambivalent a customer is; expected to be higher for low
    # "trustLevel".
    ('withoutRegisPerPosition', 'scansWithoutRegistration', 'totalScanned'),
    ('quantiModPerPosition', 'quantityModifications', 'totalScanned'),
    # "grandTotal" ratios
    ('lineItemVoidsPerTotal', 'lineItemVoids', 'grandTotal'),
    ('withoutRegisPerTotal', 'scansWithoutRegistration', 'grandTotal'),
    ('quantiModPerTotal', 'quantityModifications', 'grandTotal'),
    # "totalScanTimeInSeconds" ratios
    ('lineItemVoidsPerTime', 'lineItemVoids', 'totalScanTimeInSeconds'),
    ('withoutRegisPerTime', 'scansWithoutRegistration', 'totalScanTimeInSeconds'),
    ('quantiModPerTime', 'quantityModifications', 'totalScanTimeInSeconds'),
]
for ratio_name, numerator_col, denominator_col in ratio_features:
    dataframe[ratio_name] = dataframe[numerator_col] / dataframe[denominator_col]

# extra: squared number of scanned items divided by the trust level.
dataframe['extra'] = total_scanned * total_scanned / dataframe['trustLevel']

########### end manual feature generation ###########

In [6]:
# Fix the column order of the modelling frame: the input features first and
# the target label `fraud` as the last column.
xg_columns = [
    'trustLevel', 'totalScanTimeInSeconds', 'grandTotal', 'lineItemVoids',
    'scansWithoutRegistration', 'quantityModifications',
    'scannedLineItemsPerSecond', 'valuePerSecond',
    'lineItemVoidsPerPosition', 'totalScanned', 'avgTimePerScan',
    'avgValuePerScan', 'withoutRegisPerPosition', 'quantiModPerPosition',
    'lineItemVoidsPerTotal', 'withoutRegisPerTotal', 'quantiModPerTotal',
    'lineItemVoidsPerTime', 'withoutRegisPerTime', 'quantiModPerTime',
    'extra', 'fraud',
]
dataframe_XG = dataframe[xg_columns]

# Same frame without the `extra` feature.
dataframe_LR = dataframe_XG.drop(columns='extra')

### Cross-Validation by Nico

In [7]:
# Cross Validation by Nico
# 10-fold stratified cross-validation without shuffling.
# `random_state` is intentionally not passed: it only has an effect together
# with shuffle=True, and current scikit-learn raises a ValueError when a seed is
# given while shuffle is False. The unshuffled split is already deterministic.
# To get shuffled folds instead, pass shuffle=True together with a seed.
cv = StratifiedKFold(n_splits=10)
def profit_scorer(y, y_pred):
    """Return the summed profit of the predictions ``y_pred`` against labels ``y``.

    The two sequences are paired positionally and every (predicted, actual)
    pair contributes a fixed amount:

        (0, 0) ->   0
        (0, 1) ->  -5
        (1, 0) -> -25
        (1, 1) ->  +5

    A pair that is not one of these four combinations raises ``KeyError``.
    """
    payoff = {(0,0): 0, (0,1): -5, (1,0): -25, (1,1): 5}
    total = 0
    for predicted, actual in zip(y_pred, y):
        total += payoff[(predicted, actual)]
    return total
# Wrap profit_scorer as a scikit-learn scorer object; a higher profit is better.
profit_scoring = make_scorer(profit_scorer, greater_is_better=True)

### Baseline Models with no pre-processing at all 

In [8]:
#Baseline model of original data w/out pre-processing
X_base_org = dataframe_org.drop('fraud',axis=1)
y_base_org = dataframe_org['fraud']
print('Baseline models of original data w/out pre-processing and default parameters: \n')

# (label, estimator) pairs, evaluated in this order.
# AdaBoostClassifier is built without the `base_estimator` keyword: passing
# None only selected the default, and the keyword has been removed from current
# scikit-learn releases, so omitting it works on old and new versions alike.
baseline_models_org = [
    ('XGB', XGBClassifier()),
    ('LR', LogisticRegression()),
    ('ADB', AdaBoostClassifier(n_estimators=50, learning_rate=1.0, random_state=None)),
]
for baseline_label, baseline_model in baseline_models_org:
    # Per-fold profits from the 10-fold CV; their sum is the reported profit.
    fold_profits = cross_validate(baseline_model, X_base_org, y=y_base_org,
                                  cv=cv, scoring=profit_scoring)['test_score']
    print('{}: {}'.format(baseline_label, sum(fold_profits)))

Baseline models of original data w/out pre-processing and default parameters: 

XGB: -80
LR: -360
ADB: -130


### Baseline Models w/ the manual features generated

In [9]:
#Baseline models of data w/ manual feature generated
X_base = dataframe.drop('fraud',axis=1)
y_base = dataframe['fraud']
print('Baseline models of data w/ manual features generated: \n')

# (label, estimator) pairs, evaluated in this order.
# AdaBoostClassifier is built without the `base_estimator` keyword: passing
# None only selected the default, and the keyword has been removed from current
# scikit-learn releases, so omitting it works on old and new versions alike.
baseline_models = [
    ('XGB', XGBClassifier()),
    ('LR', LogisticRegression()),
    ('ADB', AdaBoostClassifier(n_estimators=50, learning_rate=1.5, random_state=None)),
]
for baseline_label, baseline_model in baseline_models:
    # Per-fold profits from the 10-fold CV; their sum is the reported profit.
    fold_profits = cross_validate(baseline_model, X_base, y=y_base,
                                  cv=cv, scoring=profit_scoring)['test_score']
    print('{}: {}'.format(baseline_label, sum(fold_profits)))

Baseline models of data w/ manual features generated: 

XGB: 265
LR: 120
ADB: 200


In [10]:
# Split the LR frame into the feature matrix and the `fraud` target.
y_LR = dataframe_LR['fraud']
X_LR = dataframe_LR.drop(columns='fraud')

In [11]:
# Feature union: the 16 best-scoring columns (SelectKBest) side by side with
# the first 2 principal components.
features = [
    ('select_best', SelectKBest(k=16)),
    ('pca', PCA(n_components=2)),
]
feature_union = FeatureUnion(features)

# Per the original author: best LR parameters found using exhaustive
# GridSearch and RandomizedSearch.
LR = LogisticRegression(C=2, solver='liblinear')

# Pipeline order: feature union -> standard scaling -> logistic regression.
estimators = [
    ('feature_union', feature_union),
    ('scaler', sk.preprocessing.StandardScaler()),
    ('LR', LR),
]
pipe_LR = Pipeline(estimators)

# Total profit = sum of the per-fold profit scores.
lr_cv_result = cross_validate(pipe_LR, X_LR, y=y_LR, cv=cv, scoring=profit_scoring)
print('LR Pipeline: {}'.format(sum(lr_cv_result['test_score'])))

LR Pipeline: 300


In [138]:
# Feature union: the 11 best-scoring columns (SelectKBest) side by side with
# the first 2 principal components.
features = [
    ('select_best', SelectKBest(k=11)),
    ('pca', PCA(n_components=2)),
]
feature_union = FeatureUnion(features)

#XGBBoost
# The pipeline is feature union -> XGBoost only. An earlier list that also
# contained a StandardScaler was built here and immediately discarded; that
# dead code has been removed, so the steps below are exactly what was run.
XG = XGBClassifier(learning_rate=0.1, max_depth=3, min_child_weight=5)
estimators = [
    ('feature_union', feature_union),
    ('XG', XG),
]
pipe_XG = Pipeline(estimators)

# The target is y_LR, taken from the same frame as X_LR. It is the same `fraud`
# column that the previously used y_base held, without depending on the
# baseline cell having been run.
# NOTE(review): this trains on X_LR (no `extra` column) although dataframe_XG
# exists - confirm whether the XG-specific frame was meant to be used.
xg_cv_result = cross_validate(pipe_XG, X_LR, y=y_LR, cv=cv, scoring=profit_scoring)
print('XG Pipeline: {}'.format(sum(xg_cv_result['test_score'])))

XG Pipeline: 325


In [120]:
# Feature union: the 10 best-scoring columns (SelectKBest) side by side with
# the first 2 principal components.
features = [
    ('select_best', SelectKBest(k=10)),
    ('pca', PCA(n_components=2)),
]
feature_union = FeatureUnion(features)

# AdaBoost hyper-parameters (the previous comment here described LR parameters,
# a copy-paste leftover). `base_estimator` is not passed: None only selected
# the default, and the keyword has been removed from current scikit-learn
# releases, so omitting it works on old and new versions alike.
ADB = AdaBoostClassifier(n_estimators=55, learning_rate=0.35)

# Pipeline order: feature union -> standard scaling -> AdaBoost.
estimators = [
    ('feature_union', feature_union),
    ('scaler', sk.preprocessing.StandardScaler()),
    ('ADB', ADB),
]
pipe_ADB = Pipeline(estimators)

# Total profit = sum of the per-fold profit scores.
adb_cv_result = cross_validate(pipe_ADB, X_LR, y=y_LR, cv=cv, scoring=profit_scoring)
print('ADB Pipeline: {}'.format(sum(adb_cv_result['test_score'])))

ADB Pipeline: 225
