In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn import decomposition
from sklearn import linear_model

from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.metrics import make_scorer

from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

import sklearn as sk

In [2]:
# Allow up to 100 columns to be shown when a wide DataFrame is displayed.
pd.options.display.max_columns = 100

# Both input files are pipe-delimited CSVs located next to the notebook.
train = pd.read_csv("train.csv", delimiter="|")
test = pd.read_csv("test.csv", delimiter="|")

# Snapshot of the training predictors (label removed) taken before any
# engineered columns are added to `train`.
X_ori = train.drop(columns=['fraud'])

def add_engineered_features(df):
    """Add the hand-crafted features to ``df`` in place and return it.

    The same transformation is applied to every frame passed in, so train and
    test always end up with an identical set of engineered columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the raw columns read below (``scannedLineItemsPerSecond``,
        ``totalScanTimeInSeconds``, ``valuePerSecond``,
        ``scansWithoutRegistration``, ``quantityModifications``,
        ``lineItemVoids`` and ``grandTotal``).

    Returns
    -------
    pandas.DataFrame
        The same object, with the engineered columns appended in a fixed order.

    Notes
    -----
    Only raw columns (and columns created earlier in this function) are read,
    so calling it again on the same frame recomputes identical values.
    A zero denominator yields ``inf``/``NaN`` rather than an error.
    """
    # totalScanned: scan rate multiplied by scan duration
    # (presumably the number of scanned line items).
    df['totalScanned'] = df['scannedLineItemsPerSecond'] * df['totalScanTimeInSeconds']

    # avgTimePerScan: inverse of the scan rate.
    df['avgTimePerScan'] = 1 / df['scannedLineItemsPerSecond']
    # avgValuePerScan: value per second scaled by the time spent per scan.
    df['avgValuePerScan'] = df['avgTimePerScan'] * df['valuePerSecond']

    # (new column, numerator, denominator); order defines the column order.
    ratio_features = (
        # "totalScanned" ratios.
        # withoutRegisPerPosition: share of scans without registration in all
        # scans, analogous to lineItemVoidsPerPosition. Might indicate how new
        # or ambivalent a customer is; expected to be higher for low trustLevel.
        ('withoutRegisPerPosition', 'scansWithoutRegistration', 'totalScanned'),
        # quantiModPerPosition: share of quantity modifications in all scans.
        ('quantiModPerPosition', 'quantityModifications', 'totalScanned'),
        # "grandTotal" ratios.
        ('lineItemVoidsPerTotal', 'lineItemVoids', 'grandTotal'),
        ('withoutRegisPerTotal', 'scansWithoutRegistration', 'grandTotal'),
        ('quantiModPerTotal', 'quantityModifications', 'grandTotal'),
        # "totalScanTimeInSeconds" ratios.
        ('lineItemVoidsPerTime', 'lineItemVoids', 'totalScanTimeInSeconds'),
        ('withoutRegisPerTime', 'scansWithoutRegistration', 'totalScanTimeInSeconds'),
        ('quantiModPerTime', 'quantityModifications', 'totalScanTimeInSeconds'),
    )
    for new_column, numerator, denominator in ratio_features:
        df[new_column] = df[numerator] / df[denominator]

    return df


# Apply the identical feature set to both frames so that a model fitted on the
# training columns can also score the test data.
# NOTE(review): assumes test.csv carries the same raw columns as train.csv
# (apart from the label) - confirm.
train = add_engineered_features(train)
test = add_engineered_features(test)

In [3]:
# No preprocessing is applied at this point (author's open question: no fixed seed?).

# Target vector: the fraud label.
y = train.loc[:, 'fraud']

# Predictor matrix: every column of `train` except the label.
X = train.drop(columns=['fraud'])

def profit_scorer(y, y_pred):
    """Return the total profit of a set of predictions.

    Each (prediction, actual) pair is looked up in a fixed payoff table and
    the payoffs are summed:

    * predicted 0, actual 0 ->   0
    * predicted 0, actual 1 ->  -5
    * predicted 1, actual 0 -> -25
    * predicted 1, actual 1 ->  +5

    Parameters
    ----------
    y : iterable of {0, 1}
        True labels.
    y_pred : iterable of {0, 1}
        Predicted labels, aligned with ``y``.

    Returns
    -------
    int
        Sum of the payoffs; 0 for empty input.

    Raises
    ------
    KeyError
        If a label other than 0 or 1 is encountered.
    """
    payoff = {(0,0): 0, (0,1): -5, (1,0): -25, (1,1): 5}
    total = 0
    # zip pairs up the elements of both iterables into tuples.
    for actual, pred in zip(y, y_pred):
        total += payoff[(pred, actual)]
    return total

# Wrap the profit function as a scikit-learn scorer; a higher profit is better.
profit_scoring = make_scorer(profit_scorer, greater_is_better=True)

# 10-fold stratified cross-validation.
# No random_state is passed: without shuffle=True the folds are already
# deterministic, the seed would have no effect, and current scikit-learn
# versions raise a ValueError for random_state combined with shuffle=False.
# To get shuffled but reproducible folds instead, use
# StratifiedKFold(n_splits=10, shuffle=True, random_state=42); note that this
# changes the fold assignment and therefore the reported scores.
cv = StratifiedKFold(n_splits=10)

In [41]:
# Feature extraction: concatenate the 16 best univariate features with the
# first 2 principal components.
features = [
    ('select_best', SelectKBest(k=16)),
    ('pca', decomposition.PCA(n_components=2)),
]
feature_union = FeatureUnion(features)

print('LR with PCA')

# Pipeline prefix shared by every model in the sweep.
# NOTE(review): the scaler comes after the feature union, so PCA is fitted on
# unscaled columns - confirm this ordering is intended.
estimators = [
    ('feature_union', feature_union),
    ('scaler', preprocessing.StandardScaler()),
]

# Sweep the inverse regularisation strength C of the logistic regression.
# The values are listed explicitly instead of rewriting the loop variable.
for c in (0.1, 1, 2, 3, 4, 5):
    # Build the step list fresh for each model: Pipeline keeps a reference to
    # the list it is given, so appending to and popping from a shared list
    # would also strip the classifier from `lr_af` afterwards.
    lr_af = Pipeline(
        estimators
        + [('LR', linear_model.LogisticRegression(C=c, solver='liblinear'))]
    )
    print('C set to: ', c)
    # Total profit over all CV folds.
    print('LR score: {}'.format(
        sum(cross_validate(lr_af, X, y=y, cv=cv, scoring=profit_scoring)['test_score'])
    ))

LR with PCA
C set to:  0.1
LR score: -205
C set to:  1
LR score: 260
C set to:  2
LR score: 300
C set to:  3
LR score: 295
C set to:  4
LR score: 295
C set to:  5
LR score: 305
