# Preprocessing

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import preprocessing as prep
from sklearn.utils import resample

import warnings

In [2]:
warnings.filterwarnings('ignore')

train = pd.read_csv('C:/Users/University/Desktop/DM2/DMC_2019_task/train.csv', sep='|')

### manual feature generation

In [3]:
# totalScanned:
train['totalScanned'] = train['scannedLineItemsPerSecond'] * train['totalScanTimeInSeconds']

# avgValuePerScan:
train['avgTimePerScan'] = 1/ train['scannedLineItemsPerSecond']
train['avgValuePerScan'] = train['avgTimePerScan'] * train['valuePerSecond']



# manual feature generation - "totalScanned" ratios

# withoutRegisPerPosition
train['withoutRegisPerPosition'] = train['scansWithoutRegistration'] / train['totalScanned']
# ratio of scansWithoutRegis in totalScan
# equivalent to lineItemVoidsPerPosition
# Might indicate how new or ambivalent a customer is. Expected to be higher for low "trustLevel"

# quantiModPerPosition
train['quantiModPerPosition'] = train['quantityModifications'] / train['totalScanned']
# ratio of quanityMods in totalScan



# manual feature generation - "grandTotal" ratios

# lineItemVoidsPerTotal
train['lineItemVoidsPerTotal'] = train['lineItemVoids'] / train['grandTotal']

# withoutRegisPerTotal
train['withoutRegisPerTotal'] = train['scansWithoutRegistration'] / train['grandTotal']

# quantiModPerTotal
train['quantiModPerTotal'] = train['quantityModifications'] / train['grandTotal']



# manual feature generation - "totalScanTimeInSeconds" ratios

# lineItemVoidsPerTime
train['lineItemVoidsPerTime'] = train['lineItemVoids'] / train['totalScanTimeInSeconds']

# withoutRegisPerTime
train['withoutRegisPerTime'] = train['scansWithoutRegistration'] / train['totalScanTimeInSeconds']

# quantiModPerTime
train['quantiModPerTime'] = train['quantityModifications'] / train['totalScanTimeInSeconds']

In [4]:
# Problems:
# mean of fraud = 0.055 --> unbalanced data set
# outlier handling
# how to handle trustLevel? (unbalanced)

# automated feature generation for more features
# Subset selection with filter, wrapper or pca 

# Simple Classification

In [5]:
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.metrics import make_scorer
#from sklearn.metrics import confusion_matrix, roc_auc_score ,roc_curve,auc

from xgboost.sklearn import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

In [6]:
# Train and test split
# No Nans

X = train.drop(columns='fraud', axis=1)
y = train['fraud']

### Evaluation function of supervisor Nico

In [7]:
def profit_scorer(y, y_pred):
    profit_matrix = {(0,0): 0, (0,1): -5, (1,0): -25, (1,1): 5}
    return sum(profit_matrix[(pred, actual)] for pred, actual in zip(y_pred, y))
                            # zip baut aus jedem iterierbaren object ein Tuple

profit_scoring = make_scorer(profit_scorer, greater_is_better=True)

### Baseline Model of supervisor Nico

In [8]:
# no preprocessing (no fixed seed?)

df = pd.read_csv('C:/Users/University/Desktop/DM2/DMC_2019_task/train.csv', sep='|')
X_base = df.drop(columns='fraud')
y_base = df['fraud']

print('Pure model without any preprocessing: \n',
      'XGB \t', sum(cross_validate(XGBClassifier(), X_base, y=y_base, cv=10, scoring=profit_scoring)['test_score']))

Pure model without any preprocessing: 
 XGB 	 -80


In [9]:
# other baseline models
print('Pure models without any preprocessing: \n',
      'LR C2 \t', sum(cross_validate(LogisticRegression(C=2), X_base, y=y_base, cv=10, scoring=profit_scoring)['test_score']), '\n'
                                                    #C=2 arbitrarily choosen
      'GNB \t'  , sum(cross_validate(GaussianNB(), X_base, y=y_base, cv=10, scoring=profit_scoring)['test_score']), '\n'
      'DT \t'    , sum(cross_validate(DecisionTreeClassifier(), X_base, y=y_base, cv=10, scoring=profit_scoring)['test_score']), '\n'
      'KNN \t'    , sum(cross_validate(KNeighborsClassifier(), X_base, y=y_base, cv=10, scoring=profit_scoring)['test_score']), '\n'
      'RANFO \t'    , sum(cross_validate(RandomForestClassifier(), X_base, y=y_base, cv=10, scoring=profit_scoring)['test_score']), '\n'
      'MLP \t'    , sum(cross_validate(MLPClassifier(), X_base, y=y_base, cv=10, scoring=profit_scoring)['test_score']), '\n'
      'SVC \t'    , sum(cross_validate(SVC(), X_base, y=y_base, cv=10, scoring=profit_scoring)['test_score']), '\n'
     ) 

Pure models without any preprocessing: 
 LR C2 	 -295 
GNB 	 -7185 
DT 	 -655 
KNN 	 -520 
RANFO 	 -290 
MLP 	 -815 
SVC 	 -520 



### Own Approaches:

In [10]:
# Own Model Approach with engineered features - pure (no scaling etc. implemented)
# Commented out: brief test how much only incorporating the feature 'totalscanned' would boost our models (xgb = 205)


# dictio = {'avgTimePerScan','avgValuePerScan','withoutRegisPerPosition','quantiModPerPosition',
#           'lineItemVoidsPerTotal','withoutRegisPerTotal','quantiModPerTotal','lineItemVoidsPerTime','withoutRegisPerTime',
#           'quantiModPerTime'}
#X_wf = X.drop(columns=dictio, axis=1)


# Init

score_list_xgb = []
score_list_lr = []
score_list_gnb = []
score_list_tr = []
score_list_knn = []
score_list_ranfo = []
score_list_mlp = []
score_list_svc = []


# Cross Val Init

cv = StratifiedKFold(n_splits=10, random_state=42)

# Cross Val Loop

for train_index, test_index in cv.split(X,y):  # adjust for X_wf
    X_train, X_test = X.loc[train_index], X.loc[test_index] # adjust for X_wf
    y_train, y_test = y.loc[train_index], y.loc[test_index] # adjust for X_wf
    
    # Models per split   

    xgb = XGBClassifier()
    xgb.fit(X_train, y_train)
    y_pred = xgb.predict(X_test)
    score_xgb = profit_scorer(y_test, y_pred)
    score_list_xgb.append(score_xgb)
    
    lr = LogisticRegression(C=2)
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_test)
    score_lr = profit_scorer(y_test,y_pred)
    score_list_lr.append(score_lr)
    
    naive_bayes = GaussianNB()
    naive_bayes.fit(X_train, y_train)
    y_pred = naive_bayes.predict(X_test)
    score_gnb = profit_scorer(y_test, y_pred)
    score_list_gnb.append(score_gnb)
    
    tree = DecisionTreeClassifier()
    tree.fit(X_train, y_train)
    y_pred = tree.predict(X_test)
    score_tr = profit_scorer(y_test, y_pred)
    score_list_tr.append(score_tr)  
    
    knn = KNeighborsClassifier()
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    score_knn = profit_scorer(y_test, y_pred)
    score_list_knn.append(score_knn)  
    
    ranfo = RandomForestClassifier()
    ranfo.fit(X_train, y_train)
    y_pred = ranfo.predict(X_test)
    score_ranfo = profit_scorer(y_test, y_pred)
    score_list_ranfo.append(score_ranfo)  
    
    mlp = MLPClassifier()
    mlp.fit(X_train, y_train)
    y_pred = mlp.predict(X_test)
    score_mlp = profit_scorer(y_test, y_pred)
    score_list_mlp.append(score_mlp) 
    
    svc = SVC()
    svc.fit(X_train, y_train)
    y_pred = svc.predict(X_test)
    score_svc = profit_scorer(y_test, y_pred)
    score_list_svc.append(score_svc) 
    
    
print('Model with engineered features (no other preprocessing:)')
print('XGB \t', sum(score_list_xgb))
print('LR \t', sum(score_list_lr))
print('GNB \t', sum(score_list_gnb))
print('TR \t', sum(score_list_tr))
print('KNN \t', sum(score_list_knn))
print('RANFO \t', sum(score_list_ranfo))
print('MLP \t', sum(score_list_mlp))
print('SVC \t', sum(score_list_svc))



# Why do these tree results vary between executions of this cell(s)? We are using the same cross val seed...

Model with engineered features (no other preprocessing:)
XGB 	 230
LR 	 190
GNB 	 -9140
TR 	 -45
KNN 	 -910
RANFO 	 -170
MLP 	 -690
SVC 	 -520


In [11]:
# In the following you might want to commented GNB, KNN, MLP and SVC out to decrease the computing time.
# They preform consistently the worse. Later on, when working on further hyperparameter tuning, they might be included again

In [12]:
# Own Model Approach - feature evaluation
# I know that its pretty quick and dirty and that we might want to use less stochastic and more sophisticated approaches
# like PCA, etc.  


# Init

cv = StratifiedKFold(n_splits=10, random_state=42)
dictio = {'totalScanned','avgTimePerScan','avgValuePerScan','withoutRegisPerPosition','quantiModPerPosition',
          'lineItemVoidsPerTotal','withoutRegisPerTotal','quantiModPerTotal','lineItemVoidsPerTime','withoutRegisPerTime',
          'quantiModPerTime'}

# Eval loop

for f in dictio:
    X_wf = X
    X_wf=X.drop(columns=f, axis=1)

    # Cross Val Init

    score_list_xgb = []
    score_list_lr = []
    score_list_gnb = []
    score_list_tr = []
    score_list_knn = []
    score_list_ranfo = []
    score_list_mlp = []
    score_list_svc = []

    # Cross Val Loop

    for train_index, test_index in cv.split(X_wf,y): 
        X_train, X_test = X_wf.loc[train_index], X_wf.loc[test_index]
        y_train, y_test = y.loc[train_index], y.loc[test_index]

        
        # Models per split   

        xgb = XGBClassifier()
        xgb.fit(X_train, y_train)
        y_pred = xgb.predict(X_test)
        score_xgb = profit_scorer(y_test, y_pred)
        score_list_xgb.append(score_xgb)

        lr = LogisticRegression(C=2)
        lr.fit(X_train, y_train)
        y_pred = lr.predict(X_test)
        score_lr = profit_scorer(y_test,y_pred)
        score_list_lr.append(score_lr)

        naive_bayes = GaussianNB()
        naive_bayes.fit(X_train, y_train)
        y_pred = naive_bayes.predict(X_test)
        score_gnb = profit_scorer(y_test, y_pred)
        score_list_gnb.append(score_gnb)

        tree = DecisionTreeClassifier()
        tree.fit(X_train, y_train)
        y_pred = tree.predict(X_test)
        score_tr = profit_scorer(y_test, y_pred)
        score_list_tr.append(score_tr)
        
        knn = KNeighborsClassifier()
        knn.fit(X_train, y_train)
        y_pred = knn.predict(X_test)
        score_knn = profit_scorer(y_test, y_pred)
        score_list_knn.append(score_knn)  

        ranfo = RandomForestClassifier()
        ranfo.fit(X_train, y_train)
        y_pred = ranfo.predict(X_test)
        score_ranfo = profit_scorer(y_test, y_pred)
        score_list_ranfo.append(score_ranfo)  

        mlp = MLPClassifier()
        mlp.fit(X_train, y_train)
        y_pred = mlp.predict(X_test)
        score_mlp = profit_scorer(y_test, y_pred)
        score_list_mlp.append(score_mlp) 

        svc = SVC()
        svc.fit(X_train, y_train)
        y_pred = svc.predict(X_test)
        score_svc = profit_scorer(y_test, y_pred)
        score_list_svc.append(score_svc) 
        

    print('\n leaving out:', f)
    print('XGB \t', sum(score_list_xgb))
    print('LR \t', sum(score_list_lr))
    print('GNB \t', sum(score_list_gnb))
    print('TR \t', sum(score_list_tr))
    print('KNN \t', sum(score_list_knn))
    print('RANFO \t', sum(score_list_ranfo))
    print('MLP \t', sum(score_list_mlp))
    print('SVC \t', sum(score_list_svc))

# Following regards to xgb: 
# These feature seem to have the most influence (leaving out results in a lower cost score)
# totalScanned = 5
# quantiModPerTime = 130
# quantiModPerTotal = 160
# withoutRegisPerPositio = 165
# withoutRegisPerPositio = 165
# withoutRegisPerTime = 170

# These feature seem to have the least influence (leaving out results in a similar cost score)
# avgTimePerScan = 230 at first glance leaving avgTimePerScan out of the model would not enhance xgb but worsen lr
# avgValuePerScan = 215
# lineItemVoidsPerTotal = 205
# quantiModPerPosition = 205
# withoutRegisPerTotal = 195
# lineItemVoidsPerTime = 190



 leaving out: avgTimePerScan
XGB 	 230
LR 	 70
GNB 	 -9190
TR 	 -165
KNN 	 -620
RANFO 	 25
MLP 	 -1015
SVC 	 -520

 leaving out: lineItemVoidsPerTotal
XGB 	 205
LR 	 225
GNB 	 -8740
TR 	 -200
KNN 	 -910
RANFO 	 40
MLP 	 -495
SVC 	 -520

 leaving out: avgValuePerScan
XGB 	 215
LR 	 190
GNB 	 -9165
TR 	 -220
KNN 	 -910
RANFO 	 0
MLP 	 -605
SVC 	 -520

 leaving out: quantiModPerTime
XGB 	 130
LR 	 200
GNB 	 -8865
TR 	 -45
KNN 	 -910
RANFO 	 -15
MLP 	 -725
SVC 	 -520

 leaving out: lineItemVoidsPerTime
XGB 	 190
LR 	 190
GNB 	 -8765
TR 	 -215
KNN 	 -910
RANFO 	 -100
MLP 	 -945
SVC 	 -520

 leaving out: withoutRegisPerPosition
XGB 	 165
LR 	 200
GNB 	 -8915
TR 	 -240
KNN 	 -910
RANFO 	 -60
MLP 	 -1200
SVC 	 -520

 leaving out: quantiModPerTotal
XGB 	 160
LR 	 200
GNB 	 -9115
TR 	 -175
KNN 	 -910
RANFO 	 -85
MLP 	 -735
SVC 	 -520

 leaving out: quantiModPerPosition
XGB 	 205
LR 	 190
GNB 	 -9090
TR 	 -195
KNN 	 -910
RANFO 	 30
MLP 	 -495
SVC 	 -520

 leaving out: withoutRegisPerTime
XGB 	 1

In [13]:
# Own Model Approach - only scaling


# Following outcommented lines: I tryed to figure out whether the cost score differs, when trustlevel is not scaled
# (It probably wont make a difference, but I guessed its more of a ordinal variable)
# I failed because of emergence of NaNs (dont know yet where they come from)
    

# Init

score_list_xgb = []
score_list_lr = []
score_list_gnb = []
score_list_tr = []
score_list_knn = []
score_list_ranfo = []
score_list_mlp = []
score_list_svc = []

# Cross Val Init

cv = StratifiedKFold(n_splits=10, random_state=42)


# Cross Val Loop

for train_index, test_index in cv.split(X,y):   
    X_train, X_test = X.loc[train_index], X.loc[test_index]
    y_train, y_test = y.loc[train_index], y.loc[test_index]
    
    # Scaling per split

    X_train_unscal = X_train
    #X_train_unscal = X_train.drop(['trustLevel'], axis=1)
    scaler = prep.StandardScaler()
    X_train_scal_np = scaler.fit_transform(X=X_train_unscal) # scaled data as np-array

    X_train_cols = X_train_unscal.columns
    X_train_scal = pd.DataFrame(X_train_scal_np, columns=X_train_cols)
    #X_train_scal_unfinished = pd.DataFrame(X_train_scal_np, columns=X_train_cols)
    #X_train_scal= pd.concat([X_train_scal_unfinished, X_train['trustLevel']])
    
    
    X_test_unscal = X_test
    #X_test_unscal = X_test.drop(['trustLevel'], axis=1)
    scaler = prep.StandardScaler()
    X_test_scal_np = scaler.fit_transform(X=X_test_unscal) # scaled data as np-array

    X_test_cols = X_test_unscal.columns
    X_test_scal = pd.DataFrame(X_test_scal_np, columns=X_test_cols)
    #X_test_scal_unfinished = pd.DataFrame(X_test_scal_np, columns=X_train_cols)
    #X_test_scal= pd.concat([X_test_scal_unfinished, X_test['trustLevel']])    
    
    
    # Models per split   

    xgb = XGBClassifier()
    xgb.fit(X_train_scal, y_train)
    y_pred = xgb.predict(X_test_scal)
    score_xgb = profit_scorer(y_test, y_pred)
    score_list_xgb.append(score_xgb)
    
    lr = LogisticRegression(C=2)
    lr.fit(X_train_scal, y_train)
    y_pred = lr.predict(X_test_scal)
    score_lr = profit_scorer(y_test,y_pred)
    score_list_lr.append(score_lr)
    
    naive_bayes = GaussianNB()
    naive_bayes.fit(X_train_scal, y_train)
    y_pred = naive_bayes.predict(X_test_scal)
    score_gnb = profit_scorer(y_test, y_pred)
    score_list_gnb.append(score_gnb)
    
    tree = DecisionTreeClassifier()
    tree.fit(X_train_scal, y_train)
    y_pred = tree.predict(X_test_scal)
    score_tr = profit_scorer(y_test, y_pred)
    score_list_tr.append(score_tr)
    
    knn = KNeighborsClassifier()
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    score_knn = profit_scorer(y_test, y_pred)
    score_list_knn.append(score_knn)  

    ranfo = RandomForestClassifier()
    ranfo.fit(X_train, y_train)
    y_pred = ranfo.predict(X_test)
    score_ranfo = profit_scorer(y_test, y_pred)
    score_list_ranfo.append(score_ranfo)  

    mlp = MLPClassifier()
    mlp.fit(X_train, y_train)
    y_pred = mlp.predict(X_test)
    score_mlp = profit_scorer(y_test, y_pred)
    score_list_mlp.append(score_mlp) 

    svc = SVC()
    svc.fit(X_train, y_train)
    y_pred = svc.predict(X_test)
    score_svc = profit_scorer(y_test, y_pred)
    score_list_svc.append(score_svc)

print('Model with engineered features - scaled')
print('XGB \t', sum(score_list_xgb))
print('LR \t', sum(score_list_lr))
print('GNB \t', sum(score_list_gnb))
print('TR \t', sum(score_list_tr))
print('KNN \t', sum(score_list_knn))
print('RANFO \t', sum(score_list_ranfo))
print('MLP \t', sum(score_list_mlp))
print('SVC \t', sum(score_list_svc))

# Notice LR = 230!
# I dont really know, why only LR benefits from additional scaling. I guess the DMC send us already scaled data

Model with engineered features - scaled
XGB 	 -15
LR 	 230
GNB 	 -3755
TR 	 -535
KNN 	 -910
RANFO 	 -120
MLP 	 -735
SVC 	 -520


In [14]:
# Own Model Approach - upsampling (always worse off)

# I know that its pretty quick and dirty and that we might want to use less stochastic approaches


# Init

cv = StratifiedKFold(n_splits=10, random_state=42)
print('Model with engineered features - upsampled (no scaling)')

# Upsampling Loop

for counter_ratio in range(70,110,3):   # super dirty I know. I just wanted more refined steps than my original range(7,10)
                                        # for loops only allow integer steps --> weird factor 10 fuckering
    def upsam_paras(majo_len):          # upsample parameters; length of majority class
        upsam_ratio = majo_len/(counter_ratio/10)   # '/10' would drop out if with original range(7,10). Still dirty, I know.
        return int(upsam_ratio)
    
    
    # Cross Val Init
    
    score_list_xgb = []
    score_list_lr = []
    score_list_gnb = []
    score_list_tr = []
    score_list_knn = []
    score_list_ranfo = []
    score_list_mlp = []
    score_list_svc = []


    # Cross Val Loop

    for train_index, test_index in cv.split(X,y):
        #X_train, X_test = X.loc[train_index], X.loc[test_index]
        #y_train, y_test = y.loc[train_index], y.loc[test_index]


        # Upsampling per split 

        Xy_train = train.loc[train_index]
        Xy_test = train.loc[test_index]

        Xy_train_majo = Xy_train[Xy_train[('fraud')]==0]
        Xy_tain_mino = Xy_train[Xy_train[('fraud')]==1]
        Xy_train_mino_upsamp = resample(Xy_tain_mino, replace = True, n_samples=upsam_paras(len(Xy_train[Xy_train[('fraud')]==0])), random_state= 123)

        Xy_train_balanced = pd.concat([Xy_train_majo, Xy_train_mino_upsamp])  

        X_train = Xy_train_balanced.drop(['fraud'], axis=1)
        y_train = Xy_train_balanced['fraud']

        X_test = Xy_test.drop(['fraud'], axis=1)
        y_test = Xy_test['fraud']


        # Models per split   

        xgb = XGBClassifier()
        xgb.fit(X_train, y_train)
        y_pred = xgb.predict(X_test)
        score_xgb = profit_scorer(y_test, y_pred)
        score_list_xgb.append(score_xgb)

        lr = LogisticRegression(C=2)
        lr.fit(X_train, y_train)
        y_pred = lr.predict(X_test)
        score_lr = profit_scorer(y_test,y_pred)
        score_list_lr.append(score_lr)

        naive_bayes = GaussianNB()
        naive_bayes.fit(X_train, y_train)
        y_pred = naive_bayes.predict(X_test)
        score_gnb = profit_scorer(y_test, y_pred)
        score_list_gnb.append(score_gnb)

        tree = DecisionTreeClassifier()
        tree.fit(X_train, y_train)
        y_pred = tree.predict(X_test)
        score_tr = profit_scorer(y_test, y_pred)
        score_list_tr.append(score_tr)
        
        knn = KNeighborsClassifier()
        knn.fit(X_train, y_train)
        y_pred = knn.predict(X_test)
        score_knn = profit_scorer(y_test, y_pred)
        score_list_knn.append(score_knn)  

        ranfo = RandomForestClassifier()
        ranfo.fit(X_train, y_train)
        y_pred = ranfo.predict(X_test)
        score_ranfo = profit_scorer(y_test, y_pred)
        score_list_ranfo.append(score_ranfo)  

        mlp = MLPClassifier()
        mlp.fit(X_train, y_train)
        y_pred = mlp.predict(X_test)
        score_mlp = profit_scorer(y_test, y_pred)
        score_list_mlp.append(score_mlp) 

        svc = SVC()
        svc.fit(X_train, y_train)
        y_pred = svc.predict(X_test)
        score_svc = profit_scorer(y_test, y_pred)
        score_list_svc.append(score_svc)

    print('upsampled to:', upsam_paras(len(Xy_train[Xy_train[('fraud')]==0])))
    print('XGB \t', sum(score_list_xgb))
    print('LR \t', sum(score_list_lr))
    print('GNB \t', sum(score_list_gnb))
    print('TR \t', sum(score_list_tr))
    print('KNN \t', sum(score_list_knn))
    print('RANFO \t', sum(score_list_ranfo))
    print('MLP \t', sum(score_list_mlp))
    print('SVC \t', sum(score_list_svc))

print('Unsampled number of minority - frauds:', len(Xy_train[Xy_train[('fraud')]==1]))

# range(1,10):
# upsampled to: 1598
# XGB 	 -165
# upsampled to: 799
# XGB 	 -30
# upsampled to: 532
# XGB 	 50
# upsampled to: 399
# XGB 	 25
# upsampled to: 319
# XGB 	 15
# upsampled to: 266
# XGB 	 55
# upsampled to: 228
# XGB 	 45
# upsampled to: 199
# XGB 	 120             sweet spot?
# upsampled to: 177
# XGB 	 50

Model with engineered features - upsampled (no scaling)
upsampled to: 228
XGB 	 45
LR 	 60
GNB 	 -9690
TR 	 -345
KNN 	 -3435
RANFO 	 -85
MLP 	 -1540
SVC 	 -520
upsampled to: 218
XGB 	 95
LR 	 60
GNB 	 -9690
TR 	 -285
KNN 	 -3395
RANFO 	 -40
MLP 	 -1590
SVC 	 -520
upsampled to: 210
XGB 	 95
LR 	 75
GNB 	 -9790
TR 	 -285
KNN 	 -3135
RANFO 	 60
MLP 	 -830
SVC 	 -520
upsampled to: 202
XGB 	 80
LR 	 75
GNB 	 -9765
TR 	 -330
KNN 	 -3120
RANFO 	 -35
MLP 	 -465
SVC 	 -520
upsampled to: 194
XGB 	 120
LR 	 100
GNB 	 -9640
TR 	 -315
KNN 	 -3120
RANFO 	 -35
MLP 	 -830
SVC 	 -520
upsampled to: 188
XGB 	 95
LR 	 100
GNB 	 -9565
TR 	 -440
KNN 	 -2945
RANFO 	 -80
MLP 	 -805
SVC 	 -520
upsampled to: 181
XGB 	 20
LR 	 90
GNB 	 -9515
TR 	 -385
KNN 	 -2870
RANFO 	 -40
MLP 	 -530
SVC 	 -520
upsampled to: 175
XGB 	 20
LR 	 90
GNB 	 -9515
TR 	 -390
KNN 	 -2780
RANFO 	 -205
MLP 	 -485
SVC 	 -520
upsampled to: 170
XGB 	 5
LR 	 90
GNB 	 -9490
TR 	 -360
KNN 	 -2655
RANFO 	 -105
MLP 	 -210
SVC 	 -520
upsampled to

In [15]:
# Own Model Approach - upsampling and scaling

# I know that its pretty quick and dirty and that we might want to use less stochastic approaches


# Init

cv = StratifiedKFold(n_splits=10, random_state=42)
print('Model with engineered features - upsampled & scaled)')

# Upsampling Loop

for counter_ratio in range(70,110,3):   # super dirty I know. I just wanted more refined steps than my original range(7,10)

    def upsam_paras(majo_len):
        upsam_ratio = majo_len/(counter_ratio/10)   # '/10' would drop out if with original range(7,10). Still dirty, I know.
        return int(upsam_ratio)
    
    
    # Cross Val Init
    
    score_list_xgb = []
    score_list_lr = []
    score_list_gnb = []
    score_list_tr = []
    score_list_knn = []
    score_list_ranfo = []
    score_list_mlp = []
    score_list_svc = []


    # Cross Val Loop

    for train_index, test_index in cv.split(X,y):
        #X_train, X_test = X.loc[train_index], X.loc[test_index]
        #y_train, y_test = y.loc[train_index], y.loc[test_index]


        # Upsampling per split 

        Xy_train = train.loc[train_index]
        Xy_test = train.loc[test_index]

        Xy_train_majo = Xy_train[Xy_train[('fraud')]==0]
        Xy_tain_mino = Xy_train[Xy_train[('fraud')]==1]
        Xy_train_mino_upsamp = resample(Xy_tain_mino, replace = True, n_samples=upsam_paras(len(Xy_train[Xy_train[('fraud')]==0])), random_state= 123)

        Xy_train_balanced = pd.concat([Xy_train_majo, Xy_train_mino_upsamp])  

        X_train = Xy_train_balanced.drop(['fraud'], axis=1)
        y_train = Xy_train_balanced['fraud']

        X_test = Xy_test.drop(['fraud'], axis=1)
        y_test = Xy_test['fraud']
        
            # Scaling per split

        X_train_unscal = X_train
        #X_train_unscal = X_train.drop(['trustLevel'], axis=1)
        scaler = prep.StandardScaler()
        X_train_scal_np = scaler.fit_transform(X=X_train_unscal) # scaled data as np-array

        X_train_cols = X_train_unscal.columns
        X_train_scal = pd.DataFrame(X_train_scal_np, columns=X_train_cols)
        #X_train_scal_unfinished = pd.DataFrame(X_train_scal_np, columns=X_train_cols)
        #X_train_scal= pd.concat([X_train_scal_unfinished, X_train['trustLevel']])


        X_test_unscal = X_test
        #X_test_unscal = X_test.drop(['trustLevel'], axis=1)
        scaler = prep.StandardScaler()
        X_test_scal_np = scaler.fit_transform(X=X_test_unscal) # scaled data as np-array

        X_test_cols = X_test_unscal.columns
        X_test_scal = pd.DataFrame(X_test_scal_np, columns=X_test_cols)
        #X_test_scal_unfinished = pd.DataFrame(X_test_scal_np, columns=X_train_cols)
        #X_test_scal= pd.concat([X_test_scal_unfinished, X_test['trustLevel']])



        # Models per split   

        xgb = XGBClassifier()
        xgb.fit(X_train_scal, y_train)
        y_pred = xgb.predict(X_test_scal)
        score_xgb = profit_scorer(y_test, y_pred)
        score_list_xgb.append(score_xgb)

        lr = LogisticRegression(C=2)
        lr.fit(X_train_scal, y_train)
        y_pred = lr.predict(X_test_scal)
        score_lr = profit_scorer(y_test,y_pred)
        score_list_lr.append(score_lr)

        naive_bayes = GaussianNB()
        naive_bayes.fit(X_train_scal, y_train)
        y_pred = naive_bayes.predict(X_test_scal)
        score_gnb = profit_scorer(y_test, y_pred)
        score_list_gnb.append(score_gnb)

        tree = DecisionTreeClassifier()
        tree.fit(X_train_scal, y_train)
        y_pred = tree.predict(X_test_scal)
        score_tr = profit_scorer(y_test, y_pred)
        score_list_tr.append(score_tr)
        
        knn = KNeighborsClassifier()
        knn.fit(X_train, y_train)
        y_pred = knn.predict(X_test)
        score_knn = profit_scorer(y_test, y_pred)
        score_list_knn.append(score_knn)  

        ranfo = RandomForestClassifier()
        ranfo.fit(X_train, y_train)
        y_pred = ranfo.predict(X_test)
        score_ranfo = profit_scorer(y_test, y_pred)
        score_list_ranfo.append(score_ranfo)  

        mlp = MLPClassifier()
        mlp.fit(X_train, y_train)
        y_pred = mlp.predict(X_test)
        score_mlp = profit_scorer(y_test, y_pred)
        score_list_mlp.append(score_mlp) 

        svc = SVC()
        svc.fit(X_train, y_train)
        y_pred = svc.predict(X_test)
        score_svc = profit_scorer(y_test, y_pred)
        score_list_svc.append(score_svc)


    print('upsampled to:', upsam_paras(len(Xy_train[Xy_train[('fraud')]==0])))
    print('XGB \t', sum(score_list_xgb))
    print('LR \t', sum(score_list_lr))
    print('GNB \t', sum(score_list_gnb))
    print('TR \t', sum(score_list_tr))
    print('KNN \t', sum(score_list_knn))
    print('RANFO \t', sum(score_list_ranfo))
    print('MLP \t', sum(score_list_mlp))
    print('SVC \t', sum(score_list_svc))
    
print('Unsampled number of minority - frauds:', len(Xy_train[Xy_train[('fraud')]==1]))

# Preformes surprisingly bad on xgb and lr. Did I do smth wrong? 

Model with engineered features - upsampled & scaled)
upsampled to: 228
XGB 	 35
LR 	 -1165
GNB 	 -4815
TR 	 -405
KNN 	 -3435
RANFO 	 -45
MLP 	 -595
SVC 	 -520
upsampled to: 218
XGB 	 15
LR 	 -1040
GNB 	 -4780
TR 	 -665
KNN 	 -3395
RANFO 	 -75
MLP 	 -390
SVC 	 -520
upsampled to: 210
XGB 	 25
LR 	 -965
GNB 	 -4780
TR 	 -495
KNN 	 -3135
RANFO 	 -160
MLP 	 -1005
SVC 	 -520
upsampled to: 202
XGB 	 -35
LR 	 -750
GNB 	 -4755
TR 	 -485
KNN 	 -3120
RANFO 	 -80
MLP 	 -815
SVC 	 -520
upsampled to: 194
XGB 	 -45
LR 	 -760
GNB 	 -4690
TR 	 -565
KNN 	 -3120
RANFO 	 -20
MLP 	 -405
SVC 	 -520
upsampled to: 188
XGB 	 -20
LR 	 -660
GNB 	 -4665
TR 	 -360
KNN 	 -2945
RANFO 	 -20
MLP 	 -720
SVC 	 -520
upsampled to: 181
XGB 	 -15
LR 	 -635
GNB 	 -4715
TR 	 -405
KNN 	 -2870
RANFO 	 -25
MLP 	 -690
SVC 	 -520
upsampled to: 175
XGB 	 10
LR 	 -605
GNB 	 -4640
TR 	 -380
KNN 	 -2780
RANFO 	 -145
MLP 	 -250
SVC 	 -520
upsampled to: 170
XGB 	 10
LR 	 -505
GNB 	 -4665
TR 	 -215
KNN 	 -2655
RANFO 	 -105
MLP 	 -650
SVC

### More Hyperparameter tuning

In [16]:
# Best model until now: XGB without scaling and no upsampling -> 230
# and LR with only scaling -> 230

In [17]:
# next up: XGB & LR Tuning, MetaCost Algorithm (Lecture 4, ensembles)