# Preprocessing

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import preprocessing as prep
from sklearn.utils import resample

import warnings

In [2]:
warnings.filterwarnings('ignore')

train = pd.read_csv('C:/Users/University/Desktop/DM2/DMC_2019_task/train.csv', sep='|')

### manual feature generation

In [3]:
# totalScanned:
train['totalScanned'] = train['scannedLineItemsPerSecond'] * train['totalScanTimeInSeconds']

# avgValuePerScan:
train['avgTimePerScan'] = 1/ train['scannedLineItemsPerSecond']
train['avgValuePerScan'] = train['avgTimePerScan'] * train['valuePerSecond']



# manual feature generation - "totalScanned" ratios

# withoutRegisPerPosition
train['withoutRegisPerPosition'] = train['scansWithoutRegistration'] / train['totalScanned']
# ratio of scansWithoutRegis in totalScan
# equivalent to lineItemVoidsPerPosition
# Might indicate how new or ambivalent a customer is. Expected to be higher for low "trustLevel"

# quantiModPerPosition
train['quantiModPerPosition'] = train['quantityModifications'] / train['totalScanned']
# ratio of quanityMods in totalScan



# manual feature generation - "grandTotal" ratios

# lineItemVoidsPerTotal
train['lineItemVoidsPerTotal'] = train['lineItemVoids'] / train['grandTotal']

# withoutRegisPerTotal
train['withoutRegisPerTotal'] = train['scansWithoutRegistration'] / train['grandTotal']

# quantiModPerTotal
train['quantiModPerTotal'] = train['quantityModifications'] / train['grandTotal']



# manual feature generation - "totalScanTimeInSeconds" ratios

# lineItemVoidsPerTime
train['lineItemVoidsPerTime'] = train['lineItemVoids'] / train['totalScanTimeInSeconds']

# withoutRegisPerTime
train['withoutRegisPerTime'] = train['scansWithoutRegistration'] / train['totalScanTimeInSeconds']

# quantiModPerTime
train['quantiModPerTime'] = train['quantityModifications'] / train['totalScanTimeInSeconds']

In [4]:
# Problems:
# mean of fraud = 0.055 --> unbalanced data set
# outlier handling
# how to handle trustLevel? (unbalanced)

# automated feature generation for more features
# Subset selection with filter, wrapper or pca 

In [6]:
import featuretools as ft

In [7]:
es = ft.EntitySet(id = 'customers')

In [8]:
es = es.entity_from_dataframe(entity_id='customers',dataframe=train,
                              variable_types = {'trustLevel': ft.variable_types.Categorical},index='id',make_index=True)

In [9]:
es['customers']

Entity: customers
  Variables:
    id (dtype: index)
    totalScanTimeInSeconds (dtype: numeric)
    grandTotal (dtype: numeric)
    lineItemVoids (dtype: numeric)
    scansWithoutRegistration (dtype: numeric)
    quantityModifications (dtype: numeric)
    scannedLineItemsPerSecond (dtype: numeric)
    valuePerSecond (dtype: numeric)
    lineItemVoidsPerPosition (dtype: numeric)
    fraud (dtype: numeric)
    totalScanned (dtype: numeric)
    avgTimePerScan (dtype: numeric)
    avgValuePerScan (dtype: numeric)
    withoutRegisPerPosition (dtype: numeric)
    quantiModPerPosition (dtype: numeric)
    lineItemVoidsPerTotal (dtype: numeric)
    withoutRegisPerTotal (dtype: numeric)
    quantiModPerTotal (dtype: numeric)
    lineItemVoidsPerTime (dtype: numeric)
    withoutRegisPerTime (dtype: numeric)
    quantiModPerTime (dtype: numeric)
    trustLevel (dtype: categorical)
  Shape:
    (Rows: 1879, Columns: 22)

In [10]:
# List possible aggregation primitives 
primitives = ft.list_primitives()
pd.options.display.max_colwidth = 100
#primitives[primitives['type'] == 'aggregation'].head(20)

In [11]:
# List possible transformation primitives
#primitives[primitives['type'] == 'transform'].head(40)

In [12]:

# Can take a while to compute!!!

features, feature_names = ft.dfs(entityset = es, target_entity = 'customers', 
                                 agg_primitives = ['skew','trend','median', 'mean', 'max', 'std'],
                                 trans_primitives = ['subtract', 'percentile'])

In [13]:
features.head()

Unnamed: 0_level_0,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition,fraud,totalScanned,...,PERCENTILE(withoutRegisPerPosition - lineItemVoidsPerTotal),PERCENTILE(lineItemVoidsPerTotal - withoutRegisPerPosition),PERCENTILE(lineItemVoidsPerPosition - valuePerSecond),PERCENTILE(totalScanTimeInSeconds - quantiModPerTotal),PERCENTILE(lineItemVoidsPerTime - quantiModPerTotal),PERCENTILE(lineItemVoidsPerPosition - quantiModPerTotal),PERCENTILE(scansWithoutRegistration - quantiModPerPosition),PERCENTILE(scansWithoutRegistration - lineItemVoidsPerTime),PERCENTILE(withoutRegisPerTotal - grandTotal),PERCENTILE(withoutRegisPerTotal - valuePerSecond)
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1054,54.7,7,0,3,0.027514,0.051898,0.241379,0,29.0,...,0.144758,0.855774,0.405535,0.565194,0.435338,0.394891,0.068121,0.040447,0.464609,0.252262
1,108,27.36,5,2,4,0.12963,0.253333,0.357143,0,14.0,...,0.220862,0.77967,0.306014,0.055349,0.223523,0.423097,0.219798,0.194252,0.731772,0.108568
2,1516,62.16,3,10,5,0.008575,0.041003,0.230769,0,13.0,...,0.816924,0.183608,0.406067,0.821181,0.276743,0.357105,0.927621,0.98297,0.384779,0.724321
3,1791,92.31,8,4,4,0.016192,0.051541,0.275862,0,29.0,...,0.342735,0.657797,0.457158,0.970197,0.499202,0.443321,0.436402,0.431613,0.073443,0.398616
4,430,81.53,3,7,2,0.062791,0.189605,0.111111,0,27.0,...,0.552954,0.447578,0.099521,0.221394,0.686535,0.276743,0.730442,0.689729,0.196381,0.158595


In [14]:
features.columns

Index(['totalScanTimeInSeconds', 'grandTotal', 'lineItemVoids',
       'scansWithoutRegistration', 'quantityModifications',
       'scannedLineItemsPerSecond', 'valuePerSecond',
       'lineItemVoidsPerPosition', 'fraud', 'totalScanned',
       ...
       'PERCENTILE(withoutRegisPerPosition - lineItemVoidsPerTotal)',
       'PERCENTILE(lineItemVoidsPerTotal - withoutRegisPerPosition)',
       'PERCENTILE(lineItemVoidsPerPosition - valuePerSecond)',
       'PERCENTILE(totalScanTimeInSeconds - quantiModPerTotal)',
       'PERCENTILE(lineItemVoidsPerTime - quantiModPerTotal)',
       'PERCENTILE(lineItemVoidsPerPosition - quantiModPerTotal)',
       'PERCENTILE(scansWithoutRegistration - quantiModPerPosition)',
       'PERCENTILE(scansWithoutRegistration - lineItemVoidsPerTime)',
       'PERCENTILE(withoutRegisPerTotal - grandTotal)',
       'PERCENTILE(withoutRegisPerTotal - valuePerSecond)'],
      dtype='object', length=801)

# Simple Classification

In [15]:
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.metrics import make_scorer
#from sklearn.metrics import confusion_matrix, roc_auc_score ,roc_curve,auc

from xgboost.sklearn import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

In [16]:
# Train and test split
# No Nans

X = train.drop(columns='fraud', axis=1)
y = train['fraud']

### Evaluation function of supervisor Nico

In [17]:
def profit_scorer(y, y_pred):
    profit_matrix = {(0,0): 0, (0,1): -5, (1,0): -25, (1,1): 5}
    return sum(profit_matrix[(pred, actual)] for pred, actual in zip(y_pred, y))
                            # zip baut aus jedem iterierbaren object ein Tuple

profit_scoring = make_scorer(profit_scorer, greater_is_better=True)

### Baseline Model of supervisor Nico

In [18]:
# no preprocessing (no fixed seed?)

df = pd.read_csv('C:/Users/University/Desktop/DM2/DMC_2019_task/train.csv', sep='|')
X_base = features.drop(columns='fraud')
y_base = features['fraud']

cv = StratifiedKFold(n_splits=10, random_state=42)

print('Pure model without any preprocessing: \n',
      'XGB \t', sum(cross_validate(XGBClassifier(), X_base, y=y_base, cv=cv, scoring=profit_scoring)['test_score']))

Pure model without any preprocessing: 
 XGB 	 520


In [19]:
# other baseline models
print('Pure models without any preprocessing: \n',
      'LR C2 \t', sum(cross_validate(LogisticRegression(C=2), X_base, y=y_base, cv=cv, scoring=profit_scoring)['test_score']), '\n'
                                                    #C=2 arbitrarily choosen
#       'GNB \t'  , sum(cross_validate(GaussianNB(), X_base, y=y_base, cv=cv, scoring=profit_scoring)['test_score']), '\n'
#       'DT \t'    , sum(cross_validate(DecisionTreeClassifier(), X_base, y=y_base, cv=cv, scoring=profit_scoring)['test_score']), '\n'
#       'KNN \t'    , sum(cross_validate(KNeighborsClassifier(), X_base, y=y_base, cv=cv, scoring=profit_scoring)['test_score']), '\n'
#       'RANFO \t'    , sum(cross_validate(RandomForestClassifier(), X_base, y=y_base, cv=cv, scoring=profit_scoring)['test_score']), '\n'
#       'MLP \t'    , sum(cross_validate(MLPClassifier(), X_base, y=y_base, cv=cv, scoring=profit_scoring)['test_score']), '\n'
#       'SVC \t'    , sum(cross_validate(SVC(), X_base, y=y_base, cv=cv, scoring=profit_scoring)['test_score']), '\n'
     ) 

# Pure models without any preprocessing: 
#  LR C2 	 -295 
# GNB 	 -7185 
# DT 	 -655 
# KNN 	 -520 
# RANFO 	 -290 
# MLP 	 -815 
# SVC 	 -520 

Pure models without any preprocessing: 
 LR C2 	 510 



### Own Approaches:

In [None]:
# Own Model Approach with engineered features - pure (no scaling etc. implemented)
# Commented out: brief test how much only incorporating the feature 'totalscanned' would boost our models (xgb = 205)


# dictio = {'avgTimePerScan','avgValuePerScan','withoutRegisPerPosition','quantiModPerPosition',
#           'lineItemVoidsPerTotal','withoutRegisPerTotal','quantiModPerTotal','lineItemVoidsPerTime','withoutRegisPerTime',
#           'quantiModPerTime'}
#X_wf = X.drop(columns=dictio, axis=1)


# Init

score_list_xgb = []
score_list_lr = []
score_list_gnb = []
score_list_tr = []
score_list_knn = []
score_list_ranfo = []
score_list_mlp = []
score_list_svc = []


# Cross Val Init

cv = StratifiedKFold(n_splits=10, random_state=42)

# Cross Val Loop

for train_index, test_index in cv.split(X,y):  # adjust for X_wf
    X_train, X_test = X.loc[train_index], X.loc[test_index] # adjust for X_wf
    y_train, y_test = y.loc[train_index], y.loc[test_index] # adjust for X_wf
    
    # Models per split   

    xgb = XGBClassifier()
    xgb.fit(X_train, y_train)
    y_pred = xgb.predict(X_test)
    score_xgb = profit_scorer(y_test, y_pred)
    score_list_xgb.append(score_xgb)
    
    lr = LogisticRegression(C=2)
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_test)
    score_lr = profit_scorer(y_test,y_pred)
    score_list_lr.append(score_lr)
    
    naive_bayes = GaussianNB()
    naive_bayes.fit(X_train, y_train)
    y_pred = naive_bayes.predict(X_test)
    score_gnb = profit_scorer(y_test, y_pred)
    score_list_gnb.append(score_gnb)
    
    tree = DecisionTreeClassifier()
    tree.fit(X_train, y_train)
    y_pred = tree.predict(X_test)
    score_tr = profit_scorer(y_test, y_pred)
    score_list_tr.append(score_tr)  
    
    knn = KNeighborsClassifier()
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    score_knn = profit_scorer(y_test, y_pred)
    score_list_knn.append(score_knn)  
    
    ranfo = RandomForestClassifier()
    ranfo.fit(X_train, y_train)
    y_pred = ranfo.predict(X_test)
    score_ranfo = profit_scorer(y_test, y_pred)
    score_list_ranfo.append(score_ranfo)  
    
    mlp = MLPClassifier()
    mlp.fit(X_train, y_train)
    y_pred = mlp.predict(X_test)
    score_mlp = profit_scorer(y_test, y_pred)
    score_list_mlp.append(score_mlp) 
    
    svc = SVC()
    svc.fit(X_train, y_train)
    y_pred = svc.predict(X_test)
    score_svc = profit_scorer(y_test, y_pred)
    score_list_svc.append(score_svc) 
    
    
print('Model with engineered features (no other preprocessing:)')
print('XGB \t', sum(score_list_xgb))
print('LR \t', sum(score_list_lr))
print('GNB \t', sum(score_list_gnb))
print('TR \t', sum(score_list_tr))
print('KNN \t', sum(score_list_knn))
print('RANFO \t', sum(score_list_ranfo))
print('MLP \t', sum(score_list_mlp))
print('SVC \t', sum(score_list_svc))



# Why do these tree results vary between executions of this cell(s)? We are using the same cross val seed...

In [None]:
# In the following you might want to commented GNB, KNN, MLP and SVC out to decrease the computing time.
# They preform consistently the worse. Later on, when working on further hyperparameter tuning, they might be included again

In [None]:
# Own Model Approach - feature evaluation
# I know that its pretty quick and dirty and that we might want to use less stochastic and more sophisticated approaches
# like PCA, etc.  


# Init

cv = StratifiedKFold(n_splits=10, random_state=42)
dictio = {'totalScanned','avgTimePerScan','avgValuePerScan','withoutRegisPerPosition','quantiModPerPosition',
          'lineItemVoidsPerTotal','withoutRegisPerTotal','quantiModPerTotal','lineItemVoidsPerTime','withoutRegisPerTime',
          'quantiModPerTime'}

# Eval loop

for f in dictio:
    X_wf = X
    X_wf=X.drop(columns=f, axis=1)

    # Cross Val Init

    score_list_xgb = []
    score_list_lr = []
    score_list_gnb = []
    score_list_tr = []
    score_list_knn = []
    score_list_ranfo = []
    score_list_mlp = []
    score_list_svc = []

    # Cross Val Loop

    for train_index, test_index in cv.split(X_wf,y): 
        X_train, X_test = X_wf.loc[train_index], X_wf.loc[test_index]
        y_train, y_test = y.loc[train_index], y.loc[test_index]

        
        # Models per split   

        xgb = XGBClassifier()
        xgb.fit(X_train, y_train)
        y_pred = xgb.predict(X_test)
        score_xgb = profit_scorer(y_test, y_pred)
        score_list_xgb.append(score_xgb)

        lr = LogisticRegression(C=2)
        lr.fit(X_train, y_train)
        y_pred = lr.predict(X_test)
        score_lr = profit_scorer(y_test,y_pred)
        score_list_lr.append(score_lr)

        naive_bayes = GaussianNB()
        naive_bayes.fit(X_train, y_train)
        y_pred = naive_bayes.predict(X_test)
        score_gnb = profit_scorer(y_test, y_pred)
        score_list_gnb.append(score_gnb)

        tree = DecisionTreeClassifier()
        tree.fit(X_train, y_train)
        y_pred = tree.predict(X_test)
        score_tr = profit_scorer(y_test, y_pred)
        score_list_tr.append(score_tr)
        
        knn = KNeighborsClassifier()
        knn.fit(X_train, y_train)
        y_pred = knn.predict(X_test)
        score_knn = profit_scorer(y_test, y_pred)
        score_list_knn.append(score_knn)  

        ranfo = RandomForestClassifier()
        ranfo.fit(X_train, y_train)
        y_pred = ranfo.predict(X_test)
        score_ranfo = profit_scorer(y_test, y_pred)
        score_list_ranfo.append(score_ranfo)  

        mlp = MLPClassifier()
        mlp.fit(X_train, y_train)
        y_pred = mlp.predict(X_test)
        score_mlp = profit_scorer(y_test, y_pred)
        score_list_mlp.append(score_mlp) 

        svc = SVC()
        svc.fit(X_train, y_train)
        y_pred = svc.predict(X_test)
        score_svc = profit_scorer(y_test, y_pred)
        score_list_svc.append(score_svc) 
        

    print('\n leaving out:', f)
    print('XGB \t', sum(score_list_xgb))
    print('LR \t', sum(score_list_lr))
    print('GNB \t', sum(score_list_gnb))
    print('TR \t', sum(score_list_tr))
    print('KNN \t', sum(score_list_knn))
    print('RANFO \t', sum(score_list_ranfo))
    print('MLP \t', sum(score_list_mlp))
    print('SVC \t', sum(score_list_svc))

# Following regards to xgb: 
# These feature seem to have the most influence (leaving out results in a lower cost score)
# totalScanned = 5
# quantiModPerTime = 130
# quantiModPerTotal = 160
# withoutRegisPerPositio = 165
# withoutRegisPerPositio = 165
# withoutRegisPerTime = 170

# These feature seem to have the least influence (leaving out results in a similar cost score)
# avgTimePerScan = 230 at first glance leaving avgTimePerScan out of the model would not enhance xgb but worsen lr
# avgValuePerScan = 215
# lineItemVoidsPerTotal = 205
# quantiModPerPosition = 205
# withoutRegisPerTotal = 195
# lineItemVoidsPerTime = 190


In [None]:
# Own Model Approach - only scaling


# Following outcommented lines: I tryed to figure out whether the cost score differs, when trustlevel is not scaled
# (It probably wont make a difference, but I guessed its more of a ordinal variable)
# I failed because of emergence of NaNs (dont know yet where they come from)
    

# Init

score_list_xgb = []
score_list_lr = []
score_list_gnb = []
score_list_tr = []
score_list_knn = []
score_list_ranfo = []
score_list_mlp = []
score_list_svc = []

# Cross Val Init

cv = StratifiedKFold(n_splits=10, random_state=42)


# Cross Val Loop

for train_index, test_index in cv.split(X,y):   
    X_train, X_test = X.loc[train_index], X.loc[test_index]
    y_train, y_test = y.loc[train_index], y.loc[test_index]
    
    # Scaling per split

    X_train_unscal = X_train
    #X_train_unscal = X_train.drop(['trustLevel'], axis=1)
    scaler = prep.StandardScaler()
    X_train_scal_np = scaler.fit_transform(X=X_train_unscal) # scaled data as np-array

    X_train_cols = X_train_unscal.columns
    X_train_scal = pd.DataFrame(X_train_scal_np, columns=X_train_cols)
    #X_train_scal_unfinished = pd.DataFrame(X_train_scal_np, columns=X_train_cols)
    #X_train_scal= pd.concat([X_train_scal_unfinished, X_train['trustLevel']])
    
    
    X_test_unscal = X_test
    #X_test_unscal = X_test.drop(['trustLevel'], axis=1)
    scaler = prep.StandardScaler()
    X_test_scal_np = scaler.fit_transform(X=X_test_unscal) # scaled data as np-array

    X_test_cols = X_test_unscal.columns
    X_test_scal = pd.DataFrame(X_test_scal_np, columns=X_test_cols)
    #X_test_scal_unfinished = pd.DataFrame(X_test_scal_np, columns=X_train_cols)
    #X_test_scal= pd.concat([X_test_scal_unfinished, X_test['trustLevel']])    
    
    
    # Models per split   

    xgb = XGBClassifier()
    xgb.fit(X_train_scal, y_train)
    y_pred = xgb.predict(X_test_scal)
    score_xgb = profit_scorer(y_test, y_pred)
    score_list_xgb.append(score_xgb)
    
    lr = LogisticRegression(C=2)
    lr.fit(X_train_scal, y_train)
    y_pred = lr.predict(X_test_scal)
    score_lr = profit_scorer(y_test,y_pred)
    score_list_lr.append(score_lr)
    
    naive_bayes = GaussianNB()
    naive_bayes.fit(X_train_scal, y_train)
    y_pred = naive_bayes.predict(X_test_scal)
    score_gnb = profit_scorer(y_test, y_pred)
    score_list_gnb.append(score_gnb)
    
    tree = DecisionTreeClassifier()
    tree.fit(X_train_scal, y_train)
    y_pred = tree.predict(X_test_scal)
    score_tr = profit_scorer(y_test, y_pred)
    score_list_tr.append(score_tr)
    
    knn = KNeighborsClassifier()
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    score_knn = profit_scorer(y_test, y_pred)
    score_list_knn.append(score_knn)  

    ranfo = RandomForestClassifier()
    ranfo.fit(X_train, y_train)
    y_pred = ranfo.predict(X_test)
    score_ranfo = profit_scorer(y_test, y_pred)
    score_list_ranfo.append(score_ranfo)  

    mlp = MLPClassifier()
    mlp.fit(X_train, y_train)
    y_pred = mlp.predict(X_test)
    score_mlp = profit_scorer(y_test, y_pred)
    score_list_mlp.append(score_mlp) 

    svc = SVC()
    svc.fit(X_train, y_train)
    y_pred = svc.predict(X_test)
    score_svc = profit_scorer(y_test, y_pred)
    score_list_svc.append(score_svc)

print('Model with engineered features - scaled')
print('XGB \t', sum(score_list_xgb))
print('LR \t', sum(score_list_lr))
print('GNB \t', sum(score_list_gnb))
print('TR \t', sum(score_list_tr))
print('KNN \t', sum(score_list_knn))
print('RANFO \t', sum(score_list_ranfo))
print('MLP \t', sum(score_list_mlp))
print('SVC \t', sum(score_list_svc))

# Notice LR = 230!
# I dont really know, why only LR benefits from additional scaling. I guess the DMC send us already scaled data

In [None]:
# Own Model Approach - upsampling (always worse off)

# I know that its pretty quick and dirty and that we might want to use less stochastic approaches


# Init

cv = StratifiedKFold(n_splits=10, random_state=42)
print('Model with engineered features - upsampled (no scaling)')

# Upsampling Loop

for counter_ratio in range(70,110,3):   # super dirty I know. I just wanted more refined steps than my original range(7,10)
                                        # for loops only allow integer steps --> weird factor 10 fuckering
    def upsam_paras(majo_len):          # upsample parameters; length of majority class
        upsam_ratio = majo_len/(counter_ratio/10)   # '/10' would drop out if with original range(7,10). Still dirty, I know.
        return int(upsam_ratio)
    
    
    # Cross Val Init
    
    score_list_xgb = []
    score_list_lr = []
    score_list_gnb = []
    score_list_tr = []
    score_list_knn = []
    score_list_ranfo = []
    score_list_mlp = []
    score_list_svc = []


    # Cross Val Loop

    for train_index, test_index in cv.split(X,y):
        #X_train, X_test = X.loc[train_index], X.loc[test_index]
        #y_train, y_test = y.loc[train_index], y.loc[test_index]


        # Upsampling per split 

        Xy_train = train.loc[train_index]
        Xy_test = train.loc[test_index]

        Xy_train_majo = Xy_train[Xy_train[('fraud')]==0]
        Xy_tain_mino = Xy_train[Xy_train[('fraud')]==1]
        Xy_train_mino_upsamp = resample(Xy_tain_mino, replace = True, n_samples=upsam_paras(len(Xy_train[Xy_train[('fraud')]==0])), random_state= 123)

        Xy_train_balanced = pd.concat([Xy_train_majo, Xy_train_mino_upsamp])  

        X_train = Xy_train_balanced.drop(['fraud'], axis=1)
        y_train = Xy_train_balanced['fraud']

        X_test = Xy_test.drop(['fraud'], axis=1)
        y_test = Xy_test['fraud']


        # Models per split   

        xgb = XGBClassifier()
        xgb.fit(X_train, y_train)
        y_pred = xgb.predict(X_test)
        score_xgb = profit_scorer(y_test, y_pred)
        score_list_xgb.append(score_xgb)

        lr = LogisticRegression(C=2)
        lr.fit(X_train, y_train)
        y_pred = lr.predict(X_test)
        score_lr = profit_scorer(y_test,y_pred)
        score_list_lr.append(score_lr)

        naive_bayes = GaussianNB()
        naive_bayes.fit(X_train, y_train)
        y_pred = naive_bayes.predict(X_test)
        score_gnb = profit_scorer(y_test, y_pred)
        score_list_gnb.append(score_gnb)

        tree = DecisionTreeClassifier()
        tree.fit(X_train, y_train)
        y_pred = tree.predict(X_test)
        score_tr = profit_scorer(y_test, y_pred)
        score_list_tr.append(score_tr)
        
        knn = KNeighborsClassifier()
        knn.fit(X_train, y_train)
        y_pred = knn.predict(X_test)
        score_knn = profit_scorer(y_test, y_pred)
        score_list_knn.append(score_knn)  

        ranfo = RandomForestClassifier()
        ranfo.fit(X_train, y_train)
        y_pred = ranfo.predict(X_test)
        score_ranfo = profit_scorer(y_test, y_pred)
        score_list_ranfo.append(score_ranfo)  

        mlp = MLPClassifier()
        mlp.fit(X_train, y_train)
        y_pred = mlp.predict(X_test)
        score_mlp = profit_scorer(y_test, y_pred)
        score_list_mlp.append(score_mlp) 

        svc = SVC()
        svc.fit(X_train, y_train)
        y_pred = svc.predict(X_test)
        score_svc = profit_scorer(y_test, y_pred)
        score_list_svc.append(score_svc)

    print('upsampled to:', upsam_paras(len(Xy_train[Xy_train[('fraud')]==0])))
    print('XGB \t', sum(score_list_xgb))
    print('LR \t', sum(score_list_lr))
    print('GNB \t', sum(score_list_gnb))
    print('TR \t', sum(score_list_tr))
    print('KNN \t', sum(score_list_knn))
    print('RANFO \t', sum(score_list_ranfo))
    print('MLP \t', sum(score_list_mlp))
    print('SVC \t', sum(score_list_svc))

print('Unsampled number of minority - frauds:', len(Xy_train[Xy_train[('fraud')]==1]))

# range(1,10):
# upsampled to: 1598
# XGB 	 -165
# upsampled to: 799
# XGB 	 -30
# upsampled to: 532
# XGB 	 50
# upsampled to: 399
# XGB 	 25
# upsampled to: 319
# XGB 	 15
# upsampled to: 266
# XGB 	 55
# upsampled to: 228
# XGB 	 45
# upsampled to: 199
# XGB 	 120             sweet spot?
# upsampled to: 177
# XGB 	 50

In [None]:
# Own Model Approach - upsampling and scaling

# I know that its pretty quick and dirty and that we might want to use less stochastic approaches


# Init

cv = StratifiedKFold(n_splits=10, random_state=42)
print('Model with engineered features - upsampled & scaled)')

# Upsampling Loop

for counter_ratio in range(70,110,3):   # super dirty I know. I just wanted more refined steps than my original range(7,10)

    def upsam_paras(majo_len):
        upsam_ratio = majo_len/(counter_ratio/10)   # '/10' would drop out if with original range(7,10). Still dirty, I know.
        return int(upsam_ratio)
    
    
    # Cross Val Init
    
    score_list_xgb = []
    score_list_lr = []
    score_list_gnb = []
    score_list_tr = []
    score_list_knn = []
    score_list_ranfo = []
    score_list_mlp = []
    score_list_svc = []


    # Cross Val Loop

    for train_index, test_index in cv.split(X,y):
        #X_train, X_test = X.loc[train_index], X.loc[test_index]
        #y_train, y_test = y.loc[train_index], y.loc[test_index]


        # Upsampling per split 

        Xy_train = train.loc[train_index]
        Xy_test = train.loc[test_index]

        Xy_train_majo = Xy_train[Xy_train[('fraud')]==0]
        Xy_tain_mino = Xy_train[Xy_train[('fraud')]==1]
        Xy_train_mino_upsamp = resample(Xy_tain_mino, replace = True, n_samples=upsam_paras(len(Xy_train[Xy_train[('fraud')]==0])), random_state= 123)

        Xy_train_balanced = pd.concat([Xy_train_majo, Xy_train_mino_upsamp])  

        X_train = Xy_train_balanced.drop(['fraud'], axis=1)
        y_train = Xy_train_balanced['fraud']

        X_test = Xy_test.drop(['fraud'], axis=1)
        y_test = Xy_test['fraud']
        
            # Scaling per split

        X_train_unscal = X_train
        #X_train_unscal = X_train.drop(['trustLevel'], axis=1)
        scaler = prep.StandardScaler()
        X_train_scal_np = scaler.fit_transform(X=X_train_unscal) # scaled data as np-array

        X_train_cols = X_train_unscal.columns
        X_train_scal = pd.DataFrame(X_train_scal_np, columns=X_train_cols)
        #X_train_scal_unfinished = pd.DataFrame(X_train_scal_np, columns=X_train_cols)
        #X_train_scal= pd.concat([X_train_scal_unfinished, X_train['trustLevel']])


        X_test_unscal = X_test
        #X_test_unscal = X_test.drop(['trustLevel'], axis=1)
        scaler = prep.StandardScaler()
        X_test_scal_np = scaler.fit_transform(X=X_test_unscal) # scaled data as np-array

        X_test_cols = X_test_unscal.columns
        X_test_scal = pd.DataFrame(X_test_scal_np, columns=X_test_cols)
        #X_test_scal_unfinished = pd.DataFrame(X_test_scal_np, columns=X_train_cols)
        #X_test_scal= pd.concat([X_test_scal_unfinished, X_test['trustLevel']])



        # Models per split   

        xgb = XGBClassifier()
        xgb.fit(X_train_scal, y_train)
        y_pred = xgb.predict(X_test_scal)
        score_xgb = profit_scorer(y_test, y_pred)
        score_list_xgb.append(score_xgb)

        lr = LogisticRegression(C=2)
        lr.fit(X_train_scal, y_train)
        y_pred = lr.predict(X_test_scal)
        score_lr = profit_scorer(y_test,y_pred)
        score_list_lr.append(score_lr)

        naive_bayes = GaussianNB()
        naive_bayes.fit(X_train_scal, y_train)
        y_pred = naive_bayes.predict(X_test_scal)
        score_gnb = profit_scorer(y_test, y_pred)
        score_list_gnb.append(score_gnb)

        tree = DecisionTreeClassifier()
        tree.fit(X_train_scal, y_train)
        y_pred = tree.predict(X_test_scal)
        score_tr = profit_scorer(y_test, y_pred)
        score_list_tr.append(score_tr)
        
        knn = KNeighborsClassifier()
        knn.fit(X_train, y_train)
        y_pred = knn.predict(X_test)
        score_knn = profit_scorer(y_test, y_pred)
        score_list_knn.append(score_knn)  

        ranfo = RandomForestClassifier()
        ranfo.fit(X_train, y_train)
        y_pred = ranfo.predict(X_test)
        score_ranfo = profit_scorer(y_test, y_pred)
        score_list_ranfo.append(score_ranfo)  

        mlp = MLPClassifier()
        mlp.fit(X_train, y_train)
        y_pred = mlp.predict(X_test)
        score_mlp = profit_scorer(y_test, y_pred)
        score_list_mlp.append(score_mlp) 

        svc = SVC()
        svc.fit(X_train, y_train)
        y_pred = svc.predict(X_test)
        score_svc = profit_scorer(y_test, y_pred)
        score_list_svc.append(score_svc)


    print('upsampled to:', upsam_paras(len(Xy_train[Xy_train[('fraud')]==0])))
    print('XGB \t', sum(score_list_xgb))
    print('LR \t', sum(score_list_lr))
    print('GNB \t', sum(score_list_gnb))
    print('TR \t', sum(score_list_tr))
    print('KNN \t', sum(score_list_knn))
    print('RANFO \t', sum(score_list_ranfo))
    print('MLP \t', sum(score_list_mlp))
    print('SVC \t', sum(score_list_svc))
    
print('Unsampled number of minority - frauds:', len(Xy_train[Xy_train[('fraud')]==1]))

# Preformes surprisingly bad on xgb and lr. Did I do smth wrong? 

### More Hyperparameter tuning

In [None]:
# Best model until now: XGB without scaling and no upsampling -> 230
# and LR with only scaling -> 230

In [None]:
# next up: XGB & LR Tuning, MetaCost Algorithm (Lecture 4, ensembles)