## Functions and libraries 

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from gplearn.genetic import SymbolicTransformer
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process.kernels import RBF
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
import xgboost as xgb
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.metrics import make_scorer
from sklearn.metrics import confusion_matrix
from sklearn.semi_supervised import LabelPropagation, LabelSpreading

## Functions for classifier in cross classification

In [2]:
classifiers = {
#     "RBF SVM": SVC(gamma=2, C=1),
    "Logistic Regression": LogisticRegression( solver='lbfgs', max_iter = 500 ), 
    "Neural Net": MLPClassifier(alpha=1, max_iter=1000),
#     "Random Forest": RandomForestClassifier(n_estimators=500),
#     "AdaBoost" : AdaBoostClassifier(),
    "XGBoost": xgb.XGBClassifier()
}

def profit_scorer(y, y_pred):
#     print(confusion_matrix(y, y_pred))
    profit_matrix = {(0,0): 0, (0,1): -5, (1,0): -25, (1,1): 5}
    return sum(profit_matrix[(pred, actual)] for pred, actual in zip(y_pred, y))

def evaluate_classification(X, y):
    cv = StratifiedKFold(n_splits=10, random_state=42)
    profit_scoring = make_scorer(profit_scorer, greater_is_better=True)
    
    for name, clf in classifiers.items():
#         print(cross_validate(clf, X, y=y, cv=cv, scoring=profit_scoring)['test_score'])
        result = sum(cross_validate(clf, X, y=y, cv=cv, scoring=profit_scoring)['test_score'])
        print(f"{name}: test core = {result} ")
def code_for_test(X,y):
    """
    test code with 10-fold stratified cross validation
    parameters
    X: trainset features after generation
    y: trainset y label
    """
    evaluate_classification(X, y)

## functions for feature selection

In [3]:
def feature_selection(X, y, columns):
    model = LogisticRegression(solver='lbfgs', max_iter=300)
    models = {'logistic': model, 'ridge': Ridge(alpha=1.0), 'Lasso': Lasso()} #'RFE': RFE(model, 5)
    for k, v in models.items():
        print(k)
        fit = v.fit(X, y)
#         if k == 'RFE':
#             print("Num Features: %s" % (fit.n_features_))
#             print("Feature Ranking: %s" % (fit.ranking_))
#             for i, j in sorted(zip(fit.ranking_, columns), reverse=False):
#                 print(i,j)
        if k == 'logistic':
            for i, j in sorted(zip(map(lambda x: round(x, 4), fit.coef_.reshape(-1)), columns), reverse=True):
                print(i,j)
#         else:
#             for i, j in sorted(zip(map(lambda x: round(x, 4), abs(fit.coef_)), columns), reverse=True):
#                 print(i,j)

## Functions for automatic feature generation

In [39]:
function_set = ['add', 'sub', 'mul', 'div', 'log', 'sqrt', 'abs', 'neg', 'max', 'min']  # 
def gp(X, y, gen, n_com):
    gp1 = SymbolicTransformer(generations=gen, population_size=1000,
                             hall_of_fame=1000, n_components=n_com,
                             function_set=function_set,
                             parsimony_coefficient=0.0005,
                             max_samples=0.9, verbose=1,
                             random_state=42, n_jobs=3,
                             feature_names=X.columns)
    x_gp = gp1.fit_transform(X, y)
    code_for_test(x_gp, y)
    return gp1, x_gp

## functions for semi-supervised learning

In [5]:
def classify(X_train, X_test, y_train, clf):
    pred = clf.fit(X_train, y_train).predict(X_test)
    return pred

def cv_cost_semi(X, X_test_other, y):
    cv = StratifiedKFold(n_splits=10, random_state=42)
    cost = {"RBF SVM": 0,
    "Logistic Regression": 0, 
    "Neural Net": 0,
    "Random Forest": 0,
    "AdaBoost" : 0,
    "XGBoost": 0}
    for train_index, test_index in cv.split(X, y):
        # get the split
        X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
        y_train, y_test = y[train_index], y[test_index]
        # generate coresponding test label
        # model = LabelSpreading(gamma=10)
        # model = LabelPropagation(alpha=None, gamma=10, kernel='rbf', max_iter=1000,n_jobs=None, n_neighbors=7, tol=0.001)
        model = LabelSpreading(alpha=0.2, gamma=10, kernel='rbf', max_iter=30, n_jobs=None,n_neighbors=7, tol=0.001)
        y_new_label = model.fit(X_train, y_train).predict(X_test_other)
        X_all = X_train.append(X_test_other)
        
        y_all = y_train.append(pd.DataFrame(y_new_label))
        # evaluation 
        for name, clf in classifiers.items():
            pred = classify(X_all, X_test, y_all, clf)
            cost[name] += profit_scorer(y_test, pred>0.5)
    print(f'cost = {cost}')

## functions for cost-sensitive xgboost

In [6]:
def logistic_obj(y_hat, dtrain, alpha=5, beta=25): # alpha for FN beta for FP
    y = dtrain.get_label()
    pred = 1. / (1. + np.exp(-y_hat))
    grad = pred * (beta + alpha*y - beta*y) - alpha*y  # alpha*(p-1)*y + beta*p*(1-y)
    hess = pred * (1 - pred) * (beta + alpha*y - beta*y)
    return grad, hess

def err_rate(pred, dtrain):
    y = dtrain.get_label()
    pred = 1. / (1. + np.exp(-pred))
    loss_fn = y*np.log(pred)
    loss_fp = (1.0 - y)*np.log(1.0 - pred)
    return 'error', np.sum(-(5*loss_fn+25*loss_fp))/len(y)

def cross_validation(X_train, X_test, y_train, y_test, depth, num_round):
    # load data
    d_train = xgb.DMatrix(X_train, label=y_train)
    d_val = xgb.DMatrix(X_test, label=y_test)
    # build model
    param = {'max_depth': depth, 'eta': 0.2, 'silent': 1, 'seed': 42, 'scale_pos_weight':1}
#     watchlist = [(d_val, 'eval'), (d_train, 'train')]
#     model_trn = xgb.train(param, d_train, num_round, watchlist, obj=logistic_obj, feval=err_rate)
    model_trn = xgb.train(param, d_train, num_round, obj=logistic_obj, feval=err_rate)
    # prediction
    pred = model_trn.predict(d_val) 
    pred = 1. / (1. + np.exp(-pred))
    return pred
def cv_cost_xg(X,y, depth, rounds):
    cv = StratifiedKFold(n_splits=10, random_state=42)
    cost=0
    if type(X) == pd.DataFrame:
        X=X.values
    for train_index, test_index in cv.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        pred = cross_validation(X_train, X_test, y_train, y_test, depth, rounds)
#         print(f' confusion matrix: cost({profit_scorer(y_test, pred>0.5)})\n {confusion_matrix(y_test, pred>0.5)}')
        cost += profit_scorer(y_test, pred>0.5)

    print(f'cost = {cost}')

## Dataset for test
prepare different input dataset and test at 10-fold stratified cross validation set


**1. X (raw data)**

In [45]:
#raw data
train_data = pd.read_csv('data/train.csv', sep = '|')
test_data = pd.read_csv('data/test.csv', sep = '|')
print(f'Train set has {train_data.shape[0]} entries and {train_data.shape[1]} features')
print(f'Test set has {test_data.shape[0]} entries and {test_data.shape[1]} features')
y = train_data['fraud']
X = train_data.drop(columns=['fraud']).astype(float)

Train set has 1879 entries and 10 features
Test set has 498121 entries and 9 features


## manual feature

In [46]:
# delete correlate features
X_manual = X.assign(no_item = X.totalScanTimeInSeconds* X.scannedLineItemsPerSecond)\
                     .drop(columns=['valuePerSecond', 'lineItemVoidsPerPosition','scannedLineItemsPerSecond'])
X_test = test_data.assign(no_item = test_data.totalScanTimeInSeconds* test_data.scannedLineItemsPerSecond)\
                    .drop(columns=['valuePerSecond', 'lineItemVoidsPerPosition','scannedLineItemsPerSecond'])

fit_minmax = MinMaxScaler()
# normalize with encode
X_manual_encode = pd.get_dummies(X_manual, columns=['trustLevel'], prefix='trustLevel')
X_test_encode = pd.get_dummies(X_test, columns=['trustLevel'], prefix='trustLevel')
X_train_manual_enc = pd.DataFrame(fit_minmax.fit_transform(X_manual_encode), columns=X_manual_encode.columns, index=X_manual_encode.index)
X_test_manual_enc = pd.DataFrame(fit_minmax.transform(X_test_encode), columns=X_manual_encode.columns, index=X_test_encode.index)


In [47]:
code_for_test(X_train_manual_enc,y)

Logistic Regression: test core = 205 
Neural Net: test core = 180 
XGBoost: test core = 80 


## auto generate feature

In [48]:
gpresult, xgp = gp(X_train_manual_enc, y, 100, 40)

    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0    11.86         0.094761        9         0.707305         0.651601      5.87m
   1     9.58         0.276511       14         0.722464         0.744651      2.36m
   2     8.42         0.469678       14         0.783092         0.807669      3.11m
   3    11.46          0.63334       17         0.808567         0.733662      3.51m
   4    15.33          0.69389       31         0.813072         0.626655      3.13m
   5    17.37         0.712183       17         0.818479         0.644542      2.81m
   6    18.83         0.732394       21         0.823698         0.546554      2.79m
   7    19.90         0.747221       34         0.823915         0.659249      2.87m
   8    19.84         0.742844       30         0.827044          0.59718  

  94    50.18         0.770984      106          0.92963          0.95864     17.25s
  95    49.72         0.759817       70          0.93057          0.86936     12.62s
  96    50.59         0.769957       50         0.933293         0.795149      9.74s
  97    49.91          0.76969       70         0.934764         0.753906      6.78s
  98    50.27          0.76229       74         0.937564         0.865825      3.16s
  99    49.87         0.767216       70         0.938455         0.707139      0.00s
Logistic Regression: test core = 310 
Neural Net: test core = 350 
XGBoost: test core = 275 


In [47]:
code_for_test(X_gp,y)

Logistic Regression: test core = 205 
Neural Net: test core = 180 
XGBoost: test core = 80 


## semi-supervised


In [None]:
cv_cost_semi(X_train_manual_enc, X_test_manual_enc.iloc[:,:30000], y)

## Define xgboost with cost sensitive
1. better than original sgboost but worse than other algorithm 
2. best score 170

In [None]:
cv_cost_xg(X_train_manual_enc, y, 3 ,100)