## Functions and libraries 

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from gplearn.genetic import SymbolicTransformer
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process.kernels import RBF
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
import xgboost as xgb
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.metrics import make_scorer
from sklearn.metrics import confusion_matrix
from sklearn.semi_supervised import LabelPropagation, LabelSpreading
from sklearn.decomposition import PCA 

## Functions for evaluating classifiers with cross-validation

In [2]:
# Candidate models, keyed by display name; evaluate_classification and
# cv_cost_semi iterate over this dictionary.
# Previously tried and currently disabled: "RBF SVM" (SVC(gamma=2, C=1)),
# "Random Forest" (RandomForestClassifier(n_estimators=500)) and
# "AdaBoost" (AdaBoostClassifier()).
classifiers = {
    "Logistic Regression": LogisticRegression(solver='lbfgs', max_iter=500),
    # Seeded so the randomly initialised network gives the same score on every run.
    "Neural Net": MLPClassifier(alpha=1, max_iter=1000, random_state=42),
    "XGBoost": xgb.XGBClassifier()
}

def profit_scorer(y, y_pred):
    """Return the total profit of a set of binary predictions.

    Each sample contributes a fixed payoff looked up by (predicted, actual):
    (0, 0) -> 0, (0, 1) -> -5, (1, 0) -> -25, (1, 1) -> +5.

    Parameters
    ----------
    y : iterable
        Actual labels (0/1 or booleans).
    y_pred : iterable
        Predicted labels (0/1 or booleans).

    Returns
    -------
    int
        Sum of the per-sample payoffs; 0 for empty input.
    """
    payoff = {(0, 0): 0, (0, 1): -5, (1, 0): -25, (1, 1): 5}
    total = 0
    for actual, predicted in zip(y, y_pred):
        total += payoff[(predicted, actual)]
    return total

def evaluate_classification(X, y):
    """Print the cross-validated profit of every model in ``classifiers``.

    Each classifier is evaluated with 10-fold stratified cross-validation,
    scored per fold with ``profit_scorer``; the fold profits are summed and
    printed, one line per classifier.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix.
    y : array-like of shape (n_samples,)
        Binary labels.
    """
    # Folds are not shuffled, so random_state must stay unset: scikit-learn
    # >= 0.24 raises ValueError for random_state with shuffle=False, and the
    # seed never influenced the deterministic, unshuffled split anyway.
    cv = StratifiedKFold(n_splits=10)
    profit_scoring = make_scorer(profit_scorer, greater_is_better=True)

    for name, clf in classifiers.items():
        fold_profits = cross_validate(clf, X, y=y, cv=cv, scoring=profit_scoring)['test_score']
        result = sum(fold_profits)
        print(f"{name}: test core = {result} ")
def code_for_test(X, y):
    """Evaluate a feature set with 10-fold stratified cross-validation.

    Convenience entry point that forwards to ``evaluate_classification``,
    which prints the profit obtained by each classifier.

    Parameters
    ----------
    X : training-set features (after any feature generation)
    y : training-set labels
    """
    evaluate_classification(X=X, y=y)

## functions for feature selection

In [3]:
def feature_selection(X, y):
    """Print feature-importance listings from four linear approaches.

    For each model the name is printed, the model is fitted on (X, y), and
    the features are listed one per line:

    * ``logistic`` - signed coefficient (rounded to 4 places), largest first.
    * ``RFE`` - recursive feature elimination keeping 5 features; prints the
      number of selected features, the ranking array, then rank/feature
      pairs in ascending rank order.
    * ``ridge`` / ``Lasso`` - absolute coefficient (rounded to 4 places),
      largest first.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; ``X.columns`` supplies the feature names.
    y : array-like
        Target values.
    """
    model = LogisticRegression(solver='lbfgs', max_iter=300)
    models = {
        'logistic': model,
        # n_features_to_select is passed by keyword: recent scikit-learn
        # releases no longer accept it as a positional argument.
        'RFE': RFE(model, n_features_to_select=5),
        'ridge': Ridge(alpha=1.0),
        'Lasso': Lasso(),
    }
    for k, v in models.items():
        print(k)
        fit = v.fit(X, y)
        if k == 'RFE':
            print("Num Features: %s" % (fit.n_features_))
            print("Feature Ranking: %s" % (fit.ranking_))
            for i, j in sorted(zip(fit.ranking_, X.columns), reverse=False):
                print(i, j)

        elif k == 'logistic':
            # coef_ has one row for the binary problem; flatten it to one value per feature.
            for i, j in sorted(zip(map(lambda x: round(x, 4), fit.coef_.reshape(-1)), X.columns), reverse=True):
                print(i, j)
        else:
            for i, j in sorted(zip(map(lambda x: round(x, 4), abs(fit.coef_)), X.columns), reverse=True):
                print(i, j)

## Functions for automatic feature generation

In [4]:
# Primitive operations the genetic program may combine into new features.
function_set = ['add', 'sub', 'mul', 'div', 'log', 'sqrt', 'abs', 'neg', 'max', 'min']


def gp(X, y, gen, n_com):
    """Fit a gplearn SymbolicTransformer and return it with the generated features.

    Parameters
    ----------
    X : pandas.DataFrame
        Input features; the column labels are passed on as feature names.
    y : array-like
        Target used to score candidate programs.
    gen : int
        Number of generations to evolve.
    n_com : int
        Number of generated features (components) to return.

    Returns
    -------
    tuple
        ``(fitted_transformer, transformed_X)``.
    """
    settings = dict(
        generations=gen,
        population_size=1000,
        hall_of_fame=100,
        n_components=n_com,
        function_set=function_set,
        parsimony_coefficient=0.0005,
        max_samples=0.9,
        verbose=1,
        random_state=42,
        n_jobs=8,
        feature_names=X.columns,
    )
    transformer = SymbolicTransformer(**settings)
    generated = transformer.fit_transform(X, y)
    return transformer, generated
    

## functions for semi-supervised learning

In [5]:
def classify(X_train, X_test, y_train, clf):
    """Fit ``clf`` on the training data and return its predictions for ``X_test``.

    ``clf`` must expose ``fit(X, y)`` returning the fitted estimator and
    ``predict(X)``.
    """
    fitted = clf.fit(X_train, y_train)
    return fitted.predict(X_test)

def cv_cost_semi(X, X_test_other, y):
    """Cross-validated profit when each training fold is enlarged with pseudo-labelled rows.

    For each of 10 stratified folds:

    1. a LabelSpreading model is fitted on the labelled training split and
       used to predict labels for ``X_test_other``;
    2. every model in ``classifiers`` is trained on the training split plus
       those pseudo-labelled rows;
    3. its predictions on the held-out split are scored with ``profit_scorer``.

    The profit accumulated over all folds is printed per classifier.

    Parameters
    ----------
    X : pandas.DataFrame or 2-D array
        Labelled features.
    X_test_other : pandas.DataFrame or 2-D array
        Unlabelled features. Columns are matched to ``X`` by position, so both
        inputs must hold the same features in the same order.
    y : array-like of shape (n_samples,)
        Binary labels for ``X``.

    Raises
    ------
    ValueError
        If ``X`` and ``X_test_other`` have a different number of columns.
    """
    # Plain arrays give positional row indexing for the fold indices (the old
    # y[train_index] was a label-based lookup) and make the stacking below
    # independent of column-label differences between the two frames.
    X_values = np.asarray(X, dtype=float)
    X_unlabelled = np.asarray(X_test_other, dtype=float)
    y_values = np.asarray(y)
    if X_values.shape[1] != X_unlabelled.shape[1]:
        raise ValueError(
            f"X has {X_values.shape[1]} features but X_test_other has {X_unlabelled.shape[1]}"
        )

    # Unshuffled folds: random_state must stay unset (ValueError in scikit-learn >= 0.24).
    cv = StratifiedKFold(n_splits=10)
    # One accumulator per active classifier, so the keys cannot drift from `classifiers`.
    cost = {name: 0 for name in classifiers}
    for train_index, test_index in cv.split(X_values, y_values):
        # get the split
        X_train, X_test = X_values[train_index], X_values[test_index]
        y_train, y_test = y_values[train_index], y_values[test_index]
        # generate the pseudo-labels for the unlabelled rows
        # NOTE(review): the model only sees labelled rows during fit; the
        # unlabelled rows are merely predicted afterwards - confirm this is intended.
        model = LabelSpreading(alpha=0.2, gamma=10, kernel='rbf', max_iter=30, n_jobs=None, n_neighbors=7, tol=0.001)
        y_new_label = model.fit(X_train, y_train).predict(X_unlabelled)
        # Stack features and labels row-wise. The previous
        # y_train.append(pd.DataFrame(...)) produced a two-column frame padded
        # with NaN instead of a label vector, and .append is gone in pandas 2.
        X_all = np.vstack([X_train, X_unlabelled])
        y_all = np.concatenate([y_train, y_new_label])
        # evaluation
        for name, clf in classifiers.items():
            pred = classify(X_all, X_test, y_all, clf)
            cost[name] += profit_scorer(y_test, pred > 0.5)
    print(f'cost = {cost}')

## functions for cost-sensitive xgboost

In [6]:
def logistic_obj(y_hat, dtrain, alpha=5, beta=25):
    """Gradient and Hessian of a class-weighted logistic loss (custom xgboost objective).

    With p = sigmoid(y_hat), the per-sample loss is
    ``-(alpha * y * log(p) + beta * (1 - y) * log(1 - p))``: ``alpha`` weights
    errors on positive samples (false negatives) and ``beta`` errors on
    negative samples (false positives).

    Parameters
    ----------
    y_hat : raw (pre-sigmoid) predictions
    dtrain : object whose ``get_label()`` returns the true labels
    alpha, beta : weights of the positive and negative class terms

    Returns
    -------
    tuple
        ``(grad, hess)`` with respect to ``y_hat``.
    """
    labels = dtrain.get_label()
    prob = 1. / (1. + np.exp(-y_hat))
    # Per-sample weight: alpha where the label is 1, beta where it is 0.
    weight = beta + alpha*labels - beta*labels
    gradient = prob * weight - alpha*labels  # alpha*(p-1)*y + beta*p*(1-y)
    hessian = prob * (1 - prob) * weight
    return gradient, hessian

def err_rate(pred, dtrain, alpha=5, beta=25):
    """Mean class-weighted log-loss of raw predictions (custom xgboost metric).

    Computes ``mean(-(alpha * y * log(p) + beta * (1 - y) * log(1 - p)))`` with
    ``p = sigmoid(pred)``; the defaults match those of ``logistic_obj``.

    Parameters
    ----------
    pred : array-like
        Raw (pre-sigmoid) predictions.
    dtrain : object whose ``get_label()`` returns the true labels
    alpha, beta : weights of the positive and negative class terms

    Returns
    -------
    tuple
        ``('error', value)``.
    """
    y = np.asarray(dtrain.get_label(), dtype=float)
    margin = np.asarray(pred, dtype=float)
    # log(sigmoid(m)) = -log(1 + exp(-m)) and log(1 - sigmoid(m)) = -log(1 + exp(m)).
    # logaddexp evaluates these without forming the sigmoid first, so a saturated
    # prediction no longer gives log(0) = -inf and 0 * -inf = NaN.
    log_p = -np.logaddexp(0.0, -margin)
    log_not_p = -np.logaddexp(0.0, margin)
    loss_fn = y * log_p
    loss_fp = (1.0 - y) * log_not_p
    return 'error', np.sum(-(alpha * loss_fn + beta * loss_fp)) / len(y)

def cross_validation(X_train, X_test, y_train, y_test, depth, num_round):
    """Train a cost-sensitive xgboost model on one split and score the other.

    The booster is trained with ``logistic_obj`` as objective and its output
    for ``X_test`` is passed through a sigmoid before being returned.

    Parameters
    ----------
    X_train, y_train : training features and labels
    X_test, y_test : validation features and labels (the labels are only
        attached to the validation DMatrix)
    depth : value used for ``max_depth``
    num_round : number of boosting rounds

    Returns
    -------
    Sigmoid-transformed predictions for ``X_test``.
    """
    # NOTE(review): `silent` and `feval` are reported as deprecated in newer
    # xgboost releases - confirm against the installed version.
    booster_params = dict(max_depth=depth, eta=0.2, silent=1, seed=42, scale_pos_weight=1)
    train_matrix = xgb.DMatrix(X_train, label=y_train)
    val_matrix = xgb.DMatrix(X_test, label=y_test)
    booster = xgb.train(booster_params, train_matrix, num_round, obj=logistic_obj, feval=err_rate)
    margin = booster.predict(val_matrix)
    return 1. / (1. + np.exp(-margin))
def cv_cost_xg(X, y, depth, rounds):
    """Print the 10-fold cross-validated profit of the cost-sensitive xgboost model.

    For each stratified fold the model is trained through ``cross_validation``
    and its predictions, thresholded at 0.5, are scored with ``profit_scorer``;
    the profits of all folds are summed and printed.

    Parameters
    ----------
    X : pandas.DataFrame or 2-D array
        Feature matrix.
    y : array-like of shape (n_samples,)
        Binary labels.
    depth : tree depth passed on to ``cross_validation``
    rounds : number of boosting rounds passed on to ``cross_validation``
    """
    # Unshuffled folds: random_state must stay unset (ValueError in scikit-learn >= 0.24).
    cv = StratifiedKFold(n_splits=10)
    cost = 0
    if isinstance(X, pd.DataFrame):
        X = X.values
    # Positional indexing for the labels as well; y[train_index] on a Series is
    # a label-based lookup and only works with a default 0..n-1 index.
    y_values = np.asarray(y)
    for train_index, test_index in cv.split(X, y_values):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y_values[train_index], y_values[test_index]
        pred = cross_validation(X_train, X_test, y_train, y_test, depth, rounds)
        cost += profit_scorer(y_test, pred > 0.5)

    print(f'cost = {cost}')

## Dataset for test
Prepare different input datasets and evaluate each one with 10-fold stratified cross-validation.


**1. X (raw data)**

In [7]:
# Load the pipe-separated training and test sets.
train_data = pd.read_csv('data/train.csv', sep='|')
test_data = pd.read_csv('data/test.csv', sep='|')
# The printed column counts are the raw frame widths, so the training count
# includes the 'fraud' label column.
print(f'Train set has {train_data.shape[0]} entries and {train_data.shape[1]} features')
print(f'Test set has {test_data.shape[0]} entries and {test_data.shape[1]} features')
# Split the training frame into the label and the float-typed features.
y = train_data.loc[:, 'fraud']
X = train_data.drop('fraud', axis=1).astype(float)


Train set has 1879 entries and 10 features
Test set has 498121 entries and 9 features


In [20]:
# encode trustlevel 1,2,other
X_encode = pd.get_dummies(X, columns=['trustLevel'], prefix='trust')
X_encode = X_encode.assign(trust_other = (X_encode['trust_3.0']+X_encode['trust_4.0']+X_encode['trust_5.0']+X_encode['trust_6.0']))
X_encode = X_encode.drop(columns=['trust_3.0', 'trust_4.0', 'trust_5.0','trust_6.0'])
## test data
X_encode_test = pd.get_dummies(test_data, columns=['trustLevel'], prefix='trust')
X_encode_test = X_encode_test.assign(trust_other = (X_encode_test['trust_3']+X_encode_test['trust_4']+X_encode_test['trust_5']+X_encode_test['trust_6']))
X_encode_test = X_encode_test.drop(columns=['trust_3', 'trust_4', 'trust_5','trust_6'])
# normalize
fit_minmax = MinMaxScaler()
X_encode_norm = pd.DataFrame(fit_minmax.fit_transform(X_encode), columns=X_encode.columns, index=X_encode.index)
X_encode_norm_test = pd.DataFrame(fit_minmax.transform(X_encode_test), columns=X_encode_test.columns, index=X_encode_test.index)

## Manual feature

In [31]:
# Engineered feature: number of scanned items = total scan time * items scanned per second.
X_manual_nodrop_raw = X_encode.assign(no_item=X_encode.totalScanTimeInSeconds * X_encode.scannedLineItemsPerSecond)
X_manual_test_raw = X_encode_test.assign(no_item=X_encode_test.totalScanTimeInSeconds * X_encode_test.scannedLineItemsPerSecond)
# Train and test columns are matched by position below.
# assumes both frames hold the same features in the same order - TODO confirm
assert X_manual_nodrop_raw.shape[1] == X_manual_test_raw.shape[1], "train and test must have the same number of features"

# Fit the scaler on the training data only and apply the same min/max to the
# test data. Re-fitting on the test set (as before) put train and test on
# different scales and left the shared `fit_minmax` fitted on test data.
manual_scaler = MinMaxScaler()
X_manual_nodrop = pd.DataFrame(
    manual_scaler.fit_transform(X_manual_nodrop_raw.to_numpy(dtype=float)),
    columns=X_manual_nodrop_raw.columns,
    index=X_manual_nodrop_raw.index,
)
# The test frame takes the training column labels so later steps (PCA,
# row-wise stacking with the training data) see identical feature names.
X_manual_test_nodrop = pd.DataFrame(
    manual_scaler.transform(X_manual_test_raw.to_numpy(dtype=float)),
    columns=X_manual_nodrop_raw.columns,
    index=X_manual_test_raw.index,
)

# Min-max scaling is per column, so dropping a column needs no second scaling pass.
# Alternative tried earlier: drop(columns=['grandTotal', 'quantityModifications']).
X_manual = X_manual_nodrop.drop(columns=['lineItemVoidsPerPosition'])
X_manual_test = X_manual_test_nodrop.drop(columns=['lineItemVoidsPerPosition'])


pca = PCA(n_components='mle')
X_manual_PCA = pca.fit_transform(X_manual)
X_manual_test_PCA = pca.transform(X_manual_test)

In [28]:
# Scaled encoded features plus the engineered `no_item` column, nothing dropped.
code_for_test(X=X_manual_nodrop, y=y)

Logistic Regression: test core = 225 
Neural Net: test core = 220 
XGBoost: test core = 205 


In [26]:
# Same features with `lineItemVoidsPerPosition` dropped.
code_for_test(X=X_manual, y=y)

Logistic Regression: test core = 260 
Neural Net: test core = 220 
XGBoost: test core = 145 


In [27]:
# PCA projection (n_components='mle') of the manual feature set.
code_for_test(X=X_manual_PCA, y=y)

Logistic Regression: test core = 260 
Neural Net: test core = 210 
XGBoost: test core = 70 


## auto generate feature

In [38]:
# orginal data encode trust 1,2,other and normalize
# Input: original data with trustLevel encoded as 1 / 2 / other, min-max normalised.
# Evolve 40 generated features over 30 generations, then compare the scores
# with and without them.
GP, x_auto = gp(X_encode_norm, y, gen=30, n_com=40)
print('without autogeneration')
code_for_test(X=X_encode_norm, y=y)
print('autogeneration')
code_for_test(X=x_auto, y=y)
print('auto-features')
print(GP)

    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0    12.12         0.111257       19         0.507449         0.435507     21.29s
   1     9.05         0.288971        9         0.552235         0.617424      1.23m
   2    11.60         0.349211       11         0.651679         0.456674      1.59m
   3    15.44         0.416605       14         0.720632         0.548557      1.68m
   4    20.47         0.465525       26         0.713904         0.623763      1.86m
   5    23.80         0.523853       26         0.760131         0.625156      1.99m
   6    29.09         0.563434       40         0.778025         0.607471      2.07m
   7    35.17         0.612473       28          0.79819          0.49402      2.17m
   8    39.98         0.637079       52         0.795612         0.602135  

In [36]:
# drop lineItemVoidsPerPosition 
# Input: manual feature set with lineItemVoidsPerPosition dropped.
# Evolve 40 generated features over 40 generations, then compare the scores
# with and without them.
GP, x_auto_manual = gp(X_manual, y, gen=40, n_com=40)
print('without autogeneration')
code_for_test(X=X_manual, y=y)
print('autogeneration')
code_for_test(X=x_auto_manual, y=y)
print('auto-features')
print(GP)

    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0    12.12         0.116677       11          0.67572         0.542619      3.87m
   1     7.93         0.318342        5         0.740263         0.669107      1.87m
   2     9.59         0.426899        5         0.755607         0.530965      2.57m
   3     8.76         0.589247       22         0.783131         0.590051      2.65m
   4    10.95          0.63786       16         0.796931         0.638575      2.45m
   5    16.49         0.676414       19         0.804552         0.629852      2.60m
   6    20.32         0.689028       48         0.823728         0.678536      2.74m
   7    23.84         0.704908       50         0.821595         0.669015      2.93m
   8    24.05         0.699239       58         0.833976         0.643078  

In [37]:
# without drop
# Input: manual feature set with no column dropped.
# Note: this overwrites GP and x_auto_manual from the previous cell.
GP, x_auto_manual = gp(X_manual_nodrop, y, gen=40, n_com=40)
print('without autogeneration')
code_for_test(X=X_manual_nodrop, y=y)
print('autogeneration')
code_for_test(X=x_auto_manual, y=y)
print('auto-features')
print(GP)

    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0    11.86         0.121434       11         0.598137         0.466999     31.44s
   1     8.38         0.298296       17         0.729586         0.529154      1.77m
   2    12.00         0.394533       19         0.753587         0.746162      2.48m
   3    14.79         0.527792       29          0.77866         0.706179      2.58m
   4    20.16         0.621892       20         0.805338         0.628288      2.54m
   5    23.15         0.658883       71           0.8197         0.782564      2.70m
   6    26.46         0.708991       46         0.831469         0.651422      3.05m
   7    27.50         0.739954       33         0.843712         0.546862      3.34m
   8    32.47         0.763343       44         0.851229         0.521246  

## semi-supervised


In [109]:
# Pseudo-label only the first 30,000 unlabelled test rows. The previous
# `.iloc[:, :30000]` sliced columns, which kept every row of the test set.
cv_cost_semi(X_manual, X_manual_test.iloc[:30000, :], y)

## XGBoost with a cost-sensitive objective
1. Better than the original XGBoost, but worse than the other algorithms
2. best score 170

In [116]:
# Cost-sensitive xgboost on the manual feature set: tree depth 3, 100 boosting rounds.
cv_cost_xg(X_manual, y, depth=3, rounds=100)

cost = 270
