## Different perspective
1. PCA
2. manual
   - encode trustLevel
   - delete 'valuePerSecond', 'scannedLineItemsPerSecond', 'lineItemVoidsPerPosition'
   - generate no. item = totalScanTimeInSeconds * scannedLineItemsPerSecond
3. automatic generation

## Code for test
prepare different input dataset and test at 10-fold stratified cross validation set
1. train_data (raw data)
2. X_train_norm_enc data (normalized and encode)
3. X_train_manual (manual)
4. X_train_PCA (PCA)

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
%matplotlib inline
from classifiers import *
from sklearn.metrics import confusion_matrix
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.semi_supervised import LabelPropagation, LabelSpreading
from pomegranate import *

%matplotlib inline

## Original


In [23]:
train_data = pd.read_csv('data/train.csv', sep = '|')
test_data = pd.read_csv('data/test.csv', sep = '|')
print(f'Train set has {train_data.shape[0]} entries and {train_data.shape[1]} features')
print(f'Test set has {test_data.shape[0]} entries and {test_data.shape[1]} features')

y = train_data['fraud']
X = train_data.drop(columns=['fraud']).astype(float)
fit_minmax = MinMaxScaler()
X_encode = pd.get_dummies(X, columns=['trustLevel'], prefix='trustLevel')
X_train_norm_enc = pd.DataFrame(fit_minmax.fit_transform(X_encode), columns=X_encode.columns, index=X_encode.index)
print(X_train_norm_enc.shape)
X_test_encode = pd.get_dummies(test_data, columns=['trustLevel'], prefix='trustLevel')
X_test_norm_enc = pd.DataFrame(fit_minmax.transform(X_test_encode), columns=X_test_encode.columns, index=X_test_encode.index)


Train set has 1879 entries and 10 features
Test set has 498121 entries and 9 features
(1879, 14)


  return self.partial_fit(X, y)


## Manual


In [24]:
## delete correlate features
X_manual = X.assign(no_item = X.totalScanTimeInSeconds* X.scannedLineItemsPerSecond)\
                     .drop(columns=['valuePerSecond', 'scannedLineItemsPerSecond', 'lineItemVoidsPerPosition'])
fit_minmax = MinMaxScaler()
X_manual_encode = pd.get_dummies(X_manual, columns=['trustLevel'], prefix='trustLevel')
X_train_manual = pd.DataFrame(fit_minmax.fit_transform(X_manual_encode), columns=X_manual_encode.columns, index=X_manual_encode.index)
print(X_train_manual.shape)

X_test = test_data.assign(no_item = test_data.totalScanTimeInSeconds* test_data.scannedLineItemsPerSecond)\
                     .drop(columns=['valuePerSecond', 'scannedLineItemsPerSecond', 'lineItemVoidsPerPosition'])
X_test_encode = pd.get_dummies(X_test, columns=['trustLevel'], prefix='trustLevel')
X_test_manual = pd.DataFrame(fit_minmax.transform(X_test_encode), columns=X_test_encode.columns, index=X_test_encode.index)
X_train_manual.columns = ['totalScanTimeInSeconds' , 'grandTotal', 'lineItemVoids',
                        'scansWithoutRegistration', 'quantityModifications', 'no_item' ,
                        'trustLevel_1', 'trustLevel_2' , 'trustLevel_3', 
                        'trustLevel_4', 'trustLevel_5', 'trustLevel_6'
                         ]
print(X_test_manual.shape)


(1879, 12)
(498121, 12)


  return self.partial_fit(X, y)


### Weak Supervised Learning - Adding the test set to the train set 

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X_train_manual, y, test_size=0.05)

train_df = pd.DataFrame(X_train)
train_df = train_df.append(X_test_manual)
train_df['fraud'] = y_train

#Ys in the lengt of X_test_manuel have to be added so that the shape is similiar
y_na = pd.DataFrame(np.ones((X_test_manual.shape[0], 1)))
y_na = np.negative(y_na)
train_df['fraud'].append(y_na)



y_adj_train = train_df['fraud']
y_adj_train = y_adj_train.fillna(-1)
X_adj_train = train_df.drop(['fraud'], axis = 1)



In [5]:
#Running the classifier
def profit_scorer(y, y_pred):
#     print(confusion_matrix(y, y_pred))
    profit_matrix = {(0,0): 0, (0,1): -5, (1,0): -25, (1,1): 5}
    return sum(profit_matrix[(pred, actual)] for pred, actual in zip(y_pred, y))

def evaluate_classification(X, y, classifier):
    cv = StratifiedKFold(n_splits=10, random_state=42)
    profit_scoring = make_scorer(profit_scorer, greater_is_better=True)
    
    for name, clf in classifier.items():
#         print(cross_validate(clf, X, y=y, cv=cv, scoring=profit_scoring)['test_score'])
        result = sum(cross_validate(clf, X, y=y, cv=cv, scoring=profit_scoring)['test_score'])
        print(f"{name}: test core = {result} ")
        



In [30]:
classifier = {
    'LabelPropagation': LabelPropagation(alpha=None, gamma=10, kernel='rbf', max_iter=1000,
         n_jobs=None, n_neighbors=7, tol=0.001),
    'LabelSpreading': LabelSpreading(alpha=0.2, gamma=10, kernel='rbf', max_iter=30, n_jobs=None,
        n_neighbors=7, tol=0.001)
}

In [19]:
model = LabelSpreading(gamma=10)
model.fit(X_adj_train[:20000], y_adj_train[:20000])

LabelSpreading(alpha=0.2, gamma=10, kernel='rbf', max_iter=30, n_jobs=None,
        n_neighbors=7, tol=0.001)

In [20]:
y_pred = model.predict(X_test)
profit_scorer(y, y_pred)

-20

In [31]:
evaluate_classification(X_train_manual, y, classifier)

LabelPropagation: test core = 5 
LabelSpreading: test core = -45 


In [29]:
from sklearn.model_selection import GridSearchCV

parameters = {'kernel': ['rbf'], 
              'gamma':[1, 5, 10, 20, 30],
             }
profit_scoring = make_scorer(profit_scorer, greater_is_better=True)


clf = GridSearchCV(LabelSpreading(), parameters, scoring = profit_scoring, verbose = 10, cv = 10)
clf.fit(X_train_manual, y)
clf.best_estimator_

Fitting 10 folds for each of 5 candidates, totalling 50 fits
[CV] gamma=1, kernel=rbf .............................................
[CV] ................... gamma=1, kernel=rbf, score=-55, total=   0.1s
[CV] gamma=1, kernel=rbf .............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s


[CV] ................... gamma=1, kernel=rbf, score=-55, total=   0.1s
[CV] gamma=1, kernel=rbf .............................................
[CV] ................... gamma=1, kernel=rbf, score=-55, total=   0.1s
[CV] gamma=1, kernel=rbf .............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.5s remaining:    0.0s


[CV] ................... gamma=1, kernel=rbf, score=-55, total=   0.1s
[CV] gamma=1, kernel=rbf .............................................
[CV] ................... gamma=1, kernel=rbf, score=-50, total=   0.1s
[CV] gamma=1, kernel=rbf .............................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.9s remaining:    0.0s


[CV] ................... gamma=1, kernel=rbf, score=-50, total=   0.1s
[CV] gamma=1, kernel=rbf .............................................
[CV] ................... gamma=1, kernel=rbf, score=-50, total=   0.1s
[CV] gamma=1, kernel=rbf .............................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    1.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    1.2s remaining:    0.0s


[CV] ................... gamma=1, kernel=rbf, score=-50, total=   0.1s
[CV] gamma=1, kernel=rbf .............................................
[CV] ................... gamma=1, kernel=rbf, score=-50, total=   0.1s
[CV] gamma=1, kernel=rbf .............................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    1.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    1.6s remaining:    0.0s


[CV] ................... gamma=1, kernel=rbf, score=-50, total=   0.1s
[CV] gamma=5, kernel=rbf .............................................
[CV] ................... gamma=5, kernel=rbf, score=-35, total=   0.1s
[CV] gamma=5, kernel=rbf .............................................
[CV] .................... gamma=5, kernel=rbf, score=15, total=   0.1s
[CV] gamma=5, kernel=rbf .............................................
[CV] .................... gamma=5, kernel=rbf, score=-5, total=   0.1s
[CV] gamma=5, kernel=rbf .............................................
[CV] ................... gamma=5, kernel=rbf, score=-25, total=   0.1s
[CV] gamma=5, kernel=rbf .............................................
[CV] ................... gamma=5, kernel=rbf, score=-10, total=   0.1s
[CV] gamma=5, kernel=rbf .............................................
[CV] ................... gamma=5, kernel=rbf, score=-20, total=   0.1s
[CV] gamma=5, kernel=rbf .............................................
[CV] .

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    8.7s finished


LabelSpreading(alpha=0.2, gamma=10, kernel='rbf', max_iter=30, n_jobs=None,
        n_neighbors=7, tol=0.001)