In [49]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn import linear_model
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

In [72]:
def pretty_print_coefs(coefs, names = None, sort = False):
    """Format model coefficients as a readable linear expression.

    Parameters
    ----------
    coefs : sequence of float
        Coefficient values, one per feature.
    names : sequence of str, optional
        Feature names aligned with ``coefs``. When omitted, the names
        ``X0, X1, ...`` are generated.
    sort : bool, default False
        If True, order the terms by decreasing absolute coefficient.

    Returns
    -------
    str
        Terms of the form ``"<coef> * <name>"`` joined by ``" + "``,
        with every coefficient rounded to 3 decimals.
    """
    # Identity check on purpose: `names == None` is evaluated element-wise
    # for numpy arrays / pandas Index objects (e.g. `df.columns`) and then
    # raises when used as an `if` condition.
    if names is None:
        names = ["X%s" % x for x in range(len(coefs))]
    terms = zip(coefs, names)
    if sort:
        # Negated magnitude gives a descending order with a stable sort.
        terms = sorted(terms, key = lambda term: -np.abs(term[0]))
    return " + ".join("%s * %s" % (round(coef, 3), name)
                      for coef, name in terms)


In [73]:
# Both splits are pipe-delimited CSV files; read them in train, test order.
train_data, test_data = (
    pd.read_csv(csv_path, sep = '|')
    for csv_path in ('data/train.csv', 'data/test.csv')
)

In [74]:
## normalize w/ encode
# Build a min-max scaled feature matrix with `trustLevel` one-hot encoded,
# then split it back into the train and test rows.
y = train_data['fraud']
X = train_data.drop(columns=['fraud']).astype(float)
# Split point comes from the data itself instead of a hard-coded row count.
n_train = len(X)
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
X_all = pd.concat([X, test_data], sort=False)
X_all = pd.get_dummies(X_all, columns=['trustLevel'], prefix='trustLevel')
# NOTE(review): the scaler is fit on train and test rows together, so the
# test rows influence the scaling of the training features — confirm this
# is intended.
X_norm_encode = pd.DataFrame(MinMaxScaler().fit_transform(X_all), columns=X_all.columns, index=X_all.index)
print(X_norm_encode.shape)
# Positional slicing: the concatenated index contains duplicate labels.
X_train_norm_enc = X_norm_encode.iloc[:n_train, :]
X_test_norm_enc = X_norm_encode.iloc[n_train:, :]

(500000, 14)


  return self.partial_fit(X, y)


In [71]:
## normalized w/o encode
# Build a min-max scaled feature matrix with the raw (un-encoded) columns,
# then split it back into the train and test rows.
train_data = pd.read_csv('data/train.csv', sep = '|')
test_data = pd.read_csv('data/test.csv', sep = '|')
y = train_data['fraud']
X = train_data.drop(columns=['fraud']).astype(float)
# Split point comes from the data itself instead of a hard-coded row count.
n_train = len(X)
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
X_all = pd.concat([X, test_data], sort=False)
# NOTE(review): the scaler is fit on train and test rows together, so the
# test rows influence the scaling of the training features — confirm this
# is intended.
X_norm = pd.DataFrame(MinMaxScaler().fit_transform(X_all), columns=X_all.columns, index=X_all.index)
print(X_norm.shape)
# Positional slicing: the concatenated index contains duplicate labels.
X_train_norm = X_norm.iloc[:n_train, :]
X_test_norm = X_norm.iloc[n_train:, :]


(500000, 9)


In [76]:
# Feature extraction with linear regression
# `fit` returns the estimator itself, so construction and fitting are chained.
regr = linear_model.LinearRegression().fit(X_train_norm, y)
# Pair every rounded |coefficient| with its column name, largest first.
sorted(
    ((round(weight, 4), column)
     for weight, column in zip(abs(regr.coef_), X_train_norm.columns)),
    reverse=True,
)

[(0.4399, 'scannedLineItemsPerSecond'),
 (0.2253, 'lineItemVoidsPerPosition'),
 (0.2159, 'trustLevel'),
 (0.1053, 'valuePerSecond'),
 (0.0933, 'totalScanTimeInSeconds'),
 (0.0716, 'lineItemVoids'),
 (0.0593, 'scansWithoutRegistration'),
 (0.0146, 'grandTotal'),
 (0.003, 'quantityModifications')]

In [146]:
# Feature extraction with selectKBest
# mutual_info_classif is a randomized estimator: without a fixed
# random_state its scores change from run to run, so it is wrapped with a
# seed to keep the printed ranking reproducible.
score = {
    'chi2': chi2,
    'f_classif': f_classif,
    'mutual_info_classif': lambda features, target: mutual_info_classif(features, target, random_state=42),
}

for score_name, score_func in score.items():
    print(f'=={score_name}==')
    test = SelectKBest(score_func=score_func, k=3) # choose top3
    fit = test.fit(X_train_norm, y)
    # Summarize scores: every feature's rounded score, highest first
    for i, j in sorted(zip(map(lambda x: round(x, 2), fit.scores_), X_train_norm.columns), reverse=True):
        print(i,j)

==chi2==
46.72 trustLevel
3.78 totalScanTimeInSeconds
3.28 lineItemVoidsPerPosition
2.07 scansWithoutRegistration
1.5 lineItemVoids
0.12 valuePerSecond
0.04 scannedLineItemsPerSecond
0.0 quantityModifications
0.0 grandTotal
==f_classif==
213.78 trustLevel
23.17 totalScanTimeInSeconds
15.37 lineItemVoidsPerPosition
10.37 scansWithoutRegistration
7.6 lineItemVoids
1.57 valuePerSecond
1.0 scannedLineItemsPerSecond
0.0 quantityModifications
0.0 grandTotal
==mutual_info_classif==
0.07 trustLevel
0.03 scannedLineItemsPerSecond
0.03 lineItemVoidsPerPosition
0.01 totalScanTimeInSeconds
0.01 scansWithoutRegistration
0.01 quantityModifications
0.0 valuePerSecond
0.0 lineItemVoids
0.0 grandTotal


In [142]:
def feature_KBest(k, X, y, *score_funcs):
    """Print every feature's SelectKBest score for each scoring function.

    NOTE(review): no definition of this helper is visible in the notebook,
    so the call below fails on a fresh kernel. This body is reconstructed
    from the call signature and the cell's recorded output (rounded scores
    printed highest-first, one block per scoring function, no headers) —
    confirm against the original helper if it still exists elsewhere.

    Parameters
    ----------
    k : int
        Number of top features passed to ``SelectKBest``.
    X : pandas.DataFrame
        Feature matrix; its column names label the printed scores.
    y : array-like
        Target values.
    *score_funcs : callable
        Scoring functions accepted by ``SelectKBest`` (e.g. ``chi2``).
    """
    for score_func in score_funcs:
        selector = SelectKBest(score_func=score_func, k=k).fit(X, y)
        rounded_scores = (round(value, 2) for value in selector.scores_)
        for value, column in sorted(zip(rounded_scores, X.columns), reverse=True):
            print(value, column)


feature_KBest(3, X_train_norm, y, chi2, f_classif, mutual_info_classif)

46.72 trustLevel
3.78 totalScanTimeInSeconds
3.28 lineItemVoidsPerPosition
2.07 scansWithoutRegistration
1.5 lineItemVoids
0.12 valuePerSecond
0.04 scannedLineItemsPerSecond
0.0 quantityModifications
0.0 grandTotal
213.78 trustLevel
23.17 totalScanTimeInSeconds
15.37 lineItemVoidsPerPosition
10.37 scansWithoutRegistration
7.6 lineItemVoids
1.57 valuePerSecond
1.0 scannedLineItemsPerSecond
0.0 quantityModifications
0.0 grandTotal
0.07 trustLevel
0.03 scannedLineItemsPerSecond
0.03 lineItemVoidsPerPosition
0.0 valuePerSecond
0.0 totalScanTimeInSeconds
0.0 scansWithoutRegistration
0.0 quantityModifications
0.0 lineItemVoids
0.0 grandTotal


In [135]:

# Rank features with several linear models: coefficients for logistic /
# ridge / Lasso, and elimination ranking for RFE.
#
# The RFE base estimator is constructed explicitly: the previous
# `RFE(model, 3)` depended on a `model` variable that no visible cell
# defines, and recent scikit-learn versions only accept
# `n_features_to_select` as a keyword argument.
# NOTE(review): LogisticRegression is assumed as the RFE base estimator —
# confirm it matches the `model` that was originally used.
models = {
    'logistic': LogisticRegression(solver='lbfgs', max_iter=300),
    'RFE': RFE(estimator=LogisticRegression(solver='lbfgs', max_iter=300), n_features_to_select=3),
    'ridge': Ridge(alpha=1.0),
    # NOTE(review): with default settings the recorded output shows every
    # Lasso coefficient as 0.0, so this entry yields no usable ranking.
    'Lasso': Lasso(),
}
for model_name, estimator in models.items():
    print(model_name)
    fit = estimator.fit(X_train_norm, y)
    if model_name == 'RFE':
        print("Num Features: %s" % (fit.n_features_))
        print("Feature Ranking: %s" % (fit.ranking_))
        # Rank 1 marks the selected features; print best rank first.
        for i, j in sorted(zip(fit.ranking_, X_train_norm.columns), reverse=False):
            print(i,j)

    elif model_name == 'logistic':
        # coef_ is 2-D for logistic regression; flatten it. Signed values
        # are kept here, so the order is by value rather than magnitude.
        for i, j in sorted(zip(map(lambda x: round(x, 4), fit.coef_.reshape(-1)), X_train_norm.columns), reverse=True):
            print(i,j)
    else:
        # Ridge / Lasso: order by absolute coefficient, largest first.
        for i, j in sorted(zip(map(lambda x: round(x, 4), abs(fit.coef_)), X_train_norm.columns), reverse=True):
            print(i,j)
        

logistic
1.9286 totalScanTimeInSeconds
1.2393 scansWithoutRegistration
1.1487 lineItemVoids
0.3701 grandTotal
0.1049 quantityModifications
0.0038 scannedLineItemsPerSecond
-0.0365 valuePerSecond
-3.099 lineItemVoidsPerPosition
-6.5162 trustLevel
RFE
Num Features: 3
Feature Ranking: [1 1 4 3 2 5 7 6 1]
1 lineItemVoidsPerPosition
1 totalScanTimeInSeconds
1 trustLevel
2 scansWithoutRegistration
3 lineItemVoids
4 grandTotal
5 quantityModifications
6 valuePerSecond
7 scannedLineItemsPerSecond
ridge
0.2182 lineItemVoidsPerPosition
0.2146 trustLevel
0.0905 totalScanTimeInSeconds
0.0703 lineItemVoids
0.0588 scansWithoutRegistration
0.0424 scannedLineItemsPerSecond
0.0192 valuePerSecond
0.0139 grandTotal
0.0027 quantityModifications
Lasso
0.0 valuePerSecond
0.0 trustLevel
0.0 totalScanTimeInSeconds
0.0 scansWithoutRegistration
0.0 scannedLineItemsPerSecond
0.0 quantityModifications
0.0 lineItemVoidsPerPosition
0.0 lineItemVoids
0.0 grandTotal


In [140]:
# Feature extraction with randomforest
# Seed the forest: without a fixed random_state the fitted trees, and
# therefore the reported importances, differ on every run.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train_norm, y)
# Pair every rounded importance with its column name, largest first.
sorted(zip(map(lambda x: round(x, 4), rf.feature_importances_), X_train_norm.columns), reverse=True)

[(0.2276, 'scannedLineItemsPerSecond'),
 (0.2035, 'totalScanTimeInSeconds'),
 (0.1965, 'trustLevel'),
 (0.1299, 'lineItemVoidsPerPosition'),
 (0.0835, 'lineItemVoids'),
 (0.0598, 'valuePerSecond'),
 (0.049, 'scansWithoutRegistration'),
 (0.0346, 'grandTotal'),
 (0.0158, 'quantityModifications')]