In [1]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

In [2]:
def pretty_print_coefs(coefs, names=None, sort=False):
    """Render model coefficients as a readable linear expression.

    Parameters
    ----------
    coefs : sequence of numbers
        Coefficient values, one per feature.
    names : sequence of str, optional
        Feature names paired positionally with ``coefs``. When omitted,
        placeholder names ``X0``, ``X1``, ... are generated.
    sort : bool, default False
        If True, terms are ordered by decreasing absolute coefficient value.

    Returns
    -------
    str
        Terms of the form ``"<coef rounded to 3 places> * <name>"`` joined
        with ``" + "``.
    """
    # Identity check, not `== None`: comparing a NumPy array or pandas Index
    # with `==` is elementwise, and the resulting array cannot be used in `if`.
    if names is None:
        names = ["X%s" % x for x in range(len(coefs))]
    lst = zip(coefs, names)
    if sort:
        # Negated magnitude gives a descending order while keeping the sort stable.
        lst = sorted(lst, key=lambda x: -np.abs(x[0]))
    return " + ".join("%s * %s" % (round(coef, 3), name)
                      for coef, name in lst)


In [3]:
# Load the pipe-delimited training set and split it into a feature matrix X
# and a target vector Y.
TARGET_COLUMN = 'fraud'

df = pd.read_csv('data/train.csv', delimiter="|")
label = df[TARGET_COLUMN]
features = df.drop(TARGET_COLUMN, axis=1)
array = df.values  # full matrix (features + target), kept for any cell that indexes it directly

# Derive names and matrices from the named target column instead of fixed
# positions, so X, Y and `attributes` stay aligned if the column layout changes.
attributes = features.columns
X = features.values
Y = label.values  # keeps the label column's own dtype

# Every column of X must have exactly one name in `attributes`.
assert X.shape[1] == len(attributes), "feature matrix and attribute names are misaligned"

In [5]:
# Feature extraction with linear regression:
# fit ordinary least squares on (X, Y) and rank the attributes by the
# absolute value of their fitted coefficient.
regr = linear_model.LinearRegression().fit(X, Y)
# Summarize scores: (|coefficient| rounded to 4 places, attribute), largest first
linreg_magnitudes = [round(weight, 4) for weight in abs(regr.coef_)]
sorted(zip(linreg_magnitudes, attributes), reverse=True)

[(0.0432, 'trustLevel'),
 (0.0205, 'lineItemVoidsPerPosition'),
 (0.0147, 'scannedLineItemsPerSecond'),
 (0.0065, 'lineItemVoids'),
 (0.0059, 'scansWithoutRegistration'),
 (0.0011, 'valuePerSecond'),
 (0.0006, 'quantityModifications'),
 (0.0001, 'totalScanTimeInSeconds'),
 (0.0001, 'grandTotal')]

In [6]:
# Feature extraction with chi square:
# score every attribute against Y with the chi-squared statistic.
# NOTE(review): the scores are computed on the raw, unscaled columns of X --
# confirm that this is intended before comparing scores across attributes.
test = SelectKBest(score_func=chi2, k=3)  # selector configured to keep the top 3
fit = test.fit(X, Y)
# Summarize scores: (score rounded to 4 places, attribute), largest first
chi2_pairs = zip((round(score, 4) for score in fit.scores_), attributes)
sorted(chi2_pairs, reverse=True)

[(6903.0975, 'totalScanTimeInSeconds'),
 (164.9444, 'trustLevel'),
 (36.0422, 'lineItemVoidsPerPosition'),
 (20.7397, 'scansWithoutRegistration'),
 (16.4871, 'lineItemVoids'),
 (11.973, 'valuePerSecond'),
 (1.3354, 'scannedLineItemsPerSecond'),
 (0.0624, 'grandTotal'),
 (0.0016, 'quantityModifications')]

In [7]:
# Feature extraction with f score:
# score every attribute against Y with the f_classif score function.
test = SelectKBest(score_func=f_classif, k=3)  # selector configured to keep the top 3
fit = test.fit(X, Y)
# Rebinds the notebook-level name `features` to the columns of X kept by the selector.
features = fit.transform(X)
# Summarize scores: (score rounded to 4 places, attribute), largest first
f_score_pairs = [(round(score, 4), attribute)
                 for score, attribute in zip(fit.scores_, attributes)]
sorted(f_score_pairs, reverse=True)

[(213.782, 'trustLevel'),
 (23.1653, 'totalScanTimeInSeconds'),
 (15.3679, 'lineItemVoidsPerPosition'),
 (10.3695, 'scansWithoutRegistration'),
 (7.5983, 'lineItemVoids'),
 (1.566, 'valuePerSecond'),
 (1.0008, 'scannedLineItemsPerSecond'),
 (0.0038, 'grandTotal'),
 (0.0014, 'quantityModifications')]

In [8]:
# Feature extraction with mutual information.
# The mutual-information estimate is randomized, so it is seeded here to make
# the ranking reproducible under Restart & Run All.
MI_SEED = 42


def seeded_mutual_info(X, y):
    """Score function for SelectKBest: mutual_info_classif with a fixed seed."""
    return mutual_info_classif(X, y, random_state=MI_SEED)


test = SelectKBest(score_func=seeded_mutual_info, k=3)  # choose top3
fit = test.fit(X, Y)
# Rebinds the notebook-level name `features` to the columns of X kept by the selector.
features = fit.transform(X)
# Summarize scores: (score rounded to 4 places, attribute), largest first
sorted(zip(map(lambda x: round(x, 4), fit.scores_), attributes), reverse=True)

[(0.0759, 'trustLevel'),
 (0.0331, 'scannedLineItemsPerSecond'),
 (0.031, 'lineItemVoidsPerPosition'),
 (0.0088, 'totalScanTimeInSeconds'),
 (0.0049, 'scansWithoutRegistration'),
 (0.0044, 'valuePerSecond'),
 (0.0026, 'lineItemVoids'),
 (0.0, 'quantityModifications'),
 (0.0, 'grandTotal')]

In [9]:
# Feature extraction with recursive feature elimination (RFE) wrapped around
# a logistic-regression estimator.
model = LogisticRegression(solver='lbfgs', max_iter=300)
# Pass the feature count by keyword: newer scikit-learn releases accept
# `n_features_to_select` as a keyword-only argument and reject the positional form.
rfe = RFE(model, n_features_to_select=3)
fit = rfe.fit(X, Y)
print("Num Features: %s" % (fit.n_features_))
print("Feature Ranking: %s" % (fit.ranking_))
# Rank 1 marks a selected attribute; sort ascending so selected ones come first.
# Ranks are integers, so they are converted with int() rather than rounded.
sorted(zip((int(rank) for rank in fit.ranking_), attributes), reverse=False)

Num Features: 3
Feature Ranking: [1 7 6 3 4 5 2 1 1]


[(1, 'lineItemVoidsPerPosition'),
 (1, 'trustLevel'),
 (1, 'valuePerSecond'),
 (2, 'scannedLineItemsPerSecond'),
 (3, 'lineItemVoids'),
 (4, 'scansWithoutRegistration'),
 (5, 'quantityModifications'),
 (6, 'grandTotal'),
 (7, 'totalScanTimeInSeconds')]

In [10]:
# Feature extraction with ridge regularization regression:
# fit a Ridge model (alpha=1.0) on (X, Y), print the fitted expression, then
# rank the attributes by absolute coefficient value.
ridge = Ridge(alpha=1.0).fit(X, Y)
# Summarize scores
ridge_expression = pretty_print_coefs(ridge.coef_)
print("Ridge model:", ridge_expression)
ridge_magnitudes = [round(weight, 4) for weight in abs(ridge.coef_)]
sorted(zip(ridge_magnitudes, attributes), reverse=True)

Ridge model: -0.043 * X0 + 0.0 * X1 + 0.0 * X2 + 0.007 * X3 + 0.006 * X4 + 0.001 * X5 + 0.014 * X6 + -0.001 * X7 + -0.02 * X8


[(0.0432, 'trustLevel'),
 (0.0205, 'lineItemVoidsPerPosition'),
 (0.0145, 'scannedLineItemsPerSecond'),
 (0.0065, 'lineItemVoids'),
 (0.0059, 'scansWithoutRegistration'),
 (0.001, 'valuePerSecond'),
 (0.0006, 'quantityModifications'),
 (0.0001, 'totalScanTimeInSeconds'),
 (0.0001, 'grandTotal')]

In [11]:
# Feature extraction with Lasso regularization regression:
# fit a Lasso model with the library's default settings on (X, Y), print the
# fitted expression, then rank the attributes by absolute coefficient value.
# NOTE(review): the recorded output of this cell shows every coefficient
# rounding to 0.0, so the ranking carries no information as it stands --
# presumably the default penalty is too strong for the unscaled inputs; confirm
# and consider tuning alpha or standardizing X.
lasso = Lasso().fit(X, Y)
# Summarize scores
lasso_expression = pretty_print_coefs(lasso.coef_)
print("lasso model:", lasso_expression)
lasso_pairs = zip((round(weight, 4) for weight in abs(lasso.coef_)), attributes)
sorted(lasso_pairs, reverse=True)

lasso model: -0.0 * X0 + 0.0 * X1 + 0.0 * X2 + 0.0 * X3 + 0.0 * X4 + 0.0 * X5 + 0.0 * X6 + -0.0 * X7 + -0.0 * X8


[(0.0, 'valuePerSecond'),
 (0.0, 'trustLevel'),
 (0.0, 'totalScanTimeInSeconds'),
 (0.0, 'scansWithoutRegistration'),
 (0.0, 'scannedLineItemsPerSecond'),
 (0.0, 'quantityModifications'),
 (0.0, 'lineItemVoidsPerPosition'),
 (0.0, 'lineItemVoids'),
 (0.0, 'grandTotal')]

In [12]:
# Feature extraction with randomforest:
# fit a 100-tree random-forest regressor on (X, Y) and rank the attributes by
# the forest's feature_importances_.
# The forest is randomized, so it is seeded here to make the importances
# reproducible under Restart & Run All.
RF_SEED = 42
rf = RandomForestRegressor(n_estimators=100, random_state=RF_SEED)
rf.fit(X, Y)
# Summarize scores: (importance rounded to 4 places, attribute), largest first
sorted(zip(map(lambda x: round(x, 4), rf.feature_importances_), attributes), reverse=True)

[(0.2361, 'scannedLineItemsPerSecond'),
 (0.2014, 'totalScanTimeInSeconds'),
 (0.1966, 'trustLevel'),
 (0.1258, 'lineItemVoidsPerPosition'),
 (0.0843, 'lineItemVoids'),
 (0.0536, 'valuePerSecond'),
 (0.0511, 'scansWithoutRegistration'),
 (0.037, 'grandTotal'),
 (0.0142, 'quantityModifications')]