In [1]:
import pandas as pd
import numpy as np
import os

# feature selection


https://www.datacamp.com/tutorial/feature-selection-python

Feature ranks are recorded in adbench_results.csv (sheet name: "feature selection").

In [49]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.feature_selection import RFE, f_regression
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from minepy import MINE
from sklearn.preprocessing import StandardScaler

from sklearn.decomposition import PCA
from xgboost import XGBClassifier
from sklearn import metrics

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import ShuffleSplit, StratifiedKFold, StratifiedShuffleSplit


In [None]:
# Load the pre-computed feature tables for the training and test splits.
train, test = (
    pd.read_csv(path) for path in ('train_features.csv', 'test_features.csv')
)

In [None]:
# Targets for each split.
Y = train["Label"].values
Y_test = test["Label"].values

# Raw feature matrices: every column except the label.
X_train = train.drop(columns=["Label"]).values
X_test = test.drop(columns=["Label"]).values

# Standardise both splits using statistics learned on the training split only.
# X_train is kept unscaled; the scaled training matrix is stored in X.
scaler = StandardScaler().fit(X_train)
X = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Seed NumPy's global random number generator.
np.random.seed(0)

# Feature names in column order (everything except the label), plus the
# container mapping each ranking method to its {feature: score} dict.
names = train.columns.drop("Label").tolist()
ranks = dict()

def rank_to_dict(ranks, names, order=1):
    """Min-max scale a vector of scores and key it by feature name.

    Parameters
    ----------
    ranks : sequence of float
        One raw score per feature.
    names : sequence of str
        Feature names, paired positionally with ``ranks``.
    order : int, default 1
        Multiplier applied before scaling; pass ``-1`` when a smaller raw
        value should end up with a larger scaled score.

    Returns
    -------
    dict
        ``{name: scaled_score}`` with every score rounded to two decimals.
    """
    # MinMaxScaler works column-wise, so present the scores as one column.
    score_column = order * np.array([ranks]).T
    scaled_scores = MinMaxScaler().fit_transform(score_column).T[0]
    return {
        feature: round(score, 2)
        for feature, score in zip(names, scaled_scores)
    }

# Ordinary least squares; features are scored by absolute coefficient size.
# The former `normalize=True` argument has been dropped: it was removed from
# scikit-learn's LinearRegression (version 1.2) and now raises a TypeError,
# and X has already been standardised with StandardScaler, so it was redundant.
lr = LinearRegression()
lr.fit(X, Y)
ranks["Linear reg"] = rank_to_dict(np.abs(lr.coef_), names)

# L2-regularised linear model, scored the same way.
ridge = Ridge(alpha=7)
ridge.fit(X, Y)
ranks["Ridge"] = rank_to_dict(np.abs(ridge.coef_), names)


# L1-regularised linear model, scored the same way.
lasso = Lasso(alpha=.05)
lasso.fit(X, Y)
ranks["Lasso"] = rank_to_dict(np.abs(lasso.coef_), names)

#stop the search when 5 features are left (they will get equal scores)
rfe = RFE(lr, n_features_to_select=5)
rfe.fit(X,Y)
# ranking_ is 1 for the retained features and grows for features eliminated
# earlier, so order=-1 flips it to make the retained features score highest.
ranks["RFE"] = rank_to_dict(rfe.ranking_.astype(float), names, order=-1)

In [None]:
# time-consuming
# Score features by the impurity-based importances of a default random forest.
rf = RandomForestRegressor().fit(X, Y)
ranks["RF"] = rank_to_dict(rf.feature_importances_, names)

In [None]:
# Univariate F-statistics (and their p-values) between each feature and Y;
# only the F-statistics are used for the ranking.
f, pval = f_regression(X=X, y=Y, center=True)
ranks.update({"Corr.": rank_to_dict(f, names)})

In [None]:
# time-consuming

# Collect one MIC value per feature column, computed against the target Y.
mine = MINE()
mic_scores = []
for feature_column in X.T:
    mine.compute_score(feature_column, Y)
    mic_scores.append(mine.mic())

ranks["MIC"] = rank_to_dict(mic_scores, names)

In [None]:
# Average each feature's scaled score across the individual ranking methods.
# "Mean" is excluded so that re-running this cell after ranks["Mean"] has been
# stored does not average the previous mean back into the new one.
scoring_methods = [method for method in ranks if method != "Mean"]

r = {}
for name in names:
    try:
        r[name] = round(np.mean([ranks[method][name]
                                 for method in scoring_methods]), 2)
    except KeyError:
        # At least one method has no score for this feature: report the
        # feature name and leave it out of the averaged ranking.
        print(name)

In [None]:
# Column order for the table: individual methods alphabetically, then the
# cross-method mean last. "Mean" is filtered out before sorting so the cell is
# re-runnable; otherwise a second run would list the "Mean" column twice.
methods = sorted(method for method in ranks if method != "Mean")
ranks["Mean"] = r
methods.append("Mean")

# Tab-separated table: header row of method names, then one row per feature.
print ("\t%s" % "\t".join(methods))
for name in names:
    try:
        print ("%s\t%s" % (name, "\t".join(map(str, 
                         [ranks[method][name] for method in methods]))))
    except KeyError:
        # A feature without a score under every method cannot form a full
        # row, so it is skipped. Any other error is no longer swallowed.
        pass

# Directly apply XGBoost

Results are recorded in adbench_results.csv (sheet name: "xgboost").

In [9]:
# Reload both feature tables from disk for the XGBoost experiments.
train = pd.read_csv(filepath_or_buffer='train_features.csv')
test = pd.read_csv(filepath_or_buffer='test_features.csv')

In [37]:
# Columns removed from the table when the list of candidate features is
# built: the target ('Label') together with seven feature columns.
ori_col = [
    'SettlementAmount',
    'InstructedAmount',
    'Label',
    'hour',
    'Sender_hour_freq',
    'sender_currency_freq',
    'sender_currency_avg_amount',
    'sender_receiver_freq',
]

In [38]:
# Candidate features: every column of the training table not listed in ori_col.
names = train.drop(columns=ori_col).columns.tolist()

In [39]:
# Baseline feature set: the ori_col entries minus the target, in the same order
# ('SettlementAmount', 'InstructedAmount', 'hour', 'Sender_hour_freq',
#  'sender_currency_freq', 'sender_currency_avg_amount', 'sender_receiver_freq').
cols = [column for column in ori_col if column != 'Label']

In [40]:
Y = train["Label"].values
Y_test = test["Label"].values


def _average_precision_for(columns, n_estimators=100):
    """Train XGBoost on the given columns and score it on the test split.

    The selected columns are standardised with statistics fitted on the
    training split, an ``XGBClassifier`` is fitted against ``Y``, and the
    positive-class probabilities predicted for the test split are scored
    against ``Y_test``.

    Parameters
    ----------
    columns : list of str
        Column names to use as model inputs.
    n_estimators : int, default 100
        Number of boosting rounds passed to ``XGBClassifier``.

    Returns
    -------
    float
        Average precision on the test split.
    """
    features_train = train[columns].values
    features_test = test[columns].values

    column_scaler = StandardScaler().fit(features_train)
    features_train = column_scaler.transform(features_train)
    features_test = column_scaler.transform(features_test)

    # training
    model = XGBClassifier(n_estimators=n_estimators)
    model.fit(features_train, Y)
    positive_proba = model.predict_proba(features_test)[:, 1]

    return metrics.average_precision_score(y_true=Y_test, y_score=positive_proba)


# Maps 'initial' (baseline columns only) or the name of the added feature to
# the resulting test-set average precision.
res = {}

# Baseline: the columns in `cols` on their own.
res['initial'] = _average_precision_for(cols)
print(res['initial'])

# Add each candidate feature to the baseline, one at a time, and re-score.
# NOTE(review): intermediates (matrices, scaler, model) now live inside the
# helper and are no longer left behind as notebook globals.
for name in names:
    res[name] = _average_precision_for(cols + [name])
    print(name, res[name])

0.8714357045073786
num_hops 0.8616706281248703
Sender_freq 0.8696121344584612
Receiver_freq 0.8676577808789161
receiver_currency_freq 0.8467111997777877
receiver_currency_avg_amount 0.8653493684217466
Sender_out_degree 0.8723253814291491
Sender_in_degree 0.8693890708617604
Receiver_out_degree 0.8723253814291491
Receiver_in_degree 0.8693890708617604
sender_receiver_currency_freq 0.871031587669454
sender_receiver_currency_avg_amount 0.8680594359286492
OrderingAccount_freq 0.8474713406573251
BeneficiaryAccount_freq 0.8725333136094878
OrderingAccount_currency_freq 0.7638745449438862
OrderingAccount_currency_avg_amount 0.8209403126783755
BeneficiaryAccount_currency_freq 0.8204561571452327
BeneficiaryAccount_currency_avg_amount 0.8609135371167154
OrderingAccount_out_degree 0.7502796038561261
OrderingAccount_in_degree 0.8676605381099526
BeneficiaryAccount_out_degree 0.7502796038561261
BeneficiaryAccount_in_degree 0.8676605381099526
OrderingAccount_BeneficiaryAccount_freq 0.49682662525696525
O

# PCA

There are around 70 features in total; after PCA, the top 10 principal components explain about 91% of the variance.

However, the results obtained with the PCA-transformed features are really bad.

In [63]:
from sklearn.decomposition import PCA
from xgboost import XGBClassifier
from sklearn import metrics

In [None]:
# Read the training and test feature tables for the PCA experiments.
train, test = pd.read_csv('train_features.csv'), pd.read_csv('test_features.csv')

In [None]:
# Target vectors for each split.
Y, Y_test = train["Label"].values, test["Label"].values

# Feature matrices: every column other than the label.
X_train = train.drop(columns="Label").values
X_test = test.drop(columns="Label").values

# Standardise both splits with the training-split statistics. X_train stays
# unscaled; the scaled training matrix is stored separately as X.
scaler = StandardScaler()
X = scaler.fit(X_train).transform(X_train)
X_test = scaler.transform(X_test)

In [40]:
# Fit a PCA with default settings (no component limit requested) on the
# standardised training matrix X, so its explained-variance ratios can be
# inspected afterwards.
pca = PCA()
pca.fit(X)

PCA()

In [43]:
# Running total of the explained-variance ratio, component by component.
pca.explained_variance_ratio_.cumsum()

array([0.39001175, 0.55766783, 0.66059804, 0.72390357, 0.77615799,
       0.81807436, 0.84934067, 0.87432506, 0.89636264, 0.91532038,
       0.92982349, 0.94148202, 0.95021887, 0.95751506, 0.96457816,
       0.96993984, 0.97489278, 0.97928893, 0.98297448, 0.98634787,
       0.98877386, 0.99066147, 0.99209291, 0.99335048, 0.99436456,
       0.99524555, 0.99598958, 0.99661546, 0.99709483, 0.99754739,
       0.99789835, 0.99816232, 0.99841444, 0.99864179, 0.99885526,
       0.99905431, 0.99923173, 0.99936229, 0.99946885, 0.99957294,
       0.99967396, 0.99973346, 0.99979051, 0.99983601, 0.99988009,
       0.99991008, 0.99993131, 0.9999453 , 0.9999572 , 0.99996759,
       0.99997648, 0.99998347, 0.99998979, 0.99999405, 0.99999594,
       0.99999731, 0.99999828, 0.99999907, 0.99999956, 0.9999999 ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        ])

In [58]:
from sklearn.impute import SimpleImputer

# Learn per-feature means on the training matrix X, then use them to fill the
# NaN entries of the test matrix.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp.fit(X)
X_test_imp = imp.transform(X_test)

In [52]:
# Maps the number of retained principal components to the test-set average
# precision of an XGBoost model trained on those components.
res_pca = {}
for n_ in [3, 5, 7, 9, 12]:
    # PCA: learn the projection on the training matrix once and reuse the
    # fitted model for the test matrix (previously an identical PCA was
    # refitted just to transform the test data).
    pca = PCA(n_components = n_)
    pca.fit(X)
    X_pca = pca.transform(X)
    X_test_pca = pca.transform(X_test_imp)

    # normalization: the scaler has to be fitted on the projected training
    # data. It used to be fitted on X_train (the full-width feature matrix),
    # whose column count does not match the n_-column PCA output that is
    # transformed right after.
    scaler = StandardScaler()
    scaler.fit(X_pca)

    X_pca = scaler.transform(X_pca)
    X_test_pca = scaler.transform(X_test_pca)

    # fit the model
    xgb = XGBClassifier(n_estimators=100)
    xgb.fit(X_pca, Y)

    # Score the positive-class probabilities on the (imputed) test split.
    pred_proba_xgb = xgb.predict_proba(X_test_pca)[:, 1]

    res_pca[n_] = metrics.average_precision_score(y_true=Y_test, y_score=pred_proba_xgb)
    

KeyboardInterrupt: 