In [202]:
## Import packages
import pandas as pd
import numpy as np
import math
import copy
import time


In [203]:
# on Mac
# `datadir` is the folder holding the CSV and `infilename` is the file's base name;
# '.csv' is appended where the file is opened.
# NOTE(review): three config cells in this notebook assign these same two names, so
# whichever one ran last wins — run only the cell for the machine you are on.
datadir = '/Users/kittipat/research/2015/hack2015/'
infilename = 'hackathon2015_train_10000'

In [None]:
# for hack server
# Alternative values for the same two settings (data folder and CSV base name).
# NOTE(review): running this cell overrides whatever an earlier config cell set.
datadir = '/local/kittipat/hack15/'
infilename = 'hackathon2015_train'

In [None]:
# for seller analytics server
# Alternative values for the same two settings (data folder and CSV base name).
# NOTE(review): running this cell overrides whatever an earlier config cell set.
datadir = '/mine/fraud/kittipat/2015/hack15/'
infilename = 'hackathon2015_train'

In [237]:
# Experiment tag: inserted into the names of the files written below
# (predictions CSV, pickled model, feature-importance CSV).
exp_id = '800k_tree1500_raw'

In [205]:
# import data into np array
# The file is opened in a `with` block so the handle is closed as soon as parsing
# finishes (or fails); previously it stayed open for the lifetime of the kernel.
with open(''.join([datadir, infilename, '.csv'])) as f:
    t1 = time.time()
    feature_names = f.readline()  # consume the header line; it is parsed in a later cell
    data = np.loadtxt(fname=f, delimiter=',')
    t2 = time.time()
print("load data to np: %s sec" % (t2 - t1))

print("data shape:",data.shape)

# split into index, X and y
t1 = time.time()
index = data[:,0]   # first column
y = data[:,-1]      # last column
# everything between the first and last column; np.delete returns a new array
X = np.delete(data, [0, data.shape[1]-1], 1)
t2 = time.time()
print("split data: %s sec" % (t2 - t1))
del data

print("X shape:",X.shape)


load data to np: 3.62078595161438 sec
data shape: (10000, 575)
split data: 0.0980539321899414 sec
X shape: (10000, 573)


In [206]:
# features to use
# `feature_names` arrives here as the raw header line (one comma-separated string).
# Dropping the first and last fields mirrors the two columns removed from X.
# The isinstance guard makes the cell safe to re-run: on a second run the name already
# holds the parsed list, and calling .split on a list would raise AttributeError.
if isinstance(feature_names, str):
    feature_names = feature_names.split(',')[1:-1]

## Partition the data into train, validation and test

In [207]:
# Ordered (non-shuffled) split: the leading `train_ratio` share of the rows becomes
# the training set and the remaining rows become the test set.
train_ratio = 0.8
num_rows, num_col = X.shape
num_train = math.ceil(num_rows * train_ratio)
num_test = num_rows - num_train
print("#train:%d, #test:%d" % (num_train, num_test))

# features
train, test = X[:num_train, :], X[num_train:, :]
del X
print("train:",train.shape)
print("test:",test.shape)

# labels and row ids are cut at the same boundary so they stay aligned with the features
y_train, y_test = y[:num_train], y[num_train:]
index_train, index_test = index[:num_train], index[num_train:]

#train:8000, #test:2000
train: (8000, 573)
test: (2000, 573)


## Feature processing

In [229]:
from sklearn import preprocessing

# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits.
scaler = preprocessing.StandardScaler()
scaler.fit(train)
train_scaled, test_scaled = (scaler.transform(split) for split in (train, test))

## Perceptron

In [236]:
from sklearn.linear_model import Perceptron

# `n_iter` was deprecated in scikit-learn 0.19 and removed in 0.21, so passing it raises
# TypeError on current releases. `max_iter=10` together with `tol=None` (no early stop
# on the loss) keeps the same fixed budget of 10 passes over the data.
# NOTE(review): with penalty=None the `alpha` value is not used by the model.
clf_perc = Perceptron(penalty=None, alpha=0.01, max_iter=10, tol=None, eta0=1.0, n_jobs=16)

# fit perceptron
t1 = time.time()
clf_perc = clf_perc.fit(train_scaled, y_train)
t2 = time.time()
print("fit the model: %s sec" % (t2 - t1))

# scoring
# NOTE(review): predict() returns hard class labels, not probabilities, despite the
# variable name. If a continuous score is wanted for ranking, decision_function()
# is the likely candidate — confirm intent before changing.
yhat_perc_prob_test = clf_perc.predict(test_scaled)


fit the model: 0.3273911476135254 sec


## Elastic-net regularized linear regression (ELNET)

In [210]:
from sklearn.linear_model import ElasticNet

# Penalty settings handed to ElasticNet: `alpha` scales the penalty as a whole and
# `l1_ratio` sets how much of it is L1 (the rest is L2).
alpha = 0.1
l1_ratio = 0.05
clf_enet = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)

# fit on the standardised training features and time it
t1 = time.time()
clf_enet.fit(train_scaled, y_train)
t2 = time.time()
print("fit the model: %s sec" % (t2 - t1))

# scoring: real-valued predictions for the test split
yhat_elnet_prob_test = clf_enet.predict(test_scaled)


In [235]:
# Inspect the values returned by clf_enet.predict for the test split.
yhat_elnet_prob_test

array([ 0.12902499,  0.13860737,  0.28105311, ...,  0.12937989,
        0.2618391 ,  0.22486954])

## Run RF

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random-forest classifier: 1500 trees, at least 5 samples per leaf, n_jobs=16.
# Changes from the original constructor call:
#   * `min_density` and `compute_importances` are dropped. They were passed as None and
#     no longer exist in scikit-learn, so passing them raises TypeError.
#   * max_features='auto' is spelled 'sqrt'. For classifiers the two select the same
#     setting, and 'auto' is no longer accepted by recent releases.
# NOTE(review): random_state=None means each fit builds a different forest; set a seed
# if runs need to be reproducible.
clf_rf = RandomForestClassifier(
    n_estimators=1500, criterion='gini', max_depth=None,
    min_samples_split=2, min_samples_leaf=5, max_features='sqrt',
    max_leaf_nodes=None, bootstrap=True, oob_score=False, n_jobs=16,
    random_state=None, verbose=0)

# fit RF model to the train data (unscaled features)
t1 = time.time()
clf_rf = clf_rf.fit(train, y_train)
t2 = time.time()
print("fit the model: %s sec" % (t2 - t1))

In [None]:
# binary classification output
#yhat_test = clf_rf.predict(test)
#pd.crosstab(yhat_test, y_test, rownames=['predicted'], colnames=['actual'])

In [None]:
# Probability output [good, bad]
# predict_proba returns one column per class. The [good, bad] reading above presumably
# relies on the labels being 0 = good and 1 = bad — TODO confirm against clf_rf.classes_.
t1 = time.time()
yhat_rf_prob_test = clf_rf.predict_proba(test)
t2 = time.time()
print("prediction: %s sec" % (t2 - t1))

In [None]:
# Adjust the threshold.
# With bad_threshold = 0.5 the result should match clf_rf.predict(test).
bad_threshold = 0.4
# Fixed: this line used to read `yhat_prob_test`, a name that is never assigned in this
# notebook (NameError on a fresh kernel). The RF probabilities are in `yhat_rf_prob_test`.
yhat = (yhat_rf_prob_test[:,1] >= bad_threshold).astype(int)
# confusion table: rows are thresholded predictions, columns are true labels
pd.crosstab(yhat, y_test, rownames=['predicted'], colnames=['actual'])

In [None]:
# Get the variable importance
# Pair each feature name with the importance the fitted forest assigned to it.
var_imp = {'var_name':feature_names, 'imp':clf_rf.feature_importances_.tolist()}
RF_imp = pd.DataFrame(var_imp, columns=['var_name','imp'])
# DataFrame.sort(columns=...) was removed from pandas; sort_values(by=...) is its
# replacement. Reassigning instead of inplace=True keeps the cell re-runnable.
RF_imp = RF_imp.sort_values(by='imp', axis=0, ascending=False, kind='quicksort', na_position='last')

# display the features
#HTML(pd.DataFrame(RF_imp).to_html())

## Output the results

In [238]:
# output the test score
# Exactly one (yhat, y_true) pair below should be active; it selects which model's
# test-set scores end up in the CSV.

# for RF: second probability column
yhat = yhat_rf_prob_test[:,1]
y_true = y_test

# for LR
#yhat = yhat_elnet_prob_test
#y_true = y_test

# for perceptron
#yhat = yhat_perc_prob_test
#y_true = y_test

# write to csv file: one row per test example, columns 'score' and 'tag'
saved_filename = datadir + infilename + '_' + exp_id + '_predicted.csv'
output_file = pd.DataFrame({'score': yhat, 'tag': y_true})
output_file.to_csv(saved_filename, sep=',', na_rep='', header=True, index=False)

#python auccalc.py -f out_test.csv -t 1 -s 0 -l 0.0 -h 0.15


In [239]:
# save model
import pickle

# Pick the fitted estimator to persist; exactly one assignment should be active.
clf2save = clf_rf
#clf2save = clf_enet
#clf2save = clf_perc

saved_filename = ''.join([datadir,infilename,'_',exp_id,'.pck'])
# `with` closes the file even if pickling raises part-way through
with open(saved_filename, "wb") as fo:
    pickle.dump(clf2save, fo)


In [None]:
# load model
# Reads back whatever path `saved_filename` currently holds (set by the most recently
# run save/output cell).
# SECURITY: pickle.load can execute arbitrary code embedded in the file; only load
# pickles that this notebook (or another trusted source) wrote.
with open(saved_filename, "rb") as fo:
    clf_rf = pickle.load(fo)

In [None]:
# output the feature to text file
# Path pattern: <datadir><infilename>_<exp_id>_features.csv
saved_filename = datadir + infilename + '_' + exp_id + '_features.csv'
RF_imp.to_csv(saved_filename, sep=',', na_rep='', header=True, index=False)

In [None]:
# We should try:
# gradient boosted trees
# standardize the features
# z-score the features
# outlier detection
# GLM
# clustering (k-means, RF) the transactions and a model for each cluster
# NN
# semi-supervised learning
# kernel method + SVM
# Decision tree
# multi-stage model
# L1 + L2 norm LR

## Gradient boosted

In [None]:
# Fit classifier with out-of-bag estimates
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Gradient-boosting hyper-parameters, kept in a dict and unpacked into the constructor.
params = {
    'n_estimators': 500,
    'max_depth': 3,
    'learning_rate': 0.01,
    'min_samples_leaf': 5,
}
clf_gbc = GradientBoostingClassifier(**params)

# fit on the unscaled training features and time it
t1 = time.time()
clf_gbc.fit(train, y_train)
t2 = time.time()
print("GBC training: %s sec" % (t2 - t1))

# Despite the `yhat_` prefix this is the single number returned by score() on the
# test split, not a vector of predictions.
yhat_test_gbc = clf_gbc.score(test, y_test)
print(yhat_test_gbc)


# Probability output [good, bad]
t1 = time.time()
yhat_gbc_prob_test = clf_gbc.predict_proba(test)
#yhat_prob_test
t2 = time.time()
print("prediction: %s sec" % (t2 - t1))

# output the test score: second probability column as 'score', true label as 'tag'
# NOTE(review): the path is built from the same parts as the RF predictions file, so
# with an unchanged exp_id this overwrites it — confirm that is intended.
saved_filename = datadir + infilename + '_' + exp_id + '_predicted.csv'
output_file = pd.DataFrame({'score': yhat_gbc_prob_test[:,1], 'tag': y_test})
output_file.to_csv(saved_filename, sep=',', na_rep='', header=True, index=False)

#python auccalc.py -f out_test.csv -t 1 -s 0 -l 0.0 -h 0.15



In [None]:
# save model
import pickle

# NOTE(review): same path pattern as the RF model pickle; with an unchanged exp_id this
# replaces that file — confirm that is intended.
saved_filename = ''.join([datadir,infilename,'_',exp_id,'.pck'])
# `with` closes the file even if pickling raises part-way through
with open(saved_filename, "wb") as fo:
    pickle.dump(clf_gbc, fo)


## K-means

In [247]:
from sklearn.cluster import MiniBatchKMeans

# Mini-batch k-means with 10 clusters, fitted on the standardised training features.
clf_mbk = MiniBatchKMeans(
    n_clusters=10,
    init='k-means++',
    max_iter=100,
    batch_size=100,
    verbose=True,
)
clf_mbk.fit(train_scaled)

# size of each cluster: count how many training rows received each label
mbk_train = pd.DataFrame({'label': clf_mbk.labels_})
mbk_train['label'].value_counts()

Init 1/3 with method: k-means++
Inertia for init 1/3: 141781.785408
Init 2/3 with method: k-means++
Inertia for init 2/3: 132158.614551
Init 3/3 with method: k-means++
Inertia for init 3/3: 139391.598526
Minibatch iteration 1/8000: mean batch inertia: 425.629526, ewa inertia: 425.629526 
Minibatch iteration 2/8000: mean batch inertia: 469.522060, ewa inertia: 426.726702 
Minibatch iteration 3/8000: mean batch inertia: 539.011016, ewa inertia: 429.533459 
Minibatch iteration 4/8000: mean batch inertia: 540.932714, ewa inertia: 432.318092 
Minibatch iteration 5/8000: mean batch inertia: 595.819332, ewa inertia: 436.405112 
Minibatch iteration 6/8000: mean batch inertia: 502.485069, ewa inertia: 438.056905 
Minibatch iteration 7/8000: mean batch inertia: 687.512932, ewa inertia: 444.292526 
Minibatch iteration 8/8000: mean batch inertia: 458.289061, ewa inertia: 444.642396 
Minibatch iteration 9/8000: mean batch inertia: 439.888503, ewa inertia: 444.523563 
[MiniBatchKMeans] Reassigning 3

## Affinity Propagation

In [268]:
# Compute Affinity Propagation
from sklearn.cluster import AffinityPropagation

clf_af = AffinityPropagation(
    preference=-50,
    damping=0.5,
    max_iter=200,
    convergence_iter=15,
    verbose=True,
)
clf_af.fit(train_scaled)
# Only the final expression of a cell is rendered, so the exemplar indices on the next
# line are evaluated but not shown; the per-row cluster labels are what gets displayed.
clf_af.cluster_centers_indices_
clf_af.labels_

Converged after 15 iterations.


array([   0,    1,    2, ..., 7997, 7998, 7999])

## variable correlation

In [None]:
# Here we want to know a few things:
# 1) How many unique values
# 2) correlation corr(x,y)
# 3) mutual information MI(x,y)
# 4) distribution of bad/good in each bin

In [None]:
def variable_info(x, name, y=None):
    """Summarise one feature column against the labels.

    Parameters
    ----------
    x : 1-D array-like
        Values of a single feature, one per row.
    name : str
        Identifier stored in the 'name' column of the result.
    y : 1-D array-like, optional
        Label values aligned row-for-row with ``x``. When omitted, the
        notebook-level ``y_train`` is used, so existing two-argument calls
        behave exactly as before.

    Returns
    -------
    pandas.DataFrame
        A single row with the columns 'name', 'num_unique', 'pcorr', 'mi'
        (in that order).
    """
    # local import, as in the original cell: the notebook's top import cell does not
    # bring in sklearn.metrics
    import sklearn.metrics

    if y is None:
        # fall back to the global used by the original implementation
        y = y_train

    # number of unique values
    num_unique = len(np.unique(x))

    # Pearson correlation coefficient between the feature and the label
    pcorr = np.corrcoef(x, y)[0, 1]

    # NOTE(review): mutual_info_score treats every distinct value of x as its own
    # category, so for a near-continuous feature the score may mostly reflect the
    # number of distinct values — consider binning x first.
    mi = sklearn.metrics.mutual_info_score(y, x)

    var_info = pd.DataFrame({'name': [name],
                             'num_unique': [num_unique],
                             'pcorr': [pcorr],
                             'mi': [mi]})
    # fix the column order explicitly
    return var_info[['name', 'num_unique', 'pcorr', 'mi']]

In [None]:
# calculate variables info
# Build one single-row frame per feature column and concatenate them once. The previous
# version called pd.concat inside the loop, re-copying the growing frame on every
# iteration.
# NOTE(review): columns are labelled "v1".."vN" by position; presumably this matches
# the `var_name` values in the feature-importance file joined later — verify.
var_info_rows = [variable_info(train[:, i], "v%d" % (i + 1))
                 for i in range(train.shape[1])]
if var_info_rows:
    var_info = pd.concat(objs=var_info_rows, axis=0)
else:
    # no feature columns: fall back to an empty frame with the expected columns
    var_info = pd.DataFrame({'name':[],
                             'num_unique':[],
                             'pcorr':[],
                             'mi':[]})

var_info = var_info[['name', 'num_unique', 'pcorr', 'mi']]

# set var name as index
var_info = var_info.set_index(keys='name')

In [None]:
# load RF feature importance
# A dedicated name is used for this file so the notebook-wide `infilename` (read by the
# data-loading and save cells) is no longer overwritten as a side effect of this cell.
rf_imp_filename = 'hackathon2015_train_800k_tree500_split2_leaf5_features'
# read in chunks of 1000 rows, then stitch the chunks back into one frame
tp = pd.read_csv(filepath_or_buffer=''.join([datadir, rf_imp_filename, '.csv']),
                 sep=',', na_values=['.', ''],
                 header=0, iterator=True, chunksize=1000)
rf_car_imp = pd.concat(list(tp), ignore_index=True)
# set var name as index
rf_car_imp = rf_car_imp.set_index(keys='var_name')

In [None]:
# combine all the information
# Column-wise concat aligns the two frames on their index (the variable name).
# NOTE(review): this cell reassigns `var_info` from itself, so running it twice appends
# a second 'imp' column; re-run the cell that builds `var_info` first.
var_info = pd.concat(objs=[var_info, rf_car_imp], axis=1)
# DataFrame.sort(columns=...) was removed from pandas; sort_values(by=...) replaces it.
var_info = var_info.sort_values(by=['imp', 'mi', 'pcorr'], ascending=False)
var_info

In [None]:
# output the feature information
# Written to the current working directory; the index is kept as the first column.
var_info.to_csv('variable_info.csv', sep=',', na_rep='', header=True, index=True)

In [276]:
# see the variable binning
# Cross-tabulate one feature column (position 83) against the integer label, then add
# the share of each class within every distinct feature value.
x = train[:,83]
dfxy = pd.DataFrame({'x':x, 'y':y_train.astype(int)})
var_bin = pd.crosstab(index=dfxy['x'], columns=[dfxy['y']])
bin_total = var_bin[0] + var_bin[1]      # rows per distinct value of x
var_bin['P1'] = var_bin[1] / bin_total   # share labelled 1
var_bin['P0'] = var_bin[0] / bin_total   # share labelled 0
var_bin

y,0,1,P1,P0
x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-2.90957,4,3,0.428571,0.571429
-2.909552,3,0,0.0,1.0
-2.909534,1,0,0.0,1.0
-2.909517,1,0,0.0,1.0
-2.909499,1,0,0.0,1.0
-2.909481,4,0,0.0,1.0
-2.909446,2,0,0.0,1.0
-2.909411,4,0,0.0,1.0
-2.909393,1,0,0.0,1.0
-2.909358,1,0,0.0,1.0


In [272]:
# Display whatever table `var_bin` currently holds.
var_bin

y,0,1,P1,P0
x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-10.567689,26,45,0.633803,0.366197
0.094628,6583,1346,0.169757,0.830243
