## Model Factory

In [1]:
#libraries for plotting
import matplotlib.gridspec as gridspec
import seaborn as sns
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

In [15]:
#libraries for modeling
from multiprocessing.pool import ThreadPool
from pyspark import SparkContext
from pyspark.sql import SQLContext, SparkSession, Window, Row
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator
import pyspark.sql.functions as F
import itertools
from itertools import repeat
import pickle
import pyspark

from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics

class CreateBestModel:
    def __init__(self, algo, avgprecision, avgrecall, avgfscore, hyperparams, ootmodel, ootprecision, ootrecall, ootfscore):
        self.algo = algo
        self.gsPrecision = avgprecision
        self.gsFScore = avgfscore
        self.gsRecall = avgrecall
        self.hyperParams = hyperparams
        self.model = ootmodel
        self.ootPrecision = ootprecision
        self.ootFScore = ootfscore
        self.ootRecall = ootrecall

#function-based
def sample(df, sampling_method, ratio):

    notfraud = df.select('*').where(df.Class == 0.0)
    fraud = df.select('*').where(df.Class == 1.0)

    if sampling_method == "over":
        nrows = notfraud.select("Class").count()
        sample_size = int(nrows*ratio/(1-ratio))
        sampled = fraud.rdd.takeSample(True, sample_size, 46)
        fraud = sqlContext.createDataFrame(sampled)

    elif sampling_method == "under":
        nrows = fraud.select("Class").count()
        sample_size = int(nrows*(1-ratio)/ratio)
        sampled = notfraud.rdd.takeSample(False, sample_size, 46)
        notfraud = sqlContext.createDataFrame(sampled)
    else:
        return df
    
    sampled = fraud.union(notfraud)

    #shuffle undersampled dataframe
    nrows = sampled.select("Class").count()
    shuffled = sampled.rdd.takeSample(False, nrows, 46)
    shuffled_df = sqlContext.createDataFrame(shuffled)

    return shuffled_df

def generateParamGrid(*args):
    
    grid = list(itertools.product(*args))
    return grid

def generateClassifier(algo, params, features):

    ############################################################################
    #TODO: complete this section

    def lr(params,features):
        print(params)
        if len(params) > 2:
            lrClassifier = LogisticRegression(featuresCol = 'features',
                                          labelCol = 'Class',
                                          threshold=params[0],
                                           maxIter=params[1],
                                           weightCol=params[2])
                                          #regParam=params[2])
                                          #elasticNetParam=params[2])
        else:
            lrClassifier = LogisticRegression(featuresCol = 'features',
                                          labelCol = 'Class',
                                          threshold=params[0],
                                           maxIter=params[1])
        return lrClassifier


    def gbm(params,features):
        gbmClassifier = GBTClassifier(featuresCol = 'features',
                                      labelCol = 'Class',
                                      maxDepth = params[0],
                                      minInfoGain = params[1])
        return gbmClassifier

    def rf(params,features):
        rfClassifier = RandomForestClassifier(featuresCol='features',
                                              labelCol='Class',
                                              maxDepth=params[0],
                                              minInfoGain=params[1],
                                              numTrees=params[2])

        return rfClassifier

    def mlp(params,features):
        input_layers = len(features)
        layers = [input_layers, params[1], 2]
        print(layers)
        mlpClassifier = MultilayerPerceptronClassifier(featuresCol = 'features',
                                                       labelCol = 'Class',
                                                       maxIter = params[0],
                                                       layers = layers,
                                                       stepSize = params[2])
        return mlpClassifier

    def svm(params, features):
        svmClassifier = LinearSVC(featuresCol = 'features',
                         labelCol='Class', 
                         maxIter=params[0],
                         regParam=params[1],
                         tol =params[2]
                         )
        
        return svmClassifier

    def xg(params,features):
        return
    ############################################################################

    getClassifier = {
        'lr':lr,
        'gbm':gbm,
        'rf':rf,
        'mlp':mlp,
        'svm':svm,
        'xg':xg}

    return getClassifier[algo](params,features)

def crossValidate(df, folds, k, classifier, features, sampling_method, ratio, pool):

    def build(fold, df, classifier, features, sampling_method, ratio):

        validation = fold
        train = df.subtract(fold)

#         #add class weight
#         notfraud_count = train.select("Class").where(train.Class == 0.0).count()
#         total_count = train.select("Class").count()
#         balance_ratio = notfraud_count / total_count
#         train=train.withColumn("classWeights", F.when(train.Class == 1.0,balance_ratio).otherwise(1-balance_ratio))
        
        train = sample(train, sampling_method, ratio)
        fraud_count = train.select("Class").where(train.Class == 1.0).count()
        tot_count = train.select("Class").count()
        fraud_ratio = fraud_count / tot_count
        print("train: " + str(tot_count))
        print("fraud ratio: " + str(fraud_ratio))
        
        vectorAssembler = VectorAssembler(inputCols = features, outputCol = 'features')
        vector_train = vectorAssembler.transform(train)
        vector_validate = vectorAssembler.transform(validation)
        model = classifier.fit(vector_train)
        pred = model.transform(vector_validate)
        pos = pred.filter(pred.prediction == 1.0).count()
        if pos != 0:
            precision = pred.filter(pred.Class == pred.prediction).filter(pred.Class == 1.0).count() / pos
        else:
            precision = 0
        fraud = pred.filter(pred.Class == 1.0).count()
        if fraud != 0:
            recall = pred.filter(pred.Class == pred.prediction).filter(pred.Class == 1.0).count() / fraud
        else:
            recall = 0
        precision_recall = precision + recall
        if precision_recall != 0:
            f_score = 2 * precision * recall /(precision_recall)
        else:
            f_score = 0
        #print("\n precision, recall, f_score: " + str(precision) + ", " + str(recall) + ", " + str(f_score))
        return [precision, recall, f_score]

    #call multiprocessing here
    cvperformance = pool.map(lambda fold: build(fold, df, classifier, features, sampling_method, ratio), folds)

    #calculate metrics
    precision_sum = sum([x[0] for x in cvperformance])
    recall_sum = sum([x[1] for x in cvperformance])

    avg_precision = precision_sum/k
    avg_recall = recall_sum/k
    if avg_precision+avg_recall == 0:
        avg_fscore = 0
    else:
        avg_fscore = 2 * avg_precision * avg_recall /(avg_precision+avg_recall)
    return [avg_precision,avg_recall,avg_fscore]

def gridSearch(df, folds, k, algo, grid, features, sampling_method, ratio, pool):

    best_hyper = None
    best_precision = 0
    best_recall = 0
    best_fscore = 0

    for i in range(len(grid)):
        params = list(grid[i])
        print(params)
        classifier = generateClassifier(algo, params, features)
        modelPerformance = crossValidate(df, folds, k, classifier, features, sampling_method, ratio, pool)
        print(modelPerformance)
        if modelPerformance[2] > best_fscore:
            best_hyper = params
            best_precision = modelPerformance[0]
            best_recall = modelPerformance[1]
            best_fscore = modelPerformance[2]

    return best_hyper, best_precision, best_recall, best_fscore

def TrainTest(traindf,testdf,algo,features,params):
    vectorAssembler = VectorAssembler(inputCols = features, outputCol = 'features')
    classifier = generateClassifier(algo, params, features)
    vector_train = vectorAssembler.transform(traindf)
    vector_test = vectorAssembler.transform(testdf)
    m = classifier.fit(vector_train)
    pred = m.transform(vector_test)
    pos = pred.filter(pred.prediction == 1.0).count()
    if pos != 0:
        precision = pred.filter(pred.Class == pred.prediction).filter(pred.Class == 1.0).count() / pos
    else:
        precision = 0
    fraud = pred.filter(pred.Class == 1.0).count()
    if fraud != 0:
        recall = pred.filter(pred.Class == pred.prediction).filter(pred.Class == 1.0).count() / fraud
    else:
        recall = 0
    precision_recall = precision + recall
    if precision_recall != 0:
        f_score = 2 * precision * recall /(precision_recall)
    else:
        f_score = 0
    print("\n precision, recall, f_score: " + str(precision) + ", " + str(recall) + ", " + str(f_score))
    predictionAndLabels = pred.select('Class','prediction').rdd.map(lambda lp: (float(lp.prediction), lp.Class))
    metrics = MulticlassMetrics(predictionAndLabels)
    print(metrics.confusionMatrix().toArray())
    #evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="Class",metricName="areaUnderPR")
    #print("Area under Precision Recall Curve = %s" % evaluator.evaluate(pred))
    
    return m, precision, recall, f_score

def tune(df, k, stratification_flag, sampling_method, ratio, modelobj_flag, features, algo, *args, **kwargs):

    """
    Entry point of this suite of functions. returns cv metrics or a model object
    Example:
        >>> cv_hyper, cv_precision, cv_recall, cv_fscore = tune(df, 5, True,
        'None', 0, False, features, 'mlp', [100], [15], [0.03])
    :param df: data for modeling purpose
    :type df: : pyspark dataframe
    :param k: number of folds for cross validation
    :type k: int
    :param stratification_flag: specifies whether fraud ratio is fixed for each fold. True for stratification
    :type stratification_flag: boolean
    :param sampling_method: "over" for oversampling minority class, "under" for undersampling majority class, "None"
    :type sampling_method: str
    :param ratio: targeted fraud ratio after sampling.
    :type ratio: float
    :param modelobj_flag: specifies whether to return a model object for out of time test. if False, returns cv performancce
    :type modelobj_flag: float
    :param features: features for training
    :type features: list
    :param *args: a sequence of params for hyperparams tuning. ex. [values for params1], [values for params2],...
    :type *args: list
    :returns: model object or cross validation metrics depending on modelobj_flag
    """


    pool = ThreadPool(3)

    #reduce df dimenions to include features and class
    cols = features+['Class', 'index']
    df = df.select(cols)
    df = df.select(*(F.col(c).cast("double").alias(c) for c in df.columns))
    df.cache()
    #df.drop("index")
    
    ########################ClassWeights#################################
    if algo in ["lr", "svm"] and ["ClassWeigts"] in args:
        #add class weight
        balance_ratio = args[-1][0]
        df=df.withColumn("classWeights", when(df.Class == 1.0,balance_ratio).otherwise(1-balance_ratio))
    ########################ClassWeights#################################
    
    folds = []
    
    if stratification_flag == False:
        tot_count = df.select("Class").count()
        n = int(tot_count / k)

        #create sub-dataframe iteratively
        fold_start = 1
        fold_end = n
        for i in range(k):
            fold = df.select('*').where(df.index.between(fold_start, fold_end))
            folds.append(fold)
            fold_start = fold_end + 1
            fold_end = fold_start + n
            if i == k-2:
                end = tot_count
                
    if stratification_flag == True:
        fraud = df.select("*").where(df.Class == 1.0)
        #shuffle undersampled dataframe
        nrows = fraud.select("Class").count()
        shuffled = fraud.rdd.takeSample(False, nrows, 46)
        fraud = sqlContext.createDataFrame(shuffled)
        #add row index to dataframe
        fraud = fraud.withColumn('dummy', F.lit('7'))
        fraud = fraud.withColumn("temp_index", F.row_number().over(Window.partitionBy("dummy").orderBy("dummy")))
        fraud = fraud.drop('dummy')
        fraud_count = fraud.select("Class").count()
        each_fraud = int(fraud_count/k)

        notfraud = df.select("*").where(df.Class == 0.0)
        nrows = notfraud.select("Class").count()
        shuffled = notfraud.rdd.takeSample(False, nrows, 46)
        notfraud = sqlContext.createDataFrame(shuffled)
        #add row index to dataframe
        notfraud = notfraud.withColumn('dummy', F.lit('7'))
        notfraud = notfraud.withColumn("temp_index", F.row_number().over(Window.partitionBy("dummy").orderBy("dummy")))
        notfraud = notfraud.drop('dummy')
        notfraud_count = notfraud.select("Class").count()
        each_notfraud = int(notfraud_count/k)

        fraud_start = 1
        fraud_end = each_fraud
        notfraud_start = 1
        notfraud_end = each_notfraud

        for i in range(k):
            fraud_fold  = fraud.select('*').where(fraud.temp_index.between(fraud_start, fraud_end))
            notfraud_fold = notfraud.select('*').where(notfraud.temp_index.between(notfraud_start, notfraud_end))
            fold = fraud_fold.union(notfraud_fold).drop("temp_index")
            folds.append(fold)
            fraud_start = fraud_end + 1
            fraud_end = fraud_start + each_fraud
            notfraud_start = notfraud_end + 1
            notfraud_end = notfraud_start + each_notfraud
            if i == k-2:
                fraud_end = fraud_count
                notfraud_end = notfraud_count


    #generate hyperparam combo
    grid = generateParamGrid(*args)

    #conduct grid search:
    best_hyper, best_precision, best_recall, best_fscore = gridSearch(df, folds, k, algo, grid, features, sampling_method, ratio, pool)

    if modelobj_flag == True:
        #generate a model obj
        traindf = sample(df, sampling_method, ratio)
        testdf = sqlContext.read.csv("oot.csv", header = True)
        cols = features+['Class', 'index']
        testdf = testdf.select(cols)
        testdf = testdf.select(*(F.col(c).cast("double").alias(c) for c in testdf.columns))
        model, precision, recall, fscore = ootTest(traindf, testdf, algo,features,best_hyper)
        
        modelobj = CreateBestModel(algo, best_precision, best_recall, best_fscore, best_hyper, 
                                   model, precision, recall, fscore)
        return modelobj

    return best_hyper, best_precision, best_recall, best_fscore

def save(content, filename):

    pickle.dump(content, open(filename, "wb"))

def load(filename):

    content = pickle.load(open(filename, "rb"))
    return content

In [4]:
sc=pyspark.SparkContext.getOrCreate()
sqlContext = SQLContext(sc)
print(sc._conf.get('spark.executor.memory'))
trainData = sqlContext.read.csv("base_train.csv", header = True)
testData = sqlContext.read.csv("base_test.csv", header = True)
features = ['Amount', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 
            'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28']
oot = sqlContext.read.csv("oot.csv", header = True)
cols = features+['Class', 'index']
trainData = trainData.select(cols)
trainData = trainData.select(*(F.col(c).cast("double").alias(c) for c in trainData.columns))
testData = testData.select(cols)
testData = testData.select(*(F.col(c).cast("double").alias(c) for c in testData.columns))
oot = oot.select(cols)
oot = oot.select(*(F.col(c).cast("double").alias(c) for c in oot.columns))

15g


## 1. Base Model: Interpolation using default params [100, 15, 0.03]

In [16]:
features = ['Amount', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 
            'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28']
#maxIter, number of neurons in hidden layer, stepsize
params = [100, 15, 0.03]
m, p, r, f = TrainTest(trainData,testData,'mlp',features,params)

[29, 15, 2]

 precision, recall, f_score: 0.8735632183908046, 0.8351648351648352, 0.853932584269663
[[4.5294e+04 1.1000e+01]
 [1.5000e+01 7.6000e+01]]


## 2. Random vs stratified 5 fold cross validation using default params

### random cv

In [63]:
sqlContext.clearCache()

df = sqlContext.read.csv("base_train.csv", header = True)
features = ['Amount', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28']
hiddenlayer = int((len(features) + 2) / 2)
cv_hyper, cv_precision, cv_recall, cv_fscore = tune(df, 5, False, 'None', 0, False, features, 'mlp', [100], [hiddenlayer], [0.03])
print("avg precision:", cv_precision)
print("avg recall:", cv_recall)
print("avg f-score:", cv_fscore)

[100, 15, 0.03]
[29, 15, 2]
train: 152562
fraud ratio: 0.0014420366801693738
train: 152538
fraud ratio: 0.0017372720240202441
train: 152482
fraud ratio: 0.0016592122348867407
train: 152553
fraud ratio: 0.0018223174896593315
train: 152499
fraud ratio: 0.0016655846923586384
[0.8855566097406704, 0.7239147286821705, 0.7966186878497847]
avg precision: 0.8855566097406704
avg recall: 0.7239147286821705
avg f-score: 0.7966186878497847


### stratified 5 fold cross validation using default params

In [70]:
df = sqlContext.read.csv("base_train.csv", header = True)
features = ['Amount', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 
            'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 
            'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 
            'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28']
hiddenlayer = int((len(features) + 2) / 2)
cv_hyper, cv_precision, cv_recall, cv_fscore = tune(df, 5, True, 'None', 0, False, features, 'mlp', [100], [hiddenlayer], [0.03])
print("avg precision:", cv_precision)
print("avg recall:", cv_recall)
print("avg f-score:", cv_fscore)

[100, 15, 0.03]
[29, 15, 2]
train: 145266
fraud ratio: 0.0016934451282474908
train: 145268
fraud ratio: 0.001700305641985847
train: 145266
fraud ratio: 0.0016934451282474908
train: 145266
fraud ratio: 0.0016934451282474908
train: 145270
fraud ratio: 0.0017002822330832244
[0.898414964064903, 0.7662612374405076, 0.8270924537479238]
avg precision: 0.898414964064903
avg recall: 0.7662612374405076
avg f-score: 0.8270924537479238


## 3. Grid Search

In [5]:
df = sqlContext.read.csv("base_train.csv", header = True)
features = ['Amount', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 
            'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 
            'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 
            'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28']
hiddenlayer = int((len(features) + 2) / 2)
cv_hyper, cv_precision, cv_recall, cv_fscore = tune(df, 5, True, 'None', 0, False, features, 'mlp', [100,200],
                                                    [hiddenlayer-1, hiddenlayer, hiddenlayer+1], [0.03, 0.01])
print("gs precision:", cv_precision)
print("gs recall:", cv_recall)
print("gs f-score:", cv_fscore)
print("gs hyper:", cv_hyper)

[100, 14, 0.03]
[29, 14, 2]
train: 145268
fraud ratio: 0.001700305641985847
train: 145266
fraud ratio: 0.0016934451282474908
train: 145266
fraud ratio: 0.0016934451282474908
train: 145266
fraud ratio: 0.0016934451282474908
train: 145270
fraud ratio: 0.0017002822330832244
[0.8660056701164771, 0.763088313061872, 0.8112961102734016]
[100, 14, 0.01]
[29, 14, 2]
train: 145266
fraud ratio: 0.0016934451282474908
train: 145268
fraud ratio: 0.001700305641985847
train: 145266
fraud ratio: 0.0016934451282474908
train: 145266
fraud ratio: 0.0016934451282474908
train: 145270
fraud ratio: 0.0017002822330832244
[0.8669156802277007, 0.7727657324167108, 0.8171376773665636]
[100, 15, 0.03]
[29, 15, 2]
train: 145268
fraud ratio: 0.001700305641985847
train: 145266
fraud ratio: 0.0016934451282474908
train: 145266
fraud ratio: 0.0016934451282474908
train: 145266
fraud ratio: 0.0016934451282474908
train: 145270
fraud ratio: 0.0017002822330832244
[0.88490513356185, 0.7695399259650978, 0.8232002955263191]
[100

#### Use the supposedly best params for interpolation -> worse interpolation

In [24]:
features = ['Amount', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 
            'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28']
#maxIter, number of neurons in hidden layer, stepsize
params = [200, 14, 0.03]
m, p, r, f = TrainTest(trainData,testData,'mlp',features,params)

[29, 14, 2]

 precision, recall, f_score: 0.8765432098765432, 0.7802197802197802, 0.8255813953488372
[[4.5295e+04 1.0000e+01]
 [2.0000e+01 7.1000e+01]]


#### Use the second best params but search over stepSize

In [37]:
df = sqlContext.read.csv("base_train.csv", header = True)
features = ['Amount', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 
            'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 
            'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 
            'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28']
hiddenlayer = int((len(features) + 2) / 2)
cv_hyper, cv_precision, cv_recall, cv_fscore = tune(df, 5, True, 'None', 0, False, features, 'mlp', [100],
                                                    [15], [0.06, 0.03, 0.01, 0.005])
print("gs precision:", cv_precision)
print("gs recall:", cv_recall)
print("gs f-score:", cv_fscore)
print("gs hyper:", cv_hyper)

[100, 15, 0.06]
[29, 15, 2]
train: 145268
fraud ratio: 0.001700305641985847
train: 145266
fraud ratio: 0.0016934451282474908
train: 145266
fraud ratio: 0.0016934451282474908
train: 145266
fraud ratio: 0.0016934451282474908
train: 145270
fraud ratio: 0.0017002822330832244
[0.8850222104454775, 0.7434690639873083, 0.808093533859503]
[100, 15, 0.03]
[29, 15, 2]
train: 145266
fraud ratio: 0.0016934451282474908
train: 145266
fraud ratio: 0.0016934451282474908
train: 145268
fraud ratio: 0.001700305641985847
train: 145266
fraud ratio: 0.0016934451282474908
train: 145270
fraud ratio: 0.0017002822330832244
[0.8799228135698722, 0.7499735589635114, 0.8097678542353672]
[100, 15, 0.01]
[29, 15, 2]
train: 145268
fraud ratio: 0.001700305641985847
train: 145266
fraud ratio: 0.0016934451282474908
train: 145266
fraud ratio: 0.0016934451282474908
train: 145266
fraud ratio: 0.0016934451282474908
train: 145270
fraud ratio: 0.0017002822330832244
[0.8875697247648467, 0.7466419883659439, 0.8110293406749083]
[1

In [38]:
features = ['Amount', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 
            'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28']
#maxIter, number of neurons in hidden layer, stepsize
params = [100, 15, 0.005]
m, p, r, f = TrainTest(trainData,testData,'mlp',features,params)

[29, 15, 2]

 precision, recall, f_score: 0.8735632183908046, 0.8351648351648352, 0.853932584269663
[[4.5294e+04 1.1000e+01]
 [1.5000e+01 7.6000e+01]]


### Conclusion: [100,15,0.03]

## 4. Feature Selection

#### p-val(V13, V22, V23, V25, V26) = 0.442, 0.054, 0.147, 0.735, 0.141

#### bench mark cv: 0.898414964064903, 0.7662612374405076, 0.8270924537479238
#### bench mark interpolation: 0.8735632183908046, 0.8351648351648352, 0.853932584269663

### 4.1.1 remove feature "V25" based on T test and run cv with defaults params -> slightly better precision

In [91]:
df = sqlContext.read.csv("base_train.csv", header = True)
features = ['Amount', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 
            'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V26', 'V27', 'V28']
hiddenlayer = int((len(features) + 2) / 2)
cv_hyper, cv_precision, cv_recall, cv_fscore = tune(df, 5, True, 'None', 0, False, features, 'mlp', [100], [hiddenlayer], [0.03])
print("avg precision:", cv_precision)
print("avg recall:", cv_recall)
print("avg f-score:", cv_fscore)

[100, 15, 0.03]
[28, 15, 2]
train: 145266
fraud ratio: 0.0016934451282474908
train: 145268
fraud ratio: 0.001700305641985847
train: 145266
fraud ratio: 0.0016934451282474908
train: 145266
fraud ratio: 0.0016934451282474908
train: 145270
fraud ratio: 0.0017002822330832244
[0.9121402810595697, 0.7663141195134848, 0.8328924230700581]
avg precision: 0.9121402810595697
avg recall: 0.7663141195134848
avg f-score: 0.8328924230700581


### 4.1.2 Interpolation -> kept the same

In [42]:
features = ['Amount', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 
            'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V26', 'V27', 'V28']
#maxIter, number of neurons in hidden layer, stepsize
params = [100, 15, 0.03]
m, p, r, f = TrainTest(trainData,testData,'mlp',features,params)

[28, 15, 2]

 precision, recall, f_score: 0.8735632183908046, 0.8351648351648352, 0.853932584269663
[[4.5294e+04 1.1000e+01]
 [1.5000e+01 7.6000e+01]]


### 4.2.1 remove feature "V13", V25" based on T test and run cv with default params -> precision and recall trade-off comparing to 4.1.1

In [51]:
df = sqlContext.read.csv("base_train.csv", header = True)
features = ['Amount', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V14', 
            'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V26', 'V27', 'V28']
hiddenlayer = int((len(features) + 2) / 2)
cv_hyper, cv_precision, cv_recall, cv_fscore = tune(df, 5, True, 'None', 0, False, features, 'mlp', [100], [hiddenlayer], [0.03])
print("avg precision:", cv_precision)
print("avg recall:", cv_recall)
print("avg f-score:", cv_fscore)

[100, 14, 0.03]
[27, 14, 2]
train: 145268
fraud ratio: 0.001700305641985847
train: 145266
fraud ratio: 0.0016934451282474908
train: 145266
fraud ratio: 0.0016934451282474908
train: 145266
fraud ratio: 0.0016934451282474908
train: 145270
fraud ratio: 0.0017002822330832244
[0.9037135195958725, 0.7727657324167108, 0.8331255385678298]
avg precision: 0.9037135195958725
avg recall: 0.7727657324167108
avg f-score: 0.8331255385678298


### 4.2.2 Interpolation -> 0.006 F-score improvement

In [52]:
features = ['Amount', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V14', 
            'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V26', 'V27', 'V28']
#maxIter, number of neurons in hidden layer, stepsize
params = [100, 14, 0.03]
m, p, r, f = TrainTest(trainData,testData,'mlp',features,params)

[27, 14, 2]

 precision, recall, f_score: 0.875, 0.8461538461538461, 0.8603351955307262
[[4.5294e+04 1.1000e+01]
 [1.4000e+01 7.7000e+01]]


### 4.3.1 remove feature "V13", "V23", "V25" based on T test and run cv with default params -> precision and recall trade-off comparing to 4.1.1

In [56]:
df = sqlContext.read.csv("base_train.csv", header = True)
features = ['Amount', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V14', 
            'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V24', 'V26', 'V27', 'V28']
hiddenlayer = int((len(features) + 2) / 2)
cv_hyper, cv_precision, cv_recall, cv_fscore = tune(df, 5, True, 'None', 0, False, features, 'mlp', [100], [hiddenlayer], [0.03])
print("avg precision:", cv_precision)
print("avg recall:", cv_recall)
print("avg f-score:", cv_fscore)

[100, 14, 0.03]
[26, 14, 2]
train: 145268
fraud ratio: 0.001700305641985847
train: 145266
fraud ratio: 0.0016934451282474908
train: 145266
fraud ratio: 0.0016934451282474908
train: 145266
fraud ratio: 0.0016934451282474908
train: 145270
fraud ratio: 0.0017002822330832244
[0.9056869927458162, 0.7824960338445267, 0.8395967363320154]
avg precision: 0.9056869927458162
avg recall: 0.7824960338445267
avg f-score: 0.8395967363320154


### 4.3.2 Interpolation -> 0.001 F-score improvement

In [57]:
features = ['Amount', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V14', 
            'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V24', 'V26', 'V27', 'V28']
#maxIter, number of neurons in hidden layer, stepsize
params = [100, 14, 0.03]
m, p, r, f = TrainTest(trainData,testData,'mlp',features,params)

[26, 14, 2]

 precision, recall, f_score: 0.8651685393258427, 0.8461538461538461, 0.8555555555555556
[[4.5293e+04 1.2000e+01]
 [1.4000e+01 7.7000e+01]]


### Conclusion: Use all features

## 5. Random Sampling

#### bench mark cv: 0.898414964064903, 0.7662612374405076, 0.8270924537479238
#### bench mark interpolation: 0.8735632183908046, 0.8351648351648352, 0.853932584269663

#### under sampling

In [61]:
ratio_list = [0.003, 0.005]
df = sqlContext.read.csv("modeling.csv", header = True)
features = ['Amount', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 
            'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 
            'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 
            'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28']
for ratio in ratio_list:
    tune(df, 5, True, 'under', ratio, False, features, 'mlp', [100], [15], [0.03])

[100, 15, 0.03]
[29, 15, 2]
train: 106333
fraud ratio: 0.0030000094044181956
train: 106666
fraud ratio: 0.003000018750117188
train: 106333
fraud ratio: 0.0030000094044181956
train: 106333
fraud ratio: 0.0030000094044181956
train: 106333
fraud ratio: 0.0030000094044181956
[0.8644870461818176, 0.7970569620253164, 0.8294037537813084]
[100, 15, 0.03]
[29, 15, 2]
train: 63799
fraud ratio: 0.0050000783711343436
train: 63799
fraud ratio: 0.0050000783711343436
train: 63999
fraud ratio: 0.005000078126220722
train: 63799
fraud ratio: 0.0050000783711343436
train: 63799
fraud ratio: 0.0050000783711343436
[0.8391794247215933, 0.8170569620253165, 0.8279704477498036]


In [65]:
trainData = sample(trainData, "under", 0.003)

features = ['Amount', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V14', 
            'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V24', 'V26', 'V27', 'V28']
#maxIter, number of neurons in hidden layer, stepsize
params = [100, 14, 0.03]
m, p, r, f = TrainTest(trainData,testData,'mlp',features,params)

[26, 14, 2]

 precision, recall, f_score: 0.8588235294117647, 0.8021978021978022, 0.8295454545454546
[[4.5293e+04 1.2000e+01]
 [1.8000e+01 7.3000e+01]]


#### over sampling

In [62]:
ratio_list = [0.003, 0.005]
df = sqlContext.read.csv("modeling.csv", header = True)
features = ['Amount', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 
            'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 
            'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 
            'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28']
for ratio in ratio_list:
    tune(df, 5, True, 'over', ratio, False, features, 'mlp', [100], [15], [0.03])

[100, 15, 0.03]
[29, 15, 2]
train: 181809
fraud ratio: 0.0029976513813947604
train: 181810
fraud ratio: 0.0029976348935702108
train: 181809
fraud ratio: 0.0029976513813947604
train: 181809
fraud ratio: 0.0029976513813947604
train: 181812
fraud ratio: 0.002997601918465228
[0.8672244847587314, 0.8020569620253164, 0.8333686772588232]
[100, 15, 0.03]
[29, 15, 2]
train: 182174
fraud ratio: 0.004995224345954966
train: 182175
fraud ratio: 0.004995196926032661
train: 182174
fraud ratio: 0.004995224345954966
train: 182174
fraud ratio: 0.004995224345954966
train: 182177
fraud ratio: 0.004995142087091125
[0.8195489129858184, 0.8095569620253166, 0.8145222952110646]


In [66]:
trainData = sample(trainData, "over", 0.003)

features = ['Amount', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V14', 
            'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V24', 'V26', 'V27', 'V28']
#maxIter, number of neurons in hidden layer, stepsize
params = [100, 14, 0.03]
m, p, r, f = TrainTest(trainData,testData,'mlp',features,params)

[26, 14, 2]

 precision, recall, f_score: 0.872093023255814, 0.8241758241758241, 0.847457627118644
[[4.5294e+04 1.1000e+01]
 [1.6000e+01 7.5000e+01]]


### Conclusion: Use original ratio; no sampling

## 6. OOT

In [20]:
features = ['Amount', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 
            'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 
            'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 
            'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28']
#maxIter, number of neurons in hidden layer, stepsize
params = [100, 15, 0.03]
m, p, r, f = TrainTest(trainData,oot,'mlp',features,params)

[29, 15, 2]

 precision, recall, f_score: 0.9310344827586207, 0.7297297297297297, 0.8181818181818181
[[5.6668e+04 4.0000e+00]
 [2.0000e+01 5.4000e+01]]


In [7]:
sc.stop()