In [1]:
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import pyspark.sql.functions as fn
from pyspark.sql.functions import *
from pyspark.ml.feature import *
from pyspark.ml.classification import *
from pyspark.ml.evaluation import MulticlassClassificationEvaluator,BinaryClassificationEvaluator

# Read the data file into the variable 'raw_dataset' using either the SparkSession cell or the SparkContext cell below, then save the notebook as: final_year[#]

# Then run all the commands after "Preprocessing" header

### Read in Data

###### SparkSession

In [1]:
# Start (or reuse) a SparkSession and build the SQLContext used by the loading cells.
# NOTE(review): this is the only visible definition of `sqlContext`, which the
# "SparkContext" loading cell further down also uses -- so this cell appears to be
# required for both loading paths, despite the original "do not run" wording.
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession, SQLContext

spark = SparkSession.builder.getOrCreate()

# SQLContext's first argument is a SparkContext, not a SparkSession. Pass the session's
# underlying context and hand the session itself over explicitly.
sqlContext = SQLContext(spark.sparkContext, sparkSession=spark)

In [None]:
# SparkSession loading path -- skip this cell when the SparkContext path below is used.
# The path is a placeholder: substitute the real file for the year being processed.
raw_dataset = spark.read.csv(
    r'filepath/year#.csv',
    header=True,
)

###### SparkContext

In [2]:
#Load the CSV through sqlContext (spark-csv data source) with a header row and schema inference
#--Do not run if the SparkSession loading cell above was used
#NOTE(review): `sqlContext` is only defined in the SparkSession setup cell as far as this
#notebook shows, so that setup cell has to run before this one -- confirm intended workflow
raw_dataset = sqlContext.read.load('data/csv_result-1year.csv',
                                  format='com.databricks.spark.csv',
                                  header='true',
                                  inferSchema='true')

In [None]:
raw_dataset.describe().toPandas().transpose()

# Preprocessing

### Cast to Type Double

In [3]:
# Cast every column of the raw dataset to double, keeping the original column names.
cols = raw_dataset.columns
typecasted_dataset = raw_dataset.select(*(col(c).cast("double").alias(c) for c in cols))

# Duplicate checks: total rows vs. fully distinct rows.
print('Count of rows: {0}'.format(typecasted_dataset.count()))
print('Count of distinct rows: {0}'.format(typecasted_dataset.distinct().count()))

# Distinct rows when the identifier and the target are ignored.
# The previous filter was `c != 'id' or c != 'class'`, which is true for every column
# name, so no column was ever excluded and this line just repeated the count above.
print('Count of distinct rows (excluding id and class): {0}'.format(
    typecasted_dataset.select([c for c in cols if c not in ('id', 'class')]).distinct().count()))

Count of rows: 7027
Count of distinct rows: 7027
Count if distinct ids: 7027


In [None]:
typecasted_dataset.dtypes

### Impute Missing Values

In [4]:
from pyspark.ml.feature import Imputer
# Impute every column except the first and the last, writing each result to a new
# "<name>_imputed" column (the Imputer's default strategy is the mean).
# NOTE(review): the imputer is fit on the full dataset, i.e. before the train/test split
# done later in this notebook -- unlike the scaler and PCA, which are fit on train only.
input_cols = typecasted_dataset.columns[1:-1]
imputer = Imputer(inputCols=input_cols, outputCols=[s + "_imputed" for s in input_cols])
imp_model = imputer.fit(typecasted_dataset)
imp_df = imp_model.transform(typecasted_dataset)

# Keep the last original column plus the imputed columns that transform() appended.
# The offset is derived from the input schema instead of the hard-coded 65, so the cell
# still selects the right columns if the number of attributes changes.
first_kept = len(typecasted_dataset.columns) - 1
imp_df = imp_df.select(imp_df.columns[first_kept:])
imp_df.columns

['class',
 'Attr23_imputed',
 'Attr20_imputed',
 'Attr35_imputed',
 'Attr18_imputed',
 'Attr9_imputed',
 'Attr17_imputed',
 'Attr25_imputed',
 'Attr39_imputed',
 'Attr38_imputed',
 'Attr15_imputed',
 'Attr50_imputed',
 'Attr64_imputed',
 'Attr13_imputed',
 'Attr42_imputed',
 'Attr11_imputed',
 'Attr21_imputed',
 'Attr52_imputed',
 'Attr56_imputed',
 'Attr14_imputed',
 'Attr8_imputed',
 'Attr45_imputed',
 'Attr26_imputed',
 'Attr19_imputed',
 'Attr44_imputed',
 'Attr58_imputed',
 'Attr34_imputed',
 'Attr55_imputed',
 'Attr36_imputed',
 'Attr61_imputed',
 'Attr3_imputed',
 'Attr40_imputed',
 'Attr48_imputed',
 'Attr46_imputed',
 'Attr29_imputed',
 'Attr60_imputed',
 'Attr54_imputed',
 'Attr31_imputed',
 'Attr43_imputed',
 'Attr16_imputed',
 'Attr53_imputed',
 'Attr22_imputed',
 'Attr12_imputed',
 'Attr41_imputed',
 'Attr27_imputed',
 'Attr32_imputed',
 'Attr28_imputed',
 'Attr2_imputed',
 'Attr63_imputed',
 'Attr5_imputed',
 'Attr4_imputed',
 'Attr59_imputed',
 'Attr62_imputed',
 'Attr30

In [None]:
# Per-column count of values that are still null or NaN after imputation.
imp_df.select(
    [
        count(when(isnan(name) | col(name).isNull(), name)).alias(name)
        for name in imp_df.columns
    ]
).show()

In [None]:
imp_df.describe().toPandas().transpose()

### Scaling

In [6]:
#Rename the target column 'class' to 'label', the column name the estimators below are
#configured with (labelCol = 'label') and that printMulticlassMetrics reads
imp_df = imp_df.withColumnRenamed('class','label')

In [7]:
from pyspark.ml.feature import VectorAssembler

# Every column after the first one is packed into a single 'features' vector column;
# only that vector and the label are kept.
assemblerInputs = imp_df.columns[1:]
assembler = VectorAssembler(outputCol="features", inputCols=assemblerInputs)
feature_df = assembler.transform(imp_df).select('features', 'label')

In [8]:
# Split 70/30 before scaling so the scaler below can be fit on the training rows only.
# The fixed seed keeps the split repeatable.
train, test = feature_df.randomSplit([0.7, 0.3], seed=1)

In [10]:
# Class balance of the training split, then of the full dataset.
# show() prints the table itself and returns None, so wrapping it in print() only added
# a stray "None" line after each table.
train.groupBy('label').count().show()
feature_df.groupBy('label').count().show()

+-----+-----+
|label|count|
+-----+-----+
|  0.0| 4716|
|  1.0|  180|
+-----+-----+

None
+-----+-----+
|label|count|
+-----+-----+
|  0.0| 6756|
|  1.0|  271|
+-----+-----+

None


In [9]:
from pyspark.ml.feature import StandardScaler
# Fit the scaler on the training split only -- never on test.
scaler = StandardScaler(outputCol='scaledFeatures', inputCol='features')
scalerModel = scaler.fit(train)

In [10]:
# Apply the train-fitted scaler to both splits, keeping the label and the scaled vector.
sc_train, sc_test = (
    scalerModel.transform(split).select('label', 'scaledFeatures')
    for split in (train, test)
)

In [None]:
sc_train.dtypes

In [66]:
sc_train

DataFrame[label: double, scaledFeatures: vector]

### Dimensionality Reduction (PCA10)

In [11]:
from pyspark.ml.feature import PCA

In [12]:
# Start with 10 principal components; other sizes (probably fewer, e.g. 3, 5 and 7)
# should be tried as well to see which works best.
# The projection is fit on the scaled training split only -- never on test.
pca = PCA(outputCol='pcaFeatures', inputCol='scaledFeatures', k=10)
pcaModel = pca.fit(sc_train)

In [13]:
# Project both splits with the train-fitted PCA model and expose the components under
# the column name 'features'.
pca_train, pca_test = (
    pcaModel.transform(split)
    .select('label', 'pcaFeatures')
    .withColumnRenamed('pcaFeatures', 'features')
    for split in (sc_train, sc_test)
)
pca_train.columns

['label', 'features']

In [None]:
pca_train.dtypes

### Dimensionality Reduction (PCA5)

In [14]:
from pyspark.ml.feature import PCA

In [15]:
#Second configuration: keep 5 principal components, for comparison with the 10-component PCA fitted above
pca5 = PCA(k = 5,inputCol = 'scaledFeatures', outputCol = 'pcaFeatures')
pca5Model = pca5.fit(sc_train)

#only fit PCA model on train, not test

In [16]:
# Project both splits with the train-fitted 5-component PCA model and expose the
# components under the column name 'features'.
pca5_train, pca5_test = (
    pca5Model.transform(split)
    .select('label', 'pcaFeatures')
    .withColumnRenamed('pcaFeatures', 'features')
    for split in (sc_train, sc_test)
)
pca5_train.columns

['label', 'features']

In [None]:
pca5_train.dtypes

In [17]:
def printMulticlassMetrics(predictions):
    """
    Print a confusion matrix, F1 score, accuracy, precision, recall and AUC for a
    binary classifier.

    Parameters
    ----------
    predictions : Spark DataFrame
        Must contain a column named 'label' with the true class (0/1) and a column
        named 'prediction' with the predicted class (0/1). The same DataFrame is
        handed to a default-configured BinaryClassificationEvaluator for the AUC
        line, so it must also carry whatever columns that evaluator reads.

    Returns
    -------
    None
        Everything is written to stdout. A ratio whose denominator is zero (for
        example precision when no row was predicted positive) is printed as
        "error" instead of a number.
    """

    def _print_ratio(title, numerator, denominator):
        # Print the heading, then the ratio; only a zero denominator is tolerated.
        # Any other exception is a real problem and is allowed to propagate.
        print(title)
        try:
            print(numerator / denominator)
        except ZeroDivisionError:
            print("error")

    # One aggregation yields all four confusion-matrix cells; previously each cell
    # was a separate filter + count() job over the predictions.
    cell_counts = {
        (row['label'], row['prediction']): row['count']
        for row in predictions.groupBy('label', 'prediction').count().collect()
    }
    # Cells with no rows are absent from the aggregation, hence the default of 0.
    # Locals are spelled out (not tp/fn/...) so `fn` does not shadow the module-level
    # alias for pyspark.sql.functions.
    true_pos = cell_counts.get((1, 1), 0)
    true_neg = cell_counts.get((0, 0), 0)
    false_pos = cell_counts.get((0, 1), 0)
    false_neg = cell_counts.get((1, 0), 0)

    #print confusion matrix
    print("\n")
    print("CONFUSION MATRIX\n\tPredicted\n\t  1\t0\n\t1|",true_pos,"\t",false_neg,"|\nActual\t0|",false_pos,"\t",true_neg,"|")
    print("\n")

    _print_ratio('F1 Score:', 2 * true_pos, (2 * true_pos) + false_pos + false_neg)
    _print_ratio('\nAccuracy:', true_pos + true_neg,
                 true_pos + false_neg + true_neg + false_pos)
    _print_ratio('\nPrecision:', true_pos, true_pos + false_pos)
    _print_ratio('\nRecall:', true_pos, true_pos + false_neg)

    #print auc
    evaluator = BinaryClassificationEvaluator()
    print('\nAUC:')
    print(evaluator.evaluate(predictions))

    print("\n****************************************************************")

# Models

### Logistic Regression

In [18]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
from spark_stratifier import StratifiedCrossValidator
from pyspark.ml.tuning import ParamGridBuilder

In [19]:
# Logistic regression on the 'features' column, with a 3 x 3 grid over the
# regularisation strength and the elastic-net mixing parameter.
lr = LogisticRegression(featuresCol='features', labelCol='label')
lr_paramGrid = (
    ParamGridBuilder()
    .addGrid(lr.regParam, [1.0, 0.1, 0.01])
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
    .build()
)

In [20]:
# 10-fold stratified cross-validation over the grid, fit on the PCA10 training split.
# NOTE(review): the evaluator is built with its defaults; with the class imbalance shown
# in the label counts above, the selection metric may be dominated by the majority
# class -- confirm this is the intended model-selection criterion.
lr_CrossVal = StratifiedCrossValidator(
    estimator=lr,
    estimatorParamMaps=lr_paramGrid,
    evaluator=MulticlassClassificationEvaluator(),
    numFolds=10,
)
lr_cvModel = lr_CrossVal.fit(pca_train)

In [21]:
# Score both PCA10 splits with the cross-validated model.
lr_train_predictions, lr_test_predictions = (
    lr_cvModel.transform(split) for split in (pca_train, pca_test)
)

In [22]:
# Report the same metric set for the train and then the test predictions.
for heading, scored in (
    ("Logistic Regression Train Set Metrics", lr_train_predictions),
    ("\n\nLogistic Regression Test Set Metrics", lr_test_predictions),
):
    print(heading)
    printMulticlassMetrics(scored)

Logistic Regression Train Set Metrics


CONFUSION MATRIX
	Predicted
	  1	0
	1| 2 	 178 |
Actual	0| 0 	 4716 |


F1 Score:
0.02197802197802198

Accuracy:
0.9636437908496732

Precision:
1.0

Recall:
0.011111111111111112

AUC:
0.5816699651305183

****************************************************************


Logistic Regression Test Set Metrics


CONFUSION MATRIX
	Predicted
	  1	0
	1| 0 	 91 |
Actual	0| 2 	 2038 |


F1 Score:
0.0

Accuracy:
0.9563585171281088

Precision:
0.0

Recall:
0.0

AUC:
0.5991596638655463

****************************************************************


In [23]:
# Show the hyper-parameters of the model that cross-validation selected; they are read
# from the underlying Java object of the fitted model.
lr_bestModel = lr_cvModel.bestModel
lr_best_java = lr_bestModel._java_obj
print("Best Metrics")
print("Regularization: ", lr_best_java.getRegParam())
print("Elastic Net: ", lr_best_java.getElasticNetParam())

Best Metrics
Regularization:  0.01
Elastic Net:  0.0


In [24]:
lr_bestModel.coefficients

DenseVector([0.0043, -0.0263, -0.0315, 0.0237, -0.0192, 0.0444, -0.0902, 0.0385, -0.0503, 0.0406])

### Random Forest

In [25]:
from pyspark.ml.classification import RandomForestClassifier

In [26]:
rf = RandomForestClassifier(labelCol = 'label',featuresCol = 'features',numTrees = 200,seed = 1)
rf_paramGrid = ParamGridBuilder().addGrid(rf.impurity,['gini','entropy']).addGrid(rf.maxDepth,[2,5,10]).build()

In [27]:
rf_CrossVal = StratifiedCrossValidator(estimator = rf, estimatorParamMaps = rf_paramGrid,evaluator = MulticlassClassificationEvaluator(),numFolds = 10)
rf_cvModel = rf_CrossVal.fit(pca_train)

In [28]:
rf_train_predictions = rf_cvModel.transform(pca_train)
rf_test_predictions = rf_cvModel.transform(pca_test)

In [29]:
print("Random Forest Train Set Metrics")
printMulticlassMetrics(rf_train_predictions)
print("\n\nRandom Forest Test Set Metrics")
printMulticlassMetrics(rf_test_predictions)

Random Forest Train Set Metrics


CONFUSION MATRIX
	Predicted
	  1	0
	1| 31 	 149 |
Actual	0| 0 	 4716 |


F1 Score:
0.2938388625592417

Accuracy:
0.9695669934640523

Precision:
1.0

Recall:
0.17222222222222222

AUC:
0.9988143200452361

****************************************************************


Random Forest Test Set Metrics


CONFUSION MATRIX
	Predicted
	  1	0
	1| 0 	 91 |
Actual	0| 0 	 2040 |


F1 Score:
0.0

Accuracy:
0.9572970436414828

Precision:
error

Recall:
0.0

AUC:
0.6817146089204914

****************************************************************


In [30]:
rf_bestModel = rf_cvModel.bestModel
print("Best Metrics")
print("Impurity: ",rf_bestModel._java_obj.getImpurity())
print("Max Depth: ",rf_bestModel._java_obj.getMaxDepth())

Best Metrics
Impurity:  entropy
Max Depth:  10


In [31]:
rf_bestModel.featureImportances

SparseVector(10, {0: 0.1084, 1: 0.1022, 2: 0.1341, 3: 0.0891, 4: 0.0955, 5: 0.1116, 6: 0.0889, 7: 0.0935, 8: 0.0937, 9: 0.083})

### Gradient Boosted Tree

In [32]:
from pyspark.ml.classification import GBTClassifier

In [33]:
gbt = GBTClassifier(labelCol = 'label',featuresCol = 'features',maxIter = 100,seed = 1)
gbt_paramGrid = ParamGridBuilder().addGrid(gbt.maxDepth,[2,5,10]).build()

In [34]:
gbt_CrossVal = StratifiedCrossValidator(estimator = gbt, estimatorParamMaps = gbt_paramGrid,evaluator = MulticlassClassificationEvaluator(),numFolds = 10)
gbt_cvModel = gbt_CrossVal.fit(pca_train)

In [35]:
gbt_train_predictions = gbt_cvModel.transform(pca_train)
gbt_test_predictions = gbt_cvModel.transform(pca_test)

In [36]:
print("GradientBoostedTree Train Set Metrics")
printMulticlassMetrics(gbt_train_predictions)
print("\n\nGradientBoostedTree Test Set Metrics")
printMulticlassMetrics(gbt_test_predictions)

GradientBoostedTree Train Set Metrics


CONFUSION MATRIX
	Predicted
	  1	0
	1| 2 	 178 |
Actual	0| 1 	 4715 |


F1 Score:
0.02185792349726776

Accuracy:
0.9634395424836601

Precision:
0.6666666666666666

Recall:
0.011111111111111112

AUC:
0.7779150174347343

****************************************************************


GradientBoostedTree Test Set Metrics


CONFUSION MATRIX
	Predicted
	  1	0
	1| 0 	 91 |
Actual	0| 0 	 2040 |


F1 Score:
0.0

Accuracy:
0.9572970436414828

Precision:
error

Recall:
0.0

AUC:
0.6664188752424047

****************************************************************


In [37]:
gbt_bestModel = gbt_cvModel.bestModel
print("Best Metrics")
print("Max Depth: ",gbt_bestModel._java_obj.getMaxDepth())

Best Metrics
Max Depth:  2


In [38]:
gbt_bestModel.featureImportances

SparseVector(10, {0: 0.0657, 1: 0.0832, 2: 0.2387, 3: 0.0539, 4: 0.1313, 5: 0.0928, 6: 0.0643, 7: 0.0773, 8: 0.1623, 9: 0.0305})

### Multilayer Perceptron

In [53]:
from pyspark.ml.classification import MultilayerPerceptronClassifier

In [54]:
mlp = MultilayerPerceptronClassifier(labelCol = 'label',featuresCol = 'features',maxIter = 100,seed = 1)
mlp_paramGrid = ParamGridBuilder().addGrid(mlp.solver,['l-bfgs','gd']).addGrid(mlp.layers,[[10,5,2],[10,5,5,2],[10,5,10,2]]).build()
#mlp.layers should be in the following format: [# of input columns, size of hidden layer 1, ...size of hidden layer N..., number of classes]

In [55]:
mlp_CrossVal = StratifiedCrossValidator(estimator = mlp, estimatorParamMaps = mlp_paramGrid,evaluator = MulticlassClassificationEvaluator(),numFolds = 10)
mlp_cvModel = mlp_CrossVal.fit(pca_train)

In [56]:
mlp_train_predictions = mlp_cvModel.transform(pca_train)
mlp_test_predictions = mlp_cvModel.transform(pca_test)

In [57]:
print("MultilayerPerceptron Train Set Metrics")
printMulticlassMetrics(mlp_train_predictions)
print("\n\nMultilayerPerceptron Test Set Metrics")
printMulticlassMetrics(mlp_test_predictions)

MultilayerPerceptron Train Set Metrics


CONFUSION MATRIX
	Predicted
	  1	0
	1| 0 	 180 |
Actual	0| 0 	 4716 |


F1 Score:
0.0

Accuracy:
0.9632352941176471

Precision:
error

Recall:
0.0

AUC:
0.4558936481010234

****************************************************************


MultilayerPerceptron Test Set Metrics


CONFUSION MATRIX
	Predicted
	  1	0
	1| 0 	 91 |
Actual	0| 0 	 2040 |


F1 Score:
0.0

Accuracy:
0.9572970436414828

Precision:
error

Recall:
0.0

AUC:
0.42961646196940323

****************************************************************


In [58]:
mlp_bestModel = mlp_cvModel.bestModel
print("Best Metrics")
#print("Solver: ",mlp_bestModel._java_obj.getSolver()) #errors out
print("Layers: ",mlp_bestModel.layers)

Best Metrics
Layers:  [10, 5, 5, 2]


# Models, PCA5

### Logistic Regression

In [39]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from spark_stratifier import StratifiedCrossValidator
from pyspark.ml.tuning import ParamGridBuilder

In [40]:
lr5 = LogisticRegression(labelCol = 'label',featuresCol = 'features')
lr5_paramGrid = ParamGridBuilder().addGrid(lr5.regParam,[1.0,0.1,0.01]).addGrid(lr5.elasticNetParam,[0.0,0.5,1.0]).build()

In [41]:
lr5_CrossVal = StratifiedCrossValidator(estimator = lr5, estimatorParamMaps = lr5_paramGrid,evaluator = MulticlassClassificationEvaluator(),numFolds = 10)
lr5_cvModel = lr5_CrossVal.fit(pca5_train)

In [42]:
lr5_train_predictions = lr5_cvModel.transform(pca5_train)
lr5_test_predictions = lr5_cvModel.transform(pca5_test)

In [43]:
print("Logistic Regression Train Set Metrics")
printMulticlassMetrics(lr5_train_predictions)
print("\n\nLogistic Regression Test Set Metrics")
printMulticlassMetrics(lr5_test_predictions)

Logistic Regression Train Set Metrics


CONFUSION MATRIX
	Predicted
	  1	0
	1| 0 	 180 |
Actual	0| 0 	 4716 |


F1 Score:
0.0

Accuracy:
0.9632352941176471

Precision:
error

Recall:
0.0

AUC:
0.5702619922721708

****************************************************************


Logistic Regression Test Set Metrics


CONFUSION MATRIX
	Predicted
	  1	0
	1| 0 	 91 |
Actual	0| 0 	 2040 |


F1 Score:
0.0

Accuracy:
0.9572970436414828

Precision:
error

Recall:
0.0

AUC:
0.5797565179918122

****************************************************************


In [44]:
lr5_bestModel = lr5_cvModel.bestModel
print("Best Metrics")
print("Regularization: ",lr5_bestModel._java_obj.getRegParam())
print("Elastic Net: ",lr5_bestModel._java_obj.getElasticNetParam())

Best Metrics
Regularization:  1.0
Elastic Net:  0.0


In [45]:
lr5_bestModel.coefficients

DenseVector([0.0002, -0.002, 0.0005, 0.0024, -0.0047])

### Random Forest

In [46]:
from pyspark.ml.classification import RandomForestClassifier

In [47]:
rf5 = RandomForestClassifier(labelCol = 'label',featuresCol = 'features',numTrees = 200,seed = 1)
rf5_paramGrid = ParamGridBuilder().addGrid(rf5.impurity,['gini','entropy']).addGrid(rf5.maxDepth,[2,5,10]).build()

In [48]:
rf5_CrossVal = StratifiedCrossValidator(estimator = rf5, estimatorParamMaps = rf5_paramGrid,evaluator = MulticlassClassificationEvaluator(),numFolds = 10)
rf5_cvModel = rf5_CrossVal.fit(pca5_train)

In [49]:
rf5_train_predictions = rf5_cvModel.transform(pca5_train)
rf5_test_predictions = rf5_cvModel.transform(pca5_test)

In [50]:
print("Random Forest Train Set Metrics")
printMulticlassMetrics(rf5_train_predictions)
print("\n\nRandom Forest Test Set Metrics")
printMulticlassMetrics(rf5_test_predictions)

Random Forest Train Set Metrics


CONFUSION MATRIX
	Predicted
	  1	0
	1| 0 	 180 |
Actual	0| 0 	 4716 |


F1 Score:
0.0

Accuracy:
0.9632352941176471

Precision:
error

Recall:
0.0

AUC:
0.56474531146923

****************************************************************


Random Forest Test Set Metrics


CONFUSION MATRIX
	Predicted
	  1	0
	1| 0 	 91 |
Actual	0| 0 	 2040 |


F1 Score:
0.0

Accuracy:
0.9572970436414828

Precision:
error

Recall:
0.0

AUC:
0.572382029734971

****************************************************************


In [51]:
rf5_bestModel = rf5_cvModel.bestModel
print("Best Metrics")
print("Impurity: ",rf5_bestModel._java_obj.getImpurity())
print("Max Depth: ",rf5_bestModel._java_obj.getMaxDepth())

Best Metrics
Impurity:  gini
Max Depth:  2


In [52]:
rf5_bestModel.featureImportances

SparseVector(5, {0: 0.147, 1: 0.1554, 2: 0.3129, 3: 0.2559, 4: 0.1287})

### Gradient Boosted Tree

In [59]:
from pyspark.ml.classification import GBTClassifier

In [60]:
gbt5 = GBTClassifier(labelCol = 'label',featuresCol = 'features',maxIter = 100,seed = 1)
gbt5_paramGrid = ParamGridBuilder().addGrid(gbt5.maxDepth,[2,5,10]).build()

In [61]:
gbt5_CrossVal = StratifiedCrossValidator(estimator = gbt5, estimatorParamMaps = gbt5_paramGrid,evaluator = MulticlassClassificationEvaluator(),numFolds = 10)
gbt5_cvModel = gbt5_CrossVal.fit(pca5_train)

In [62]:
gbt5_train_predictions = gbt5_cvModel.transform(pca5_train)
gbt5_test_predictions = gbt5_cvModel.transform(pca5_test)

In [63]:
print("GradientBoostedTree Train Set Metrics")
printMulticlassMetrics(gbt5_train_predictions)
print("\n\nGradientBoostedTree Test Set Metrics")
printMulticlassMetrics(gbt5_test_predictions)

GradientBoostedTree Train Set Metrics


CONFUSION MATRIX
	Predicted
	  1	0
	1| 0 	 180 |
Actual	0| 0 	 4716 |


F1 Score:
0.0

Accuracy:
0.9632352941176471

Precision:
error

Recall:
0.0

AUC:
0.7613473046838174

****************************************************************


GradientBoostedTree Test Set Metrics


CONFUSION MATRIX
	Predicted
	  1	0
	1| 0 	 91 |
Actual	0| 0 	 2040 |


F1 Score:
0.0

Accuracy:
0.9572970436414828

Precision:
error

Recall:
0.0

AUC:
0.665826330532213

****************************************************************


In [64]:
gbt5_bestModel = gbt5_cvModel.bestModel
print("Best Metrics")
print("Max Depth: ",gbt5_bestModel._java_obj.getMaxDepth())

Best Metrics
Max Depth:  2


In [65]:
gbt5_bestModel.featureImportances

SparseVector(5, {0: 0.1725, 1: 0.0942, 2: 0.3217, 3: 0.2339, 4: 0.1776})

### Multilayer Perceptron

In [67]:
from pyspark.ml.classification import MultilayerPerceptronClassifier

In [68]:
mlp5 = MultilayerPerceptronClassifier(labelCol = 'label',featuresCol = 'features',maxIter = 100,seed = 1)
mlp5_paramGrid = ParamGridBuilder().addGrid(mlp5.solver,['l-bfgs','gd']).addGrid(mlp5.layers,[[5,5,2],[5,5,5,2],[5,5,10,2]]).build()

In [69]:
mlp5_CrossVal = StratifiedCrossValidator(estimator = mlp5, estimatorParamMaps = mlp5_paramGrid,evaluator = MulticlassClassificationEvaluator(),numFolds = 10)
mlp5_cvModel = mlp5_CrossVal.fit(pca5_train)

In [70]:
mlp5_train_predictions = mlp5_cvModel.transform(pca5_train)
mlp5_test_predictions = mlp5_cvModel.transform(pca5_test)

In [71]:
print("MultilayerPerceptron Train Set Metrics")
printMulticlassMetrics(mlp5_train_predictions)
print("\n\nMultilayerPerceptron Test Set Metrics")
printMulticlassMetrics(mlp5_test_predictions)

MultilayerPerceptron Train Set Metrics


CONFUSION MATRIX
	Predicted
	  1	0
	1| 0 	 180 |
Actual	0| 0 	 4716 |


F1 Score:
0.0

Accuracy:
0.9632352941176471

Precision:
error

Recall:
0.0

AUC:
0.31651470172462637

****************************************************************


MultilayerPerceptron Test Set Metrics


CONFUSION MATRIX
	Predicted
	  1	0
	1| 0 	 91 |
Actual	0| 0 	 2040 |


F1 Score:
0.0

Accuracy:
0.9572970436414828

Precision:
error

Recall:
0.0

AUC:
0.38122710622710626

****************************************************************


In [72]:
mlp5_bestModel = mlp5_cvModel.bestModel
print("Best Metrics")
#print("Solver: ",mlp5_bestModel._java_obj.getSolver()) #errors out
print("Layers: ",mlp5_bestModel.layers)

Best Metrics
Layers:  [5, 5, 5, 2]


# Models without PCA

### Logistic Regression

In [73]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from spark_stratifier import StratifiedCrossValidator
from pyspark.ml.tuning import ParamGridBuilder

In [74]:
lrsc = LogisticRegression(labelCol = 'label',featuresCol = 'scaledFeatures')
lrsc_paramGrid = ParamGridBuilder().addGrid(lrsc.regParam,[1.0,0.1,0.01]).addGrid(lrsc.elasticNetParam,[0.0,0.5,1.0]).build()

In [75]:
lrsc_CrossVal = StratifiedCrossValidator(estimator = lrsc, estimatorParamMaps = lrsc_paramGrid,evaluator = MulticlassClassificationEvaluator(),numFolds = 10)
lrsc_cvModel = lrsc_CrossVal.fit(sc_train)

In [76]:
lrsc_train_predictions = lrsc_cvModel.transform(sc_train)
lrsc_test_predictions = lrsc_cvModel.transform(sc_test)

In [77]:
print("Logistic Regression Train Set Metrics")
printMulticlassMetrics(lrsc_train_predictions)
print("\n\nLogistic Regression Test Set Metrics")
printMulticlassMetrics(lrsc_test_predictions)

Logistic Regression Train Set Metrics


CONFUSION MATRIX
	Predicted
	  1	0
	1| 2 	 178 |
Actual	0| 0 	 4716 |


F1 Score:
0.02197802197802198

Accuracy:
0.9636437908496732

Precision:
1.0

Recall:
0.011111111111111112

AUC:
0.5861570540005612

****************************************************************


Logistic Regression Test Set Metrics


CONFUSION MATRIX
	Predicted
	  1	0
	1| 0 	 91 |
Actual	0| 1 	 2039 |


F1 Score:
0.0

Accuracy:
0.9568277803847959

Precision:
0.0

Recall:
0.0

AUC:
0.5878420599008836

****************************************************************


In [78]:
lrsc_bestModel = lrsc_cvModel.bestModel
print("Best Metrics")
print("Regularization: ",lrsc_bestModel._java_obj.getRegParam())
print("Elastic Net: ",lrsc_bestModel._java_obj.getElasticNetParam())

Best Metrics
Regularization:  0.1
Elastic Net:  0.0


In [100]:
lrsc_bestModel.coefficients

DenseVector([-0.0011, -0.0015, -0.0068, -0.0061, -0.0061, -0.0039, -0.0114, -0.0004, -0.0107, 0.0148, 0.0162, -0.0069, -0.0015, -0.0003, -0.0021, -0.0045, -0.0005, 0.0015, -0.0061, -0.005, 0.0223, -0.0136, -0.0012, -0.0017, -0.0015, 0.0229, -0.0265, -0.0058, -0.0026, -0.0201, 0.0414, 0.0009, -0.0075, -0.0552, -0.0038, -0.0065, -0.0013, -0.0017, -0.0198, -0.0065, -0.0024, -0.0232, -0.0138, -0.0041, 0.0047, -0.0228, 0.0197, -0.0113, 0.0025, 0.002, 0.0004, -0.0028, -0.0018, -0.0082, -0.0023, 0.0001, 0.0008, -0.0061, 0.0189, -0.0102, -0.0106, -0.058, 0.0018, -0.0144])

### Random Forest

In [80]:
from pyspark.ml.classification import RandomForestClassifier

In [81]:
rfsc = RandomForestClassifier(labelCol = 'label',featuresCol = 'scaledFeatures',numTrees = 200,seed = 1)
rfsc_paramGrid = ParamGridBuilder().addGrid(rfsc.impurity,['gini','entropy']).addGrid(rfsc.maxDepth,[2,5,10]).build()

In [82]:
rfsc_CrossVal = StratifiedCrossValidator(estimator = rfsc, estimatorParamMaps = rfsc_paramGrid,evaluator = MulticlassClassificationEvaluator(),numFolds = 10)
rfsc_cvModel = rfsc_CrossVal.fit(sc_train)

In [83]:
rfsc_train_predictions = rfsc_cvModel.transform(sc_train)
rfsc_test_predictions = rfsc_cvModel.transform(sc_test)

In [84]:
print("Random Forest Train Set Metrics")
printMulticlassMetrics(rfsc_train_predictions)
print("\n\nRandom Forest Test Set Metrics")
printMulticlassMetrics(rfsc_test_predictions)

Random Forest Train Set Metrics


CONFUSION MATRIX
	Predicted
	  1	0
	1| 127 	 53 |
Actual	0| 0 	 4716 |


F1 Score:
0.8273615635179153

Accuracy:
0.9891748366013072

Precision:
1.0

Recall:
0.7055555555555556

AUC:
0.9999858637263217

****************************************************************


Random Forest Test Set Metrics


CONFUSION MATRIX
	Predicted
	  1	0
	1| 25 	 66 |
Actual	0| 1 	 2039 |


F1 Score:
0.42735042735042733

Accuracy:
0.9685593618019709

Precision:
0.9615384615384616

Recall:
0.27472527472527475

AUC:
0.9063833225597934

****************************************************************


In [85]:
rfsc_bestModel = rfsc_cvModel.bestModel
print("Best Metrics")
print("Impurity: ",rfsc_bestModel._java_obj.getImpurity())
print("Max Depth: ",rfsc_bestModel._java_obj.getMaxDepth())

Best Metrics
Impurity:  gini
Max Depth:  10


In [86]:
rfsc_bestModel.featureImportances

SparseVector(64, {0: 0.0167, 1: 0.0138, 2: 0.0138, 3: 0.008, 4: 0.0303, 5: 0.006, 6: 0.0109, 7: 0.0104, 8: 0.0099, 9: 0.0098, 10: 0.0125, 11: 0.0098, 12: 0.0119, 13: 0.0106, 14: 0.0528, 15: 0.0152, 16: 0.0094, 17: 0.0204, 18: 0.0089, 19: 0.0097, 20: 0.0098, 21: 0.0122, 22: 0.0109, 23: 0.0121, 24: 0.0252, 25: 0.0404, 26: 0.0148, 27: 0.0125, 28: 0.0125, 29: 0.0117, 30: 0.0192, 31: 0.0126, 32: 0.0253, 33: 0.0259, 34: 0.0106, 35: 0.0139, 36: 0.0106, 37: 0.0103, 38: 0.0123, 39: 0.011, 40: 0.0165, 41: 0.0113, 42: 0.0117, 43: 0.1356, 44: 0.0099, 45: 0.0117, 46: 0.0074, 47: 0.0076, 48: 0.0118, 49: 0.012, 50: 0.0065, 51: 0.0093, 52: 0.0119, 53: 0.0191, 54: 0.0122, 55: 0.01, 56: 0.0104, 57: 0.0091, 58: 0.0096, 59: 0.0111, 60: 0.0078, 61: 0.0085, 62: 0.0133, 63: 0.0212})

### Gradient Boosted Tree

In [87]:
from pyspark.ml.classification import GBTClassifier

In [88]:
gbtsc = GBTClassifier(labelCol = 'label',featuresCol = 'scaledFeatures',maxIter = 100,seed = 1)
gbtsc_paramGrid = ParamGridBuilder().addGrid(gbtsc.maxDepth,[2,5,10]).build()

In [89]:
gbtsc_CrossVal = StratifiedCrossValidator(estimator = gbtsc, estimatorParamMaps = gbtsc_paramGrid,evaluator = MulticlassClassificationEvaluator(),numFolds = 10)
gbtsc_cvModel = gbtsc_CrossVal.fit(sc_train)

In [90]:
gbtsc_train_predictions = gbtsc_cvModel.transform(sc_train)
gbtsc_test_predictions = gbtsc_cvModel.transform(sc_test)

In [91]:
print("GradientBoostedTree Train Set Metrics")
printMulticlassMetrics(gbtsc_train_predictions)
print("\n\nGradientBoostedTree Test Set Metrics")
printMulticlassMetrics(gbtsc_test_predictions)

GradientBoostedTree Train Set Metrics


CONFUSION MATRIX
	Predicted
	  1	0
	1| 90 	 90 |
Actual	0| 5 	 4711 |


F1 Score:
0.6545454545454545

Accuracy:
0.9805964052287581

Precision:
0.9473684210526315

Recall:
0.5

AUC:
0.9528961690698261

****************************************************************


GradientBoostedTree Test Set Metrics


CONFUSION MATRIX
	Predicted
	  1	0
	1| 42 	 49 |
Actual	0| 1 	 2039 |


F1 Score:
0.6268656716417911

Accuracy:
0.97653683716565

Precision:
0.9767441860465116

Recall:
0.46153846153846156

AUC:
0.888100624865331

****************************************************************


In [92]:
gbtsc_bestModel = gbtsc_cvModel.bestModel
print("Best Metrics")
print("Max Depth: ",gbtsc_bestModel._java_obj.getMaxDepth())

Best Metrics
Max Depth:  2


In [93]:
gbtsc_bestModel.featureImportances

SparseVector(64, {1: 0.0099, 4: 0.023, 6: 0.0145, 7: 0.0156, 8: 0.0334, 9: 0.0147, 12: 0.0465, 13: 0.009, 14: 0.1628, 15: 0.067, 17: 0.0277, 21: 0.0052, 24: 0.0111, 25: 0.0954, 30: 0.0173, 31: 0.0007, 32: 0.0442, 33: 0.0261, 34: 0.0044, 37: 0.0048, 40: 0.0066, 42: 0.0086, 43: 0.1479, 45: 0.0106, 47: 0.0043, 48: 0.02, 51: 0.0042, 53: 0.0463, 55: 0.0347, 56: 0.0021, 58: 0.0251, 60: 0.0011, 62: 0.0257, 63: 0.0294})

### Multilayer Perceptron

In [94]:
from pyspark.ml.classification import MultilayerPerceptronClassifier

In [95]:
mlpsc = MultilayerPerceptronClassifier(labelCol = 'label',featuresCol = 'scaledFeatures',maxIter = 100,seed = 1)
mlpsc_paramGrid = ParamGridBuilder().addGrid(mlpsc.solver,['l-bfgs','gd']).addGrid(mlpsc.layers,[[64,5,2],[64,5,5,2],[64,5,10,2]]).build()
#mlpsc.layers should be in the following format: [# of input columns, size of hidden layer 1, ...size of hidden layer N..., number of classes]

In [96]:
mlpsc_CrossVal = StratifiedCrossValidator(estimator = mlpsc, estimatorParamMaps = mlpsc_paramGrid,evaluator = MulticlassClassificationEvaluator(),numFolds = 10)
mlpsc_cvModel = mlpsc_CrossVal.fit(sc_train)

In [97]:
mlpsc_train_predictions = mlpsc_cvModel.transform(sc_train)
mlpsc_test_predictions = mlpsc_cvModel.transform(sc_test)

In [98]:
print("MultilayerPerceptron Train Set Metrics")
printMulticlassMetrics(mlpsc_train_predictions)
print("\n\nMultilayerPerceptron Test Set Metrics")
printMulticlassMetrics(mlpsc_test_predictions)

MultilayerPerceptron Train Set Metrics


CONFUSION MATRIX
	Predicted
	  1	0
	1| 0 	 180 |
Actual	0| 0 	 4716 |


F1 Score:
0.0

Accuracy:
0.9632352941176471

Precision:
error

Recall:
0.0

AUC:
0.5441134200358114

****************************************************************


MultilayerPerceptron Test Set Metrics


CONFUSION MATRIX
	Predicted
	  1	0
	1| 0 	 91 |
Actual	0| 0 	 2040 |


F1 Score:
0.0

Accuracy:
0.9572970436414828

Precision:
error

Recall:
0.0

AUC:
0.5343191122602888

****************************************************************


In [99]:
mlpsc_bestModel = mlpsc_cvModel.bestModel
print("Best Metrics")
#print("Solver: ",mlpsc_bestModel._java_obj.getSolver()) #errors out
print("Layers: ",mlpsc_bestModel.layers)

Best Metrics
Layers:  [64, 5, 5, 2]


# Models without PCA or Scaling

### Logistic Regression

In [101]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from spark_stratifier import StratifiedCrossValidator
from pyspark.ml.tuning import ParamGridBuilder

In [102]:
# Logistic regression on the 'features' column (this section uses neither PCA
# nor scaling). The grid covers 3 regularisation strengths x 3 elastic-net
# mixing values.
lr_base = LogisticRegression(
    labelCol='label',
    featuresCol='features',
)
lr_base_paramGrid = (
    ParamGridBuilder()
    .addGrid(lr_base.regParam, [1.0, 0.1, 0.01])
    .addGrid(lr_base.elasticNetParam, [0.0, 0.5, 1.0])
    .build()
)

In [103]:
# 10-fold search over the logistic-regression grid using
# StratifiedCrossValidator, fitted on `train`. The evaluator is built without
# a metricName, so its default metric decides which candidate wins.
# NOTE(review): the saved output shows 180 positives vs 4716 negatives in the
# train split; confirm the default metric suits that imbalance.
lr_base_CrossVal = StratifiedCrossValidator(
    estimator=lr_base,
    estimatorParamMaps=lr_base_paramGrid,
    evaluator=MulticlassClassificationEvaluator(),
    numFolds=10,
)
lr_base_cvModel = lr_base_CrossVal.fit(train)

In [104]:
# Apply the cross-validated logistic regression to both splits (train, then test).
lrtrain_predictions, lrtest_predictions = (
    lr_base_cvModel.transform(train),
    lr_base_cvModel.transform(test),
)

In [105]:
# Report logistic regression on the train split, then on the test split.
# printMulticlassMetrics is defined earlier in the notebook; per the saved
# output it prints a confusion matrix plus F1, accuracy, precision, recall and AUC.
# The leading "\n\n" in the second heading separates the two reports.
# NOTE(review): in the saved run only 2 of 180 train positives and 0 of 91
# test positives are recovered, so accuracy alone is misleading here.
print("Logistic Regression Train Set Metrics")
printMulticlassMetrics(lrtrain_predictions)
print("\n\nLogistic Regression Test Set Metrics")
printMulticlassMetrics(lrtest_predictions)

Logistic Regression Train Set Metrics


CONFUSION MATRIX
	Predicted
	  1	0
	1| 2 	 178 |
Actual	0| 0 	 4716 |


F1 Score:
0.02197802197802198

Accuracy:
0.9636437908496732

Precision:
1.0

Recall:
0.011111111111111112

AUC:
0.5861570540005612

****************************************************************


Logistic Regression Test Set Metrics


CONFUSION MATRIX
	Predicted
	  1	0
	1| 0 	 91 |
Actual	0| 1 	 2039 |


F1 Score:
0.0

Accuracy:
0.9568277803847959

Precision:
0.0

Recall:
0.0

AUC:
0.5878420599008836

****************************************************************


In [106]:
# Keep a handle on the model the cross-validator selected and print the
# hyperparameter values it carries, read through the model's `_java_obj` handle.
lr_base_bestModel = lr_base_cvModel.bestModel
print("Best Metrics")
print("Regularization: ", lr_base_bestModel._java_obj.getRegParam())
print("Elastic Net: ", lr_base_bestModel._java_obj.getElasticNetParam())

Best Metrics
Regularization:  0.1
Elastic Net:  0.0


In [107]:
# Display the coefficients of the selected logistic-regression model as the
# cell's output (a DenseVector in the saved run).
lr_base_bestModel.coefficients

DenseVector([-0.0, -0.0, -0.0012, -0.0009, -0.0001, -0.0002, -0.0003, -0.0, -0.0003, 0.0, 0.0029, -0.0, -0.0, -0.0, -0.0003, -0.0, -0.0, 0.0, -0.0009, -0.0002, 0.0002, -0.0015, -0.0, -0.0, -0.0, 0.0013, -0.0, -0.0001, -0.0, -0.0032, 0.0114, 0.0001, -0.0017, -0.0816, -0.0, -0.0, -0.0, -0.0, -0.003, -0.0, -0.0003, -0.0042, -0.0009, -0.0, 0.0, -0.0007, 0.0031, -0.0005, 0.0, 0.0004, 0.0001, -0.0, -0.0, -0.0009, -0.0004, 0.0, 0.0, -0.0009, 0.003, -0.0, -0.0003, -0.0124, 0.0, -0.002])

### Random Forest

In [108]:
from pyspark.ml.classification import RandomForestClassifier

In [109]:
# Random forest of 200 trees on the 'features' column, seeded for repeatable
# tree construction. The grid covers 2 impurity measures x 3 maximum depths.
rf_base = RandomForestClassifier(
    labelCol='label',
    featuresCol='features',
    numTrees=200,
    seed=1,
)
rf_base_paramGrid = (
    ParamGridBuilder()
    .addGrid(rf_base.impurity, ['gini', 'entropy'])
    .addGrid(rf_base.maxDepth, [2, 5, 10])
    .build()
)

In [110]:
# 10-fold search over the random-forest grid using StratifiedCrossValidator,
# fitted on `train`. The evaluator is built without a metricName, so its
# default metric decides which candidate wins.
# NOTE(review): the saved output shows a heavily imbalanced label; confirm
# the default metric is the intended model-selection criterion.
rf_base_CrossVal = StratifiedCrossValidator(
    estimator=rf_base,
    estimatorParamMaps=rf_base_paramGrid,
    evaluator=MulticlassClassificationEvaluator(),
    numFolds=10,
)
rf_base_cvModel = rf_base_CrossVal.fit(train)

In [111]:
# Apply the cross-validated random forest to both splits (train, then test).
rftrain_predictions, rftest_predictions = (
    rf_base_cvModel.transform(train),
    rf_base_cvModel.transform(test),
)

In [112]:
# Report the random forest on the train split, then on the test split.
# printMulticlassMetrics is defined earlier in the notebook; per the saved
# output it prints a confusion matrix plus F1, accuracy, precision, recall and AUC.
# The leading "\n\n" in the second heading separates the two reports.
# NOTE(review): in the saved run recall drops from ~0.71 (train) to ~0.27
# (test) -- worth checking for overfitting.
print("Random Forest Train Set Metrics")
printMulticlassMetrics(rftrain_predictions)
print("\n\nRandom Forest Test Set Metrics")
printMulticlassMetrics(rftest_predictions)

Random Forest Train Set Metrics


CONFUSION MATRIX
	Predicted
	  1	0
	1| 127 	 53 |
Actual	0| 0 	 4716 |


F1 Score:
0.8273615635179153

Accuracy:
0.9891748366013072

Precision:
1.0

Recall:
0.7055555555555556

AUC:
0.9999858637263217

****************************************************************


Random Forest Test Set Metrics


CONFUSION MATRIX
	Predicted
	  1	0
	1| 25 	 66 |
Actual	0| 1 	 2039 |


F1 Score:
0.42735042735042733

Accuracy:
0.9685593618019709

Precision:
0.9615384615384616

Recall:
0.27472527472527475

AUC:
0.9063833225597934

****************************************************************


In [113]:
# Keep a handle on the model the cross-validator selected and print the
# hyperparameter values it carries, read through the model's `_java_obj` handle.
rf_base_bestModel = rf_base_cvModel.bestModel
print("Best Metrics")
print("Impurity: ", rf_base_bestModel._java_obj.getImpurity())
print("Max Depth: ", rf_base_bestModel._java_obj.getMaxDepth())

Best Metrics
Impurity:  gini
Max Depth:  10


In [114]:
# Display the feature importances of the selected random-forest model as the
# cell's output (a SparseVector of size 64 in the saved run, keyed by feature index).
rf_base_bestModel.featureImportances

SparseVector(64, {0: 0.0167, 1: 0.0138, 2: 0.0138, 3: 0.008, 4: 0.0303, 5: 0.006, 6: 0.0109, 7: 0.0104, 8: 0.0099, 9: 0.0098, 10: 0.0125, 11: 0.0098, 12: 0.0119, 13: 0.0106, 14: 0.0528, 15: 0.0152, 16: 0.0094, 17: 0.0204, 18: 0.0089, 19: 0.0097, 20: 0.0098, 21: 0.0122, 22: 0.0109, 23: 0.0121, 24: 0.0252, 25: 0.0404, 26: 0.0148, 27: 0.0125, 28: 0.0125, 29: 0.0117, 30: 0.0192, 31: 0.0126, 32: 0.0253, 33: 0.0259, 34: 0.0106, 35: 0.0139, 36: 0.0106, 37: 0.0103, 38: 0.0123, 39: 0.011, 40: 0.0165, 41: 0.0113, 42: 0.0117, 43: 0.1356, 44: 0.0099, 45: 0.0117, 46: 0.0074, 47: 0.0076, 48: 0.0118, 49: 0.012, 50: 0.0065, 51: 0.0093, 52: 0.0119, 53: 0.0191, 54: 0.0122, 55: 0.01, 56: 0.0104, 57: 0.0091, 58: 0.0096, 59: 0.0111, 60: 0.0078, 61: 0.0085, 62: 0.0133, 63: 0.0212})

### Gradient Boosted Tree

In [115]:
from pyspark.ml.classification import GBTClassifier

In [116]:
# Gradient-boosted trees on the 'features' column with maxIter=100 and a
# fixed seed. Only the maximum tree depth is tuned (3 candidates).
gbt_base = GBTClassifier(
    labelCol='label',
    featuresCol='features',
    maxIter=100,
    seed=1,
)
gbt_base_paramGrid = (
    ParamGridBuilder()
    .addGrid(gbt_base.maxDepth, [2, 5, 10])
    .build()
)

In [117]:
# 10-fold search over the GBT grid using StratifiedCrossValidator, fitted on
# `train`. The evaluator is built without a metricName, so its default metric
# decides which candidate wins.
# NOTE(review): the saved output shows a heavily imbalanced label; confirm
# the default metric is the intended model-selection criterion.
gbt_base_CrossVal = StratifiedCrossValidator(
    estimator=gbt_base,
    estimatorParamMaps=gbt_base_paramGrid,
    evaluator=MulticlassClassificationEvaluator(),
    numFolds=10,
)
gbt_base_cvModel = gbt_base_CrossVal.fit(train)

In [118]:
# Apply the cross-validated GBT model to both splits (train, then test).
gbttrain_predictions, gbttest_predictions = (
    gbt_base_cvModel.transform(train),
    gbt_base_cvModel.transform(test),
)

In [119]:
# Report the gradient-boosted trees on the train split, then on the test split.
# printMulticlassMetrics is defined earlier in the notebook; per the saved
# output it prints a confusion matrix plus F1, accuracy, precision, recall and AUC.
# The leading "\n\n" in the second heading separates the two reports.
# NOTE(review): in the saved run recall drops from ~0.96 (train) to ~0.49
# (test) -- worth checking for overfitting.
print("GradientBoostedTree Train Set Metrics")
printMulticlassMetrics(gbttrain_predictions)
print("\n\nGradientBoostedTree Test Set Metrics")
printMulticlassMetrics(gbttest_predictions)

GradientBoostedTree Train Set Metrics


CONFUSION MATRIX
	Predicted
	  1	0
	1| 173 	 7 |
Actual	0| 0 	 4716 |


F1 Score:
0.9801699716713881

Accuracy:
0.9985702614379085

Precision:
1.0

Recall:
0.9611111111111111

AUC:
0.999997643954387

****************************************************************


GradientBoostedTree Test Set Metrics


CONFUSION MATRIX
	Predicted
	  1	0
	1| 45 	 46 |
Actual	0| 9 	 2031 |


F1 Score:
0.6206896551724138

Accuracy:
0.9741905208822149

Precision:
0.8333333333333334

Recall:
0.4945054945054945

AUC:
0.9332390648567122

****************************************************************


In [120]:
# Keep a handle on the model the cross-validator selected and print the
# maximum depth it carries, read through the model's `_java_obj` handle.
gbt_base_bestModel = gbt_base_cvModel.bestModel
print("Best Metrics")
print("Max Depth: ", gbt_base_bestModel._java_obj.getMaxDepth())

Best Metrics
Max Depth:  5


In [121]:
# Display the feature importances of the selected GBT model as the cell's
# output (a SparseVector of size 64 in the saved run, keyed by feature index).
# Indices missing from the sparse output (18 and 57 in the saved run) have
# zero importance.
gbt_base_bestModel.featureImportances

SparseVector(64, {0: 0.0268, 1: 0.0151, 2: 0.0132, 3: 0.0139, 4: 0.0325, 5: 0.0199, 6: 0.0213, 7: 0.0129, 8: 0.0082, 9: 0.0151, 10: 0.012, 11: 0.0142, 12: 0.0143, 13: 0.0031, 14: 0.0334, 15: 0.0214, 16: 0.0068, 17: 0.0393, 19: 0.0083, 20: 0.0095, 21: 0.009, 22: 0.002, 23: 0.0106, 24: 0.0655, 25: 0.0755, 26: 0.0222, 27: 0.0074, 28: 0.011, 29: 0.0071, 30: 0.0168, 31: 0.0034, 32: 0.061, 33: 0.0318, 34: 0.0049, 35: 0.0075, 36: 0.0082, 37: 0.0037, 38: 0.0042, 39: 0.0099, 40: 0.0028, 41: 0.0043, 42: 0.0177, 43: 0.0606, 44: 0.006, 45: 0.0086, 46: 0.0065, 47: 0.0035, 48: 0.066, 49: 0.0101, 50: 0.01, 51: 0.0022, 52: 0.0105, 53: 0.0124, 54: 0.0066, 55: 0.0041, 56: 0.0157, 58: 0.004, 59: 0.0033, 60: 0.0035, 61: 0.0084, 62: 0.0114, 63: 0.0191})

### Multilayer Perceptron

In [122]:
from pyspark.ml.classification import MultilayerPerceptronClassifier

In [123]:
# Multilayer perceptron fitted on the unscaled 'features' column; the solver
# and the layer layout are left to the parameter grid (2 solvers x 3 layouts).
mlp_base = MultilayerPerceptronClassifier(
    labelCol='label',
    featuresCol='features',
    maxIter=100,
    seed=1,
)
# Each `layers` candidate must read:
# [# of input columns, size of hidden layer 1, ..., size of hidden layer N, number of classes]
mlp_base_paramGrid = (
    ParamGridBuilder()
    .addGrid(mlp_base.solver, ['l-bfgs', 'gd'])
    .addGrid(mlp_base.layers, [[64, 5, 2], [64, 5, 5, 2], [64, 5, 10, 2]])
    .build()
)

In [124]:
# 10-fold search over the MLP grid using StratifiedCrossValidator, fitted on
# `train`. The evaluator is built without a metricName, so its default metric
# decides which candidate wins.
# NOTE(review): the saved output shows a heavily imbalanced label; confirm
# the default metric is the intended model-selection criterion.
mlp_base_CrossVal = StratifiedCrossValidator(
    estimator=mlp_base,
    estimatorParamMaps=mlp_base_paramGrid,
    evaluator=MulticlassClassificationEvaluator(),
    numFolds=10,
)
mlp_base_cvModel = mlp_base_CrossVal.fit(train)

In [125]:
# Apply the cross-validated MLP to both unscaled splits (train, then test).
mlptrain_predictions, mlptest_predictions = (
    mlp_base_cvModel.transform(train),
    mlp_base_cvModel.transform(test),
)

In [126]:
# Report the unscaled-feature MLP on the train split, then on the test split.
# printMulticlassMetrics is defined earlier in the notebook; per the saved
# output it prints a confusion matrix plus F1, accuracy, precision, recall and AUC.
# The leading "\n\n" in the second heading separates the two reports.
# NOTE(review): in the saved run no row is predicted as class 1 on either
# split, which is presumably why precision is reported as "error".
print("MultilayerPerceptron Train Set Metrics")
printMulticlassMetrics(mlptrain_predictions)
print("\n\nMultilayerPerceptron Test Set Metrics")
printMulticlassMetrics(mlptest_predictions)

MultilayerPerceptron Train Set Metrics


CONFUSION MATRIX
	Predicted
	  1	0
	1| 0 	 180 |
Actual	0| 0 	 4716 |


F1 Score:
0.0

Accuracy:
0.9632352941176471

Precision:
error

Recall:
0.0

AUC:
0.5598724201300528

****************************************************************


MultilayerPerceptron Test Set Metrics


CONFUSION MATRIX
	Predicted
	  1	0
	1| 0 	 91 |
Actual	0| 0 	 2040 |


F1 Score:
0.0

Accuracy:
0.9572970436414828

Precision:
error

Recall:
0.0

AUC:
0.5325495582848524

****************************************************************


In [127]:
# Keep a handle on the model the cross-validator selected and report its
# layer layout. The solver is not printed: the original attempt,
# mlp_base_bestModel._java_obj.getSolver(), was noted to error out.
mlp_base_bestModel = mlp_base_cvModel.bestModel
print("Best Metrics")
print("Layers: ", mlp_base_bestModel.layers)

Best Metrics
Layers:  [64, 5, 5, 2]


# Results Table

### See the linked Excel Online sheet. Manually enter the results into the correct section so we can compare our results between models and between years.