In [125]:
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.linalg import Vectors, VectorUDT

dataframe = spark.read.csv('wasb:///creditcard.csv', header=True, inferSchema=True)
dataframe.cache()

DataFrame[Time: decimal(10,0), V1: double, V2: double, V3: double, V4: double, V5: double, V6: double, V7: double, V8: double, V9: double, V10: double, V11: double, V12: double, V13: double, V14: double, V15: double, V16: double, V17: double, V18: double, V19: double, V20: double, V21: double, V22: double, V23: double, V24: double, V25: double, V26: double, V27: double, V28: double, Amount: double, Class: int]

In [43]:
#dataframe.show()

In [44]:
#dataframe.printSchema()

In [8]:
dataframe.groupBy('Class').count().show()
# this shows that the dataset is imbalanced 

+-----+------+
|Class| count|
+-----+------+
|    1|   492|
|    0|284315|
+-----+------+



In [87]:
from pyspark.sql.functions import udf, log, lit
# Data Transfromation on Amount with formula log(Amount+1)
dataframe = dataframe.select('*',(log(dataframe.Amount + 1)).alias('logAmount'))
amount = dataframe.select('logAmount')
amount.show()
amount.cache()


+------------------+
|         logAmount|
+------------------+
| 5.014760108673205|
|1.3056264580524357|
| 5.939276115362396|
| 4.824305715904762|
| 4.262539022051294|
| 1.541159071680806|
|1.7900914121273581|
|3.7328963395307104|
| 4.545420181582317|
|1.5432981099295553|
| 2.174751721484161|
|  2.39698576841553|
| 4.808111029984782|
| 3.349904087274605|
| 4.091005660956586|
|2.8326249356838407|
|  2.63834278867739|
|0.6365768290715511|
|  3.86702563949741|
| 1.791759469228055|
+------------------+
only showing top 20 rows

DataFrame[logAmount: double]

In [88]:
slen = udf(lambda vs: Vectors.dense(vs), VectorUDT())
# amountNormalized.select('*', slen(amountNormalized['Amount']).alias('slen')).show()
dataframe_with = dataframe.withColumn('slen', slen(amount['logAmount']))
dataframe_with.cache()
# dataframe_with.show()
mmScaler = MinMaxScaler(inputCol="slen", outputCol="scaled")
model = mmScaler.fit(dataframe_with)
# print mmScaler.explainParams()
normalized_df = model.transform(dataframe_with)
normalized_df.show()

+----+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------+-----+------------------+--------------------+--------------------+
|Time|          V1|          V2|          V3|          V4|          V5|          V6|          V7|          V8|          V9|         V10|         V11|         V12|         V13|         V14|         V15|         V16|         V17|         V18|         V19|         V20|         V21|         V22|         V23|         V24|         V25|         V26|         V27|         V28|Amount|Class|         logAmount|                slen|              scaled|
+----+------------+------------+------------+------------+------------+------------+------------+------------+

In [95]:
# modelling

### Logistic Regression
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
# training data
keep_list = ['Class','Time','V1','V2','V3','V4','V5','V6','V7','V8','V9','V10','V11','V12','V13','V14','V15','V16','V17','V17','V18','V19','V20','V21','V22','V23','V24','V25','V26','V27','V28','scaled']
feature_list = ['Time','V1','V2','V3','V4','V5','V6','V7','V8','V9','V10','V11','V12','V13','V14','V15','V16','V17','V17','V18','V19','V20','V21','V22','V23','V24','V25','V26','V27','V28','scaled']
df_train = normalized_df.select(*keep_list).withColumnRenamed('Class', 'label')

combineFeature = VectorAssembler(inputCols = feature_list,
                                 outputCol = "features")

data = combineFeature.transform(df_train)

lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

# Fit the model
lrModel = lr.fit(data)

# Print the coefficients and intercept for logistic regression
print("Coefficients: " + str(lrModel.coefficients))
print("Intercept: " + str(lrModel.intercept))

Coefficients: (31,[],[])
Intercept: -6.35935934092

In [93]:
data.select('features').show()

+--------------------+
|            features|
+--------------------+
|[0.0,-1.359807134...|
|[0.0,1.191857111,...|
|[1.0,-1.358354062...|
|[1.0,-0.966271712...|
|[2.0,-1.158233093...|
|[2.0,-0.425965884...|
|[4.0,1.229657635,...|
|[7.0,-0.644269442...|
|[7.0,-0.894286082...|
|[9.0,-0.338261752...|
|[10.0,1.449043781...|
|[10.0,0.384978215...|
|[10.0,1.249998742...|
|[11.0,1.069373588...|
|[12.0,-2.79185476...|
|[12.0,-0.75241704...|
|[12.0,1.103215435...|
|[13.0,-0.43690507...|
|[14.0,-5.40125766...|
|[15.0,1.492935977...|
+--------------------+
only showing top 20 rows

In [121]:
### Random forest classifier
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data)
# Automatically identify categorical features, and index them.
# Set maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer =\
    VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data)

# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# Train a RandomForest model.
rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")

# Chain indexers and forest in a Pipeline
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf])

# Train model.  This also runs the indexers.
model = pipeline.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)

# Select example rows to display.
predictions.select("prediction", "indexedLabel", "features").show(5)



+----------+------------+--------------------+
|prediction|indexedLabel|            features|
+----------+------------+--------------------+
|       0.0|         0.0|[1.0,-1.358354062...|
|       0.0|         0.0|[1.0,-0.966271712...|
|       0.0|         0.0|[2.0,-1.158233093...|
|       0.0|         0.0|[4.0,1.229657635,...|
|       0.0|         0.0|[10.0,0.384978215...|
+----------+------------+--------------------+
only showing top 5 rows

In [122]:
# Cross Validation
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

paramGrid = ParamGridBuilder() \
    .addGrid(lr.regParam, [0.1, 0.01]) \
    .build()
    
evaluator = BinaryClassificationEvaluator()
evaluator.setLabelCol("indexedLabel")
evaluator.setRawPredictionCol("prediction")

crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=3)  

# Run cross-validation, and choose the best set of parameters.
cvModel = crossval.fit(trainingData)

predictionsDf = cvModel.transform(testData)

numSuccesses = predictionsDf.where('label == prediction').count()
numInspections = predictionsDf.count()

print ("There were %d inspections and there were %d successful predictions" % (numInspections, numSuccesses))
print("This is a %f%% success rate" % (float(numSuccesses) / float(numInspections) * 100))

There were 85340 inspections and there were 85287 successful predictions
This is a 99.937895% success rate
Exception AttributeError: "'BinaryClassificationEvaluator' object has no attribute '_java_obj'" in <object repr() failed> ignored

In [123]:
### Gradient-boosted tree classifier
from pyspark.ml import Pipeline
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data)
# Automatically identify categorical features, and index them.
# Set maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer =\
    VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data)

# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# Train a GBT model.
gbt = GBTClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", maxIter=10)

# Chain indexers and GBT in a Pipeline
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, gbt])

# Train model.  This also runs the indexers.
model = pipeline.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)

# Select example rows to display.
predictions.select("prediction", "indexedLabel", "features").show(5)

# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))

gbtModel = model.stages[2]
print(gbtModel)  # summary only


+----------+------------+--------------------+
|prediction|indexedLabel|            features|
+----------+------------+--------------------+
|       0.0|         0.0|[0.0,-1.359807134...|
|       0.0|         0.0|[0.0,1.191857111,...|
|       0.0|         0.0|[1.0,-0.966271712...|
|       0.0|         0.0|[4.0,1.229657635,...|
|       0.0|         0.0|[7.0,-0.894286082...|
+----------+------------+--------------------+
only showing top 5 rows

Test Error = 0.000679188
GBTClassificationModel (uid=GBTClassifier_48be906c2c939c1e4b78) with 10 trees

In [124]:
# Cross Validation
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

paramGrid = ParamGridBuilder() \
    .addGrid(lr.regParam, [0.1, 0.01]) \
    .build()
    
evaluator = BinaryClassificationEvaluator()
evaluator.setLabelCol("indexedLabel")
evaluator.setRawPredictionCol("prediction")

crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=3)  

# Run cross-validation, and choose the best set of parameters.
cvModel = crossval.fit(trainingData)

predictionsDf = cvModel.transform(testData)

numSuccesses = predictionsDf.where('label == prediction').count()
numInspections = predictionsDf.count()

print ("There were %d inspections and there were %d successful predictions" % (numInspections, numSuccesses))
print("This is a %f%% success rate" % (float(numSuccesses) / float(numInspections) * 100))

There were 84948 inspections and there were 84893 successful predictions
This is a 99.935255% success rate