In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from sklearn.metrics import f1_score, recall_score, classification_report

In [2]:
def init_spark():
    """Build the notebook's SparkSession and return it.

    The application name and the single config entry are applied to the
    builder before ``getOrCreate()`` is called.
    """
    builder = SparkSession.builder.appName("Python Spark SQL basic example")
    builder = builder.config("spark.some.config.option", "some-value")
    return builder.getOrCreate()


# Session handle used by the cells below.
spark_object = init_spark()

In [3]:
# Read the comma-delimited training CSV, treating the first row as the header
# and asking Spark to infer column types.
train_spark = (
    spark_object.read
    .option("delimiter", ",")
    .csv("./data/trainbalanced.csv", header='true', inferSchema='true')
)

In [4]:
# Remove every "." from the column names (presumably because dotted names need
# backtick quoting in Spark column expressions -- confirm against the dataset).
# toDF(*names) renames all columns positionally in ONE projection, instead of
# stacking a separate withColumnRenamed projection per dotted column; names
# without a dot pass through unchanged, so the resulting schema is the same.
train_spark = train_spark.toDF(
    *[column_name.replace(".", "") for column_name in train_spark.columns]
)

In [5]:
# 70 % / 30 % random split with a fixed seed.
split_weights = [0.7, 0.3]
train_data, test_data = train_spark.randomSplit(split_weights, seed=123)

In [6]:
# Feature columns: every column of the loaded frame except the 'isFraud' label.
input_list = [column for column in train_spark.columns if column != 'isFraud']

In [7]:
# Packs the feature columns into a single vector column named 'features'.
assembler = VectorAssembler(
    inputCols=input_list,
    outputCol='features',
)

In [8]:
# Apply the same assembler to both splits (train first, then test).
train_features, test_features = (
    assembler.transform(split) for split in (train_data, test_data)
)

In [9]:
def _labels_and_predictions(model, features):
    """Score ``features`` with ``model`` and return ``(y_true, y_pred)`` lists.

    The label and prediction columns are fetched with a single
    ``select(...).collect()``: each label stays paired with the prediction
    from its own row, and the model is applied once per split instead of once
    per collected column.

    Args:
        model: fitted model exposing ``transform``.
        features: DataFrame holding the 'features' and 'isFraud' columns.

    Returns:
        Two equally long Python lists: true labels and predicted labels.
    """
    rows = model.transform(features).select("isFraud", "prediction").collect()
    y_true = [row["isFraud"] for row in rows]
    # Spark ML writes the predicted class index as a double; cast it so the
    # report compares like with like (assumes 'isFraud' is an integer 0/1
    # column -- confirm against the inferred schema).
    y_pred = [int(row["prediction"]) for row in rows]
    return y_true, y_pred


# Fit one decision tree per depth, print train/test classification reports,
# and keep every fitted model in dt_models (same order as max_depths).
dt_models = []
max_depths = [5, 10, 20]
for md in max_depths:
    print("-"*25)
    print("Depth: " + str(md))
    dt = DecisionTreeClassifier(featuresCol='features', labelCol='isFraud', maxDepth=md)
    model = dt.fit(train_features)

    y_train_orig, y_train_pred = _labels_and_predictions(model, train_features)
    print('Training Report')
    print(classification_report(y_train_orig, y_train_pred))

    y_test_orig, y_test_pred = _labels_and_predictions(model, test_features)
    print('Test Report')
    print(classification_report(y_test_orig, y_test_pred))
    dt_models.append(model)

-------------------------
Depth: 5
Training Report
              precision    recall  f1-score   support

           0       0.92      0.94      0.93     93211
           1       0.94      0.92      0.93     93300

    accuracy                           0.93    186511
   macro avg       0.93      0.93      0.93    186511
weighted avg       0.93      0.93      0.93    186511

Test Report
              precision    recall  f1-score   support

           0       0.92      0.94      0.93     39704
           1       0.94      0.92      0.93     39615

    accuracy                           0.93     79319
   macro avg       0.93      0.93      0.93     79319
weighted avg       0.93      0.93      0.93     79319

-------------------------
Depth: 10
Training Report
              precision    recall  f1-score   support

           0       0.96      0.97      0.96     93211
           1       0.97      0.96      0.96     93300

    accuracy                           0.96    186511
   macro avg 

In [10]:
# try:
#     dt = DecisionTreeClassifier(featuresCol='features', labelCol='isFraud')
#     # paramGrid = ParamGridBuilder().build()
#     # cv = CrossValidator(estimator=dt, estimatorParamMaps=paramGrid, evaluator=BinaryClassificationEvaluator(), numFolds=5)
#     model = dt.fit(train_features)
# except Exception as e:
#     print(e)

                                                                                

In [16]:
# predictions = model.transform(test_features)
# # evaluator = BinaryClassificationEvaluator(labelCol='isFraud', rawPredictionCol='prediction', metricName='areaUnderROC')


In [19]:
# y_pred=predictions.select("prediction").collect()
# y_orig=predictions.select("isFraud").collect()

                                                                                

In [20]:
# f1_score(y_orig, y_pred)

0.9278894982734106

In [24]:
# recall_score(y_orig, y_pred)

0.9229681264838107

In [14]:
# roc_auc = evaluator.evaluate(predictions)
# print('RoC =', roc_auc)

                                                                                

RoC = 0.9283146850100981
