In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from sklearn.metrics import f1_score, recall_score, classification_report

In [2]:
def init_spark():
    """Build (or reuse) the SparkSession used throughout this notebook.

    Returns:
        The session produced by ``getOrCreate()`` on a builder configured
        with the application name and the single config option below.
    """
    builder = SparkSession.builder.appName("Python Spark SQL basic example")
    builder = builder.config("spark.some.config.option", "some-value")
    return builder.getOrCreate()


spark_object = init_spark()

In [3]:
# Read the training CSV: comma-delimited, first row is the header,
# column types inferred by Spark.
train_spark = (
    spark_object.read
    .option("delimiter", ",")
    .csv("./data/trainbalanced.csv", header='true', inferSchema='true')
)

In [4]:
# Drop every "." from column names (e.g. "a.b" -> "ab").
# NOTE(review): presumably because Spark parses "." in a column reference as
# nested-field access -- confirm this is the motivation.
for column_name in train_spark.columns:
    if "." not in column_name:
        continue
    train_spark = train_spark.withColumnRenamed(
        column_name, column_name.replace(".", "")
    )

In [5]:
# 70/30 random split; the fixed seed keeps the split repeatable.
train_data, test_data = train_spark.randomSplit(weights=[0.7, 0.3], seed=123)

In [6]:
# Feature columns: every column except the label column 'isFraud'.
input_list = [name for name in train_spark.columns if name != 'isFraud']

In [7]:
# Packs the columns in input_list into a single vector column named 'features'.
assembler = VectorAssembler(
    inputCols=input_list,
    outputCol='features',
)

In [8]:
# Add the assembled 'features' column to both splits.
train_features, test_features = (
    assembler.transform(split) for split in (train_data, test_data)
)

In [9]:
# Hyperparameter grid for the random-forest sweep below: one model is trained
# for every (maxDepth, numTrees) combination.
# Candidate values for RandomForestClassifier(maxDepth=...).
maxDepths = [5, 10, 15]
# Candidate values for RandomForestClassifier(numTrees=...).
numTrees = [20, 25, 30]

In [10]:
def _print_classification_report(model, features, title):
    """Print a scikit-learn classification report for ``model`` on ``features``.

    Args:
        model: A fitted Spark ML model whose ``transform`` adds a
            'prediction' column.
        features: DataFrame holding the assembled 'features' column and the
            'isFraud' label column.
        title: Heading printed above the report.
    """
    # Collect label and prediction together in ONE action: each row keeps its
    # own (label, prediction) pair, and the transform is evaluated once rather
    # than once per column.
    rows = model.transform(features).select("isFraud", "prediction").collect()
    # Flatten the Row objects to plain 1-D lists so sklearn gets 1-D targets.
    y_orig = [row["isFraud"] for row in rows]
    # Spark emits predictions as doubles (0.0 / 1.0); cast to int class labels.
    y_pred = [int(row["prediction"]) for row in rows]
    print(title)
    print(classification_report(y_orig, y_pred))


# Train one random forest per (maxDepth, numTrees) combination and report
# train/test metrics for each. Fitted models are kept in rf_models in grid order.
rf_models = []
for md in maxDepths:
    for nt in numTrees:
        print("-"*25)
        print("Depth: " + str(md))
        print("numTrees: " + str(nt))
        # Fixed seed so a re-run of the sweep reproduces the same forests.
        rf = RandomForestClassifier(
            featuresCol='features',
            labelCol='isFraud',
            maxDepth=md,
            numTrees=nt,
            seed=123,
        )
        model = rf.fit(train_features)
        # Store the model before evaluating so it is not lost if a report fails.
        rf_models.append(model)

        _print_classification_report(model, train_features, 'Training Report')
        _print_classification_report(model, test_features, 'Test Report')


-------------------------
Depth: 5
numTrees: 20
Training Report
              precision    recall  f1-score   support

           0       0.91      0.93      0.92     93211
           1       0.92      0.91      0.92     93300

    accuracy                           0.92    186511
   macro avg       0.92      0.92      0.92    186511
weighted avg       0.92      0.92      0.92    186511

Test Report
              precision    recall  f1-score   support

           0       0.91      0.93      0.92     39704
           1       0.93      0.91      0.92     39615

    accuracy                           0.92     79319
   macro avg       0.92      0.92      0.92     79319
weighted avg       0.92      0.92      0.92     79319

-------------------------
Depth: 5
numTrees: 25
Training Report
              precision    recall  f1-score   support

           0       0.92      0.93      0.92     93211
           1       0.93      0.92      0.92     93300

    accuracy                           0.9

ConnectionRefusedError: [WinError 10061] No connection could be made because the target machine actively refused it

In [None]:
# try:
#     dt = RandomForestClassifier(featuresCol='features', labelCol='isFraud')
#     model = dt.fit(train_features)
# except Exception as e:
#     print(e)

In [None]:
# predictions = model.transform(test_features)
# # evaluator = BinaryClassificationEvaluator(labelCol='isFraud', rawPredictionCol='prediction', metricName='areaUnderROC')


In [None]:
# y_pred=predictions.select("prediction").collect()
# y_orig=predictions.select("isFraud").collect()

In [None]:
# print(classification_report(y_orig, y_pred))

In [None]:
# print(f1_score(y_orig, y_pred))
# print(recall_score(y_orig, y_pred))

In [None]:
# roc_auc = evaluator.evaluate(predictions)
# print('RoC =', roc_auc)

In [None]:
# test_spark = spark_object.read.option("delimiter", ",").csv("./data/testbalanced.csv", header='true', inferSchema='true')
# num_rows = test_spark.count()
# num_cols = len(test_spark.columns)
# print("Shape of DataFrame: ({}, {})".format(num_rows, num_cols))

# for i in test_spark.columns:
#     oldname = i
#     if "." in oldname:
#         newname = oldname.replace(".","")
#         test_spark = test_spark.withColumnRenamed(oldname, newname)

In [None]:
# assembler = VectorAssembler(inputCols=input_list,outputCol='features')
# test_data = assembler.transform(test_spark)

In [None]:
# predictions = model.transform(test_data)