In [1]:
# findspark has to run before the first pyspark import: it locates the Spark
# installation and puts its Python bindings on sys.path, which is what makes
# `import pyspark` succeed on machines where pyspark is not pip-installed.
import findspark
findspark.init()

import pandas as pd
from pyspark.sql import SparkSession

# Spark session & context (reuses an already-running session if there is one)
spark = SparkSession.builder.getOrCreate()

In [2]:
# Read data: comma-separated CSV with a header row; column types are inferred by Spark.
df = (
    spark.read
    .option('header', 'True')
    .option('inferSchema', 'True')
    .option('delimiter', ',')
    .csv('./BigData/Dataset/finalData.csv')
)

In [3]:
from pyspark.ml import Transformer
from pyspark.sql import DataFrame
from pyspark.sql.functions import col, udf
from pyspark.sql.types import DoubleType 

# Stage 1: get ClassWeightCol
# Create Class Weight Column Transformer
class getClassWeightCol(Transformer):
    """Pipeline stage that appends a balanced class-weight column.

    For the label values 'up', 'down' and 'sw', each row receives
    ``total_rows / (3 * rows_in_that_class)`` in a new double column named
    ``classWeightCol``. Rows whose label is neither 'up' nor 'down' receive
    the 'sw' weight.

    Args:
        label: Name of the column holding the class label.

    Raises:
        ZeroDivisionError: from ``_transform`` when any of the three classes
            has no rows in the DataFrame being transformed.
    """

    def __init__(self, label):
        super(getClassWeightCol, self).__init__()
        self.label = label

    def _transform(self, df: DataFrame) -> DataFrame:
        """Return ``df`` with the ``classWeightCol`` column added."""
        # Use the column configured on this instance (not a notebook global),
        # and get every class count from a single aggregation pass.
        counts = {row[0]: row[1] for row in df.groupBy(self.label).count().collect()}
        # Groups cover every row (null labels form their own group), so the
        # sum of the group sizes is the total row count.
        total = sum(counts.values())

        wup = total / (3 * counts.get('up', 0))
        wdown = total / (3 * counts.get('down', 0))
        wsw = total / (3 * counts.get('sw', 0))

        calculateWeights = udf(
            lambda x: wup if x == "up" else (wdown if x == "down" else wsw),
            DoubleType(),
        )
        return df.withColumn("classWeightCol", calculateWeights(col(self.label)))

In [4]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import  DecisionTreeClassifier
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer, VectorAssembler, StandardScaler

# Choose the label column to predict
label = 'pred_5_5p'

# Stage 2: assemble the numeric indicator columns into one vector column
feature_columns = [
    'close', '14_period_RSI', '14_period_STOCH_K', 'MFV', '14_period_ATR', 'MOM',
    '14_period_MFI', 'ROC', 'OBV', '20_period_CCI', '14_period_EMV',
    'Williams', '14_period_ADX', '20_period_TRIX',
]
vec = VectorAssembler(inputCols=feature_columns, outputCol="NumFeatures")

# Stage 3: standardize the assembled vector (centered, unit standard deviation)
standardscaler = StandardScaler(
    inputCol="NumFeatures",
    outputCol="features",
    withMean=True,
    withStd=True,
)

# Stage 4: encode the string label as a numeric index
labelIndexer = StringIndexer(inputCol=label, outputCol="indexLabel")

# Stage 5: index the feature vector; features with at most 4 distinct values
# are treated as categorical
featureIndexer = VectorIndexer(
    inputCol="features",
    outputCol="indexFeatures",
    maxCategories=4,
)

# Stage 6: decision tree trained on the indexed columns, weighted per class
dt = DecisionTreeClassifier(
    labelCol="indexLabel",
    featuresCol="indexFeatures",
    weightCol='classWeightCol',
)

# Final pipeline: class weights -> assemble -> scale -> index label -> index features -> model
dt_stages = [getClassWeightCol(label), vec, standardscaler, labelIndexer, featureIndexer, dt]
pipeline_dt = Pipeline(stages=dt_stages)

In [5]:
# Train model
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
trainingData, testData = df.randomSplit([0.8, 0.2], seed=2000)

# Fitting the pipeline also fits the scaler and both indexers on the training rows.
model_dt = pipeline_dt.fit(trainingData)

# model_lr = pipeline_lr.fit(trainingData)

# model_rf = pipeline_rf.fit(trainingData)

In [6]:
# # Save model
# modelname = 'DT'
# model_dt.write().overwrite().save("./BigData/Model/" + modelname)

In [7]:
# Score the held-out rows with the fitted decision-tree pipeline.
predictions_dt = model_dt.transform(testData)

# predictions_lr = model_lr.transform(testData)

# predictions_rf = model_rf.transform(testData)

# Preview the first five predictions next to the indexed label and features.
preview_columns = ["prediction", "indexLabel", "features"]
predictions_dt.select(*preview_columns).show(5)

# predictions_lr.select("prediction", "indexLabel", "features").show(5)

# predictions_rf.select("prediction", "indexLabel", "features").show(5)

+----------+----------+--------------------+
|prediction|indexLabel|            features|
+----------+----------+--------------------+
|       0.0|       0.0|[-0.3740466994898...|
|       0.0|       0.0|[-0.3740595638688...|
|       0.0|       0.0|[-0.3741682004229...|
|       0.0|       0.0|[-0.3739600940591...|
|       0.0|       0.0|[-0.3741051149762...|
+----------+----------+--------------------+
only showing top 5 rows



In [8]:
# Test result
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Compare each prediction with the indexed true label and report accuracy
# on the test split.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="indexLabel",
    predictionCol="prediction",
)
accuracy_dt = evaluator.evaluate(predictions_dt)

# accuracy_lr = evaluator.evaluate(predictions_lr)

# accuracy_rf = evaluator.evaluate(predictions_rf)

# print('accuracy dt lr rf:', accuracy_dt, accuracy_lr, accuracy_rf)
print('accuracy dt :', accuracy_dt)

accuracy dt : 0.7163157694357725


In [9]:
#Visualize result
from pyspark.mllib.evaluation import MulticlassMetrics
from sklearn.metrics import classification_report, confusion_matrix

# Pull labels and predictions to the driver in ONE job. Collecting the two
# columns separately would run the whole pipeline twice and would rely on both
# jobs returning rows in the same order for y_true[i] to match y_pred[i].
# toPandas() also yields plain 1-D numeric columns instead of lists of
# single-field Row objects.
results_pd = predictions_dt.select('indexLabel', 'prediction').toPandas()
y_true = results_pd['indexLabel']
y_pred = results_pd['prediction']

print(classification_report(y_true, y_pred))
print(confusion_matrix(y_true, y_pred))

# #Visualize result
# from pyspark.mllib.evaluation import MulticlassMetrics
# y_true = predictions_lr.select(['indexLabel']).collect()
# y_pred = predictions_lr.select(['prediction']).collect()

# from sklearn.metrics import classification_report, confusion_matrix
# print(classification_report(y_true, y_pred))
# print(confusion_matrix(y_true, y_pred))

# #Visualize result
# from pyspark.mllib.evaluation import MulticlassMetrics
# y_true = predictions_rf.select(['indexLabel']).collect()
# y_pred = predictions_rf.select(['prediction']).collect()

# from sklearn.metrics import classification_report, confusion_matrix
# print(classification_report(y_true, y_pred))
# print(confusion_matrix(y_true, y_pred))

              precision    recall  f1-score   support

         0.0       0.88      0.81      0.84    261560
         1.0       0.18      0.27      0.22     29842
         2.0       0.18      0.23      0.20     23790

    accuracy                           0.72    315192
   macro avg       0.41      0.44      0.42    315192
weighted avg       0.76      0.72      0.74    315192

[[212334  29559  19667]
 [ 16390   7983   5469]
 [ 12654   5676   5460]]
