In [2]:
from pyspark.sql import SparkSession

# Build (or reuse) a SparkSession on the "local" master for this notebook.
session_builder = SparkSession.builder.master("local").appName("raw-dataset")

# Put the PostgreSQL JDBC jar on the driver class path.
session_builder = session_builder.config(
    "spark.driver.extraClassPath",
    "/home/jovyan/work/jars/postgresql-9.4.1207.jar",
)

spark = session_builder.getOrCreate()
sc = spark.sparkContext

22/01/17 02:56:20 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
# Load the stock dataset: comma-separated, first row is the header,
# and column types are inferred from the data.
csv_reader = (spark.read
              .option("header", 'True')
              .option("inferSchema", 'True')
              .option("delimiter", ','))
df = csv_reader.csv('/home/jovyan/work/data/stock_data_final')

                                                                                

In [6]:
from pyspark.ml import Transformer
from pyspark.sql import DataFrame
from pyspark.sql.functions import col, udf
from pyspark.sql.types import DoubleType 


In [7]:
# Per-class weights to compensate for the imbalanced label distribution.
# Each class gets weight = total_rows / (3 * rows_in_class), so the three
# classes contribute equally to the weighted training loss.
from pyspark.sql.functions import col, lit, when

label = 'pred_5_5p'

# One grouped aggregation replaces four separate full scans of the data
# (three filtered counts plus a total count).
class_counts = {row[label]: row['count']
                for row in df.groupBy(label).count().collect()}

# Every group (including null / unexpected labels) is summed, so this equals df.count().
total = sum(class_counts.values())

up = class_counts.get('up', 0)
down = class_counts.get('down', 0)
sw_count = class_counts.get('sw', 0)

# Fail with a readable message instead of a bare ZeroDivisionError.
absent = [name for name, n in (('up', up), ('down', down), ('sw', sw_count)) if n == 0]
if absent:
    raise ValueError("No rows found for label value(s) %s in column %r" % (absent, label))

wup = total / (3 * up)
wdown = total / (3 * down)
wsw = total / (3 * sw_count)

# Native column expression instead of a Python UDF: same values, no per-row
# Python round-trip. Any label that is neither "up" nor "down" (including
# null) receives the "sw" weight, exactly as the original lambda did.
df = df.withColumn(
    "classWeightCol",
    when(col(label) == "up", lit(wup))
    .when(col(label) == "down", lit(wdown))
    .otherwise(lit(wsw)),
)

                                                                                

In [8]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import  DecisionTreeClassifier
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer, VectorAssembler, StandardScaler

# Column holding the target class to predict.
label = 'pred_5_5p'

# Indicator columns used as model inputs.
feature_cols = [
    '14_period_RSI', '14_period_STOCH_K', 'MFV', '14_period_ATR', 'MOM',
    '14_period_MFI', 'ROC', 'OBV', '20_period_CCI', '14_period_EMV',
    'Williams', '14_period_ADX', '20_period_TRIX',
]

# Stage 1: pack the indicator columns into a single vector column.
vec = VectorAssembler(inputCols=feature_cols, outputCol="NumFeatures")

# Stage 2: standardise the assembled vector (centre on the mean, scale by std).
standardscaler = StandardScaler(
    inputCol="NumFeatures",
    outputCol="features",
    withMean=True,
    withStd=True,
)

# Stage 3: index the label column into "indexLabel".
labelIndexer = StringIndexer(inputCol=label, outputCol="indexLabel")

# Stage 4: index the feature vector (maxCategories=4) into "indexFeatures".
featureIndexer = VectorIndexer(
    inputCol="features",
    outputCol="indexFeatures",
    maxCategories=4,
)

# Stage 5: decision tree trained on the indexed columns, weighted per row
# by "classWeightCol".
dt = DecisionTreeClassifier(
    labelCol="indexLabel",
    featuresCol="indexFeatures",
    weightCol='classWeightCol',
)

# Chain the stages, in order, into one pipeline.
dt_stages = [vec, standardscaler, labelIndexer, featureIndexer, dt]
pipeline_dt = Pipeline(stages=dt_stages)

In [9]:
# Train the model.
# 80/20 train/test split with a fixed seed.
split_weights = [0.8, 0.2]
trainingData, testData = df.randomSplit(split_weights, seed=2000)

# Fit every pipeline stage (assembler, scaler, indexers, tree) on the training split.
model_dt = pipeline_dt.fit(trainingData)

# Disabled alternatives (their pipelines are not defined in this notebook):
# model_lr = pipeline_lr.fit(trainingData)
# model_rf = pipeline_rf.fit(trainingData)

22/01/17 03:01:51 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

In [44]:
# Persist the fitted pipeline under its model name, overwriting any existing copy.
modelname = 'DT_5d_5p'
model_writer = model_dt.write().overwrite()
model_writer.save("/home/jovyan/work/notebooks/model/" + modelname)

                                                                                

In [47]:
from pyspark.ml import PipelineModel
# Reload the saved pipeline from disk and score the held-out split with it.
saved_model_dir = "/home/jovyan/work/notebooks/model/" + modelname
pipelineModel = PipelineModel.load(saved_model_dir)
test = pipelineModel.transform(testData)


In [48]:
# Show the first five scored rows produced by the reloaded model.
reloaded_preview_cols = ["prediction", "indexLabel", "features"]
test.select(*reloaded_preview_cols).show(n=5)

[Stage 134:>                                                        (0 + 1) / 1]

+----------+----------+--------------------+
|prediction|indexLabel|            features|
+----------+----------+--------------------+
|       0.0|       0.0|[1.72114180191364...|
|       0.0|       0.0|[1.65607442054934...|
|       0.0|       0.0|[1.12868033529626...|
|       0.0|       0.0|[1.06735827067081...|
|       0.0|       0.0|[-1.6232517526980...|
+----------+----------+--------------------+
only showing top 5 rows



                                                                                

In [10]:
# Score the held-out split with the in-memory fitted pipeline.
predictions_dt = model_dt.transform(testData)

# Disabled alternatives (their models are not fitted in this notebook):
# predictions_lr = model_lr.transform(testData)
# predictions_rf = model_rf.transform(testData)

# Display a handful of example rows.
predictions_preview = predictions_dt.select("prediction", "indexLabel", "features")
predictions_preview.show(n=5)

# predictions_lr.select("prediction", "indexLabel", "features").show(5)
# predictions_rf.select("prediction", "indexLabel", "features").show(5)

[Stage 29:>                                                         (0 + 1) / 1]

+----------+----------+--------------------+
|prediction|indexLabel|            features|
+----------+----------+--------------------+
|       0.0|       0.0|[1.72114180191364...|
|       0.0|       0.0|[1.65607442054934...|
|       0.0|       0.0|[1.12868033529626...|
|       0.0|       0.0|[1.06735827067081...|
|       0.0|       0.0|[-1.6232517526980...|
+----------+----------+--------------------+
only showing top 5 rows



                                                                                

In [11]:
# Test result
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Compare the predicted class index with the indexed true label and report accuracy.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    predictionCol="prediction",
    labelCol="indexLabel",
)
accuracy_dt = evaluator.evaluate(predictions_dt)

# Disabled alternatives (their predictions are not produced in this notebook):
# accuracy_lr = evaluator.evaluate(predictions_lr)
# accuracy_rf = evaluator.evaluate(predictions_rf)
# print('accuracy dt lr rf:', accuracy_dt, accuracy_lr, accuracy_rf)

print('accuracy dt :', accuracy_dt)



accuracy dt : 0.6741401338503045


                                                                                

In [12]:
#Visualize result
from pyspark.mllib.evaluation import MulticlassMetrics
from sklearn.metrics import classification_report, confusion_matrix

# Fetch label and prediction together in a single collect. Two independent
# collect() calls would run the whole pipeline twice and rely on both jobs
# returning rows in the same order to keep y_true[i] paired with y_pred[i].
label_pred_rows = predictions_dt.select("indexLabel", "prediction").collect()

# Flatten the Row objects into plain lists of class indices for scikit-learn.
y_true = [row["indexLabel"] for row in label_pred_rows]
y_pred = [row["prediction"] for row in label_pred_rows]

# Per-class precision/recall/F1, then the confusion matrix.
print(classification_report(y_true, y_pred))
print(confusion_matrix(y_true, y_pred))



                                                                                

              precision    recall  f1-score   support

         0.0       0.88      0.75      0.81    261947
         1.0       0.16      0.31      0.22     29790
         2.0       0.16      0.24      0.20     23690

    accuracy                           0.67    315427
   macro avg       0.40      0.44      0.41    315427
weighted avg       0.76      0.67      0.71    315427

[[197517  40930  23500]
 [ 14537   9330   5923]
 [ 11436   6459   5795]]
