In [1]:
# Locate the local Spark installation and expose its Python bindings *before*
# pyspark is imported: findspark.init() works by extending sys.path, so calling
# it after `import pyspark` comes too late to help that import.
import findspark
findspark.init()

import pandas as pd
from pyspark.sql import SparkSession

# Spark session & context
spark = SparkSession.builder.getOrCreate()

In [2]:
# Re-acquire the session handle; getOrCreate() reuses a session that is
# already running instead of starting a second one.
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [108]:
# Load the comma-separated input file: the first row supplies the column
# names and Spark infers each column's type from the data.
df = (
    spark.read
    .option("header", 'True')
    .option("inferSchema", 'True')
    .option("delimiter", ',')
    .csv('./BigData/f3.csv')
)

# Preview the first five rows.
df.show(5)
# Disabled sanity check kept for reference:
#   df.agg({"close": "min"}).collect()

In [135]:
# Encode the ticker symbol and bucket the closing price.
from pyspark.ml.feature import StringIndexer, QuantileDiscretizer

# Fit a StringIndexer on `symbol` and write the resulting index to `symbol_encode`.
df_indexed = (
    StringIndexer(inputCol="symbol", outputCol="symbol_encode")
    .fit(df)
    .transform(df)
)

# Discretize `close` into quantile buckets (numBuckets=75) stored in `price_level`.
discretizer = QuantileDiscretizer(numBuckets=75, inputCol="close", outputCol="price_level")
df_indexed = discretizer.fit(df_indexed).transform(df_indexed)

# To inspect the symbol -> index mapping:
#   df_indexed.select(["symbol","symbol_encode"]).distinct().show()

# Remove the raw symbol/date columns and every prediction column listed here;
# `pred_5_4` is not in the list and therefore stays in the frame.
df_indexed = df_indexed.drop(
    "symbol",
    "Date",
    'pred_3',
    'pred_5',
    'pred_7',
    'pred_3_4',
    'pred_10',
    'pred_7_4',
    'pred_10_4',
)
df_indexed.show(5)

+-----------------+-----------------+-----------------+------------------+-------------------+------------------+-----------------+------------------+--------+------------------+-------------------+-------------------+-----------------+-------------------+--------+-------------+-----------+
|            close|    14_period_RSI|14_period_STOCH_K|               MFV|      14_period_ATR|               MOM|    14_period_MFI|               ROC|     OBV|     20_period_CCI|      14_period_EMV|           Williams|    14_period_ADX|     20_period_TRIX|pred_5_4|symbol_encode|price_level|
+-----------------+-----------------+-----------------+------------------+-------------------+------------------+-----------------+------------------+--------+------------------+-------------------+-------------------+-----------------+-------------------+--------+-------------+-----------+
| 5.94867562943439|70.97639263246325|66.68075841781726| 481284.9656916955|0.14197895158295562|0.3867884771316552|79.21465270

In [136]:
# Index the `pred_5_4` column into a numeric `label` column, then drop the
# original column so only the indexed form remains.
vec_df = (
    StringIndexer(inputCol="pred_5_4", outputCol="label")
    .fit(df_indexed)
    .transform(df_indexed)
    .drop("pred_5_4")
)
vec_df.show(5)

+-----------------+-----------------+-----------------+------------------+-------------------+------------------+-----------------+------------------+--------+------------------+-------------------+-------------------+-----------------+-------------------+-------------+-----------+-----+
|            close|    14_period_RSI|14_period_STOCH_K|               MFV|      14_period_ATR|               MOM|    14_period_MFI|               ROC|     OBV|     20_period_CCI|      14_period_EMV|           Williams|    14_period_ADX|     20_period_TRIX|symbol_encode|price_level|label|
+-----------------+-----------------+-----------------+------------------+-------------------+------------------+-----------------+------------------+--------+------------------+-------------------+-------------------+-----------------+-------------------+-------------+-----------+-----+
| 5.94867562943439|70.97639263246325|66.68075841781726| 481284.9656916955|0.14197895158295562|0.3867884771316552|79.21465270102593| 7

In [137]:
from pyspark.ml.feature import VectorAssembler

# Use every column except the target as a model input. Selecting by name,
# rather than with the positional slice [0:16], keeps `label` out of the
# feature vector even if columns are added, removed or reordered upstream.
# Column order is preserved, so the assembled vector layout is unchanged for
# the current 16-feature schema.
feature_col = [name for name in vec_df.columns if name != "label"]

# Pack the selected columns into a single vector column named `features`.
vec = VectorAssembler(inputCols=feature_col, outputCol="features")
final_df = vec.transform(vec_df)
final_df.show(5)

+-----------------+-----------------+-----------------+------------------+-------------------+------------------+-----------------+------------------+--------+------------------+-------------------+-------------------+-----------------+-------------------+-------------+-----------+-----+--------------------+
|            close|    14_period_RSI|14_period_STOCH_K|               MFV|      14_period_ATR|               MOM|    14_period_MFI|               ROC|     OBV|     20_period_CCI|      14_period_EMV|           Williams|    14_period_ADX|     20_period_TRIX|symbol_encode|price_level|label|            features|
+-----------------+-----------------+-----------------+------------------+-------------------+------------------+-----------------+------------------+--------+------------------+-------------------+-------------------+-----------------+-------------------+-------------+-----------+-----+--------------------+
| 5.94867562943439|70.97639263246325|66.68075841781726| 481284.9656916

In [138]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import  LinearSVC, RandomForestClassifier, DecisionTreeClassifier, LogisticRegression, NaiveBayes
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer

In [139]:
# Stage 1: fit a StringIndexer that maps `label` to `indexLabel`.
# It is fitted on the full frame (before the train/test split).
labelIndexer = (
    StringIndexer(inputCol="label", outputCol="indexLabel")
    .fit(final_df)
)
# To preview the indexed labels:
#   labelIndexer.transform(final_df).show(5)

In [140]:
# Stage 2: fit a VectorIndexer over the assembled `features` vector.
# It decides per feature whether to treat it as categorical: with
# maxCategories=4, features with more than 4 distinct values are treated as
# continuous, the rest are indexed as categories.
featureIndexer = VectorIndexer(
    inputCol="features",
    outputCol="indexFeatures",
    maxCategories=4,
).fit(final_df)

In [141]:
# Stage 3: candidate estimators.
# Nothing is trained in this cell: each line only configures an estimator that
# reads `indexFeatures` and predicts `indexLabel`. Fitting happens when a
# pipeline containing one of them is fitted.

# Decision tree with default settings.
dt = DecisionTreeClassifier(
    labelCol="indexLabel",
    featuresCol="indexFeatures",
)

# Elastic-net regularised logistic regression.
# NOTE(review): regParam=0.3 with elasticNetParam=0.8 is a strong, mostly-L1
# penalty; if the fitted model predicts a single class, check whether all
# coefficients were shrunk to zero.
lr = LogisticRegression(
    labelCol="indexLabel",
    featuresCol="indexFeatures",
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)

# Random forest with 10 trees.
rf = RandomForestClassifier(
    labelCol="indexLabel",
    featuresCol="indexFeatures",
    numTrees=10,
)

# Linear support vector classifier.
# NOTE(review): Spark documents LinearSVC as a binary classifier - confirm the
# number of label classes before putting this estimator in the pipeline.
lsvc = LinearSVC(
    labelCol="indexLabel",
    featuresCol="indexFeatures",
    maxIter=10,
    regParam=0.1,
)

# Multinomial naive Bayes.
# NOTE(review): the multinomial model expects non-negative feature values -
# verify that no input column can go negative before using it.
nb = NaiveBayes(
    labelCol="indexLabel",
    featuresCol="indexFeatures",
    smoothing=1.0,
    modelType="multinomial",
)

In [142]:
# Assemble the pipeline: label indexer -> feature indexer -> logistic regression.
# Swap the last stage for one of the other estimators to try a different model.
pipeline = Pipeline(
    stages=[
        labelIndexer,
        featureIndexer,
        lr,
    ]
)

In [143]:
# Randomly partition the rows into roughly 80% training / 20% test data.
# The fixed seed makes the split repeatable.
trainingData, testData = final_df.randomSplit([0.8, 0.2], seed=2000)

In [144]:
# Fit the pipeline on the training split; the classifier stage is trained here.
model = pipeline.fit(trainingData)

# Score the held-out test split with the fitted pipeline.
predictions = model.transform(testData)

# Show a handful of predictions next to the indexed label and feature vector.
predictions.select(["prediction", "indexLabel", "features"]).show(n=5)

+----------+----------+--------------------+
|prediction|indexLabel|            features|
+----------+----------+--------------------+
|       0.0|       2.0|[0.05664763008762...|
|       0.0|       2.0|[0.05717808714818...|
|       0.0|       1.0|[0.05742008230245...|
|       0.0|       0.0|[0.05833304418646...|
|       0.0|       2.0|[0.05983068078971...|
+----------+----------+--------------------+
only showing top 5 rows



In [145]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Compare `prediction` against `indexLabel` on the test predictions and report
# plain accuracy (the fraction of correct predictions, not the error rate).
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexLabel",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)
print(accuracy)

0.7556218274111676


In [146]:
# Per-class precision / recall / F1 for the test predictions.
from pyspark.mllib.evaluation import MulticlassMetrics
from sklearn.metrics import classification_report, confusion_matrix

# Bring labels and predictions to the driver with a single action so every
# true label stays paired with its own prediction. Two separate collect()
# calls would execute the whole job twice and rely on both runs returning rows
# in the same order.
label_prediction_rows = predictions.select("indexLabel", "prediction").collect()

# Unpack the Row objects into flat lists of values, the 1-D shape that the
# scikit-learn metric functions expect.
y_true = [row["indexLabel"] for row in label_prediction_rows]
y_pred = [row["prediction"] for row in label_prediction_rows]

print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

         0.0       0.76      1.00      0.86    238172
         1.0       0.00      0.00      0.00     42564
         2.0       0.00      0.00      0.00     34464

    accuracy                           0.76    315200
   macro avg       0.25      0.33      0.29    315200
weighted avg       0.57      0.76      0.65    315200

