In [1]:
from pyspark.sql import SparkSession
import findspark
import pandas as pd
findspark.init()

# Spark session & context
spark = (SparkSession
         .builder
         .master("local")
         .appName("model-creation")
         # Add postgres jar
         .config("spark.driver.extraClassPath", "/home/jovyan/work/jars/postgresql-9.4.1207.jar")
         .getOrCreate())
sc = spark.sparkContext

22/01/07 05:46:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [2]:
df = spark.read.options(header='True',inferSchema='True',delimiter=',').csv('/home/jovyan/work/data/dataset_final')
df.show()

                                                                                

+----------+------+------+------------------+------------------+------------------+-------------------+-------+------------------+-------------------+--------+-------------------+--------------------+------------------+------------------+--------------------+------+------+------+-------+
|      Date| close|symbol|     14_period_RSI| 14_period_STOCH_K|               MFV|      14_period_ATR|    MOM|     14_period_MFI|                ROC|     OBV|      20_period_CCI|       14_period_EMV|          Williams|     14_period_ADX|      20_period_TRIX|pred_3|pred_5|pred_7|pred_10|
+----------+------+------+------------------+------------------+------------------+-------------------+-------+------------------+-------------------+--------+-------------------+--------------------+------------------+------------------+--------------------+------+------+------+-------+
|1987-08-28|6.4375|  ABMD| 59.41557540827517| 44.11764705882353|          292820.0|0.36607142857142855|   0.25| 66.97030346447272|  8

In [3]:
#symbol encoder
from pyspark.ml.feature import StringIndexer

indexer = StringIndexer(inputCol="symbol", outputCol="symbol_encode")
df_indexed = indexer.fit(df).transform(df)
df_indexed.select(["symbol","symbol_encode"]).distinct().show()
df_indexed = df_indexed.drop("symbol")

                                                                                

+------+-------------+
|symbol|symbol_encode|
+------+-------------+
|   DVA|        124.0|
|   DXC|         55.0|
|   BMY|         13.0|
|   ADP|         40.0|
|   COO|         72.0|
|   ECL|         24.0|
|  CTXS|        121.0|
|   CVX|          5.0|
|   AIG|         37.0|
|  AMAT|         48.0|
|   EFX|         58.0|
|   COP|         62.0|
|   EMR|         14.0|
|   FIS|        149.0|
|   AEE|        136.0|
|   ADM|         51.0|
|   ACN|        150.0|
|   ATO|         82.0|
|   MMM|          9.0|
|   BRO|        127.0|
+------+-------------+
only showing top 20 rows





In [16]:
from pyspark.ml.feature import VectorAssembler

feature_col = df_indexed.columns[2:15]
vec = VectorAssembler(inputCols=feature_col, outputCol="features")
vec_df = vec.transform(df_indexed)

In [17]:
vec_df = vec_df.withColumnRenamed("pred_3","label")

In [18]:
train, test = vec_df.randomSplit(weights=[0.8,0.2], seed=200)

In [19]:
from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier(featuresCol = 'features', labelCol = 'label', maxDepth = 10)
dtModel = dt.fit(train)
df2 = dtModel.transform(vec_df)

                                                                                

In [23]:
def ExtractFeatureImp(featureImp, dataset, featuresCol):
    list_extract = []
    for i in dataset.schema[featuresCol].metadata["ml_attr"]["attrs"]:
        list_extract = list_extract + dataset.schema[featuresCol].metadata["ml_attr"]["attrs"][i]
    varlist = pd.DataFrame(list_extract)
    varlist['score'] = varlist['idx'].apply(lambda x: featureImp[x])
    return(varlist.sort_values('score', ascending = False))

In [27]:
varlist = ExtractFeatureImp(dtModel.featureImportances, df2, "features").head(10)
varidx = [x for x in varlist['idx'][0:6]]


In [28]:
varidx

[6, 3, 12, 2, 10, 11]

In [32]:
from pyspark.ml.feature import VectorSlicer
slicer = VectorSlicer(inputCol="features", outputCol="features2", indices=varidx)
df3 = slicer.transform(df2)

In [37]:
from pyspark.ml.classification import DecisionTreeClassifier
df3 = df3.drop('rawPrediction', 'probability', 'prediction')
rf2 = DecisionTreeClassifier(labelCol="label", featuresCol="features2", seed = 8464,
                            maxDepth=10, cacheNodeIds = True)
dtModel2 = rf2.fit(df3)
df4 = dtModel.transform(df3)

Exception ignored in: <function JavaWrapper.__del__ at 0x7fdae03369d0>
Traceback (most recent call last):
  File "/usr/local/spark/python/pyspark/ml/wrapper.py", line 39, in __del__
    if SparkContext._active_spark_context and self._java_obj is not None:
AttributeError: 'DecisionTreeClassifier' object has no attribute '_java_obj'
                                                                                

In [38]:
df4

DataFrame[Date: string, close: double, 14_period_RSI: double, 14_period_STOCH_K: double, MFV: double, 14_period_ATR: double, MOM: double, 14_period_MFI: double, ROC: double, OBV: double, 20_period_CCI: double, 14_period_EMV: double, Williams: double, 14_period_ADX: double, 20_period_TRIX: double, label: int, pred_5: int, pred_7: int, pred_10: int, symbol_encode: double, features: vector, features2: vector, rawPrediction: vector, probability: vector, prediction: double]

In [39]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator=MulticlassClassificationEvaluator(predictionCol="prediction")
acc = evaluator.evaluate(df4)
 
print("Prediction Accuracy: ", acc)
 
#y_pred=predictions.select("prediction").collect()
#y_orig=predictions.select("label").collect()




Prediction Accuracy:  0.456172057374421


                                                                                

In [12]:
dtModel.featureImportances

SparseVector(13, {0: 0.0414, 1: 0.0554, 2: 0.0928, 3: 0.0961, 4: 0.0477, 5: 0.0588, 6: 0.0949, 7: 0.0672, 8: 0.0779, 9: 0.0704, 10: 0.1243, 11: 0.053, 12: 0.1201})

In [None]:
pred = dtModel.transform(test)
pred.show(3)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
cm = confusion_matrix(y_orig, y_pred)
print("Confusion Matrix:")
print(cm)

In [None]:
df.printSchema()