In [0]:
import pyspark
import numpy as np
import pandas as pd

df = spark.read.format("csv").option("header", "true").load("dbfs:/FileStore/shared_uploads/runze.yu@ucalgary.ca/content.csv")


In [0]:
df.show()


+------------+---------+--------------------+
|issue_number|was_fixed|             content|
+------------+---------+--------------------+
|       18544|     True|powershel ssh ter...|
|       18540|     True|handl quotat mark...|
|       18501|    False|problem psnativec...|
|       18500|     True|question download...|
|       18497|     True|defin env var inl...|
|       18490|    False|writeerror prefix...|
|       18489|     True|semanticvers doe ...|
|       18485|     True|errorresponseexce...|
|       18478|     True|pleas remov refer...|
|       18476|     True|class regist erro...|
|       18463|     True|gethelp paramet d...|
|       18460|     True|invokepowershel s...|
|       18459|    False|sendmailmessag se...|
|       18436|     True|switchprocess bro...|
|       18433|     True|switchprocess use...|
|       18432|     True|switchprocess exe...|
|       18431|     True|disablewindowsopt...|
|       18424|     True|systemprivateuri ...|
|       18423|     True|windowstyl

In [0]:
df.groupBy("was_fixed").count().show()

+---------+-----+
|was_fixed|count|
+---------+-----+
|    False|  370|
|     True|  630|
+---------+-----+



In [0]:
import pyspark.ml.feature
from pyspark.ml.feature import Tokenizer as Tok, StopWordsRemover as SWR, CountVectorizer as CV, IDF, StringIndexer as SI

In [0]:
tokenizer = Tok(inputCol = "content", outputCol = "mytokens")

stop_rem = SWR(inputCol = "mytokens", outputCol = "filtered_tokens")

vectorizer = CV(inputCol = "filtered_tokens", outputCol = "rawFeatures")

idf = IDF(inputCol = "rawFeatures", outputCol = "vectorFeatures")


In [0]:
labelEncoder = SI(inputCol = 'was_fixed', outputCol = 'label').fit(df)

In [0]:
labelEncoder.transform(df).show()

+------------+---------+--------------------+-----+
|issue_number|was_fixed|             content|label|
+------------+---------+--------------------+-----+
|       18544|     True|powershel ssh ter...|  0.0|
|       18540|     True|handl quotat mark...|  0.0|
|       18501|    False|problem psnativec...|  1.0|
|       18500|     True|question download...|  0.0|
|       18497|     True|defin env var inl...|  0.0|
|       18490|    False|writeerror prefix...|  1.0|
|       18489|     True|semanticvers doe ...|  0.0|
|       18485|     True|errorresponseexce...|  0.0|
|       18478|     True|pleas remov refer...|  0.0|
|       18476|     True|class regist erro...|  0.0|
|       18463|     True|gethelp paramet d...|  0.0|
|       18460|     True|invokepowershel s...|  0.0|
|       18459|    False|sendmailmessag se...|  1.0|
|       18436|     True|switchprocess bro...|  0.0|
|       18433|     True|switchprocess use...|  0.0|
|       18432|     True|switchprocess exe...|  0.0|
|       1843

In [0]:
labelEncoder.labels

Out[241]: ['True', 'False']

In [0]:
df = labelEncoder.transform(df)

In [0]:
(trainDF, testDF) = df.randomSplit((0.7, 0.3), seed = 42)
trainDF.show(5)

+------------+---------+--------------------+-----+
|issue_number|was_fixed|             content|label|
+------------+---------+--------------------+-----+
|       15194|     True|question chang th...|  0.0|
|       15196|     True|datat type call g...|  0.0|
|       15201|     True|recurs option get...|  0.0|
|       15202|     True|run wget output p...|  0.0|
|       15205|     True|command line uac ...|  0.0|
+------------+---------+--------------------+-----+
only showing top 5 rows



In [0]:
from pyspark.ml.classification import LogisticRegression as LR, LinearSVC as SVC, NaiveBayes as NB

lr = LR(featuresCol = "vectorFeatures", labelCol = 'label')
svc = SVC(featuresCol = "vectorFeatures", labelCol = 'label')
nb = NB(featuresCol = "vectorFeatures", labelCol = 'label')

In [0]:
from pyspark.ml import Pipeline

pipeline_lr = Pipeline(stages = [tokenizer, stop_rem, vectorizer, idf, lr])
pipeline_nb = Pipeline(stages = [tokenizer, stop_rem, vectorizer, idf, nb])
pipeline_svc = Pipeline(stages = [tokenizer, stop_rem, vectorizer, idf, svc])

In [0]:
lr_model = pipeline_lr.fit(trainDF)
nb_model = pipeline_nb.fit(trainDF)
svc_model = pipeline_svc.fit(trainDF)

In [0]:
predictions_lr = lr_model.transform(testDF)
predictions_lr.show()

+------------+---------+--------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|issue_number|was_fixed|             content|label|            mytokens|     filtered_tokens|         rawFeatures|      vectorFeatures|       rawPrediction|         probability|prediction|
+------------+---------+--------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|       15200|    False|windows open powe...|  1.0|[windows, open, p...|[windows, open, p...|(10700,[0,2,4,5,9...|(10700,[0,2,4,5,9...|[-42.131045264098...|[5.04335483642045...|       1.0|
|       15206|    False|defer input pipel...|  1.0|[defer, input, pi...|[defer, input, pi...|(10700,[0,2,3,7,1...|(10700,[0,2,3,7,1...|[2.81474488353407...|[0.94346742850466...|       0.0|
|       15213|    False|distribut support...|  1.0|[dis

In [0]:
predictions_nb = nb_model.transform(testDF)
predictions_nb.show()

+------------+---------+--------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|issue_number|was_fixed|             content|label|            mytokens|     filtered_tokens|         rawFeatures|      vectorFeatures|       rawPrediction|         probability|prediction|
+------------+---------+--------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|       15200|    False|windows open powe...|  1.0|[windows, open, p...|[windows, open, p...|(10700,[0,2,4,5,9...|(10700,[0,2,4,5,9...|[-477.45632893044...|[3.81294528612891...|       1.0|
|       15206|    False|defer input pipel...|  1.0|[defer, input, pi...|[defer, input, pi...|(10700,[0,2,3,7,1...|(10700,[0,2,3,7,1...|[-2397.0383477486...|[1.0,2.5977677366...|       0.0|
|       15213|    False|distribut support...|  1.0|[dis

In [0]:
predictions_svc = svc_model.transform(testDF)
predictions_svc.show()

+------------+---------+--------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|issue_number|was_fixed|             content|label|            mytokens|     filtered_tokens|         rawFeatures|      vectorFeatures|       rawPrediction|prediction|
+------------+---------+--------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|       15200|    False|windows open powe...|  1.0|[windows, open, p...|[windows, open, p...|(10700,[0,2,4,5,9...|(10700,[0,2,4,5,9...|[-1.3072710266530...|       1.0|
|       15206|    False|defer input pipel...|  1.0|[defer, input, pi...|[defer, input, pi...|(10700,[0,2,3,7,1...|(10700,[0,2,3,7,1...|[0.20080689702196...|       0.0|
|       15213|    False|distribut support...|  1.0|[distribut, suppo...|[distribut, suppo...|(10700,[0,2,6,10,...|(10700,[0,2,6,10,...|[-3.2027016806537...|    

In [0]:
predictions_lr.select('rawPrediction', 'probability', 'was_fixed', 'label',  'prediction').show()

+--------------------+--------------------+---------+-----+----------+
|       rawPrediction|         probability|was_fixed|label|prediction|
+--------------------+--------------------+---------+-----+----------+
|[-42.131045264098...|[5.04335483642045...|    False|  1.0|       1.0|
|[2.81474488353407...|[0.94346742850466...|    False|  1.0|       0.0|
|[-114.96630320193...|[1.17697954332140...|    False|  1.0|       1.0|
|[-129.12183296521...|[8.37723020541807...|    False|  1.0|       1.0|
|[-43.734891293636...|[1.01432713679356...|    False|  1.0|       1.0|
|[-114.96630320193...|[1.17697954332140...|    False|  1.0|       1.0|
|[-40.284068781806...|[3.19779942029345...|     True|  0.0|       1.0|
|[-4.2615848868383...|[0.01390389384663...|    False|  1.0|       1.0|
|[1.33433701424125...|[0.79155712290776...|     True|  0.0|       0.0|
|[-0.5418645893369...|[0.36775393796723...|    False|  1.0|       1.0|
|[-29.518489143090...|[1.51454803214010...|    False|  1.0|       1.0|
|[-47.

In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator as MCE

eval = MCE(labelCol = 'label', predictionCol = 'prediction', metricName = 'accuracy')

In [0]:
from pyspark.mllib.evaluation import MulticlassMetrics as MM

lr_metric = MM(predictions_lr['label', 'prediction'].rdd)
nb_metric = MM(predictions_nb['label', 'prediction'].rdd)
svc_metric = MM(predictions_svc['label', 'prediction'].rdd)

print("Accuracy LR:", lr_metric.accuracy)
print("Precision LR:", lr_metric.precision(1.0))
print("Recall LR:", lr_metric.recall(1.0))
print("F1 Score LR:", lr_metric.fMeasure(1.0))

print(" ")

print("Accuracy NB:", nb_metric.accuracy)
print("Precision NB:", nb_metric.precision(1.0))
print("Recall NB:", nb_metric.recall(1.0))
print("F1 Score NB:", nb_metric.fMeasure(1.0))

print(" ")

print("Accuracy SVC:", svc_metric.accuracy)
print("Precision SVC:", svc_metric.precision(1.0))
print("Recall SVC:", svc_metric.recall(1.0))
print("F1 Score SVC:", svc_metric.fMeasure(1.0))

Accuracy LR: 0.57421875
Precision LR: 0.4639175257731959
Recall LR: 0.4411764705882353
F1 Score LR: 0.4522613065326633
 
Accuracy NB: 0.6171875
Precision NB: 0.5360824742268041
Recall NB: 0.49523809523809526
F1 Score NB: 0.5148514851485149
 
Accuracy SVC: 0.58984375
Precision SVC: 0.4536082474226804
Recall SVC: 0.4583333333333333
F1 Score SVC: 0.4559585492227979
