In [2]:
#importing pyspark
import findspark
findspark.init('/home/ubuntu/spark-2.1.1-bin-hadoop2.7')
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Crime-In-NYC-Data-Mining').getOrCreate()

In [3]:
#importing cleaned data
df = spark.read.csv('Datasets/good-nyc-park-crime-2016.csv',header=True,inferSchema=True)
df.show()

+---+--------------------+---------------+-------+------------------+------+----+-------+--------------+--------+-------------+------------------------------+-----+-------------+--------------+
|_c0|                PARK|        BOROUGH|QUARTER|          CATEGORY|MURDER|RAPE|ROBBERY|FELONY ASSAULT|BURGLARY|GRAND LARCENY|GRAND LARCENY OF MOTOR VEHICLE|TOTAL|CRIME OCCURED|VICIOUS CRIMES|
+---+--------------------+---------------+-------+------------------+------+----+-------+--------------+--------+-------------+------------------------------+-----+-------------+--------------+
|  0|     PELHAM BAY PARK|          BRONX|      1|ONE ACRE OR LARGER|   0.0| 1.0|    1.0|           0.0|     0.0|          0.0|                           0.0|    2|            1|           1.0|
|  1|  VAN CORTLANDT PARK|          BRONX|      1|ONE ACRE OR LARGER|   0.0| 0.0|    2.0|           1.0|     0.0|          0.0|                           0.0|    3|            1|           0.0|
|  2|ROCKAWAY BEACH AN...|    

In [4]:
df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- PARK: string (nullable = true)
 |-- BOROUGH: string (nullable = true)
 |-- QUARTER: integer (nullable = true)
 |-- CATEGORY: string (nullable = true)
 |-- MURDER: double (nullable = true)
 |-- RAPE: double (nullable = true)
 |-- ROBBERY: double (nullable = true)
 |-- FELONY ASSAULT: double (nullable = true)
 |-- BURGLARY: double (nullable = true)
 |-- GRAND LARCENY: double (nullable = true)
 |-- GRAND LARCENY OF MOTOR VEHICLE: double (nullable = true)
 |-- TOTAL: integer (nullable = true)
 |-- CRIME OCCURED: integer (nullable = true)
 |-- VICIOUS CRIMES: double (nullable = true)



In [5]:
#importing essential packets for data mining
from pyspark.sql import *
from pyspark.ml.feature import (VectorAssembler,VectorIndexer,OneHotEncoder,StringIndexer)

In [6]:
from pyspark.ml.feature import (VectorAssembler,VectorIndexer,OneHotEncoder,StringIndexer)

In [7]:

murder_indexer = StringIndexer(inputCol='MURDER',outputCol='murder_re')
rape_indexer = StringIndexer(inputCol='RAPE',outputCol='rape_re')
vicious_crimes_indexer = StringIndexer(inputCol='VICIOUS CRIMES',outputCol='label')


murder_encoder = OneHotEncoder(inputCol='murder_re',outputCol='murder_vec')
rape_encoder = OneHotEncoder(inputCol='rape_re',outputCol='rape_vec')
label_encoder = StringIndexer(inputCol='label',outputCol='label')

assembler = VectorAssembler(inputCols=['murder_vec','rape_vec'], outputCol="features")

In [8]:
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=[murder_indexer,rape_indexer,vicious_crimes_indexer,murder_encoder,rape_encoder, assembler])

In [9]:
pipeline_model = pipeline.fit(df)
pipe_df = pipeline_model.transform(df)
pipe_df = pipe_df.select('label','features')
pipe_df.describe()

DataFrame[summary: string, label: string]

In [10]:
train_data, test_data = pipe_df.randomSplit([0.6,0.4])
print("Training Dataset Count: " + str(train_data.count()))
print("Test Dataset Count: " + str(test_data.count()))

Training Dataset Count: 2773
Test Dataset Count: 1843


In [11]:
from pyspark.ml.regression import *

In [12]:
# Load training data

print("Linear Regression")

lr = LinearRegression(featuresCol = 'features', labelCol='label',maxIter=10, regParam=0.3, elasticNetParam=0.8)

# Fit the model
lrModel = lr.fit(train_data)

# Print the coefficients and intercept for linear regression
print("Coefficients: %s" % str(lrModel.coefficients))
print("Intercept: %s" % str(lrModel.intercept))

# Summarize the model over the training set and print out some metrics
trainingSummary = lrModel.summary
print("numIterations: %d" % trainingSummary.totalIterations)
print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))
trainingSummary.residuals.show()
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)


Linear Regression
Coefficients: [0.0,0.0,0.0]
Intercept: 0.006491164803461955
numIterations: 1
objectiveHistory: [0.5]
+--------------------+
|           residuals|
+--------------------+
|-0.00649116480346...|
|-0.00649116480346...|
|-0.00649116480346...|
|-0.00649116480346...|
|-0.00649116480346...|
|-0.00649116480346...|
|-0.00649116480346...|
|-0.00649116480346...|
|-0.00649116480346...|
|-0.00649116480346...|
|-0.00649116480346...|
|-0.00649116480346...|
|-0.00649116480346...|
|-0.00649116480346...|
|-0.00649116480346...|
|-0.00649116480346...|
|-0.00649116480346...|
|-0.00649116480346...|
|-0.00649116480346...|
|-0.00649116480346...|
+--------------------+
only showing top 20 rows

RMSE: 0.080306
r2: -0.000000


In [13]:
lr_predictions = lrModel.transform(test_data)
lr_predictions.select("prediction","label","features").show(5)
from pyspark.ml.evaluation import RegressionEvaluator
lr_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="label",metricName="r2")
print("R Squared (R2) on test data = %g" % lr_evaluator.evaluate(lr_predictions))

+--------------------+-----+-------------+
|          prediction|label|     features|
+--------------------+-----+-------------+
|0.006491164803461955|  0.0|[1.0,1.0,0.0]|
|0.006491164803461955|  0.0|[1.0,1.0,0.0]|
|0.006491164803461955|  0.0|[1.0,1.0,0.0]|
|0.006491164803461955|  0.0|[1.0,1.0,0.0]|
|0.006491164803461955|  0.0|[1.0,1.0,0.0]|
+--------------------+-----+-------------+
only showing top 5 rows

R Squared (R2) on test data = -5.27324e-08


In [14]:
#Decision Tree Regression

print("Decision Tree Regression")

dec = DecisionTreeRegressor(featuresCol = 'features', labelCol='label', maxBins=32, maxDepth=5)

# Fit the model
decModel = dec.fit(train_data)

predictions = decModel.transform(test_data)

# Select example rows to display.
predictions.select("prediction", "label", "features").show(5)

# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)


Decision Tree Regression
+----------+-----+-------------+
|prediction|label|     features|
+----------+-----+-------------+
|       0.0|  0.0|[1.0,1.0,0.0]|
|       0.0|  0.0|[1.0,1.0,0.0]|
|       0.0|  0.0|[1.0,1.0,0.0]|
|       0.0|  0.0|[1.0,1.0,0.0]|
|       0.0|  0.0|[1.0,1.0,0.0]|
+----------+-----+-------------+
only showing top 5 rows

Root Mean Squared Error (RMSE) on test data = 0.0232936


In [15]:
#Random Forest Regression
print("Random Forest Regression")

dec = RandomForestRegressor(featuresCol = 'features', labelCol='label', maxBins=32, maxDepth=5)

# Fit the model
decModel = dec.fit(train_data)

predictions = decModel.transform(test_data)

# Select example rows to display.
predictions.select("prediction", "label", "features").show(5)

# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

Random Forest Regression
+--------------------+-----+-------------+
|          prediction|label|     features|
+--------------------+-----+-------------+
|0.001508380488653...|  0.0|[1.0,1.0,0.0]|
|0.001508380488653...|  0.0|[1.0,1.0,0.0]|
|0.001508380488653...|  0.0|[1.0,1.0,0.0]|
|0.001508380488653...|  0.0|[1.0,1.0,0.0]|
|0.001508380488653...|  0.0|[1.0,1.0,0.0]|
+--------------------+-----+-------------+
only showing top 5 rows

Root Mean Squared Error (RMSE) on test data = 0.0447865


In [16]:
train_data_iter2, test_data_iter2 = pipe_df.randomSplit([0.7,0.3])
print("Training Dataset Count: " + str(train_data.count()))
print("Test Dataset Count: " + str(test_data.count()))

Training Dataset Count: 2773
Test Dataset Count: 1843


In [17]:

print("Linear Regression - Iteration 2")

lr = LinearRegression(featuresCol = 'features', labelCol='label',maxIter=10, regParam=0.3, elasticNetParam=0.8)

# Fit the model
lrModel = lr.fit(train_data_iter2)

# Print the coefficients and intercept for linear regression
print("Coefficients: %s" % str(lrModel.coefficients))
print("Intercept: %s" % str(lrModel.intercept))

# Summarize the model over the training set and print out some metrics
trainingSummary = lrModel.summary
print("numIterations: %d" % trainingSummary.totalIterations)
print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))
trainingSummary.residuals.show()
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)


Linear Regression - Iteration 2
Coefficients: [0.0,0.0,0.0]
Intercept: 0.006790123456790123
numIterations: 1
objectiveHistory: [0.5000000000000001]
+--------------------+
|           residuals|
+--------------------+
|-0.00679012345679...|
|-0.00679012345679...|
|-0.00679012345679...|
|-0.00679012345679...|
|-0.00679012345679...|
|-0.00679012345679...|
|-0.00679012345679...|
|-0.00679012345679...|
|-0.00679012345679...|
|-0.00679012345679...|
|-0.00679012345679...|
|-0.00679012345679...|
|-0.00679012345679...|
|-0.00679012345679...|
|-0.00679012345679...|
|-0.00679012345679...|
|-0.00679012345679...|
|-0.00679012345679...|
|-0.00679012345679...|
|-0.00679012345679...|
+--------------------+
only showing top 20 rows

RMSE: 0.085798
r2: 0.000000


In [18]:
lr_predictions = lrModel.transform(test_data_iter2)
lr_predictions.select("prediction","label","features").show(5)
from pyspark.ml.evaluation import RegressionEvaluator
lr_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="label",metricName="r2")
print("R Squared (R2) on test data = %g" % lr_evaluator.evaluate(lr_predictions))

+--------------------+-----+-------------+
|          prediction|label|     features|
+--------------------+-----+-------------+
|0.006790123456790123|  0.0|[1.0,1.0,0.0]|
|0.006790123456790123|  0.0|[1.0,1.0,0.0]|
|0.006790123456790123|  0.0|[1.0,1.0,0.0]|
|0.006790123456790123|  0.0|[1.0,1.0,0.0]|
|0.006790123456790123|  0.0|[1.0,1.0,0.0]|
+--------------------+-----+-------------+
only showing top 5 rows

R Squared (R2) on test data = -0.000164859


In [19]:
#Decision Tree Regression

print("Decision Tree Regression - Iteration 2")

dec = DecisionTreeRegressor(featuresCol = 'features', labelCol='label', maxBins=32, maxDepth=10)

# Fit the model
decModel = dec.fit(train_data_iter2)

predictions = decModel.transform(test_data_iter2)

# Select example rows to display.
predictions.select("prediction", "label", "features").show(5)

# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)


Decision Tree Regression - Iteration 2
+----------+-----+-------------+
|prediction|label|     features|
+----------+-----+-------------+
|       0.0|  0.0|[1.0,1.0,0.0]|
|       0.0|  0.0|[1.0,1.0,0.0]|
|       0.0|  0.0|[1.0,1.0,0.0]|
|       0.0|  0.0|[1.0,1.0,0.0]|
|       0.0|  0.0|[1.0,1.0,0.0]|
+----------+-----+-------------+
only showing top 5 rows

Root Mean Squared Error (RMSE) on test data = 0


In [20]:
#Random Forest Regression
print("Random Forest Regression - Iteration 2")

dec = RandomForestRegressor(featuresCol = 'features', labelCol='label', maxBins=32, maxDepth=10)

# Fit the model
decModel = dec.fit(train_data_iter2)

predictions = decModel.transform(test_data_iter2)

# Select example rows to display.
predictions.select("prediction", "label", "features").show(5)

# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

Random Forest Regression - Iteration 2
+--------------------+-----+-------------+
|          prediction|label|     features|
+--------------------+-----+-------------+
|0.002020378741551...|  0.0|[1.0,1.0,0.0]|
|0.002020378741551...|  0.0|[1.0,1.0,0.0]|
|0.002020378741551...|  0.0|[1.0,1.0,0.0]|
|0.002020378741551...|  0.0|[1.0,1.0,0.0]|
|0.002020378741551...|  0.0|[1.0,1.0,0.0]|
+--------------------+-----+-------------+
only showing top 5 rows

Root Mean Squared Error (RMSE) on test data = 0.0179271


In [21]:
murder_indexer = StringIndexer(inputCol='MURDER',outputCol='murder_re')
rape_indexer = StringIndexer(inputCol='RAPE',outputCol='rape_re')
vicious_crimes_indexer = StringIndexer(inputCol='VICIOUS CRIMES',outputCol='vicious_crimes_re')
total_indexer = StringIndexer(inputCol='TOTAL', outputCol='total_re')
crime_occured_indexer = StringIndexer(inputCol='CRIME OCCURED', outputCol='crime_occured_re')
category_indexer = StringIndexer(inputCol='CATEGORY', outputCol='category_re')
quarter_indexer = StringIndexer(inputCol='QUARTER', outputCol='quarter_re')
borough_indexer = StringIndexer(inputCol='BOROUGH', outputCol='label')



murder_encoder = OneHotEncoder(inputCol='murder_re',outputCol='murder_vec')
rape_encoder = OneHotEncoder(inputCol='rape_re',outputCol='rape_vec')
vicious_crimes_encoder = OneHotEncoder(inputCol='vicious_crimes_re',outputCol='vicious_crimes_vec')
total_encoder = OneHotEncoder(inputCol='total_re',outputCol='total_vec')
crime_occured_encoder = OneHotEncoder(inputCol='crime_occured_re',outputCol='crime_occured_vec')
category_encoder = OneHotEncoder(inputCol='category_re',outputCol='category_vec')
quarter_encoder = OneHotEncoder(inputCol='quarter_re',outputCol='quarter_vec')
label_encoder = StringIndexer(inputCol='label',outputCol='label')

assembler = VectorAssembler(inputCols=['murder_vec','rape_vec', 'vicious_crimes_vec', 'total_vec', 'crime_occured_vec', 'category_vec', 'quarter_vec'], outputCol="features")

In [22]:
pipeline2 = Pipeline(stages=[murder_indexer,rape_indexer,vicious_crimes_indexer,total_indexer,crime_occured_indexer,category_indexer,quarter_indexer,borough_indexer,murder_encoder,rape_encoder,vicious_crimes_encoder,total_encoder,crime_occured_encoder,category_encoder,quarter_encoder,assembler])

In [23]:
pipeline2_model = pipeline2.fit(df)
pipe2_df = pipeline2_model.transform(df)
pipe2_df = pipe2_df.select('label','features')
pipe2_df.describe()

DataFrame[summary: string, label: string]

In [24]:
train_data2, test_data2 = pipe2_df.randomSplit([0.6,0.4])
print("Training Dataset Count: " + str(train_data.count()))
print("Test Dataset Count: " + str(test_data.count()))

Training Dataset Count: 2773
Test Dataset Count: 1843


In [25]:
from pyspark.ml.classification import *
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

print("Logistic Regression - Iteration 1")

# Instantiate the model.
lr_model = LogisticRegression(featuresCol='features',labelCol='label')

# Fit the model.
lr_model = lr_model.fit(train_data2)

# And evaluate the model using the test data.
predictions = lr_model.transform(test_data2)
predictions.select("prediction", "label", "features").show(5)
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",metricName="accuracy")
accuracy = evaluator.evaluate(predictions)


print("Test set accuracy = " + str(accuracy))

Logistic Regression - Iteration 1
+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       1.0|  0.0|(36,[0,1,3,5,23,2...|
|       1.0|  0.0|(36,[0,1,3,5,23,2...|
|       1.0|  0.0|(36,[0,1,3,5,23,2...|
|       1.0|  0.0|(36,[0,1,3,5,23,2...|
|       1.0|  0.0|(36,[0,1,3,5,23,2...|
+----------+-----+--------------------+
only showing top 5 rows

Test set accuracy = 0.287506819421713


In [26]:

print("One vs Rest Classifer - Iteration 1")
# instantiate the base classifier.
lr = LogisticRegression(maxIter=5, tol=1E-6, fitIntercept=True)

# instantiate the One Vs Rest Classifier.
ovr = OneVsRest(classifier=lr)

# train the multiclass model.
ovrModel = ovr.fit(train_data2)

# score the model on test data.
predictions = ovrModel.transform(test_data2)
predictions.select("prediction", "label", "features").show(5)

# obtain evaluator.
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

# compute the classification error on test data.
accuracy = evaluator.evaluate(predictions)
print("Test accuracy = %g" % (1.0 - accuracy))

One vs Rest Classifer - Iteration 1
+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       1.0|  0.0|(36,[0,1,3,5,23,2...|
|       1.0|  0.0|(36,[0,1,3,5,23,2...|
|       1.0|  0.0|(36,[0,1,3,5,23,2...|
|       1.0|  0.0|(36,[0,1,3,5,23,2...|
|       1.0|  0.0|(36,[0,1,3,5,23,2...|
+----------+-----+--------------------+
only showing top 5 rows

Test accuracy = 0.704855


In [33]:
# create the trainer and set its parameters
from pyspark.ml.evaluation import BinaryClassificationEvaluator

print("Naive Bayes Iteration 1")
nb = NaiveBayes(smoothing=0.4, modelType="multinomial")

# train the model
model = nb.fit(train_data2)

# select example rows to display.
predictions = model.transform(test_data2)
#predictions.show()
predictions.select("prediction", "label", "features").show(5)

# compute accuracy on the test set
#evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
#print("Test set accuracy = " + str(accuracy))

evaluator = BinaryClassificationEvaluator()
print("Test Area Under ROC: " + str(evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})))
print("Test accuracy = %g" % (1.0 - accuracy))

Naive Bayes Iteration 1
+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       1.0|  0.0|(36,[0,1,3,5,23,2...|
|       1.0|  0.0|(36,[0,1,3,5,23,2...|
|       1.0|  0.0|(36,[0,1,3,5,23,2...|
|       1.0|  0.0|(36,[0,1,3,5,23,2...|
|       1.0|  0.0|(36,[0,1,3,5,23,2...|
+----------+-----+--------------------+
only showing top 5 rows

Test Area Under ROC: 0.49044284380418846
Test accuracy = 0.509557


In [28]:
train_data2_iter2, test_data2_iter2 = pipe2_df.randomSplit([0.7,0.3])
print("Training Dataset Count: " + str(train_data2_iter2.count()))
print("Test Dataset Count: " + str(test_data2_iter2.count()))

Training Dataset Count: 3209
Test Dataset Count: 1407


In [29]:
from pyspark.ml.classification import *
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

print("Logistic Regression - Iteration 2")

# Instantiate the model.
lr_model = LogisticRegression(featuresCol='features',labelCol='label')

# Fit the model.
lr_model = lr_model.fit(train_data2_iter2)

# And evaluate the model using the test data.
predictions = lr_model.transform(test_data2_iter2)

predictions.select("prediction", "label", "features").show(20)


evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))

Logistic Regression - Iteration 2
+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       1.0|  0.0|(36,[0,1,3,5,23,2...|
|       1.0|  0.0|(36,[0,1,3,5,23,2...|
|       1.0|  0.0|(36,[0,1,3,5,23,2...|
|       1.0|  0.0|(36,[0,1,3,5,23,2...|
|       1.0|  0.0|(36,[0,1,3,5,23,2...|
|       1.0|  0.0|(36,[0,1,3,5,23,2...|
|       1.0|  0.0|(36,[0,1,3,5,23,2...|
|       1.0|  0.0|(36,[0,1,3,5,23,2...|
|       1.0|  0.0|(36,[0,1,3,5,23,2...|
|       1.0|  0.0|(36,[0,1,3,5,23,2...|
|       1.0|  0.0|(36,[0,1,3,5,23,2...|
|       1.0|  0.0|(36,[0,1,3,5,23,2...|
|       1.0|  0.0|(36,[0,1,3,5,23,2...|
|       1.0|  0.0|(36,[0,1,3,5,23,2...|
|       1.0|  0.0|(36,[0,1,3,5,23,2...|
|       1.0|  0.0|(36,[0,1,3,5,23,2...|
|       1.0|  0.0|(36,[0,1,3,5,23,2...|
|       1.0|  0.0|(36,[0,1,3,5,23,2...|
|       1.0|  0.0|(36,[0,1,3,5,23,2...|
|       1.0|  0.0|(36,[0,1,3,5,23,2...|
+----------+-----+--------------------+
only s

In [30]:
print("One vs Rest Classifer - Iteration 2")
# instantiate the base classifier.
lr = LogisticRegression(maxIter=10, tol=1E-6, fitIntercept=True)

# instantiate the One Vs Rest Classifier.
ovr = OneVsRest(classifier=lr)

# train the multiclass model.
ovrModel = ovr.fit(train_data2_iter2)

# score the model on test data.
predictions = ovrModel.transform(test_data2_iter2)

predictions.select("prediction", "label", "features").show(20)

# obtain evaluator.
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

# compute the classification error on test data.
accuracy = evaluator.evaluate(predictions)
print("Test accuracy = %g" % (1.0 - accuracy))

One vs Rest Classifer - Iteration 2
+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       1.0|  0.0|(36,[0,1,3,5,23,2...|
|       1.0|  0.0|(36,[0,1,3,5,23,2...|
|       1.0|  0.0|(36,[0,1,3,5,23,2...|
|       1.0|  0.0|(36,[0,1,3,5,23,2...|
|       1.0|  0.0|(36,[0,1,3,5,23,2...|
|       1.0|  0.0|(36,[0,1,3,5,23,2...|
|       1.0|  0.0|(36,[0,1,3,5,23,2...|
|       1.0|  0.0|(36,[0,1,3,5,23,2...|
|       1.0|  0.0|(36,[0,1,3,5,23,2...|
|       1.0|  0.0|(36,[0,1,3,5,23,2...|
|       1.0|  0.0|(36,[0,1,3,5,23,2...|
|       1.0|  0.0|(36,[0,1,3,5,23,2...|
|       1.0|  0.0|(36,[0,1,3,5,23,2...|
|       1.0|  0.0|(36,[0,1,3,5,23,2...|
|       1.0|  0.0|(36,[0,1,3,5,23,2...|
|       1.0|  0.0|(36,[0,1,3,5,23,2...|
|       1.0|  0.0|(36,[0,1,3,5,23,2...|
|       1.0|  0.0|(36,[0,1,3,5,23,2...|
|       1.0|  0.0|(36,[0,1,3,5,23,2...|
|       1.0|  0.0|(36,[0,1,3,5,23,2...|
+----------+-----+--------------------+
only

In [34]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

print("Naive Bayes Iteration 2")
nb = NaiveBayes(smoothing=0.8, modelType="multinomial")

# train the model
model = nb.fit(train_data2_iter2)

# select example rows to display.
predictions = model.transform(test_data2_iter2)

predictions.select("prediction", "label", "features").show(10)



# compute accuracy on the test set
#evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
#print("Test set accuracy = " + str(accuracy))

evaluator = BinaryClassificationEvaluator()
print("Test Area Under ROC: " + str(evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})))
print("Test accuracy = %g" % (1.0 - accuracy))

Naive Bayes Iteration 2
+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       1.0|  0.0|(36,[0,1,3,5,23,2...|
|       1.0|  0.0|(36,[0,1,3,5,23,2...|
|       1.0|  0.0|(36,[0,1,3,5,23,2...|
|       1.0|  0.0|(36,[0,1,3,5,23,2...|
|       1.0|  0.0|(36,[0,1,3,5,23,2...|
|       1.0|  0.0|(36,[0,1,3,5,23,2...|
|       1.0|  0.0|(36,[0,1,3,5,23,2...|
|       1.0|  0.0|(36,[0,1,3,5,23,2...|
|       1.0|  0.0|(36,[0,1,3,5,23,2...|
|       1.0|  0.0|(36,[0,1,3,5,23,2...|
+----------+-----+--------------------+
only showing top 10 rows

Test Area Under ROC: 0.5211499085288767
Test accuracy = 0.47885
