In [1]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
#import numpy
# Load training data
from pyspark.ml.linalg import SparseVector
# from pyspark.python.pyspark.shell import spark
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

In [2]:
data = spark.read.load("adult_data.csv", format="csv", header=True, delimiter=",")
data.show(5)

+---+------+---+---+----+---+---+
|  1|     3|  5| 10|  11| 12| 13|
+---+------+---+---+----+---+---+
| 39| 77516| 13|  1|2174|  0| 40|
| 50| 83311| 13|  1|   0|  0| 13|
| 38|215646|  9|  1|   0|  0| 40|
| 53|234721|  7|  1|   0|  0| 40|
| 28|338409| 13|  2|   0|  0| 40|
+---+------+---+---+----+---+---+
only showing top 5 rows



In [3]:
from pyspark.sql.types import *

# Change column type
data = data.withColumn("1", data["1"].cast(IntegerType()))
data = data.withColumn("3", data["3"].cast(IntegerType()))
data = data.withColumn("5", data["5"].cast(IntegerType()))
data = data.withColumn("10", data["10"].cast(IntegerType()))
data = data.withColumn("11", data["11"].cast(IntegerType()))
data = data.withColumn("12", data["12"].cast(IntegerType()))
data = data.withColumn("13", data["13"].cast(IntegerType()))

data.printSchema()

root
 |-- 1: integer (nullable = true)
 |-- 3: integer (nullable = true)
 |-- 5: integer (nullable = true)
 |-- 10: integer (nullable = true)
 |-- 11: integer (nullable = true)
 |-- 12: integer (nullable = true)
 |-- 13: integer (nullable = true)



In [4]:
data = data.withColumn("label", data['10'] - 0)
data.show(5)

+---+------+---+---+----+---+---+-----+
|  1|     3|  5| 10|  11| 12| 13|label|
+---+------+---+---+----+---+---+-----+
| 39| 77516| 13|  1|2174|  0| 40|    1|
| 50| 83311| 13|  1|   0|  0| 13|    1|
| 38|215646|  9|  1|   0|  0| 40|    1|
| 53|234721|  7|  1|   0|  0| 40|    1|
| 28|338409| 13|  2|   0|  0| 40|    2|
+---+------+---+---+----+---+---+-----+
only showing top 5 rows



In [5]:
assem = VectorAssembler(inputCols=data.columns[0:7], outputCol='features')
data = assem.transform(data)
data.show(5)

+---+------+---+---+----+---+---+-----+--------------------+
|  1|     3|  5| 10|  11| 12| 13|label|            features|
+---+------+---+---+----+---+---+-----+--------------------+
| 39| 77516| 13|  1|2174|  0| 40|    1|[39.0,77516.0,13....|
| 50| 83311| 13|  1|   0|  0| 13|    1|[50.0,83311.0,13....|
| 38|215646|  9|  1|   0|  0| 40|    1|[38.0,215646.0,9....|
| 53|234721|  7|  1|   0|  0| 40|    1|[53.0,234721.0,7....|
| 28|338409| 13|  2|   0|  0| 40|    2|[28.0,338409.0,13...|
+---+------+---+---+----+---+---+-----+--------------------+
only showing top 5 rows



In [6]:
# Split the data into train and test
train,test = data.randomSplit([0.6, 0.4], 1234)

In [10]:
# create the trainer and set its parameters
nb1 = NaiveBayes(smoothing=1.0, modelType="multinomial")

# train the model
model1 = nb1.fit(train)

# select example rows to display.
predictions = model1.transform(test)
predictions.show()

# compute accuracy on the test set
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))

+---+-----+---+---+----+---+---+-----+--------------------+--------------------+--------------------+----------+
|  1|    3|  5| 10|  11| 12| 13|label|            features|       rawPrediction|         probability|prediction|
+---+-----+---+---+----+---+---+-----+--------------------+--------------------+--------------------+----------+
| 17|19752|  7|  2|   0|  0| 25|    2|[17.0,19752.0,7.0...|[-603.83024152897...|[3.55017080576790...|       1.0|
| 17|24090|  9|  2|   0|  0| 35|    2|[17.0,24090.0,9.0...|[-741.82504306404...|[1.30388265806149...|       1.0|
| 17|25051|  6|  1|   0|  0| 16|    1|[17.0,25051.0,6.0...|[-547.62221373539...|[8.55480582189268...|       1.0|
| 17|29571|  8|  1|   0|  0| 15|    1|[17.0,29571.0,8.0...|[-594.41135826275...|[4.22658921812652...|       1.0|
| 17|31007|  6|  2|   0|  0| 30|    2|[17.0,31007.0,6.0...|[-724.46084085428...|[6.78632746005661...|       1.0|
| 17|32607|  6|  1|   0|  0| 20|    1|[17.0,32607.0,6.0...|[-640.64253400357...|[1.1532817128748

In [12]:
# create the trainer and set its parameters
nb2 = NaiveBayes(smoothing=10.0, modelType="multinomial")

# train the model
model2 = nb2.fit(train)

# select example rows to display.
predictions = model2.transform(test)
predictions.show()

# compute accuracy on the test set
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))

+---+-----+---+---+----+---+---+-----+--------------------+--------------------+--------------------+----------+
|  1|    3|  5| 10|  11| 12| 13|label|            features|       rawPrediction|         probability|prediction|
+---+-----+---+---+----+---+---+-----+--------------------+--------------------+--------------------+----------+
| 17|19752|  7|  2|   0|  0| 25|    2|[17.0,19752.0,7.0...|[-603.82833255900...|[3.54425060120333...|       1.0|
| 17|24090|  9|  2|   0|  0| 35|    2|[17.0,24090.0,9.0...|[-741.82292742258...|[1.30136607200351...|       1.0|
| 17|25051|  6|  1|   0|  0| 16|    1|[17.0,25051.0,6.0...|[-547.62132541304...|[8.54402615287241...|       1.0|
| 17|29571|  8|  1|   0|  0| 15|    1|[17.0,29571.0,8.0...|[-594.41044627901...|[4.22120922120825...|       1.0|
| 17|31007|  6|  2|   0|  0| 30|    2|[17.0,31007.0,6.0...|[-724.45916112928...|[6.77654354208634...|       1.0|
| 17|32607|  6|  1|   0|  0| 20|    1|[17.0,32607.0,6.0...|[-640.64174307001...|[1.1519318472885

In [14]:
from pyspark.ml.classification import DecisionTreeClassifier

# create the trainer and set its parameters
nb3 = DecisionTreeClassifier(labelCol="label", featuresCol="features")

# train the model
model3 = nb3.fit(train)

# select example rows to display.
predictions = model3.transform(test)
predictions.show()

# compute accuracy on the test set
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))

+---+-----+---+---+----+---+---+-----+--------------------+-----------------+-------------+----------+
|  1|    3|  5| 10|  11| 12| 13|label|            features|    rawPrediction|  probability|prediction|
+---+-----+---+---+----+---+---+-----+--------------------+-----------------+-------------+----------+
| 17|19752|  7|  2|   0|  0| 25|    2|[17.0,19752.0,7.0...| [0.0,0.0,6409.0]|[0.0,0.0,1.0]|       2.0|
| 17|24090|  9|  2|   0|  0| 35|    2|[17.0,24090.0,9.0...| [0.0,0.0,6409.0]|[0.0,0.0,1.0]|       2.0|
| 17|25051|  6|  1|   0|  0| 16|    1|[17.0,25051.0,6.0...|[0.0,13008.0,0.0]|[0.0,1.0,0.0]|       1.0|
| 17|29571|  8|  1|   0|  0| 15|    1|[17.0,29571.0,8.0...|[0.0,13008.0,0.0]|[0.0,1.0,0.0]|       1.0|
| 17|31007|  6|  2|   0|  0| 30|    2|[17.0,31007.0,6.0...| [0.0,0.0,6409.0]|[0.0,0.0,1.0]|       2.0|
| 17|32607|  6|  1|   0|  0| 20|    1|[17.0,32607.0,6.0...|[0.0,13008.0,0.0]|[0.0,1.0,0.0]|       1.0|
| 17|34019|  6|  1|   0|  0| 20|    1|[17.0,34019.0,6.0...|[0.0,13008.0,0

In [16]:
from pyspark.ml.classification import RandomForestClassifier

# create the trainer and set its parameters
nb3 = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=10)

# train the model
model3 = nb3.fit(train)

# select example rows to display.
predictions = model3.transform(test)
predictions.show()

# compute accuracy on the test set
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))

+---+-----+---+---+----+---+---+-----+--------------------+--------------------+--------------------+----------+
|  1|    3|  5| 10|  11| 12| 13|label|            features|       rawPrediction|         probability|prediction|
+---+-----+---+---+----+---+---+-----+--------------------+--------------------+--------------------+----------+
| 17|19752|  7|  2|   0|  0| 25|    2|[17.0,19752.0,7.0...|[0.0,0.4271755725...|[0.0,0.0427175572...|       2.0|
| 17|24090|  9|  2|   0|  0| 35|    2|[17.0,24090.0,9.0...|[0.0,0.4271755725...|[0.0,0.0427175572...|       2.0|
| 17|25051|  6|  1|   0|  0| 16|    1|[17.0,25051.0,6.0...|[0.0,9.4271755725...|[0.0,0.9427175572...|       1.0|
| 17|29571|  8|  1|   0|  0| 15|    1|[17.0,29571.0,8.0...|[0.0,9.4271755725...|[0.0,0.9427175572...|       1.0|
| 17|31007|  6|  2|   0|  0| 30|    2|[17.0,31007.0,6.0...|[0.0,0.4271755725...|[0.0,0.0427175572...|       2.0|
| 17|32607|  6|  1|   0|  0| 20|    1|[17.0,32607.0,6.0...|[0.0,9.4271755725...|[0.0,0.942717557

In [17]:
from pyspark.ml.classification import RandomForestClassifier

# create the trainer and set its parameters
nb3 = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=100)

# train the model
model3 = nb3.fit(train)

# select example rows to display.
predictions = model3.transform(test)
predictions.show()

# compute accuracy on the test set
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))

+---+-----+---+---+----+---+---+-----+--------------------+--------------------+--------------------+----------+
|  1|    3|  5| 10|  11| 12| 13|label|            features|       rawPrediction|         probability|prediction|
+---+-----+---+---+----+---+---+-----+--------------------+--------------------+--------------------+----------+
| 17|19752|  7|  2|   0|  0| 25|    2|[17.0,19752.0,7.0...|[0.0,5.5219912822...|[0.0,0.0552199128...|       2.0|
| 17|24090|  9|  2|   0|  0| 35|    2|[17.0,24090.0,9.0...|[0.0,4.2688654979...|[0.0,0.0426886549...|       2.0|
| 17|25051|  6|  1|   0|  0| 16|    1|[17.0,25051.0,6.0...|[0.0,94.521991282...|[0.0,0.9452199128...|       1.0|
| 17|29571|  8|  1|   0|  0| 15|    1|[17.0,29571.0,8.0...|[0.0,94.209548863...|[0.0,0.9420954886...|       1.0|
| 17|31007|  6|  2|   0|  0| 30|    2|[17.0,31007.0,6.0...|[0.0,5.5219912822...|[0.0,0.0552199128...|       2.0|
| 17|32607|  6|  1|   0|  0| 20|    1|[17.0,32607.0,6.0...|[0.0,94.521991282...|[0.0,0.945219912

In [18]:
from pyspark.ml.classification import RandomForestClassifier

# create the trainer and set its parameters
nb3 = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=1)

# train the model
model3 = nb3.fit(train)

# select example rows to display.
predictions = model3.transform(test)
predictions.show()

# compute accuracy on the test set
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))

+---+-----+---+---+----+---+---+-----+--------------------+-------------+-------------+----------+
|  1|    3|  5| 10|  11| 12| 13|label|            features|rawPrediction|  probability|prediction|
+---+-----+---+---+----+---+---+-----+--------------------+-------------+-------------+----------+
| 17|19752|  7|  2|   0|  0| 25|    2|[17.0,19752.0,7.0...|[0.0,0.0,1.0]|[0.0,0.0,1.0]|       2.0|
| 17|24090|  9|  2|   0|  0| 35|    2|[17.0,24090.0,9.0...|[0.0,0.0,1.0]|[0.0,0.0,1.0]|       2.0|
| 17|25051|  6|  1|   0|  0| 16|    1|[17.0,25051.0,6.0...|[0.0,1.0,0.0]|[0.0,1.0,0.0]|       1.0|
| 17|29571|  8|  1|   0|  0| 15|    1|[17.0,29571.0,8.0...|[0.0,1.0,0.0]|[0.0,1.0,0.0]|       1.0|
| 17|31007|  6|  2|   0|  0| 30|    2|[17.0,31007.0,6.0...|[0.0,0.0,1.0]|[0.0,0.0,1.0]|       2.0|
| 17|32607|  6|  1|   0|  0| 20|    1|[17.0,32607.0,6.0...|[0.0,1.0,0.0]|[0.0,1.0,0.0]|       1.0|
| 17|34019|  6|  1|   0|  0| 20|    1|[17.0,34019.0,6.0...|[0.0,1.0,0.0]|[0.0,1.0,0.0]|       1.0|
| 17|47199

In [19]:
data = spark.read.load("imports-85_data.csv", format="csv", header=True, delimiter=",")
data.show(5)

+---+---+----+-----+----+----+----+---+----+----+---+---+----+---+---+
|  0|  1|   2|    3|   4|   5|   6|  7|   8|   9| 10| 11|  12| 13| 14|
+---+---+----+-----+----+----+----+---+----+----+---+---+----+---+---+
|  3|  1|88.6|168.8|64.1|48.8|2548|130|3.47|2.68|  9|111|5000| 21| 27|
|  3|  1|88.6|168.8|64.1|48.8|2548|130|3.47|2.68|  9|111|5000| 21| 27|
|  1|  1|94.5|171.2|65.5|52.4|2823|152|2.68|3.47|  9|154|5000| 19| 26|
|  2|  1|99.8|176.6|66.2|54.3|2337|109|3.19| 3.4| 10|102|5500| 24| 30|
|  2|  1|99.4|176.6|66.4|54.3|2824|136|3.19| 3.4|  8|115|5500| 18| 22|
+---+---+----+-----+----+----+----+---+----+----+---+---+----+---+---+
only showing top 5 rows



In [20]:
from pyspark.sql.types import *

# Change column type
data = data.withColumn("0", data["0"].cast(IntegerType()))
data = data.withColumn("1", data["1"].cast(IntegerType()))
data = data.withColumn("2", data["2"].cast(IntegerType()))
data = data.withColumn("4", data["4"].cast(IntegerType()))
data = data.withColumn("3", data["3"].cast(IntegerType()))
data = data.withColumn("5", data["5"].cast(IntegerType()))

data = data.withColumn("6", data["6"].cast(IntegerType()))
data = data.withColumn("7", data["7"].cast(IntegerType()))
data = data.withColumn("8", data["8"].cast(IntegerType()))
data = data.withColumn("9", data["9"].cast(IntegerType()))


data = data.withColumn("10", data["10"].cast(IntegerType()))
data = data.withColumn("11", data["11"].cast(IntegerType()))
data = data.withColumn("12", data["12"].cast(IntegerType()))
data = data.withColumn("13", data["13"].cast(IntegerType()))

data.printSchema()

root
 |-- 0: integer (nullable = true)
 |-- 1: integer (nullable = true)
 |-- 2: integer (nullable = true)
 |-- 3: integer (nullable = true)
 |-- 4: integer (nullable = true)
 |-- 5: integer (nullable = true)
 |-- 6: integer (nullable = true)
 |-- 7: integer (nullable = true)
 |-- 8: integer (nullable = true)
 |-- 9: integer (nullable = true)
 |-- 10: integer (nullable = true)
 |-- 11: integer (nullable = true)
 |-- 12: integer (nullable = true)
 |-- 13: integer (nullable = true)
 |-- 14: string (nullable = true)



In [21]:
data = data.withColumn("label", data['14'] - 0)
data.show(5)

+---+---+---+---+---+---+----+---+---+---+---+---+----+---+---+-----+
|  0|  1|  2|  3|  4|  5|   6|  7|  8|  9| 10| 11|  12| 13| 14|label|
+---+---+---+---+---+---+----+---+---+---+---+---+----+---+---+-----+
|  3|  1| 88|168| 64| 48|2548|130|  3|  2|  9|111|5000| 21| 27| 27.0|
|  3|  1| 88|168| 64| 48|2548|130|  3|  2|  9|111|5000| 21| 27| 27.0|
|  1|  1| 94|171| 65| 52|2823|152|  2|  3|  9|154|5000| 19| 26| 26.0|
|  2|  1| 99|176| 66| 54|2337|109|  3|  3| 10|102|5500| 24| 30| 30.0|
|  2|  1| 99|176| 66| 54|2824|136|  3|  3|  8|115|5500| 18| 22| 22.0|
+---+---+---+---+---+---+----+---+---+---+---+---+----+---+---+-----+
only showing top 5 rows



In [22]:
assem = VectorAssembler(inputCols=data.columns[0:14], outputCol='features')
data = assem.transform(data)
data.show(5)

+---+---+---+---+---+---+----+---+---+---+---+---+----+---+---+-----+--------------------+
|  0|  1|  2|  3|  4|  5|   6|  7|  8|  9| 10| 11|  12| 13| 14|label|            features|
+---+---+---+---+---+---+----+---+---+---+---+---+----+---+---+-----+--------------------+
|  3|  1| 88|168| 64| 48|2548|130|  3|  2|  9|111|5000| 21| 27| 27.0|[3.0,1.0,88.0,168...|
|  3|  1| 88|168| 64| 48|2548|130|  3|  2|  9|111|5000| 21| 27| 27.0|[3.0,1.0,88.0,168...|
|  1|  1| 94|171| 65| 52|2823|152|  2|  3|  9|154|5000| 19| 26| 26.0|[1.0,1.0,94.0,171...|
|  2|  1| 99|176| 66| 54|2337|109|  3|  3| 10|102|5500| 24| 30| 30.0|[2.0,1.0,99.0,176...|
|  2|  1| 99|176| 66| 54|2824|136|  3|  3|  8|115|5500| 18| 22| 22.0|[2.0,1.0,99.0,176...|
+---+---+---+---+---+---+----+---+---+---+---+---+----+---+---+-----+--------------------+
only showing top 5 rows



In [23]:
# Split the data into train and test
train,test = data.randomSplit([0.6, 0.4], 1234)

In [29]:
data = spark.read.load("imports-85_data.csv", format="csv", header=True, delimiter=",")
data.show(5)

from pyspark.sql.types import *

# Change column type
data = data.withColumn("0", data["0"].cast(IntegerType()))
data = data.withColumn("1", data["1"].cast(IntegerType()))
data = data.withColumn("2", data["2"].cast(IntegerType()))
data = data.withColumn("4", data["4"].cast(IntegerType()))
data = data.withColumn("3", data["3"].cast(IntegerType()))
data = data.withColumn("5", data["5"].cast(IntegerType()))

data = data.withColumn("6", data["6"].cast(IntegerType()))
data = data.withColumn("7", data["7"].cast(IntegerType()))
data = data.withColumn("8", data["8"].cast(IntegerType()))
data = data.withColumn("9", data["9"].cast(IntegerType()))


data = data.withColumn("10", data["10"].cast(IntegerType()))
data = data.withColumn("11", data["11"].cast(IntegerType()))
data = data.withColumn("12", data["12"].cast(IntegerType()))
data = data.withColumn("13", data["13"].cast(IntegerType()))

data.printSchema()

from pyspark.ml.regression import LinearRegression

# create the trainer and set its parameters
lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

# Fit the model
lrModel = lr.fit(data)

# Print the coefficients and intercept for linear regression
print("Coefficients: %s" % str(lrModel.coefficients))
print("Intercept: %s" % str(lrModel.intercept))

# Summarize the model over the training set and print out some metrics
trainingSummary = lrModel.summary
print("numIterations: %d" % trainingSummary.totalIterations)
print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))
trainingSummary.residuals.show()
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

+---+---+----+-----+----+----+----+---+----+----+---+---+----+---+---+
|  0|  1|   2|    3|   4|   5|   6|  7|   8|   9| 10| 11|  12| 13| 14|
+---+---+----+-----+----+----+----+---+----+----+---+---+----+---+---+
|  3|  1|88.6|168.8|64.1|48.8|2548|130|3.47|2.68|  9|111|5000| 21| 27|
|  3|  1|88.6|168.8|64.1|48.8|2548|130|3.47|2.68|  9|111|5000| 21| 27|
|  1|  1|94.5|171.2|65.5|52.4|2823|152|2.68|3.47|  9|154|5000| 19| 26|
|  2|  1|99.8|176.6|66.2|54.3|2337|109|3.19| 3.4| 10|102|5500| 24| 30|
|  2|  1|99.4|176.6|66.4|54.3|2824|136|3.19| 3.4|  8|115|5500| 18| 22|
+---+---+----+-----+----+----+----+---+----+----+---+---+----+---+---+
only showing top 5 rows

root
 |-- 0: integer (nullable = true)
 |-- 1: integer (nullable = true)
 |-- 2: integer (nullable = true)
 |-- 3: integer (nullable = true)
 |-- 4: integer (nullable = true)
 |-- 5: integer (nullable = true)
 |-- 6: integer (nullable = true)
 |-- 7: integer (nullable = true)
 |-- 8: integer (nullable = true)
 |-- 9: integer (nullable

IllegalArgumentException: 'Field "features" does not exist.\nAvailable fields: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14'