## Spark

In [12]:
import findspark
# Must run before any `pyspark` import in this notebook: findspark.init()
# locates the Spark installation and makes the `pyspark` package importable
# (presumably via SPARK_HOME — confirm for this environment).
findspark.init()

In [13]:
import pandas as pd

# Never truncate long cell values when a pandas DataFrame is rendered.
pd.options.display.max_colwidth = None

## Create Spark Session

In [14]:
from pyspark.sql.session import SparkSession

# Reuse the already-active session if there is one; otherwise start a new
# session registered under this application name.
spark = SparkSession.builder.appName("Truckfleet ML violation detection").getOrCreate()

In [15]:
# Read the CSV data under the raw truckfleet directory. The first line of each
# file is treated as the header and column types are inferred from the data.
# The result is cached because several later cells reuse it.
truckDF = spark.read.csv(
    "hdfs://localhost:9000/datalake/raw/truckfleet/",
    header=True,
    inferSchema=True,
).cache()

# count() is an action, so it also materialises the cache.
print(f"There are {truckDF.count()} rows in the datasets")

There are 1000 rows in the datasets


In [16]:
# Show the column names and the types that the CSV reader inferred.
truckDF.printSchema()

root
 |-- driverName: string (nullable = true)
 |-- is_violation: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- eventTime: string (nullable = true)
 |-- is_Adam_John_Michael: integer (nullable = true)



In [17]:
# Preview five rows; converting to pandas gives the notebook's rich table display.
truckDF.limit(5).toPandas()

Unnamed: 0,driverName,is_violation,year,month,day,eventTime,is_Adam_John_Michael
0,John,1,2022,7,27,2022-07-27,1
1,Adam,0,2022,12,14,2022-12-14,1
2,Adam,1,2020,5,26,2020-05-26,1
3,Adam,1,2022,1,12,2022-01-12,1
4,Adam,1,2020,4,8,2020-04-08,1


In [1]:
from pyspark.ml.feature import VectorAssembler

# Pack the numeric input columns into one vector column named "features",
# which is the column the classifier is later pointed at.
assembler = VectorAssembler(
    inputCols=[
        "year",
        "month",
        "day",
        "is_Adam_John_Michael",
    ],
    outputCol="features",
)
truckFeaturesDF = assembler.transform(truckDF)

# Preview five rows, now including the assembled vector column.
truckFeaturesDF.limit(5).toPandas()

ModuleNotFoundError: No module named 'pyspark'

In [19]:
# Check that the assembler added the `features` column alongside the original ones.
truckFeaturesDF.printSchema()

root
 |-- driverName: string (nullable = true)
 |-- is_violation: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- eventTime: string (nullable = true)
 |-- is_Adam_John_Michael: integer (nullable = true)
 |-- features: vector (nullable = true)



In [20]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import col

# Input: `truckFeaturesDF`, i.e. the truck data with the assembled "features" column.

# Hold out 30% of the rows for testing; seed=42 fixes the random split.
trainData, testData = truckFeaturesDF.randomSplit([0.7, 0.3], seed=42)

# Logistic regression that predicts "is_violation" from the "features" vector,
# fitted on the training rows only.
lr = LogisticRegression(labelCol="is_violation", featuresCol="features")
lrModel = lr.fit(trainData)

# Score the held-out rows; the evaluator below reads the "rawPrediction"
# column of this result.
predictions = lrModel.transform(testData)

# Compare the raw predictions against the true label and report the area
# under the ROC curve.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
    labelCol="is_violation",
)
auc = evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})

print(f"Area under the ROC curve: {auc:.3f}")

Area under the ROC curve: 0.732


In [21]:
# HDFS location where the fitted logistic regression model is persisted.
# NOTE(review): this path lies inside the raw CSV directory that the data
# loading cell reads from; consider a dedicated models directory instead.
modelPath = "hdfs://localhost:9000/datalake/raw/truckfleet/logistic_regression_model"

# Plain `lrModel.save(modelPath)` fails when the path already exists, which
# breaks every re-run of this notebook after the first one. Going through the
# writer with overwrite() replaces the previously saved model instead.
lrModel.write().overwrite().save(modelPath)

print(f"Model successfully saved to {modelPath}")

[Stage 43:>                                                         (0 + 1) / 1]

Model successfully saved to hdfs://localhost:9000/datalake/raw/truckfleet/logistic_regression_model


                                                                                

In [22]:
from pyspark.ml.classification import LogisticRegressionModel

# Restore the persisted model from `modelPath` (set in the save cell) through
# the model class's reader.
loadedModel = LogisticRegressionModel.read().load(modelPath)

print("Model successfully loaded.")

Model successfully loaded.


In [23]:

# Score the same held-out rows, this time with the model reloaded from HDFS.
predictions = loadedModel.transform(testData)

# Same metric as for the in-memory model: area under the ROC curve computed
# from the "rawPrediction" column against the "is_violation" label.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
    labelCol="is_violation",
)
auc = evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})

print(f"Area under the ROC curve: {auc:.3f}")

Area under the ROC curve: 0.732
