IMPORTING PACKAGES

In [1]:
import os
import warnings

import numpy as np
import pandas as pd
import pylab as pl
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler, MinMaxScaler
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Correlation
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql import DataFrame
from pyspark.sql import SparkSession
from pyspark.sql import Window
from pyspark.sql.functions import (
    min, max, col, isnan, isnull,
    count, when,
)
from pyspark.sql.types import StringType

# Suppress all warnings
warnings.filterwarnings("ignore")

In [2]:
# Change into the directory that holds the input CSV.
# The location can be overridden with the SDWA_DATA_DIR environment variable so
# the notebook is not tied to one machine; the default keeps the original path.
DATA_DIR = os.environ.get(
    "SDWA_DATA_DIR", r"C:\Users\june3\OneDrive\Desktop\my_output_result"
)
os.chdir(DATA_DIR)
# Echo the resulting working directory, as the %cd magic did.
print(os.getcwd())

C:\Users\june3\OneDrive\Desktop\my_output_result


CREATE SPARK SESSION

In [3]:
# Step 1: Start (or reuse) the Spark session for this analysis
spark = (
    SparkSession.builder
    .appName("SDWA DATA ANALYSIS 2")
    .getOrCreate()
)

In [4]:
# Step 2: Load the single input CSV, using its header row for column names
# and letting Spark infer the column types
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("gmm_k5.csv")
)

In [5]:
# cluster_mapping = {0: 'High-Risk', 
#                    1: 'Low-Risk', 
#                    2: 'Moderate-Risk', 
#                    3: 'Near-Compliant', 
#                    4: 'Compliant'}

# mapping_expr = F.create_map([F.lit(x) for pair in cluster_mapping.items() for x in pair])
# df = df.withColumn('label', mapping_expr[df['cluster']])

In [9]:
# Keep only the identifier, the categorical and log-scaled numeric predictors,
# and the "cluster" column that is used as the label further down.
selected_columns = [
    # identifier
    "PWSID",
    # categorical descriptors
    "PWS_ACTIVITY_CODE",
    "IS_SCHOOL_OR_DAYCARE_IND",
    "SOURCE_WATER_PROTECTION_CODE",
    "OUTSTANDING_PERFORMER",
    "MANAGEMENT_OPS_EVAL_CODE",
    "SOURCE_WATER_EVAL_CODE",
    "SECURITY_EVAL_CODE",
    "PUMPS_EVAL_CODE",
    "OTHER_EVAL_CODE",
    "COMPLIANCE_EVAL_CODE",
    "DATA_VERIFICATION_EVAL_CODE",
    "TREATMENT_EVAL_CODE",
    "FINISHED_WATER_STOR_EVAL_CODE",
    "DISTRIBUTION_EVAL_CODE",
    "FINANCIAL_EVAL_CODE",
    "VIOLATION_CATEGORY_CODE",
    "IS_HEALTH_BASED_IND",
    "IS_MAJOR_VIOL_IND",
    "VIOLATION_STATUS",
    "ENF_ACTION_CATEGORY",
    "COMPLIANCE_STATUS",
    # numeric (log) measures
    "TOTAL_POPULATION_SERVED_COUNT_LOG",
    "TOTAL_SERVICE_CONNECTIONS_COUNT_LOG",
    "VIOL_MEASURE_LOG",
    "TOTAL_VIOLATIONS_LOG",
    "AVG_VIOLATION_DURATION_DAYS_LOG",
    "OPEN_VIOLATIONS_COUNT_LOG",
    "TOTAL_LATE_COMPLIANT_ACTIONS_LOG",
    # label
    "cluster",
]
df = df.select(*selected_columns)

In [11]:
# Defining a function to find the missing values
def get_missing_values(df, dataframe_name):
    """Print a markdown table with the number of missing values per column.

    A value counts as missing when it is null, or when it is NaN in a
    float/double column.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        DataFrame whose columns are inspected.
    dataframe_name : str
        Label used in the printed heading.

    Returns
    -------
    None
        The table is printed, nothing is returned.
    """
    # NaN only exists for floating-point data. Calling isnan() on other column
    # types relies on an implicit cast to double, so restrict it to the
    # float/double columns and use a plain null check everywhere else.
    float_columns = {
        name for name, dtype in df.dtypes if dtype in ("float", "double")
    }

    def missing_condition(name):
        # Boolean Column that is true where `name` holds a missing value
        condition = isnull(name)
        if name in float_columns:
            condition = condition | isnan(name)
        return condition

    # when(cond, c) yields a non-null literal where cond holds, so count()
    # returns the number of missing rows for each column
    missing_df = df.select([
        count(when(missing_condition(c), c)).alias(c) for c in df.columns
    ])

    # Convert to Pandas for better formatting (one row per original column)
    missing = missing_df.toPandas().transpose().reset_index()
    missing.columns = ['Column', 'Missing_Count']

    # Display the missing values
    print(f"--- Missing Values in {dataframe_name} ---")
    print(missing.to_markdown(index=False))
    print("\n")

In [12]:
# Report the per-column missing-value counts for the selected columns
get_missing_values(df, dataframe_name="Dataframe")

--- Missing Values in Dataframe ---
| Column                              |   Missing_Count |
|:------------------------------------|----------------:|
| PWSID                               |               0 |
| PWS_ACTIVITY_CODE                   |               0 |
| IS_SCHOOL_OR_DAYCARE_IND            |               0 |
| SOURCE_WATER_PROTECTION_CODE        |               0 |
| OUTSTANDING_PERFORMER               |               0 |
| MANAGEMENT_OPS_EVAL_CODE            |               0 |
| SOURCE_WATER_EVAL_CODE              |               0 |
| SECURITY_EVAL_CODE                  |               0 |
| PUMPS_EVAL_CODE                     |               0 |
| OTHER_EVAL_CODE                     |               0 |
| COMPLIANCE_EVAL_CODE                |               0 |
| DATA_VERIFICATION_EVAL_CODE         |               0 |
| TREATMENT_EVAL_CODE                 |               0 |
| FINISHED_WATER_STOR_EVAL_CODE       |               0 |
| DISTRIBUTION_EVAL_CODE            

In [14]:
# Report the DataFrame shape (row count triggers a Spark job)
row_count = df.count()
column_count = len(df.columns)
print(f" Dataframe 1: rows: {row_count}, columns: {column_count} ")

 Dataframe 1: rows: 43073, columns: 30 


In [15]:
# Categorical predictor columns, in the order they are encoded and assembled.
# The site-evaluation columns all share the "_EVAL_CODE" suffix.
_eval_areas = (
    "MANAGEMENT_OPS",
    "SOURCE_WATER",
    "SECURITY",
    "PUMPS",
    "OTHER",
    "COMPLIANCE",
    "DATA_VERIFICATION",
    "TREATMENT",
    "FINISHED_WATER_STOR",
    "DISTRIBUTION",
    "FINANCIAL",
)
cat_vars = (
    [
        "PWS_ACTIVITY_CODE",
        "IS_SCHOOL_OR_DAYCARE_IND",
        "SOURCE_WATER_PROTECTION_CODE",
        "OUTSTANDING_PERFORMER",
    ]
    + [f"{area}_EVAL_CODE" for area in _eval_areas]
    + [
        "VIOLATION_CATEGORY_CODE",
        "IS_HEALTH_BASED_IND",
        "IS_MAJOR_VIOL_IND",
        "VIOLATION_STATUS",
        "ENF_ACTION_CATEGORY",
        "COMPLIANCE_STATUS",
    ]
)

In [16]:
# Numeric predictor columns; every one carries the "_LOG" suffix
_log_measures = (
    "TOTAL_POPULATION_SERVED_COUNT",
    "TOTAL_SERVICE_CONNECTIONS_COUNT",
    "VIOL_MEASURE",
    "TOTAL_VIOLATIONS",
    "AVG_VIOLATION_DURATION_DAYS",
    "OPEN_VIOLATIONS_COUNT",
    "TOTAL_LATE_COMPLIANT_ACTIONS",
)
num_vars = [f"{measure}_LOG" for measure in _log_measures]

#### Data Preprocessing

In [17]:
# Pipeline stages: index and one-hot encode every categorical column, then
# assemble the encoded columns together with the numeric columns.
stages = []

# Step 1: Process categorical variables
# The loop variable is deliberately not named `col`: a module-level
# `for col in ...` would rebind the `col` function imported from
# pyspark.sql.functions to a plain string for the rest of the notebook.
for cat_col in cat_vars:
    # StringIndexer for categorical variable
    indexer = StringIndexer(inputCol=cat_col, outputCol=f"{cat_col}_indexed")
    stages.append(indexer)

    # OneHotEncoder for indexed variable
    encoder = OneHotEncoder(inputCol=f"{cat_col}_indexed", outputCol=f"{cat_col}_encoded")
    stages.append(encoder)

# Step 2: Combine all encoded categorical and numerical columns using VectorAssembler
# Encoded categorical columns come first, followed by the numerical columns
assembled_input_cols = [f"{cat_col}_encoded" for cat_col in cat_vars] + num_vars

assembler = VectorAssembler(inputCols=assembled_input_cols, outputCol="features")
stages.append(assembler)

# Step 3: Create the pipeline
pipeline = Pipeline(stages=stages)

# Fit and transform the data (both on the full DataFrame)
final_df = pipeline.fit(df).transform(df)

# final_df now has a "features" column with all categorical and numerical variables combined


In [18]:
# Standardize the feature vector using StandardScaler
# (centering and unit-variance scaling are both switched on).
# NOTE(review): the scaler is fit on all of final_df, i.e. before the
# train/test split below, so its statistics include rows that later end up
# in test_data. Consider splitting first and fitting on the training part.
scaler1 = StandardScaler(inputCol="features", outputCol="scaled_features", withMean=True, withStd=True)
scaler_model1 = scaler1.fit(final_df)
scaled_data1 = scaler_model1.transform(final_df)

In [19]:
# Rescale the feature vector using MinMaxScaler (default output range [0, 1]).
# This variant feeds the Naive Bayes model further down.
# NOTE(review): like the StandardScaler above, this is fit on all of final_df
# before the data is split, so test rows contribute to the min/max values.
scaler2 = MinMaxScaler(inputCol="features", outputCol="scaled_features")
scaler_model2 = scaler2.fit(final_df)
scaled_data2 = scaler_model2.transform(final_df)


In [21]:
# Split the StandardScaler output into training and testing sets
# (70% training, 30% testing); the fixed seed keeps the split repeatable.
train_data, test_data = scaled_data1.randomSplit([0.7, 0.3], seed=42)

In [22]:
# Split the MinMaxScaler output into training and testing sets
# (70% training, 30% testing), using the same weights and seed as the split above.
trainData, testData = scaled_data2.randomSplit([0.7, 0.3], seed=42)

### Supervised Machine Learning Modeling

MULTINOMIAL LOGISTIC REGRESSION

In [23]:
# Multinomial logistic regression on the standardized features; "cluster" is the label
lr = LogisticRegression(
    featuresCol="scaled_features",
    labelCol="cluster",
    family="multinomial",
    maxIter=10,
)

# Fit on the training split, then score the held-out split
lr_model = lr.fit(train_data)
lr_predictions = lr_model.transform(test_data)

# Accuracy on the test split
lr_evaluator = MulticlassClassificationEvaluator(
    labelCol="cluster",
    predictionCol="prediction",
    metricName="accuracy",
)
lr_accuracy = lr_evaluator.evaluate(lr_predictions)
print(f"Accuracy of Multinomial Logistic Regression Model: {lr_accuracy}")

# F1 on the test split
f1_evaluator = MulticlassClassificationEvaluator(
    labelCol="cluster",
    predictionCol="prediction",
    metricName="f1",
)
f1_score = f1_evaluator.evaluate(lr_predictions)
print(f"F1-Score of Multinomial Logistic Regression Model: {f1_score}")

Accuracy of Multinomial Logistic Regression Model: 0.9728072933039924
F1-Score of Multinomial Logistic Regression Model: 0.9722223615356852


In [45]:

# Per-label metrics taken from the model's training summary: they describe the
# fit on train_data, not the performance on test_data.
lr_trainingSummary = lr_model.summary
lr_data = {
    # Take the labels from the summary instead of assuming 0..n-1: the
    # per-label arrays follow summary.labels, which skips any label that is
    # absent from the training data.
    "Label": [int(label) for label in lr_trainingSummary.labels],
    "False Positive Rate": lr_trainingSummary.falsePositiveRateByLabel,
    "True Positive Rate": lr_trainingSummary.truePositiveRateByLabel,
    "Precision": lr_trainingSummary.precisionByLabel,
    "Recall": lr_trainingSummary.recallByLabel,
    "F-Measure": lr_trainingSummary.fMeasureByLabel(),
}

# Create a DataFrame for the per-label metrics
lr_metrics_df = pd.DataFrame(lr_data)

# Weighted overall metrics as a single extra row
overall_lr_metrics = pd.DataFrame([{
    "Label": "Overall",
    "False Positive Rate": lr_trainingSummary.weightedFalsePositiveRate,
    "True Positive Rate": lr_trainingSummary.weightedTruePositiveRate,
    "Precision": lr_trainingSummary.weightedPrecision,
    "Recall": lr_trainingSummary.weightedRecall,
    "F-Measure": lr_trainingSummary.weightedFMeasure(),
}])

# Combine the two DataFrames using pd.concat()
lr_metrics_df = pd.concat([lr_metrics_df, overall_lr_metrics], ignore_index=True)

In [46]:
# Show the per-label and overall metrics built from lr_model.summary
lr_metrics_df

Unnamed: 0,Label,False Positive Rate,True Positive Rate,Precision,Recall,F-Measure
0,0,0.001559,0.753555,0.932551,0.753555,0.833552
1,1,0.005448,0.986075,0.975032,0.986075,0.980522
2,2,0.020755,0.999815,0.982266,0.999815,0.990963
3,3,0.009493,0.910203,0.950536,0.910203,0.929932
4,4,0.003959,0.970567,0.961701,0.970567,0.966114
5,Overall,0.014064,0.972849,0.972391,0.972849,0.972234


RANDOM FOREST CLASSIFIER

In [24]:
# Initialize the RandomForestClassifier
# A fixed seed is passed so that the forest, and therefore the reported
# accuracy, is the same on every run; without it the numbers drift between
# executions.
rf = RandomForestClassifier(labelCol="cluster", featuresCol="scaled_features", numTrees=100, seed=42)

# Train the model
rf_model = rf.fit(train_data)

# Make predictions on the test set
rf_predictions = rf_model.transform(test_data)

# Evaluate the model (accuracy on the test split; test error is its complement)
rf_evaluator = MulticlassClassificationEvaluator(labelCol="cluster", predictionCol="prediction", metricName="accuracy")
rf_accuracy = rf_evaluator.evaluate(rf_predictions)
print(f"Accuracy of Random Forest Classifier: {rf_accuracy}")
print("Test Error for Random Forest Classifier = %g" % (1.0 - rf_accuracy))
# # Feature importances
# importances = rf_model.featureImportances
# print(f"Feature Importances obtained by Random Forest Classifier: {importances}")

Accuracy of Random Forest Classifier: 0.885020433825841
Test Error for Random Forest Classifier = 0.11498


In [48]:
# Per-label metrics taken from the model's training summary: they describe the
# fit on train_data, not the performance on test_data.
rf_trainingSummary = rf_model.summary
rf_data = {
    # Take the labels from the summary instead of assuming 0..n-1: the
    # per-label arrays follow summary.labels, which skips any label that is
    # absent from the training data.
    "Label": [int(label) for label in rf_trainingSummary.labels],
    "False Positive Rate": rf_trainingSummary.falsePositiveRateByLabel,
    "True Positive Rate": rf_trainingSummary.truePositiveRateByLabel,
    "Precision": rf_trainingSummary.precisionByLabel,
    "Recall": rf_trainingSummary.recallByLabel,
    "F-Measure": rf_trainingSummary.fMeasureByLabel(),
}

# Create a DataFrame for the per-label metrics
rf_metrics_df = pd.DataFrame(rf_data)

# Weighted overall metrics as a single extra row
overall_rf_metrics = pd.DataFrame([{
    "Label": "Overall",
    "False Positive Rate": rf_trainingSummary.weightedFalsePositiveRate,
    "True Positive Rate": rf_trainingSummary.weightedTruePositiveRate,
    "Precision": rf_trainingSummary.weightedPrecision,
    "Recall": rf_trainingSummary.weightedRecall,
    "F-Measure": rf_trainingSummary.weightedFMeasure(),
}])

# Combine the two DataFrames using pd.concat()
rf_metrics_df = pd.concat([rf_metrics_df, overall_rf_metrics], ignore_index=True)

In [49]:
# Show the per-label and overall metrics built from rf_model.summary
rf_metrics_df

Unnamed: 0,Label,False Positive Rate,True Positive Rate,Precision,Recall,F-Measure
0,0,0.0,0.0,0.0,0.0,0.0
1,1,0.009053,0.992573,0.95944,0.992573,0.975725
2,2,0.190905,0.997413,0.857294,0.997413,0.922061
3,3,0.02124,0.608052,0.851575,0.608052,0.709499
4,4,0.002361,0.783333,0.971416,0.783333,0.867295
5,Overall,0.107477,0.883917,0.86123,0.883917,0.865365


DECISION TREE CLASSIFIER

In [25]:
# Decision tree on the standardized features; "cluster" is the label
dt = DecisionTreeClassifier(labelCol='cluster', featuresCol='scaled_features')
dt_model = dt.fit(train_data)

# Score the held-out split
dt_predictions = dt_model.transform(test_data)

# Accuracy on the test split; the test error is its complement
dt_evaluator = MulticlassClassificationEvaluator(
    labelCol="cluster",
    predictionCol="prediction",
    metricName="accuracy",
)
dt_accuracy = dt_evaluator.evaluate(dt_predictions)
dt_test_error = 1.0 - dt_accuracy
print("Test Error = %g " % dt_test_error)
print(f"Accuracy of Decision Tree Classifer: {dt_accuracy}")

Test Error = 0.0944671 
Accuracy of Decision Tree Classifer: 0.9055328513046211


NAIVE BAYES CLASSIFIER

In [26]:
# Multinomial Naive Bayes, which needs non-negative feature values, so it is
# trained on the MinMaxScaler split (trainData / testData)
naive_bayes = NaiveBayes(
    labelCol="cluster",
    featuresCol="scaled_features",
    modelType="multinomial",
)

# Fit on the training split, then score the held-out split
nb_model = naive_bayes.fit(trainData)
nb_predictions = nb_model.transform(testData)

# Accuracy on the test split
nb_evaluator = MulticlassClassificationEvaluator(
    labelCol="cluster",
    predictionCol="prediction",
    metricName="accuracy",
)
nb_accuracy = nb_evaluator.evaluate(nb_predictions)
print(f"Accuracy of Naive Bayes Model: {nb_accuracy}")

Accuracy of Naive Bayes Model: 0.9031751021691292


In [39]:
# Test-set accuracy of each model as (name, accuracy) pairs
model_accuracy = [
    ("Multinomial Logistic Regression", lr_accuracy),
    ("Decision Tree", dt_accuracy),
    ("Naive Bayes", nb_accuracy),
    ("Random Forest", rf_accuracy),
]

# Tabulate the pairs for display
accuracy_df = pd.DataFrame(model_accuracy, columns=["model", "accuracy"])

In [40]:
# Show the model-by-model accuracy table
accuracy_df

Unnamed: 0,model,accuracy
0,Multinomial Logistic Regression,0.972807
1,Decision Tree,0.905533
2,Naive Bayes,0.903175
3,Random Forest,0.887928


In [51]:

# One evaluator per summary metric, all comparing "prediction" against "cluster"
f1_evaluator = MulticlassClassificationEvaluator(
    labelCol="cluster", predictionCol="prediction", metricName="f1"
)
precision_evaluator = MulticlassClassificationEvaluator(
    labelCol="cluster", predictionCol="prediction", metricName="weightedPrecision"
)
recall_evaluator = MulticlassClassificationEvaluator(
    labelCol="cluster", predictionCol="prediction", metricName="weightedRecall"
)

# Test-set predictions in a fixed order: LR, RF, DT, NB
_prediction_frames = (lr_predictions, rf_predictions, dt_predictions, nb_predictions)

# Evaluate every model with each metric, keeping the per-model variable names
lr_f1_score, rf_f1_score, dt_f1_score, nb_f1_score = [
    f1_evaluator.evaluate(frame) for frame in _prediction_frames
]
lr_precision, rf_precision, dt_precision, nb_precision = [
    precision_evaluator.evaluate(frame) for frame in _prediction_frames
]
lr_recall, rf_recall, dt_recall, nb_recall = [
    recall_evaluator.evaluate(frame) for frame in _prediction_frames
]

# One row per model: (name, accuracy, F1, precision, recall)
metrics = [
    ("Multinomial Logistic Regression", lr_accuracy, lr_f1_score, lr_precision, lr_recall),
    ("Decision Tree", dt_accuracy, dt_f1_score, dt_precision, dt_recall),
    ("Naive Bayes", nb_accuracy, nb_f1_score, nb_precision, nb_recall),
    ("Random Forest", rf_accuracy, rf_f1_score, rf_precision, rf_recall),
]

# Tabulate the rows for display
metrics_df = pd.DataFrame(metrics, columns=["Model", "Accuracy", "F1_score", "Precision", "Recall"])

In [52]:
# Show accuracy, F1, precision and recall for every model
metrics_df

Unnamed: 0,Model,Accuracy,F1_score,Precision,Recall
0,Decision Tree,0.905533,0.891291,0.906826,0.905533
1,Multinomial Logistic Regression,0.972807,0.972222,0.972222,0.972807
2,Naive Bayes,0.903175,0.905019,0.910685,0.903175
3,Random Forest,0.887928,0.870472,0.867024,0.887928


In [28]:
# Hyperparameter grid for the logistic regression estimator `lr`
grid_builder = ParamGridBuilder()
grid_builder = grid_builder.addGrid(lr.regParam, [0.01, 0.1, 1.0])         # regularization strength
grid_builder = grid_builder.addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])   # ElasticNet mixing parameter
grid_builder = grid_builder.addGrid(lr.maxIter, [10, 50, 100])             # iteration cap
paramGrid = grid_builder.build()


In [29]:
# Set up 5-fold cross-validation over paramGrid, scored with the accuracy
# evaluator. The seed fixes the fold assignment so that the selected
# hyperparameters are reproducible between runs.
cv = CrossValidator(estimator=lr,
                    estimatorParamMaps=paramGrid,
                    evaluator=lr_evaluator,
                    numFolds=5,  # 5-fold cross-validation
                    seed=42)


In [30]:
# Fit the model using cross-validation
# (paramGrid holds 3 x 3 x 3 = 27 settings and numFolds is 5, so this trains
# many models on train_data and can take a while)
cvModel = cv.fit(train_data)

In [32]:
# Get the best model from cross-validation
bestModel = cvModel.bestModel

# Make predictions on the test data
predictions = bestModel.transform(test_data)

# Evaluate the model (accuracy, same evaluator as the untuned model)
accuracy = lr_evaluator.evaluate(predictions)
print("Test Accuracy = ", accuracy)

# Print the best hyperparameters through the model's public param getters
# rather than reaching into the private _java_obj wrapper
print("Best Parameters:")
print("  MaxIter:", bestModel.getMaxIter())
print("  RegParam:", bestModel.getRegParam())
print("  ElasticNetParam:", bestModel.getElasticNetParam())


Test Accuracy =  0.9652624960704181
Best Parameters:
  MaxIter: 100
  RegParam: 0.01
  ElasticNetParam: 0.0


#### The cross-validated model does not improve accuracy; its test accuracy is slightly lower (0.965 vs. 0.973 for the untuned model)