IMPORTING PACKAGES

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Window
from pyspark.sql import DataFrame
import pandas as pd
from pyspark.sql.functions import (
    min, max, year, month, col, isnan, isnull, when, count, countDistinct, 
    round, desc, sum as sum_, mean, stddev, variance, skewness, kurtosis, 
    explode, split, regexp_replace, to_timestamp, to_date, lit, datediff, current_date
)
from pyspark.sql.types import StringType
import matplotlib.pyplot as plt
from functools import reduce
from pyspark.sql import functions as F
from pyspark.ml import Pipeline
from pyspark.sql.window import Window
from pyspark.sql.functions import create_map, lit
from itertools import chain
from pyspark.sql.functions import avg as F_avg
from pyspark.sql.functions import sum as F_sum
from pyspark.sql.functions import count as F_count
from pyspark.sql.functions import col, round as F_round
from pyspark.sql.functions import log1p 
import seaborn as sns
from pyspark.ml.stat import Correlation
import numpy as np
import pylab as pl
from pyspark.ml.linalg import Vectors
import numpy as np
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler, MinMaxScaler
from pyspark.ml.classification import LogisticRegression, OneVsRest
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import LinearSVC
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import FMClassifier



import warnings

# Install a blanket "ignore" filter: every Python warning raised from here on
# is suppressed for the rest of the session.
# NOTE(review): this also hides warnings that could matter while debugging;
# consider filtering by category instead.
warnings.filterwarnings("ignore")

In [2]:
# Switch the kernel's working directory to the local output folder so that the
# relative paths used below (e.g. "gmm_k5.csv") resolve against it.
# NOTE(review): this is an absolute, machine-specific Windows path (not a
# Colab / Google Drive mount) — adjust it before running on another machine.
%cd C:\Users\june3\OneDrive\Desktop\my_output_result\

C:\Users\june3\OneDrive\Desktop\my_output_result


CREATE SPARK SESSION

In [3]:
# Step 1: Obtain the SparkSession for this analysis
# (getOrCreate returns the already-running session when there is one).
spark = (
    SparkSession.builder
    .appName("SDWA DATA ANALYSIS 2")
    .getOrCreate()
)

In [4]:
# Step 2: Load the single input file (gmm_k5.csv) into one Spark DataFrame.
# The first row supplies the column names and Spark infers each column's type.
df = spark.read.csv(
    "gmm_k5.csv",
    header=True,
    inferSchema=True,
)

In [37]:
# Display name for each numeric cluster id.
cluster_mapping = {
    0: 'High-Risk',
    1: 'Low-Risk',
    2: 'Moderate-Risk',
    3: 'Near-Compliant',
    4: 'Compliant',
}

# create_map takes a flat [key, value, key, value, ...] sequence of literal
# columns, so the (id, name) pairs are flattened in dictionary order.
mapping_expr = F.create_map(
    [F.lit(entry) for entry in chain.from_iterable(cluster_mapping.items())]
)

# Look each row's cluster id up in the map to produce the 'label' column.
df = df.withColumn('label', mapping_expr[df['cluster']])

In [38]:
# Export the labelled clusters through a separately named pandas copy.
# `df` must stay a Spark DataFrame: the cells that follow call
# df.printSchema() and df.select(), which a pandas DataFrame does not provide,
# so rebinding `df` to the pandas result breaks Restart & Run All.
cluster_label_pdf = df.toPandas()

# index=False keeps the pandas row index out of the CSV file.
cluster_label_pdf.to_csv('C:/Users/june3/OneDrive/Desktop/my_output_result/cluster_label.csv', index=False)

In [6]:
# Show the column names and the types Spark inferred for the loaded data.
df.printSchema()

root
 |-- PWSID: string (nullable = true)
 |-- PWS_ACTIVITY_CODE: string (nullable = true)
 |-- IS_SCHOOL_OR_DAYCARE_IND: string (nullable = true)
 |-- SOURCE_WATER_PROTECTION_CODE: string (nullable = true)
 |-- OUTSTANDING_PERFORMER: string (nullable = true)
 |-- MANAGEMENT_OPS_EVAL_CODE: string (nullable = true)
 |-- SOURCE_WATER_EVAL_CODE: string (nullable = true)
 |-- SECURITY_EVAL_CODE: string (nullable = true)
 |-- PUMPS_EVAL_CODE: string (nullable = true)
 |-- OTHER_EVAL_CODE: string (nullable = true)
 |-- COMPLIANCE_EVAL_CODE: string (nullable = true)
 |-- DATA_VERIFICATION_EVAL_CODE: string (nullable = true)
 |-- TREATMENT_EVAL_CODE: string (nullable = true)
 |-- FINISHED_WATER_STOR_EVAL_CODE: string (nullable = true)
 |-- DISTRIBUTION_EVAL_CODE: string (nullable = true)
 |-- FINANCIAL_EVAL_CODE: string (nullable = true)
 |-- VIOLATION_CATEGORY_CODE: string (nullable = true)
 |-- IS_HEALTH_BASED_IND: string (nullable = true)
 |-- IS_MAJOR_VIOL_IND: string (nullable = true)
 |-

In [7]:
# Narrow the frame to PWSID, the categorical and log-scaled numeric columns
# listed in cat_vars / num_vars below, and the cluster assignment.
df = df.select([
    "PWSID",
    "PWS_ACTIVITY_CODE",
    "IS_SCHOOL_OR_DAYCARE_IND",
    "SOURCE_WATER_PROTECTION_CODE",
    "OUTSTANDING_PERFORMER",
    "MANAGEMENT_OPS_EVAL_CODE",
    "SOURCE_WATER_EVAL_CODE",
    "SECURITY_EVAL_CODE",
    "PUMPS_EVAL_CODE",
    "OTHER_EVAL_CODE",
    "COMPLIANCE_EVAL_CODE",
    "DATA_VERIFICATION_EVAL_CODE",
    "TREATMENT_EVAL_CODE",
    "FINISHED_WATER_STOR_EVAL_CODE",
    "DISTRIBUTION_EVAL_CODE",
    "FINANCIAL_EVAL_CODE",
    "VIOLATION_CATEGORY_CODE",
    "IS_HEALTH_BASED_IND",
    "IS_MAJOR_VIOL_IND",
    "VIOLATION_STATUS",
    "ENF_ACTION_CATEGORY",
    "COMPLIANCE_STATUS",
    "TOTAL_POPULATION_SERVED_COUNT_LOG",
    "TOTAL_SERVICE_CONNECTIONS_COUNT_LOG",
    "VIOL_MEASURE_LOG",
    "TOTAL_VIOLATIONS_LOG",
    "AVG_VIOLATION_DURATION_DAYS_LOG",
    "OPEN_VIOLATIONS_COUNT_LOG",
    "TOTAL_LATE_COMPLIANT_ACTIONS_LOG",
    "cluster",
])

In [8]:
# Confirm the schema after the column selection above.
df.printSchema()

root
 |-- PWSID: string (nullable = true)
 |-- PWS_ACTIVITY_CODE: string (nullable = true)
 |-- IS_SCHOOL_OR_DAYCARE_IND: string (nullable = true)
 |-- SOURCE_WATER_PROTECTION_CODE: string (nullable = true)
 |-- OUTSTANDING_PERFORMER: string (nullable = true)
 |-- MANAGEMENT_OPS_EVAL_CODE: string (nullable = true)
 |-- SOURCE_WATER_EVAL_CODE: string (nullable = true)
 |-- SECURITY_EVAL_CODE: string (nullable = true)
 |-- PUMPS_EVAL_CODE: string (nullable = true)
 |-- OTHER_EVAL_CODE: string (nullable = true)
 |-- COMPLIANCE_EVAL_CODE: string (nullable = true)
 |-- DATA_VERIFICATION_EVAL_CODE: string (nullable = true)
 |-- TREATMENT_EVAL_CODE: string (nullable = true)
 |-- FINISHED_WATER_STOR_EVAL_CODE: string (nullable = true)
 |-- DISTRIBUTION_EVAL_CODE: string (nullable = true)
 |-- FINANCIAL_EVAL_CODE: string (nullable = true)
 |-- VIOLATION_CATEGORY_CODE: string (nullable = true)
 |-- IS_HEALTH_BASED_IND: string (nullable = true)
 |-- IS_MAJOR_VIOL_IND: string (nullable = true)
 |-

In [9]:
def get_missing_values(df, dataframe_name):
    """Print a markdown table with the number of missing values per column.

    A value counts as missing when it is null. For float/double columns NaN
    is counted as missing as well.

    Args:
        df: Spark DataFrame to inspect.
        dataframe_name: Label shown in the printed heading.

    Returns:
        None. The table is written to stdout.
    """
    # isnan() is only defined for floating-point input. Applying it to every
    # column forces an implicit cast that fails analysis for date/timestamp/
    # boolean columns (and raises under ANSI mode for non-numeric strings),
    # so the NaN test is limited to float/double columns.
    nan_capable_types = ("float", "double")

    missing_exprs = []
    for name, dtype in df.dtypes:
        column = F.col(name)
        is_missing = F.isnull(column)
        if dtype in nan_capable_types:
            is_missing = is_missing | F.isnan(column)
        # when() yields null for non-matching rows and count() skips nulls,
        # so this counts exactly the rows flagged as missing.
        missing_exprs.append(F.count(F.when(is_missing, F.lit(1))).alias(name))

    missing_df = df.select(missing_exprs)

    # One-row Spark result -> two-column pandas frame (column name, count)
    missing = missing_df.toPandas().transpose().reset_index()
    missing.columns = ['Column', 'Missing_Count']

    # Display the missing values
    print(f"--- Missing Values in {dataframe_name} ---")
    print(missing.to_markdown(index=False))
    print("\n")

In [10]:
def get_summary_statistics(df, dataframe_name):
    """Print the describe() statistics of ``df`` as a markdown table.

    The describe() result is converted to pandas, indexed by its 'summary'
    column and transposed before printing; the index is titled 'Column'.

    Args:
        df: Spark DataFrame to summarise.
        dataframe_name: Label shown in the printed heading.

    Returns:
        None. The table is written to stdout.
    """
    described = df.describe()
    indexed_by_statistic = described.toPandas().set_index('summary')

    summary = indexed_by_statistic.transpose()
    summary.index.name = 'Column'

    heading = f"--- Summary Statistics for {dataframe_name} ---"
    print(heading)
    print(summary.to_markdown())
    print("\n")

DATAFRAME

In [11]:
# Report the size of the selected dataset.
row_count = df.count()
column_count = len(df.columns)
print(f" Dataframe 1: rows: {row_count}, columns: {column_count} ")

 Dataframe 1: rows: 43073, columns: 30 


In [12]:
# Categorical columns, one name per line; the order here is the order in which
# they are indexed, one-hot encoded and assembled by the pipeline cell below.
cat_vars = """
    PWS_ACTIVITY_CODE
    IS_SCHOOL_OR_DAYCARE_IND
    SOURCE_WATER_PROTECTION_CODE
    OUTSTANDING_PERFORMER
    MANAGEMENT_OPS_EVAL_CODE
    SOURCE_WATER_EVAL_CODE
    SECURITY_EVAL_CODE
    PUMPS_EVAL_CODE
    OTHER_EVAL_CODE
    COMPLIANCE_EVAL_CODE
    DATA_VERIFICATION_EVAL_CODE
    TREATMENT_EVAL_CODE
    FINISHED_WATER_STOR_EVAL_CODE
    DISTRIBUTION_EVAL_CODE
    FINANCIAL_EVAL_CODE
    VIOLATION_CATEGORY_CODE
    IS_HEALTH_BASED_IND
    IS_MAJOR_VIOL_IND
    VIOLATION_STATUS
    ENF_ACTION_CATEGORY
    COMPLIANCE_STATUS
""".split()

In [13]:
# Numeric (*_LOG) columns, one name per line; they are appended after the
# encoded categorical columns when the feature vector is assembled below.
num_vars = """
    TOTAL_POPULATION_SERVED_COUNT_LOG
    TOTAL_SERVICE_CONNECTIONS_COUNT_LOG
    VIOL_MEASURE_LOG
    TOTAL_VIOLATIONS_LOG
    AVG_VIOLATION_DURATION_DAYS_LOG
    OPEN_VIOLATIONS_COUNT_LOG
    TOTAL_LATE_COMPLIANT_ACTIONS_LOG
""".split()

In [14]:
# Pipeline stages, appended in execution order
stages = []

# Step 1: index every categorical column, then one-hot encode the index.
# The loop variable is deliberately NOT called `col`: a top-level
# `for col in ...` rebinds the `col` function imported from
# pyspark.sql.functions to a plain string for every cell run afterwards.
for cat_col in cat_vars:
    # StringIndexer for categorical variable
    indexer = StringIndexer(inputCol=cat_col, outputCol=f"{cat_col}_indexed")
    stages.append(indexer)

    # OneHotEncoder for indexed variable
    encoder = OneHotEncoder(inputCol=f"{cat_col}_indexed", outputCol=f"{cat_col}_encoded")
    stages.append(encoder)

# Step 2: combine the encoded categorical columns (in cat_vars order) followed
# by the numerical columns into a single "features" vector.
assembled_input_cols = [f"{cat_col}_encoded" for cat_col in cat_vars] + num_vars

assembler = VectorAssembler(inputCols=assembled_input_cols, outputCol="features")
stages.append(assembler)

# Step 3: Create the pipeline
pipeline = Pipeline(stages=stages)

# Fit the stages on df and apply them to the same data; final_df gains a
# "features" column holding all categorical and numerical variables combined.
final_df = pipeline.fit(df).transform(df)


In [15]:
# Standardize "features": centre each dimension (withMean) and scale it to
# unit standard deviation (withStd), writing the result to "scaled_features".
# NOTE(review): the scaler is fitted on all rows before the train/test split
# performed later, so test rows influence the scaling statistics — confirm
# this is intended.
scaler1 = StandardScaler(
    inputCol="features",
    outputCol="scaled_features",
    withMean=True,
    withStd=True,
)
scaler_model1 = scaler1.fit(final_df)
scaled_data1 = scaler_model1.transform(final_df)

In [16]:
# Alternative scaling of the same "features" column with MinMaxScaler (default
# output range). It also writes to "scaled_features", but in a separate
# DataFrame (scaled_data2), so it does not clash with scaled_data1.
scaler2 = MinMaxScaler(
    inputCol="features",
    outputCol="scaled_features",
)
scaler_model2 = scaler2.fit(final_df)
scaled_data2 = scaler_model2.transform(final_df)


In [17]:
# # Extract 'features' column from PySpark DataFrame and convert it to Pandas
# pandas_df = scaled_data.select('scaled_features').toPandas()

# # Convert the PySpark vector to a NumPy array that can be used in sklearn
# X = np.array(pandas_df['scaled_features'].tolist())

In [18]:
# Split the standardized data (scaled_data1) into training and testing sets
# (70% training, 30% testing). The split is seeded (seed=42).
train_data, test_data = scaled_data1.randomSplit([0.7, 0.3], seed=42)

In [19]:
# Same seeded 70/30 split, applied to the min-max scaled data (scaled_data2).
# These camelCase variables are the ones the Naive Bayes cell trains and
# evaluates on; train_data / test_data hold the standardized split.
trainData, testData = scaled_data2.randomSplit([0.7, 0.3], seed=42)

MULTINOMIAL LOGISTIC REGRESSION

In [20]:
# Multinomial logistic regression on the standardized features,
# predicting the "cluster" column; limited to 10 optimizer iterations.
lr = LogisticRegression(
    featuresCol="scaled_features",
    labelCol="cluster",
    family="multinomial",
    maxIter=10,
)

# Fit on the training split, then score the held-out split.
lr_model = lr.fit(train_data)
lr_predictions = lr_model.transform(test_data)

# Two evaluators over the same predictions: plain accuracy and F1.
lr_evaluator = MulticlassClassificationEvaluator(
    labelCol="cluster",
    predictionCol="prediction",
    metricName="accuracy",
)
f1_evaluator = MulticlassClassificationEvaluator(
    labelCol="cluster",
    predictionCol="prediction",
    metricName="f1",
)

lr_accuracy = lr_evaluator.evaluate(lr_predictions)
print(f"Accuracy of Multinomial Logistic Regression Model: {lr_accuracy}")

f1_score = f1_evaluator.evaluate(lr_predictions)
print(f"F1-Score of Multinomial Logistic Regression Model: {f1_score}")

Accuracy of Multinomial Logistic Regression Model: 0.9728072933039924
F1-Score of Multinomial Logistic Regression Model: 0.9722223615356852


In [33]:
# Summary object attached to the fitted logistic regression model.
trainingSummary = lr_model.summary

# Per-label metrics, materialised as plain Python lists (one entry per label).
fpr_by_label = list(trainingSummary.falsePositiveRateByLabel)
tpr_by_label = list(trainingSummary.truePositiveRateByLabel)
precision_by_label = list(trainingSummary.precisionByLabel)
recall_by_label = list(trainingSummary.recallByLabel)
f_measure_by_label = list(trainingSummary.fMeasureByLabel())

# Per-label lists first, then the overall / weighted scalars.
# Key order determines the column order of metrics_df.
metrics_dict = {
    "false_positive_rate_by_label": fpr_by_label,
    "true_positive_rate_by_label": tpr_by_label,
    "precision_by_label": precision_by_label,
    "recall_by_label": recall_by_label,
    "f_measure_by_label": f_measure_by_label,
    "accuracy": trainingSummary.accuracy,
    "weighted_false_positive_rate": trainingSummary.weightedFalsePositiveRate,
    "weighted_true_positive_rate": trainingSummary.weightedTruePositiveRate,
    "weighted_f_measure": trainingSummary.weightedFMeasure(),
    "weighted_precision": trainingSummary.weightedPrecision,
    "weighted_recall": trainingSummary.weightedRecall,
}

# Single-row pandas DataFrame: one column per metric.
metrics_df = pd.DataFrame([metrics_dict])

In [34]:
# Display the one-row metrics table built in the previous cell.
metrics_df

Unnamed: 0,false_positive_rate_by_label,true_positive_rate_by_label,precision_by_label,recall_by_label,f_measure_by_label,accuracy,weighted_false_positive_rate,weighted_true_positive_rate,weighted_f_measure,weighted_precision,weighted_recall
0,"[0.001559057786815794, 0.005448063133437487, 0...","[0.7535545023696683, 0.9860750092833271, 0.999...","[0.9325513196480938, 0.9750321277767579, 0.982...","[0.7535545023696683, 0.9860750092833271, 0.999...","[0.8335517693315858, 0.9805224776146958, 0.990...",0.972849,0.014064,0.972849,0.972234,0.972391,0.972849


In [35]:
# Objective value recorded by the optimizer, one entry per line.
objectiveHistory = trainingSummary.objectiveHistory
print("objectiveHistory:")
for objective in objectiveHistory:
    print(objective)

# For multiclass models the summary exposes each metric per label; every
# section prints its heading followed by one "label <index>: <value>" line.
per_label_sections = [
    ("False positive rate by label:", trainingSummary.falsePositiveRateByLabel),
    ("True positive rate by label:", trainingSummary.truePositiveRateByLabel),
    ("Precision by label:", trainingSummary.precisionByLabel),
    ("Recall by label:", trainingSummary.recallByLabel),
    ("F-measure by label:", trainingSummary.fMeasureByLabel()),
]
for heading, values in per_label_sections:
    print(heading)
    for label_index, value in enumerate(values):
        print("label %d: %s" % (label_index, value))

# Overall and label-weighted metrics.
accuracy = trainingSummary.accuracy
falsePositiveRate = trainingSummary.weightedFalsePositiveRate
truePositiveRate = trainingSummary.weightedTruePositiveRate
fMeasure = trainingSummary.weightedFMeasure()
precision = trainingSummary.weightedPrecision
recall = trainingSummary.weightedRecall

overall_report = "Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s" % (
    accuracy, falsePositiveRate, truePositiveRate, fMeasure, precision, recall
)
print(overall_report)

objectiveHistory:
1.2607909003227766
0.5001694740841492
0.4514971615696827
0.31319607814292844
0.25267344481613324
0.20535604539735605
0.18063481920463284
0.1594381227542598
0.1343653077898365
0.10887408972637413
0.09793206662087564
False positive rate by label:
label 0: 0.001559057786815794
label 1: 0.005448063133437487
label 2: 0.020755117942905715
label 3: 0.009492919863934815
label 4: 0.003959460932107959
True positive rate by label:
label 0: 0.7535545023696683
label 1: 0.9860750092833271
label 2: 0.9998151798915722
label 3: 0.9102032761002565
label 4: 0.9705673758865249
Precision by label:
label 0: 0.9325513196480938
label 1: 0.9750321277767579
label 2: 0.9822660694831135
label 3: 0.9505358615004122
label 4: 0.9617006324666199
Recall by label:
label 0: 0.7535545023696683
label 1: 0.9860750092833271
label 2: 0.9998151798915722
label 3: 0.9102032761002565
label 4: 0.9705673758865249
F-measure by label:
label 0: 0.8335517693315858
label 1: 0.9805224776146958
label 2: 0.99096293582463

RANDOM FOREST CLASSIFIER

In [24]:
# Random forest with 100 trees on the standardized features.
rf = RandomForestClassifier(
    labelCol="cluster",
    featuresCol="scaled_features",
    numTrees=100,
)

# Fit on the training split and score the held-out split.
rf_model = rf.fit(train_data)
rf_predictions = rf_model.transform(test_data)

# Accuracy on the test split, and the complementary error rate.
rf_evaluator = MulticlassClassificationEvaluator(
    labelCol="cluster",
    predictionCol="prediction",
    metricName="accuracy",
)
rf_accuracy = rf_evaluator.evaluate(rf_predictions)
rf_test_error = 1.0 - rf_accuracy
print(f"Accuracy of Random Forest Classifier: {rf_accuracy}")
print("Test Error for Random Forest Classifier = %g" % rf_test_error)
# # Feature importances
# importances = rf_model.featureImportances
# print(f"Feature Importances obtained by Random Forest Classifier: {importances}")

Accuracy of Random Forest Classifier: 0.8847846589122917
Test Error for Random Forest Classifier = 0.115215


In [25]:
# trainingSummary = rf_model.summary

# # for multiclass, we can inspect metrics on a per-label basis
# print("False positive rate by label:")
# for i, rate in enumerate(trainingSummary.falsePositiveRateByLabel):
#     print("label %d: %s" % (i, rate))

# print("True positive rate by label:")
# for i, rate in enumerate(trainingSummary.truePositiveRateByLabel):
#     print("label %d: %s" % (i, rate))

# print("Precision by label:")
# for i, prec in enumerate(trainingSummary.precisionByLabel):
#     print("label %d: %s" % (i, prec))

# print("Recall by label:")
# for i, rec in enumerate(trainingSummary.recallByLabel):
#     print("label %d: %s" % (i, rec))

# print("F-measure by label:")
# for i, f in enumerate(trainingSummary.fMeasureByLabel()):
#     print("label %d: %s" % (i, f))

# accuracy = trainingSummary.accuracy
# falsePositiveRate = trainingSummary.weightedFalsePositiveRate
# truePositiveRate = trainingSummary.weightedTruePositiveRate
# fMeasure = trainingSummary.weightedFMeasure()
# precision = trainingSummary.weightedPrecision
# recall = trainingSummary.weightedRecall
# print("Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
#       % (accuracy, falsePositiveRate, truePositiveRate, fMeasure, precision, recall))

DECISION TREE CLASSIFIER

In [26]:
# Single decision tree (default hyperparameters) on the standardized features.
dt = DecisionTreeClassifier(
    featuresCol='scaled_features',
    labelCol='cluster',
)
dt_model = dt.fit(train_data)

# Score the held-out split.
dt_predictions = dt_model.transform(test_data)

# Accuracy on the test split; the error rate is its complement.
dt_evaluator = MulticlassClassificationEvaluator(
    labelCol="cluster",
    predictionCol="prediction",
    metricName="accuracy",
)
dt_accuracy = dt_evaluator.evaluate(dt_predictions)
dt_test_error = 1.0 - dt_accuracy
print("Test Error = %g " % dt_test_error)
print(f"Accuracy of Decision Tree Classifer: {dt_accuracy}")

Test Error = 0.0944671 
Accuracy of Decision Tree Classifer: 0.9055328513046211


NAIVE BAYES CLASSIFIER

In [27]:
# Multinomial Naive Bayes. This model is trained and evaluated on the
# min-max scaled split (trainData / testData), not the standardized one —
# presumably because multinomial NB requires non-negative feature values.
naive_bayes = NaiveBayes(
    featuresCol="scaled_features",
    labelCol="cluster",
    modelType="multinomial",
)
nb_model = naive_bayes.fit(trainData)

# Score the held-out min-max scaled split.
nb_predictions = nb_model.transform(testData)

# Accuracy on the test split.
nb_evaluator = MulticlassClassificationEvaluator(
    labelCol="cluster",
    predictionCol="prediction",
    metricName="accuracy",
)
nb_accuracy = nb_evaluator.evaluate(nb_predictions)
print(f"Accuracy of Naive Bayes Model: {nb_accuracy}")

Accuracy of Naive Bayes Model: 0.9031751021691292


In [28]:
from pyspark.ml.tuning import ParamGridBuilder

# Hyperparameter grid for the logistic regression estimator `lr`:
# 3 x 3 x 3 = 27 parameter combinations.
grid_builder = ParamGridBuilder()
grid_builder.addGrid(lr.regParam, [0.01, 0.1, 1.0])         # regularization strength
grid_builder.addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])   # elastic-net mixing (0 = L2, 1 = L1)
grid_builder.addGrid(lr.maxIter, [10, 50, 100])             # iteration cap
paramGrid = grid_builder.build()


In [29]:
from pyspark.ml.tuning import CrossValidator

# 5-fold cross-validation of `lr` over every combination in paramGrid,
# scored with lr_evaluator (accuracy).
cv = CrossValidator(
    estimator=lr,
    estimatorParamMaps=paramGrid,
    evaluator=lr_evaluator,
    numFolds=5,
)


In [None]:
# from pyspark.ml.tuning import TrainValidationSplit

# # Set up train-validation split
# tvs = TrainValidationSplit(estimator=lr,
#                            estimatorParamMaps=paramGrid,
#                            evaluator=lr_evaluator,
#                            trainRatio=0.8)  # 80% for training, 20% for validation


In [30]:
# Fit the model using cross-validation on the standardized training split.
# With the 27-combination grid and 5 folds configured above, this trains the
# estimator many times, so expect this cell to be slow.
cvModel = cv.fit(train_data)

In [None]:
# # Or using train-validation split
# tvsModel = tvs.fit(train_data)

In [32]:
# Get the best model from cross-validation
bestModel = cvModel.bestModel

# Score the held-out split with the winning model
predictions = bestModel.transform(test_data)

# Accuracy on the test split (same evaluator that drove the search)
accuracy = lr_evaluator.evaluate(predictions)
print("Test Accuracy = ", accuracy)

# Report the winning hyperparameters through the model's public param getters
# instead of reaching into the private `_java_obj` handle.
print("Best Parameters:")
print("  MaxIter:", bestModel.getMaxIter())
print("  RegParam:", bestModel.getRegParam())
print("  ElasticNetParam:", bestModel.getElasticNetParam())


Test Accuracy =  0.9652624960704181
Best Parameters:
  MaxIter: 100
  RegParam: 0.01
  ElasticNetParam: 0.0


In [None]:
# Persist the tuned model. Plain save() raises if the target path already
# exists, which makes the notebook fail on a second run; going through the
# writer with overwrite() replaces any previous copy instead.
# NOTE(review): "path_to_save_model" looks like a placeholder — set a real path.
bestModel.write().overwrite().save("path_to_save_model")

In [None]:
# # Define KNN classifier
# knn = KNNClassifier(k=5, featuresCol="scaledFeatures", labelCol="label", predictionCol="prediction")
# knn_model = knn.fit(train_data)

# # Predict on test data
# predictions = knn_model.transform(test_data)
# predictions.select("label", "prediction").show()

# evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
# accuracy = evaluator.evaluate(predictions)
# print(f"Accuracy: {accuracy}")

LINEAR SUPPORT VECTOR MACHINE CLASSIFIER

In [30]:
# # Initialize Linear SVC
# svm = LinearSVC(featuresCol="scaled_features", labelCol="cluster", maxIter=10, regParam=0.1)

# # Train the model
# svm_model = svm.fit(train_data)

# # Predict on test data
# svm_predictions = svm_model.transform(test_data)
# svm_predictions.select("cluster", "prediction").show(5)

# # Initialize evaluator
# svm_evaluator = MulticlassClassificationEvaluator(labelCol="cluster", predictionCol="prediction", metricName="accuracy")

# # Calculate accuracy
# svm_accuracy = svm_evaluator.evaluate(predictions)

# print("Test Error = %g " % (1.0 - svm_accuracy))
# print(f"Accuracy of Support Vector Machine Model: {svm_accuracy}")

GRADIENT-BOOSTED TREE CLASSIFIER

In [32]:
# # Initialize Gradient Boosting Classifier
# gbt = GBTClassifier(featuresCol="scaled_features", labelCol="cluster", maxIter=10)

# # Train the model
# gbt_model = gbt.fit(train_data)

# # Make predictions on test data
# gbt_predictions = gbt_model.transform(test_data)

# # Evaluate the model using accuracy
# gbt_evaluator = MulticlassClassificationEvaluator(labelCol="cluster", predictionCol="prediction", metricName="accuracy")
# gbt_accuracy = gbt_evaluator.evaluate(gbt_predictions)
# print(f"Accuracy of Gradient Boosting Model: {gbt_accuracy}")

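Note: Spark ML's `GBTClassifier` supports binary labels only, so it cannot be trained directly on the five-valued `cluster` label used here. Gradient-boosted trees also require sufficient samples for each class; if the dataset is heavily imbalanced, the model may fail.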

FACTORIZATION MACHINES CLASSIFIER

In [34]:
# fm = FMClassifier(labelCol="cluster", featuresCol="scaled_features", stepSize=0.001)

# # Train model.
# fm_model = fm.fit(train_data)

# # Make predictions.
# fm_predictions =fm_model.transform(test_data)


# # Select (prediction, true label) and compute test accuracy
# fm_evaluator = MulticlassClassificationEvaluator(labelCol="cluster", predictionCol="prediction", metricName="accuracy")
# fm_accuracy = fm_evaluator.evaluate(fm_predictions)
# print("Test set accuracy = %g" % fm_accuracy)

# # fmModel = model.stages[2]
# # print("Factors: " + str(fmModel.factors))  # type: ignore
# # print("Linear: " + str(fmModel.linear))  # type: ignore
# # print("Intercept: " + str(fmModel.intercept))  # type: ignore

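If the dataset has highly imbalanced labels (e.g., 90% of the data belongs to one class), this may affect the model-fitting process. Note also that Spark ML's `FMClassifier` supports binary classification only, so it cannot be fitted directly on the five-valued `cluster` label.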

ONE-VS-REST CLASSIFIER

In [43]:
# # Step 3: Instantiate Logistic Regression model
# lr = LogisticRegression(featuresCol="scaled_features", labelCol="cluster", maxIter=10, tol=1E-6, fitIntercept=True)

# # Step 4: Instantiate the OneVsRest classifier
# ovr = OneVsRest(classifier=lr, labelCol="cluster", featuresCol="scaled_features")

# ovr_model = ovr.fit(train_data)

# # score the model on test data.
# ovr_predictions = ovr_model.transform(test_data)

# # Select (prediction, true label) and compute test accuracy
# ovr_evaluator = MulticlassClassificationEvaluator(labelCol="cluster", predictionCol="prediction", metricName="accuracy")
# ovr_accuracy = ovr_evaluator.evaluate(ovr_predictions)
# print("Test set accuracy = %g" % ovr_accuracy)