Question: Can we use the predicted turbidity levels to determine the optimal chemical dosage (chlorine and alum) required for efficient treatment of water at the plant?

# 1. XGBoost

In [8]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


### Predicting Chlorine

In [27]:
from pyspark.sql.functions import col
from pyspark.sql.types import DoubleType
from xgboost.spark import SparkXGBRegressor
from pyspark.sql import SparkSession

# Reuse the active Spark session if there is one, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

# Load the plant measurements; inferSchema lets Spark type the numeric columns.
sparkDF = spark.read.csv("/content/data.csv", header=True, inferSchema=True)

# Pick the feature/label pair by name rather than by position, so a change in
# the CSV column order cannot silently select the wrong columns. Rows with a
# missing value in either column are dropped.
sparkDF = sparkDF.select("turbidity", "chlorine").dropna()

# Discard rows where turbidity or chlorine is zero.
sparkDF = sparkDF.filter((col("turbidity") != 0) & (col("chlorine") != 0))

# Split the data into training and test sets (80% training, 20% test).
# The fixed seed keeps the split repeatable for the same input file, so the
# RMSE values reported below can be reproduced.
(train_cl, test_cl) = sparkDF.randomSplit([0.8, 0.2], seed=42)

sparkDF.show()
sparkDF.dtypes

+---------+------------------+
|turbidity|          chlorine|
+---------+------------------+
|    0.401|2.8764551519644184|
|    0.374| 4.180292307692309|
|    0.361|3.1440257723955907|
|    0.351| 2.930638991845812|
|    0.339| 2.828991513437058|
|    0.374|3.6329153225806454|
|    0.373| 2.846970024721879|
|    0.369|3.8199298752191404|
|    0.334| 2.640179351921628|
|    0.336|2.9609897974722093|
|    0.368|3.8589270799871502|
|    0.321|2.4871304347826086|
|    0.307| 2.693291314373559|
|    0.306|2.7784936234058515|
|    0.289| 2.519682684973303|
|    0.279|2.6954764397905757|
|    0.254|2.7801233328552994|
|    0.347| 2.697494011976048|
|    0.346| 2.768611695086818|
|    0.371| 2.818426525998493|
+---------+------------------+
only showing top 20 rows



[('turbidity', 'double'), ('chlorine', 'double')]

In [28]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator


# Pack the single predictor into the vector column the regressor reads from.
assembler = VectorAssembler(inputCols=['turbidity'], outputCol='features')
assembledTrainingData = assembler.transform(train_cl)
assembledTestingData = assembler.transform(test_cl)

# XGBoost regressor for the chlorine dose, configured with num_workers=2.
regressor = SparkXGBRegressor(features_col="features", label_col="chlorine", num_workers=2)

# Fit on the training split, then score the held-out split.
model = regressor.fit(assembledTrainingData)
predict_df = model.transform(assembledTestingData)

# RMSE between the predicted and the recorded chlorine dose on the test split.
evaluatorChlorine = RegressionEvaluator(
    metricName="rmse",
    labelCol="chlorine",
    predictionCol="prediction",
)
rmseChlorine = evaluatorChlorine.evaluate(predict_df)
print("RMSE for Chemical Dosages_Chlorine: ", rmseChlorine)




RMSE for Chemical Dosages_Chlorine:  0.33892043805824856


### Predicting Alum

In [29]:
# Reload the raw file and keep only the feature/label pair for the alum model.
# Rows with a missing value in either column are dropped.
sparkDF = spark.read.csv("/content/data.csv", header=True, inferSchema=True)
sparkDF = sparkDF.select("turbidity", "alum").dropna()

# Discard rows where turbidity or alum is zero.
sparkDF = sparkDF.filter((col("turbidity") != 0) & (col("alum") != 0))

# Split the data into training and test sets (80% training, 20% test).
# The fixed seed keeps the split repeatable for the same input file.
(train_al, test_al) = sparkDF.randomSplit([0.8, 0.2], seed=42)

In [26]:
# Pack the single predictor into the vector column the regressor reads from.
assembler = VectorAssembler(inputCols=['turbidity'], outputCol='features')
assembledTrainingData = assembler.transform(train_al)
assembledTestingData = assembler.transform(test_al)

# XGBoost regressor for the alum dose, configured with num_workers=2.
regressor = SparkXGBRegressor(features_col="features", label_col="alum", num_workers=2)

# Fit on the training split, then score the held-out split.
model = regressor.fit(assembledTrainingData)
predict_df = model.transform(assembledTestingData)

# RMSE between the predicted and the recorded alum dose on the test split.
evaluatorAlum = RegressionEvaluator(
    metricName="rmse",
    labelCol="alum",
    predictionCol="prediction",
)
rmseAlum = evaluatorAlum.evaluate(predict_df)
print("RMSE for Chemical Dosages_Alum: ", rmseAlum)




RMSE for Chemical Dosages_Alum:  2.53850537993534


### Predicting Chlorine and Alum with XGBoost + k-fold

In [34]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import RandomForestRegressor

from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# NOTE(review): the app name still says "RandomForestRegressor" although this
# cell trains XGBoost models - presumably copied from the random-forest section.
spark = SparkSession.builder.appName("RandomForestRegressor").getOrCreate()

# Define the feature and target columns
featuresCol = "features"
targetCol_Chlorine = "chlorine"
targetCol_Alum = "alum"

# VectorAssembler that packs the turbidity column into the features vector
assembler = VectorAssembler(inputCols=['turbidity'], outputCol=featuresCol)


def cross_validate_xgboost(train_df, test_df, label_col, max_depths=(3, 6, 9), num_folds=3, seed=42):
    """Tune an XGBoost regressor with k-fold cross-validation and score it on a test split.

    Args:
        train_df: DataFrame with a ``turbidity`` column and ``label_col``; the
            cross-validation folds are drawn from it.
        test_df: held-out DataFrame with the same columns.
        label_col: name of the dosage column to predict.
        max_depths: candidate ``max_depth`` values for the grid search.
        num_folds: number of cross-validation folds.
        seed: seed for the fold assignment, so repeated runs use the same folds.

    Returns:
        Tuple ``(cv_model, rmse)``: the fitted CrossValidatorModel and its RMSE
        on ``test_df``.
    """
    # One evaluator serves both model selection and the final test score.
    evaluator = RegressionEvaluator(labelCol=label_col, predictionCol="prediction", metricName="rmse")
    regressor = SparkXGBRegressor(
        features_col=featuresCol,
        label_col=label_col,
        num_workers=2,
    )
    paramGrid = ParamGridBuilder().addGrid(regressor.max_depth, list(max_depths)).build()
    crossval = CrossValidator(
        estimator=regressor,
        estimatorParamMaps=paramGrid,
        evaluator=evaluator,
        numFolds=num_folds,
        seed=seed,
    )
    cv_model = crossval.fit(assembler.transform(train_df))
    predictions = cv_model.transform(assembler.transform(test_df))
    return cv_model, evaluator.evaluate(predictions)


# Chlorine: tune on the chlorine training split, report RMSE on its test split.
# The two models get separate names so the alum run does not overwrite the chlorine one.
model_Chlorine, rmse_Chlorine = cross_validate_xgboost(train_cl, test_cl, targetCol_Chlorine)
print("RMSE for Chemical Dosages_Chlorine: ", rmse_Chlorine)

# Alum: same procedure on the alum splits.
model_Alum, rmse_Alum = cross_validate_xgboost(train_al, test_al, targetCol_Alum)
print("RMSE for Chemical Dosages_Alum: ", rmse_Alum)

RMSE for Chemical Dosages_Chlorine:  0.3192960046496115
RMSE for Chemical Dosages_Alum:  2.7887978491914196


# RandomForestRegressor

**Predicting Chemical Dosage for chlorine and Alum using Random Forest Regressor and evaluating model using RMSE**

In [14]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator

spark = SparkSession.builder.appName("RandomForestRegressor").getOrCreate()

# Load the data and drop every row that has a missing value in any column.
# NOTE(review): the XGBoost cells select the feature/label pair before dropping
# nulls and also remove zero dosages; this cell does neither, so the models are
# not evaluated on the same rows - confirm whether that is intended.
data = spark.read.csv("/content/data.csv", header=True, inferSchema=True).dropna()
# Make sure the turbidity column is a double
data = data.withColumn("turbidity", col("turbidity").cast(DoubleType()))
# Remove rows with missing or zero values in the turbidity column
data = data.dropna(subset=["turbidity"]).filter(col("turbidity") != 0)
# Split the data into training and test sets (70% training, 30% test).
# The fixed seed keeps the split repeatable for the same input file.
(trainingData, testData) = data.randomSplit([0.7, 0.3], seed=42)
# Define the feature and target columns
featuresCol = ["turbidity"]
targetColChlorine = "chlorine"
targetColAlum = "alum"
# VectorAssembler that packs the feature columns into a single vector column
assembler = VectorAssembler(inputCols=featuresCol, outputCol="features")
assembledTrainingData = assembler.transform(trainingData)
assembledTestData = assembler.transform(testData)


def fit_random_forest(train_df, test_df, label_col, num_trees=60, max_depth=5, seed=42):
    """Train a RandomForestRegressor for one target and score it on the test split.

    Args:
        train_df: assembled training DataFrame (has ``features`` and ``label_col``).
        test_df: assembled test DataFrame with the same columns.
        label_col: name of the dosage column to predict.
        num_trees: number of trees in the forest.
        max_depth: maximum depth of each tree.
        seed: seed for the forest's random sampling, so repeated runs match.

    Returns:
        Tuple ``(model, predictions, rmse)``: the fitted model, the test
        DataFrame with a ``prediction`` column, and the RMSE on ``test_df``.
    """
    forest = RandomForestRegressor(
        featuresCol="features",
        labelCol=label_col,
        numTrees=num_trees,
        maxDepth=max_depth,
        seed=seed,
    )
    model = forest.fit(train_df)
    predictions = model.transform(test_df)
    evaluator = RegressionEvaluator(labelCol=label_col, predictionCol="prediction", metricName="rmse")
    return model, predictions, evaluator.evaluate(predictions)


# Chlorine model
modelChlorine, predictionsChlorine, rmseChlorine = fit_random_forest(
    assembledTrainingData, assembledTestData, targetColChlorine
)
print("RMSE for Chemical Dosages_Chlorine: ", rmseChlorine)

# Alum model, trained and evaluated on the same splits
modelAlum, predictionsAlum, rmseAlum = fit_random_forest(
    assembledTrainingData, assembledTestData, targetColAlum
)
print("RMSE for Chemical Dosages_Alum: ", rmseAlum)

RMSE for Chemical Dosages_Chlorine:  0.2961245748141294
RMSE for Chemical Dosages_Alum:  3.1418770753858682


**Predicting Chemical Dosage for chlorine and Alum using Random Forest Regressor and evaluating model using K Fold Cross Validation**

**K Fold Cross Validation Technique**

In [15]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

spark = SparkSession.builder.appName("RandomForestRegressor").getOrCreate()

# Load the data and drop every row that has a missing value in any column.
data = spark.read.csv("/content/data.csv", header=True, inferSchema=True).dropna()

# Make sure the turbidity column is a double
data = data.withColumn("turbidity", col("turbidity").cast(DoubleType()))

# Remove rows with missing or zero values in the turbidity column
data = data.dropna(subset=["turbidity"]).filter(col("turbidity") != 0)

# Split the data into training and test sets (70% training, 30% test).
# The fixed seed keeps the split repeatable for the same input file.
(trainingData, testData) = data.randomSplit([0.7, 0.3], seed=42)

# Define the feature and target columns
featuresCol = ["turbidity"]
targetCol_Chlorine = "chlorine"
targetCol_Alum = "alum"

# VectorAssembler that packs the feature columns into a single vector column.
# Both splits are assembled once and shared by the two targets.
assembler = VectorAssembler(inputCols=featuresCol, outputCol="features")
assembledTrainingData = assembler.transform(trainingData)
assembledTestData = assembler.transform(testData)


def cross_validate_random_forest(
    train_df,
    test_df,
    label_col,
    num_trees_grid=(30, 60, 90),
    max_depth_grid=(3, 5, 7),
    num_folds=3,
    seed=42,
):
    """Tune a RandomForestRegressor with k-fold cross-validation and score it on a test split.

    Args:
        train_df: assembled training DataFrame (has ``features`` and ``label_col``).
        test_df: assembled test DataFrame with the same columns.
        label_col: name of the dosage column to predict.
        num_trees_grid: candidate ``numTrees`` values for the grid search.
        max_depth_grid: candidate ``maxDepth`` values for the grid search.
        num_folds: number of cross-validation folds.
        seed: seed for the forest and for the fold assignment.

    Returns:
        Tuple ``(cv_model, rmse)``: the fitted CrossValidatorModel and its RMSE
        on ``test_df``.
    """
    # numTrees and maxDepth are not set on the estimator: the grid below
    # supplies both for every candidate, so constructor values would be ignored.
    rf = RandomForestRegressor(featuresCol="features", labelCol=label_col, seed=seed)
    paramGrid = (
        ParamGridBuilder()
        .addGrid(rf.numTrees, list(num_trees_grid))
        .addGrid(rf.maxDepth, list(max_depth_grid))
        .build()
    )
    # One evaluator serves both model selection and the final test score.
    evaluator = RegressionEvaluator(labelCol=label_col, predictionCol="prediction", metricName="rmse")
    crossval = CrossValidator(
        estimator=rf,
        estimatorParamMaps=paramGrid,
        evaluator=evaluator,
        numFolds=num_folds,
        seed=seed,
    )
    cv_model = crossval.fit(train_df)
    return cv_model, evaluator.evaluate(cv_model.transform(test_df))


# Chlorine: cross-validated model and its RMSE on the test split
model_Chlorine, rmse_Chlorine = cross_validate_random_forest(
    assembledTrainingData, assembledTestData, targetCol_Chlorine
)
print("RMSE for Chemical Dosages_Chlorine: ", rmse_Chlorine)

# Alum: same procedure on the same splits
model_Alum, rmse_Alum = cross_validate_random_forest(
    assembledTrainingData, assembledTestData, targetCol_Alum
)
print("RMSE for Chemical Dosages_Alum: ", rmse_Alum)

RMSE for Chemical Dosages_Chlorine:  0.30294470463509265
RMSE for Chemical Dosages_Alum:  3.0305412560502205
