# Import packages

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
sklearn.set_config(display='text')
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

: 

# Load and read the data

In [None]:
# Raw string literal: the Windows path contains backslashes (e.g. "\j") that a
# normal string literal would treat as invalid escape sequences, which recent
# Python versions report with a SyntaxWarning. The path value itself is unchanged.
data = pd.read_csv(r'E:\jar-model\jartest_augmented1.csv')
# The 'Date' column is discarded before any further processing
data = data.drop(columns=['Date'])
data.head()

In [None]:
# Keep only the rows in which every target measurement is present
target_columns = ['Turbidity', 'PH', 'Colour']
data = data.dropna(subset=target_columns)
data.head()

In [None]:
# IQR-based outlier filter
def remove_outliers_iqr(df, cols, factor=1.5):
    """Return the rows of ``df`` that have no IQR outlier in any of ``cols``.

    For each column in ``cols`` the fences are ``Q1 - factor * IQR`` and
    ``Q3 + factor * IQR``. A row is dropped as soon as one of its values
    lies strictly outside the fences of its column. Missing values never
    compare as outside the fences, so they do not cause a row to be dropped.

    Args:
        df: DataFrame to filter.
        cols: List of column labels to check.
        factor: Multiplier applied to the IQR when building the fences.

    Returns:
        The retained rows of ``df`` (all columns, original index labels).
    """
    checked = df[cols]
    first_quartile = checked.quantile(0.25)
    third_quartile = checked.quantile(0.75)
    spread = third_quartile - first_quartile

    lower_fence = first_quartile - factor * spread
    upper_fence = third_quartile + factor * spread

    too_low = checked < lower_fence
    too_high = checked > upper_fence
    has_outlier = (too_low | too_high).any(axis=1)
    return df.loc[~has_outlier]

# Filter outliers across every predictor and target column in one pass
cols = [
    # predictor columns
    'Raw_Turbidity', 'Raw_PH', 'Raw_Colour', 'PAC', 'KMnO4', 'ACD',
    # target columns
    'Turbidity', 'PH', 'Colour',
]
data = remove_outliers_iqr(data, cols)
print(f"Data shape after outlier removal: {data.shape}")
data.head()

In [None]:
# The three models share the same six predictor columns; each target still gets
# its own feature frame so the per-target sections stay independent.
feature_columns = ['Raw_Turbidity', 'Raw_Colour', 'Raw_PH', 'PAC', 'KMnO4', 'ACD']
X_turbidity = data[feature_columns]
X_ph = data[feature_columns]
X_colour = data[feature_columns]

# One target series per model
y_turbidity, y_ph, y_colour = data['Turbidity'], data['PH'], data['Colour']

# Model development

## Turbidity

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
(
    X_turbidity_train,
    X_turbidity_test,
    y_turbidity_train,
    y_turbidity_test,
) = train_test_split(
    X_turbidity,
    y_turbidity,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Initialize RandomForest model for Turbidity
# (100 trees; the fixed seed makes repeated runs build the same forest).
# NOTE: no `evaluate_model = None` placeholder here -- re-running this cell after
# the evaluation cell would replace the real evaluate_model function with None.
model_turbidity = RandomForestRegressor(n_estimators=100, random_state=42)
model_turbidity.fit(X_turbidity_train, y_turbidity_train)

## PH

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
(
    X_ph_train,
    X_ph_test,
    y_ph_train,
    y_ph_test,
) = train_test_split(
    X_ph,
    y_ph,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Hand-picked random-forest settings for the PH model
ph_forest_params = {
    "n_estimators": 200,
    "max_depth": None,          # no depth limit
    "min_samples_split": 2,
    "min_samples_leaf": 2,
    "max_features": "sqrt",
    "bootstrap": False,         # no bootstrap resampling of the training rows
    "random_state": 42,         # fixed seed for repeatable results
}
model_ph = RandomForestRegressor(**ph_forest_params)

In [None]:
# Train the PH forest on the training split
model_ph.fit(X_ph_train, y_ph_train)

## Colour

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
(
    X_colour_train,
    X_colour_test,
    y_colour_train,
    y_colour_test,
) = train_test_split(
    X_colour,
    y_colour,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Hand-picked random-forest settings for the Colour model
colour_forest_params = {
    "n_estimators": 300,
    "max_depth": None,          # no depth limit
    "min_samples_split": 2,
    "min_samples_leaf": 1,
    "max_features": "log2",
    "bootstrap": False,         # no bootstrap resampling of the training rows
    "random_state": 42,         # fixed seed for repeatable results
}
model_colour = RandomForestRegressor(**colour_forest_params)

In [None]:
# Train the Colour forest on the training split
model_colour.fit(X_colour_train, y_colour_train)

# Model Testing & Validation

In [None]:
# Predict Turbidity for the held-out test features
y_turbidity_pred = model_turbidity.predict(X_turbidity_test)
# Bare expression: shown as the cell output when run in a notebook
y_turbidity_pred

In [None]:
# Side-by-side table of actual vs. predicted Turbidity.
# Objects exposing ``flatten`` are flattened to 1-D; anything else is used
# unchanged and keeps its own index.
results_Tur = pd.DataFrame({
    column: (values.flatten() if hasattr(values, 'flatten') else values)
    for column, values in (
        ('Actual', y_turbidity_test),
        ('Predicted', y_turbidity_pred),
    )
})

results_Tur

In [None]:
import matplotlib.pyplot as plt

# Turbidity: predicted values plotted against actual values
tur_actual = results_Tur['Actual']
tur_predicted = results_Tur['Predicted']

# End points of the y = x reference line, spanning both series
min_val = min(tur_actual.min(), tur_predicted.min())
max_val = max(tur_actual.max(), tur_predicted.max())

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(tur_actual, tur_predicted, color='blue', alpha=0.7, label='Predictions')
ax.plot([min_val, max_val], [min_val, max_val], color='red', linestyle='--', label='Ideal Fit (y=x)')

# Titles, axis labels, legend and grid
ax.set_title('Actual vs Predicted')
ax.set_xlabel('Actual Values')
ax.set_ylabel('Predicted Values')
ax.legend()
ax.grid()

plt.show()

In [None]:
# Predict PH for the held-out test features
y_ph_pred = model_ph.predict(X_ph_test)
# Bare expression: shown as the cell output when run in a notebook
y_ph_pred

In [None]:
# Side-by-side table of actual vs. predicted PH.
# Objects exposing ``flatten`` are flattened to 1-D; anything else is used
# unchanged and keeps its own index.
results_Ph = pd.DataFrame({
    column: (values.flatten() if hasattr(values, 'flatten') else values)
    for column, values in (
        ('Actual', y_ph_test),
        ('Predicted', y_ph_pred),
    )
})

results_Ph

In [None]:
import matplotlib.pyplot as plt

# PH: predicted values plotted against actual values
ph_actual = results_Ph['Actual']
ph_predicted = results_Ph['Predicted']

# End points of the y = x reference line, spanning both series
min_val = min(ph_actual.min(), ph_predicted.min())
max_val = max(ph_actual.max(), ph_predicted.max())

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(ph_actual, ph_predicted, color='blue', alpha=0.7, label='Predictions')
ax.plot([min_val, max_val], [min_val, max_val], color='red', linestyle='--', label='Ideal Fit (y=x)')

# Titles, axis labels, legend and grid
ax.set_title('Actual vs Predicted')
ax.set_xlabel('Actual Values')
ax.set_ylabel('Predicted Values')
ax.legend()
ax.grid()

plt.show()


In [None]:
# Predict Colour for the held-out test features
y_colour_pred = model_colour.predict(X_colour_test)
# Bare expression: shown as the cell output when run in a notebook
y_colour_pred

In [None]:
# Side-by-side table of actual vs. predicted Colour.
# Objects exposing ``flatten`` are flattened to 1-D; anything else is used
# unchanged and keeps its own index.
results_Colo = pd.DataFrame({
    column: (values.flatten() if hasattr(values, 'flatten') else values)
    for column, values in (
        ('Actual', y_colour_test),
        ('Predicted', y_colour_pred),
    )
})

results_Colo

In [None]:
import matplotlib.pyplot as plt

# Colour: predicted values plotted against actual values
colour_actual = results_Colo['Actual']
colour_predicted = results_Colo['Predicted']

# End points of the y = x reference line, spanning both series
min_val = min(colour_actual.min(), colour_predicted.min())
max_val = max(colour_actual.max(), colour_predicted.max())

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(colour_actual, colour_predicted, color='blue', alpha=0.7, label='Predictions')
ax.plot([min_val, max_val], [min_val, max_val], color='red', linestyle='--', label='Ideal Fit (y=x)')

# Titles, axis labels, legend and grid
ax.set_title('Actual vs Predicted')
ax.set_xlabel('Actual Values')
ax.set_ylabel('Predicted Values')
ax.legend()
ax.grid()

plt.show()


In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def evaluate_model(y_test, y_pred, target_name):
    """Print the MAE and R² of a set of predictions.

    Args:
        y_test: True target values.
        y_pred: Predicted values, in the same order as ``y_test``.
        target_name: Label shown in the printed header.

    Returns:
        None. The metrics are only printed, each rounded to two decimals.
    """
    # R² is reported as-is; it is a goodness-of-fit score, not an accuracy
    # percentage, so no "accuracy" value is derived from it.
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    print(f"\nEvaluation Metrics for {target_name}:")
    print(f"Mean Absolute Error (MAE): {mae:.2f}")
    print(f"R-squared (R²): {r2:.2f}")


# Report the test-set metrics for each target, in the order Turbidity, PH, Colour
for target_label, y_true, y_hat in (
    ("Turbidity", y_turbidity_test, y_turbidity_pred),
    ("PH", y_ph_test, y_ph_pred),
    ("Colour", y_colour_test, y_colour_pred),
):
    evaluate_model(y_true, y_hat, target_label)


In [None]:
# IQR-based outlier filter (redefinition with a tighter default factor of 1.0)
def remove_outliers_iqr(df, cols, factor=1.0):
    """Return the rows of ``df`` that have no IQR outlier in any of ``cols``.

    A value is an outlier when it lies strictly below ``Q1 - factor * IQR``
    or strictly above ``Q3 + factor * IQR`` of its column. Rows containing at
    least one such value are dropped. Missing values never compare as
    outliers.

    Args:
        df: DataFrame to filter.
        cols: List of column labels to check.
        factor: Multiplier applied to the IQR when building the fences.

    Returns:
        The retained rows of ``df`` (all columns, original index labels).
    """
    selected = df[cols]
    q_low, q_high = selected.quantile(0.25), selected.quantile(0.75)
    iqr = q_high - q_low
    outside = (selected < (q_low - factor * iqr)) | (selected > (q_high + factor * iqr))
    keep_row = ~outside.any(axis=1)
    return df.loc[keep_row]

# Improve model accuracy via hyperparameter tuning for Turbidity
from sklearn.model_selection import RandomizedSearchCV
param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    # 1.0 (consider every feature at each split) replaces the former 'auto'
    # option: for a regressor 'auto' meant the same thing, but scikit-learn
    # deprecated it in 1.1 and rejects it from 1.3 on, so every sampled
    # candidate using it would fail to fit.
    'max_features': [1.0, 'sqrt', 'log2']
}
# 20 random candidates, each scored by 5-fold cross-validated R²
search_turb = RandomizedSearchCV(
    RandomForestRegressor(random_state=42),
    param_distributions=param_dist,
    n_iter=20,
    cv=5,
    scoring='r2',
    random_state=42
)
search_turb.fit(X_turbidity_train, y_turbidity_train)
model_turbidity = search_turb.best_estimator_
print('Best params for Turbidity:', search_turb.best_params_)

# Evaluate on the held-out test set. No extra fit is needed: with the default
# refit=True the search has already retrained best_estimator_ on the whole
# training split.
y_turbidity_pred = model_turbidity.predict(X_turbidity_test)
evaluate_model(y_turbidity_test, y_turbidity_pred, 'Turbidity')

# Repeat similar tuning for PH and Colour targets
# Improve model accuracy via hyperparameter tuning for PH
param_dist_ph = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    # 1.0 (consider every feature at each split) replaces the former 'auto'
    # option: for a regressor 'auto' meant the same thing, but scikit-learn
    # deprecated it in 1.1 and rejects it from 1.3 on, so every sampled
    # candidate using it would fail to fit.
    'max_features': [1.0, 'sqrt', 'log2']
}
# 20 random candidates, each scored by 5-fold cross-validated R²
search_ph = RandomizedSearchCV(
    RandomForestRegressor(random_state=42),
    param_distributions=param_dist_ph,
    n_iter=20,
    cv=5,
    scoring='r2',
    random_state=42
)
search_ph.fit(X_ph_train, y_ph_train)
model_ph = search_ph.best_estimator_
print('Best params for PH:', search_ph.best_params_)

# Evaluate on the held-out test set. No extra fit is needed: with the default
# refit=True the search has already retrained best_estimator_ on the whole
# training split.
y_ph_pred = model_ph.predict(X_ph_test)
evaluate_model(y_ph_test, y_ph_pred, 'PH')

# Improve model accuracy via hyperparameter tuning for Colour
param_dist_colour = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    # 1.0 (consider every feature at each split) replaces the former 'auto'
    # option: for a regressor 'auto' meant the same thing, but scikit-learn
    # deprecated it in 1.1 and rejects it from 1.3 on, so every sampled
    # candidate using it would fail to fit.
    'max_features': [1.0, 'sqrt', 'log2']
}
# 20 random candidates, each scored by 5-fold cross-validated R²
search_colour = RandomizedSearchCV(
    RandomForestRegressor(random_state=42),
    param_distributions=param_dist_colour,
    n_iter=20,
    cv=5,
    scoring='r2',
    random_state=42
)
search_colour.fit(X_colour_train, y_colour_train)
model_colour = search_colour.best_estimator_
print('Best params for Colour:', search_colour.best_params_)

# Evaluate on the held-out test set. No extra fit is needed: with the default
# refit=True the search has already retrained best_estimator_ on the whole
# training split.
y_colour_pred = model_colour.predict(X_colour_test)
evaluate_model(y_colour_test, y_colour_pred, 'Colour')