# Data Augmentation and Prediction of Water Quality Parameters
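This notebook loads the dataset, augments it by appending a copy of every row perturbed with small Gaussian noise (the targets are continuous regression values, so the cells shown here jitter the data rather than apply SMOTE), and saves the augmented dataset to CSV with a download link.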

In [6]:
# Install the third-party packages this notebook relies on into the active
# kernel's environment (`%pip` targets the running kernel's interpreter).
# NOTE(review): versions are not pinned, so a fresh install may pull different
# releases than the ones this notebook was originally run with - consider pinning.
%pip install imbalanced-learn pandas numpy matplotlib seaborn

Note: you may need to restart the kernel to use updated packages.


In [13]:
# Import libraries and load dataset
import pandas as pd
import numpy as np
from IPython.display import FileLink
import matplotlib.pyplot as plt
import seaborn as sns

# Load the raw dataset from a local CSV file.
# NOTE(review): this is an absolute, machine-specific Windows path; the
# notebook will only run where this file exists at exactly this location.
df = pd.read_csv(r"E:\jar-model\jartest.csv")

# Preview the first rows (plain-text rendering).
print(df.head())

# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() - doing so emits a stray "None" line.
df.info()

# Rows with missing target values are intentionally NOT dropped here: the
# statement below is disabled, so any rows containing NaN flow through to the
# following cells unchanged. Re-enable it to filter them out.
# df = df.dropna(subset=['Turbidity', 'PH', 'Colour'])

         Date  Raw_Turbidity  Raw_PH  Raw_Colour  PAC  KMnO4  ACD  Turbidity  \
0  31/12/2019           90.4    7.07       500.0  2.5    0.4  0.5       19.0   
1  31/12/2019           90.4    7.07       500.0  3.0    0.4  0.5       14.2   
2  31/12/2019           90.4    7.07       500.0  3.5    0.4  0.5       12.6   
3  31/12/2019           90.4    7.07       500.0  4.0    0.4  0.5       10.6   
4  31/12/2019           90.4    7.07       500.0  4.5    0.4  0.5       10.7   

     PH  Colour  
0  7.07    48.0  
1  7.05    39.0  
2  7.02    31.0  
3  6.98    27.0  
4  6.93    29.0  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5112 entries, 0 to 5111
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Date           5106 non-null   object 
 1   Raw_Turbidity  5106 non-null   float64
 2   Raw_PH         5106 non-null   float64
 3   Raw_Colour     5106 non-null   float64
 4   PAC            5106 non-null   floa

In [14]:
# Column groups: model inputs (raw water readings and chemical doses) and the
# measured outputs that serve as regression targets.
FEATURE_COLS = ['Raw_Turbidity', 'Raw_PH', 'Raw_Colour', 'PAC', 'KMnO4', 'ACD']
TARGET_COLS = ['Turbidity', 'PH', 'Colour']

# Define input features and target variables
X = df[FEATURE_COLS]
y = df[TARGET_COLS]

# Display original dataset shape
print(f"Original dataset shape: {df.shape}")

# Augment data by adding Gaussian noise (regression targets).
# A seeded generator makes the augmentation reproducible: re-running this cell
# (or Restart & Run All) regenerates exactly the same noisy rows and CSV file.
SEED = 42
rng = np.random.default_rng(SEED)

n = len(df)
noise_factor = 0.01  # noise std-dev as a fraction of each column's own std-dev
df_noise = df.copy()
for col in FEATURE_COLS + TARGET_COLS:
    # Scale the noise per column so every variable is perturbed proportionally
    # to its spread. NaN entries stay NaN (NaN + noise is NaN).
    col_scale = df[col].std() * noise_factor
    df_noise[col] = df[col] + rng.normal(0, col_scale, size=n)

# Combine original and noisy data (non-numeric columns such as Date are simply
# duplicated, since only the listed numeric columns are perturbed).
df_augmented = pd.concat([df, df_noise], ignore_index=True)
assert len(df_augmented) == 2 * n, "augmented frame should hold exactly two copies of the data"

# Display augmented dataset shape
print(f"Augmented dataset shape: {df_augmented.shape}")

# Save augmented dataset to CSV
output_path = r"E:\jar-model\jartest_augmented1.csv"
df_augmented.to_csv(output_path, index=False)

# Provide link to download the augmented CSV file
# NOTE(review): output_path is an absolute local path; confirm the rendered
# link resolves from the notebook server's working directory.
FileLink(output_path)

Original dataset shape: (5112, 10)
Augmented dataset shape: (10224, 10)
