# Data Augmentation with SMOTE
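This notebook demonstrates how to augment an imbalanced classification dataset using **SMOTE** (Synthetic Minority Over-sampling Technique), which balances the classes by synthesizing new minority-class samples. Resampling is applied to the training split only.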

In [8]:
# Install the pinned dependencies into the running kernel's environment.
# This must be the bare line magic: a cell that contains the notebook-JSON
# wrapper for the command is evaluated as a plain dict literal and installs nothing.
%pip install scikit-learn==1.2.2 imbalanced-learn==0.10.1

{'cell_type': 'code',
 'metadata': {'language': 'python'},
 'source': ['%pip install scikit-learn==1.2.2 imbalanced-learn==0.10.1']}

In [9]:
# Import libraries
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [10]:
# Read the source CSV and discard the 'Date' column in a single expression,
# then preview the first rows
data = pd.read_csv('E:\\jar-model\\jartest.csv').drop(columns=['Date'])
data.head()

Unnamed: 0,Raw_Turbidity,Raw_PH,Raw_Colour,PAC,KMnO4,ACD,Turbidity,PH,Colour
0,90.4,7.07,500.0,2.5,0.4,0.5,19.0,7.07,48.0
1,90.4,7.07,500.0,3.0,0.4,0.5,14.2,7.05,39.0
2,90.4,7.07,500.0,3.5,0.4,0.5,12.6,7.02,31.0
3,90.4,7.07,500.0,4.0,0.4,0.5,10.6,6.98,27.0
4,90.4,7.07,500.0,4.5,0.4,0.5,10.7,6.93,29.0


In [14]:
# Demonstration target: bin the continuous 'Turbidity' column into three
# quantile-based classes, one per label
class_labels = ['low', 'medium', 'high']
data['Turbidity_class'] = pd.qcut(data['Turbidity'], q=len(class_labels), labels=class_labels)

# Features are the six input columns listed below; the target is the binned class
feature_cols = ['Raw_Turbidity', 'Raw_Colour', 'Raw_PH', 'PAC', 'KMnO4', 'ACD']
X, y = data[feature_cols], data['Turbidity_class']

In [18]:
# Split before augmentation so that SMOTE is fitted on training rows only
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Ensure features are numeric: unparseable entries become NaN and are then
# replaced by 0. The same stateless cleaning is applied to the held-out split
# so that train and test features share one representation.
# NOTE(review): 0 is a crude fill value (e.g. for 'Raw_PH') — confirm the data
# has no missing/non-numeric entries or choose a better imputation.
X_train = X_train.apply(pd.to_numeric, errors='coerce').fillna(0)
X_test = X_test.apply(pd.to_numeric, errors='coerce').fillna(0)

# Apply SMOTE with fewer neighbors than the library default
smote = SMOTE(random_state=42, k_neighbors=2)

# Encode the class labels as integers for resampling, then decode the
# resampled labels back to their original names
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
X_resampled, y_resampled_enc = smote.fit_resample(X_train, y_train_enc)
y_resampled = pd.Series(le.inverse_transform(y_resampled_enc), name='Turbidity_class')

In [19]:
# Print the per-class sample counts of the training labels before and after resampling
for caption, labels in (('Before SMOTE:', y_train), ('After SMOTE :', y_resampled)):
    print(caption, labels.value_counts())

Before SMOTE: Turbidity_class
medium    1373
high      1365
low       1346
Name: count, dtype: int64
After SMOTE : Turbidity_class
low       1373
medium    1373
high      1373
Name: count, dtype: int64


In [21]:
# Combine the resampled features with their class labels and write them to CSV
output_path = 'E:\\jar-model\\jartest_augmented.csv'
df_aug = pd.DataFrame(X_resampled, columns=feature_cols).assign(Turbidity_class=y_resampled)
df_aug.to_csv(output_path, index=False)
print(f'Augmented dataset saved to {output_path}')

Augmented dataset saved to E:\jar-model\jartest_augmented.csv
