# In [None]:

# Water Potability Dataset - Data Analysis


# 1. Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# Global plot styling: seaborn "whitegrid" theme, then a 10 x 6 default figure size.
sns.set(style="whitegrid")
plt.rc('figure', figsize=(10, 6))


# 2. Load Dataset
# The raw CSV is read directly from the URL below into `df`.
url = (
    "https://raw.githubusercontent.com/lin-010/IT326-Water-Potability/"
    "refs/heads/main/Dataset/Raw_dataset.csv"
)
df = pd.read_csv(url)

# Show the first rows as a quick sanity check of the parsed table.
first_rows = df.head()
print(first_rows)


# 3. Check Missing Values
# The boolean missing-value mask is built once and reused for both the
# per-column counts and the heatmap.
missing_mask = df.isna()

print("\nMissing Values per Column:\n")
print(missing_mask.sum())

# One heatmap cell per (row, column) entry of the mask.
plt.figure(figsize=(8, 5))
missing_ax = sns.heatmap(missing_mask, cmap="viridis", cbar=False)
missing_ax.set_title("Missing Values Heatmap")
plt.show()


# 4. Statistical Summary (Five-number summary)
# describe() reports min, quartiles and max (the five-number summary) together
# with count, mean and standard deviation for each numeric column.
print("\nStatistical Summary:\n")
summary_stats = df.describe()
print(summary_stats)


# 5. Plot 1: Histogram - Variable Distributions
# DataFrame.hist draws one 20-bin histogram per numeric column; the overall
# title is then attached to the figure it created.
hist_axes = df.hist(bins=20, color='skyblue', figsize=(12, 10))
plt.gcf().suptitle("Histograms of Numeric Attributes", fontsize=16)
plt.show()


# 6. Plot 2: Boxplot - Outliers Detection
# Each feature gets its own panel with an independent axis. At this point the
# columns are still raw (normalization only happens later in the script), so a
# single shared axis lets the widest-ranging column flatten every other box.
# The class label is left out: a box plot of a 0/1 target shows no outliers.
box_features = df.select_dtypes(include="number").columns.drop(
    "Potability", errors="ignore"
)
n_panel_cols = 3
n_panel_rows = max(1, -(-len(box_features) // n_panel_cols))  # ceiling division

fig, axes = plt.subplots(
    n_panel_rows, n_panel_cols, figsize=(12, 2 * n_panel_rows), squeeze=False
)
for ax, feature in zip(axes.flat, box_features):
    # Passing the column as `x` keeps the horizontal orientation of the original plot.
    sns.boxplot(x=df[feature], ax=ax)
    ax.set_xlabel(feature)

# Hide panels left over when the feature count does not fill the grid.
for ax in axes.flat[len(box_features):]:
    ax.set_visible(False)

fig.suptitle("Boxplot for Detecting Outliers")
fig.tight_layout()
plt.show()


# 7. Plot 3: Countplot - Class Label Distribution

plt.figure(figsize=(6,4))
# Passing `palette` without `hue` is deprecated in seaborn 0.13+, so the class
# column is mapped to `hue` as well. dodge=False keeps one full-width bar per
# class on releases that would otherwise dodge by hue, and the redundant legend
# is removed when one was drawn.
count_ax = sns.countplot(
    x='Potability', hue='Potability', data=df, palette='pastel', dodge=False
)
class_legend = count_ax.get_legend()
if class_legend is not None:
    class_legend.remove()
count_ax.set_title("Class Label Distribution (Potability)")
count_ax.set_xlabel("Potability (0 = Not Drinkable, 1 = Drinkable)")
count_ax.set_ylabel("Count")
plt.show()


# 8. Plot 4: Scatter Plot - Relationship Example
# pH on the x-axis against Hardness on the y-axis, with points coloured by class label.
plt.figure(figsize=(7, 5))
scatter_ax = sns.scatterplot(
    data=df, x='ph', y='Hardness', hue='Potability', alpha=0.7
)
scatter_ax.set_title("Scatter Plot of pH vs Hardness by Potability")
plt.show()

# -----------------------------------------------------------
# 9. Brief Observations:

# - Missing values exist in several columns, so preprocessing (imputation) is required.
# - Histograms show that some features like 'ph' and 'Sulfate' are skewed.
# - Boxplots reveal outliers, especially in 'Sulfate' and 'Turbidity'.
# - Class label distribution is imbalanced (more non-drinkable samples).
# - Scatter plot shows weak correlation between pH and Hardness.

# Water Potability Dataset - Data Preprocessing
# 10. Data Preprocessing

# 10.1 Handle Missing Values (Imputation)
# Each missing entry is replaced by the median of its own column.
column_medians = df.median()
df_filled = df.fillna(value=column_medians)

print("\n Missing values handled using median imputation.\n")
# Per-column count of values still missing after the fill.
remaining_missing = df_filled.isnull().sum()
print(remaining_missing)


# 10.2 Handle Outliers
# The IQR fences are computed on the feature columns only. The class label is
# deliberately excluded: for a 0/1 label whose minority class makes up less
# than 25% of the rows, Q1 == Q3, the fences collapse onto the majority value,
# and clipping would silently overwrite every minority label.
feature_cols = df_filled.columns.drop('Potability')
features = df_filled[feature_cols]

Q1 = features.quantile(0.25)
Q3 = features.quantile(0.75)
IQR = Q3 - Q1

# Define bounds: 1.5 * IQR beyond the quartiles, one pair per feature column
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Cap outliers: values outside a column's bounds are pulled back to the nearest
# bound. Working on a copy keeps df_filled intact, preserves the column order,
# and carries the label column through unchanged.
df_no_outliers = df_filled.copy()
df_no_outliers[feature_cols] = features.clip(
    lower=lower_bound, upper=upper_bound, axis=1
)

print("\n Outliers handled using IQR method.\n")


# 10.3 Normalization
from sklearn.preprocessing import MinMaxScaler

# Split the features from the class label by name, so the column labels used
# below do not depend on 'Potability' being the last column.
feature_frame = df_no_outliers.drop('Potability', axis=1)

scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(feature_frame)

# Rebuild the frame with the original column names AND row index. The label
# assignment that follows aligns on index, so a fresh 0..n-1 index would leave
# NaN labels wherever the source index differs from it.
df_scaled = pd.DataFrame(
    scaled_data, columns=feature_frame.columns, index=feature_frame.index
)
# The label is copied over unscaled.
df_scaled['Potability'] = df_no_outliers['Potability']

print("\n Data normalized using Min-Max scaling.\n")
print(df_scaled.head())


# 10.4 Feature Selection (Correlation Analysis)
# Heatmap of the pairwise correlation matrix of the scaled frame (label included).
plt.figure(figsize=(10, 6))
corr_matrix = df_scaled.corr()
corr_ax = sns.heatmap(corr_matrix, cmap="coolwarm", annot=False)
corr_ax.set_title("Correlation Matrix Heatmap")
plt.show()

# Report (nothing is dropped here) the columns whose absolute correlation with
# the class label is below 0.05.
target_corr = corr_matrix['Potability']
weak_mask = target_corr.abs() < 0.05
low_corr_features = target_corr[weak_mask].index
print("Low correlation features (optional to drop):", list(low_corr_features))


# 10.5 Save Preprocessed Data
# Written as CSV under a relative path (current working directory), without
# the row-index column.
output_path = "Preprocessed_dataset.csv"
df_scaled.to_csv(output_path, index=False)
print("\n Preprocessed dataset saved as 'Preprocessed_dataset.csv'\n")


# -----------------------------------------------------------
# 11. Summary of Preprocessing Steps
# Fixed recap text; it is printed verbatim and is not derived from the data.
preprocessing_summary = """
Preprocessing Summary:
1. Missing values handled with median imputation.
2. Outliers capped using IQR-based winsorization.
3. Data normalized using Min-Max Scaling (0–1 range).
4. Features evaluated using correlation analysis.
5. Final dataset saved for Phase 3 (classification & clustering).
"""
print(preprocessing_summary)