In [None]:

# Water Potability Dataset - Data Analysis


# 1. Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# Global plot appearance. The seaborn theme is applied first because it
# resets matplotlib's rcParams; the default figure size is set afterwards
# so that it is not overwritten.
sns.set(style="whitegrid")
plt.rc('figure', figsize=(10, 6))


# 2. Load Dataset

# The raw CSV is fetched from GitHub, so this step needs network access.
url = "https://raw.githubusercontent.com/lin-010/IT326-Water-Potability/refs/heads/main/Dataset/Raw_dataset.csv"
data = pd.read_csv(url)

# The analysis refers to the frame as `df`; it was previously never assigned,
# so the first use raised NameError. `data` is kept as the original name and
# both names refer to the same DataFrame object.
df = data

# Preview the first rows to confirm the file loaded as expected.
print(df.head())


# 3. Check Missing Values

# Boolean mask with True wherever a cell is missing; reused for both the
# per-column counts and the heatmap.
missing_mask = df.isna()

print("\nMissing Values per Column:\n")
print(missing_mask.sum())

# Heatmap of the mask: each row is a sample, each column an attribute.
fig, ax = plt.subplots(figsize=(8, 5))
sns.heatmap(missing_mask, cmap="viridis", cbar=False, ax=ax)
ax.set_title("Missing Values Heatmap")
plt.show()


# 4. Statistical Summary (count, mean, std, and the five-number summary)

summary_table = df.describe()
print("\nStatistical Summary:\n")
print(summary_table)


# 5. Plot 1: Histogram - Variable Distributions

# DataFrame.hist creates its own figure, which becomes the current figure;
# the overall title is then attached to that figure.
df.hist(bins=20, color='skyblue', figsize=(12, 10))
plt.gcf().suptitle("Histograms of Numeric Attributes", fontsize=16)
plt.show()


# 6. Plot 2: Boxplot - Outliers Detection

# One horizontal box per column of the DataFrame, all on a shared axis.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df, orient="h", ax=ax)
ax.set_title("Boxplot for Detecting Outliers")
plt.show()


# 7. Plot 3: Countplot - Class Label Distribution

fig, ax = plt.subplots(figsize=(6, 4))

# Passing `palette` without `hue` is deprecated in seaborn 0.13 and slated
# for removal in 0.14. Assigning the x variable to `hue` keeps one pastel
# colour per class; dodge=False keeps each bar centred on its tick.
sns.countplot(
    x='Potability',
    hue='Potability',
    data=df,
    palette='pastel',
    dodge=False,
    ax=ax,
)

# The hue legend only repeats the x-axis labels, so drop it if one was drawn.
legend = ax.get_legend()
if legend is not None:
    legend.remove()

ax.set_title("Class Label Distribution (Potability)")
ax.set_xlabel("Potability (0 = Not Drinkable, 1 = Drinkable)")
ax.set_ylabel("Count")
plt.show()


# 8. Plot 4: Scatter Plot - Relationship Example

# pH against Hardness, coloured by class label; partial transparency keeps
# overlapping points distinguishable.
fig, ax = plt.subplots(figsize=(7, 5))
sns.scatterplot(
    data=df,
    x='ph',
    y='Hardness',
    hue='Potability',
    alpha=0.7,
    ax=ax,
)
ax.set_title("Scatter Plot of pH vs Hardness by Potability")
plt.show()

# -----------------------------------------------------------
# 9. Brief Observations:

# - Missing values exist in several columns, so preprocessing (imputation) is required.
# - Histograms show that some features like 'ph' and 'Sulfate' are skewed.
# - Boxplots reveal outliers, especially in 'Sulfate' and 'Turbidity'.
# - Class label distribution is imbalanced (more non-drinkable samples).
# - Scatter plot shows weak correlation between pH and Hardness.

# Water Potability Dataset - Data Preprocessing