In [1]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import QuantileTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from keras.models import Sequential
from keras.layers import Dense
from sklearn.preprocessing import OneHotEncoder
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.utils import class_weight
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

In [2]:
# Load the pollution dataset into a DataFrame
df = pd.read_csv(filepath_or_buffer="resources/pollution_dataset.csv")

# Preview the first rows (head() returns five rows by default)
df.head()

Unnamed: 0,Temperature,Humidity,PM2.5,PM10,NO2,SO2,CO,Proximity_to_Industrial_Areas,Population_Density,Air Quality
0,27.2,51.7,35.1,46.2,26.7,32.2,0.98,11.2,314,Hazardous
1,26.3,59.3,1.0,6.2,38.3,20.4,0.68,13.5,298,Good
2,27.9,73.2,20.0,39.4,19.6,5.8,0.95,5.4,309,Good
3,23.9,51.9,14.7,24.3,5.2,12.6,1.24,4.5,282,Poor
4,25.2,59.0,26.3,30.9,26.8,13.5,1.06,5.6,293,Poor


In [3]:
# Row count of the loaded dataset
num_entries = df.shape[0]
print(f"Number of entries: {num_entries}")

Number of entries: 5000


In [4]:
# Count the missing entries in every column
missing_values = df.isna().sum()
print(missing_values)

Temperature                      0
Humidity                         0
PM2.5                            0
PM10                             0
NO2                              0
SO2                              0
CO                               0
Proximity_to_Industrial_Areas    0
Population_Density               0
Air Quality                      0
dtype: int64


In [6]:
# Keep only the pollutant measurements and the target by dropping the
# remaining four columns; `df` itself is left unchanged.
df_cleaned = df.drop(
    labels=[
        "Temperature",
        "Humidity",
        "Proximity_to_Industrial_Areas",
        "Population_Density",
    ],
    axis=1,
)

## Find and remove outliers

In [7]:
# Show which columns remain in df_cleaned after the drop
df_cleaned.columns

Index(['PM2.5', 'PM10', 'NO2', 'SO2', 'CO', 'Air Quality'], dtype='object')

In [None]:
# Create a dictionary to hold outliers
# NOTE(review): this empty dict is replaced by the return value of
# detect_outliers_iqr() a few cells below, so it looks redundant — confirm.
outliers_dict = {}

In [None]:
# Define a function to detect outliers using the IQR method
def detect_outliers_iqr(data):
    """Find the rows that are IQR outliers, column by column.

    A value is an outlier for a column when it lies below
    ``Q1 - 1.5 * IQR`` or above ``Q3 + 1.5 * IQR`` of that column.

    Only numeric columns are inspected: quantiles are undefined for string
    columns (such as a categorical target), and calling ``.quantile`` on
    them raises. The ``'air_quality_encoded'`` column, if present, is
    skipped as well because it is the target variable.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    dict
        Maps each inspected column name to a DataFrame holding the full
        rows of ``data`` whose value in that column is an outlier.
    """
    outliers_dict = {}
    numeric_columns = data.select_dtypes(include="number").columns
    for column in numeric_columns:
        if column == 'air_quality_encoded':  # Exclude the target variable
            continue
        Q1 = data[column].quantile(0.25)
        Q3 = data[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        is_outlier = (data[column] < lower_bound) | (data[column] > upper_bound)
        outliers_dict[column] = data[is_outlier]
    return outliers_dict

In [None]:
# Detect outliers in the DataFrame
# NOTE(review): this inspects `df` (every original column), not the reduced
# `df_cleaned` frame used for modelling — confirm that is intended.
outliers_dict = detect_outliers_iqr(df)

In [None]:
# Report, per inspected column, the rows flagged as outliers.
# The loop variable names are kept as-is because a later cell reads them.
for column, outliers in outliers_dict.items():
    print(f"Outliers in column '{column}':", outliers, "\n", sep="\n")

In [None]:
# NOTE(review): `column` and `outliers` are not defined in this cell; they
# appear to be the variables left over from the printing loop in the cell
# above, in which case this re-stores the last entry under its own key and
# changes nothing. Looks like a leftover — confirm and remove.
outliers_dict[column] = outliers

In [None]:
# Define a function to remove outliers using the IQR method

def remove_outliers_iqr(data):
    """Drop rows that are IQR outliers in any numeric column.

    Columns are processed one after another. For each column the bounds
    ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR`` are computed on the rows that
    survived the previous columns, and rows outside those bounds are
    removed before moving on to the next column.

    Only numeric columns are filtered: quantiles are undefined for string
    columns (such as a categorical target), and calling ``.quantile`` on
    them raises. The ``'air_quality_encoded'`` column, if present, is
    skipped as well because it is the target variable.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter. The caller's frame is not modified.

    Returns
    -------
    pandas.DataFrame
        The rows of ``data`` that fall inside the bounds of every
        inspected column, with all original columns preserved.
    """
    # Resolve the column list once, before `data` is rebound in the loop.
    numeric_columns = data.select_dtypes(include="number").columns
    for column in numeric_columns:
        if column == 'air_quality_encoded':  # Exclude the target variable
            continue
        Q1 = data[column].quantile(0.25)
        Q3 = data[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        # Remove outliers
        data = data[(data[column] >= lower_bound) & (data[column] <= upper_bound)]
    return data

In [None]:
# Confirm outliers were removed
print(f"Original DataFrame shape: {df.shape}")
print(f"Cleaned DataFrame shape: {cleaned_df.shape}")

In [None]:
cleaned_df = remove_outliers_iqr(df)

In [8]:
# Separate the target column from the feature columns
y = df_cleaned['Air Quality']  # Target
X = df_cleaned.drop(columns=['Air Quality'])  # Features

In [9]:
# Initialize the LabelEncoder
# (it is fitted on the target column `y` in the next cell)
label_encoder = LabelEncoder()

In [10]:
# Fit and transform the target variable
# The fitted encoder is kept in `label_encoder`, so the original class
# names stay available through `label_encoder.classes_`.
y_encoded = label_encoder.fit_transform(y)

In [11]:
# Hold out 20% of the rows for testing; the split is seeded and stratified
# on the encoded target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)

In [21]:
from tensorflow import keras
from tensorflow.keras import layers

In [22]:
model = keras.Sequential([
    layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),  # Input layer
    layers.Dense(32, activation='relu'),  # Hidden layer
    layers.Dense(1, activation='sigmoid')  # Output layer for binary classification
])


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [23]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [24]:
# Train for 10 epochs in mini-batches of 32, holding back 20% of the
# training data for validation.
model.fit(
    x=X_train,
    y=y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
)

Epoch 1/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.1123 - loss: -8.9062 - val_accuracy: 0.1075 - val_loss: -54.0501
Epoch 2/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.0991 - loss: -109.7452 - val_accuracy: 0.1075 - val_loss: -371.4854
Epoch 3/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.1072 - loss: -613.1423 - val_accuracy: 0.1075 - val_loss: -1407.4409
Epoch 4/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.0984 - loss: -2227.5774 - val_accuracy: 0.1075 - val_loss: -3659.3137
Epoch 5/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.0923 - loss: -6053.3647 - val_accuracy: 0.1075 - val_loss: -7550.6357
Epoch 6/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.1039 - loss: -10635.9209 - val_accuracy: 0.1075 - val_loss

<keras.src.callbacks.history.History at 0x18c3ef24950>

In [25]:
# Score the trained network on the held-out test split; the two returned
# values are the loss and the 'accuracy' metric requested at compile time.
loss, accuracy = model.evaluate(X_test, y_test)

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.1019 - loss: -70880.6328  


In [26]:
# Report the network's test accuracy rounded to two decimals
print(f'Test Accuracy: {accuracy:.2f}')

Test Accuracy: 0.10


In [13]:
# Baseline random forest: 500 trees with a fixed seed, trained on the
# training split as produced by train_test_split.
rf_model = RandomForestClassifier(n_estimators=500, random_state=78)
rf_model.fit(X_train, y_train)

In [15]:
# Predict the encoded class for every row of the test split
y_pred = rf_model.predict(X_test)

In [17]:
from sklearn.metrics import accuracy_score

In [18]:
# Accuracy of the random forest predictions on the test split.
# Note: this rebinds `accuracy`, which previously held the network's score.
accuracy = accuracy_score(y_test, y_pred)

In [19]:
# Report the random forest accuracy rounded to two decimals
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.39


In [None]:
# Shapes of the training split before resampling and scaling.
# `X_train_scaled` is only created further down (after SMOTE and scaling),
# so referencing it here fails on a fresh kernel; report `X_train` instead.
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)

In [None]:
# Apply SMOTE to the training data
# Only the training split is resampled here; X_test / y_test are not touched.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [None]:
# Standardize the features: the scaler learns its statistics from the
# resampled training data only and is then applied to both splits.
scaler = StandardScaler().fit(X_train_resampled)
X_train_scaled = scaler.transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)  # Test data reuses the training statistics

In [None]:
# Check that the scaled features and the resampled labels have matching lengths
print("X_train_scaled shape:", X_train_scaled.shape)
print("y_train_resampled shape:", y_train_resampled.shape)

In [None]:
# Random forest trained on the SMOTE-resampled, standardized features.
# A fixed seed (the same one as the baseline forest) keeps the reported
# metrics and feature importances reproducible across runs.
rf_model = RandomForestClassifier(random_state=78)
rf_model.fit(X_train_scaled, y_train_resampled)  # Use y_train_resampled here

In [None]:
# Predictions
# The model was trained on scaled features, so the scaled test split is used.
y_pred = rf_model.predict(X_test_scaled)

In [None]:
# Print the classification report
# Classes are shown as their integer codes because y_test holds the
# label-encoded target.
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred))

In [None]:
# Model 2: Logistic Regression
# Trained on the same resampled, scaled data as the random forest above.
lr_model = LogisticRegression()
lr_model.fit(X_train_scaled, y_train_resampled)  # Use y_train_resampled here

In [None]:
# Logistic regression predictions for the scaled test split
y_pred_lr = lr_model.predict(X_test_scaled)

In [None]:
# Per-class precision / recall / F1 for the logistic regression model
print("Logistic Regression Classification Report:")
print(classification_report(y_test, y_pred_lr))

In [None]:
# K-means Clustering
# Choose the number of clusters (k)
# NOTE(review): 4 presumably mirrors the number of air-quality classes — confirm.
k = 4  # You can adjust this based on your specific use case

In [None]:
# Initialize and fit the K-means model
# Clustering is unsupervised: only the scaled features are passed, no labels.
kmeans_model = KMeans(n_clusters=k, random_state=42)
kmeans_model.fit(X_train_scaled)

In [None]:
# Predict cluster labels for the training data
# (the same data the model was fitted on)
clusters = kmeans_model.predict(X_train_scaled)

In [None]:
# Append the cluster label as one extra column to the scaled training features
X_train_with_clusters = np.hstack((X_train_scaled, clusters.reshape(-1, 1)))

In [None]:
# Print cluster centers
# The centers are expressed in the scaled feature space the model was fitted on.
print("Cluster Centers:")
print(kmeans_model.cluster_centers_)

In [None]:
# Dimensionality Reduction with PCA
# Project the scaled training features onto two components for 2-D plotting.
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train_scaled)

In [None]:
# Visualize the clusters together with their centers.
# Both layers are drawn in a single cell: the inline backend renders and
# closes the current figure at the end of every cell, so plotting the
# centers from a separate cell would put them on a new, otherwise empty figure.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(X_train_pca[:, 0], X_train_pca[:, 1], c=clusters, cmap='viridis', marker='o', edgecolor='k', s=50)
ax.set_title('K-means Clustering Visualization')
ax.set_xlabel('PCA Component 1')
ax.set_ylabel('PCA Component 2')

# Plot cluster centers, projected into the same two PCA components
centers = pca.transform(kmeans_model.cluster_centers_)
ax.scatter(centers[:, 0], centers[:, 1], c='red', marker='X', s=200, label='Cluster Centers')
ax.legend()
ax.grid()
plt.show()

## Check feature importance

In [None]:
# Feature importances come from the fitted random forest. `model` is the
# Keras network defined earlier, which has no `feature_importances_`.
importances = rf_model.feature_importances_

In [None]:
# Pair every feature name with its importance, most important first
feature_importances = (
    pd.DataFrame({'Feature': X.columns, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)

In [None]:
# Horizontal bar chart of the importances; the y-axis is inverted so the
# first (highest-ranked) row appears at the top.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(feature_importances['Feature'], feature_importances['Importance'], color='skyblue')
ax.set_xlabel('Importance')
ax.set_title('Feature Importance')
ax.invert_yaxis()
plt.show()