In [1]:
# Import the modules
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from keras.layers import Dense
from keras.layers import Input
from keras.models import Sequential
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import StandardScaler

## Preprocessing

In [2]:
# Load the pollution dataset (path is relative to the working directory)
df = pd.read_csv(Path("resources/pollution_dataset.csv"))

# Preview the first five rows
df.head()

Unnamed: 0,Temperature,Humidity,PM2.5,PM10,NO2,SO2,CO,Proximity_to_Industrial_Areas,Population_Density,Air Quality
0,27.2,51.7,35.1,46.2,26.7,32.2,0.98,11.2,314,Hazardous
1,26.3,59.3,1.0,6.2,38.3,20.4,0.68,13.5,298,Good
2,27.9,73.2,20.0,39.4,19.6,5.8,0.95,5.4,309,Good
3,23.9,51.9,14.7,24.3,5.2,12.6,1.24,4.5,282,Poor
4,25.2,59.0,26.3,30.9,26.8,13.5,1.06,5.6,293,Poor


In [3]:
# Number of rows in the loaded dataset
num_entries = df.shape[0]
print(f"Number of entries: {num_entries}")

Number of entries: 5000


In [4]:
# Count the missing entries in each column
missing_values = df.isna().sum()
print(missing_values)

Temperature                      0
Humidity                         0
PM2.5                            0
PM10                             0
NO2                              0
SO2                              0
CO                               0
Proximity_to_Industrial_Areas    0
Population_Density               0
Air Quality                      0
dtype: int64


In [5]:
# Integer code assigned to each of the four Air Quality labels
air_quality_mapping = dict(zip(['Good', 'Moderate', 'Poor', 'Hazardous'], range(4)))

In [6]:
# Encode the target labels (Air Quality) as integers using air_quality_mapping.
# The LabelEncoder instance is created here but the encoding itself goes through
# the explicit mapping, so the codes follow the dictionary above.
label_encoder = LabelEncoder()

# Series.map silently produces NaN for any label missing from the mapping, so
# fail fast here rather than letting NaN targets reach the models.
unknown_labels = set(df['Air Quality'].unique()) - set(air_quality_mapping)
if unknown_labels:
    raise ValueError(f"Unmapped 'Air Quality' labels: {unknown_labels}")

df['air_quality_encoded'] = df['Air Quality'].map(air_quality_mapping)

In [7]:
# Preview the frame with the new air_quality_encoded column
df.head()

Unnamed: 0,Temperature,Humidity,PM2.5,PM10,NO2,SO2,CO,Proximity_to_Industrial_Areas,Population_Density,Air Quality,air_quality_encoded
0,27.2,51.7,35.1,46.2,26.7,32.2,0.98,11.2,314,Hazardous,3
1,26.3,59.3,1.0,6.2,38.3,20.4,0.68,13.5,298,Good,0
2,27.9,73.2,20.0,39.4,19.6,5.8,0.95,5.4,309,Good,0
3,23.9,51.9,14.7,24.3,5.2,12.6,1.24,4.5,282,Poor,2
4,25.2,59.0,26.3,30.9,26.8,13.5,1.06,5.6,293,Poor,2


In [8]:
# Drop the original string label; air_quality_encoded now carries the target.
# errors='ignore' keeps this cell safe to re-run once the column is already gone.
df = df.drop(columns=['Air Quality'], errors='ignore')

In [9]:
# Preview the frame after removing the string label column
df.head()

Unnamed: 0,Temperature,Humidity,PM2.5,PM10,NO2,SO2,CO,Proximity_to_Industrial_Areas,Population_Density,air_quality_encoded
0,27.2,51.7,35.1,46.2,26.7,32.2,0.98,11.2,314,3
1,26.3,59.3,1.0,6.2,38.3,20.4,0.68,13.5,298,0
2,27.9,73.2,20.0,39.4,19.6,5.8,0.95,5.4,309,0
3,23.9,51.9,14.7,24.3,5.2,12.6,1.24,4.5,282,2
4,25.2,59.0,26.3,30.9,26.8,13.5,1.06,5.6,293,2


## Scale Features

In [10]:
# Pick the columns to standardize and set the encoded target aside.
# Note: 'Proximity_to_Industrial_Areas' is not part of this selection.
features_to_scale = df.loc[:, ['Temperature', 'Humidity', 'PM2.5', 'PM10', 'NO2', 'SO2', 'CO', 'Population_Density']]
target_variables = df.loc[:, 'air_quality_encoded']

In [11]:
# Fit a standard scaler on the selected features, then apply it to the same data
scaler = StandardScaler().fit(features_to_scale)
scaled_features = scaler.transform(features_to_scale)

In [12]:
# Wrap the scaled array back into a DataFrame with the original feature names
scaled_df = pd.DataFrame(data=scaled_features, columns=features_to_scale.columns.tolist())

In [13]:
# Attach the target as-is (unscaled); raw values are used so no index alignment happens
scaled_df = scaled_df.assign(air_quality_encoded=target_variables.to_numpy())

In [14]:
# Preview the standardized features alongside the target
scaled_df.head()

Unnamed: 0,Temperature,Humidity,PM2.5,PM10,NO2,SO2,CO,Population_Density,air_quality_encoded
0,0.316355,-0.55627,0.17153,0.201899,0.504399,2.22478,0.254142,0.816329,3
1,0.152291,-0.051062,-0.954519,-1.101455,1.530961,0.685856,-0.753211,-0.113179,0
2,0.44396,0.872937,-0.327102,-0.019671,-0.123927,-1.218235,0.153406,0.525857,0
3,-0.285212,-0.542975,-0.502118,-0.511687,-1.398279,-0.331398,1.12718,-1.042687,2
4,-0.048231,-0.071004,-0.119063,-0.296634,0.513249,-0.214022,0.522769,-0.40365,2


## Check and normalize skewness

In [None]:
# Names of every float64/int64 column in df; no column is excluded by name here,
# so an integer-typed air_quality_encoded would be included as well.
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns

In [None]:
# Per-column skewness; this notebook treats values above 1 or below -1 as skewed
skewness_values = df.loc[:, numerical_cols].skew()
skewness_values

In [None]:
# Use a quantile transformer to map the skewed columns onto a normal distribution.
# random_state matches the seed used elsewhere in this notebook so the transform
# stays reproducible.
# NOTE(review): the transformer is fitted on the full dataset before the
# train/test split below, so test rows influence the learned quantiles.
skewed_cols = ['PM2.5', 'PM10', 'Proximity_to_Industrial_Areas']
qt = QuantileTransformer(output_distribution='normal', random_state=42)
df[skewed_cols] = qt.fit_transform(df[skewed_cols])

In [None]:
# Recompute skewness to check the effect of the quantile transform
skewness_values = df.loc[:, numerical_cols].skew()
skewness_values

## Find and remove outliers

In [None]:
# Start with an empty container for the per-column outlier rows
outliers_dict = dict()

In [None]:
# Define a function to detect outliers using the IQR method
def detect_outliers_iqr(data, target_col='air_quality_encoded', factor=1.5):
    """Collect, for each numeric feature column, the rows outside its IQR fences.

    The fences of a column are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``,
    with the quartiles computed on the full ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect. Non-numeric columns are skipped.
    target_col : str, default 'air_quality_encoded'
        Column that is never checked for outliers (the target variable).
    factor : float, default 1.5
        Multiplier applied to the IQR when building the fences.

    Returns
    -------
    dict
        Maps each inspected column name to the DataFrame of rows whose value
        in that column falls below the lower or above the upper fence.
    """
    outliers_dict = {}
    numeric_columns = data.select_dtypes(include='number').columns
    for column in numeric_columns:
        if column == target_col:  # Exclude the target variable
            continue
        q1, q3 = data[column].quantile([0.25, 0.75])
        iqr = q3 - q1
        lower_bound = q1 - factor * iqr
        upper_bound = q3 + factor * iqr
        is_outlier = (data[column] < lower_bound) | (data[column] > upper_bound)
        outliers_dict[column] = data[is_outlier]
    return outliers_dict

In [None]:
# Detect outliers in the DataFrame: one entry per checked column of df,
# each holding the rows flagged as outliers for that column
outliers_dict = detect_outliers_iqr(df)

In [None]:
# Show the flagged rows column by column (the target is not among the keys);
# each column's section is followed by two blank lines
for column, outliers in outliers_dict.items():
    print(f"Outliers in column '{column}':", outliers, "\n", sep="\n")

In [None]:
# Compact summary: number of rows flagged as outliers for each column.
# Built only from outliers_dict, so it does not rely on loop variables
# left over from the printing cell.
outlier_counts = pd.Series(
    {col: len(rows) for col, rows in outliers_dict.items()},
    name='n_outliers',
)
outlier_counts

In [None]:
# Define a function to remove outliers using the IQR method

def remove_outliers_iqr(data, target_col='air_quality_encoded', factor=1.5):
    """Return ``data`` without the rows that are IQR outliers in any numeric feature.

    The fences of every column (``Q1 - factor * IQR`` and ``Q3 + factor * IQR``)
    are computed on the full input frame, so the result does not depend on
    column order and matches the fences used by ``detect_outliers_iqr``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter. Non-numeric columns are not checked.
    target_col : str, default 'air_quality_encoded'
        Column that is never checked for outliers (the target variable).
    factor : float, default 1.5
        Multiplier applied to the IQR when building the fences.

    Returns
    -------
    pandas.DataFrame
        The rows of ``data`` whose value lies within the fences (inclusive)
        for every checked column. Rows with a missing value in a checked
        column are dropped, because the comparison is False for NaN.
    """
    # Positional mask, so it works regardless of what the index looks like
    keep = np.ones(len(data), dtype=bool)
    for column in data.select_dtypes(include='number').columns:
        if column == target_col:  # Exclude the target variable
            continue
        q1, q3 = data[column].quantile([0.25, 0.75])
        iqr = q3 - q1
        lower_bound = q1 - factor * iqr
        upper_bound = q3 + factor * iqr
        keep &= data[column].between(lower_bound, upper_bound).to_numpy()
    return data[keep]

In [None]:
# Apply the IQR filter to df; the result is stored separately as cleaned_df.
# NOTE(review): the modelling cells below build X and y from df, not cleaned_df —
# confirm whether the models were meant to train on the cleaned data.
cleaned_df = remove_outliers_iqr(df)

In [None]:
# Compare row/column counts before and after outlier removal
print(
    f"Original DataFrame shape: {df.shape}",
    f"Cleaned DataFrame shape: {cleaned_df.shape}",
    sep="\n",
)

## Check class distribution and use SMOTE to resample it

In [None]:
# Number of rows per encoded air-quality class in df
class_distribution = df.loc[:, 'air_quality_encoded'].value_counts()
print(class_distribution)

In [None]:
# Feature matrix: every column of df except the encoded target
X = df.drop(columns='air_quality_encoded')
# Target vector: the encoded air-quality class
y = df.loc[:, 'air_quality_encoded']

In [None]:
# Split the data into training and testing sets (20% held out, fixed seed)
# NOTE(review): the split is not stratified on y; consider stratify=y if the
# class distribution printed above is imbalanced.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Initialize SMOTE with a fixed seed so the resampling is repeatable
smote = SMOTE(random_state=42)

In [None]:
# Resample the training split only; the test split is left untouched.
# NOTE(review): no model in the cells below is fitted on X_resampled/y_resampled —
# confirm whether training on the resampled data was intended.
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

In [None]:
# Show the per-class counts of the training target before and after SMOTE
print(
    "Original class distribution:",
    y_train.value_counts(),
    "\nResampled class distribution:",
    y_resampled.value_counts(),
    sep="\n",
)

## Random Forest model

In [None]:
# Fit a seeded Random Forest on the training split.
# NOTE(review): this uses X_train/y_train, not the SMOTE output
# X_resampled/y_resampled computed above — confirm intent.
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

In [None]:
# Make predictions on the test set with the Random Forest fitted above
y_pred = model.predict(X_test)

In [None]:
# Classification report for the Random Forest on the test split
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

## Check feature importance

In [None]:
# Feature importances of the current `model`. This must run before `model`
# is rebound to a different estimator further down the notebook.
importances = model.feature_importances_

In [None]:
# Pair each feature name with its importance, most important first
feature_importances = (
    pd.DataFrame({'Feature': X.columns, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)

In [None]:
# Horizontal bar chart of the importances, with the top feature at the top
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(feature_importances['Feature'], feature_importances['Importance'], color='skyblue')
ax.set_xlabel('Importance')
ax.set_title('Feature Importance')
ax.invert_yaxis()
plt.show()

## KNN model

In [None]:
# Learn the scaling parameters from the training split only, then apply the
# same transform to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Number of neighbours consulted for each prediction
k = 3
# Note: this rebinds `model`, which previously held the Random Forest
model = KNeighborsClassifier(k)

In [None]:
# Fit the k-nearest-neighbours classifier on the standardized training features
model.fit(X_train_scaled, y_train)

In [None]:
# Predict the test split, scaled with the scaler fitted on the training split
predictions = model.predict(X_test_scaled)

In [None]:
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Accuracy and per-class metrics for the k-nearest-neighbours predictions
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=predictions))
print(classification_report(y_true=y_test, y_pred=predictions))

## NN Model

In [None]:
# Show (rows, columns) of the training features; the column count is the
# input width the network below must accept
print(X_train.shape)

In [None]:
# One-hot encode the training target; the fitted encoder is reused for the test target
encoder = OneHotEncoder(sparse_output=False)
y_train_array = y_train.to_numpy().reshape(-1, 1)
y_train_encoded = encoder.fit(y_train_array).transform(y_train_array)

In [None]:
nn_model = tf.keras.models.Sequential()
nn_model.add(tf.keras.layers.Input(shape=(9,)))  # Ensure this matches the number of features in X_train
nn_model.add(tf.keras.layers.Dense(10, activation='relu'))  # First hidden layer
nn_model.add(tf.keras.layers.Dense(5, activation='relu'))   # Second hidden layer

In [None]:
nn_model.add(tf.keras.layers.Dense(4, activation='softmax'))  # Output layer for 4 classes

In [None]:
# Categorical cross-entropy pairs with the one-hot targets and softmax output
nn_model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
nn_model.fit(X_train, y_train_encoded, epochs=50, batch_size=10)

In [None]:
y_test_array = y_test.values.reshape(-1, 1)
y_test_encoded = encoder.transform(y_test_array)  # Use transform to keep the same encoding

In [None]:
loss, accuracy = nn_model.evaluate(X_test, y_test_encoded)
print(f'Loss: {loss}, Accuracy: {accuracy}')

In [None]:
y_pred = nn_model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)  # Get the class with the highest probability
y_test_classes = np.argmax(y_test_encoded, axis=1)  # Get the true classes

In [None]:
# Per-class metrics for the neural network, using class indices on both sides
report = classification_report(y_true=y_test_classes, y_pred=y_pred_classes)
print(report)

In [None]:
# One-vs-rest logistic regression. The `multi_class` argument of LogisticRegression
# is deprecated in recent scikit-learn releases; wrapping the estimator in
# OneVsRestClassifier requests the same one-vs-rest scheme explicitly.
# (A plain LogisticRegression fits a multinomial/softmax model instead.)
model = OneVsRestClassifier(LogisticRegression(max_iter=1000))

In [None]:
# Fit the logistic-regression model on X_train (not the standardized X_train_scaled)
model.fit(X_train, y_train)

In [None]:
# Predict the test split; this rebinds y_pred, which held the network's output
y_pred = model.predict(X_test)

In [None]:
# Per-class precision/recall/F1 for the logistic-regression predictions
print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix of the logistic-regression predictions on the test split
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(conf_matrix)