## Preprocessing

In [None]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf

# Load the pollution dataset from Google Drive into a DataFrame
import pandas as pd
pollution_df = pd.read_csv(
    "/content/drive/MyDrive/Colab Notebooks/23-Project-4-Week-1/resources/updated_pollution_dataset.csv"
)
# Show (rows, columns) as the cell output
pollution_df.shape

(5000, 10)

In [None]:
# Determine the number of unique values in each column.
# The result is a Series indexed by column name, shown as the cell output.
pollution_df.nunique()

Unnamed: 0,0
Temperature,362
Humidity,723
PM2.5,815
PM10,955
NO2,445
SO2,348
CO,265
Proximity_to_Industrial_Areas,179
Population_Density,683
Air Quality,4


In [None]:
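# Convert categorical data to numeric with `pd.get_dummies`
# NOTE: left disabled on purpose -- the frame is not encoded in place here;
# the target column is one-hot encoded separately in the split cell below.
# pollution_df = pd.get_dummies(pollution_df)
# pollution_df.head()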

In [None]:
# Separate the feature columns from the one-hot encoded target
X = pollution_df.drop(columns=['Air Quality'])
y = pd.get_dummies(pollution_df['Air Quality'])
# Fully encoded copy of the frame (features plus dummy target columns)
z = pd.get_dummies(pollution_df)

# Hold out a test split; random_state pins the shuffle so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

In [None]:
# Inspect the fully one-hot encoded frame built with pd.get_dummies(pollution_df)
z

Unnamed: 0,Temperature,Humidity,PM2.5,PM10,NO2,SO2,CO,Proximity_to_Industrial_Areas,Population_Density,Air Quality_Good,Air Quality_Hazardous,Air Quality_Moderate,Air Quality_Poor
0,29.8,59.1,5.2,17.9,18.9,9.2,1.72,6.3,319,False,False,True,False
1,28.3,75.6,2.3,12.2,30.8,9.7,1.64,6.0,611,False,False,True,False
2,23.1,74.7,26.7,33.8,24.4,12.6,1.63,5.2,619,False,False,True,False
3,27.1,39.1,6.1,6.3,13.5,5.3,1.15,11.1,551,True,False,False,False
4,26.5,70.7,6.9,16.0,21.9,5.6,1.01,12.7,303,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,40.6,74.1,116.0,126.7,45.5,25.7,2.11,2.8,765,False,True,False,False
4996,28.1,96.9,6.9,25.0,25.3,10.8,1.54,5.7,709,False,False,True,False
4997,25.9,78.2,14.2,22.1,34.8,7.8,1.63,9.6,379,False,False,True,False
4998,25.3,44.4,21.4,29.0,23.7,5.7,0.89,11.6,241,True,False,False,False


In [None]:
# Create a StandardScaler instance and fit it on the training features only
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to both splits
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

## Compile, Train and Evaluate the Model

In [None]:
# Create a function that builds a new Sequential model with hyperparameter options
def create_model(hp, input_dim=9, num_classes=4):
    """Build and compile a tunable dense classifier for keras-tuner.

    Args:
        hp: keras-tuner hyperparameter container. This function registers
            'activation', 'first_units', 'num_layers' and 'units_<i>' on it.
        input_dim: Number of input features. Defaults to 9, the value the
            original first layer was hard-coded to.
        num_classes: Number of units in the softmax output layer. Defaults
            to 4, the value the original output layer was hard-coded to.

    Returns:
        A compiled ``tf.keras`` Sequential model.
    """
    nn_model = tf.keras.models.Sequential()

    # Declare the input shape explicitly instead of passing `input_dim` to the
    # first Dense layer.
    nn_model.add(tf.keras.Input(shape=(input_dim,)))

    # Allow kerastuner to decide which activation function to use in hidden layers
    activation = hp.Choice('activation', ['relu', 'tanh', 'sigmoid'])

    # Allow kerastuner to decide number of neurons in first layer
    nn_model.add(tf.keras.layers.Dense(
        units=hp.Int('first_units', min_value=1, max_value=20, step=2),
        activation=activation))

    # Allow kerastuner to decide number of hidden layers and neurons in hidden layers
    for i in range(hp.Int('num_layers', 1, 10)):
        nn_model.add(tf.keras.layers.Dense(
            units=hp.Int('units_' + str(i), min_value=1, max_value=20, step=2),
            activation=activation))

    # One softmax unit per class: the outputs form a probability distribution
    nn_model.add(tf.keras.layers.Dense(units=num_classes, activation="softmax"))

    # Compile the model.
    # The targets are one-hot encoded and mutually exclusive, so the loss that
    # matches a softmax output is categorical cross-entropy. Binary
    # cross-entropy would score every output unit as an independent yes/no
    # problem.
    nn_model.compile(loss="categorical_crossentropy", optimizer='adam', metrics=["accuracy"])

    return nn_model

In [None]:
!pip install keras-tuner

Collecting keras-tuner
  Downloading keras_tuner-1.4.7-py3-none-any.whl.metadata (5.4 kB)
Collecting kt-legacy (from keras-tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl.metadata (221 bytes)
Downloading keras_tuner-1.4.7-py3-none-any.whl (129 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.1/129.1 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras-tuner
Successfully installed keras-tuner-1.4.7 kt-legacy-1.0.5


In [None]:
# Import the kerastuner library (done here because it is installed by the cell above)
import keras_tuner as kt

# Hyperband search over the models produced by create_model,
# ranked by validation accuracy
tuner = kt.Hyperband(
    hypermodel=create_model,
    objective="val_accuracy",
    max_epochs=20,
    hyperband_iterations=2,
)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
# Run the kerastuner search for best hyperparameters.
# Validation data is carved out of the training split (Keras takes the last 20%
# of the rows passed in) rather than reusing the test split. That keeps
# X_test_scaled / y_test unseen during hyperparameter selection, so the later
# evaluation on them is a genuine held-out score.
tuner.search(
    X_train_scaled,
    # Plain float array so Keras can slice it for `validation_split`
    y_train.to_numpy(dtype="float32"),
    epochs=20,
    validation_split=0.2,
)

Trial 60 Complete [00h 00m 16s]
val_accuracy: 0.7039999961853027

Best val_accuracy So Far: 0.9488000273704529
Total elapsed time: 00h 09m 01s


In [None]:
# Pull the top-ranked hyperparameter set from the search and show its values
best_hyper = tuner.get_best_hyperparameters(num_trials=1)[0]
best_hyper.values

{'activation': 'relu',
 'first_units': 17,
 'num_layers': 5,
 'units_0': 17,
 'units_1': 17,
 'units_2': 3,
 'units_3': 9,
 'units_4': 17,
 'units_5': 15,
 'units_6': 11,
 'units_7': 1,
 'units_8': 1,
 'tuner/epochs': 20,
 'tuner/initial_epoch': 7,
 'tuner/bracket': 1,
 'tuner/round': 1,
 'tuner/trial_id': '0048',
 'units_9': 17}

In [None]:
# Retrieve the top-ranked model from the search and score it on the test split
best_model = tuner.get_best_models(num_models=1)[0]
model_loss, model_accuracy = best_model.evaluate(
    x=X_test_scaled,
    y=y_test,
    verbose=2,
)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  saveable.load_own_variables(weights_store.get(inner_path))


40/40 - 0s - 9ms/step - accuracy: 0.9488 - loss: 0.0798
Loss: 0.0798335149884224, Accuracy: 0.9488000273704529


In [None]:
# Summarize the frame: column names, non-null counts, dtypes and memory usage
pollution_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 10 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Temperature                    5000 non-null   float64
 1   Humidity                       5000 non-null   float64
 2   PM2.5                          5000 non-null   float64
 3   PM10                           5000 non-null   float64
 4   NO2                            5000 non-null   float64
 5   SO2                            5000 non-null   float64
 6   CO                             5000 non-null   float64
 7   Proximity_to_Industrial_Areas  5000 non-null   float64
 8   Population_Density             5000 non-null   int64  
 9   Air Quality                    5000 non-null   object 
dtypes: float64(8), int64(1), object(1)
memory usage: 390.8+ KB


In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Model outputs for the scaled test features (the model was fit on scaled data)
predictions = best_model.predict(X_test_scaled)
# Predicted class = column index holding the largest output value
predicted_labels = predictions.argmax(axis=1)

# y_test is one-hot encoded, so the same argmax recovers each row's class index
y_test_labels = y_test.to_numpy().argmax(axis=1)

# Build the testing classification report, then print it
testing_report = classification_report(y_true=y_test_labels, y_pred=predicted_labels)
print(testing_report)

[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       508
           1       0.96      0.79      0.87       121
           2       0.94      0.97      0.95       376
           3       0.86      0.89      0.88       245

    accuracy                           0.95      1250
   macro avg       0.94      0.91      0.92      1250
weighted avg       0.95      0.95      0.95      1250



In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Confusion matrix of true class indices against predicted class indices
testing_matrix = confusion_matrix(y_true=y_test_labels, y_pred=predicted_labels)
print(testing_matrix)

[[507   0   1   0]
 [  0  96   0  25]
 [  2   0 364  10]
 [  0   4  22 219]]


In [None]:
# Display the raw array returned by best_model.predict on the scaled test features
predictions

array([[1.36756242e-07, 1.09908838e-09, 9.99999344e-01, 5.10950827e-07],
       [9.99999940e-01, 3.82273442e-21, 8.23593277e-12, 8.22998916e-29],
       [6.00355143e-09, 9.80061054e-01, 2.25661330e-11, 1.99389998e-02],
       ...,
       [3.91897032e-07, 4.16335801e-08, 9.99896228e-01, 1.03213024e-04],
       [1.28387567e-08, 9.91064191e-01, 4.10619455e-11, 8.93586501e-03],
       [6.30222473e-07, 1.03461607e-05, 8.63283873e-01, 1.36705115e-01]],
      dtype=float32)

In [None]:
# Display the one-hot encoded test targets (one boolean column per class)
y_test

Unnamed: 0,Good,Hazardous,Moderate,Poor
2764,False,False,True,False
4767,True,False,False,False
3814,False,True,False,False
3499,False,False,True,False
2735,False,False,False,True
...,...,...,...,...
3015,False,False,False,True
1891,False,False,False,True
2773,False,False,True,False
3803,False,True,False,False


## Save the Model to a Keras (`.keras`) File

In [None]:
# Export the best model to a `.keras` file (the file name below is not HDF5/.h5)
best_model.save('AirQuality_Assessment.keras')