In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
# Read the pollution dataset from disk and preview its first rows.
DATA_PATH = '/content/updated_pollution_dataset.csv'
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Summary statistics of the DataFrame's columns.
df.describe()

In [None]:
# Distinct values present in the 'Air Quality' column.
df['Air Quality'].unique()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Encode the class labels as integers. The guard keeps this cell idempotent:
# re-running it on the already-encoded column would refit the encoder on
# integers, and `label_encoder.inverse_transform` would then return numbers
# instead of the original label names.
if 'label_encoder' not in globals() or not pd.api.types.is_numeric_dtype(df['Air Quality']):
    label_encoder = LabelEncoder()
    df['Air Quality'] = label_encoder.fit_transform(df['Air Quality'])

# Pairwise correlations between all columns, including the encoded target.
corr_matrix = df.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix of DataFrame')
plt.show()

In [None]:
# PM2.5
# PM10



In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split predictors from the 'Air Quality' target, then hold out 20% of the
# rows as a test set (fixed random_state for a repeatable split).
y = df['Air Quality']
X = df.drop(columns=['Air Quality'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Feature columns, in the order the scaler and the models expect them.
cols_to_scale=['Temperature', 'Humidity', 'PM2.5', 'PM10', 'NO2', 'SO2', 'CO',
       'Proximity_to_Industrial_Areas', 'Population_Density']
scaler=StandardScaler()

# Select the columns explicitly (the list was previously defined but unused)
# so the fitted scaler is tied to exactly these features in this order.
# The statistics are learned from the training split only and reused for the
# test split.
X_train_scaled = scaler.fit_transform(X_train[cols_to_scale])
X_test_scaled = scaler.transform(X_test[cols_to_scale])


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Baseline: logistic regression on the scaled features, scored on the test split.
lr = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train)
y_pred = lr.predict(X_test_scaled)
accuracy_score(y_test, y_pred)
# output 0.947 before hyperparameter tuning


In [None]:
from sklearn.model_selection import GridSearchCV

# 5-fold cross-validated search over the logistic-regression parameter C.
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}
grid_search = GridSearchCV(
    estimator=LogisticRegression(max_iter=1000),
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train_scaled, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best cross-validation accuracy:", grid_search.best_score_)

# Best parameters: {'C': 10}
# Best cross-validation accuracy: 0.9427499999999999

In [None]:
# Evaluate the best estimator found by the grid search on both splits.
best_lr = grid_search.best_estimator_

y_pred_tuned = best_lr.predict(X_test_scaled)
test_accuracy_tuned = accuracy_score(y_test, y_pred_tuned)
print("Test accuracy with tuned model:", test_accuracy_tuned)

# Predictions and the score derived from them get separate, accurate names
# (a variable called "accuracy" should hold the score, not the predictions).
y_train_pred_tuned = best_lr.predict(X_train_scaled)
train_accuracy_tuned = accuracy_score(y_train, y_train_pred_tuned)
print("Train accuracy with tuned model:", train_accuracy_tuned)

# Test accuracy with tuned model: 0.942
# Train accuracy with tuned model: 0.945

In [None]:

import pickle

filename = 'final_model.pkl'
# Write the tuned model inside a context manager so the file handle is
# flushed and closed even if pickling fails.
with open(filename, 'wb') as model_file:
    pickle.dump(best_lr, model_file)

# Load the model from the pickle file


In [None]:

# Persist the fitted scaler and label encoder, one pickle file each.
for artifact_path, artifact in (
    ('scaler.pkl', scaler),
    ('label_encoder.pkl', label_encoder),
):
    with open(artifact_path, 'wb') as artifact_file:
        pickle.dump(artifact, artifact_file)

# To load them later:
# with open('scaler.pkl', 'rb') as scaler_file:
#     loaded_scaler = pickle.load(scaler_file)

# with open('label_encoder.pkl', 'rb') as encoder_file:
#     loaded_encoder = pickle.load(encoder_file)

In [None]:
# Reload the pickled model and check its accuracy on the test split.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

y_pred = loaded_model.predict(X_test_scaled)

accuracy_loaded = accuracy_score(y_test, y_pred)
print("Accuracy of loaded model:", accuracy_loaded)


In [None]:

# One hand-written observation, keyed by feature name.
sample_reading = {
    'Temperature': 25.5,
    'Humidity': 60.0,
    'PM2.5': 15.0,
    'PM10': 30.0,
    'NO2': 10.0,
    'SO2': 5.0,
    'CO': 2.0,
    'Proximity_to_Industrial_Areas': 0.1,
    'Population_Density': 1500,
}
new_user_data = pd.DataFrame([sample_reading])

# Apply the already-fitted scaler, predict the encoded class, then map the
# integer prediction back through the label encoder.
user_data_scaled = scaler.transform(new_user_data)
predicted_air_quality_encoded = loaded_model.predict(user_data_scaled)
predicted_air_quality_label = label_encoder.inverse_transform(
    predicted_air_quality_encoded
)

print("Predicted Air Quality for the new user:", predicted_air_quality_label[0])


In [None]:
# rf= RandomForestClassifier()
# rf.fit(X_train_scaled,y_train)
# y_pred=rf.predict(X_test_scaled)
# accuracy_score(y_test,y_pred)
# 0.957

In [None]:

# Train/test accuracy of the baseline logistic-regression model `lr`.
# The messages name the model that is actually evaluated here.
y_train_pred_lr = lr.predict(X_train_scaled)
print(f"Logistic Regression Training Accuracy: {accuracy_score(y_train, y_train_pred_lr)}")

y_test_pred_lr = lr.predict(X_test_scaled)
print(f"Logistic Regression Testing Accuracy: {accuracy_score(y_test, y_test_pred_lr)}")


***HYPERPARAMETER TUNING***

LET'S TRY A FULLY CONNECTED NEURAL NETWORK (FCNN) FOR BETTER ACCURACY

In [None]:
!pip install optuna tensorflow scikeras

In [None]:

import optuna
from scikeras.wrappers import KerasClassifier
from tensorflow.keras.optimizers import Adam # Import Adam optimizer
import numpy as np # Import numpy for flattening
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Define the build_model function within this cell
def build_model(optimizers='adam', dropout_rate=0.2, units=64,
                n_features=None, n_classes=None):
  """Build and compile a fully connected softmax classifier.

  The network has three Dense(relu) hidden layers, each followed by Dropout,
  and a softmax output layer. It is compiled with sparse categorical
  cross-entropy loss and an accuracy metric.

  Args:
    optimizers: Optimizer name or instance handed to `model.compile`.
    dropout_rate: Dropout rate applied after every hidden layer.
    units: Number of units in each hidden layer.
    n_features: Number of input features. When None, taken from the
      notebook-level `X_train_scaled`.
    n_classes: Number of output classes. When None, taken from the number of
      distinct values in the notebook-level `y_train`.

  Returns:
    The compiled Sequential model.
  """
  # Fall back to the notebook globals only when the caller gave no sizes.
  if n_features is None:
    n_features = X_train_scaled.shape[1]
  if n_classes is None:
    n_classes = len(y_train.unique())

  model = Sequential()
  # Explicit Input layer instead of `input_dim` on the first Dense layer,
  # which recent Keras versions discourage.
  model.add(tf.keras.Input(shape=(n_features,)))
  for _ in range(3):
    model.add(Dense(units=units, activation='relu'))
    model.add(Dropout(dropout_rate))

  model.add(Dense(units=n_classes, activation='softmax'))

  model.compile(optimizer=optimizers, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
  return model

def objective(trial):
    """Optuna objective: train one candidate network, return validation accuracy.

    Samples an optimizer type, dropout rate, hidden-layer width and learning
    rate from `trial`, trains the resulting model on part of the training
    data, and scores it on a validation split carved out of the training data.
    The test split is deliberately not used here: choosing hyperparameters on
    it would make the final test accuracy an optimistic estimate.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        Accuracy of the trained candidate on the validation split.
    """
    # Define hyperparameters to tune
    optimizers = trial.suggest_categorical('optimizers', ['adam', 'rmsprop'])
    dropout_rate = trial.suggest_float('dropout_rate', 0.1, 0.5)
    units = trial.suggest_categorical('units', [32, 64, 128])
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-2, log=True)

    # Same validation split for every trial (fixed random_state) so trial
    # scores are comparable; stratified to keep class proportions.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train_scaled,
        y_train.values.flatten(),
        test_size=0.2,
        random_state=42,
        stratify=y_train,
    )

    model = build_model(optimizers=optimizers, dropout_rate=dropout_rate, units=units)

    # Recompile with an optimizer instance so the sampled learning rate is
    # actually applied (build_model compiles with the optimizer's defaults).
    if optimizers == 'adam':
        optimizer_instance = Adam(learning_rate=learning_rate)
    else:  # rmsprop
        optimizer_instance = tf.keras.optimizers.RMSprop(learning_rate=learning_rate)

    model.compile(optimizer=optimizer_instance, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    keras_classifier = KerasClassifier(model=model, epochs=100, batch_size=10, verbose=0)
    keras_classifier.fit(X_fit, y_fit)

    accuracy = keras_classifier.score(X_val, y_val)
    return accuracy

# Seed Python, NumPy and TensorFlow, plus the Optuna sampler, so that a fresh
# Restart & Run All repeats the same search (as far as the compute backend is
# itself deterministic).
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# Create an Optuna study and optimize
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
study.optimize(objective, n_trials=20)

# Print the best parameters and the best accuracy
print("Best trial:")
trial = study.best_trial

print(f"  Value: {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

# You can now build the final model with the best parameters
best_params = study.best_params

# Architecture comes from the best trial's parameters.
best_model = build_model(
    optimizers=best_params['optimizers'],
    dropout_rate=best_params['dropout_rate'],
    units=best_params['units'],
)

# Recompile with an optimizer instance so the tuned learning rate is used.
optimizer_cls = Adam if best_params['optimizers'] == 'adam' else tf.keras.optimizers.RMSprop
best_optimizer = optimizer_cls(learning_rate=best_params['learning_rate'])
best_model.compile(optimizer=best_optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Fit on the whole training split; labels are passed as a flat array.
train_labels = y_train.values.flatten()
best_model.fit(X_train_scaled, train_labels, epochs=100, batch_size=10, verbose=0)

# Score on the held-out test split.
test_labels = y_test.values.flatten()
loss, accuracy = best_model.evaluate(X_test_scaled, test_labels, verbose=0)
print(f"Accuracy of the best model on test data: {accuracy}")