<a href="https://colab.research.google.com/github/leeh-nix/air-quality-index-prediction/blob/main/AQI_ml.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import tensorflow as tf
import pathlib
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Read the weather table from the current working directory.
# (On Colab with Drive mounted, the file would instead sit under
# "/content/drive/MyDrive/".)
# Both names are bound to the same freshly loaded frame; the preprocessing
# cell later rebinds `dataset`, while `df` stays as loaded.
dataset = df = pd.read_csv("weather_dataset.csv")

## Preprocessing

In [None]:
# Columns removed from every variant of the data before modelling.
columns_to_drop = [
    "last_updated", "last_updated_epoch",
    "wind_direction",
    "sunrise", "sunset",
    "moonrise", "moonset", "moon_phase", "moon_illumination",
    "feels_like_celsius", "feels_like_fahrenheit",
    "precip_in",
    "temperature_fahrenheit",
    "pressure_in",
    "timezone",
]
dataset = dataset.drop(columns_to_drop, axis="columns")

# Two variants of the pruned table.  Each one drops one air-quality index
# together with one set of visibility / gust / wind columns.
gb_only_columns = [
    "air_quality_gb-defra-index",
    "visibility_km",
    "gust_kph",
    "wind_kph",
]
us_only_columns = [
    "air_quality_us-epa-index",
    "visibility_miles",
    "gust_mph",
    "wind_mph",
]

us_dataset = dataset.drop(gb_only_columns, axis="columns")
gb_dataset = dataset.drop(us_only_columns, axis="columns")

In [None]:
# Display columns of dataset
# print(df.columns)
# print(dataset.columns)
# print(us_dataset.columns)
# print(gb_dataset.columns)


In [None]:
# Inspect which columns remain in the GB-index variant of the data.
gb_dataset.columns

In [None]:
# Separate the features from the regression target.
# The target is selected by name rather than by position, so a change in
# the CSV's column order cannot silently swap in a different column as the
# label.  gb_dataset keeps the GB DEFRA index (only the US EPA index was
# dropped from it), so this column is guaranteed to be present.
target_column = "air_quality_gb-defra-index"
y = gb_dataset[target_column]
X = gb_dataset.drop(columns=[target_column])
print(X.columns)
print(y)

### One Hot Encoding

In [None]:
# One-hot encode the categorical text columns.  drop="first" omits the
# first category of each feature, and sparse_output=False returns a dense
# array that can be wrapped in a DataFrame directly.
columns_to_encode = ["country", "location_name", "region", "condition_text"]
encoder = OneHotEncoder(sparse_output=False, drop="first")
encoded_columns = encoder.fit_transform(X[columns_to_encode])
X_encoded = pd.DataFrame(
    encoded_columns,
    columns=encoder.get_feature_names_out(columns_to_encode),
    # Reuse X's row labels: pd.concat(axis=1) aligns on the index, so a
    # fresh 0..n-1 index would misalign rows (and introduce NaNs) whenever
    # X does not carry the default RangeIndex.
    index=X.index,
)

# Replace the raw categorical columns with their indicator columns; the
# remaining original columns come first, followed by the encoded ones.
X = pd.concat([X.drop(columns=columns_to_encode), X_encoded], axis=1)

print(X.head())

In [None]:
# Dimensions of the feature matrix as (rows, columns).
X.shape

### Splitting

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Report the dimensions of every partition on a single line.
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

## Linear Regression Model

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [None]:
# Ordinary least-squares baseline.  fit() returns the estimator itself, so
# construction and training are chained.
model_lr = LinearRegression().fit(X_train, y_train)

# Predictions for the held-out rows.
y_pred_lr = model_lr.predict(X_test)

In [None]:
# Score the linear-regression predictions against the held-out targets.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred_lr)
print(f"Mean Squared Error: {mse}")

In [None]:
# Linear-model predictions against the held-out targets; the low alpha
# keeps overlapping points distinguishable.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred_lr, alpha=0.3)
ax.set_title("Actual vs Predicted")
ax.set_xlabel("Actual Values")
ax.set_ylabel("Predicted Values")
plt.show()

In [None]:
# Fitted linear-regression coefficients, plotted against their position in
# coef_.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(model_lr.coef_)
plt.show()

## Neural Network Model

In [None]:
import tensorflow as tf

# Fully connected regression network: two ReLU hidden layers followed by a
# single linear output unit.  The input width is declared with an explicit
# Input object; passing input_shape= to the first Dense layer is deprecated
# in Keras 3 and emits a warning there.
model_nn = tf.keras.models.Sequential(
    [
        tf.keras.Input(shape=(X_train.shape[1],)),
        tf.keras.layers.Dense(128, activation="relu"),
        tf.keras.layers.Dense(64, activation="relu"),
        tf.keras.layers.Dense(1),  # one unit, no activation: regression output
    ]
)

# Mean squared error is the training objective and, since no extra metrics
# are compiled in, the only value reported.
model_nn.compile(optimizer="adam", loss="mean_squared_error")

# NOTE(review): the test split doubles as validation data here, so the
# per-epoch validation loss and the final test score come from the same
# rows.  Consider a separate validation split if the score is to be reported.
history = model_nn.fit(
    X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test)
)

# With only a loss compiled in, evaluate() returns that loss as a scalar.
loss = model_nn.evaluate(X_test, y_test)
print(f"Mean Squared Error on Test Set: {loss:.2f}")

# Predictions for the held-out rows.
y_pred = model_nn.predict(X_test)

In [None]:
# Neural-network predictions plotted against the held-out targets.
plt.scatter(x=y_test, y=y_pred)
plt.ylabel("Predictions")
plt.xlabel("True Values")
plt.show()

In [None]:
# Training loss recorded by fit(), one value per epoch.
plt.plot(history.history["loss"])
plt.xlabel("Epoch")
plt.ylabel("Loss")
# Render the figure explicitly, as the other plotting cells do; without
# this the figure is not displayed outside a notebook, and inside one the
# cell echoes the repr of the last call instead.
plt.show()