In [2]:
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import r2_score, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [3]:
# Read the raw CSV; the path is relative to the notebook's working directory.
file_path = 'AQI and Lat Long of Countries.csv'  # Adjust path if necessary
dataset = pd.read_csv(file_path)

# Keep only the rows that have a Country value, then renumber the index
# from zero so positional and label-based lookups agree.
dataset_cleaned = dataset[dataset['Country'].notna()].reset_index(drop=True)

In [4]:
# Names of the two prediction targets: a numeric one for the regressor and a
# label one for the classifier.
target_regression = 'AQI Value'
target_classification = 'AQI Category'

# Model inputs: two text columns, four per-pollutant AQI values, and coordinates.
# NOTE(review): the near-perfect scores suggest the overall AQI may be largely
# determined by the per-pollutant values — confirm this is not target leakage.
features = [
    'Country', 'City',
    'CO AQI Value', 'Ozone AQI Value', 'NO2 AQI Value', 'PM2.5 AQI Value',
    'lat', 'lng',
]

# Pull the feature matrix and both target columns out of the cleaned frame.
X = dataset_cleaned.loc[:, features]
y_regression = dataset_cleaned.loc[:, target_regression]
y_classification = dataset_cleaned.loc[:, target_classification]

In [5]:
# Split the features and BOTH targets in a single call. train_test_split
# applies one shuffle to every array it receives, so the regression and
# classification targets are guaranteed to refer to the same train/test rows.
# (Two separate calls only stay aligned while their arguments happen to match.)
(X_train, X_test,
 y_train_reg, y_test_reg,
 y_train_class, y_test_class) = train_test_split(
    X, y_regression, y_classification, test_size=0.2, random_state=42)

In [6]:
# Column groups: numeric columns are standardised, text columns are one-hot encoded.
numerical_features = [
    'CO AQI Value', 'Ozone AQI Value', 'NO2 AQI Value', 'PM2.5 AQI Value',
    'lat', 'lng',
]
categorical_features = ['Country', 'City']

# handle_unknown='ignore' lets the encoder accept categories at predict time
# that were not present when it was fitted, instead of raising.
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numerical_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
])

In [7]:
# Random Forest pipeline for the regression target.
# Pipeline.fit() fits the step objects it is handed in place (it does not copy
# them), so this pipeline gets its own unfitted clone of the preprocessor
# instead of holding a ColumnTransformer instance that other code may re-fit.
regressor_pipeline = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

In [8]:
# Random Forest pipeline for the classification target.
# Pipeline.fit() fits the step objects it is handed in place (it does not copy
# them), so this pipeline gets its own unfitted clone of the preprocessor
# instead of holding a ColumnTransformer instance that other code may re-fit.
classifier_pipeline = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

In [9]:
# Fit both pipelines on the same training features, each against its own
# target (regressor first, then classifier).
for _pipeline, _target in ((regressor_pipeline, y_train_reg),
                           (classifier_pipeline, y_train_class)):
    _pipeline.fit(X_train, _target)

# Score the held-out features with each fitted pipeline, in the same order.
y_pred_reg, y_pred_class = (
    fitted.predict(X_test)
    for fitted in (regressor_pipeline, classifier_pipeline)
)

In [10]:
# Compare held-out predictions with the true targets.
# R² for the numeric target; accuracy plus a per-class report for the labels.
regression_r2 = r2_score(y_true=y_test_reg, y_pred=y_pred_reg)
classification_accuracy = accuracy_score(y_true=y_test_class, y_pred=y_pred_class)
classification_report_summary = classification_report(
    y_true=y_test_class,
    y_pred=y_pred_class,
)

# Report all three metrics.
print("Regression Model R² Score:", regression_r2)
print("Classification Model Accuracy:", classification_accuracy)
print("Classification Report:\n", classification_report_summary)

Regression Model R² Score: 0.9985088604897426
Classification Model Accuracy: 0.9807868252516011
Classification Report:
                                 precision    recall  f1-score   support

                          Good       1.00      1.00      1.00      1489
                     Hazardous       1.00      0.62      0.77         8
                      Moderate       0.98      1.00      0.99      1408
                     Unhealthy       0.88      0.93      0.91       179
Unhealthy for Sensitive Groups       0.95      0.81      0.88       169
                Very Unhealthy       1.00      0.46      0.63        26

                      accuracy                           0.98      3279
                     macro avg       0.97      0.80      0.86      3279
                  weighted avg       0.98      0.98      0.98      3279



In [11]:
# Example Prediction
def predict_aqi(country, city, co, ozone, no2, pm25, lat, lng,
                regressor=None, classifier=None):
    """Predict the AQI value and AQI category for a single observation.

    Builds a one-row DataFrame with the column names the models were trained
    on and passes it to both models.

    Args:
        country: Value for the 'Country' column.
        city: Value for the 'City' column.
        co: Value for the 'CO AQI Value' column.
        ozone: Value for the 'Ozone AQI Value' column.
        no2: Value for the 'NO2 AQI Value' column.
        pm25: Value for the 'PM2.5 AQI Value' column.
        lat: Value for the 'lat' column.
        lng: Value for the 'lng' column.
        regressor: Optional object with a ``predict(DataFrame)`` method used
            for the AQI value. Defaults to the notebook-level
            ``regressor_pipeline``.
        classifier: Optional object with a ``predict(DataFrame)`` method used
            for the AQI category. Defaults to the notebook-level
            ``classifier_pipeline``.

    Returns:
        A tuple ``(aqi_value, aqi_category)`` holding the first (and only)
        element of each model's prediction.
    """
    # Fall back to the pipelines defined in the notebook; passing models
    # explicitly allows reuse with pipelines loaded from disk.
    if regressor is None:
        regressor = regressor_pipeline
    if classifier is None:
        classifier = classifier_pipeline

    # Single-row frame; keys must match the training column names exactly.
    input_data = pd.DataFrame({
        'Country': [country],
        'City': [city],
        'CO AQI Value': [co],
        'Ozone AQI Value': [ozone],
        'NO2 AQI Value': [no2],
        'PM2.5 AQI Value': [pm25],
        'lat': [lat],
        'lng': [lng]
    })
    aqi_value = regressor.predict(input_data)[0]
    aqi_category = classifier.predict(input_data)[0]
    return aqi_value, aqi_category

In [12]:
# Example usage: one hand-written observation, passed by keyword so each
# value is visibly tied to the parameter it fills.
example_result = predict_aqi(
    country="United States",
    city="Los Angeles",
    co=15.0,
    ozone=50.0,
    no2=25.0,
    pm25=30.0,
    lat=34.05,
    lng=-118.25,
)
predicted_value, predicted_category = example_result
print("Predicted AQI Value:", predicted_value)
print("Predicted AQI Category:", predicted_category)

Predicted AQI Value: 50.0
Predicted AQI Category: Good


In [13]:
import joblib

# Persist both fitted pipelines to the working directory, one file each
# (regressor first, then classifier).
for _fitted_pipeline, _out_path in (
    (regressor_pipeline, "regressor_pipeline.pkl"),
    (classifier_pipeline, "classifier_pipeline.pkl"),
):
    joblib.dump(_fitted_pipeline, _out_path)

print("Models saved successfully!")


Models saved successfully!
