In [4]:
import pandas as pd

# Read the CSV file into a DataFrame
df = pd.read_csv('preprocessed_data_2.csv')

# Convert relevant columns to numeric data types
numeric_columns = ['NO2_mean', 'PM10_mean', 'PM2_mean', 'SO2_mean', 'CO_mean', 'CO2_mean']
df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors='coerce')

# Define breakpoints and corresponding AQI categories for each pollutant
breakpoints = {
    'NO2_mean': [0, 50, 100, 200, 300, 400, 500],
    'PM10_mean': [0, 50, 150, 250, 350, 420, 500],
    'PM2_mean': [0, 12, 35.4, 55.4, 150.4, 250.4, 500.4],
    'SO2_mean': [0, 40, 80, 380, 800, 1600, 2100],
    'CO_mean': [0, 30, 60, 90, 120, 145],  # Adjusted breakpoints for CO_mean
    'CO2_mean': [0, 300, 500, 1000, 2000, 5000, 10000]
}


# Define AQI categories for each range of AQI values
aqi_categories = [0, 50, 100, 150, 200, 300, 500]

# Define a function to calculate AQI for a given pollutant concentration
def calculate_AQI(concentration, breakpoints, categories):
    for i in range(len(breakpoints) - 1):
        if breakpoints[i] <= concentration <= breakpoints[i + 1]:
            AQI = ((categories[i + 1] - categories[i]) / (breakpoints[i + 1] - breakpoints[i])) * (concentration - breakpoints[i]) + categories[i]
            return AQI
    return -float('inf')  # Return -infinity if concentration is outside the defined breakpoints


# Define a function to determine the air quality label based on the maximum AQI value
def determine_air_quality_label(max_aqi):
    if max_aqi is not None:
        if max_aqi <= 100:
            return 'Livable'
        elif max_aqi <= 300:
            return 'Un-Livable'
        else:
            return 'Highly_un-livable'
    else:
        return None

# Define a function to calculate the overall AQI for each row
def determine_air_quality(row):
    aqi_values = [calculate_AQI(row[pollutant], breakpoints[pollutant], aqi_categories) for pollutant in breakpoints]
    max_aqi = max(aqi_values)
    if max_aqi == -float('inf'):
        return None
    else:
        print(max_aqi)
        return determine_air_quality_label(max_aqi)


# Apply the function to calculate Air Quality for each row
df['Air_Quality'] = df.apply(determine_air_quality, axis=1)

# Save the updated dataset
df.to_csv('pre_data_2_phase2_try.csv', index=False)


258.0
160.83333333333334
131.66666666666666
185.83333333333334
131.66666666666666
160.83333333333334
131.66666666666666
258.0
131.66666666666666
141.66666666666669
185.83333333333334
120.83333333333334
185.83333333333334
131.66666666666666
160.83333333333334
131.66666666666666
258.0
95.83333333333334
120.83333333333334
141.66666666666669
182.5
168.33333333333334
123.33333333333334
135.83333333333334
88.33333333333334
159.16666666666666
189.16666666666669
164.16666666666666
123.33333333333334
159.16666666666666
182.5
88.33333333333334
164.16666666666666
135.83333333333334
189.16666666666669
168.33333333333334
182.5
159.16666666666666
189.16666666666669
168.33333333333334
88.33333333333334
135.83333333333334
123.33333333333334
164.16666666666666
168.33333333333334
159.16666666666666
123.33333333333334
189.16666666666669
182.5
135.83333333333334
88.33333333333334
164.16666666666666
168.33333333333334
159.16666666666666
182.5
135.83333333333334
164.16666666666666
189.16666666666669
123.333

In [5]:
df = pd.read_csv('pre_data_2_phase2_try.csv')
unique_values = df['Air_Quality'].unique()
print(unique_values)


['Un-Livable' 'Livable']


In [22]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the dataset
df = pd.read_csv('pre_data_2_phase2_try.csv')

# Drop non-numeric columns and columns with datetime values
X = df.drop(['HUMIDITY','TEMPRATURE_MAX', 'UV_MAX','TEMPRATURE_MIN', 'UV_MIN', 'Date', 'Time', 'Air_Quality', 'AIR_PRESSURE', 'NAME', 'Unnamed: 0'], axis=1)
y = df['Air_Quality']

# Define the model
model = RandomForestClassifier()

# Define the number of folds for cross-validation
n_splits = 10

# Initialize Stratified K-Fold cross-validator
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Initialize lists to store evaluation metrics
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

# Perform k-fold cross-validation
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict the target variable
    y_pred = model.predict(X_test)
    
    # Calculate evaluation metrics
    accuracy_scores.append(accuracy_score(y_test, y_pred))
    precision_scores.append(precision_score(y_test, y_pred, average='macro'))
    recall_scores.append(recall_score(y_test, y_pred, average='macro'))
    f1_scores.append(f1_score(y_test, y_pred, average='macro'))

# Compute the mean evaluation metrics
mean_accuracy = sum(accuracy_scores) / n_splits
mean_precision = sum(precision_scores) / n_splits
mean_recall = sum(recall_scores) / n_splits
mean_f1 = sum(f1_scores) / n_splits

# Print the mean evaluation metrics
print("Mean Accuracy:", mean_accuracy)
print("Mean Precision:", mean_precision)
print("Mean Recall:", mean_recall)
print("Mean F1-score:", mean_f1)


Mean Accuracy: 1.0
Mean Precision: 1.0
Mean Recall: 1.0
Mean F1-score: 1.0


In [23]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the dataset
df = pd.read_csv('pre_data_2_phase2_try.csv')

# Drop non-numeric columns and columns with datetime values
X = df.drop(['HUMIDITY','TEMPRATURE_MAX', 'UV_MAX','TEMPRATURE_MIN', 'UV_MIN', 'Date', 'Time', 'Air_Quality', 'AIR_PRESSURE', 'NAME', 'Unnamed: 0'], axis=1)
y = df['Air_Quality']

# Define the model (SVM classifier)
model = SVC()

# Define the number of folds for cross-validation
n_splits = 10

# Initialize Stratified K-Fold cross-validator
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Initialize lists to store evaluation metrics
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

# Perform k-fold cross-validation
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict the target variable
    y_pred = model.predict(X_test)
    
    # Calculate evaluation metrics
    accuracy_scores.append(accuracy_score(y_test, y_pred))
    precision_scores.append(precision_score(y_test, y_pred, average='macro'))
    recall_scores.append(recall_score(y_test, y_pred, average='macro'))
    f1_scores.append(f1_score(y_test, y_pred, average='macro'))

# Compute the mean evaluation metrics
mean_accuracy = sum(accuracy_scores) / n_splits
mean_precision = sum(precision_scores) / n_splits
mean_recall = sum(recall_scores) / n_splits
mean_f1 = sum(f1_scores) / n_splits

# Print the mean evaluation metrics
print("Mean Accuracy:", mean_accuracy)
print("Mean Precision:", mean_precision)
print("Mean Recall:", mean_recall)
print("Mean F1-score:", mean_f1)

Mean Accuracy: 0.9631929455307443
Mean Precision: 0.9397414611222213
Mean Recall: 0.9009737367629619
Mean F1-score: 0.9129296879883574


In [6]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the dataset
df = pd.read_csv('pre_data_2_phase2_try.csv')

# Drop non-numeric columns and columns with datetime values
X = df.drop(['HUMIDITY','TEMPRATURE_MAX', 'UV_MAX','TEMPRATURE_MIN', 'UV_MIN', 'Date', 'Time', 'Air_Quality', 'AIR_PRESSURE', 'NAME', 'Unnamed: 0'], axis=1)
y = df['Air_Quality']

# Define the model (KNN classifier)
model = KNeighborsClassifier()

# Define the number of folds for cross-validation
n_splits = 5

# Initialize Stratified K-Fold cross-validator
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Initialize lists to store evaluation metrics
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

# Perform k-fold cross-validation
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict the target variable
    y_pred = model.predict(X_test)
    
    # Calculate evaluation metrics
    accuracy_scores.append(accuracy_score(y_test, y_pred))
    precision_scores.append(precision_score(y_test, y_pred, average='macro'))
    recall_scores.append(recall_score(y_test, y_pred, average='macro'))
    f1_scores.append(f1_score(y_test, y_pred, average='macro'))

# Compute the mean evaluation metrics
mean_accuracy = sum(accuracy_scores) / n_splits
mean_precision = sum(precision_scores) / n_splits
mean_recall = sum(recall_scores) / n_splits
mean_f1 = sum(f1_scores) / n_splits

# Print the mean evaluation metrics
print("Mean Accuracy:", mean_accuracy)
print("Mean Precision:", mean_precision)
print("Mean Recall:", mean_recall)
print("Mean F1-score:", mean_f1)

Mean Accuracy: 0.9988153751034767
Mean Precision: 0.998474750343268
Mean Recall: 0.9984887191974895
Mean F1-score: 0.9984814698469137


In [7]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the dataset
df = pd.read_csv('pre_data_2_phase2_try.csv')

# Drop non-numeric columns and columns with datetime values
X = df.drop(['HUMIDITY','TEMPRATURE_MAX', 'UV_MAX','TEMPRATURE_MIN', 'UV_MIN', 'Date', 'Time', 'Air_Quality', 'AIR_PRESSURE', 'NAME', 'Unnamed: 0'], axis=1)
y = df['Air_Quality']

# Define the individual models
model1 = RandomForestClassifier()
model2 = KNeighborsClassifier()

# Define the ensemble model
modelfinal = VotingClassifier(estimators=[('rf', model1), ('knn', model2)], voting='hard')

# Define the number of folds for cross-validation
n_splits = 20

# Initialize Stratified K-Fold cross-validator
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Initialize lists to store evaluation metrics
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

# Perform k-fold cross-validation
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Train the model
    modelfinal.fit(X_train, y_train)
    
    # Predict the target variable
    y_pred = modelfinal.predict(X_test)
    
    # Calculate evaluation metrics
    accuracy_scores.append(accuracy_score(y_test, y_pred))
    precision_scores.append(precision_score(y_test, y_pred, average='macro'))
    recall_scores.append(recall_score(y_test, y_pred, average='macro'))
    f1_scores.append(f1_score(y_test, y_pred, average='macro'))

# Compute the mean evaluation metrics
mean_accuracy = sum(accuracy_scores) / n_splits
mean_precision = sum(precision_scores) / n_splits
mean_recall = sum(recall_scores) / n_splits
mean_f1 = sum(f1_scores) / n_splits

# Print the mean evaluation metrics
print("Mean Accuracy:", mean_accuracy)
print("Mean Precision:", mean_precision)
print("Mean Recall:", mean_recall)
print("Mean F1-score:", mean_f1)


Mean Accuracy: 0.999641033818022
Mean Precision: 0.9993260636659365
Mean Recall: 0.9997556047925104
Mean F1-score: 0.9995403200444034


In [8]:
import joblib

# Save the model as a pickle file
joblib.dump(modelfinal, 'phase2_ensemble_model.pkl')


['phase2_ensemble_model.pkl']

In [36]:
df = pd.read_csv('pre_data_2_phase2_try.csv')

df.NO2_mean.describe()

count    83571.000000
mean        68.103152
std         35.509639
min          0.000000
25%         41.500000
50%         72.000000
75%         92.500000
max        315.500000
Name: NO2_mean, dtype: float64

In [37]:
df.PM10_mean.describe()

count    83571.000000
mean        17.336995
std         11.413509
min          0.000000
25%          7.500000
50%         15.500000
75%         26.500000
max         48.500000
Name: PM10_mean, dtype: float64

In [38]:
df.PM2_mean.describe()

count    83571.000000
mean        13.727878
std          8.535012
min          0.000000
25%          6.500000
50%         12.500000
75%         20.500000
max         35.500000
Name: PM2_mean, dtype: float64

In [39]:
df.SO2_mean.describe()

count    83571.000000
mean         4.135837
std          8.849288
min          0.000000
25%          1.000000
50%          3.000000
75%          5.000000
max        157.000000
Name: SO2_mean, dtype: float64

In [40]:
df.CO_mean.describe()

count    83571.000000
mean        71.506892
std         27.891269
min         13.500000
25%         49.500000
50%         75.500000
75%         89.500000
max        144.500000
Name: CO_mean, dtype: float64

In [41]:
df.CO2_mean.describe()

count    83571.000000
mean       188.314906
std        202.069735
min          0.000000
25%         28.000000
50%         97.000000
75%        378.500000
max       1128.500000
Name: CO2_mean, dtype: float64