In [5]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [6]:
# Load data
df = pd.read_csv("extracted_data.csv")
df

Unnamed: 0,City,Date,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,AQI_Bucket
0,Talcher,2017-12-22,,,,,,,5.15,,0.02,,,,,
1,Talcher,2017-12-23,,,,,,,,,,,,,,
2,Talcher,2017-12-24,,,,,,,,,,,,,,
3,Talcher,2017-12-25,,,,,,,,,,,,,,
4,Talcher,2017-12-26,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
918,Talcher,2020-06-27,20.75,25.94,8.17,21.95,15.89,6.73,1.18,18.62,48.87,0.0,,,90.0,Satisfactory
919,Talcher,2020-06-28,24.88,31.07,,,0.00,,1.55,21.33,48.55,0.0,,,98.0,Satisfactory
920,Talcher,2020-06-29,20.37,25.46,22.82,19.64,18.96,10.23,1.59,20.79,46.84,0.0,,,99.0,Satisfactory
921,Talcher,2020-06-30,18.62,23.26,21.24,26.53,47.60,4.45,1.30,14.34,28.77,0.0,,,86.0,Satisfactory


In [7]:
# Drop columns with too many missing values
df_1 = df.drop(['Benzene', 'Toluene', 'Xylene'], axis=1)
df_1

Unnamed: 0,City,Date,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,AQI,AQI_Bucket
0,Talcher,2017-12-22,,,,,,,5.15,,0.02,,
1,Talcher,2017-12-23,,,,,,,,,,,
2,Talcher,2017-12-24,,,,,,,,,,,
3,Talcher,2017-12-25,,,,,,,,,,,
4,Talcher,2017-12-26,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
918,Talcher,2020-06-27,20.75,25.94,8.17,21.95,15.89,6.73,1.18,18.62,48.87,90.0,Satisfactory
919,Talcher,2020-06-28,24.88,31.07,,,0.00,,1.55,21.33,48.55,98.0,Satisfactory
920,Talcher,2020-06-29,20.37,25.46,22.82,19.64,18.96,10.23,1.59,20.79,46.84,99.0,Satisfactory
921,Talcher,2020-06-30,18.62,23.26,21.24,26.53,47.60,4.45,1.30,14.34,28.77,86.0,Satisfactory


In [8]:
# Select columns with numerical data
float_cols = df_1.select_dtypes(include=[np.number, 'float']).columns.to_list()
selected_data = df_1[float_cols]
selected_data

Unnamed: 0,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,AQI
0,,,,,,,5.15,,0.02,
1,,,,,,,,,,
2,,,,,,,,,,
3,,,,,,,,,,
4,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
918,20.75,25.94,8.17,21.95,15.89,6.73,1.18,18.62,48.87,90.0
919,24.88,31.07,,,0.00,,1.55,21.33,48.55,98.0
920,20.37,25.46,22.82,19.64,18.96,10.23,1.59,20.79,46.84,99.0
921,18.62,23.26,21.24,26.53,47.60,4.45,1.30,14.34,28.77,86.0


In [9]:
# Impute missing numerical values using interpolation
imputed_df = selected_data.interpolate(method='linear', limit_direction='both')
imputed_df

Unnamed: 0,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,AQI
0,123.61,322.43,14.450,0.090,43.73,0.02,5.150,38.32,0.020,311.0
1,123.61,322.43,14.450,0.090,43.73,0.02,4.456,38.32,0.046,311.0
2,123.61,322.43,14.450,0.090,43.73,0.02,3.762,38.32,0.072,311.0
3,123.61,322.43,14.450,0.090,43.73,0.02,3.068,38.32,0.098,311.0
4,123.61,322.43,14.450,0.090,43.73,0.02,2.374,38.32,0.124,311.0
...,...,...,...,...,...,...,...,...,...,...
918,20.75,25.94,8.170,21.950,15.89,6.73,1.180,18.62,48.870,90.0
919,24.88,31.07,15.495,20.795,0.00,8.48,1.550,21.33,48.550,98.0
920,20.37,25.46,22.820,19.640,18.96,10.23,1.590,20.79,46.840,99.0
921,18.62,23.26,21.240,26.530,47.60,4.45,1.300,14.34,28.770,86.0


In [11]:
# Select columns with object data
df_2 = df_1.select_dtypes(include=["object"])
df_3 = ['City', 'Date', 'AQI_Bucket']

In [12]:
# Handle missing values in AQI_Bucket by filling with a placeholder
df_2['AQI_Bucket'] = df_2['AQI_Bucket'].fillna('Unknown')


In [13]:
# OneHotEncode the categorical columns
encoder = OneHotEncoder(handle_unknown='ignore')
df_4 = pd.DataFrame(encoder.fit_transform(df_2[df_3]).toarray(), columns=encoder.get_feature_names_out(df_3))
df_4

Unnamed: 0,City_Talcher,Date_2017-12-22,Date_2017-12-23,Date_2017-12-24,Date_2017-12-25,Date_2017-12-26,Date_2017-12-27,Date_2017-12-28,Date_2017-12-29,Date_2017-12-30,...,Date_2020-06-29,Date_2020-06-30,Date_2020-07-01,AQI_Bucket_Good,AQI_Bucket_Moderate,AQI_Bucket_Poor,AQI_Bucket_Satisfactory,AQI_Bucket_Severe,AQI_Bucket_Unknown,AQI_Bucket_Very Poor
0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
918,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
919,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
920,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
921,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [14]:
# Combine imputed numerical data with one-hot encoded categorical data
transformed_data = pd.concat([df_4, imputed_df], axis=1)
transformed_data

Unnamed: 0,City_Talcher,Date_2017-12-22,Date_2017-12-23,Date_2017-12-24,Date_2017-12-25,Date_2017-12-26,Date_2017-12-27,Date_2017-12-28,Date_2017-12-29,Date_2017-12-30,...,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,AQI
0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,123.61,322.43,14.450,0.090,43.73,0.02,5.150,38.32,0.020,311.0
1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,123.61,322.43,14.450,0.090,43.73,0.02,4.456,38.32,0.046,311.0
2,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,123.61,322.43,14.450,0.090,43.73,0.02,3.762,38.32,0.072,311.0
3,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,123.61,322.43,14.450,0.090,43.73,0.02,3.068,38.32,0.098,311.0
4,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,123.61,322.43,14.450,0.090,43.73,0.02,2.374,38.32,0.124,311.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
918,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,20.75,25.94,8.170,21.950,15.89,6.73,1.180,18.62,48.870,90.0
919,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,24.88,31.07,15.495,20.795,0.00,8.48,1.550,21.33,48.550,98.0
920,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,20.37,25.46,22.820,19.640,18.96,10.23,1.590,20.79,46.840,99.0
921,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,18.62,23.26,21.240,26.530,47.60,4.45,1.300,14.34,28.770,86.0


In [15]:
# Split data into features and target
X = transformed_data
y = df_2['AQI_Bucket']  # Using AQI_Bucket as the target

In [16]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
# Train Logistic Regression model
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [18]:
# Make predictions
pred = lr.predict(X_test)
pred

array(['Poor', 'Moderate', 'Unknown', 'Moderate', 'Good', 'Very Poor',
       'Moderate', 'Unknown', 'Moderate', 'Moderate', 'Very Poor',
       'Moderate', 'Poor', 'Moderate', 'Moderate', 'Unknown', 'Very Poor',
       'Unknown', 'Satisfactory', 'Moderate', 'Unknown', 'Moderate',
       'Moderate', 'Poor', 'Severe', 'Unknown', 'Unknown', 'Satisfactory',
       'Moderate', 'Satisfactory', 'Unknown', 'Poor', 'Unknown', 'Good',
       'Unknown', 'Very Poor', 'Moderate', 'Good', 'Moderate', 'Moderate',
       'Poor', 'Satisfactory', 'Moderate', 'Moderate', 'Moderate',
       'Moderate', 'Poor', 'Good', 'Unknown', 'Moderate', 'Unknown',
       'Unknown', 'Satisfactory', 'Poor', 'Good', 'Moderate', 'Moderate',
       'Very Poor', 'Moderate', 'Moderate', 'Moderate', 'Satisfactory',
       'Moderate', 'Unknown', 'Very Poor', 'Satisfactory', 'Poor',
       'Unknown', 'Moderate', 'Moderate', 'Moderate', 'Poor', 'Unknown',
       'Satisfactory', 'Satisfactory', 'Unknown', 'Very Poor',
       'Ve

In [19]:
# Calculate accuracy
accuracy = accuracy_score(y_test, pred)
print("Accuracy:", accuracy)

Accuracy: 0.9621621621621622


In [20]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Train Logistic Regression model
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

# Make predictions
pred = lr.predict(X_test)
probs = lr.predict_proba(X_test)

# Calculate various metrics
accuracy = accuracy_score(y_test, pred)
precision = precision_score(y_test, pred, average='weighted')
recall = recall_score(y_test, pred, average='weighted')
f1 = f1_score(y_test, pred, average='weighted')
roc_auc = roc_auc_score(y_test, probs, multi_class='ovr')  # Use this for binary classification

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC Score:", roc_auc)


Accuracy: 0.9621621621621622
Precision: 0.9614830849155792
Recall: 0.9621621621621622
F1 Score: 0.9615043057111212
ROC AUC Score: 0.9955245991179884


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
