In [114]:
# importing libraries

import pandas as pd
import numpy as np

In [115]:
# reading the extracted csv file which contains Talcher data

ex_data = pd.read_csv('extracted_data.csv')

In [116]:
# displaying extracted data

ex_data.head() # 923 rows × 16 columns

Unnamed: 0,City,Date,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,AQI_Bucket
0,Talcher,2017-12-22,,,,,,,5.15,,0.02,,,,,
1,Talcher,2017-12-23,,,,,,,,,,,,,,
2,Talcher,2017-12-24,,,,,,,,,,,,,,
3,Talcher,2017-12-25,,,,,,,,,,,,,,
4,Talcher,2017-12-26,,,,,,,,,,,,,,


In [117]:
ex_data.isnull().sum() # checks how many missing data are present in each column

City            0
Date            0
PM2.5         182
PM10          177
NO            198
NO2           220
NOx           154
NH3           184
CO            148
SO2           190
O3            169
Benzene       239
Toluene       756
Xylene        923
AQI           225
AQI_Bucket    225
dtype: int64

In [118]:
# Drop columns with too many missing values
df = ex_data.drop(['Benzene', 'Toluene', 'Xylene'], axis=1)

In [119]:
df.head()

Unnamed: 0,City,Date,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,AQI,AQI_Bucket
0,Talcher,2017-12-22,,,,,,,5.15,,0.02,,
1,Talcher,2017-12-23,,,,,,,,,,,
2,Talcher,2017-12-24,,,,,,,,,,,
3,Talcher,2017-12-25,,,,,,,,,,,
4,Talcher,2017-12-26,,,,,,,,,,,


In [120]:
# To know datatype of each column 

for col in df.columns:
    print(f"{col}:", df[col].dtype)

City: object
Date: object
PM2.5: float64
PM10: float64
NO: float64
NO2: float64
NOx: float64
NH3: float64
CO: float64
SO2: float64
O3: float64
AQI: float64
AQI_Bucket: object


In [121]:
# Select columns with numerical data
float_cols = df.select_dtypes(include=[np.number, 'float']).columns.to_list()
selected_data = df[float_cols]

In [122]:
selected_data

Unnamed: 0,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,AQI
0,,,,,,,5.15,,0.02,
1,,,,,,,,,,
2,,,,,,,,,,
3,,,,,,,,,,
4,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
918,20.75,25.94,8.17,21.95,15.89,6.73,1.18,18.62,48.87,90.0
919,24.88,31.07,,,0.00,,1.55,21.33,48.55,98.0
920,20.37,25.46,22.82,19.64,18.96,10.23,1.59,20.79,46.84,99.0
921,18.62,23.26,21.24,26.53,47.60,4.45,1.30,14.34,28.77,86.0


In [123]:
# handling missing data
# replacing missing values using backward and forward fill
val_1 = selected_data.bfill() # backward fill
val_1 

Unnamed: 0,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,AQI
0,123.61,322.43,14.45,0.09,43.73,0.02,5.15,38.32,0.02,311.0
1,123.61,322.43,14.45,0.09,43.73,0.02,1.68,38.32,0.15,311.0
2,123.61,322.43,14.45,0.09,43.73,0.02,1.68,38.32,0.15,311.0
3,123.61,322.43,14.45,0.09,43.73,0.02,1.68,38.32,0.15,311.0
4,123.61,322.43,14.45,0.09,43.73,0.02,1.68,38.32,0.15,311.0
...,...,...,...,...,...,...,...,...,...,...
918,20.75,25.94,8.17,21.95,15.89,6.73,1.18,18.62,48.87,90.0
919,24.88,31.07,22.82,19.64,0.00,10.23,1.55,21.33,48.55,98.0
920,20.37,25.46,22.82,19.64,18.96,10.23,1.59,20.79,46.84,99.0
921,18.62,23.26,21.24,26.53,47.60,4.45,1.30,14.34,28.77,86.0


In [124]:
val_1.isnull().sum()

PM2.5    0
PM10     0
NO       0
NO2      0
NOx      0
NH3      0
CO       0
SO2      0
O3       0
AQI      0
dtype: int64

In [125]:
# Select columns with object data
df_2 = df.select_dtypes(include=["object"])
df_3 = ['City', 'Date', 'AQI_Bucket']

In [126]:
# Handle missing values in AQI_Bucket by filling with a placeholder, you can choose an appropriate method
df_2['AQI_Bucket'] = df_2['AQI_Bucket'].fillna('Unknown')

In [127]:
# OneHotEncode the categorical columns
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(handle_unknown='ignore')
df_4 = pd.DataFrame(encoder.fit_transform(df_2[df_3]).toarray(), columns=encoder.get_feature_names_out(df_3))

In [128]:
# Combine imputed numerical data with one-hot encoded categorical data
transformed_data = pd.concat([df_4, val_1], axis=1)

In [129]:
transformed_data

Unnamed: 0,City_Talcher,Date_2017-12-22,Date_2017-12-23,Date_2017-12-24,Date_2017-12-25,Date_2017-12-26,Date_2017-12-27,Date_2017-12-28,Date_2017-12-29,Date_2017-12-30,...,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,AQI
0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,123.61,322.43,14.45,0.09,43.73,0.02,5.15,38.32,0.02,311.0
1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,123.61,322.43,14.45,0.09,43.73,0.02,1.68,38.32,0.15,311.0
2,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,123.61,322.43,14.45,0.09,43.73,0.02,1.68,38.32,0.15,311.0
3,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,123.61,322.43,14.45,0.09,43.73,0.02,1.68,38.32,0.15,311.0
4,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,123.61,322.43,14.45,0.09,43.73,0.02,1.68,38.32,0.15,311.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
918,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,20.75,25.94,8.17,21.95,15.89,6.73,1.18,18.62,48.87,90.0
919,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,24.88,31.07,22.82,19.64,0.00,10.23,1.55,21.33,48.55,98.0
920,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,20.37,25.46,22.82,19.64,18.96,10.23,1.59,20.79,46.84,99.0
921,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,18.62,23.26,21.24,26.53,47.60,4.45,1.30,14.34,28.77,86.0


In [130]:
# Split data into features and target
X = transformed_data
y = df_2['AQI_Bucket']  # Using AQI_Bucket as the target

In [131]:
# Split data into training and testing sets

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [132]:
# Train Logistic Regression model
from sklearn.linear_model import LogisticRegression 
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [133]:
# Make predictions
pred = lr.predict(X_test)
pred

array(['Poor', 'Moderate', 'Unknown', 'Moderate', 'Good', 'Severe',
       'Moderate', 'Unknown', 'Moderate', 'Moderate', 'Very Poor',
       'Moderate', 'Poor', 'Moderate', 'Moderate', 'Unknown', 'Very Poor',
       'Unknown', 'Satisfactory', 'Moderate', 'Unknown', 'Moderate',
       'Moderate', 'Poor', 'Severe', 'Unknown', 'Unknown', 'Satisfactory',
       'Moderate', 'Satisfactory', 'Unknown', 'Poor', 'Unknown',
       'Moderate', 'Unknown', 'Very Poor', 'Moderate', 'Good', 'Moderate',
       'Moderate', 'Poor', 'Satisfactory', 'Moderate', 'Moderate',
       'Moderate', 'Moderate', 'Very Poor', 'Good', 'Unknown', 'Moderate',
       'Unknown', 'Unknown', 'Satisfactory', 'Poor', 'Good', 'Moderate',
       'Moderate', 'Very Poor', 'Moderate', 'Moderate', 'Moderate',
       'Satisfactory', 'Moderate', 'Unknown', 'Very Poor', 'Satisfactory',
       'Poor', 'Unknown', 'Moderate', 'Moderate', 'Moderate', 'Poor',
       'Unknown', 'Satisfactory', 'Satisfactory', 'Unknown', 'Very Poor',
    

In [134]:
# Calculate accuracy

from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, pred)
print("Accuracy:", accuracy)


Accuracy: 0.972972972972973


In [135]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Train Logistic Regression model
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

# Make predictions
pred = lr.predict(X_test)
probs = lr.predict_proba(X_test)

# Calculate various metrics
accuracy = accuracy_score(y_test, pred)
precision = precision_score(y_test, pred, average='weighted')
recall = recall_score(y_test, pred, average='weighted')
f1 = f1_score(y_test, pred, average='weighted')
roc_auc = roc_auc_score(y_test, probs, multi_class='ovr')  # Use this for binary classification

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC Score:", roc_auc)


Accuracy: 0.972972972972973
Precision: 0.9717391304347825
Recall: 0.972972972972973
F1 Score: 0.9721017721017722
ROC AUC Score: 0.9956828964639912


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
