In [10]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load data
df = pd.read_csv("extracted_data.csv")

# Drop columns with too many missing values
df_1 = df.drop(['Benzene', 'Toluene', 'Xylene'], axis=1)

# Select columns with numerical data
float_cols = df_1.select_dtypes(include=[np.number, 'float']).columns.to_list()
selected_data = df_1[float_cols]

# Impute missing numerical values
imputer_data = KNNImputer(n_neighbors=5)
imputed_data = imputer_data.fit_transform(selected_data)
imputed_df = pd.DataFrame(imputed_data, columns=float_cols)

# Select columns with object data
df_2 = df_1.select_dtypes(include=["object"])
df_3 = ['City', 'Date', 'AQI_Bucket']

# Handle missing values in AQI_Bucket by filling with a placeholder, you can choose an appropriate method
df_2['AQI_Bucket'] = df_2['AQI_Bucket'].fillna('Unknown')

# OneHotEncode the categorical columns
encoder = OneHotEncoder(handle_unknown='ignore')
df_4 = pd.DataFrame(encoder.fit_transform(df_2[df_3]).toarray(), columns=encoder.get_feature_names_out(df_3))

# Combine imputed numerical data with one-hot encoded categorical data
transformed_data = pd.concat([df_4, imputed_df], axis=1)

# Split data into features and target
X = transformed_data
y = df_2['AQI_Bucket']  # Using AQI_Bucket as the target

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Logistic Regression model
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

# Make predictions
pred = lr.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, pred)
print("Accuracy:", accuracy)


Accuracy: 0.9621621621621622


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [8]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Train Logistic Regression model
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

# Make predictions
pred = lr.predict(X_test)
probs = lr.predict_proba(X_test)

# Calculate various metrics
accuracy = accuracy_score(y_test, pred)
precision = precision_score(y_test, pred, average='weighted')
recall = recall_score(y_test, pred, average='weighted')
f1 = f1_score(y_test, pred, average='weighted')
roc_auc = roc_auc_score(y_test, probs, multi_class='ovr')  # Use this for binary classification

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC Score:", roc_auc)


Accuracy: 0.9621621621621622
Precision: 0.9602168589962543
Recall: 0.9621621621621622
F1 Score: 0.9605261547026366
ROC AUC Score: 0.9884044236747433


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [9]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load data
df = pd.read_csv("extracted_data.csv")

# Drop columns with too many missing values
df_1 = df.drop(['Benzene', 'Toluene', 'Xylene'], axis=1)

# Select columns with numerical data
float_cols = df_1.select_dtypes(include=[np.number, 'float']).columns.to_list()
selected_data = df_1[float_cols]

# Impute missing numerical values using extrapolation
imputed_df = selected_data.interpolate(method='linear', limit_direction='both', limit_area='outside')

# Select columns with object data
df_2 = df_1.select_dtypes(include=["object"])
df_3 = ['City', 'Date', 'AQI_Bucket']

# Handle missing values in AQI_Bucket by filling with a placeholder
df_2['AQI_Bucket'] = df_2['AQI_Bucket'].fillna('Unknown')

# OneHotEncode the categorical columns
encoder = OneHotEncoder(handle_unknown='ignore')
df_4 = pd.DataFrame(encoder.fit_transform(df_2[df_3]).toarray(), columns=encoder.get_feature_names_out(df_3))

# Combine imputed numerical data with one-hot encoded categorical data
transformed_data = pd.concat([df_4, imputed_df], axis=1)

# Split data into features and target
X = transformed_data
y = df_2['AQI_Bucket']  # Using AQI_Bucket as the target

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train Logistic Regression model
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

# Make predictions
pred = lr.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, pred)
print("Accuracy:", accuracy)


ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values