In [22]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report
from imblearn.over_sampling import RandomOverSampler
from imblearn.metrics import classification_report_imbalanced

---

## Split the Data into Training and Testing Sets

In [23]:
# Read the CSV file from the Resources folder into a Pandas DataFrame
df = pd.read_csv("Resources/global_air_pollution_dataset.csv")

#lending_data_df = pd.read_csv("C:/Users/kfbie/OneDrive/Desktop/credit-risk-classification/Resources/lending_data.csv")

# Review the DataFrame
display(df.head())
display(df.tail())

Unnamed: 0,Country,City,AQI Value,AQI Category,CO AQI Value,CO AQI Category,Ozone AQI Value,Ozone AQI Category,NO2 AQI Value,NO2 AQI Category,PM2.5 AQI Value,PM2.5 AQI Category
0,Russian Federation,Praskoveya,51,Moderate,1,Good,36,Good,0,Good,51,Moderate
1,Brazil,Presidente Dutra,41,Good,1,Good,5,Good,1,Good,41,Good
2,Italy,Priolo Gargallo,66,Moderate,1,Good,39,Good,2,Good,66,Moderate
3,Poland,Przasnysz,34,Good,1,Good,34,Good,0,Good,20,Good
4,France,Punaauia,22,Good,0,Good,22,Good,0,Good,6,Good


Unnamed: 0,Country,City,AQI Value,AQI Category,CO AQI Value,CO AQI Category,Ozone AQI Value,Ozone AQI Category,NO2 AQI Value,NO2 AQI Category,PM2.5 AQI Value,PM2.5 AQI Category
23458,India,Gursahaiganj,184,Unhealthy,3,Good,154,Unhealthy,2,Good,184,Unhealthy
23459,France,Sceaux,50,Good,1,Good,20,Good,5,Good,50,Good
23460,India,Mormugao,50,Good,1,Good,22,Good,1,Good,50,Good
23461,United States of America,Westerville,71,Moderate,1,Good,44,Good,2,Good,71,Moderate
23462,Malaysia,Marang,70,Moderate,1,Good,38,Good,0,Good,70,Moderate


In [24]:
# Separate the data into labels and features

# Separate the y variable, the labels
y = df['AQI Category']

# Separate the X variable, the features
dfcopy = df.copy()
columns_to_remove = ["AQI Value", "Country", "City", "AQI Category", "CO AQI Category",
                     "Ozone AQI Category", "NO2 AQI Category", "PM2.5 AQI Category"]
x = dfcopy.drop(columns=columns_to_remove)

In [25]:
# Review the y variable Series
display(y.head())
display(y.tail())

0    Moderate
1        Good
2    Moderate
3        Good
4        Good
Name: AQI Category, dtype: object

23458    Unhealthy
23459         Good
23460         Good
23461     Moderate
23462     Moderate
Name: AQI Category, dtype: object

In [26]:
# Review the X variable DataFrame
display(x.head())
display(x.tail())

Unnamed: 0,CO AQI Value,Ozone AQI Value,NO2 AQI Value,PM2.5 AQI Value
0,1,36,0,51
1,1,5,1,41
2,1,39,2,66
3,1,34,0,20
4,0,22,0,6


Unnamed: 0,CO AQI Value,Ozone AQI Value,NO2 AQI Value,PM2.5 AQI Value
23458,3,154,2,184
23459,1,20,5,50
23460,1,22,1,50
23461,1,44,2,71
23462,1,38,0,70


In [27]:
# Check the balance of our target values
y.value_counts()

Good                              9936
Moderate                          9231
Unhealthy                         2227
Unhealthy for Sensitive Groups    1591
Very Unhealthy                     287
Hazardous                          191
Name: AQI Category, dtype: int64

In [28]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function
x_train, x_test, y_train, y_test = train_test_split(
    x, 
    y, 
    random_state = 1
)

In [29]:
# Import the LogisticRegression module from SKLearn
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
LR_model = LogisticRegression(random_state = 1)

# Fit the model using training data
LR_model.fit(x_train, y_train)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(random_state=1)

In [30]:
# Make a prediction using the testing data
LR_predictions = LR_model.predict(x_test)

In [31]:
# Print the balanced_accuracy score of the model
balanced_accuracy_score(y_test, LR_predictions)

0.5341096668227903

In [32]:
# Generate a confusion matrix for the model
cm = confusion_matrix(y_test, LR_predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual Good", "Actual Moderate", "Actual Unhealthy For Sensitive Groups","Actual Unhealthy", "Actual Very Unhealthy", "Actual Hazardous"], columns=["Predicted Good", "Predicted Moderate", "Predicted Unhealthy For Sensitive Groups", "Predicted Unhealthy", "Predicted Very Unhealthy", "Predicted Hazardous"]
)
cm_df

Unnamed: 0,Predicted Good,Predicted Moderate,Predicted Unhealthy For Sensitive Groups,Predicted Unhealthy,Predicted Very Unhealthy,Predicted Hazardous
Actual Good,2238,3,211,0,0,0
Actual Moderate,0,20,0,15,0,0
Actual Unhealthy For Sensitive Groups,106,3,2170,8,15,24
Actual Unhealthy,1,35,80,409,35,5
Actual Very Unhealthy,3,15,307,69,27,3
Actual Hazardous,0,7,2,52,3,0


In [33]:
# Print the classification report for the model
print(classification_report_imbalanced(y_test, LR_predictions))

                                      pre       rec       spe        f1       geo       iba       sup

                          Good       0.95      0.91      0.97      0.93      0.94      0.88      2452
                     Hazardous       0.24      0.57      0.99      0.34      0.75      0.54        35
                      Moderate       0.78      0.93      0.83      0.85      0.88      0.78      2326
                     Unhealthy       0.74      0.72      0.97      0.73      0.84      0.69       565
Unhealthy for Sensitive Groups       0.34      0.06      0.99      0.11      0.25      0.06       424
                Very Unhealthy       0.00      0.00      0.99      0.00      0.00      0.00        64

                   avg / total       0.81      0.83      0.92      0.81      0.85      0.75      5866



In [34]:
# Import the RandomOverSampler module form imbalanced-learn
from imblearn.over_sampling import RandomOverSampler

# Instantiate the random oversampler model
ROS_model = RandomOverSampler(random_state = 1)

# Fit the original training data to the random_oversampler model
x_oversampled, y_oversampled = ROS_model.fit_resample(x_train, y_train)

In [35]:
# Count the distinct values of the resampled labels data
y_oversampled.value_counts()

Moderate                          7484
Unhealthy                         7484
Good                              7484
Unhealthy for Sensitive Groups    7484
Very Unhealthy                    7484
Hazardous                         7484
Name: AQI Category, dtype: int64

In [36]:
# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
LR_oversampled_model = LogisticRegression(random_state = 1)

# Fit the model using the resampled training data
LR_oversampled_model.fit(x_oversampled, y_oversampled)

# Make a prediction using the testing data
LR_oversampled_pred = LR_oversampled_model.predict(x_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [37]:
# Print the balanced_accuracy score of the model 
balanced_accuracy_score(y_test, LR_oversampled_pred)

0.5074317170197137

In [38]:
# Generate a confusion matrix for the model
cm_oversampled = confusion_matrix(y_test, LR_oversampled_pred)
cm_oversampled_df = pd.DataFrame(cm_oversampled, 
                                index=["Actual Good", "Actual Moderate", "Actual Unhealthy For Sensitive Groups","Actual Unhealthy", "Actual Very Unhealthy", "Actual Hazardous"], columns=["Predicted Good", "Predicted Moderate", "Predicted Unhealthy For Sensitive Groups", "Predicted Unhealthy", "Predicted Very Unhealthy", "Predicted Hazardous"]
)
cm_oversampled_df


Unnamed: 0,Predicted Good,Predicted Moderate,Predicted Unhealthy For Sensitive Groups,Predicted Unhealthy,Predicted Very Unhealthy,Predicted Hazardous
Actual Good,1669,8,702,0,60,13
Actual Moderate,0,28,0,4,0,3
Actual Unhealthy For Sensitive Groups,137,44,1079,96,817,153
Actual Unhealthy,20,146,0,90,69,240
Actual Very Unhealthy,22,50,0,105,167,80
Actual Hazardous,1,19,0,9,0,35


In [39]:
# Print the classification report for the model
print(classification_report(y_test, LR_oversampled_pred))

                                precision    recall  f1-score   support

                          Good       0.90      0.68      0.78      2452
                     Hazardous       0.09      0.80      0.17        35
                      Moderate       0.61      0.46      0.53      2326
                     Unhealthy       0.30      0.16      0.21       565
Unhealthy for Sensitive Groups       0.15      0.39      0.22       424
                Very Unhealthy       0.07      0.55      0.12        64

                      accuracy                           0.52      5866
                     macro avg       0.35      0.51      0.34      5866
                  weighted avg       0.66      0.52      0.57      5866

