# Adult Income Classification Experiments

## Import Libraries Needed for Experiment

In [None]:
# Import Python Libraries
import sys
import warnings
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from category_encoders import TargetEncoder
from imblearn.over_sampling import SMOTENC
from xgboost import XGBClassifier

# Append Path to Prevent Module Not Found Errors in Notebooks
# NOTE(review): this is an absolute, machine-specific path; the project
# imports below only resolve on a machine where this directory exists.
sys.path.append('C:/Users/ameen/OneDrive/Documents/github/Classic-ML-Models/src')
# Suppresses every warning for the rest of the session, including
# convergence and deprecation warnings raised while fitting models.
warnings.filterwarnings('ignore')

# Import Custom Modules/Classes/Functions
from utils.helpers.etl_helpers import extract_csv, train_test_val_split
from utils.helpers.visual_helpers import cat_distribution, num_distribution
from utils.aws.s3 import S3Buckets
# Handle used later to read the raw dataset from S3
# (presumably an authenticated client wrapper -- confirm in utils.aws.s3)
s3_conn = S3Buckets.credentials()

## Import Dataset and View Dataset Information

In [None]:
# Read the raw Adult dataset from its S3 bucket and load it into a dataframe
raw_bucket, raw_key = "jibbs-raw-datasets", "uncleaned_AdultData.csv"
csv_file = s3_conn.read_file(raw_bucket, raw_key)
df = extract_csv(csv_file)

In [None]:
# Work on an independent copy so the raw dataframe stays untouched,
# then print the column dtypes and non-null counts
adult = df.copy(deep=True)
adult.info()

## Identify Datatypes in Dataset

In [None]:
# Column the models will learn to predict
target_col = 'salary'

# Numerical features: every int64/float64 column
numerical_cols = list(adult.select_dtypes(include=['int64', 'float64']).columns)

# Categorical features: every object/category column except the target
categorical_cols = [
    name
    for name in adult.select_dtypes(include=['object', 'category']).columns
    if name != target_col
]

# Report how the columns were grouped
print("Target Column:", target_col)
print("Numerical Columns:", numerical_cols)
print("Categorical Columns:", categorical_cols)

## Check Data Distributions

In [None]:
# Report the cardinality of every categorical feature
unique_counts = adult[categorical_cols].nunique()
for col, n_unique in unique_counts.items():
    print(f"Number of Unique values in {col}: {n_unique}")

In [None]:
# Visualize the Distribution of the Target
# Uses target_col rather than repeating the column name, so the plot
# follows the target definition if it ever changes.
cat_distribution(adult, target_col)

In [None]:
# Plot the distribution of each categorical feature in turn
for cat_feature in categorical_cols:
    cat_distribution(adult, cat_feature)

In [None]:
# Plot the distribution of each numerical feature in turn
for num_feature in numerical_cols:
    num_distribution(adult, num_feature)

## Data Preprocessing and Feature Engineering

- **Scale Numeric Variables** : Numerical variables will be scaled to ensure that the model is not biased towards
variables with higher magnitudes in the features.

- **Encoding** : The target is label encoded and the categorical features are encoded as numeric data. I chose target encoding for the features to deal with the high cardinality of several categorical features.

- **Missing Values** : Missing numeric values would be imputed using the median, as it is less sensitive to outliers in the data. For categorical variables, missing values would be imputed using the mode of the feature. No imputation was performed here because the data came with no missing values.

- **Resample Target** : The training data will be resampled to balance the representation of the target classes. The distribution of the target classes is currently around 3:1 and could introduce some bias to the model since it is trained on more of one class than the other (I call this the familiarity bias).

In [None]:
# Rescale every numerical column to the [0, 1] range (MinMaxScaler default)
# NOTE(review): the scaler is fitted on the full dataframe, before the
# train/test/validation split made later in the notebook.
scaler = MinMaxScaler()
scaler.fit(adult[numerical_cols])
adult[numerical_cols] = scaler.transform(adult[numerical_cols])

In [None]:
# Replace the target's class labels with integer codes
le = LabelEncoder()
le.fit(adult[target_col])
adult[target_col] = le.transform(adult[target_col])


In [None]:
# Replace each categorical feature with its target-encoded numeric value
# NOTE(review): the encoder is fitted with the target on the full dataframe,
# before the train/test/validation split made later in the notebook, so
# validation and test labels influence the encoded features.
encoder = TargetEncoder()
encoded_features = encoder.fit_transform(adult[categorical_cols], adult[target_col])
adult[categorical_cols] = encoded_features

## Dataset Splits (Train, Test and Validation Datasets)

In [None]:
# Split the Data into Train, Test and Validation Sets
# The helper's six return values are unpacked as features then labels, each
# ordered train, test, validation.
# NOTE(review): the split proportions, shuffling and stratification are set
# inside train_test_val_split and are not visible here -- confirm there.
X_train, X_test, X_val, y_train, y_test, y_val = train_test_val_split(adult, target_col)

## Train Models, Evaluate Metrics on Validation and Test Datasets

In [None]:
# Oversample the minority class of the training split with SMOTENC and
# report the class balance before and after resampling.
print('Original train data distribution:')
counts_before = np.bincount(y_train)
print('Class 0:', counts_before[0], 'Class 1:', counts_before[1])

# The categorical column names tell SMOTENC which features are categorical
smote_nc = SMOTENC(categorical_features=categorical_cols, random_state=2024)

# Only the training split is resampled
X_resampled, y_resampled = smote_nc.fit_resample(X_train, y_train)

# Class balance after resampling
print('Resampled train data distribution:')
counts_after = np.bincount(y_resampled)
print('Class 0:', counts_after[0], 'Class 1:', counts_after[1])

- **Model** : The model is first trained on the data with its default hyperparameters (no tuning).

- **Model with Hyperparameter Tuning** : The model is also trained with its hyperparameters tuned to get the best performing set of hyperparameters based on the specified metric.

- **Metrics** : Several metrics such as accuracy, F1 Score, Precision, Recall and ROC-AUC Scores are added and the metric selected is based on the use case of the model.

### Logistic Regression - Base Model

In [None]:
# Baseline: logistic regression with default hyperparameters, trained on
# the resampled training data
base_model = LogisticRegression(random_state=2024)
base_model.fit(X=X_resampled, y=y_resampled)

In [None]:
# Check the Model Metrics
# Threshold metrics (accuracy, precision, recall, F1, confusion matrix) use
# the predicted class labels. ROC-AUC uses the predicted probability of the
# positive class: hard 0/1 predictions reduce the ROC curve to a single
# threshold and understate the model's ranking quality.
print("Model Metrics on Validation Set")
y_val_pred = base_model.predict(X_val)
y_val_score = base_model.predict_proba(X_val)[:, 1]
print("Model Accuracy:", accuracy_score(y_val, y_val_pred))
print("Precision:", precision_score(y_val, y_val_pred))
print("Recall:", recall_score(y_val, y_val_pred))
print("F1 Score:", f1_score(y_val, y_val_pred))
print("ROC-AUC:", roc_auc_score(y_val, y_val_score))
print("Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred))
print("\n")
print ("Model Metrics on Test Set")
y_test_pred = base_model.predict(X_test)
y_test_score = base_model.predict_proba(X_test)[:, 1]
print("Model Accuracy:", accuracy_score(y_test, y_test_pred))
print("Precision:", precision_score(y_test, y_test_pred))
print("Recall:", recall_score(y_test, y_test_pred))
print("F1 Score:", f1_score(y_test, y_test_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_test_score))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred))

In [None]:
# Hyperparameter Tuning using GridSearchCV
# Each LogisticRegression solver supports only some penalties, so the grid
# is a list of sub-grids holding valid penalty/solver pairs only. A single
# cross-product grid spends many of its fits on combinations that raise.
shared_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'max_iter': [100, 200, 300, 500]
}
param_grid = [
    # L2 is supported by every solver
    {**shared_grid, 'penalty': ['l2'],
     'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']},
    # L1 is supported by liblinear and saga only
    {**shared_grid, 'penalty': ['l1'], 'solver': ['liblinear', 'saga']},
    # Elastic-net is supported by saga only and requires an l1_ratio
    {**shared_grid, 'penalty': ['elasticnet'], 'solver': ['saga'], 'l1_ratio': [0.5]},
    # No penalty: spelled None (the 'none' string was removed from
    # scikit-learn); C is ignored without a penalty, so it is not searched
    {'penalty': [None], 'solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
     'max_iter': [100, 200, 300, 500]},
]

# Initialize the GridSearchCV object (default scoring for classifiers is accuracy)
grid_search = GridSearchCV(estimator=LogisticRegression(random_state=2024), param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

# Fit the grid search to the data
grid_search.fit(X_resampled, y_resampled)

# Print the best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# GridSearchCV refits the best configuration on the full resampled training
# data by default (refit=True), so the fitted winner is reused directly
# instead of being trained a second time.
tuned_base_model = grid_search.best_estimator_

In [None]:
# Check the Model Metrics
# Threshold metrics (accuracy, precision, recall, F1, confusion matrix) use
# the predicted class labels. ROC-AUC uses the predicted probability of the
# positive class: hard 0/1 predictions reduce the ROC curve to a single
# threshold and understate the model's ranking quality.
print("Model Metrics on Validation Set")
y_val_pred = tuned_base_model.predict(X_val)
y_val_score = tuned_base_model.predict_proba(X_val)[:, 1]
print("Model Accuracy:", accuracy_score(y_val, y_val_pred))
print("Precision:", precision_score(y_val, y_val_pred))
print("Recall:", recall_score(y_val, y_val_pred))
print("F1 Score:", f1_score(y_val, y_val_pred))
print("ROC-AUC:", roc_auc_score(y_val, y_val_score))
print("Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred))
print("\n")
print ("Model Metrics on Test Set")
y_test_pred = tuned_base_model.predict(X_test)
y_test_score = tuned_base_model.predict_proba(X_test)[:, 1]
print("Model Accuracy:", accuracy_score(y_test, y_test_pred))
print("Precision:", precision_score(y_test, y_test_pred))
print("Recall:", recall_score(y_test, y_test_pred))
print("F1 Score:", f1_score(y_test, y_test_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_test_score))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred))


### Decision Tree Classifier - Alternative Model 1

In [None]:
# Decision tree with default hyperparameters, trained on the resampled
# training data
alt_model_1 = DecisionTreeClassifier(random_state=2024)
alt_model_1.fit(X=X_resampled, y=y_resampled)

In [None]:
# Check the Model Metrics
# Threshold metrics (accuracy, precision, recall, F1, confusion matrix) use
# the predicted class labels. ROC-AUC uses the predicted probability of the
# positive class: hard 0/1 predictions reduce the ROC curve to a single
# threshold and understate the model's ranking quality.
print("Model Metrics on Validation Set")
y_val_pred = alt_model_1.predict(X_val)
y_val_score = alt_model_1.predict_proba(X_val)[:, 1]
print("Model Accuracy:", accuracy_score(y_val, y_val_pred))
print("Precision:", precision_score(y_val, y_val_pred))
print("Recall:", recall_score(y_val, y_val_pred))
print("F1 Score:", f1_score(y_val, y_val_pred))
print("ROC-AUC:", roc_auc_score(y_val, y_val_score))
print("Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred))
print("\n")
print ("Model Metrics on Test Set")
y_test_pred = alt_model_1.predict(X_test)
y_test_score = alt_model_1.predict_proba(X_test)[:, 1]
print("Model Accuracy:", accuracy_score(y_test, y_test_pred))
print("Precision:", precision_score(y_test, y_test_pred))
print("Recall:", recall_score(y_test, y_test_pred))
print("F1 Score:", f1_score(y_test, y_test_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_test_score))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred))

In [None]:
# Hyperparameter Tuning using GridSearchCV
# Define the parameter grid
# 'auto' is not listed for max_features: for classifiers it was an alias of
# 'sqrt' and is rejected by current scikit-learn releases, so it only added
# duplicate or failing fits.
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

# Initialize the GridSearchCV object (default scoring for classifiers is accuracy)
grid_search = GridSearchCV(estimator=DecisionTreeClassifier(random_state=2024), param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

# Fit the grid search to the data
grid_search.fit(X_resampled, y_resampled)

# Print the best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# GridSearchCV refits the best configuration on the full resampled training
# data by default (refit=True), so the fitted winner is reused directly
# instead of being trained a second time.
tuned_alt_model_1 = grid_search.best_estimator_

In [None]:
# Check the Model Metrics
# Threshold metrics (accuracy, precision, recall, F1, confusion matrix) use
# the predicted class labels. ROC-AUC uses the predicted probability of the
# positive class: hard 0/1 predictions reduce the ROC curve to a single
# threshold and understate the model's ranking quality.
print("Model Metrics on Validation Set")
y_val_pred = tuned_alt_model_1.predict(X_val)
y_val_score = tuned_alt_model_1.predict_proba(X_val)[:, 1]
print("Model Accuracy:", accuracy_score(y_val, y_val_pred))
print("Precision:", precision_score(y_val, y_val_pred))
print("Recall:", recall_score(y_val, y_val_pred))
print("F1 Score:", f1_score(y_val, y_val_pred))
print("ROC-AUC:", roc_auc_score(y_val, y_val_score))
print("Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred))
print("\n")
print ("Model Metrics on Test Set")
y_test_pred = tuned_alt_model_1.predict(X_test)
y_test_score = tuned_alt_model_1.predict_proba(X_test)[:, 1]
print("Model Accuracy:", accuracy_score(y_test, y_test_pred))
print("Precision:", precision_score(y_test, y_test_pred))
print("Recall:", recall_score(y_test, y_test_pred))
print("F1 Score:", f1_score(y_test, y_test_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_test_score))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred))

### Random Forest Classifier - Alternative Model 2

In [None]:
# Random forest with default hyperparameters, trained on the resampled
# training data
alt_model_2 = RandomForestClassifier(random_state=2024)
alt_model_2.fit(X=X_resampled, y=y_resampled)

In [None]:
# Check the Model Metrics
# Threshold metrics (accuracy, precision, recall, F1, confusion matrix) use
# the predicted class labels. ROC-AUC uses the predicted probability of the
# positive class: hard 0/1 predictions reduce the ROC curve to a single
# threshold and understate the model's ranking quality.
print("Model Metrics on Validation Set")
y_val_pred = alt_model_2.predict(X_val)
y_val_score = alt_model_2.predict_proba(X_val)[:, 1]
print("Model Accuracy:", accuracy_score(y_val, y_val_pred))
print("Precision:", precision_score(y_val, y_val_pred))
print("Recall:", recall_score(y_val, y_val_pred))
print("F1 Score:", f1_score(y_val, y_val_pred))
print("ROC-AUC:", roc_auc_score(y_val, y_val_score))
print("Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred))
print("\n")
print ("Model Metrics on Test Set")
y_test_pred = alt_model_2.predict(X_test)
y_test_score = alt_model_2.predict_proba(X_test)[:, 1]
print("Model Accuracy:", accuracy_score(y_test, y_test_pred))
print("Precision:", precision_score(y_test, y_test_pred))
print("Recall:", recall_score(y_test, y_test_pred))
print("F1 Score:", f1_score(y_test, y_test_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_test_score))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred))

In [None]:
# Hyperparameter Tuning using GridSearchCV
# Define the parameter grid
# 'auto' is not listed for max_features: for classifiers it was an alias of
# 'sqrt' and is rejected by current scikit-learn releases, so it only added
# duplicate or failing fits.
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'criterion': ['gini', 'entropy'],
    'max_depth': [10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

# Initialize the GridSearchCV object (default scoring for classifiers is accuracy).
# The search object gets its own name so it is never confused with the model.
grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=2024), param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

# Fit the grid search to the data
grid_search.fit(X_resampled, y_resampled)

# Print the best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# Use the configuration this search selected. GridSearchCV refits it on the
# full resampled training data by default (refit=True), so no second fit is
# needed, and no hard-coded copy of the parameters can drift out of date.
tuned_alt_model_2 = grid_search.best_estimator_

In [None]:
# Check the Model Metrics
# Threshold metrics (accuracy, precision, recall, F1, confusion matrix) use
# the predicted class labels. ROC-AUC uses the predicted probability of the
# positive class: hard 0/1 predictions reduce the ROC curve to a single
# threshold and understate the model's ranking quality.
print("Model Metrics on Validation Set")
y_val_pred = tuned_alt_model_2.predict(X_val)
y_val_score = tuned_alt_model_2.predict_proba(X_val)[:, 1]
print("Model Accuracy:", accuracy_score(y_val, y_val_pred))
print("Precision:", precision_score(y_val, y_val_pred))
print("Recall:", recall_score(y_val, y_val_pred))
print("F1 Score:", f1_score(y_val, y_val_pred))
print("ROC-AUC:", roc_auc_score(y_val, y_val_score))
print("Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred))
print("\n")
print ("Model Metrics on Test Set")
y_test_pred = tuned_alt_model_2.predict(X_test)
y_test_score = tuned_alt_model_2.predict_proba(X_test)[:, 1]
print("Model Accuracy:", accuracy_score(y_test, y_test_pred))
print("Precision:", precision_score(y_test, y_test_pred))
print("Recall:", recall_score(y_test, y_test_pred))
print("F1 Score:", f1_score(y_test, y_test_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_test_score))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred))

### XGBoost Classifier - Alternative Model 3

In [None]:
# XGBoost with default hyperparameters, trained on the resampled training
# data. The variable carries the "tuned_" prefix although no tuning has
# happened yet; the tuning cell further down rebinds the same name.
tuned_alt_model_3 = XGBClassifier(random_state=2024)
tuned_alt_model_3.fit(X=X_resampled, y=y_resampled)

In [None]:
# Check the Model Metrics
# Threshold metrics (accuracy, precision, recall, F1, confusion matrix) use
# the predicted class labels. ROC-AUC uses the predicted probability of the
# positive class: hard 0/1 predictions reduce the ROC curve to a single
# threshold and understate the model's ranking quality.
print("Model Metrics on Validation Set")
y_val_pred = tuned_alt_model_3.predict(X_val)
y_val_score = tuned_alt_model_3.predict_proba(X_val)[:, 1]
print("Model Accuracy:", accuracy_score(y_val, y_val_pred))
print("Precision:", precision_score(y_val, y_val_pred))
print("Recall:", recall_score(y_val, y_val_pred))
print("F1 Score:", f1_score(y_val, y_val_pred))
print("ROC-AUC:", roc_auc_score(y_val, y_val_score))
print("Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred))
print("\n")
print ("Model Metrics on Test Set")
y_test_pred = tuned_alt_model_3.predict(X_test)
y_test_score = tuned_alt_model_3.predict_proba(X_test)[:, 1]
print("Model Accuracy:", accuracy_score(y_test, y_test_pred))
print("Precision:", precision_score(y_test, y_test_pred))
print("Recall:", recall_score(y_test, y_test_pred))
print("F1 Score:", f1_score(y_test, y_test_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_test_score))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred))

In [None]:
# Hyperparameter Tuning using GridSearchCV
# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [10, 20, 30, 40, 50],
    'learning_rate': [ 0.001, 0.01, 0.1, 0.2, 0.3],
    'subsample': [0.5, 0.7, 1.0],
    'colsample_bytree': [0.5, 0.7, 1.0]
}

# Initialize the GridSearchCV object (default scoring for classifiers is accuracy)
grid_search = GridSearchCV(estimator=XGBClassifier(random_state=2024), param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

# Fit the grid search to the data
grid_search.fit(X_resampled, y_resampled)

# Print the best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# Keep the fitted winner of the search. GridSearchCV refits it on the full
# resampled training data by default (refit=True). The name must stay bound
# to a fitted estimator: rebinding it to a freshly constructed XGBClassifier
# after this point would make the metrics cell fail on predict().
tuned_alt_model_3 = grid_search.best_estimator_

Fitting 5 folds for each of 1125 candidates, totalling 5625 fits
Best Parameters: {'colsample_bytree': 0.7, 'learning_rate': 0.1, 'max_depth': 20, 'n_estimators': 200, 'subsample': 1.0}
Best Score: 0.890853155914734


In [None]:
# Check the Model Metrics
# Threshold metrics (accuracy, precision, recall, F1, confusion matrix) use
# the predicted class labels. ROC-AUC uses the predicted probability of the
# positive class: hard 0/1 predictions reduce the ROC curve to a single
# threshold and understate the model's ranking quality.
print("Model Metrics on Validation Set")
y_val_pred = tuned_alt_model_3.predict(X_val)
y_val_score = tuned_alt_model_3.predict_proba(X_val)[:, 1]
print("Model Accuracy:", accuracy_score(y_val, y_val_pred))
print("Precision:", precision_score(y_val, y_val_pred))
print("Recall:", recall_score(y_val, y_val_pred))
print("F1 Score:", f1_score(y_val, y_val_pred))
print("ROC-AUC:", roc_auc_score(y_val, y_val_score))
print("Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred))
print("\n")
print ("Model Metrics on Test Set")
y_test_pred = tuned_alt_model_3.predict(X_test)
y_test_score = tuned_alt_model_3.predict_proba(X_test)[:, 1]
print("Model Accuracy:", accuracy_score(y_test, y_test_pred))
print("Precision:", precision_score(y_test, y_test_pred))
print("Recall:", recall_score(y_test, y_test_pred))
print("F1 Score:", f1_score(y_test, y_test_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_test_score))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred))