In [4]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix ,roc_auc_score, brier_score_loss
from sklearn.metrics import classification_report
from sklearn.metrics import mean_squared_error
import time
from sklearn.model_selection import cross_val_predict,KFold
import tensorflow as tf

In [6]:
from ucimlrepo import fetch_ucirepo 
  
# Download the UCI ML Repository dataset with id 2 and keep the handle as `adult`.
adult = fetch_ucirepo(id=2)

# The repository object exposes features and targets as separate pandas frames;
# join them column-wise so that each row carries its own label.
X, y = adult.data.features, adult.data.targets
df = pd.concat([X, y], axis=1)

In [7]:
# Preview the first rows of the combined features + target frame.
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [8]:
# Pull out the raw (string) income labels so their spellings can be inspected.
target_values = df["income"]
target_values

0         <=50K
1         <=50K
2         <=50K
3         <=50K
4         <=50K
          ...  
48837    <=50K.
48838    <=50K.
48839    <=50K.
48840    <=50K.
48841     >50K.
Name: income, Length: 48842, dtype: object

In [9]:
# The income column contains each class in two spellings (with and without a
# trailing period), so all four strings are mapped onto the two binary labels.
income_to_binary = {
    '<=50K': 0,
    '<=50K.': 0,
    '>50K': 1,
    '>50K.': 1,
}
binary_target_values = df['income'].map(income_to_binary)

# Show the encoded 0/1 target series.
print(binary_target_values)


0        0
1        0
2        0
3        0
4        0
        ..
48837    0
48838    0
48839    0
48840    0
48841    1
Name: income, Length: 48842, dtype: int64


In [10]:
# Feature frame: everything except the target column.
census_data = df.drop(columns=['income'])

In [11]:
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import csr_matrix

# Columns with object dtype are the ones treated as categorical.
categorical_cols = list(census_data.select_dtypes(include='object').columns)

# One-hot encoder that emits a SciPy sparse matrix (sparse_output=True).
encoder = OneHotEncoder(sparse_output=True)

# Learn the categories and encode the categorical columns in a single pass.
# NOTE(review): only `categorical_cols` are passed through, so the non-object
# (numeric) columns of `census_data` are absent from `encoded_df` — confirm
# that dropping them from the model inputs is intentional.
categorical_frame = census_data[categorical_cols]
encoded_data = encoder.fit_transform(categorical_frame)

# Wrap the sparse matrix in a sparse-backed DataFrame with readable column names.
encoded_df = pd.DataFrame.sparse.from_spmatrix(
    encoded_data,
    columns=encoder.get_feature_names_out(),
)



In [12]:
# Hold out 20% of the rows for final testing; the fixed seed keeps the split
# identical between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    encoded_df,
    binary_target_values,
    train_size=0.80,
    random_state=42,
)

In [13]:
def calculate_performance_metrics(y_true, y_pred, y_score=None):
    """Compute confusion-matrix based metrics for a binary (0/1) classifier.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels, encoded as 0 (negative) and 1 (positive).
    y_pred : array-like
        Hard 0/1 predictions, same length as ``y_true``.
    y_score : array-like, optional
        Positive-class probabilities (or scores). When supplied they are used
        for the Brier score and ROC AUC. When omitted, ``y_pred`` is used for
        both (the historical behaviour); with hard labels the Brier score is
        simply the error rate and the AUC equals the balanced accuracy.

    Returns
    -------
    tuple
        ``(TP, TN, FP, FN, TPR, TNR, FPR, FNR, recall, precision, f1_score,
        accuracy, error_rate, balanced_accuracy, true_skill_statistics,
        heidke_skill_score, acc_by_package, brier_loss, auc)``.
        Any ratio whose denominator is zero is reported as ``nan``.
    """

    def _ratio(numerator, denominator):
        # A zero denominator means the ratio is undefined (e.g. no positive
        # samples, or no positive predictions); report NaN instead of failing.
        return numerator / denominator if denominator else float("nan")

    # Pin the label order so the matrix is always 2x2, even when one class is
    # missing from y_true / y_pred (otherwise indexing [1, 1] would fail).
    conf_matrix = confusion_matrix(y_true, y_pred, labels=[0, 1])
    # Row-major order of a [[TN, FP], [FN, TP]] matrix.
    TN, FP, FN, TP = (int(count) for count in conf_matrix.ravel())

    # Total actual positives and negatives.
    P = TP + FN
    N = TN + FP

    # Per-class rates.
    TPR = _ratio(TP, P)   # sensitivity
    TNR = _ratio(TN, N)   # specificity
    FPR = _ratio(FP, N)
    FNR = _ratio(FN, P)

    recall = TPR
    precision = _ratio(TP, TP + FP)
    f1_score = _ratio(2 * (precision * recall), precision + recall)

    accuracy = _ratio(TP + TN, P + N)
    error_rate = _ratio(FP + FN, P + N)
    balanced_accuracy = (TPR + TNR) / 2
    # TSS = TPR - FPR.
    true_skill_statistics = TPR - FPR
    heidke_skill_score = _ratio(
        2 * (TP * TN - FP * FN),
        (TP + FN) * (FN + TN) + (TP + FP) * (FP + TN),
    )

    # Cross-check of the hand-computed accuracy against scikit-learn.
    acc_by_package = accuracy_score(y_true, y_pred)

    # Probability-based metrics: prefer real scores when the caller has them.
    scores = y_pred if y_score is None else y_score
    brier_loss = brier_score_loss(y_true, scores)
    # ROC AUC is undefined when y_true holds a single class.
    auc = roc_auc_score(y_true, scores) if P and N else float("nan")

    return (TP, TN, FP, FN, TPR, TNR, FPR, FNR, recall, precision, f1_score, accuracy, error_rate, balanced_accuracy, true_skill_statistics, heidke_skill_score, acc_by_package, brier_loss, auc)



In [19]:
num_folds = 10

# Seeded, shuffled K-fold splitter used for the cross-validated predictions.
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Start the wall-clock timer for the decision-tree run.
start = time.time()

DT_Model = DecisionTreeClassifier(
    max_depth=8,
    min_samples_split=15,
    min_samples_leaf=7,
    random_state=42,
)

# Out-of-fold prediction for every training row, then score them in one go.
y_pred_cv_dt = cross_val_predict(DT_Model, X_train, Y_train, cv=kf)
performance_metrics_cv = calculate_performance_metrics(Y_train, y_pred_cv_dt)


In [20]:
# Captions are listed in the same order as the tuple returned by
# calculate_performance_metrics, so zip pairs each value with its caption.
metric_captions = (
    "True Positives (TP):",
    "True Negatives (TN):",
    "False Positives (FP):",
    "False Negatives (FN):",
    "True Positive Rate (TPR):",
    "True Negative Rate (TNR):",
    "False Positive Rate (FPR):",
    "False Negative Rate (FNR):",
    "Recall/Sensitivity (r):",
    "Precision (pr):",
    "F1 Score:",
    "Accuracy:",
    "Error Rate:",
    "Balanced Accuracy:",
    "True Skill Statistics (TSS):",
    "Heidke Skill Score (HSS):",
    "Accuracy by Package:",
    "Brier Score Loss:",
    "ROC AUC Score:",
)

print("Performance Metrics for 10-fold Cross-Validation:")
for caption, metric_value in zip(metric_captions, performance_metrics_cv):
    print(caption, metric_value)

# Elapsed time is measured from `start`, which was set in the cell that ran
# the cross-validation, up to the end of this reporting cell.
end = time.time()
total_time = end - start
print(f"Time taken for DT model to genearte the output: {total_time} seconds\n")


Performance Metrics for 10-fold Cross-Validation:
True Positives (TP): 4591
True Negatives (TN): 27710
False Positives (FP): 2031
False Negatives (FN): 4741
True Positive Rate (TPR): 0.49196313759108445
True Negative Rate (TNR): 0.9317104334084261
False Positive Rate (FPR): 0.06828956659157392
False Negative Rate (FNR): 0.5080368624089155
Recall/Sensitivity (r): 0.49196313759108445
Precision (pr): 0.6932950770160072
F1 Score: 0.5755296477372445
Accuracy: 0.8266833875054386
Error Rate: 0.17331661249456146
Balanced Accuracy: 0.7118367854997553
True Skill Statistics (TSS): 0.42367357099951053
Heidke Skill Score (HSS): 0.470559829013877
Accuracy by Package: 0.8266833875054386
Brier Score Loss: 0.17331661249456146
ROC AUC Score: 0.7118367854997552
Time taken for DT model to genearte the output: 4.674790143966675 seconds



In [24]:
# Start the wall-clock timer for the random-forest run.
start1 = time.time()

# Depth / split / leaf limits and seed mirror the decision-tree settings;
# verbose=1 makes the forest log its progress while fitting and predicting.
rndf_params = {
    "n_estimators": 100,
    "max_depth": 8,
    "min_samples_split": 15,
    "min_samples_leaf": 7,
    "random_state": 42,
    "verbose": 1,
}
RNDF_Model = RandomForestClassifier(**rndf_params)

# Out-of-fold predictions over the existing `kf` splitter, scored in one go.
y_pred_cv_rndf = cross_val_predict(RNDF_Model, X_train, Y_train, cv=kf)
performance_metrics_cv_rndf = calculate_performance_metrics(Y_train, y_pred_cv_rndf)

# Stop the timer and report the elapsed time.
end1 = time.time()
print(f"Time taken for Random Forest model: {end1 - start1} seconds")

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.7s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.7s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.7s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.7s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.7s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.7s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.7s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.7s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Do

Time taken for Random Forest model: 18.89300513267517 seconds


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s


In [25]:

# Captions are listed in the same order as the tuple returned by
# calculate_performance_metrics, so zip pairs each value with its caption.
metric_captions = (
    "True Positives (TP):",
    "True Negatives (TN):",
    "False Positives (FP):",
    "False Negatives (FN):",
    "True Positive Rate (TPR):",
    "True Negative Rate (TNR):",
    "False Positive Rate (FPR):",
    "False Negative Rate (FNR):",
    "Recall/Sensitivity (r):",
    "Precision (pr):",
    "F1 Score:",
    "Accuracy:",
    "Error Rate:",
    "Balanced Accuracy:",
    "True Skill Statistics (TSS):",
    "Heidke Skill Score (HSS):",
    "Accuracy by Package:",
    "Brier Score Loss:",
    "ROC AUC Score:",
)

print("Performance Metrics for 10-fold Cross-Validation with RandomForestClassifier:")
for caption, metric_value in zip(metric_captions, performance_metrics_cv_rndf):
    print(caption, metric_value)

# `end1` is re-taken here, so this figure also includes the time spent up to
# and including this reporting cell (measured from `start1`).
end1 = time.time()
total_time1 = end1 - start1
print(f"Time taken for RNDF model to genearte the output: {total_time1} seconds\n")


Performance Metrics for 10-fold Cross-Validation with RandomForestClassifier:
True Positives (TP): 4141
True Negatives (TN): 28167
False Positives (FP): 1574
False Negatives (FN): 5191
True Positive Rate (TPR): 0.44374196313759107
True Negative Rate (TNR): 0.9470764264819609
False Positive Rate (FPR): 0.05292357351803907
False Negative Rate (FNR): 0.5562580368624089
Recall/Sensitivity (r): 0.44374196313759107
Precision (pr): 0.7245844269466317
F1 Score: 0.5504087193460492
Accuracy: 0.8268625393494229
Error Rate: 0.1731374606505771
Balanced Accuracy: 0.695409194809776
True Skill Statistics (TSS): 0.390818389619552
Heidke Skill Score (HSS): 0.45076428323872025
Accuracy by Package: 0.8268625393494229
Brier Score Loss: 0.1731374606505771
ROC AUC Score: 0.695409194809776
Time taken for RNDF model to genearte the output: 18.90281867980957 seconds



In [30]:
from keras.models import Sequential
from keras.layers import LSTM, Dense,Input

In [31]:
# Show the (rows, columns) shape of the encoded training matrix.
print("Shape of input data array (X_train):", X_train.shape)

Shape of input data array (X_train): (39073, 105)


In [32]:
# Unpack the 2-D training matrix into row count and column count.
num_samples, input_dim = X_train.shape
# The number of time steps is set equal to the number of feature columns.
time_steps = input_dim

for caption, value in (
    ("Number of samples:", num_samples),
    ("Number of time steps:", time_steps),
    ("Input dimension:", input_dim),
):
    print(caption, value)



Number of samples: 39073
Number of time steps: 105
Input dimension: 105


In [33]:
# Start the wall-clock timer for the LSTM section.
start2 = time.time()
num_folds = 10
# Seed the shuffle so the fold assignment is reproducible between runs,
# consistent with the seeded KFold used for the tree-based models.
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Running best accuracy and the decision threshold that produced it; both
# start at 0 and are expected to be updated by the cells that follow.
best_accuracy = 0
best_threshold = 0

In [36]:
# Elapsed wall-clock time since `start2` was recorded, i.e. it covers every
# cell executed between that point and this one.
# NOTE(review): the cells between In[33] and In[36] are not visible here —
# presumably they build and train the LSTM; confirm.
end2 = time.time()
total_time2 = end2 - start2
print(f"Time taken for LSTM model to genearte the output: {total_time2} seconds\n")



Time taken for LSTM model to genearte the output: 3974.450767993927 seconds



In [37]:
# Reshape the test matrix to (samples, features, 1). The feature count is read
# from X_test itself rather than hard-coded, so the cell keeps working if the
# encoded width changes.
X_test_re = X_test.values.reshape(X_test.shape[0], X_test.shape[1], 1)

# NOTE(review): `model` and the tuned `best_threshold` come from cells that are
# not visible here — confirm which fitted model is being evaluated.
predictions_test = model.predict(X_test_re)

# Scores at or below the threshold become class 0, everything else class 1
# (vectorized equivalent of the per-element comparison).
binary_predictions_test = np.where(
    predictions_test.ravel() <= best_threshold, 0, 1
).tolist()

# Calculate performance metrics for the test set
performance_metrics_cv_LSTM = calculate_performance_metrics(Y_test, binary_predictions_test)


[1m306/306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 21ms/step


In [38]:
# Report the LSTM metrics. They are computed from predictions on X_test scored
# against Y_test, so the header names the held-out test set rather than
# cross-validation.
# Captions are listed in the same order as the tuple returned by
# calculate_performance_metrics, so zip pairs each value with its caption.
metric_captions = (
    "True Positives (TP):",
    "True Negatives (TN):",
    "False Positives (FP):",
    "False Negatives (FN):",
    "True Positive Rate (TPR):",
    "True Negative Rate (TNR):",
    "False Positive Rate (FPR):",
    "False Negative Rate (FNR):",
    "Recall/Sensitivity (r):",
    "Precision (pr):",
    "F1 Score:",
    "Accuracy:",
    "Error Rate:",
    "Balanced Accuracy:",
    "True Skill Statistics (TSS):",
    "Heidke Skill Score (HSS):",
    "Accuracy by Package:",
    "Brier Score Loss:",
    "ROC AUC Score:",
)

print("Performance Metrics on the held-out test set with LSTM Classifier:")
for caption, metric_value in zip(metric_captions, performance_metrics_cv_LSTM):
    print(caption, metric_value)

Performance Metrics for 10-fold Cross-Validation with LSTM Classifier:
True Positives (TP): 5
True Negatives (TN): 7387
False Positives (FP): 27
False Negatives (FN): 2350
True Positive Rate (TPR): 0.0021231422505307855
True Negative Rate (TNR): 0.9963582411653629
False Positive Rate (FPR): 0.003641758834637173
False Negative Rate (FNR): 0.9978768577494692
Recall/Sensitivity (r): 0.0021231422505307855
Precision (pr): 0.15625
F1 Score: 0.0041893590280687055
Accuracy: 0.7566792916368104
Error Rate: 0.24332070836318967
Balanced Accuracy: 0.4992406917079468
True Skill Statistics (TSS): -0.0015186165841063874
Heidke Skill Score (HSS): -0.0022889445703778806
Accuracy by Package: 0.7566792916368104
Brier Score Loss: 0.24332070836318967
ROC AUC Score: 0.4992406917079468
