In [1]:
!pip install pandas scikit-learn xgboost




[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score
from xgboost import XGBClassifier

In [4]:
# Load the dataset
file_path = r"E:\Research\A NHTS\NHTS Dataset\merged_data_clean.csv"
df = pd.read_csv(file_path)

# Check the first few rows of the data
df.head()

Unnamed: 0,HOUSEID,PERSONID,WTPERFIN,WTPERFIN5D,WTPERFIN2D,R_AGE,R_SEX,R_RELAT,WORKER,DRIVER,...,URBAN,URBANSIZE,URBRUR,PPT517,YOUNGCHILD,RESP_CNT,URBRUR_2010,TDAYDATE,WRKCOUNT,STRATUMID
0,9000013002,1,3938.688806,0.0,13363.809355,39,2,7,2,1,...,1,4,1,2,0,4,1,202202,1,1021
1,9000013016,1,3183.42081,4177.234452,0.0,32,2,7,1,1,...,1,2,1,0,0,2,1,202202,2,1021
2,9000013026,1,7727.266827,11702.30262,0.0,44,1,7,1,1,...,1,4,1,0,0,1,1,202202,1,1021
3,9000013039,1,12167.712239,12540.688961,0.0,38,1,7,1,1,...,1,4,1,2,0,4,1,202201,1,1021
4,9000013041,1,3206.344095,4228.326233,0.0,37,1,7,1,1,...,1,3,1,0,0,1,1,202201,1,1021


In [5]:
# Convert the COV1_OHD to binary (0 and 1) based on your criteria
df['COV1_OHD'] = df['COV1_OHD'].apply(lambda x: 0 if x == 1 else (1 if x in [2, 3, 4] else x))
# Check the counts of the binary variable
print(df['COV1_OHD'].value_counts())

COV1_OHD
1    4289
0    3479
Name: count, dtype: int64


In [6]:
# Define independent variables (IVs) and dependent variable (DV)
X = df[['R_AGE', 'HHFAMINC_IMP', 'EDUC', 'DELIV_FOOD', 'DELIV_GROC', 'COV2_OHD', 'COV1_WK', 'HOMEOWN']]
y = df['COV1_OHD']

In [7]:
# Split the dataset into train and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the number of samples in each set
print(f"Number of training samples: {X_train.shape[0]}")
print(f"Number of test samples: {X_test.shape[0]}")

# Save the train and test datasets to CSV files
X_train_df = pd.DataFrame(X_train, columns=X.columns)  # Convert X_train to DataFrame
X_test_df = pd.DataFrame(X_test, columns=X.columns)  # Convert X_test to DataFrame

# Save the corresponding target values (y_train, y_test) to CSV
y_train_df = pd.DataFrame(y_train, columns=['COV1_OHD'])
y_test_df = pd.DataFrame(y_test, columns=['COV1_OHD'])

# Save the dataframes to CSV files with correct filenames in the correct directory
X_train_df.to_csv(r'E:\Research\A NHTS\NHTS Dataset\X_train.csv', index=False)
X_test_df.to_csv(r'E:\Research\A NHTS\NHTS Dataset\X_test.csv', index=False)
y_train_df.to_csv(r'E:\Research\A NHTS\NHTS Dataset\y_train.csv', index=False)
y_test_df.to_csv(r'E:\Research\A NHTS\NHTS Dataset\y_test.csv', index=False)

print("Train and test datasets saved successfully.")

Number of training samples: 6214
Number of test samples: 1554
Train and test datasets saved successfully.


In [9]:
# Standardize the features using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [10]:
# SVM
svm = SVC()
svm.fit(X_train_scaled, y_train)
y_pred_svm = svm.predict(X_test_scaled)
print("SVM Classifier Report:")
print(classification_report(y_test, y_pred_svm))
print("Accuracy:", accuracy_score(y_test, y_pred_svm))

SVM Classifier Report:
              precision    recall  f1-score   support

           0       0.92      1.00      0.96       704
           1       1.00      0.93      0.96       850

    accuracy                           0.96      1554
   macro avg       0.96      0.96      0.96      1554
weighted avg       0.96      0.96      0.96      1554

Accuracy: 0.9588159588159588


In [11]:
# XGBoost
xgb = XGBClassifier()
xgb.fit(X_train, y_train)  # Fit the model with adjusted y_train
y_pred_xgb = xgb.predict(X_test)

# Evaluate the XGBoost model
print("XGBoost Classifier Report:")
print(classification_report(y_test, y_pred_xgb))
print("Accuracy:", accuracy_score(y_test, y_pred_xgb))


XGBoost Classifier Report:
              precision    recall  f1-score   support

           0       0.93      0.97      0.95       704
           1       0.97      0.94      0.96       850

    accuracy                           0.95      1554
   macro avg       0.95      0.95      0.95      1554
weighted avg       0.95      0.95      0.95      1554

Accuracy: 0.9517374517374517


In [10]:
# Logistic Regression
lr = LogisticRegression()
lr.fit(X_train_scaled, y_train)
y_pred_lr = lr.predict(X_test_scaled)
print("Logistic Regression Classifier Report:")
print(classification_report(y_test, y_pred_lr))
print("Accuracy:", accuracy_score(y_test, y_pred_lr))

Logistic Regression Classifier Report:
              precision    recall  f1-score   support

           0       0.80      0.62      0.70       704
           1       0.73      0.87      0.80       850

    accuracy                           0.76      1554
   macro avg       0.77      0.74      0.75      1554
weighted avg       0.76      0.76      0.75      1554

Accuracy: 0.7561132561132561


In [11]:
# Logistic Regression
lr = LogisticRegression()
lr.fit(X_train_scaled, y_train)
y_pred_lr = lr.predict(X_test_scaled)
print("Logistic Regression Classifier Report:")
print(classification_report(y_test, y_pred_lr))
print("Accuracy:", accuracy_score(y_test, y_pred_lr))

Logistic Regression Classifier Report:
              precision    recall  f1-score   support

           0       0.80      0.62      0.70       704
           1       0.73      0.87      0.80       850

    accuracy                           0.76      1554
   macro avg       0.77      0.74      0.75      1554
weighted avg       0.76      0.76      0.75      1554

Accuracy: 0.7561132561132561


In [12]:
# Install necessary libraries if they are not already installed
# !pip install pandas statsmodels

# Import necessary libraries
import pandas as pd
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Step 1: Load the merged dataset from the specified file path
file_path = r"E:\Research\A NHTS\NHTS Dataset\merged_data_clean.csv"
df = pd.read_csv(file_path)

# Step 2: List of independent variables (using the variables that you selected)
independent_vars = ['R_AGE', 'HHFAMINC_IMP', 'EDUC', 'DELIV_FOOD', 'DELIV_GROC', 
                    'COV2_OHD', 'COV1_WK', 'HOMEOWN']

# Step 3: Subset the dataframe to include only the independent variables
X = df[independent_vars]

# Step 4: Add a constant (intercept) to the dataset for the VIF calculation
X = sm.add_constant(X)

# Step 5: Calculate the VIF for each feature
vif_data = pd.DataFrame()
vif_data['Variable'] = X.columns
vif_data['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

# Step 6: Display the VIF results
print(vif_data)


       Variable        VIF
0         const  40.029321
1         R_AGE   1.307974
2  HHFAMINC_IMP   1.530576
3          EDUC   1.278855
4    DELIV_FOOD   1.658692
5    DELIV_GROC   1.594481
6      COV2_OHD   1.017819
7       COV1_WK   1.385392
8       HOMEOWN   1.179927


In [2]:
!pip install imbalanced-learn


Collecting imbalanced-learn
  Downloading imbalanced_learn-0.13.0-py3-none-any.whl.metadata (8.8 kB)
Collecting sklearn-compat<1,>=0.1 (from imbalanced-learn)
  Downloading sklearn_compat-0.1.3-py3-none-any.whl.metadata (18 kB)
Downloading imbalanced_learn-0.13.0-py3-none-any.whl (238 kB)
Downloading sklearn_compat-0.1.3-py3-none-any.whl (18 kB)
Installing collected packages: sklearn-compat, imbalanced-learn
Successfully installed imbalanced-learn-0.13.0 sklearn-compat-0.1.3



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# --------------------------------------------
# 1. Load Dataset
# --------------------------------------------
df = pd.read_csv(r"E:\Research\A NHTS\NHTS Dataset\merged_data_clean.csv")

# --------------------------------------------
# 2. Convert COV1_OHD to Binary (0 and 1) Based on Criteria
# --------------------------------------------
df['COV1_OHD'] = df['COV1_OHD'].apply(lambda x: 0 if x == 1 else (1 if x in [2, 3, 4] else x))

# Check the distribution of the binary target variable
print(df['COV1_OHD'].value_counts())

# --------------------------------------------
# 3. Define Target and Independent Variables
# --------------------------------------------
y = df['COV1_OHD']  # The updated binary target variable

# Make sure to remove rows with missing values
df = df.dropna(subset=['COV1_OHD'])  # Drop rows where 'COV1_OHD' is NaN

# --------------------------------------------
# 4. Define Final Independent Variables
# --------------------------------------------
iv_columns = [
    'R_AGE', 'HHFAMINC_IMP', 'EDUC', 'HOMEOWN',
    'DELIV_FOOD', 'DELIV_GROC', 'COV2_OHD', 'COV1_WK'
]

X = df[iv_columns].dropna()  # Ensure no missing values in the independent variables
y = y.loc[X.index]  # Align the target variable with the independent variables

# --------------------------------------------
# 5. Balance Classes Using SMOTE
# --------------------------------------------
X_bal, y_bal = SMOTE(random_state=42).fit_resample(X, y)

# --------------------------------------------
# 6. Train/Test Split
# --------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(X_bal, y_bal, test_size=0.2, random_state=42)

print(f"\n🧪 Dataset split:")
print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")
print(f"Total after balancing: {X_bal.shape[0]} samples")

# --------------------------------------------
# 7. Scale Features (SVM, KNN, SGD)
# --------------------------------------------
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_bal_scaled = scaler.fit_transform(X_bal)

# --------------------------------------------
# 8. Define Models
# --------------------------------------------
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Naive Bayes": GaussianNB(),
    "SVM (RBF)": SVC(kernel='rbf', probability=True),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5),
    "SGD Classifier": SGDClassifier(loss='log_loss', max_iter=1000, tol=1e-3, random_state=42),
    "AdaBoost": AdaBoostClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
}

# --------------------------------------------
# 9. Train and Evaluate on Train/Test Split
# --------------------------------------------
results = {}

for name, model in models.items():
    print(f"\n🚀 Training {name}...")

    if name in ["SVM (RBF)", "SGD Classifier", "K-Nearest Neighbors"]:
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

    results[name] = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1 Score": f1_score(y_test, y_pred)
    }

    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

# --------------------------------------------
# 10. Summary Table (Train/Test Split)
# --------------------------------------------
print("\n📊 Model Performance Summary (Train/Test Split):")
summary_df = pd.DataFrame(results).T.round(4)
print(summary_df)

# --------------------------------------------
# 11. 5-Fold Cross-Validation (Accuracy)
# --------------------------------------------
print("\n🔁 5-Fold Cross-Validation (Accuracy):\n")

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = {}

for name, model in models.items():
    print(f"📦 {name}")
    X_used = X_bal_scaled if name in ["SVM (RBF)", "SGD Classifier", "K-Nearest Neighbors"] else X_bal

    scores = cross_val_score(
        model,
        X=X_used,
        y=y_bal,
        scoring='accuracy',
        cv=cv
    )

    print(f"  Fold Accuracies : {np.round(scores, 4)}")
    print(f"  Mean Accuracy   : {scores.mean():.4f}")
    print("-" * 40)

    cv_scores[name] = round(scores.mean(), 4)

# --------------------------------------------
# 12. 5-Fold Accuracy Summary Table
# --------------------------------------------
print("\n📊 5-Fold Cross-Validation Accuracy Summary:")
cv_df = pd.DataFrame.from_dict(cv_scores, orient='index', columns=['5-Fold Accuracy'])
cv_df = cv_df.sort_values(by='5-Fold Accuracy', ascending=False)
print(cv_df)


COV1_OHD
1    4289
0    3479
Name: count, dtype: int64

🧪 Dataset split:
Training set: 6862 samples
Test set: 1716 samples
Total after balancing: 8578 samples

🚀 Training Logistic Regression...
              precision    recall  f1-score   support

           0       0.65      0.71      0.68       876
           1       0.67      0.60      0.63       840

    accuracy                           0.66      1716
   macro avg       0.66      0.66      0.66      1716
weighted avg       0.66      0.66      0.66      1716

Confusion Matrix:
[[621 255]
 [333 507]]

🚀 Training Naive Bayes...
              precision    recall  f1-score   support

           0       0.92      0.95      0.94       876
           1       0.94      0.92      0.93       840

    accuracy                           0.93      1716
   macro avg       0.93      0.93      0.93      1716
weighted avg       0.93      0.93      0.93      1716

Confusion Matrix:
[[830  46]
 [ 68 772]]

🚀 Training SVM (RBF)...
              prec

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


              precision    recall  f1-score   support

           0       0.94      0.98      0.96       876
           1       0.98      0.93      0.96       840

    accuracy                           0.96      1716
   macro avg       0.96      0.96      0.96      1716
weighted avg       0.96      0.96      0.96      1716

Confusion Matrix:
[[860  16]
 [ 55 785]]

📊 Model Performance Summary (Train/Test Split):
                     Accuracy  Precision  Recall  F1 Score
Logistic Regression    0.6573     0.6654  0.6036    0.6330
Naive Bayes            0.9336     0.9438  0.9190    0.9312
SVM (RBF)              0.9575     0.9961  0.9167    0.9547
Random Forest          0.9580     0.9848  0.9286    0.9559
Decision Tree          0.9353     0.9253  0.9440    0.9346
K-Nearest Neighbors    0.9394     0.9600  0.9143    0.9366
SGD Classifier         0.6638     0.6524  0.6702    0.6612
AdaBoost               0.9610     0.9911  0.9286    0.9588
XGBoost                0.9586     0.9800  0.9345    

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


  Fold Accuracies : [0.965  0.9575 0.9615 0.9592 0.9656]
  Mean Accuracy   : 0.9618
----------------------------------------

📊 5-Fold Cross-Validation Accuracy Summary:
                     5-Fold Accuracy
AdaBoost                      0.9656
SVM (RBF)                     0.9634
Random Forest                 0.9623
XGBoost                       0.9618
K-Nearest Neighbors           0.9500
Naive Bayes                   0.9393
Decision Tree                 0.9379
Logistic Regression           0.6815
SGD Classifier                0.6113


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [6]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# --------------------------------------------
# 1. Load Dataset
# --------------------------------------------
df = pd.read_csv(r"E:\Research\A NHTS\NHTS Dataset\merged_data_clean.csv")

# --------------------------------------------
# 2. Convert COV1_OHD to Binary (0 and 1) Based on Criteria
# --------------------------------------------
df['COV1_OHD'] = df['COV1_OHD'].apply(lambda x: 0 if x == 1 else (1 if x in [2, 3, 4] else x))

# Check the distribution of the binary target variable
print(df['COV1_OHD'].value_counts())

# --------------------------------------------
# 3. Define Target and Independent Variables
# --------------------------------------------
y = df['COV1_OHD']  # The updated binary target variable

# Make sure to remove rows with missing values
df = df.dropna(subset=['COV1_OHD'])  # Drop rows where 'COV1_OHD' is NaN

# --------------------------------------------
# 4. Define Final Independent Variables
# --------------------------------------------
iv_columns = [
    'R_AGE', 'HHFAMINC_IMP', 'EDUC', 'HOMEOWN',
    'HHSIZE', 'DELIV_FOOD', 'DELIV_GROC', 'DELIV_GOOD', 
    'COV2_OHD', 'COV1_WK', 'WRKCOUNT', 'LIF_CYC', 'URBRUR', 'MSACAT'
]

X = df[iv_columns].dropna()  # Ensure no missing values in the independent variables
y = y.loc[X.index]  # Align the target variable with the independent variables

# --------------------------------------------
# 5. Balance Classes Using SMOTE
# --------------------------------------------
X_bal, y_bal = SMOTE(random_state=42).fit_resample(X, y)

# --------------------------------------------
# 6. Train/Test Split
# --------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(X_bal, y_bal, test_size=0.2, random_state=42)

print(f"\n🧪 Dataset split:")
print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")
print(f"Total after balancing: {X_bal.shape[0]} samples")

# --------------------------------------------
# 7. Scale Features (SVM, KNN, SGD)
# --------------------------------------------
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_bal_scaled = scaler.fit_transform(X_bal)

# --------------------------------------------
# 8. Define Models
# --------------------------------------------
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Naive Bayes": GaussianNB(),
    "SVM (RBF)": SVC(kernel='rbf', probability=True),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5),
    "SGD Classifier": SGDClassifier(loss='log_loss', max_iter=1000, tol=1e-3, random_state=42),
    "AdaBoost": AdaBoostClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
}

# --------------------------------------------
# 9. Train and Evaluate on Train/Test Split
# --------------------------------------------
results = {}

for name, model in models.items():
    print(f"\n🚀 Training {name}...")

    if name in ["SVM (RBF)", "SGD Classifier", "K-Nearest Neighbors"]:
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

    results[name] = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1 Score": f1_score(y_test, y_pred)
    }

    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

# --------------------------------------------
# 10. Summary Table (Train/Test Split)
# --------------------------------------------
print("\n📊 Model Performance Summary (Train/Test Split):")
summary_df = pd.DataFrame(results).T.round(4)
print(summary_df)

# --------------------------------------------
# 11. 5-Fold Cross-Validation (Accuracy)
# --------------------------------------------
print("\n🔁 5-Fold Cross-Validation (Accuracy):\n")

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = {}

for name, model in models.items():
    print(f"📦 {name}")
    X_used = X_bal_scaled if name in ["SVM (RBF)", "SGD Classifier", "K-Nearest Neighbors"] else X_bal

    scores = cross_val_score(
        model,
        X=X_used,
        y=y_bal,
        scoring='accuracy',
        cv=cv
    )

    print(f"  Fold Accuracies : {np.round(scores, 4)}")
    print(f"  Mean Accuracy   : {scores.mean():.4f}")
    print("-" * 40)

    cv_scores[name] = round(scores.mean(), 4)

# --------------------------------------------
# 12. 5-Fold Accuracy Summary Table
# --------------------------------------------
print("\n📊 5-Fold Cross-Validation Accuracy Summary:")
cv_df = pd.DataFrame.from_dict(cv_scores, orient='index', columns=['5-Fold Accuracy'])
cv_df = cv_df.sort_values(by='5-Fold Accuracy', ascending=False)
print(cv_df)


COV1_OHD
1    4289
0    3479
Name: count, dtype: int64

🧪 Dataset split:
Training set: 6862 samples
Test set: 1716 samples
Total after balancing: 8578 samples

🚀 Training Logistic Regression...
              precision    recall  f1-score   support

           0       0.66      0.67      0.66       876
           1       0.65      0.64      0.65       840

    accuracy                           0.66      1716
   macro avg       0.65      0.65      0.65      1716
weighted avg       0.65      0.66      0.65      1716

Confusion Matrix:
[[586 290]
 [302 538]]

🚀 Training Naive Bayes...
              precision    recall  f1-score   support

           0       0.85      0.91      0.88       876
           1       0.90      0.83      0.86       840

    accuracy                           0.87      1716
   macro avg       0.87      0.87      0.87      1716
weighted avg       0.87      0.87      0.87      1716

Confusion Matrix:
[[798  78]
 [144 696]]

🚀 Training SVM (RBF)...
              prec

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


  Fold Accuracies : [0.6649 0.6702 0.6941 0.6857 0.6793]
  Mean Accuracy   : 0.6788
----------------------------------------
📦 Naive Bayes
  Fold Accuracies : [0.8712 0.8683 0.8893 0.8776 0.8892]
  Mean Accuracy   : 0.8791
----------------------------------------
📦 SVM (RBF)
  Fold Accuracies : [0.9703 0.9551 0.9627 0.9603 0.9621]
  Mean Accuracy   : 0.9621
----------------------------------------
📦 Random Forest
  Fold Accuracies : [0.9679 0.9551 0.9668 0.9609 0.9673]
  Mean Accuracy   : 0.9636
----------------------------------------
📦 Decision Tree
  Fold Accuracies : [0.9487 0.9353 0.9435 0.93   0.944 ]
  Mean Accuracy   : 0.9403
----------------------------------------
📦 K-Nearest Neighbors
  Fold Accuracies : [0.8852 0.8631 0.88   0.8816 0.8741]
  Mean Accuracy   : 0.8768
----------------------------------------
📦 SGD Classifier
  Fold Accuracies : [0.5944 0.6952 0.6748 0.6093 0.6501]
  Mean Accuracy   : 0.6448
----------------------------------------
📦 AdaBoost
  Fold Accuracies

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


  Fold Accuracies : [0.9656 0.9575 0.9679 0.9603 0.9656]
  Mean Accuracy   : 0.9634
----------------------------------------

📊 5-Fold Cross-Validation Accuracy Summary:
                     5-Fold Accuracy
AdaBoost                      0.9654
Random Forest                 0.9636
XGBoost                       0.9634
SVM (RBF)                     0.9621
Decision Tree                 0.9403
Naive Bayes                   0.8791
K-Nearest Neighbors           0.8768
Logistic Regression           0.6788
SGD Classifier                0.6448


In [1]:
# Install necessary libraries if they are not already installed
# !pip install pandas statsmodels

# Import necessary libraries
import pandas as pd
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Step 1: Load the merged dataset from the specified file path
file_path = r"E:\Research\A NHTS\NHTS Dataset\merged_data_clean.csv"
df = pd.read_csv(file_path)

# Step 2: List of independent variables (using the variables that you selected)
independent_vars = ['R_AGE', 'HHFAMINC_IMP', 'EDUC', 'HOMEOWN', 
                    'HHSIZE', 'DELIV_FOOD', 'DELIV_GROC', 'DELIV_GOOD', 
                    'COV2_OHD', 'COV1_WK', 'WRKCOUNT', 'LIF_CYC', 'URBRUR', 'MSACAT']

# Step 3: Subset the dataframe to include only the independent variables
X = df[independent_vars]

# Step 4: Add a constant (intercept) to the dataset for the VIF calculation
X = sm.add_constant(X)

# Step 5: Calculate the VIF for each feature
vif_data = pd.DataFrame()
vif_data['Variable'] = X.columns
vif_data['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

# Step 6: Display the VIF results
print(vif_data)


        Variable        VIF
0          const  73.666475
1          R_AGE   2.261371
2   HHFAMINC_IMP   1.752683
3           EDUC   1.329706
4        HOMEOWN   1.242743
5         HHSIZE   1.644864
6     DELIV_FOOD   1.676033
7     DELIV_GROC   1.602387
8     DELIV_GOOD   1.117242
9       COV2_OHD   1.030311
10       COV1_WK   2.145028
11      WRKCOUNT   2.663177
12       LIF_CYC   2.043388
13        URBRUR   1.213329
14        MSACAT   1.245051
