**Adult Dataset ML Model Prediction Using Logistic Regression and Naive Bayes**

In [1]:
# Read the Adult CSV into the working DataFrame `df`.
# NOTE(review): the original UCI distribution of this dataset pads each field
# with a space after the comma. If this CSV keeps that padding, missing-value
# markers would be read as ' ?' rather than '?' — confirm against the file
# (read_csv's skipinitialspace=True would strip the padding).
import pandas as pd

df = pd.read_csv(filepath_or_buffer='../datasets/Adult/adult_dataset.csv')


In [2]:
# Data cleaning, in three steps:
#   1. treat the '?' placeholder as a missing value,
#   2. drop every row that contains any missing value,
#   3. keep only the rows whose numeric columns are all non-negative.
df = df.replace('?', pd.NA).dropna()
is_non_negative = (df.select_dtypes(include='number') >= 0).all(axis=1)
df = df[is_non_negative]


In [3]:
# Error Correction: remove outlier rows using per-column Z-scores
from scipy.stats import zscore
import numpy as np

# Absolute Z-scores, computed column-wise over the numeric columns only.
z_scores = np.abs(zscore(df.select_dtypes(include='number')))

# A row is an outlier when at least one of its Z-scores is 3 or more.
# This is phrased as "no value >= 3" instead of "all values < 3" on purpose:
# a zero-variance column produces NaN Z-scores, and NaN fails every
# comparison, so the `< 3` form would discard every row of the dataset.
# For finite Z-scores both forms select exactly the same rows.
is_outlier = (z_scores >= 3).any(axis=1)
df = df[~is_outlier]


In [4]:
# Data Transformation: integer-encode text columns, then standardize features
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Every object-dtype column is replaced by integer codes from its own,
# freshly fitted LabelEncoder (this covers 'income' too if it is stored as text).
# NOTE(review): scikit-learn documents LabelEncoder as a target encoder; the
# codes it assigns impose an arbitrary order on nominal features — confirm
# this is acceptable for the models trained below.
categorical_cols = df.select_dtypes(include='object').columns
for col in categorical_cols:
    df[col] = LabelEncoder().fit_transform(df[col])

# Target is the 'income' column; every other column is a feature.
y = df['income']
X = df.drop(columns='income')

# Standardize the features; the scaler's statistics come from all rows of X.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)


In [5]:
# Split the data into training and testing sets, then scale without leakage
from sklearn.model_selection import train_test_split

# Split the *unscaled* features. test_size and random_state are unchanged and
# X has the same number of rows as X_scaled, so each row lands in the same
# partition as it did when X_scaled was split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only and reuse those statistics for the
# test rows. X_scaled was produced by a scaler fit on every row, which lets
# the test set's mean/variance leak into training and biases the evaluation.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [6]:
# Logistic Regression: fit on the training split, score on the held-out split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# max_iter=1000 is the cap on solver iterations; fit() returns the estimator
# itself, so `lr` is the fitted model.
lr = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Predict the test rows and report the fraction predicted correctly.
y_pred_lr = lr.predict(X_test)
acc_lr = accuracy_score(y_true=y_test, y_pred=y_pred_lr)
print("Logistic Regression Accuracy:", acc_lr)


Logistic Regression Accuracy: 0.8215968112090832


In [7]:
# Gaussian Naive Bayes: fit on the training split, score on the held-out split
from sklearn.naive_bayes import GaussianNB

# fit() returns the estimator itself, so `nb` is the fitted model.
nb = GaussianNB().fit(X_train, y_train)

# Predict the test rows and report the fraction predicted correctly
# (accuracy_score is imported in the Logistic Regression cell).
y_pred_nb = nb.predict(X_test)
acc_nb = accuracy_score(y_true=y_test, y_pred=y_pred_nb)
print("Naïve Bayes Accuracy:", acc_nb)


Naïve Bayes Accuracy: 0.8011837178403188


In [8]:
# Side-by-side summary of both models' test accuracy, rounded to two decimals
comparison_report = f"Accuracy Comparison:\n  Logistic Regression: {acc_lr:.2f}\n  Naive Bayes: {acc_nb:.2f}"
print(comparison_report)


Accuracy Comparison:
  Logistic Regression: 0.82
  Naive Bayes: 0.80
