This notebook covers model training, with a step-by-step explanation of what we are trying to achieve. It builds on the previous notebooks.

In [13]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Read the cleaned dataset from disk and report its (rows, columns) shape.
data_path = '../data/adult_clean.csv'
df = pd.read_csv(data_path)
print(df.shape)

(45222, 14)


In [14]:
# Remove columns that should not be used as features, if they are present.
# NOTE(review): 'Unnamed: 0' is presumably a stray index column written by an
# earlier to_csv call and 'fnlwgt' a sampling weight -- confirm against the
# data-cleaning notebook.
cols_to_drop = [col for col in ('Unnamed: 0', 'fnlwgt') if col in df.columns]
df = df.drop(columns=cols_to_drop)
print(df.shape)
print(df.columns.tolist())

# Write the trimmed frame back only when something was actually removed.
# Rewriting the input file unconditionally on every run is an unnecessary side
# effect on the very file this notebook loads from.
if cols_to_drop:
    df.to_csv('../data/adult_clean.csv', index=False)

(45222, 14)
['age', 'workclass', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'class']


In [15]:
# Separate the label column from the predictor columns.
target_col = 'class'
y = df[target_col]
X = df.drop(columns=[target_col])

print(f"Features: {X.shape}")
print(f"Target: {y.shape}")
# Relative class frequencies show how balanced the target is.
class_share = y.value_counts(normalize=True)
print(f"Target distribution:\n{class_share}")

Features: (45222, 13)
Target: (45222,)
Target distribution:
class
<=50K    0.752156
>50K     0.247844
Name: proportion, dtype: float64


In [16]:
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# proportions aligned between the two splits, and the fixed random_state makes
# the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set: {X_train.shape}, {y_train.shape} rows")
print(f"Test set: {X_test.shape}, {y_test.shape} rows")
train_class_share = y_train.value_counts(normalize=True)
print(f"Training target distribution:\n{train_class_share}")

Training set: (36177, 13), (36177,) rows
Test set: (9045, 13), (9045,) rows
Training target distribution:
class
<=50K    0.752163
>50K     0.247837
Name: proportion, dtype: float64


In [17]:
# Split the feature columns by dtype so each group can be routed to its own
# preprocessing step.
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
# 'number' matches every numeric dtype (int32, float32, ...), not only the
# 64-bit ones, so narrower numeric columns are not left out.
numeric_cols = X.select_dtypes(include='number').columns.tolist()

# A column that lands in neither list would be dropped without warning by a
# ColumnTransformer using the default remainder='drop'. Fail loudly instead.
covered_cols = set(categorical_cols) | set(numeric_cols)
uncovered_cols = [col for col in X.columns if col not in covered_cols]
assert not uncovered_cols, f"Columns with unhandled dtypes: {uncovered_cols}"

print(f"Categorical: {categorical_cols}")
print(f"Numeric: {numeric_cols}")

Categorical: ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']
Numeric: ['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']


In [18]:
# Per-column-group preprocessing: standardise the numeric features and one-hot
# encode the categorical ones. handle_unknown='ignore' lets the encoder accept
# categories at transform time that it did not see during fit.
numeric_step = ('num', StandardScaler(), numeric_cols)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [19]:
# Chain preprocessing and the classifier into a single estimator, so the
# transformers are fitted as part of model.fit and reused by model.predict.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000, random_state=42)),
]
model = Pipeline(steps=pipeline_steps)


In [20]:
# Fit the whole pipeline (preprocessing + classifier) on the training split.
model.fit(X=X_train, y=y_train)
print("Model trained.")

Model trained.


In [21]:
# Predict a class label for every row of the held-out test split.
y_pred = model.predict(X_test)

n_predictions = len(y_pred)
first_ten = y_pred[:10]
print(f"Predictions: {n_predictions}")
print(f"Sample Predictions: {first_ten}")

Predictions: 9045
Sample Predictions: ['<=50K' '<=50K' '<=50K' '>50K' '<=50K' '<=50K' '<=50K' '<=50K' '<=50K'
 '<=50K']


In [22]:
# accuracy_score is imported in the notebook's top import cell.
# Accuracy is the share of test rows whose predicted label equals the true label.
# About 75% of rows belong to the '<=50K' class (see the target distribution
# printed earlier), so always predicting that class would already score ~0.75.
# Read the accuracy against that baseline.
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")

Model Accuracy: 0.8452
