In [80]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

In [81]:
df = pd.read_csv("C:/Users/priya/OneDrive/Desktop/adult-income-project/data/adult/cleaned_adult.csv")
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


In [82]:
X = df.drop("income", axis=1)
y = df["income"]


In [83]:
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)


In [84]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [85]:
model_a = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(max_iter=1000))
])

model_a.fit(X_train, y_train)


0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [86]:
y_pred_a = model_a.predict(X_test)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_a))
print("Logistic Regression F1 Score:", f1_score(y_test, y_pred_a))


Logistic Regression Accuracy: 0.8475053870379579
Logistic Regression F1 Score: 0.6639883126369612


In [87]:
model_b = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(n_estimators=100, random_state=42))
])

model_b.fit(X_train, y_train)


0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [88]:
y_pred_b = model_b.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_b))
print("Random Forest F1 Score:", f1_score(y_test, y_pred_b))


Random Forest Accuracy: 0.8488314271506713
Random Forest F1 Score: 0.673352435530086


In [89]:
print("Model A (Logistic Regression) F1:", f1_score(y_test, y_pred_a))
print("Model B (Random Forest) F1:", f1_score(y_test, y_pred_b))


Model A (Logistic Regression) F1: 0.6639883126369612
Model B (Random Forest) F1: 0.673352435530086


In [90]:
import joblib

joblib.dump(model_b, "C:/Users/priya/OneDrive/Desktop/adult-income-project/models/final_model.joblib")


['C:/Users/priya/OneDrive/Desktop/adult-income-project/models/final_model.joblib']

# 03 — Machine Learning Modeling & Evaluation

This notebook focuses on building, evaluating, and comparing machine learning models to predict whether an individual earns more than $50K annually.

---

## 1. Feature–Target Separation
The dataset was split into:
- **X**: All input features (demographic and work-related columns)
- **y**: The target label (`income`), encoded as 0 or 1

---

## 2. Train–Test Split

The dataset was divided:
- 80% → Training set  
- 20% → Test set  

`stratify=y` was used to preserve the income class distribution.

---

## 3. Preprocessing Pipeline

A unified preprocessing strategy was created using `ColumnTransformer`:

### Numeric Features  
- Scaled using `StandardScaler` to normalize distributions.

### Categorical Features  
- Converted to numeric using `OneHotEncoder(handle_unknown="ignore")`.

A `Pipeline` was used to combine preprocessing and model training in one structured workflow.

---

## 4. Model A — Logistic Regression

- Serves as a baseline linear classifier.
- Uses sigmoid function to produce probabilities.
- Works best on linearly separable data.

The model was trained through the pipeline and evaluated using:
- Accuracy  
- F1 score  
- Classification report  

---

## 5. Model B — Random Forest Classifier

- Ensemble learning model composed of multiple decision trees.
- Captures nonlinear relationships and complex interactions.
- Generally performs well on tabular datasets like Adult Income.

Also evaluated using accuracy and F1 score.

---

## 6. Model Comparison (A/B Testing)

The models were compared on the test dataset:
- Logistic Regression → baseline  
- Random Forest → typically superior on this dataset  

**Random Forest achieved a higher F1 Score** and was selected as the final model.

---

## 7. Saving the Final Model

The final Random Forest pipeline was saved using `joblib` as:

models/final_model.joblib


This serialized model will be used for API deployment in the next phase.

---

## ✔️ Modeling Output
- Two trained models  
- Performance comparison  
- Final chosen model saved for deployment  
- Fully reproducible training pipeline
