In [1]:
import pandas as pd
import numpy as np


# Load Dataset

In [4]:
import os

# Location of the dataset CSV. The default is the path this notebook was
# originally written against; set the ADULT_CSV_PATH environment variable to
# run the notebook on another machine without editing this cell.
DATA_PATH = os.environ.get(
    "ADULT_CSV_PATH",
    "C:/Users/91960/Documents/IBM Skillbuild/salary_prediction_app/Data/adult 3.csv",
)

# '?' is parsed as a missing value, in addition to pandas' default NA markers.
df = pd.read_csv(DATA_PATH, na_values='?')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,,103497,Some-college,10,Never-married,,Own-child,White,Female,0,0,30,United-States,<=50K


# Select Columns + Drop Missing

In [5]:
# Columns kept for modelling: four raw inputs plus the 'income' target.
selected_cols = ['age', 'education', 'occupation', 'hours-per-week', 'income']

# Restrict to those columns and drop every row with a missing value in any of
# them. Chaining dropna() produces a fresh frame instead of mutating a
# column-subset in place with inplace=True.
df = df[selected_cols].dropna()


# Map Education → Years + Compute Experience 

In [6]:
# Schooling figure assigned to each education label.
# NOTE(review): several entries below 'HS-grad' do not equal the grade number
# (e.g. '11th' -> 12, 'Preschool' -> 6). The values are kept exactly as they
# were; confirm they are intended.
edu_years = {
    'Preschool': 6, '1st-4th': 7, '5th-6th': 8, '7th-8th': 9,
    '9th': 10, '10th': 11, '11th': 12, '12th': 12,
    'HS-grad': 12, 'Some-college': 14, 'Assoc-acdm': 14,
    'Assoc-voc': 14, 'Bachelors': 16, 'Masters': 18,
    'Prof-school': 19, 'Doctorate': 20
}

df['edu_years'] = df['education'].map(edu_years)

# .map() yields NaN for any label missing from the table. The former
# max(0, x) step turned the resulting NaN experience into 0 without any
# signal, so surface unmapped labels explicitly instead.
unmapped_levels = df.loc[df['edu_years'].isna(), 'education'].unique().tolist()
if unmapped_levels:
    raise ValueError(
        f"No edu_years mapping for education level(s): {unmapped_levels}"
    )

# Estimated experience: age minus the schooling figure minus 6, floored at 0.
df['experience'] = (df['age'] - df['edu_years'] - 6).clip(lower=0)


In [7]:
# Preview the first five rows of the working frame.
df.head(n=5)


Unnamed: 0,age,education,occupation,hours-per-week,income,edu_years,experience
0,25,11th,Machine-op-inspct,40,<=50K,12,7
1,38,HS-grad,Farming-fishing,50,<=50K,12,20
2,28,Assoc-acdm,Protective-serv,40,>50K,14,8
3,44,Some-college,Machine-op-inspct,40,>50K,14,24
5,34,10th,Other-service,30,<=50K,11,17


# Train-Test Split

In [8]:
from sklearn.model_selection import train_test_split


def _is_high_income(label):
    """Return 1 when the label (ignoring surrounding whitespace) is '>50K', else 0."""
    return int(label.strip() == '>50K')


# Model inputs and the binary target derived from the 'income' column.
features = ['age', 'education', 'occupation', 'hours-per-week', 'experience']
X = df[features]
y = df['income'].apply(_is_high_income)

# Hold out 20% of the rows; stratifying on y keeps the class ratio the same
# in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


# Build Preprocessing + Pipeline

In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier  # first classifier tried

# Columns grouped by how they are preprocessed.
numeric = ['age', 'hours-per-week', 'experience']
categorical = ['education', 'occupation']

# Numeric columns are standardised; categorical columns are one-hot encoded,
# with categories unseen during fitting ignored at transform time.
scale_numeric = ('num', StandardScaler(), numeric)
encode_categorical = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical)
preprocessor = ColumnTransformer(transformers=[scale_numeric, encode_categorical])

# Preprocessing followed by a seeded random forest.
pipeline = Pipeline(steps=[
    ('pre', preprocessor),
    ('clf', RandomForestClassifier(random_state=42)),
])


# Train & Evaluate

In [10]:
from sklearn.metrics import classification_report

# Fit on the training split, then predict the held-out split.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Per-class precision / recall / F1 on the held-out rows.
report = classification_report(y_test, y_pred)
print(report)


              precision    recall  f1-score   support

           0       0.82      0.88      0.85      6923
           1       0.55      0.43      0.48      2284

    accuracy                           0.77      9207
   macro avg       0.69      0.66      0.67      9207
weighted avg       0.76      0.77      0.76      9207



# Model Comparison via Cross-Validation

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score

# Candidate classifiers, each evaluated behind the same preprocessing step.
# GradientBoosting is seeded like RandomForest so that re-running this cell
# reproduces the same scores.
models = {
    "RandomForest": RandomForestClassifier(random_state=42),
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "SVM": SVC(),
    "KNN": KNeighborsClassifier(),
    "GradientBoosting": GradientBoostingClassifier(random_state=42)
}

# 5-fold cross-validated accuracy on the training split only; the test split
# is not touched during model selection.
for name, model in models.items():
    pipe = Pipeline([
        ('pre', preprocessor),
        ('clf', model)
    ])
    scores = cross_val_score(pipe, X_train, y_train, cv=5, scoring='accuracy')
    print(f"{name}: {scores.mean():.4f} (+/- {scores.std():.4f})")


RandomForest: 0.7695 (+/- 0.0016)
LogisticRegression: 0.7914 (+/- 0.0018)
SVM: 0.7972 (+/- 0.0027)
KNN: 0.7702 (+/- 0.0027)
GradientBoosting: 0.7981 (+/- 0.0020)


# Save the Best Model

In [12]:
import os
import joblib
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import GradientBoostingClassifier

# Where the fitted pipeline is written (relative to the working directory).
MODEL_PATH = 'models/best_model.pkl'

# Features for preprocessing
numeric = ['age', 'hours-per-week', 'experience']
categorical = ['education', 'occupation']

# A fresh preprocessor is built here so the saved pipeline does not share a
# transformer object with the pipelines fitted in earlier cells.
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numeric),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical)
])

# Final ML pipeline. The classifier is seeded so that re-running the notebook
# produces the same persisted model.
final_pipeline = Pipeline([
    ('pre', preprocessor),
    ('clf', GradientBoostingClassifier(random_state=42))
])

# Train on training set
final_pipeline.fit(X_train, y_train)

# Ensure the output folder exists
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)

# Persist preprocessing + classifier together as one artifact
joblib.dump(final_pipeline, MODEL_PATH)

print(f"✅ Model saved to {MODEL_PATH}")


✅ Model saved to models/best_model.pkl
