In [12]:
import os

import kagglehub
import numpy as np
import pandas as pd

# 1. Download the dataset folder into the local kagglehub cache
dataset_path = kagglehub.dataset_download("uciml/adult-census-income")

# 2. Construct the exact file path to the CSV
csv_path = os.path.join(dataset_path, "adult.csv")
print(f"Dataset downloaded to: {csv_path}")

# 3. Load it with pandas. Malformed rows are still dropped, but "warn" reports
#    each one instead of discarding it silently, so unexpected data loss is
#    visible in the cell output.
df = pd.read_csv(csv_path, encoding="latin-1", on_bad_lines="warn")

print("First 5 records:")
display(df.head())

Dataset downloaded to: C:\Users\L\.cache\kagglehub\datasets\uciml\adult-census-income\versions\3\adult.csv
First 5 records:


Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [13]:
# Clean the raw frame in one chained expression instead of three in-place
# mutations:
#   1. turn the dataset's '?' placeholder into NaN,
#   2. drop every row that has a missing value,
#   3. drop exact duplicate rows.
# `np` comes from the notebook's import cell; `df` is rebound to the cleaned
# frame because the later cells read it under that name.
df = (
    df.replace('?', np.nan)
    .dropna()
    .drop_duplicates()
)

print(f"Cleaned dataset shape: {df.shape}")

Cleaned dataset shape: (30139, 15)


In [14]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Features are every column except the label
X = df.drop(columns='income')

# Fit the label encoder on the raw income strings (<=50K -> 0, >50K -> 1)
le = LabelEncoder()
y = le.fit_transform(df['income'])

# One-hot encode the categorical columns, dropping the first level of each
X_encoded = pd.get_dummies(X, drop_first=True)

# Hold out 20% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training split only, then apply it to both splits.
# Every encoded column is standardised, not just the originally numeric ones.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Data preprocessing complete!")

Data preprocessing complete!


In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Build a 100-tree random forest with a fixed seed and fit it on the scaled
# training split
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_scaled, y_train
)

# Score the held-out split
y_pred = model.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=le.classes_)

# Report overall accuracy followed by the per-class metrics
print(f"Model Accuracy: {test_accuracy * 100:.2f}%\n")
print("Classification Report:")
print(report)

Model Accuracy: 85.10%

Classification Report:
              precision    recall  f1-score   support

       <=50K       0.89      0.92      0.90      4580
        >50K       0.71      0.63      0.67      1448

    accuracy                           0.85      6028
   macro avg       0.80      0.78      0.79      6028
weighted avg       0.85      0.85      0.85      6028



In [16]:
import joblib

# Persist everything the serving apps need: the fitted model, the fitted
# scaler, and the exact column order produced by one-hot encoding.
artifacts = {
    'income_model.pkl': model,
    'scaler.pkl': scaler,
    'model_columns.pkl': list(X_encoded.columns),
}
for filename, artifact in artifacts.items():
    joblib.dump(artifact, filename)

print("Model and preprocessors successfully saved to disk!")

Model and preprocessors successfully saved to disk!


In [17]:
%%writefile app.py
from flask import Flask, request, jsonify
import joblib
import pandas as pd
import traceback

app = Flask(__name__)

# Load, in order, the fitted model, the fitted scaler and the list of
# training column names from the working directory at import time.
model, scaler, model_columns = (
    joblib.load(artifact_file)
    for artifact_file in ('income_model.pkl', 'scaler.pkl', 'model_columns.pkl')
)

@app.route('/predict', methods=['POST'])
def predict():
    """Predict the income bracket for the record(s) in the JSON request body.

    Accepted payloads:
      * a single JSON object of scalar values (one record),
      * a JSON array of such objects (several records),
      * a column-oriented object whose values are lists.

    Returns:
        200 with ``{'prediction': <label of the first record>,
        'predictions': [<label per record>]}`` on success;
        400 with ``{'error': ...}`` when the body is missing, not JSON, or empty;
        500 with ``{'error': ...}`` when preprocessing or inference fails
        (a ``'trace'`` key is added only while the app runs in debug mode);
        503 with ``{'error': ...}`` when no model is loaded.
    """
    # Explicit None check: truthiness of an estimator object is not a reliable
    # "is it loaded" signal.
    if model is None:
        return jsonify({'error': 'No model loaded'}), 503

    # silent=True yields None for a missing or malformed JSON body instead of
    # raising, so bad requests get a clean 400.
    payload = request.get_json(silent=True)
    if not isinstance(payload, (dict, list)) or not payload:
        return jsonify({'error': 'Request body must be a non-empty JSON object or array of objects'}), 400
    if isinstance(payload, list) and not all(isinstance(record, dict) for record in payload):
        return jsonify({'error': 'Every element of the JSON array must be an object'}), 400

    # A flat object of scalars is a single record; pandas cannot build a frame
    # from all-scalar values without an index, so wrap it in a list.
    if isinstance(payload, dict) and not any(
        isinstance(value, (list, dict)) for value in payload.values()
    ):
        payload = [payload]

    try:
        query_df = pd.DataFrame(payload)
        if query_df.empty:
            return jsonify({'error': 'Request body contains no records'}), 400

        # One-hot encode the incoming data
        query_encoded = pd.get_dummies(query_df)

        # Align with the training columns: unknown columns are dropped and
        # columns absent from the request are filled with 0.
        query_encoded = query_encoded.reindex(columns=model_columns, fill_value=0)

        # Scale the features and run the model
        query_scaled = scaler.transform(query_encoded)
        prediction = model.predict(query_scaled)
    except Exception as e:
        # Keep the full traceback in the server log; only expose it to the
        # client while debugging.
        app.logger.exception('Prediction failed')
        body = {'error': str(e)}
        if app.debug:
            body['trace'] = traceback.format_exc()
        return jsonify(body), 500

    # Convert predictions back to label format
    labels = [">50K" if value == 1 else "<=50K" for value in prediction]

    # 'prediction' keeps the original single-record response shape;
    # 'predictions' carries one label per submitted record.
    return jsonify({'prediction': labels[0], 'predictions': labels})

if __name__ == '__main__':
    import os

    # debug=True turns on Werkzeug's interactive debugger, which lets anyone
    # who can reach the server execute arbitrary code. Keep it off by default
    # and opt in explicitly for local development with FLASK_DEBUG=1.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').strip().lower() in {'1', 'true', 'yes', 'on'}
    app.run(port=5000, debug=debug_enabled)

Overwriting app.py


In [19]:
%%writefile streamlit_app.py
import streamlit as st
import pandas as pd
import joblib

# Load the trained model and preprocessors saved by the training notebook.
# These are read from the current working directory on every script run.
model = joblib.load('income_model.pkl')
scaler = joblib.load('scaler.pkl')
model_columns = joblib.load('model_columns.pkl')

# The title emoji was stored as mis-decoded bytes; restored to the intended
# bar-chart character.
st.title("📊 Adult Income Prediction Dashboard")
st.write("Adjust the demographic parameters in the sidebar to predict if a person's income exceeds $50K/year.")

# 1. Create a sidebar for user inputs
st.sidebar.header("User Demographics")

# Ordinal `education.num` code that accompanies each education level offered
# in the sidebar. HS-grad=9 and Some-college=10 match the sample rows shown by
# the training notebook; the remaining codes follow the Adult dataset's
# published coding.
# TODO confirm with df.groupby('education')['education.num'].unique()
EDUCATION_NUM = {
    "Bachelors": 13,
    "HS-grad": 9,
    "11th": 7,
    "Masters": 14,
    "Some-college": 10,
}


def get_user_input():
    """Collect the profile from the sidebar widgets as a one-row DataFrame.

    Only a subset of the features is exposed in the UI; the rest are filled
    with fixed defaults. `education.num` is derived from the selected
    education level so the two related features never contradict each other.

    Returns:
        pandas.DataFrame: a single row (index 0) whose columns use the raw
        feature names from the training data.
    """
    age = st.sidebar.slider("Age", 17, 90, 39)
    workclass = st.sidebar.selectbox("Workclass", ["Private", "State-gov", "Federal-gov", "Self-emp-not-inc", "Local-gov"])
    # dict preserves insertion order, so the options appear as listed above
    education = st.sidebar.selectbox("Education", list(EDUCATION_NUM))
    marital_status = st.sidebar.selectbox("Marital Status", ["Never-married", "Married-civ-spouse", "Divorced"])
    occupation = st.sidebar.selectbox("Occupation", ["Adm-clerical", "Exec-managerial", "Prof-specialty", "Sales", "Craft-repair"])
    sex = st.sidebar.radio("Sex", ["Male", "Female"])
    hours_per_week = st.sidebar.slider("Hours per week", 1, 99, 40)

    # Bundle everything into a dictionary (using default values for unlisted features to keep the UI clean)
    user_data = {
        "age": age,
        "workclass": workclass,
        "fnlwgt": 189778,
        "education": education,
        "education.num": EDUCATION_NUM[education],
        "marital.status": marital_status,
        "occupation": occupation,
        "relationship": "Not-in-family",
        "race": "White",
        "sex": sex,
        "capital.gain": 0,
        "capital.loss": 0,
        "hours.per.week": hours_per_week,
        "native.country": "United-States"
    }
    return pd.DataFrame(user_data, index=[0])

# Store the input into a dataframe
input_df = get_user_input()

# Display the chosen parameters on the main page
st.subheader("Selected Profile")
st.write(input_df)

# 2. Add a prediction button
if st.button("Predict Income", type="primary"):

    # Match the preprocessing steps from the training phase
    query_encoded = pd.get_dummies(input_df)

    # Align with the training columns: extra columns are dropped and columns
    # missing from this single-row frame are filled with 0
    query_encoded = query_encoded.reindex(columns=model_columns, fill_value=0)

    # Scale the data
    query_scaled = scaler.transform(query_encoded)

    # Run the model
    prediction = model.predict(query_scaled)

    st.markdown("---")
    st.subheader("Prediction Result:")

    # Display the result. The leading emoji were stored as mis-decoded bytes;
    # restored to the intended check mark and cross.
    if prediction[0] == 1:
        st.success("✅ **>50K**: This profile is predicted to make over $50,000 annually.")
    else:
        st.error("❌ **<=50K**: This profile is predicted to make under $50,000 annually.")

Writing streamlit_app.py
