In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
import joblib

#  1. Data Loading and Initial Cleaning

# Load the dataset (path is relative to the current working directory)
df = pd.read_csv('adult3.csv')

# Treat the literal '?' placeholder as a missing value
df.replace('?', np.nan, inplace=True)

# Impute missing values in the listed categorical columns with each column's
# mode (its most frequent value).
# NOTE: the filled Series is assigned back to the column on purpose.
# `df[col].fillna(..., inplace=True)` acts on an intermediate object: it emits
# a FutureWarning in current pandas and, from pandas 3.0 on, no longer updates
# `df` at all, which would silently leave the NaNs in place.
for col in ['workclass', 'occupation', 'native-country']:
    df[col] = df[col].fillna(df[col].mode()[0])

# Drop the columns that are not used as model features
df.drop(['educational-num', 'fnlwgt'], axis=1, inplace=True)

# Convert income to a binary target: 1 for '>50K' (after stripping surrounding
# whitespace), 0 for every other label
df['income'] = df['income'].apply(lambda x: 1 if x.strip() == '>50K' else 0)


#  2. Data Preprocessing for Modeling

# Target vector (y) and feature matrix (X: every column except the target)
y = df['income']
X = df.drop(columns='income')

# Column groups by dtype: numeric columns get scaled, object (string) columns
# get one-hot encoded
numerical_features = X.select_dtypes(include=np.number).columns
categorical_features = X.select_dtypes(include=['object']).columns

# One transformer that applies the right preprocessing to each column group.
# handle_unknown='ignore' makes categories that were not seen during fit
# encode as all-zeros instead of raising at transform time.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ],
)

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the preprocessing on the training rows only, then reuse the fitted
# transformer on the held-out rows so no test-set statistics leak into it
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)


#  3. Model Training and Evaluation

# Candidate classifiers, keyed by the display name used in the report below
models = {}
models["Logistic Regression"] = LogisticRegression(max_iter=1000)
models["Random Forest"] = RandomForestClassifier(n_estimators=100, random_state=42)
models["Gradient Boosting"] = GradientBoostingClassifier(n_estimators=100, random_state=42)

# Fit every candidate on the processed training data and report its accuracy
# on the processed test data
for name in models:
    model = models[name]
    model.fit(X_train_processed, y_train)
    y_pred = model.predict(X_test_processed)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {accuracy:.4f}")


#  4. Save the Best Model and the Preprocessor

# Gradient Boosting is taken as the best model based on the accuracy.
# NOTE: this choice is hard-coded, not derived from the scores computed during
# evaluation; revisit it if the data or the candidate models change.
best_model = models['Gradient Boosting']

# Persist both artifacts: the fitted preprocessor is needed to transform new
# data the same way before the saved model can score it
joblib.dump(value=best_model, filename='gradient_boosting_model.pkl')
joblib.dump(value=preprocessor, filename='preprocessor.pkl')

print("\nModel and preprocessor saved successfully!")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)


Logistic Regression Accuracy: 0.8573
Random Forest Accuracy: 0.8518
Gradient Boosting Accuracy: 0.8704

Model and preprocessor saved successfully!
