In [6]:
# Download the dataset from kaggle (if needed)
import os 
need_download: bool = not os.path.exists("adult-census-income/adult.csv")

if need_download:
    !kaggle datasets download uciml/adult-census-income
    !unzip adult-census-income.zip -d adult-census-income

In [18]:
# Import the data into a pandas frame
import pandas as pd
raw_df = pd.read_csv("adult-census-income/adult.csv")

# Process by 
# 1. replacing "?" with Na
# 2. replacing certain columns as category types
processed_df = raw_df.copy()
processed_df.replace("?", pd.NA, inplace=True)

categorical_column_names = ["workclass", "education", "marital.status", "occupation", "relationship", "race", "sex", "native.country", "income"]
# numerical_column_names = ["age", "education.num", "capital.gain", "capital.loss", "hours.per.week"] # also fnlwgt, technically

for categorical_column_name in categorical_column_names:
    processed_df[categorical_column_name] = processed_df[categorical_column_name].astype('category')

print(processed_df.dtypes)

age                  int64
workclass         category
fnlwgt               int64
education         category
education.num        int64
marital.status    category
occupation        category
relationship      category
race              category
sex               category
capital.gain         int64
capital.loss         int64
hours.per.week       int64
native.country    category
income            category
dtype: object


In [20]:
# Filter out incomplete rows
complete_df = processed_df.dropna()
print(f"# complete entries = {len(complete_df)}, # incomplete entries = {len(raw_df) - len(complete_df)}")

# complete entries = 30162, # incomplete entries = 2399


In [58]:
# Take a smaller sample among the complete dataframe
# DROP the weights from each row: sample from the population
sample_size: int = 10000
sample_df: pd.DataFrame = complete_df.drop("fnlwgt", axis=1).sample(n = sample_size, replace = True, weights = complete_df["fnlwgt"], random_state = 0)

In [59]:
# Test-train split, 30-70
# This doesn't incorporate weights, though the sample already did this so it doesn't matter
import sklearn.model_selection
train_df, test_df = sklearn.model_selection.train_test_split(sample_df, test_size = 0.3)

train_X = train_df.drop("income", axis = 1)
train_y = train_df["income"]

test_X = test_df.drop("income", axis = 1)
test_y = test_df["income"]

print(f"train dataset contains {train_y.value_counts()["<=50K"]} examples of '<=50K' out of {len(train_y)} examples in total ({train_y.value_counts()["<=50K"] / len(train_y)})")
print(f"test dataset contains {test_y.value_counts()["<=50K"]} examples of '<=50K' out of {len(test_y)} examples in total ({test_y.value_counts()["<=50K"] / len(test_y)})")

train dataset contains 5245 examples of '<=50K' out of 7000 examples in total (0.7492857142857143)
test dataset contains 2253 examples of '<=50K' out of 3000 examples in total (0.751)


In [60]:
# Create a gradient boosting (with histograms, since faster AND can natively handle categorical data)
from sklearn.ensemble import HistGradientBoostingClassifier

clf = HistGradientBoostingClassifier(loss = 'log_loss',
                                     categorical_features = 'from_dtype', 
                                     learning_rate = 0.1,
                                     max_iter = 100,
                                     max_depth = 5  # stumps
                                     )

In [61]:
clf.fit(train_X, train_y)

In [62]:
clf.score(train_X, train_y)

0.8997142857142857

In [63]:
clf.score(test_X, test_y)

0.8683333333333333