In [6]:
# Download the dataset from kaggle (if needed)
import os 
need_download: bool = not os.path.exists("adult-census-income/adult.csv")

if need_download:
    !kaggle datasets download uciml/adult-census-income
    !unzip adult-census-income.zip -d adult-census-income

In [39]:
# Import the data into a pandas frame and replace "?" with Na
import pandas as pd
raw_df = pd.read_csv("adult-census-income/adult.csv")
raw_df.replace("?", pd.NA, inplace=True)

In [40]:
# Filter out incomplete rows
complete_df = raw_df.dropna()

In [18]:
# Take a smaller sample among the complete dataframe
# DROP the weights from each row: sample from the population
sample_size: int = 1000
sample_df: pd.DataFrame = complete_df.drop("fnlwgt", axis=1).sample(n = sample_size, replace = True, weights = complete_df["fnlwgt"], random_state = 0)

In [19]:
# Test-train split, 30-70
# This doesn't incorporate weights, though the sample already did this so it doesn't matter
import sklearn.model_selection

train_df, test_df = sklearn.model_selection.train_test_split(sample_df, test_size = 0.3)

In [23]:
# Create an adaboost classifier with decision stumps for now
# TODO: Should the random state be freed?
import sklearn.tree
import sklearn.ensemble

TREE_DEPTH: int = 1
NUM_ESTIMATORS: float = 50
LEARNING_RATE: float = 1.0

base_estimator = sklearn.tree.DecisionTreeClassifier(max_depth = TREE_DEPTH)

adaboost_classifier =  sklearn.ensemble.AdaBoostClassifier(
                                        estimator = base_estimator,
                                        n_estimators = NUM_ESTIMATORS,
                                        learning_rate = LEARNING_RATE,
                                        random_state = 0)

In [41]:
# Prepare the training/testing data by factorizing each string row independently
def factorize_string_columns(df: pd.DataFrame) -> pd.DataFrame:
    factorized_df: pd.DataFrame = df.copy()
    
    for column in factorized_df.select_dtypes(include=['object']).columns:
        factorized_df[column], _ = pd.factorize(factorized_df[column])

    return factorized_df

factorized_train_df = factorize_string_columns(train_df)
factorized_train_df = factorize_string_columns(train_df)

In [51]:
train_df

Unnamed: 0,age,workclass,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
9797,20,Private,Some-college,10,Never-married,Prof-specialty,Not-in-family,White,Male,0,0,35,United-States,<=50K
20248,19,Self-emp-inc,Some-college,10,Never-married,Farming-fishing,Own-child,White,Male,0,0,60,United-States,<=50K
21005,23,State-gov,Some-college,10,Never-married,Adm-clerical,Own-child,White,Female,0,0,20,United-States,<=50K
16128,52,Private,1st-4th,2,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,50,United-States,>50K
8616,68,Self-emp-not-inc,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,20,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29377,27,Private,Some-college,10,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,80,United-States,<=50K
22984,21,Private,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Female,0,0,40,United-States,<=50K
14469,19,Private,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,30,United-States,<=50K
17682,24,Private,Bachelors,13,Never-married,Adm-clerical,Own-child,White,Male,0,0,39,United-States,<=50K


In [25]:
# Fit the adaboost classifier to the train data
train_X = train_df.drop("income", axis = 1)
train_y = train_df["income"]

In [30]:

adaboost_classifier.fit(X = train_X, y = train_y)



ValueError: could not convert string to float: 'Private'

In [38]:
pd.factorize(train_X["workclass"])

(array([0, 1, 2, 0, 3, 0, 2, 0, 0, 4, 0, 3, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
        0, 0, 0, 0, 3, 0, 3, 0, 1, 0, 0, 5, 0, 0, 1, 3, 0, 0, 0, 5, 0, 0,
        3, 0, 0, 0, 0, 0, 0, 0, 3, 0, 4, 0, 3, 0, 0, 3, 3, 3, 3, 0, 0, 0,
        0, 4, 0, 4, 0, 3, 0, 0, 0, 0, 2, 3, 0, 0, 0, 3, 5, 2, 0, 0, 0, 0,
        2, 5, 0, 5, 0, 0, 0, 0, 0, 5, 0, 5, 0, 0, 3, 0, 0, 0, 0, 0, 0, 2,
        0, 3, 0, 0, 5, 4, 0, 0, 3, 0, 0, 0, 5, 0, 3, 0, 4, 0, 0, 0, 3, 0,
        4, 0, 3, 0, 0, 0, 1, 0, 0, 3, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 2,
        0, 0, 0, 0, 4, 0, 0, 0, 0, 1, 0, 3, 0, 0, 0, 0, 0, 3, 1, 0, 0, 4,
        5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 4, 0, 1, 0, 4, 4, 0, 5,
        2, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 3, 0,
        0, 0, 5, 2, 0, 0, 0, 5, 1, 2, 0, 0, 4, 0, 0, 4, 3, 5, 0, 0, 0, 0,
        0, 4, 0, 0, 0, 0, 3, 5, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0,
        4, 0, 0, 0, 0, 0, 3, 0, 4, 5, 0, 2, 2, 0, 0, 0, 0, 1, 0, 0, 0, 2,
        2, 4, 0, 0, 0, 0, 0, 0, 4, 0, 