In [2]:
import pandas as pd
import hydra
from hydra.utils import to_absolute_path
from omegaconf import DictConfig

from starter.ml.data import clean_data, process_data
from starter.ml.model import train_model, inference, compute_model_metrics

In [10]:
# Load the raw census data.
# The path is routed through Hydra's `to_absolute_path` helper before reading.
data_path = to_absolute_path("data/census.csv")

# `skipinitialspace=True` tells pandas to skip the blanks that follow each
# delimiter, so string fields do not carry a leading space.
raw_df = pd.read_csv(
    data_path,
    skipinitialspace=True,
)

print(f'Raw data shape: {raw_df.shape}')
# Last expression of the cell: rich preview of the first rows.
raw_df.head()

Raw data shape: (32561, 15)


Unnamed: 0,age,workclass,fnlgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [11]:
# Clean the raw frame with the project's `clean_data` helper and keep the
# result under a new name so `raw_df` stays available for comparison.
# NOTE(review): the cleaning rules live in starter.ml.data; in the recorded
# run the row count went from 32561 to 30139 with the column count unchanged.
clean_df = clean_data(raw_df)
print(f'Clean data shape: {clean_df.shape}')
# Last expression of the cell: rich preview of the first cleaned rows.
clean_df.head()

Clean data shape: (30139, 15)


Unnamed: 0,age,workclass,fnlgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [13]:
from sklearn.model_selection import train_test_split

# Fixed seed so the split (and every metric computed downstream) is identical
# on each Restart & Run All instead of changing from run to run.
RANDOM_STATE = 42

# Hold out 20% of the cleaned rows for evaluation.
train, test = train_test_split(
    clean_df,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

# Sanity check: the two partitions together account for every cleaned row.
assert len(train) + len(test) == len(clean_df), "split lost or duplicated rows"

print(f'Training set shape: {train.shape}')
print(f'Test set shape: {test.shape}')

Training set shape: (24111, 15)
Test set shape: (6028, 15)


In [17]:
# Columns handed to process_data as categorical features.
cat_features = [
    "workclass", "education", "marital-status", "occupation",
    "relationship", "race", "sex", "native-country",
]

# Arguments that are identical for the training and the test call.
shared_kwargs = dict(categorical_features=cat_features, label="salary")

# One-hot encoding, training pass: besides features and labels, keep the
# encoder and label binarizer that process_data returns.
X_train, y_train, encoder_train, lb_train = process_data(
    X=train, training=True, **shared_kwargs)
print(f'X_train shape: {X_train.shape}')
print(f'y_train shape: {y_train.shape}')

# Test pass: hand the training-time encoder and label binarizer back in so
# both splits go through the same transformation; the last two return values
# are not needed here.
X_test, y_test, _, _ = process_data(
    X=test, training=False, encoder=encoder_train, lb=lb_train,
    **shared_kwargs)
print(f'X_test shape: {X_test.shape}')
print(f'y_test shape: {y_test.shape}')

X_train shape: (24111, 104)
y_train shape: (24111,)
X_test shape: (6028, 104)
y_test shape: (6028,)


In [18]:
# Value forwarded to train_model as `num_nb`.
# NOTE(review): presumably a neighbour count — confirm in starter.ml.model.
NUM_NB = 50

# Fit the model on the processed training features and labels.
model = train_model(X_train, y_train, num_nb=NUM_NB)

In [19]:
# Run the trained model on the held-out test features.
y_pred = inference(model, X_test)

# Score the predictions against the true test labels and report the three
# metrics, one per line.
precision, recall, f1 = compute_model_metrics(y_test, y_pred)
print(
    f'>> Precision: {precision}',
    f'>> Recall: {recall}',
    f'>> F1: {f1}',
    sep='\n',
)

>> Precision: 0.9578059071729957
>> Recall: 0.15275908479138628
>> F1: 0.2634939059779454
