In [1]:
from starter.ml import data as datautils
from starter.ml import model as mlutils
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
# --- Configuration ------------------------------------------------------------
# Input CSV, the output path handed to the slice-metrics helper, and the
# categorical column whose values define the slices.
path='data/census_cleaned.csv'
slice_output_path="slice_output.txt"
category_to_slice='workclass'

# --- Load and clean -----------------------------------------------------------
# '?' placeholders add an unnecessary amount of uncertainty to the model, so
# they are converted to NaN and every row with a missing value is dropped.
data = pd.read_csv(path).replace('?', np.nan).dropna()

# --- Train / test split -------------------------------------------------------
# Hold out 20% of the cleaned rows for evaluation; random_state pins the split.
# Optional enhancement: use K-fold cross validation instead of a single split.
train, test = train_test_split(data, test_size=0.20, random_state=42)

# Columns passed to process_data as categorical features.
cat_features = [
    "workclass",
    "education",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "native-country",
]

# --- Feature processing -------------------------------------------------------
# Process the training split; the returned encoder and label binarizer are
# reused for the test split below.
X_train, y_train, encoder, lb = datautils.process_data(
    train, categorical_features=cat_features, label="salary", training=True
)

# Process the test split once, with the encoder / binarizer obtained above.
X_test, y_test, _, _ = datautils.process_data(
    test,
    categorical_features=cat_features,
    label="salary",
    training=False,
    encoder=encoder,
    lb=lb,
)

# --- Train --------------------------------------------------------------------
# Train the model. NOTE(review): this cell does not itself write the model to
# disk — confirm whether mlutils.train_model persists it or a save step is missing.
model = mlutils.train_model(X_train, y_train)

# --- Evaluate -----------------------------------------------------------------
# Metrics on the training split (to compare against the test metrics).
y_train_pred = mlutils.inference(model, X_train)
train_results = mlutils.compute_model_metrics(y_train, y_train_pred)

# Metrics on the held-out test split.
y_test_pred = mlutils.inference(model, X_test)
test_results = mlutils.compute_model_metrics(y_test, y_test_pred)

# --- Slice metrics ------------------------------------------------------------
# Compute metrics per value of the selected column. y_test / y_test_pred from
# above are reused directly; the test split does not need to be processed again.
slice_metrics_df = mlutils.compute_model_metrics_on_slices(
    category_to_slice, test, y_test, y_test_pred, slice_output_path
)

In [2]:
# Display the metrics returned by mlutils.compute_model_metrics for the test split.
# Presumably ordered (precision, recall, fbeta) — confirm against the helper.
test_results

(0.7321428571428571, 0.6431372549019608, 0.6847599164926931)

In [7]:
# Display the test-split metrics again.
# NOTE(review): this repeats the earlier `test_results` cell but shows a different
# recorded output, and the execution count jumps from In [2] to In [7]. The two
# outputs presumably come from different runs of the training cell — re-run the
# notebook top to bottom and keep only one of these cells.
test_results

(0.8029556650246306, 0.622533418204965, 0.7013266403728936)

In [15]:
# List the distinct 'workclass' values left in the cleaned frame; these are the
# slice values when category_to_slice is 'workclass'.
data['workclass'].unique()

array(['State-gov', 'Self-emp-not-inc', 'Private', 'Federal-gov',
       'Local-gov', 'Self-emp-inc', 'Without-pay'], dtype=object)

In [16]:
# Display the per-slice metrics frame returned by
# mlutils.compute_model_metrics_on_slices for the selected column.
slice_metrics_df

Unnamed: 0,column,slice,precision,recall,fbeta
0,workclass,Local-gov,0.827586,0.755906,0.790123
1,workclass,Private,0.81311,0.590679,0.684272
2,workclass,Federal-gov,0.784615,0.708333,0.744526
3,workclass,State-gov,0.807018,0.676471,0.736
4,workclass,Self-emp-not-inc,0.747826,0.540881,0.627737
5,workclass,Self-emp-inc,0.868421,0.785714,0.825
6,workclass,Without-pay,1.0,1.0,1.0


In [9]:
# Show the (rows, columns) shape of the processed training matrix.
# NOTE(review): this cell is numbered In [9] but sits after In [16], so its
# recorded output may be stale relative to the cells above — re-run in order.
X_train.shape

(26048, 109)