In [13]:
# Script to train machine learning model.

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from ml.data import process_data
from model.model import train_model, compute_model_metrics
import pandas as pd
import pickle

In [14]:
# Read the cleaned census CSV into a DataFrame; every later cell works from it.
census_data = pd.read_csv(filepath_or_buffer="./amazondrive/Census_cleaned.csv")

In [15]:
# Evaluation below uses K-fold cross validation (the optional enhancement)
# rather than a single train_test_split(..., test_size=0.20) hold-out.
#
# Names of the columns handed to process_data as categorical features.
cat_features = [
    "workclass", "education", "marital-status", "occupation",
    "relationship", "race", "sex", "native-country",
]

In [16]:
# Name of the label column passed to process_data.
label = "salary"

# Ten folds taken in row order (no shuffling).
kf10 = KFold(shuffle=False, n_splits=10)

# Process the whole dataset once. The returned encoder and label binarizer
# are pickled and reused by later cells.
X, y, encoder, lb = process_data(
    census_data,
    categorical_features=cat_features,
    label=label,
    training=True,
)

In [17]:
# 10-fold cross validation: each iteration trains a throwaway model on the
# training indices of the fold and reports its metrics on the held-out rows.
# The split is taken over X, the same array the indices are applied to.
for train_index, test_index in kf10.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # Keep the per-fold model under its own name so it cannot be mistaken
    # for the model that gets persisted.
    fold_model = train_model(X_train, y_train)
    prediction = fold_model.predict(X_test)
    precision, recall, fbeta = compute_model_metrics(y_test, prediction)
    print("Precision: ",precision,", Recall: ", recall, ", fbeta: ", fbeta)

# The fold models only estimate performance. The `model` used by later cells
# (pickling, slice metrics) is fit once on all rows, instead of being whatever
# the last fold happened to leave behind (trained on 9/10 of the data).
model = train_model(X, y)

Precision:  0.7118055555555556 , Recall:  0.26248399487836105 , fbeta:  0.3835360149672591
Precision:  0.7169117647058824 , Recall:  0.26785714285714285 , fbeta:  0.38999999999999996
Precision:  0.6761565836298933 , Recall:  0.26874115983026875 , fbeta:  0.38461538461538464
Precision:  0.6920415224913494 , Recall:  0.2631578947368421 , fbeta:  0.3813155386081983
Precision:  0.6911764705882353 , Recall:  0.25 , fbeta:  0.3671875
Precision:  0.7536764705882353 , Recall:  0.2812071330589849 , fbeta:  0.4095904095904096
Precision:  0.6870503597122302 , Recall:  0.2556894243641232 , fbeta:  0.3726829268292683
Precision:  0.7392857142857143 , Recall:  0.26813471502590674 , fbeta:  0.3935361216730038
Precision:  0.7158273381294964 , Recall:  0.2608125819134993 , fbeta:  0.38232468780019213
Precision:  0.7430555555555556 , Recall:  0.27828348504551365 , fbeta:  0.4049195837275308


In [18]:
# Persist the fitted label binarizer, encoder and model.
# Each file is opened in a `with` block so the handle is flushed and closed
# deterministically instead of being left to the garbage collector.
for artifact, artifact_path in (
    (lb, 'ml/LabelBinarizer.sav'),
    (encoder, 'ml/OneHotEncoder.sav'),
    (model, 'ml/finalized_model.sav'),
):
    with open(artifact_path, 'wb') as artifact_file:
        pickle.dump(artifact, artifact_file)

In [19]:
#Function for calculating descriptive stats on slices of the dataset

def slice_data(df, cat_features, encoder, lb, model,
               label="salary", output_path="ml/slice_output.txt"):
    """Compute model metrics on every single-value slice of the categorical columns.

    For each column in ``cat_features`` and each distinct value in that
    column, the rows of ``df`` having that value are run through
    ``process_data`` (with ``training=False`` and the supplied ``encoder`` /
    ``lb``), predicted with ``model``, and scored with
    ``compute_model_metrics``. The column name and each value are printed,
    and the metrics are appended to ``output_path``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the categorical columns and the label column.
    cat_features : list of str
        Columns to slice on; also passed to ``process_data`` as
        ``categorical_features``.
    encoder, lb
        Already-fitted objects forwarded unchanged to ``process_data``.
    model
        Object exposing ``predict``.
    label : str, optional
        Name of the label column (default ``"salary"``, the value that was
        previously hard-coded).
    output_path : str, optional
        Text file the report is appended to (default
        ``"ml/slice_output.txt"``). The file is opened in append mode, so
        repeated calls accumulate output.

    Returns
    -------
    None
    """
    # Open the report once for the whole run rather than once per slice.
    with open(output_path, 'a') as f:
        for cat in cat_features:
            print("Category: ", cat)
            for cls in df[cat].unique():
                print(cls)
                df_temp = df[df[cat] == cls]

                # A NaN category never compares equal to itself, so its slice
                # is empty; there is nothing to predict or score for it.
                if df_temp.empty:
                    continue

                # The encoder / binarizer returned here are the ones passed
                # in, so they are discarded.
                X_temp, y_temp, _, _ = process_data(
                    df_temp,
                    categorical_features=cat_features,
                    label=label,
                    training=False,
                    encoder=encoder,
                    lb=lb,
                )

                preds = model.predict(X_temp)
                precision, recall, fbeta = compute_model_metrics(y_temp, preds)

                # str(cls) keeps the write from raising on non-string values.
                f.write("\nCategory: " + cat + ", " + str(cls) + "\n")
                f.write(" -Precision: " + str(precision) + "\n")
                f.write(" -Recall: " + str(recall) + "\n")
                f.write(" -Fbeta: " + str(fbeta) + "\n")

In [20]:
# Write per-slice metrics for every categorical feature of the census data.
slice_data(
    df=census_data,
    cat_features=cat_features,
    encoder=encoder,
    lb=lb,
    model=model,
)

Category:  workclass
 State-gov
 Self-emp-not-inc
 Private
 Federal-gov
 Local-gov
 Self-emp-inc
 Without-pay
Category:  education
 Bachelors
 HS-grad
 11th
 Masters
 9th
 Some-college
 Assoc-acdm
 7th-8th
 Doctorate
 Assoc-voc
 Prof-school
 5th-6th
 10th
 Preschool
 12th
 1st-4th
Category:  marital-status
 Never-married
 Married-civ-spouse
 Divorced
 Married-spouse-absent
 Separated
 Married-AF-spouse
 Widowed
Category:  occupation
 Adm-clerical
 Exec-managerial
 Handlers-cleaners
 Prof-specialty
 Other-service
 Sales
 Transport-moving
 Farming-fishing
 Machine-op-inspct
 Tech-support
 Craft-repair
 Protective-serv
 Armed-Forces
 Priv-house-serv
Category:  relationship
 Not-in-family
 Husband
 Wife
 Own-child
 Unmarried
 Other-relative
Category:  race
 White
 Black
 Asian-Pac-Islander
 Amer-Indian-Eskimo
 Other
Category:  sex
 Male
 Female
Category:  native-country
 United-States
 Cuba
 Jamaica
 India
 Mexico
 Puerto-Rico
 Honduras
 England
 Canada
 Germany
 Iran
 Philippines
 Poland
