## This notebook checks the quality of the cells run inside train_model.py, since there were some questions and errors

In [1]:
# Script to train machine learning model.

import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer, OneHotEncoder
from xgboost import XGBClassifier
from xgboost.core import XGBoostError

# Add the necessary imports for the starter code.
from ml.data import process_data
from ml.model import train_model, compute_model_metrics, inference

# Load the cleaned census data.
data = pd.read_csv('../data/clean_census.csv')

# Normalise the column names by replacing hyphens with underscores.
# The earlier comprehension used `... if '-' else col in col`, whose condition is
# the always-truthy string '-' and whose else-branch is a membership test; it only
# worked because the replace ran unconditionally. str.replace is already a no-op
# for names without a hyphen, so no condition is needed.
data.columns = [col.replace('-', '_') for col in data.columns]

# Hold out 20% of the rows for testing.
# Optional enhancement, use K-fold cross validation instead of a train-test split.
# NOTE: the fixed seed makes the split reproducible across re-runs; outputs saved
# from an earlier, unseeded run may show different rows.
train, test = train_test_split(data, test_size=0.20, random_state=42)

# Columns passed to process_data as categorical features.
cat_features = [
    "workclass",
    "education",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "native_country",
]

In [3]:
# (rows, columns) of the training split produced by train_test_split
train.shape

(26048, 15)

In [4]:
# Process the training split. With training=True, process_data returns the feature
# matrix, the labels, and the encoder / label binarizer that are reused for the
# test split (presumably fitted in this call -- confirm in ml.data).
X_train, y_train, encoder, lb = process_data(
    train, categorical_features=cat_features, label="salary", training=True
)

# To do the slice, we need to reset the test index.
# Rebind instead of using inplace=True: the cell then produces its result
# explicitly and does not mutate the frame returned by train_test_split.
test = test.reset_index(drop=True)

In [5]:
# Shape of the processed training matrix returned by process_data
X_train.shape

(26048, 108)

In [7]:
# Number of feature columns in a processed training row
X_train.shape[1]

108

In [9]:
# Wrap the processed training matrix in a DataFrame to preview its first rows as a table
pd.DataFrame(X_train).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,98,99,100,101,102,103,104,105,106,107
0,26.0,102106.0,6.0,0.0,0.0,40.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,47.0,246739.0,13.0,99999.0,0.0,55.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,25.0,208591.0,9.0,0.0,0.0,40.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,59.0,182836.0,9.0,0.0,0.0,40.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,33.0,178429.0,13.0,0.0,0.0,40.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [10]:
# Peek at the first row of the test split (still unprocessed at this point)
test.head(1)

Unnamed: 0,age,workclass,fnlgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,salary
0,28,Private,201175,11th,7,Never-married,Machine-op-inspct,Not-in-family,Black,Male,0,0,40,United-States,<=50K


In [11]:
# Shape of the one-row slice of the test split, printed as (rows, columns)
print(test.iloc[0:1].shape)

(1, 15)


In [12]:
# Run the held-out split through process_data with training=False, handing it the
# encoder and label binarizer returned by the training call.
X_test, y_test, encoder, lb = process_data(
    test,
    categorical_features=cat_features,
    label="salary",
    training=False,
    encoder=encoder,
    lb=lb,
)


In [13]:
# Shape of the processed test matrix; its column count should match X_train's
X_test.shape

(6513, 108)

In [15]:
# Wrap the processed test matrix in a DataFrame to preview its first rows as a table
pd.DataFrame(X_test).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,98,99,100,101,102,103,104,105,106,107
0,28.0,201175.0,7.0,0.0,0.0,40.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,32.0,295589.0,14.0,0.0,1977.0,40.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,19.0,124884.0,5.0,0.0,0.0,25.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,51.0,61270.0,9.0,0.0,0.0,40.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,38.0,412296.0,9.0,0.0,0.0,28.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [16]:
# Fit the XGBoost model on the processed training data (train_model is imported from ml.model)
model = train_model(X_train, y_train)

# Generate predictions for the processed test features with the trained model
y_pred = inference(model, X_test)

In [17]:
# Wrap the predictions in a DataFrame to read off their (rows, columns) shape
pd.DataFrame(y_pred).shape

(6513, 1)

In [19]:
# Round-trip the trained model through disk: write it with joblib, then load the
# saved file back under the name `xgb`.
model_path = '../model/final_xgb.pkl'
joblib.dump(model, model_path)
xgb = joblib.load(model_path)

In [20]:
# Display the estimator that was reloaded from disk with joblib.load
xgb

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...)

In [35]:
# Predict on the slice X_test[0:1], which contains a single row
model.predict(X_test[0:1])

array([0])

In [39]:
# Length of the slice X_test[0:1]: the number of rows it holds
print(len(X_test[0:1]))

1


In [40]:
# X_test[0] is one row without the row axis, so its length is the feature count;
# the predict() call on it in this notebook fails with a column mismatch (1 vs. 108).
print(len(X_test[0])) # this is why it fails

108


In [45]:
# This does not error: wrapping the single row in a list hands predict() a
# one-element sequence of rows instead of a bare row of feature values.
model.predict([X_test[0]])

array([0])

In [36]:
# This errors, and the failure is the point of the cell: X_test[0] is a bare row
# of feature values, and XGBoost reports a column-count mismatch (1 vs. 108) for it.
# The error is caught and printed so the demonstration stays visible while
# "Restart & Run All" can still complete.
try:
    model.predict(X_test[0])
except XGBoostError as err:
    print(f"{type(err).__name__}: {err}")

XGBoostError: [10:32:45] /Users/runner/work/xgboost/xgboost/python-package/build/temp.macosx-10.9-x86_64-3.7/xgboost/src/predictor/cpu_predictor.cc:310: Check failed: m->NumColumns() == model.learner_model_param->num_feature (1 vs. 108) : Number of columns in data must equal to trained model.
Stack trace:
  [bt] (0) 1   libxgboost.dylib                    0x000000013ebba895 dmlc::LogMessageFatal::~LogMessageFatal() + 117
  [bt] (1) 2   libxgboost.dylib                    0x000000013ed17f52 void xgboost::predictor::CPUPredictor::DispatchedInplacePredict<xgboost::data::ArrayAdapter, 64ul>(dmlc::any const&, std::__1::shared_ptr<xgboost::DMatrix>, xgboost::gbm::GBTreeModel const&, float, xgboost::PredictionCacheEntry*, unsigned int, unsigned int) const + 354
  [bt] (2) 3   libxgboost.dylib                    0x000000013ed130a1 xgboost::predictor::CPUPredictor::InplacePredict(dmlc::any const&, std::__1::shared_ptr<xgboost::DMatrix>, xgboost::gbm::GBTreeModel const&, float, xgboost::PredictionCacheEntry*, unsigned int, unsigned int) const + 273
  [bt] (3) 4   libxgboost.dylib                    0x000000013ec78196 xgboost::gbm::GBTree::InplacePredict(dmlc::any const&, std::__1::shared_ptr<xgboost::DMatrix>, float, xgboost::PredictionCacheEntry*, unsigned int, unsigned int) const + 1366
  [bt] (4) 5   libxgboost.dylib                    0x000000013ec90aef xgboost::LearnerImpl::InplacePredict(dmlc::any const&, std::__1::shared_ptr<xgboost::DMatrix>, xgboost::PredictionType, float, xgboost::HostDeviceVector<float>**, unsigned int, unsigned int) + 127
  [bt] (5) 6   libxgboost.dylib                    0x000000013ebc1f65 void InplacePredictImpl<xgboost::data::ArrayAdapter>(std::__1::shared_ptr<xgboost::data::ArrayAdapter>, std::__1::shared_ptr<xgboost::DMatrix>, char const*, xgboost::Learner*, unsigned long, unsigned long, unsigned long long const**, unsigned long long*, float const**) + 789
  [bt] (6) 7   libxgboost.dylib                    0x000000013ebc1966 XGBoosterPredictFromDense + 326
  [bt] (7) 8   libffi.8.dylib                      0x000000010d686d92 ffi_call_unix64 + 82

