### Import Libraries

In [17]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import pickle
import joblib

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

In [7]:
#In case I need to update datarobot-drum
!pip install datarobot-drum --upgrade

Looking in indexes: https://artifactory.int.datarobot.com/artifactory/api/pypi/python-all/simple
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m


### Import Data

In [18]:
# Read the training set from disk.
df = pd.read_csv('../data/airline_delay_train.csv')

# Pull the target column out of the frame (pop removes it from df) ...
y = df.pop('dep_delayed_15min')

# ... and build the feature frame from what is left, minus the two columns
# that are excluded from the features.
X = df.drop(columns=['FlightDate', 'DepTime'])

### Define Preprocessing step per type of column

### Fit the Preprocessing Pipeline

In [19]:
# Columns handled by the categorical branch of the preprocessor.
categorical_features = ['UniqueCarrier', 'Origin', 'Dest']

# Categorical branch: replace missing values with the constant 'missing',
# then one-hot encode.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Column-wise preprocessor; only the categorical branch is registered.
preprocessor = ColumnTransformer(
    transformers=[('cat', categorical_transformer, categorical_features)]
)

# Preprocessing-only pipeline (the classifier is trained separately).
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Fit the preprocessing pipeline on the training features.
pipeline.fit(X, y)

# Transform X and wrap the resulting sparse matrix in a pandas DataFrame.
# The model could be trained on the sparse matrix directly; the conversion is
# done because the hook function in custom.py expects a pandas DataFrame for scoring.
preprocessed = pd.DataFrame.sparse.from_spmatrix(pipeline.transform(X))


### Train Random Forest Classifier
Normally, the classifier could be part of the final scikit-learn pipeline. I am opting to keep them separate in order to create a more complicated example with different pkl files for preprocessing and scoring.

In [20]:
# Small random forest trained on the preprocessed features.
# random_state is fixed so that re-running the notebook reproduces the same
# fitted model (and therefore the same saved model.pkl and predictions).
model = RandomForestClassifier(n_estimators=5, random_state=42)

model.fit(preprocessed, y)

RandomForestClassifier(n_estimators=5)

### Save Custom Model files

In [21]:
# Persist the fitted preprocessing pipeline and the trained classifier as two
# separate artifacts inside the custom model directory.
preprocessing_path = 'custom_model/preprocessing.pkl'
model_path = 'custom_model/model.pkl'

joblib.dump(pipeline, preprocessing_path)
joblib.dump(model, model_path)

['custom_model/model.pkl']

In [23]:
# Run the `drum validation` command against the code in ./custom_model, using the
# test CSV as input. The target is declared as binary, with True as the positive
# class label and False as the negative class label.
!drum validation --code-dir ./custom_model --input ../data/airline_delay_test.csv --target-type binary --positive-class-label True --negative-class-label False

            True     False
0       0.673997  0.326003
1       0.835499  0.164501
2       0.905634  0.094366
3       0.872680  0.127320
4       0.668055  0.331945
...          ...       ...
101507  0.570746  0.429254
101508  0.827050  0.172950
101509  0.861419  0.138581
101510  0.520401  0.479599
101511  0.731238  0.268762

[101512 rows x 2 columns]
            True     False
0       0.673997  0.326003
1       0.835499  0.164501
2       0.905634  0.094366
3       0.872680  0.127320
4       0.668055  0.331945
...          ...       ...
101507  0.570746  0.429254
101508  0.827050  0.172950
101509  0.861419  0.138581
101510  0.520401  0.479599
101511  0.731238  0.268762

[101512 rows x 2 columns]
            True     False
0       0.791707  0.208293
1       0.835499  0.164501
2       0.851283  0.148717
3       0.899549  0.100451
4       0.755327  0.244673
...          ...       ...
101507  0.696817  0.303183
101508  0.689971  0.310029
101509  0.864455  0.135545
101510  0.602732  0.397268
1

In [30]:
# Run the `drum score` command against the code in ./custom_model, using the
# test CSV as input, with the same binary target and True/False class labels.
# NOTE(review): the recorded output has one column per class label, presumably
# the predicted class probabilities - confirm against the DRUM documentation.
!drum score --code-dir ./custom_model --input ../data/airline_delay_test.csv --target-type binary --positive-class-label True --negative-class-label False

            True     False
0       0.673997  0.326003
1       0.835499  0.164501
2       0.905634  0.094366
3       0.872680  0.127320
4       0.668055  0.331945
...          ...       ...
101507  0.570746  0.429254
101508  0.827050  0.172950
101509  0.861419  0.138581
101510  0.520401  0.479599
101511  0.731238  0.268762

[101512 rows x 2 columns]
