#### How to integrate your ML model into this existing architecture?
First of all, since some recipes are missing a Style, I assume that a user can successfully submit a recipe without a Style.
1. Using a websocket: once a new recipe is submitted without a Style, it goes through the model, which makes a prediction and sends it back to the user as a suggested Style.
2. Using a websocket: once a new recipe is submitted without a Style, it goes through the model, which makes a prediction and saves it to the database.
3. Save it to a separate table, run a scheduled job that makes predictions periodically (e.g. daily), and save the predicted recipes to the normal table.

Approach 3 is the most appropriate in this situation:
1. It is the cheapest.
2. Time sensitivity is very low in this situation; recipes without a Style make up only a very small proportion of all recipes.

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import StandardScaler
import lightgbm
from lightgbm import LGBMClassifier
import pickle
import warnings
warnings.filterwarnings("ignore")

### test case
Use the samples with a null Style as a test case for the application; in production, the data can be passed in in different ways.

In [2]:
# Load the recipe data; the ID columns are read as strings ('object')
# instead of being parsed as numbers.
recipe = pd.read_csv('recipeData.csv', index_col='BeerID',
                     dtype={'UserId':'object','StyleID':'object'},
                     encoding='latin1')
# Keep ONLY the records without a Style: these are the rows to predict.
# .copy() makes `test` an explicitly independent frame, so the in-place
# column edits made during preprocessing do not act on a slice of `recipe`
# (which is what triggers pandas' SettingWithCopyWarning).
test = recipe[recipe['Style'].isnull()].copy()

### load model

In [3]:
def _load_pickle(path):
    """Return the object deserialized from the pickle file at `path`."""
    # NOTE(review): unpickling can execute arbitrary code -- only load
    # artefacts that come from a trusted source.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Fitted preprocessing artefacts and the trained classifier.
prep_steps = _load_pickle('prep_steps.pkl')
lgb = _load_pickle('lgbmodel.pkl')

### prediction

In [4]:
def prep_pipeline(prep_steps, data):
    """Apply the fitted preprocessing steps to raw recipe rows.

    The caller's frame is left untouched: all work happens on a copy, so the
    function can be called repeatedly on the same input.

    Parameters
    ----------
    prep_steps : dict
        Fitted preprocessing artefacts. Keys read here:
        'col_too_many_missing' (columns replaced by a ``valid_<col>`` flag),
        'use_words_features' (substrings looked up in the last URL segment),
        'num_features' and 'cat_features' (together with the word features
        they define the final column order), 'cat_encoder' and
        'missing_imputer' (mappings of column name -> object exposing
        ``transform``) and 'scaler' (object exposing ``transform``).
    data : pandas.DataFrame
        Raw rows. Must contain 'URL' and every column referenced by
        ``prep_steps``.

    Returns
    -------
    Whatever ``prep_steps['scaler'].transform`` returns for the prepared
    feature frame (presumably a 2-D array with one row per input row --
    confirm against the fitted scaler).
    """
    # Never mutate the caller's frame; the original dropped/overwrote columns
    # in place, which made a second call on the same frame fail.
    data = data.copy()

    # Replace each sparsely populated column by a "value was present" flag.
    for _col in prep_steps['col_too_many_missing']:
        data['valid_' + _col] = data[_col].notnull()
        data = data.drop(columns=_col)

    # Keep only the last path segment of the URL. A missing / non-string URL
    # becomes '' so it simply matches none of the word features below instead
    # of raising.
    slugs = [u.split('/')[-1] if isinstance(u, str) else '' for u in data['URL']]
    data['URL'] = slugs

    # One 0/1 indicator column per tracked word.
    for w in prep_steps['use_words_features']:
        data[w] = [1 if w in slug else 0 for slug in slugs]

    # Select the model features in the expected order.
    fs = prep_steps['num_features'] + prep_steps['use_words_features'] + prep_steps['cat_features']
    data = data.loc[:, fs].copy()

    for _col, encoder in prep_steps['cat_encoder'].items():
        data[_col] = encoder.transform(data[_col])

    for _col, imputer in prep_steps['missing_imputer'].items():
        # The imputer is fed a single (n, 1) column; flatten its output
        # before assigning it back as a 1-D column.
        data[_col] = np.ravel(imputer.transform(data[_col].values.reshape(-1, 1)))

    scaler = prep_steps['scaler']
    return scaler.transform(data)

In [5]:
# Turn the unlabeled rows into the model's feature matrix, then classify them.
# NOTE: `test` is rebound here from the raw frame to the transformed features,
# so re-run the data-loading cell before executing this cell a second time.
test = prep_pipeline(prep_steps=prep_steps, data=test)
pred = lgb.predict(test)

In [6]:
# Display the labels predicted for the rows that had no Style.
pred

array(['19', '9', '86', '7', '9', '21', '7', '10', '92', '98', '15', '9',
       '162', '7', '10', '114', '9', '170', '175', '10', '4', '58', '10',
       '162', '10', '9', '9', '9', '150', '22', '7', '132', '12', '37',
       '132', '7', '129', '132', '15', '132', '4', '10', '86', '169',
       '22', '7', '27', '162', '12', '92', '15', '86', '15', '131', '168',
       '9', '170', '87', '132', '6', '144', '132', '143', '162', '7', '7',
       '27', '143', '170', '9', '7', '132', '143', '9', '129', '7', '86',
       '7', '143', '10', '64', '143', '9', '59', '58', '170', '19', '7',
       '162', '132', '39', '9', '10', '9', '26', '15', '10', '14', '163',
       '4', '39', '9', '7', '4', '7', '91', '10', '85', '10', '68', '132',
       '45', '143', '85', '37', '155', '7', '170', '7', '155', '44', '9',
       '51', '10', '10', '15', '7', '82', '45', '9', '148', '85', '143',
       '39', '15', '7', '10', '22', '163', '4', '143', '92', '10', '63',
       '28', '12', '59', '7', '7', '10', '7'