In [1]:
from lightgbm import LGBMRegressor
import wandb
import pandas as pd 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from functools import partial
from sklearn.model_selection import cross_val_score, train_test_split, KFold
import numpy as np 

# Measurement tags used as model input features (selected from `df` below).
# The commented-out entries are excluded from the feature set; their trailing
# notes presumably name the plant section they belong to.
MEAS_COLUMNS = [
    "TI-19",
    "FI-16",
    "TI-33",
    "FI-2",
    "FI-151",
    "TI-8",
    "FI-241",
#   "valve-position-12",  # dry-bed
#     "FI-38",  # stripper
#     "PI-28",  # stripper
    
#     "TI-28",  # stripper
    "FI-20",
    "FI-30",
    "TI-3",
    "FI-19",
    "FI-211",
    "FI-11",
    "TI-30",
    "PI-30",
    "TI-1213",
#     "TI-4",
    "FI-23",
    "delta_t",
]

# Target columns. Their order fixes the column index of the `y` array built
# from df[TARGETS_clean]: 0 = 2-Amino-2-methylpropanol, 1 = Piperazine,
# 2 = Carbon dioxide, 3 = Ammonia.
TARGETS_clean = ['2-Amino-2-methylpropanol C4H11NO', 'Piperazine C4H10N2', 
                 "Carbon dioxide CO2", "Ammonia NH3"]

In [2]:
# Load the input DataFrame from a pickle in the working directory.
# NOTE(review): unpickling can execute arbitrary code; only load this file
# if it comes from a trusted source.
df = pd.read_pickle('df_dropped.pkl')

In [3]:
# Feature frame (MEAS_COLUMNS) and target matrix as a plain array; the columns
# of `y` follow the order of TARGETS_clean.
X, y = df[MEAS_COLUMNS], df[TARGETS_clean].values

In [4]:
# Feature scaler with scikit-learn's default settings.
scaler = StandardScaler()

In [5]:
# Scaled feature matrix used by the sweeps and the train/test split below.
# NOTE(review): the scaler is fitted on ALL rows, i.e. before the train/test
# split and the cross-validation folds, so held-out rows influence the scaling
# statistics. Consider fitting it inside a Pipeline — confirm whether this
# leakage is acceptable.
X_ = scaler.fit_transform(X)

In [6]:
# Hyperparameter search space handed to the wandb sweep (read by
# `get_sweep_id` at call time).
#
# wandb's `log_uniform` distribution treats `min`/`max` as exponents: it draws
# exp(u) with u ~ Uniform(min, max). Passing the raw bounds 1e-5 and 0.4 would
# therefore sample from roughly [1.00001, 1.49] instead of [1e-5, 0.4], so the
# bounds for the regularisation terms are given in natural-log space.
# float() keeps the values plain Python floats so the sweep config serialises
# cleanly.
config = {
    'n_estimators': {
        'distribution': 'int_uniform',
        'min': 10,
        'max': 5000
    },
    'max_depth': {
        'distribution': 'int_uniform',
        'min': 5,
        'max': 100
    },
    'num_leaves': {
        'distribution': 'int_uniform',
        'min': 5,
        'max': 500
    },
    'reg_alpha': {
        'distribution': 'log_uniform',
        'min': float(np.log(0.00001)),  # samples start at 1e-5
        'max': float(np.log(0.4))       # samples end at 0.4
    },
    'reg_lambda': {
        'distribution': 'log_uniform',
        'min': float(np.log(0.00001)),  # samples start at 1e-5
        'max': float(np.log(0.4))       # samples end at 0.4
    },
    'subsample': {
        'distribution': 'uniform',
        'min': 0.4,
        'max': 1.0
    },
    'colsample_bytree': {
        'distribution': 'uniform',
        'min': 0.01,
        'max': 1.0
    },
    'min_child_weight': {
        'distribution': 'uniform',
        'min': 0.001,
        'max': 0.1,
    },
}

In [7]:
def get_sweep_id(method):
    """Register a sweep in the 'process_ml' project and return its id.

    `method` is passed through as the sweep's search method. The search space
    is whatever the module-level `config` dict holds when this is called; the
    sweep minimises the `cv_mean` metric and uses hyperband early termination.
    """
    objective = {'name': 'cv_mean', 'goal': 'minimize'}
    stopping = {'type': 'hyperband', 's': 2, 'eta': 3, 'max_iter': 30}

    sweep_spec = dict(
        method=method,
        metric=objective,
        early_terminate=stopping,
        parameters=config,
    )
    return wandb.sweep(sweep_spec, project='process_ml')

In [8]:
def train(index):
    """Run one trial: cross-validate a Huber-objective LightGBM model on one target.

    Starts a wandb run in the 'process_ml' project with the default
    hyperparameters below (when launched by a sweep agent, the sampled values
    are expected to take precedence — wandb behaviour, not visible here),
    scores an ``LGBMRegressor`` with 5-fold cross-validation on the
    module-level ``X_`` and column ``index`` of ``y``, and reports the mean and
    standard deviation of the fold MAE as ``cv_mean`` / ``cv_std``.

    Parameters
    ----------
    index : int
        Column of the module-level ``y`` array to fit.

    Returns
    -------
    None
        Results are reported to wandb only.
    """
    # NOTE(review): LightGBM only applies `subsample` when `subsample_freq` > 0,
    # which is never set here, so this value (and the swept one) presumably has
    # no effect — confirm.
    defaults = {
        'n_estimators': 100,
        'max_depth': 10,
        'num_leaves': 50,
        'reg_alpha': 0.00001,
        'reg_lambda': 0.00001,
        'subsample': 0.2,
        'colsample_bytree': 0.2,
        'min_child_weight': 0.001,
    }

    # Initialize a new wandb run
    wandb.init(project='process_ml', config=defaults)

    # Named `run_config` so it does not shadow the module-level `config`
    # (the sweep search space).
    run_config = wandb.config
    run_config['objective'] = 'huber'

    regressor = LGBMRegressor(**run_config)

    scores = cross_val_score(regressor, X_, y[:, index], n_jobs=-1, cv=KFold(n_splits=5), scoring='neg_mean_absolute_error')

    # Scores are negated MAE, so the absolute value of their mean is the MAE.
    metrics = {'cv_mean': np.abs(scores.mean()), 'cv_std': scores.std()}

    # One log call keeps both metrics on the same step; separate calls would
    # each advance the step counter.
    wandb.log(metrics)

    for key, value in metrics.items():
        wandb.run.summary[key] = value

In [9]:
# Create a Bayesian-search sweep and run its trials on target column 1 of `y`
# ('Piperazine C4H10N2' per TARGETS_clean).
# NOTE(review): no `count` is passed to wandb.agent, so this cell presumably
# keeps launching trials until it is interrupted — confirm, and consider
# bounding it so the notebook can be run top to bottom.
sweep_id = get_sweep_id('bayes')
train_func = partial(train, index=int(1))
wandb.agent(sweep_id, function=train_func)

In [10]:
# Create a second Bayesian-search sweep, this time for target column 2 of `y`
# ('Carbon dioxide CO2' per TARGETS_clean). Overwrites `sweep_id`/`train_func`.
# NOTE(review): no `count` is passed to wandb.agent, so this cell presumably
# runs until it is interrupted — confirm.
sweep_id = get_sweep_id('bayes')
train_func = partial(train, index=int(2))
wandb.agent(sweep_id, function=train_func)

In [11]:
# Hand-copied LightGBM hyperparameters, one dict per target column index of
# `y`; the URLs point at the wandb runs they were presumably taken from.
# NOTE(review): every reg_alpha / reg_lambda below is larger than the 0.4
# upper bound declared for them in the sweep `config`. That is consistent with
# wandb's `log_uniform` sampling exp(min)..exp(max) — confirm the search
# covered the intended range.
target_0_config = { # https://wandb.ai/kjappelbaum/process_ml/runs/c0g28ueb/overview?workspace=user-kjappelbaum
    'n_estimators': 1428,
    'max_depth': 70,
    'num_leaves': 5,
    'reg_alpha': 1.382,
    'reg_lambda': 1.26,
    'subsample': 0.8367,
    'colsample_bytree': 0.522,
    'min_child_weight': 0.09979,
}

# NOTE(review): no source run is recorded for this config.
target_1_config = { 
    'n_estimators': 261,
    'max_depth': 88,
    'num_leaves': 272,
    'reg_alpha': 1.071,
    'reg_lambda': 1.036,
    'subsample': 0.4096,
    'colsample_bytree': 0.1534,
    'min_child_weight': 0.0117,
   # 'objective': 'huber'
}


target_2_config = { # https://wandb.ai/kjappelbaum/process_ml/runs/571rdxdm/overview?workspace=user-kjappelbaum
    'n_estimators': 228,
    'max_depth': 39,
    'num_leaves': 440,
    'reg_alpha': 1.456,
    'reg_lambda': 1.415,
    'subsample': 0.4801,
    'colsample_bytree': 0.3772,
    'min_child_weight': 0.01326,
    #'objective': 'huber'
}

In [12]:
from sklearn.ensemble import BaggingRegressor

In [13]:
# One bagging ensemble of 100 LightGBM regressors per target config.
# NOTE(review): the `xgb_` prefix is a misnomer (these wrap LGBMRegressor), and
# the co2 / 2amp / piperazine suffixes do not match TARGETS_clean, where
# columns 0 / 1 / 2 are 2-Amino-2-methylpropanol / Piperazine / Carbon dioxide
# — confirm which target each model is meant for.
xgb_co2 = BaggingRegressor(LGBMRegressor(**target_0_config), n_estimators=100)
xgb_2amp = BaggingRegressor(LGBMRegressor(**target_1_config), n_estimators=100)
xgb_piperazine = BaggingRegressor(LGBMRegressor(**target_2_config), n_estimators=100)

In [14]:
# Split the scaled features and all target columns 80% / 20%.
# NOTE(review): no random_state is passed, so the split — and every figure
# derived from it — differs between runs; consider fixing a seed.
X_train, X_test, y_train, y_test = train_test_split(X_, y, train_size=0.8)

In [15]:
# Fit each ensemble on one target column of `y` (0, 1 and 2 respectively).
# Per TARGETS_clean those columns are 2-Amino-2-methylpropanol, Piperazine and
# Carbon dioxide, which differs from what the variable names suggest.
xgb_co2.fit(X_train, y_train[:, 0])
xgb_2amp.fit(X_train, y_train[:, 1])
xgb_piperazine.fit(X_train, y_train[:, 2])

BaggingRegressor(base_estimator=LGBMRegressor(colsample_bytree=0.3772,
                                              max_depth=39,
                                              min_child_weight=0.01326,
                                              n_estimators=228, num_leaves=440,
                                              reg_alpha=1.456, reg_lambda=1.415,
                                              subsample=0.4801),
                 n_estimators=100)

In [16]:
def make_parity_plot(model, X_test, X_train, y_test_, y_train_, outname=None):
    """Draw side-by-side parity plots (true vs. predicted) for test and train data.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    X_test, X_train
        Feature matrices passed to ``model.predict``.
    y_test_, y_train_
        True target values for the rows of ``X_test`` / ``X_train``.
    outname : str or None
        If given, the figure is also saved to this path.

    Returns
    -------
    None
    """
    predictions_test = model.predict(X_test)
    predictions_train = model.predict(X_train)

    fig, ax = plt.subplots(1, 2, sharex=True, sharey=True)

    ax[0].set_ylabel(r'$\hat{y}$')
    ax[0].set_title('test')
    ax[1].set_title('train')

    ax[0].scatter(y_test_, predictions_test, s=10)
    ax[1].scatter(y_train_, predictions_train, s=10)

    # Both panels share their axes, so one range covers every point. Using the
    # same range for x and y makes the dashed line the true y = x diagonal;
    # joining (xmin, ymin) to (xmax, ymax) is only the diagonal of the axes box.
    x_lims = ax[0].get_xlim()
    y_lims = ax[0].get_ylim()
    lower = min(x_lims[0], y_lims[0])
    upper = max(x_lims[1], y_lims[1])

    for a in ax:
        a.spines['top'].set_color('none')
        a.spines['right'].set_color('none')
        for side in ('left', 'bottom'):
            spine = a.spines[side]
            # Spine.set_smart_bounds was removed in Matplotlib 3.4; only call
            # it where it still exists.
            if hasattr(spine, 'set_smart_bounds'):
                spine.set_smart_bounds(True)
        a.set_xlabel(r'$y_\mathrm{true}$')

        a.plot([lower, upper], [lower, upper], '--k')
        # Pin the limits so drawing the line does not re-trigger autoscaling.
        a.set_xlim(lower, upper)
        a.set_ylim(lower, upper)

    fig.tight_layout()

    if outname is not None:
        fig.savefig(outname, bbox_inches='tight')

In [17]:
# Parity plot for the ensemble fitted on target column 0.
make_parity_plot(xgb_co2, X_test, X_train, y_test[:,0], y_train[:,0])

In [18]:
# Parity plot for the ensemble fitted on target column 1.
make_parity_plot(xgb_2amp, X_test, X_train, y_test[:,1], y_train[:,1])

In [19]:
# Parity plot for the ensemble fitted on target column 2.
make_parity_plot(xgb_piperazine, X_test, X_train, y_test[:,2], y_train[:,2])

In [20]:
def make_parity_plot(model, X_test, X_train, y_test_, y_train_, outname=None):
    """Draw side-by-side parity plots (true vs. predicted) for test and train data.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    X_test, X_train
        Feature matrices passed to ``model.predict``.
    y_test_, y_train_
        True target values for the rows of ``X_test`` / ``X_train``.
    outname : str or None
        If given, the figure is also saved to this path.

    Returns
    -------
    None
    """
    predictions_test = model.predict(X_test)
    predictions_train = model.predict(X_train)

    fig, ax = plt.subplots(1, 2, sharex=True, sharey=True)

    ax[0].set_ylabel(r'$\hat{y}$')
    ax[0].set_title('test')
    ax[1].set_title('train')

    ax[0].scatter(y_test_, predictions_test, s=10)
    ax[1].scatter(y_train_, predictions_train, s=10)

    # Both panels share their axes, so one range covers every point. Using the
    # same range for x and y makes the dashed line the true y = x diagonal;
    # joining (xmin, ymin) to (xmax, ymax) is only the diagonal of the axes box.
    x_lims = ax[0].get_xlim()
    y_lims = ax[0].get_ylim()
    lower = min(x_lims[0], y_lims[0])
    upper = max(x_lims[1], y_lims[1])

    for a in ax:
        a.spines['top'].set_color('none')
        a.spines['right'].set_color('none')
        for side in ('left', 'bottom'):
            spine = a.spines[side]
            # Spine.set_smart_bounds was removed in Matplotlib 3.4; only call
            # it where it still exists.
            if hasattr(spine, 'set_smart_bounds'):
                spine.set_smart_bounds(True)
        a.set_xlabel(r'$y_\mathrm{true}$')

        # `s` is a scatter-only keyword and makes Axes.plot raise; the thin
        # line it was presumably meant to request is spelled `linewidth`.
        a.plot([lower, upper], [lower, upper], '--k', linewidth=1)
        # Pin the limits so drawing the line does not re-trigger autoscaling.
        a.set_xlim(lower, upper)
        a.set_ylim(lower, upper)

    fig.tight_layout()

    if outname is not None:
        fig.savefig(outname, bbox_inches='tight')

In [21]:
# Re-draw the column-0 parity plot with the redefined plotting function.
make_parity_plot(xgb_co2, X_test, X_train, y_test[:,0], y_train[:,0])

In [22]:
def make_parity_plot(model, X_test, X_train, y_test_, y_train_, outname=None):
    """Draw side-by-side parity plots (true vs. predicted) for test and train data.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    X_test, X_train
        Feature matrices passed to ``model.predict``.
    y_test_, y_train_
        True target values for the rows of ``X_test`` / ``X_train``.
    outname : str or None
        If given, the figure is also saved to this path.

    Returns
    -------
    None
    """
    predictions_test = model.predict(X_test)
    predictions_train = model.predict(X_train)

    fig, ax = plt.subplots(1, 2, sharex=True, sharey=True)

    ax[0].set_ylabel(r'$\hat{y}$')
    ax[0].set_title('test')
    ax[1].set_title('train')

    ax[0].scatter(y_test_, predictions_test, s=2)
    ax[1].scatter(y_train_, predictions_train, s=2)

    # Both panels share their axes, so one range covers every point. Using the
    # same range for x and y makes the dashed line the true y = x diagonal;
    # joining (xmin, ymin) to (xmax, ymax) is only the diagonal of the axes box.
    x_lims = ax[0].get_xlim()
    y_lims = ax[0].get_ylim()
    lower = min(x_lims[0], y_lims[0])
    upper = max(x_lims[1], y_lims[1])

    for a in ax:
        a.spines['top'].set_color('none')
        a.spines['right'].set_color('none')
        for side in ('left', 'bottom'):
            spine = a.spines[side]
            # Spine.set_smart_bounds was removed in Matplotlib 3.4; only call
            # it where it still exists.
            if hasattr(spine, 'set_smart_bounds'):
                spine.set_smart_bounds(True)
        a.set_xlabel(r'$y_\mathrm{true}$')

        a.plot([lower, upper], [lower, upper], '--k')
        # Pin the limits so drawing the line does not re-trigger autoscaling.
        a.set_xlim(lower, upper)
        a.set_ylim(lower, upper)

    fig.tight_layout()

    if outname is not None:
        fig.savefig(outname, bbox_inches='tight')

In [23]:
# Re-draw the column-0 parity plot with the smaller scatter markers.
make_parity_plot(xgb_co2, X_test, X_train, y_test[:,0], y_train[:,0])

In [24]:
# Second version of the per-target LightGBM hyperparameters: same values as
# the earlier definitions, now with the Huber objective enabled for all three.
# Re-binding these names replaces the earlier dicts.
# NOTE(review): every reg_alpha / reg_lambda below is larger than the 0.4
# upper bound declared for them in the sweep `config`. That is consistent with
# wandb's `log_uniform` sampling exp(min)..exp(max) — confirm the search
# covered the intended range.
target_0_config = { # https://wandb.ai/kjappelbaum/process_ml/runs/c0g28ueb/overview?workspace=user-kjappelbaum
    'n_estimators': 1428,
    'max_depth': 70,
    'num_leaves': 5,
    'reg_alpha': 1.382,
    'reg_lambda': 1.26,
    'subsample': 0.8367,
    'colsample_bytree': 0.522,
    'min_child_weight': 0.09979,
     'objective': 'huber'
}

# NOTE(review): no source run is recorded for this config.
target_1_config = { 
    'n_estimators': 261,
    'max_depth': 88,
    'num_leaves': 272,
    'reg_alpha': 1.071,
    'reg_lambda': 1.036,
    'subsample': 0.4096,
    'colsample_bytree': 0.1534,
    'min_child_weight': 0.0117,
    'objective': 'huber'
}


target_2_config = { # https://wandb.ai/kjappelbaum/process_ml/runs/571rdxdm/overview?workspace=user-kjappelbaum
    'n_estimators': 228,
    'max_depth': 39,
    'num_leaves': 440,
    'reg_alpha': 1.456,
    'reg_lambda': 1.415,
    'subsample': 0.4801,
    'colsample_bytree': 0.3772,
    'min_child_weight': 0.01326,
    'objective': 'huber'
}

In [25]:
from sklearn.ensemble import BaggingRegressor

In [26]:
# Rebuild the three bagging ensembles (100 LightGBM regressors each) from the
# current target configs, replacing the previously fitted models.
# NOTE(review): the `xgb_` prefix is a misnomer (these wrap LGBMRegressor), and
# the co2 / 2amp / piperazine suffixes do not match TARGETS_clean, where
# columns 0 / 1 / 2 are 2-Amino-2-methylpropanol / Piperazine / Carbon dioxide
# — confirm which target each model is meant for.
xgb_co2 = BaggingRegressor(LGBMRegressor(**target_0_config), n_estimators=100)
xgb_2amp = BaggingRegressor(LGBMRegressor(**target_1_config), n_estimators=100)
xgb_piperazine = BaggingRegressor(LGBMRegressor(**target_2_config), n_estimators=100)

In [27]:
# Draw a fresh 80% / 20% split, overwriting the earlier one.
# NOTE(review): no random_state is passed, so this split differs from the
# previous one and from run to run; results before and after this cell are
# therefore not directly comparable — consider fixing a seed.
X_train, X_test, y_train, y_test = train_test_split(X_, y, train_size=0.8)

In [28]:
# Fit each rebuilt ensemble on one target column of `y` (0, 1 and 2).
# Per TARGETS_clean those columns are 2-Amino-2-methylpropanol, Piperazine and
# Carbon dioxide, which differs from what the variable names suggest.
xgb_co2.fit(X_train, y_train[:, 0])
xgb_2amp.fit(X_train, y_train[:, 1])
xgb_piperazine.fit(X_train, y_train[:, 2])

BaggingRegressor(base_estimator=LGBMRegressor(colsample_bytree=0.3772,
                                              max_depth=39,
                                              min_child_weight=0.01326,
                                              n_estimators=228, num_leaves=440,
                                              objective='huber',
                                              reg_alpha=1.456, reg_lambda=1.415,
                                              subsample=0.4801),
                 n_estimators=100)

In [29]:
def make_parity_plot(model, X_test, X_train, y_test_, y_train_, outname=None):
    """Draw side-by-side parity plots (true vs. predicted) for test and train data.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    X_test, X_train
        Feature matrices passed to ``model.predict``.
    y_test_, y_train_
        True target values for the rows of ``X_test`` / ``X_train``.
    outname : str or None
        If given, the figure is also saved to this path.

    Returns
    -------
    None
    """
    predictions_test = model.predict(X_test)
    predictions_train = model.predict(X_train)

    fig, ax = plt.subplots(1, 2, sharex=True, sharey=True)

    ax[0].set_ylabel(r'$\hat{y}$')
    ax[0].set_title('test')
    ax[1].set_title('train')

    ax[0].scatter(y_test_, predictions_test, s=2)
    ax[1].scatter(y_train_, predictions_train, s=2)

    # Both panels share their axes, so one range covers every point. Using the
    # same range for x and y makes the dashed line the true y = x diagonal;
    # joining (xmin, ymin) to (xmax, ymax) is only the diagonal of the axes box.
    x_lims = ax[0].get_xlim()
    y_lims = ax[0].get_ylim()
    lower = min(x_lims[0], y_lims[0])
    upper = max(x_lims[1], y_lims[1])

    for a in ax:
        a.spines['top'].set_color('none')
        a.spines['right'].set_color('none')
        for side in ('left', 'bottom'):
            spine = a.spines[side]
            # Spine.set_smart_bounds was removed in Matplotlib 3.4; only call
            # it where it still exists.
            if hasattr(spine, 'set_smart_bounds'):
                spine.set_smart_bounds(True)
        a.set_xlabel(r'$y_\mathrm{true}$')

        a.plot([lower, upper], [lower, upper], '--k')
        # Pin the limits so drawing the line does not re-trigger autoscaling.
        a.set_xlim(lower, upper)
        a.set_ylim(lower, upper)

    fig.tight_layout()

    if outname is not None:
        fig.savefig(outname, bbox_inches='tight')

In [30]:
# Parity plot for the refitted ensemble on target column 0.
make_parity_plot(xgb_co2, X_test, X_train, y_test[:,0], y_train[:,0])

In [31]:
# Parity plot for the refitted ensemble on target column 1.
make_parity_plot(xgb_2amp, X_test, X_train, y_test[:,1], y_train[:,1])

In [32]:
# Parity plot for the refitted ensemble on target column 2.
make_parity_plot(xgb_piperazine, X_test, X_train, y_test[:,2], y_train[:,2])

In [33]:
# Repeat of the column-0 parity plot (same call as a few cells earlier).
make_parity_plot(xgb_co2, X_test, X_train, y_test[:,0], y_train[:,0])

In [34]:
# Repeat of the column-1 parity plot (same call as a few cells earlier).
make_parity_plot(xgb_2amp, X_test, X_train, y_test[:,1], y_train[:,1])

In [35]:
# Repeat of the column-2 parity plot (same call as a few cells earlier).
make_parity_plot(xgb_piperazine, X_test, X_train, y_test[:,2], y_train[:,2])

In [36]:
def train(index):
    """Run one trial: cross-validate a LightGBM model on one target.

    Starts a wandb run in the 'process_ml' project with the default
    hyperparameters below (when launched by a sweep agent, the sampled values
    are expected to take precedence — wandb behaviour, not visible here),
    scores an ``LGBMRegressor`` with 5-fold cross-validation on the
    module-level ``X_`` and column ``index`` of ``y``, and reports the mean and
    standard deviation of the fold MAE as ``cv_mean`` / ``cv_std``.
    No ``objective`` is set in this version, so LightGBM's own default applies.

    Parameters
    ----------
    index : int
        Column of the module-level ``y`` array to fit.

    Returns
    -------
    None
        Results are reported to wandb only.
    """
    # NOTE(review): LightGBM only applies `subsample` when `subsample_freq` > 0,
    # which is never set here, so this value (and the swept one) presumably has
    # no effect — confirm.
    defaults = {
        'n_estimators': 100,
        'max_depth': 10,
        'num_leaves': 50,
        'reg_alpha': 0.00001,
        'reg_lambda': 0.00001,
        'subsample': 0.2,
        'colsample_bytree': 0.2,
        'min_child_weight': 0.001,
    }

    # Initialize a new wandb run
    wandb.init(project='process_ml', config=defaults)

    # Named `run_config` so it does not shadow the module-level `config`
    # (the sweep search space).
    run_config = wandb.config

    regressor = LGBMRegressor(**run_config)

    scores = cross_val_score(regressor, X_, y[:, index], n_jobs=-1, cv=KFold(n_splits=5), scoring='neg_mean_absolute_error')

    # Scores are negated MAE, so the absolute value of their mean is the MAE.
    metrics = {'cv_mean': np.abs(scores.mean()), 'cv_std': scores.std()}

    # One log call keeps both metrics on the same step; separate calls would
    # each advance the step counter.
    wandb.log(metrics)

    for key, value in metrics.items():
        wandb.run.summary[key] = value

In [37]:
# Narrowed hyperparameter search space for the wandb sweep (read by
# `get_sweep_id` at call time); re-binding `config` replaces the earlier space.
#
# wandb's `log_uniform` distribution treats `min`/`max` as exponents: it draws
# exp(u) with u ~ Uniform(min, max). Passing the raw bounds 1e-5 and 0.4 would
# therefore sample from roughly [1.00001, 1.49] instead of [1e-5, 0.4], so the
# bounds for the regularisation terms are given in natural-log space.
# float() keeps the values plain Python floats so the sweep config serialises
# cleanly.
config = {
    'n_estimators': {
        'distribution': 'int_uniform',
        'min': 10,
        'max': 1000
    },
    'max_depth': {
        'distribution': 'int_uniform',
        'min': 5,
        'max': 100
    },
    'num_leaves': {
        'distribution': 'int_uniform',
        'min': 5,
        'max': 500
    },
    'reg_alpha': {
        'distribution': 'log_uniform',
        'min': float(np.log(0.00001)),  # samples start at 1e-5
        'max': float(np.log(0.4))       # samples end at 0.4
    },
    'reg_lambda': {
        'distribution': 'log_uniform',
        'min': float(np.log(0.00001)),  # samples start at 1e-5
        'max': float(np.log(0.4))       # samples end at 0.4
    },
    'subsample': {
        'distribution': 'uniform',
        'min': 0.4,
        'max': 1.0
    },
    'colsample_bytree': {
        'distribution': 'uniform',
        'min': 0.4,
        'max': 1.0
    },
    'min_child_weight': {
        'distribution': 'uniform',
        'min': 0.0001,
        'max': 0.1,
    },
}

In [38]:
def get_sweep_id(method):
    """Register a sweep in the 'process_ml' project and return its id.

    `method` is passed through as the sweep's search method. The search space
    is whatever the module-level `config` dict holds when this is called; the
    sweep minimises the `cv_mean` metric and uses hyperband early termination.
    """
    objective = {'name': 'cv_mean', 'goal': 'minimize'}
    stopping = {'type': 'hyperband', 's': 2, 'eta': 3, 'max_iter': 30}

    sweep_spec = dict(
        method=method,
        metric=objective,
        early_terminate=stopping,
        parameters=config,
    )
    return wandb.sweep(sweep_spec, project='process_ml')

In [39]:
def train(index):
    """Run one trial: cross-validate a LightGBM model on one target.

    Starts a wandb run in the 'process_ml' project with the default
    hyperparameters below (when launched by a sweep agent, the sampled values
    are expected to take precedence — wandb behaviour, not visible here),
    scores an ``LGBMRegressor`` with 5-fold cross-validation on the
    module-level ``X_`` and column ``index`` of ``y``, and reports the mean and
    standard deviation of the fold MAE as ``cv_mean`` / ``cv_std``.
    No ``objective`` is set in this version, so LightGBM's own default applies.

    Parameters
    ----------
    index : int
        Column of the module-level ``y`` array to fit.

    Returns
    -------
    None
        Results are reported to wandb only.
    """
    # NOTE(review): LightGBM only applies `subsample` when `subsample_freq` > 0,
    # which is never set here, so this value (and the swept one) presumably has
    # no effect — confirm.
    defaults = {
        'n_estimators': 100,
        'max_depth': 10,
        'num_leaves': 50,
        'reg_alpha': 0.00001,
        'reg_lambda': 0.00001,
        'subsample': 0.2,
        'colsample_bytree': 0.2,
        'min_child_weight': 0.001,
    }

    # Initialize a new wandb run
    wandb.init(project='process_ml', config=defaults)

    # Named `run_config` so it does not shadow the module-level `config`
    # (the sweep search space).
    run_config = wandb.config

    regressor = LGBMRegressor(**run_config)

    scores = cross_val_score(regressor, X_, y[:, index], n_jobs=-1, cv=KFold(n_splits=5), scoring='neg_mean_absolute_error')

    # Scores are negated MAE, so the absolute value of their mean is the MAE.
    metrics = {'cv_mean': np.abs(scores.mean()), 'cv_std': scores.std()}

    # One log call keeps both metrics on the same step; separate calls would
    # each advance the step counter.
    wandb.log(metrics)

    for key, value in metrics.items():
        wandb.run.summary[key] = value

In [40]:
# Create a Bayesian-search sweep over the current `config` space and run its
# trials on target column 0 of `y` ('2-Amino-2-methylpropanol C4H11NO' per
# TARGETS_clean).
# NOTE(review): no `count` is passed to wandb.agent, so this cell presumably
# keeps launching trials until it is interrupted — confirm, and consider
# bounding it.
sweep_id = get_sweep_id('bayes')
train_func = partial(train, index=int(0))
wandb.agent(sweep_id, function=train_func)