In [1]:
from lightgbm import LGBMRegressor
import wandb
import pandas as pd 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from functools import partial
from sklearn.model_selection import cross_val_score, train_test_split, KFold
import numpy as np 

# Process measurement tags used as model features, in the column order fed to the model.
# Tags that are currently disabled (kept here for reference):
#   "valve-position-12"  (dry-bed)
#   "FI-38", "PI-28", "TI-28"  (stripper)
#   "TI-4"
MEAS_COLUMNS = [
    "TI-19", "FI-16", "TI-33", "FI-2", "FI-151", "TI-8", "FI-241",
    "FI-20", "FI-30", "TI-3", "FI-19", "FI-211", "FI-11",
    "TI-30", "PI-30", "TI-1213",
    "FI-23",
    "delta_t",
]

# Target columns; a single target is later selected by its position in this list.
TARGETS_clean = [
    '2-Amino-2-methylpropanol C4H11NO',
    'Piperazine C4H10N2',
    "Carbon dioxide CO2",
    "Ammonia NH3",
]

In [2]:
# Load the DataFrame stored next to the notebook.
# NOTE: unpickling executes arbitrary code, so only open pickle files from a trusted source.
DATA_FILE = 'df_dropped.pkl'
df = pd.read_pickle(DATA_FILE)

In [3]:
# Features stay a DataFrame restricted to the measurement columns.
X = df[MEAS_COLUMNS]
# Targets become a 2-D array with one column per entry of TARGETS_clean.
y = df[TARGETS_clean].values

In [4]:
# Standardizer for the feature columns; it is fitted on X in the following cell.
scaler = StandardScaler()

In [5]:
# Fit the scaler on every row of X, then emit the standardized feature matrix.
# NOTE(review): the statistics are computed on all rows before train() cross-validates
# on X_, so each validation fold has contributed to the scaling.
X_ = scaler.fit(X).transform(X)

In [6]:
# Search space for the W&B sweep: one entry per LGBMRegressor hyperparameter.
#
# W&B's `log_uniform` distribution interprets `min`/`max` as *exponents*: it draws
# exp(u) with u ~ Uniform(min, max). The regularisation bounds are therefore given
# in natural-log space so that the sampled values really span [1e-5, 0.4]. Passing
# the raw bounds would sample from about [1.00001, 1.49] instead.
# float(...) keeps the values plain Python floats so the config serialises cleanly.
config = {
    'n_estimators': {
        'distribution': 'int_uniform',
        'min': 10,
        'max': 5000
    },
    'max_depth': {
        'distribution': 'int_uniform',
        'min': 5,
        'max': 100
    },
    'num_leaves': {
        'distribution': 'int_uniform',
        'min': 5,
        'max': 500
    },
    'reg_alpha': {
        'distribution': 'log_uniform',
        'min': float(np.log(0.00001)),  # exp(min) == 1e-5
        'max': float(np.log(0.4))       # exp(max) == 0.4
    },
    'reg_lambda': {
        'distribution': 'log_uniform',
        'min': float(np.log(0.00001)),  # exp(min) == 1e-5
        'max': float(np.log(0.4))       # exp(max) == 0.4
    },
    'subsample': {
        'distribution': 'uniform',
        'min': 0.4,
        'max': 1.0
    },
    'colsample_bytree': {
        'distribution': 'uniform',
        'min': 0.01,
        'max': 1.0
    },
    'min_child_weight': {
        'distribution': 'uniform',
        'min': 0.001,
        'max': 0.1,
    },
}

In [7]:
def get_sweep_id(method):
    """Register a sweep in the 'process_ml' W&B project and return its id.

    `method` is forwarded as the sweep's search method. The sweep minimizes
    the `cv_mean` metric, uses hyperband early termination, and searches the
    module-level `config` parameter space.
    """
    objective = dict(name='cv_mean', goal='minimize')
    stopping_rule = dict(type='hyperband', s=2, eta=3, max_iter=30)

    return wandb.sweep(
        dict(
            method=method,
            metric=objective,
            early_terminate=stopping_rule,
            parameters=config,
        ),
        project='process_ml',
    )

In [8]:
def train(index):
    """Run one sweep trial: cross-validate an LGBMRegressor and report to W&B.

    Parameters
    ----------
    index : int
        Column of the module-level target matrix `y` to fit, i.e. the
        position of the target in `TARGETS_clean`.

    Side effects
    ------------
    Starts a W&B run in the 'process_ml' project and records `cv_mean` and
    `cv_std` (mean / standard deviation of the 5-fold mean absolute error),
    both as a logged step and in the run summary. Returns None.
    """
    # Default hyperparameters for the run; when launched by a sweep agent the
    # values chosen by the sweep take precedence over these.
    defaults = {
        'n_estimators': 100,
        'max_depth': 10,
        'num_leaves': 50,
        'reg_alpha': 0.00001,
        'reg_lambda': 0.00001,
        'subsample': 0.2,
        # LightGBM ignores `subsample` while subsample_freq is 0 (its default),
        # so bagging must be switched on for the swept `subsample` to matter.
        'subsample_freq': 1,
        'colsample_bytree': 0.2,
        'min_child_weight': 0.001,
    }

    # Initialize a new wandb run
    wandb.init(project='process_ml', config=defaults)

    # Named `run_config` to avoid shadowing the module-level sweep `config`.
    run_config = wandb.config
    run_config['objective'] = 'huber'

    regressor = LGBMRegressor(**run_config)

    fold_scores = cross_val_score(
        regressor,
        X_,
        y[:, index],
        n_jobs=-1,
        cv=KFold(n_splits=5),
        scoring='neg_mean_absolute_error',
    )

    # The scorer returns negated MAE; flip the sign so smaller is better.
    mean = np.abs(fold_scores.mean())
    std = np.abs(fold_scores.std())

    # Log both metrics in one call so they share a single step.
    wandb.log({'cv_mean': mean, 'cv_std': std})

    wandb.run.summary['cv_mean'] = mean
    wandb.run.summary['cv_std'] = std

In [9]:
# Create a Bayesian-search sweep, then let an agent run trials for the first target column.
sweep_id = get_sweep_id(method='bayes')
# wandb.agent calls the function with no arguments, so bind the target index up front.
train_func = partial(train, index=0)
wandb.agent(sweep_id, function=train_func)