In [8]:
# Both training CSVs live under the same Cloud Storage prefix; spell the
# prefix once and derive the two object paths from it.
_DATASET_PREFIX = (
    'gs://vtx-root-path/569083142710/lightgbm-light-20220812090128/'
    'get-data_2079671818112729088'
)
dataset_xtrain = _DATASET_PREFIX + '/dataset_xtrain.csv'
dataset_ytrain = _DATASET_PREFIX + '/dataset_ytrain.csv'

In [11]:
def training(
    project_id: str,
    dataset_xtrain: str,
    dataset_ytrain: str,
    model_uri: str
    ):
    """Train a LightGBM classifier from two CSV files and pickle it locally.

    Hyperparameters are read from the process command line (``--n_estimators``,
    ``--learning_rate``, ``--num_leaves``, ``--max_depth``,
    ``--min_data_in_leaf``, ``--bagging_fraction``, ``--bagging_freq``,
    ``--feature_fraction``). Any flag that is not supplied is left out so that
    LightGBM's own default applies.

    Args:
        project_id: Only referenced by the disabled upload code at the end of
            this function; currently unused.
        dataset_xtrain: Path/URL of the feature CSV, passed to ``pd.read_csv``.
        dataset_ytrain: Path/URL of the label CSV; its ``diagnosis`` column is
            used as the target.
        model_uri: Only referenced by the disabled upload code at the end of
            this function; currently unused.

    Side effects:
        Writes the fitted model to ``/tmp/model.pkl``.
    """
    import pickle
    import argparse
    import pandas as pd
    import lightgbm as lgb

    hyperparameter_types = (
        ("n_estimators", int),
        ("learning_rate", float),
        ("num_leaves", int),
        ("max_depth", int),
        ("min_data_in_leaf", int),
        ("bagging_fraction", float),
        ("bagging_freq", int),
        ("feature_fraction", float),
    )
    parser = argparse.ArgumentParser()
    for option_name, option_type in hyperparameter_types:
        parser.add_argument(f"--{option_name}", type=option_type)
    # parse_known_args (not parse_args): inside a notebook kernel sys.argv
    # carries the kernel's own flags, which would otherwise abort the parser.
    args, _ignored = parser.parse_known_args()
    # Forward only the hyperparameters that were actually provided.
    hyperparameters = {
        name: value for name, value in vars(args).items() if value is not None
    }

    def train_model():
        """Fit and return an LGBMClassifier on the training CSVs."""
        X_train = pd.read_csv(dataset_xtrain)
        y_train = pd.read_csv(dataset_ytrain).diagnosis
        model = lgb.LGBMClassifier(**hyperparameters)
        model.fit(X_train, y_train)
        return model

    def evaluate_model(model, test_data, test_labels):
        """Compute accuracy on a test CSV and report it to hypertune.

        Not invoked by ``training`` itself; kept as a helper.
        """
        # Imported lazily so training alone does not require these packages.
        import hypertune
        from sklearn.metrics import accuracy_score

        dtest = pd.read_csv(test_data)
        pred = model.predict(dtest)
        predictions = [round(value) for value in pred]
        # evaluate predictions
        accuracy = accuracy_score(test_labels, predictions)
        print(f"[INFO] ------ Evaluation completed with model accuracy: {accuracy}")

        # report metric for hyperparameter tuning
        hpt = hypertune.HyperTune()
        hpt.report_hyperparameter_tuning_metric(
            hyperparameter_metric_tag='accuracy',
            metric_value=accuracy
            )
        return accuracy

    model = train_model()

    file_name = "/tmp/model.pkl"
    with open(file_name, 'wb') as file:
        pickle.dump(model, file)

    # Disabled: upload of the pickled model to the bucket named by model_uri.
    #from google.cloud import storage
    #storage_client = storage.Client(project=project_id)
    #_bucket = model_uri.split('/')[2]
    #_suffix = "/".join(model_uri.split('/')[3:]).rstrip("/")
    #bucket = storage_client.get_bucket(_bucket)
    #print(bucket)
    #print(_suffix)
    #blob = bucket.blob(f'{_suffix}/model.pkl')
    #blob.upload_from_filename('/tmp/model.pkl')

In [12]:
# Run the training step; the function defined in this notebook is `training`
# (calling `train` raises NameError on a fresh kernel).
training('jchavezar-demo', dataset_xtrain, dataset_ytrain, "test")



In [14]:
import pickle

# Reload the pickled model from the local scratch path.
# NOTE: pickle.load executes arbitrary code; only load files you produced.
with open('/tmp/model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

# Request payload holding two sample rows of five numeric features each.
req = {
    "instances": [
        [17.99, 10.38, 122.80, 1001.0, 0.11840],
        [23.4, 3.4, 200.4, 300.5, 0.086],
    ]
}

model.predict(req["instances"])

array([0, 1])

In [None]:
from optuna.integration import LightGBMPruningCallback

def objective(trial, X, y, n_splits=5):
    """Optuna objective: mean cross-validated log loss of a LightGBM classifier.

    Args:
        trial: Optuna trial used to sample hyperparameters and to drive the
            pruning callback.
        X: Feature table; rows are selected with ``X.iloc`` (e.g. a DataFrame).
        y: Binary labels aligned row-for-row with ``X`` (Series or array-like).
        n_splits: Number of stratified folds (defaults to 5).

    Returns:
        The mean ``log_loss`` over the validation folds.
    """
    # Function-scope imports so the cell runs on a fresh kernel regardless of
    # which earlier cells were executed.
    import numpy as np
    import lightgbm as lgbm
    from sklearn.metrics import log_loss
    from sklearn.model_selection import StratifiedKFold

    param_grid = {
        # "device_type": trial.suggest_categorical("device_type", ['gpu']),
        "n_estimators": trial.suggest_categorical("n_estimators", [10000]),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "num_leaves": trial.suggest_int("num_leaves", 20, 3000, step=20),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 200, 10000, step=100),
        "lambda_l1": trial.suggest_int("lambda_l1", 0, 100, step=5),
        "lambda_l2": trial.suggest_int("lambda_l2", 0, 100, step=5),
        "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15),
        "bagging_fraction": trial.suggest_float(
            "bagging_fraction", 0.2, 0.95, step=0.1
        ),
        "bagging_freq": trial.suggest_categorical("bagging_freq", [1]),
        "feature_fraction": trial.suggest_float(
            "feature_fraction", 0.2, 0.95, step=0.1
        ),
    }

    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=1121218)

    # The splitter yields positional indices. Plain `y[idx]` on a pandas Series
    # is label-based, so index a NumPy view of the labels instead.
    y_values = np.asarray(y)

    cv_scores = np.empty(n_splits)
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y_values)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y_values[train_idx], y_values[test_idx]

        model = lgbm.LGBMClassifier(objective="binary", **param_grid)
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_test, y_test)],
            eval_metric="binary_logloss",
            callbacks=[
                # Early stopping as a callback: the `early_stopping_rounds`
                # fit keyword is not accepted by newer LightGBM releases.
                lgbm.early_stopping(stopping_rounds=100),
                LightGBMPruningCallback(trial, "binary_logloss"),
            ],
        )
        preds = model.predict_proba(X_test)
        cv_scores[idx] = log_loss(y_test, preds)

    return np.mean(cv_scores)

In [None]:
%%writefile custom/trainer/task.py
import datetime
import os
import subprocess
import sys
import pandas as pd
import xgboost as xgb
import hypertune
import argparse
import logging

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

parser = argparse.ArgumentParser()
parser.add_argument('--model-dir', dest='model_dir',
                    default=os.getenv('AIP_MODEL_DIR'), type=str, help='Model dir.')
parser.add_argument("--dataset-data-url", dest="dataset_data_url",
                    type=str, help="Download url for the training data.")
parser.add_argument("--dataset-labels-url", dest="dataset_labels_url",
                    type=str, help="Download url for the training data labels.")
parser.add_argument("--boost-rounds", dest="boost_rounds",
                    default=20, type=int, help="Number of boosted rounds")
args = parser.parse_args()

logging.getLogger().setLevel(logging.INFO)

def get_data():
    """Fetch the data and label CSVs with gsutil and build the train/test split.

    The two sources come from ``args.dataset_data_url`` and
    ``args.dataset_labels_url`` and are copied to ``data.csv`` and
    ``labels.csv`` in the working directory.

    Returns:
        A tuple ``(dtrain, test_data, test_labels)``: the training portion as an
        ``xgb.DMatrix`` and the held-out 20% as NumPy arrays.

    Raises:
        subprocess.CalledProcessError: If a ``gsutil cp`` call exits non-zero.
    """
    logging.info("Downloading training data and labels from: {}, {}".format(args.dataset_data_url, args.dataset_labels_url))
    downloads = (
        (args.dataset_data_url, 'data.csv'),
        (args.dataset_labels_url, 'labels.csv'),
    )
    for source_url, local_name in downloads:
        # gsutil outputs everything to stderr so we divert it to stdout.
        # subprocess.STDOUT does this without needing sys.stdout to be a real file.
        subprocess.check_call(['gsutil', 'cp', source_url, local_name], stderr=subprocess.STDOUT)

    # Load data into pandas, then use `.values` to get NumPy arrays
    data = pd.read_csv('data.csv').values
    labels = pd.read_csv('labels.csv').values

    # Convert one-column 2D array into 1D array for use with XGBoost
    labels = labels.reshape((labels.size,))

    # Fixed random_state keeps the 80/20 split reproducible between runs.
    train_data, test_data, train_labels, test_labels = train_test_split(data, labels, test_size=0.2, random_state=7)

    # Load data into DMatrix object
    dtrain = xgb.DMatrix(train_data, label=train_labels)
    return dtrain, test_data, test_labels

def train_model(dtrain):
    """Train an XGBoost booster on ``dtrain`` and return it.

    Uses an empty parameter dict and runs ``args.boost_rounds`` boosting
    rounds, logging when training starts and finishes.
    """
    logging.info("Start training ...")
    booster = xgb.train({}, dtrain, num_boost_round=args.boost_rounds)
    logging.info("Training completed")
    return booster

def evaluate_model(model, test_data, test_labels):
    """Score ``model`` on the held-out rows and report accuracy to hypertune.

    Args:
        model: Trained model whose ``predict`` accepts an ``xgb.DMatrix``.
        test_data: Held-out feature rows, wrapped in an ``xgb.DMatrix`` here.
        test_labels: Ground-truth labels for ``test_data``.

    Returns:
        The accuracy computed by ``accuracy_score``.
    """
    raw_scores = model.predict(xgb.DMatrix(test_data))
    # Round every raw prediction to the nearest integer to get a class label.
    predictions = list(map(round, raw_scores))
    accuracy = accuracy_score(test_labels, predictions)
    logging.info(f"Evaluation completed with model accuracy: {accuracy}")

    # Publish the metric under the 'accuracy' tag for hyperparameter tuning.
    tuner = hypertune.HyperTune()
    tuner.report_hyperparameter_tuning_metric(
        metric_value=accuracy,
        hyperparameter_metric_tag='accuracy',
    )
    return accuracy


# Script entry flow: fetch data, train, evaluate, then persist the model and
# its accuracy under the model directory.
import json

# Fail before the expensive work if there is nowhere to write the artifacts.
if not args.model_dir:
    raise ValueError("No model directory: pass --model-dir or set AIP_MODEL_DIR.")

dtrain, test_data, test_labels = get_data()
model = train_model(dtrain)
accuracy = evaluate_model(model, test_data, test_labels)

# GCSFuse conversion: rewrite only the leading gs:// scheme to the /gcs/ mount.
gs_prefix = 'gs://'
gcsfuse_prefix = '/gcs/'
if args.model_dir.startswith(gs_prefix):
    args.model_dir = gcsfuse_prefix + args.model_dir[len(gs_prefix):]

# Create the model directory itself (not just its parent) so both writes
# below succeed, whether or not the path came from a gs:// URL.
os.makedirs(args.model_dir, exist_ok=True)

# Export the classifier to a file
gcs_model_path = os.path.join(args.model_dir, 'model.bst')
logging.info("Saving model artifacts to {}". format(gcs_model_path))
model.save_model(gcs_model_path)

logging.info("Saving metrics to {}/metrics.json". format(args.model_dir))
gcs_metrics_path = os.path.join(args.model_dir, 'metrics.json')
with open(gcs_metrics_path, "w") as f:
    # Write the actual value as valid JSON; the previous f-string emitted the
    # literal text "accuracy: {accuracy}".
    json.dump({'accuracy': float(accuracy)}, f)