## A simple binary classification training using LightGBM through Fairing

In [None]:
import os
from time import gmtime, strftime
import fairing
from fairing.frameworks import lightgbm

# Output containers are pushed to Google Container Registry (GCR), under the
# project name that fairing guesses from the environment.
# Any other docker container registry can be used instead of GCR.
GCP_PROJECT = fairing.cloud.gcp.guess_project_name()
DOCKER_REGISTRY = f"gcr.io/{GCP_PROJECT}/fairing-job"

## Data Preparation

# Dataset description: https://archive.ics.uci.edu/ml/datasets/HIGGS
dataset_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz"
# Local file names: the downloaded archive and the CSV it decompresses to.
dataset_gz_path = "HIGGS.csv.gz"
dataset_path = dataset_gz_path.split(".gz")[0]
# GCS locations derived from the project name.
gcs_bucket = "gs://{}-fairing".format(GCP_PROJECT)
# Fixed: the original called the non-existent str method `.fromat`, which
# raised AttributeError before any of the paths below were defined.
dataset_path_in_gcs = "{}/higgs-dataset".format(gcs_bucket)
train_gcs_path = "{}/train.csv".format(dataset_path_in_gcs)
validation_gcs_path = "{}/validation.csv".format(dataset_path_in_gcs)
eval_gcs_path = "{}/test.csv".format(dataset_path_in_gcs)

if not os.path.exists(dataset_path):
    !wget  -o {dataset_gz_path}
else:
    print("{} exists so not downloading.".format(dataset_gz_path))
if not os.path.exists(dataset_path):
    !tar -xvvf {dataset_gz_path}
else:
    print("{} exists so not extracting.".format(dataset_path))

### Create a GCS bucket for storing model output and predictions

In [None]:
# Creating a bucket for copying the trained model. 
# You can set gcs_bucket variable to an existing bucket name if that is desired.
gcs_bucket = "gs://{}-fairing".format(GCP_PROJECT)
!gsutil mb {gcs_bucket}

## Parallel training with tree_learner = "data"

In [None]:
# LightGBM configuration shared by the training runs below.
common_params = {
    # Task and model definition.
    "task": "train",
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "cross_entropy",
    "metric_freq": 1,
    # Tree and boosting hyper-parameters.
    "num_leaves": 31,
    "learning_rate": 0.05,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "n_estimators": 100,
    "is_training_metric": "true",
    # Input data: one validation file and four comma-separated training shards.
    "valid_data": "gs://caip-dexter-bugbash/lightgbm/higgs_wide/valid.csv",
    "train_data": ",".join(
        "gs://caip-dexter-bugbash/lightgbm/higgs_wide/train_{}.csv".format(shard)
        for shard in range(4)
    ),
    # Logging verbosity.
    "verbose": 1,
    "verbose_eval": 1,
}

In [None]:
# Serial run on a single machine.
# Fixed: `params` was never defined (only `common_params` exists), so the
# first assignment raised NameError. Start from a copy so the shared defaults
# in common_params stay untouched; the later training cells keep modifying
# this same `params` dict.
params = dict(common_params)
params["num_threads"] = 44
params["tree_learner"] = "serial"
params["num_machines"] = 1
# Override the sharded training input with the single higgs_wide_full file.
params['train_data'] = "gs://caip-dexter-bugbash/lightgbm/higgs_wide_full/train.csv"
# Timestamped (UTC) model path inside the bucket; the later cells read it back
# through params['model_output'].
params['model_output'] = "{}/lightgbm/example/model_{}.txt".format(
    gcs_bucket, strftime("%Y_%m_%d_%H_%M_%S", gmtime()))
lightgbm.execute(config=params,
                 base_image='gcr.io/caip-dexter-bugbash/lightgbm:latest',
                 docker_registry=DOCKER_REGISTRY,
                 cores_per_worker=2,
                 stream_log=True)

In [None]:
# Re-run training with the "feature" tree learner across 4 machines,
# reusing every other setting already stored in `params`.
params.update({
    "num_threads": 44,
    "tree_learner": "feature",
    "num_machines": 4,
})
# NOTE(review): the original cell carried a bare "59-8" note on the call
# below; its meaning is unclear - confirm or drop.
lightgbm.execute(
    config=params,
    base_image='gcr.io/caip-dexter-bugbash/lightgbm:latest',
    docker_registry=DOCKER_REGISTRY,
    cores_per_worker=94,
    stream_log=True,
)

In [None]:
# Re-run training with the "voting" tree learner across 4 machines,
# reusing every other setting already stored in `params`.
params.update({
    "num_threads": 44,
    "tree_learner": "voting",
    "num_machines": 4,
})
lightgbm.execute(
    config=params,
    base_image='gcr.io/caip-dexter-bugbash/lightgbm:latest',
    docker_registry=DOCKER_REGISTRY,
    cores_per_worker=94,
    stream_log=True,
)

In [None]:
# Re-run training with the "data" tree learner across 4 machines,
# reusing every other setting already stored in `params`.
params.update({
    "num_threads": 44,
    "tree_learner": "data",
    "num_machines": 4,
})
lightgbm.execute(
    config=params,
    base_image='gcr.io/caip-dexter-bugbash/lightgbm:latest',
    docker_registry=DOCKER_REGISTRY,
    cores_per_worker=94,
    stream_log=True,
)


## Let's look at the trained model

In [None]:
url = params['model_output']
model_name = os.path.split(url)[1]
!gsutil cp {url} /tmp/{model_name}
!cat /tmp/{model_name}

In [None]:
# Print the locally copied model file (same command as the last line of the previous cell).
!cat /tmp/{model_name}

## Running a prediction task using the trained model

In [None]:
# Configuration for the prediction job: reads test.csv, loads the model
# written by training, and stores the result in the bucket under a name
# derived from the model file name.
url = params['model_output']
model_name = os.path.basename(url)
predict_params = dict(
    task="predict",
    metric_freq=1,
    metric=params['metric'],
    data="gs://caip-dexter-bugbash/lightgbm/higgs_wide/test.csv",
    input_model=url,
    output_result="{}/lightgbm/example/prediction_result_{}".format(gcs_bucket, model_name),
)

In [None]:
# Launch the prediction job with the configuration built above.
lightgbm.execute(
    config=predict_params,
    base_image='gcr.io/caip-dexter-bugbash/lightgbm:latest',
    docker_registry=DOCKER_REGISTRY,
    cores_per_worker=90,
)

In [None]:
url = predict_params['output_result']
file_name = os.path.split(url)[1]
!gsutil cp {url} /tmp/{file_name}

In [None]:
import pandas as pd

# Load the header-less prediction file and report the mean and the number of
# non-null entries of its first column.
predictions = pd.read_csv("/tmp/{}".format(file_name), header=None)
column_means = predictions.mean()
column_counts = predictions.count()
print("Prediction mean: {}, count: {}".format(column_means[0], column_counts[0]))