# Gradient Boosting Regression

Demonstrate Gradient Boosting on the Boston housing dataset.

This example fits a Gradient Boosting model with least squares loss and 500 regression trees of depth 4.

This is a port to OCaml of the [scikit-learn gradient boosting regression example](https://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_regression.html#sphx-glr-auto-examples-ensemble-plot-gradient-boosting-regression-py).

In [None]:
#require "pyml"
#require "matplotlib"
#require "jupyter.notebook"
#require "shell"

In [None]:
open Matplotlib;;
(* Fetch the plot as PNG data via Mpl.plot_data and hand it to the notebook as a
   base64 image/png payload; the value returned by display is discarded. *)
let plot () =
  let png = Mpl.plot_data `png in
  let _display_id = Jupyter_notebook.display ~base64:true "image/png" png in
  ();;

(* Use the Agg backend for every figure drawn in this notebook. *)
let () = Mpl.set_backend Agg;;

In [None]:
(* Load np, scipy and sklearn from the current git checkout (build first with
   dune build @install): locate the repository root, add the install directory of
   each library to the toplevel search path, then load the archives. *)
let root = Shell.run_full "git" ["rev-parse"; "--show-toplevel"] |> String.trim;;
["np"; "scipy"; "sklearn"]
|> List.iter (fun lib ->
    Topdirs.dir_directory (root ^ "/_build/install/default/lib/" ^ lib));;
#load "np.cma";;
#load "scipy.cma";;
#load "sklearn.cma";;
module Np = Np.Numpy;;

In [None]:
(* Register Np.Obj.pp as the toplevel printer for Np.Obj values.
   It would be nice if the OCaml kernel installed printers automatically, like utop does. *)
#install_printer Np.Obj.pp;;
(* #install_printer Sklearn.Ensemble.GradientBoostingRegressor.pp;;*)

## Load data

In [None]:
(* The Python source does a custom split using shuffle(). The equivalent with
   train_test_split is simpler (see the commented-out version below), but it is nice
   to reproduce the numerical results and graphics of the original example. *)

(* (* the recommended simpler version *)
  let [@ocaml.warning "-8"] [x_train; x_test; y_train; y_test] =
  Sklearn.Model_selection.train_test_split [boston#data; boston#target] ~random_state:42 ~train_size:(`F 0.9);; *)

(* NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2, so
   this cell presumably needs an older scikit-learn on the Python side - confirm. *)
let boston = Sklearn.Datasets.load_boston();;

(* Shuffle features and targets together with a fixed seed. Warning 8 is silenced
   because the two-element list pattern is not exhaustive for the list type. *)
let [@ocaml.warning "-8"] [x; y] = Sklearn.Utils.shuffle [boston#data; boston#target] ~random_state:13;;
(* Split point: 90% of the size of the first axis of x, truncated to an int. *)
let offset = int_of_float @@ (float_of_int (Np.shape x).(0)) *. 0.9;;
(* Training part is the slice ending at offset, test part the slice starting at offset
   (presumably mirroring X[:offset] and X[offset:] of the Python source). *)
let x_train = Np.Ndarray.(get ~key:[slice ~j:offset ()] x);;
let y_train = Np.Ndarray.(get ~key:[slice ~j:offset ()] y);;
let x_test = Np.Ndarray.(get ~key:[slice ~i:offset ()] x);;
let y_test = Np.Ndarray.(get ~key:[slice ~i:offset ()] y);;

## Fit regression model

In [None]:
(* Extract the payload of an `F-tagged value; any other tag is treated as a
   programming error and trips an assertion. *)
let get_f tagged =
  match tagged with
  | `F payload -> payload
  | _ -> assert false;;

(* Number of boosting stages; also used later to size the per-stage score arrays. *)
let n_estimators = 500;;

module Gbr = Sklearn.Ensemble.GradientBoostingRegressor;;
(* Create the regressor with the example hyper-parameters, then fit it on the
   training split. *)
let clf =
  let regressor =
    Gbr.create ~n_estimators ~max_depth:4 ~min_samples_split:(`I 2)
      ~learning_rate:0.01 ~loss:`Ls ()
  in
  Gbr.fit ~x:x_train ~y:y_train regressor;;

(* Mean squared error of the fitted model on the test split: take the first element
   of the array returned by mean_squared_error and print it with four decimals. *)
let mse =
  let y_pred = Gbr.predict clf ~x:x_test in
  let errors = Sklearn.Metrics.mean_squared_error ~y_true:y_test ~y_pred () in
  (Np.Ndarray.to_float_array errors).(0)
in
Printf.printf "MSE: %.4f\n%!" mse;;

## Plot training deviance

In [None]:
(* Run [f ()]. When it raises a Python exception (Py.E), print the two values it
   carries to stdout and re-raise the same exception; otherwise return the result. *)
let protect f =
  match f () with
  | result -> result
  | exception (Py.E (a, b) as exc) ->
    Printf.printf "error: %s\n%s\n%!" (Py.Object.to_string a) (Py.Object.to_string b);
    raise exc

(* let call_loss ~y ~raw_predictions loss =
  Py.Callable.to_function loss [|Sklearn.Arr.to_pyobject y; Sklearn.Arr.to_pyobject raw_predictions|] |> Py.Float.to_float;; *)

(* TODO: contribute this to ocaml-matplotlib *)
(* Call the Python method set_yticks on the axes object underlying [ax], passing
   [ticks] as its single argument; the Python return value is dropped. *)
let set_yticks ax ticks =
  let py_ax = Ax.Expert.to_pyobject ax in
  ignore (Py.Module.get_function py_ax "set_yticks" [|Np.Obj.to_pyobject ticks|]);;
 
(* Call the Python method set_yticklabels on the axes object underlying [ax], passing
   [labels] as its single argument; the Python return value is dropped. *)
let set_yticklabels ax labels =
  let py_ax = Ax.Expert.to_pyobject ax in
  ignore (Py.Module.get_function py_ax "set_yticklabels" [|Np.Obj.to_pyobject labels|]);;

(* Horizontal bar plot on [ax]: calls the Python method barh with [y] and [width] as
   positional arguments and drops the Python return value.
   Python signature: Axes.barh(y, width, height=0.8, left=None, *, align='center', **kwargs) *)
let barh ax y width =
  let py_ax = Ax.Expert.to_pyobject ax in
  let py_args = [|Np.Obj.to_pyobject y; Np.Obj.to_pyobject width|] in
  ignore (Py.Module.get_function py_ax "barh" py_args);;

(* Plot deviance on the first axes and relative feature importance on the second. *)

(* Test-set loss per boosting stage: for every staged prediction on x_test, evaluate
   the loss of the fitted model against y_test and store it at the stage index. *)
let test_score =
  let score = Np.zeros ~dtype:(`S "float64") [n_estimators] in
  (* The fold threads the stage index; its final value is not needed. *)
  let (_ : int) = Seq.fold_left
    (fun i e -> Np.(Ndarray.set ~key:[`I i] ~value:(float (Gbr.loss_ clf y_test e)) score); succ i)
    0
    (Gbr.staged_predict clf ~x:x_test)
  in score
in

(* Only the two axes are used below; the figure handle itself is not needed. *)
let _fig, ax1, ax2 = Fig.create_with_two_axes ~figsize:(12., 6.) `horizontal in

(* Left panel: training and test deviance against the boosting iteration (1-based). *)
let xs = Array.init n_estimators (fun i -> float_of_int (succ i)) in
let train_score = Gbr.train_score_ clf |> Np.Ndarray.to_float_array in
Ax.set_title ax1 "Deviance";
Ax.plot ax1 ~label:"Training Set Deviance" ~linestyle:Solid ~xs train_score;
Ax.plot ax1 ~label:"Test Set Deviance" ~linestyle:Solid ~xs (Np.Ndarray.to_float_array test_score);
Ax.legend ax1;
Ax.set_xlabel ax1 "Boosting iterations";
Ax.set_ylabel ax1 "Deviance";

(* Right panel: feature importances rescaled so the largest is 100, drawn as
   horizontal bars in argsort order. *)
let feature_importance = Gbr.feature_importances_ clf in
let feature_importance = Np.((int 100) * feature_importance / (max feature_importance)) in
let sorted_idx = Np.argsort feature_importance in
(* Bar positions: 0.5, 1.5, ... one per feature. *)
let pos = Np.(arange (`I (shape sorted_idx).(0))) in
let pos = Np.(pos + (float 0.5)) in
barh ax2 pos Np.(Ndarray.get ~key:[mask sorted_idx] feature_importance);
set_yticks ax2 pos;
set_yticklabels ax2 Np.(Ndarray.get ~key:[mask sorted_idx] (Np.Ndarray.of_string_list boston#feature_names));
Ax.set_xlabel ax2 "Relative Importance";
(* The title is set once; the original set the same title twice. *)
Ax.set_title ax2 "Variable Importance";
plot ();;