## Package Installation & Importing

In [1]:
#Activate the 'titanic' package environment
using Pkg
Pkg.activate("titanic", shared=true)

[32m[1m  Activating[22m[39m project at `C:\Users\luked\.julia\environments\titanic`




Open up a Julia REPL and run the following:

```Julia
] add MLJ DataFrames BetaML
```

In [25]:
using MLJ, GLMakie
import DataFrames as DF

In [26]:
table = OpenML.load(42638)
df = DF.DataFrame(table)

Row,pclass,sex,age,sibsp,fare,cabin,embarked,survived
Unnamed: 0_level_1,Cat…,Cat…,Float64,Float64,Float64,Cat…?,Cat…?,Cat…
1,3,male,22.0,1.0,7.25,missing,S,0
2,1,female,38.0,1.0,71.2833,C85,C,1
3,3,female,26.0,0.0,7.925,missing,S,1
4,1,female,35.0,1.0,53.1,C123,S,1
5,3,male,35.0,0.0,8.05,missing,S,0
6,3,male,30.0,0.0,8.4583,missing,Q,0
7,1,male,54.0,0.0,51.8625,E46,S,0
8,3,male,2.0,3.0,21.075,missing,S,0
9,3,female,27.0,0.0,11.1333,missing,S,1
10,2,female,14.0,1.0,30.0708,missing,C,1


## Inspect Data

In [27]:
DF.describe(df)

Row,variable,mean,min,median,max,nmissing,eltype
Unnamed: 0_level_1,Symbol,Union…,Any,Union…,Any,Int64,Type
1,pclass,,1,,3,0,"CategoricalValue{String, UInt32}"
2,sex,,female,,male,0,"CategoricalValue{String, UInt32}"
3,age,29.7589,0.42,30.0,80.0,0,Float64
4,sibsp,0.523008,0.0,0.0,8.0,0,Float64
5,fare,32.2042,0.0,14.4542,512.329,0,Float64
6,cabin,,E31,,C148,687,"Union{Missing, CategoricalValue{String, UInt32}}"
7,embarked,,C,,S,2,"Union{Missing, CategoricalValue{String, UInt32}}"
8,survived,,0,,1,0,"CategoricalValue{String, UInt32}"


In [28]:
schema(df)

┌──────────┬─────────────────────────────────┬──────────────────────────────────
│[22m names    [0m│[22m scitypes                        [0m│[22m types                          [0m ⋯
├──────────┼─────────────────────────────────┼──────────────────────────────────
│ pclass   │ Multiclass{3}                   │ CategoricalValue{String, UInt32 ⋯
│ sex      │ Multiclass{2}                   │ CategoricalValue{String, UInt32 ⋯
│ age      │ Continuous                      │ Float64                         ⋯
│ sibsp    │ Continuous                      │ Float64                         ⋯
│ fare     │ Continuous                      │ Float64                         ⋯
│ cabin    │ Union{Missing, Multiclass{186}} │ Union{Missing, CategoricalValue ⋯
│ embarked │ Union{Missing, Multiclass{3}}   │ Union{Missing, CategoricalValue ⋯
│ survived │ Multiclass{2}                   │ CategoricalValue{String, UInt32 ⋯
└──────────┴─────────────────────────────────┴──────────────────────────────────


In [52]:
fig = Figure(resolution=(1920, 1080))

ax1 = fig[1,1] = Axis(fig,
                      #aspect = 1, targetlimits = BBox(-10, 10, -10, 10),
                      #title = "EDA", titlegap = 48, titlesize = 60,
                      #xautolimitmargin = (0,0), xgridwith=2, xticklabelsize = 36,
                      #xticks = LinearTicks(20), xticksize = 18,
                      #yautolimitmargin = (0, 0), ygridwidth = 2, yticklabelpad = 14,
                      #yticklabelsize = 36, yticks = LinearTicks(20), yticksize = 18
                      )

## Feature Engineering

### sibsp

In [29]:
coerce!(df, :sibsp => Count)

schema(df)

┌──────────┬─────────────────────────────────┬──────────────────────────────────
│[22m names    [0m│[22m scitypes                        [0m│[22m types                          [0m ⋯
├──────────┼─────────────────────────────────┼──────────────────────────────────
│ pclass   │ Multiclass{3}                   │ CategoricalValue{String, UInt32 ⋯
│ sex      │ Multiclass{2}                   │ CategoricalValue{String, UInt32 ⋯
│ age      │ Continuous                      │ Float64                         ⋯
│ sibsp    │ Count                           │ Int64                           ⋯
│ fare     │ Continuous                      │ Float64                         ⋯
│ cabin    │ Union{Missing, Multiclass{186}} │ Union{Missing, CategoricalValue ⋯
│ embarked │ Union{Missing, Multiclass{3}}   │ Union{Missing, CategoricalValue ⋯
│ survived │ Multiclass{2}                   │ CategoricalValue{String, UInt32 ⋯
└──────────┴─────────────────────────────────┴──────────────────────────────────


## Train & Test Splitting

In [30]:
df_train, df_test = partition(df, 0.7, rng=42)

println("Training rows = ", DF.nrow(df_train))
println("Test rows = ", DF.nrow(df_test))
println("Train/test split ratio = $(round(DF.nrow(df_train) / (DF.nrow(df_train) + DF.nrow(df_test)), digits=3))")

Training rows = 624
Test rows = 267
Train/test split ratio = 0.7


## Split into Variables and Target

In [31]:
y_train, X_train = unpack(df_train, ==(:survived), !=(:cabin))

(CategoricalArrays.CategoricalValue{String, UInt32}["0", "0", "1", "1", "0", "1", "1", "1", "1", "0"  …  "0", "1", "0", "0", "0", "0", "0", "0", "0", "0"], [1m624×6 DataFrame[0m
[1m Row [0m│[1m pclass [0m[1m sex    [0m[1m age     [0m[1m sibsp [0m[1m fare     [0m[1m embarked [0m
     │[90m Cat…   [0m[90m Cat…   [0m[90m Float64 [0m[90m Int64 [0m[90m Float64  [0m[90m Cat…?    [0m
─────┼────────────────────────────────────────────────────
   1 │ 1       male       52.0      1   79.65    S
   2 │ 1       male       31.0      1   52.0     S
   3 │ 1       female     49.0      1   76.7292  C
   4 │ 2       female     48.0      1   65.0     S
   5 │ 3       male       19.0      0    8.05    S
   6 │ 1       male       32.0      0   30.5     C
   7 │ 1       male       36.0      0  512.329   C
   8 │ 3       female     22.0      0    7.75    S
  ⋮  │   ⋮       ⋮        ⋮       ⋮       ⋮         ⋮
 618 │ 1       male       64.0      0   26.0     S
 619 │ 1       male 

In [32]:
schema(X_train)

┌──────────┬───────────────────────────────┬────────────────────────────────────
│[22m names    [0m│[22m scitypes                      [0m│[22m types                            [0m ⋯
├──────────┼───────────────────────────────┼────────────────────────────────────
│ pclass   │ Multiclass{3}                 │ CategoricalValue{String, UInt32}  ⋯
│ sex      │ Multiclass{2}                 │ CategoricalValue{String, UInt32}  ⋯
│ age      │ Continuous                    │ Float64                           ⋯
│ sibsp    │ Count                         │ Int64                             ⋯
│ fare     │ Continuous                    │ Float64                           ⋯
│ embarked │ Union{Missing, Multiclass{3}} │ Union{Missing, CategoricalValue{S ⋯
└──────────┴───────────────────────────────┴────────────────────────────────────
[36m                                                                1 column omitted[0m


In [33]:
scitype(y_train)

AbstractVector{Multiclass{2}}[90m (alias for [39m[90mAbstractArray{Multiclass{2}, 1}[39m[90m)[39m

In [34]:
y_test, X_test = unpack(df_test, ==(:survived), !=(:cabin) )

(CategoricalArrays.CategoricalValue{String, UInt32}["0", "0", "0", "0", "1", "0", "0", "0", "0", "0"  …  "0", "0", "0", "0", "1", "0", "0", "1", "0", "1"], [1m267×6 DataFrame[0m
[1m Row [0m│[1m pclass [0m[1m sex    [0m[1m age     [0m[1m sibsp [0m[1m fare    [0m[1m embarked [0m
     │[90m Cat…   [0m[90m Cat…   [0m[90m Float64 [0m[90m Int64 [0m[90m Float64 [0m[90m Cat…?    [0m
─────┼───────────────────────────────────────────────────
   1 │ 2       male       52.0      0  13.5     S
   2 │ 3       male       39.0      0  24.15    S
   3 │ 3       male       30.0      0   8.05    S
   4 │ 3       female     39.0      1  31.275   S
   5 │ 1       female     63.0      1  77.9583  S
   6 │ 1       male       54.0      0  51.8625  S
   7 │ 1       female     50.0      0  28.7125  C
   8 │ 3       male       18.0      1   6.4958  S
  ⋮  │   ⋮       ⋮        ⋮       ⋮       ⋮        ⋮
 261 │ 3       male       48.0      0   7.8542  S
 262 │ 1       female     22.0   

## Import and Construct Model

In [35]:
Tree = @load RandomForestClassifier pkg=BetaML
tree = Tree(max_depth=5)

import BetaML

┌ Info: For silent loading, specify `verbosity=0`. 
└ @ Main C:\Users\luked\.julia\packages\MLJModels\EkXIe\src\loading.jl:159


 ✔


RandomForestClassifier(
  n_trees = 30, 
  max_depth = 5, 
  min_gain = 0.0, 
  min_records = 2, 
  max_features = 0, 
  splitting_criterion = BetaML.Utils.gini, 
  β = 0.0, 
  rng = Random._GLOBAL_RNG())

## Train the Model

In [36]:
mach = machine(tree, X_train, y_train)

untrained Machine; caches model-specific representations of data
  model: RandomForestClassifier(n_trees = 30, …)
  args: 
    1:	Source @447 ⏎ Table{Union{AbstractVector{Continuous}, AbstractVector{Count}, AbstractVector{Multiclass{3}}, AbstractVector{Multiclass{2}}, AbstractVector{Union{Missing, Multiclass{3}}}}}
    2:	Source @337 ⏎ AbstractVector{Multiclass{2}}


In [37]:
fit!(mach)

┌ Info: Training machine(RandomForestClassifier(n_trees = 30, …), …).
└ @ MLJBase C:\Users\luked\.julia\packages\MLJBase\fEiP2\src\machines.jl:492


trained Machine; caches model-specific representations of data
  model: RandomForestClassifier(n_trees = 30, …)
  args: 
    1:	Source @447 ⏎ Table{Union{AbstractVector{Continuous}, AbstractVector{Count}, AbstractVector{Multiclass{3}}, AbstractVector{Multiclass{2}}, AbstractVector{Union{Missing, Multiclass{3}}}}}
    2:	Source @337 ⏎ AbstractVector{Multiclass{2}}


In [38]:
p_train = predict(mach, X_train)
p_test = predict(mach, X_test)

267-element CategoricalDistributions.UnivariateFiniteVector{Multiclass{2}, String, UInt32, Float64}:
 UnivariateFinite{Multiclass{2}}(0=>0.766, 1=>0.234)
 UnivariateFinite{Multiclass{2}}(0=>0.835, 1=>0.165)
 UnivariateFinite{Multiclass{2}}(0=>0.856, 1=>0.144)
 UnivariateFinite{Multiclass{2}}(0=>0.655, 1=>0.345)
 UnivariateFinite{Multiclass{2}}(0=>0.276, 1=>0.724)
 UnivariateFinite{Multiclass{2}}(0=>0.619, 1=>0.381)
 UnivariateFinite{Multiclass{2}}(0=>0.12, 1=>0.88)
 UnivariateFinite{Multiclass{2}}(0=>0.846, 1=>0.154)
 UnivariateFinite{Multiclass{2}}(0=>0.704, 1=>0.296)
 UnivariateFinite{Multiclass{2}}(0=>0.822, 1=>0.178)
 ⋮
 UnivariateFinite{Multiclass{2}}(0=>0.641, 1=>0.359)
 UnivariateFinite{Multiclass{2}}(0=>0.874, 1=>0.126)
 UnivariateFinite{Multiclass{2}}(0=>0.848, 1=>0.152)
 UnivariateFinite{Multiclass{2}}(0=>0.175, 1=>0.825)
 UnivariateFinite{Multiclass{2}}(0=>0.782, 1=>0.218)
 UnivariateFinite{Multiclass{2}}(0=>0.809, 1=>0.191)
 UnivariateFinite{Multiclass{2}}(0=>0.0825, 1=>0.9

### View prediction outputs in detail

In [39]:
p_train[6]

UnivariateFinite{Multiclass{2}}(0=>0.562, 1=>0.438)

In [40]:
p_test[6]

UnivariateFinite{Multiclass{2}}(0=>0.619, 1=>0.381)

In [41]:
pdf(p_train[6], "1")

0.43795473662639856

In [42]:
yhat_train = mode.(p_train)
yhat_test = mode.(p_test)

267-element CategoricalArrays.CategoricalArray{String,1,UInt32}:
 "0"
 "0"
 "0"
 "0"
 "1"
 "0"
 "1"
 "0"
 "0"
 "0"
 ⋮
 "0"
 "0"
 "0"
 "1"
 "0"
 "0"
 "1"
 "0"
 "1"

In [43]:
yhat_train[3:5]
#yhat_test[3:5]

3-element CategoricalArrays.CategoricalArray{String,1,UInt32}:
 "1"
 "1"
 "0"

## Evaluating Model Performance

In [44]:
accuracy(yhat_train, y_train)

0.8349358974358975

In [45]:
accuracy(yhat_test, y_test)

0.8239700374531835

## Tune Hyperparameters

In [46]:
r = range(tree, :max_depth, lower=0, upper=10)

NumericRange(0 ≤ max_depth ≤ 10; origin=5.0, unit=5.0)

In [47]:
tuned_tree = TunedModel(
    tree,
    tuning=Grid(),
    range=r,
    measure=accuracy,
    resampling=Holdout(fraction_train=0.7)
)

ProbabilisticTunedModel(
  model = RandomForestClassifier(
        n_trees = 30, 
        max_depth = 5, 
        min_gain = 0.0, 
        min_records = 2, 
        max_features = 0, 
        splitting_criterion = BetaML.Utils.gini, 
        β = 0.0, 
        rng = Random._GLOBAL_RNG()), 
  tuning = Grid(
        goal = nothing, 
        resolution = 10, 
        shuffle = true, 
        rng = Random._GLOBAL_RNG()), 
  resampling = Holdout(
        fraction_train = 0.7, 
        shuffle = false, 
        rng = Random._GLOBAL_RNG()), 
  measure = Accuracy(), 
  weights = nothing, 
  class_weights = nothing, 
  operation = nothing, 
  range = NumericRange(0 ≤ max_depth ≤ 10; origin=5.0, unit=5.0), 
  selection_heuristic = MLJTuning.NaiveSelection(nothing), 
  train_best = true, 
  repeats = 1, 
  n = nothing, 
  acceleration = CPU1{Nothing}(nothing), 
  acceleration_resampling = CPU1{Nothing}(nothing), 
  check_measure = true, 
  cache = true)

In [48]:
mach2 = machine(tuned_tree, X_train, y_train)
fit!(mach2)

┌ Info: Training machine(ProbabilisticTunedModel(model = RandomForestClassifier(n_trees = 30, …), …), …).
└ @ MLJBase C:\Users\luked\.julia\packages\MLJBase\fEiP2\src\machines.jl:492


┌ Info: Attempting to evaluate 10 models.
└ @ MLJTuning C:\Users\luked\.julia\packages\MLJTuning\nZnsJ\src\tuned_models.jl:727


[33mEvaluating over 10 metamodels:  20%[=====>                   ]  ETA: 0:00:17[39m[K

















trained Machine; does not cache data
  model: ProbabilisticTunedModel(model = RandomForestClassifier(n_trees = 30, …), …)
  args: 
    1:	Source @815 ⏎ Table{Union{AbstractVector{Continuous}, AbstractVector{Count}, AbstractVector{Multiclass{3}}, AbstractVector{Multiclass{2}}, AbstractVector{Union{Missing, Multiclass{3}}}}}
    2:	Source @949 ⏎ AbstractVector{Multiclass{2}}


In [49]:
fitted_params(mach2).best_model

RandomForestClassifier(
  n_trees = 30, 
  max_depth = 10, 
  min_gain = 0.0, 
  min_records = 2, 
  max_features = 0, 
  splitting_criterion = BetaML.Utils.gini, 
  β = 0.0, 
  rng = Random._GLOBAL_RNG())

## Final Model

In [50]:
yhat_test_2 = mode.(predict(mach2, X_test))
accuracy(yhat_test_2, y_test)

0.8389513108614233