# Analysing Supervised Learning Models With Imandra

Two of the most common tasks within supervised learning (and machine learning more generally) are classification and regression. In this notebook we show how two of the most common kinds of model used to perform these tasks, random forests and neural networks, can be analysed using Imandra. For each task we use a real-world benchmark dataset from the UCI Machine Learning Repository. We'll mostly be working with reals in this notebook so we'll start by installing a pretty printer so that we're not overrun with digits.

In [18]:
(* Printer for reals: writes the string produced by [Real.to_string_approx]
   to the formatter, then registers itself with the toplevel. *)
let pp_approx fmt r =
  r |> Real.to_string_approx |> CCFormat.fprintf fmt "%s"
[@@program]
#install_printer pp_approx

val pp_approx : CCFormat.t -> Q.t -> unit = <fun>


## Classification

In a classification task we want to learn to predict the label of a datapoint based on previous data. In the classic [Wisconsin Breast Cancer (Diagnostic) dataset](https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic)) the task is to predict whether the cancer is benign or malignant based on the features of cell nuclei. In the dataset we have the following variables:

```
1. ID number
2. Diagnosis (malignant or benign)
3-32. Real values for the mean, standard error, and the 'worst' value for each cell nucleus'
      a) Radius
      b) Texture
      c) Perimeter
      d) Area
      e) Smoothness
      f) Compactness
      g) Concavity
      h) Concave points
      i) Symmetry 
      j) Fractal dimension
```

As is standard practice we pre-process the data before learning. First we standardise each variable to have zero mean and unit variance, then remove all but one from sets of highly correlated variables, along with those that have low mutual information with respect to the target variable. The data is split into training (80%) and test (20%) sets and we use Scikit-Learn to learn a random forest of 3 decision trees of maximum depth 3. As this is a relatively straightforward problem even this simple model achieves a fairly high accuracy. Using a short Python script each tree is then converted to Imandra Modelling Language (IML) and can be reasoned about using Imandra.

In [1]:
(* First decision tree of the random forest (three levels of tests).
   Takes the seven features [f_0] .. [f_6] and returns the pair of reals
   stored at the leaf the input reaches.  Only [f_0], [f_2], [f_3] and [f_6]
   are tested; [f_1], [f_4] and [f_5] are accepted but unused.
   NOTE(review): the leaf pairs look like per-class training-sample counts
   (benign, malignant) exported from the learned tree -- confirm against the
   conversion script. *)
let tree_0 f_0 f_1 f_2 f_3 f_4 f_5 f_6 = let open Real in
  if f_2 <=. (-0.10815) then
    if f_0 <=. (0.26348) then
      if f_6 <=. (-0.06176) then
        (236.0, 1.0)
      else
        (17.0, 5.0)
    else
      if f_3 <=. (-0.54236) then
        (8.0, 2.0)
      else
        (3.0, 7.0)
  else
    if f_6 <=. (0.09812) then
      if f_6 <=. (-0.17063) then
        (24.0, 0.0)
      else
        (4.0, 2.0)
    else
      if f_2 <=. (2.65413) then
        (6.0, 128.0)
      else
        (7.0, 5.0);;

(* Second decision tree of the random forest (three levels of tests).
   Takes the seven features [f_0] .. [f_6] and returns the pair of reals
   stored at the leaf the input reaches.  Only [f_0], [f_1], [f_3], [f_5]
   and [f_6] are tested; [f_2] and [f_4] are accepted but unused.
   NOTE(review): the leaf pairs look like per-class training-sample counts
   (benign, malignant) -- confirm against the conversion script. *)
let tree_1 f_0 f_1 f_2 f_3 f_4 f_5 f_6 = let open Real in
  if f_5 <=. (-0.05799) then
    if f_0 <=. (0.68524) then
      if f_1 <=. (-0.83180) then
        (110.0, 3.0)
      else
        (137.0, 0.0)
    else
      if f_3 <=. (0.45504) then
        (1.0, 8.0)
      else
        (0.0, 7.0)
  else
    if f_0 <=. (-0.18668) then
      if f_6 <=. (0.45214) then
        (39.0, 0.0)
      else
        (2.0, 11.0)
    else
      if f_6 <=. (-0.00009) then
        (8.0, 4.0)
      else
        (5.0, 120.0);;

(* Third decision tree of the random forest (three levels of tests).
   Takes the seven features [f_0] .. [f_6] and returns the pair of reals
   stored at the leaf the input reaches.  Only [f_2], [f_3], [f_5] and [f_6]
   are tested; [f_0], [f_1] and [f_4] are accepted but unused.
   NOTE(review): the leaf pairs look like per-class training-sample counts
   (benign, malignant) -- confirm against the conversion script. *)
let tree_2 f_0 f_1 f_2 f_3 f_4 f_5 f_6 = let open Real in
  if f_2 <=. (0.10459) then
    if f_5 <=. (-0.38015) then
      if f_5 <=. (-0.60659) then
        (139.0, 1.0)
      else
        (44.0, 3.0)
    else
      if f_6 <=. (-0.07927) then
        (38.0, 2.0)
      else
        (25.0, 17.0)
  else
    if f_6 <=. (0.46888) then
      if f_3 <=. (0.41642) then
        (28.0, 3.0)
      else
        (1.0, 4.0)
    else
      if f_2 <=. (1.74327) then
        (3.0, 122.0)
      else
        (4.0, 21.0);;

(* Random forest: evaluates all three trees on the same feature tuple and
   adds their leaf pairs component-wise. *)
let rf (f_0, f_1, f_2, f_3, f_4, f_5, f_6) =
  let open Real in
  let (u_0, v_0) = tree_0 f_0 f_1 f_2 f_3 f_4 f_5 f_6 in
  let (u_1, v_1) = tree_1 f_0 f_1 f_2 f_3 f_4 f_5 f_6 in
  let (u_2, v_2) = tree_2 f_0 f_1 f_2 f_3 f_4 f_5 f_6 in
  (u_0 + u_1 + u_2, v_0 + v_1 + v_2);;

val tree_0 : real -> 'a -> real -> real -> 'b -> 'c -> real -> Q.t * Q.t =
  <fun>
val tree_1 : real -> real -> 'a -> real -> 'b -> real -> real -> Q.t * Q.t =
  <fun>
val tree_2 : 'a -> 'b -> real -> real -> 'c -> real -> real -> Q.t * Q.t =
  <fun>
val rf : real * real * real * real * 'a * real * real -> real * real = <fun>


We can create a custom input type in Imandra for our model, so that we can keep track of the different features of our data.

In [2]:
(* Named input record for the random-forest model: the seven cell-nucleus
   measurements, in the order in which they are passed to the trees. *)
type rf_input = {
  radius_mean : real;           (* mean radius *)
  compactness_mean : real;      (* mean compactness *)
  concavity_mean : real;        (* mean concavity *)
  radius_se : real;             (* standard error of the radius *)
  compactness_worst : real;     (* 'worst' compactness *)
  concavity_worst : real;       (* 'worst' concavity *)
  concave_points_worst : real;  (* 'worst' concave points *)
};;

type rf_input = {
  radius_mean : real;
  compactness_mean : real;
  concavity_mean : real;
  radius_se : real;
  compactness_worst : real;
  concavity_worst : real;
  concave_points_worst : real;
}


However, remember that we also scaled our data. To make things easier we'll add in a function applying this transformation to each input variable. Here we simply use some multiplicative and additive scaling values extracted during our data pre-processing stage. After that we can define a full model which combines these pre/post-processing steps and the random forest.

In [3]:
(* Rescales each raw field of an [rf_input] as (value - offset) / scale and
   returns the seven results as the feature tuple expected by [rf]. *)
let process_rf_input input =
  let open Real in
  ( (input.radius_mean          - 14.12729) / 3.52405,
    (input.compactness_mean     - 0.10434)  / 0.05281,
    (input.concavity_mean       - 0.08880)  / 0.07972,
    (input.radius_se            - 0.40517)  / 0.27731,
    (input.compactness_worst    - 0.25427)  / 0.15734,
    (input.concavity_worst      - 0.27219)  / 0.20862,
    (input.concave_points_worst - 0.11461)  / 0.06573 );;

(* Turns the forest's summed pair into a label: "benign" when the first
   component is strictly greater than the second, "malignant" otherwise
   (ties included). *)
let process_rf_output (a, b) =
  if a >. b then "benign" else "malignant";;

(* Full model: rescale the raw input, run the forest, label the result. *)
let rf_model input =
  process_rf_output (rf (process_rf_input input));;

val process_rf_input :
  rf_input -> real * real * real * real * real * real * real = <fun>
val process_rf_output : real * real -> string = <fun>
val rf_model : rf_input -> string = <fun>


As our model is fully executable we can query it, find counterexamples, prove properties, apply logical side-conditions, decompose its regions, and more. As a quick sanity check to make sure everything is working, let's run a datum from our dataset through the model. In particular, we'll input `(17.99, 0.2776, 0.3001, 1.095, 0.6656, 0.7119, 0.2654)`, which is classified as `malignant` in the data.

In [4]:
(* A datum from the dataset that is labelled malignant, with the values
   given in the order of the [rf_input] fields. *)
let x = {
  radius_mean = 17.99;
  compactness_mean = 0.2776;
  concavity_mean = 0.3001;
  radius_se = 1.095;
  compactness_worst = 0.6656;
  concavity_worst = 0.7119;
  (* Previously 0.7119, a copy of [concavity_worst]; the datum's value is
     0.2654. *)
  concave_points_worst = 0.2654;
}

(* Run the datum through the full model. *)
let y = rf_model x;;

val x : rf_input =
  {radius_mean = 1799/100; compactness_mean = 347/1250;
   concavity_mean = 3001/10000; radius_se = 219/200;
   compactness_worst = 416/625; concavity_worst = 7119/10000;
   concave_points_worst = 1327/5000}
val y : string = "malignant"


Great, just what we'd expect. Now we'll use Imandra to generate an example datapoint for us, given that the diagnosis is `benign`.

In [31]:
(* Ask Imandra for any input that the model classifies as "benign"; the
   witness it finds is made available as [CX.x]. *)
instance (fun x -> rf_model x = "benign");;

- : rf_input -> bool = <fun>
module CX : sig val x : rf_input end


0,1
ground_instances,0
definitions,0
inductions,0
search_time,0.025s
details,"Expandsmt_statsnum checks1arith assert lower7arith pivots3rlimit count2733mk clause65datatype occurs check25seq add axiom2mk bool var158arith assert upper15decisions22seq num reductions2propagations43datatype accessor ax28datatype constructor ax4num allocs1842887098final checks1added eqs93del clause1arith eq adapter6memory42.710000max memory43.380000 require(['nbextensions/nbimandra/fold'], function (fold) {  var target = '#fold-1b715eb2-8c33-47a1-a143-a5529f71688d';  fold.hydrate(target); });"

0,1
smt_stats,num checks1arith assert lower7arith pivots3rlimit count2733mk clause65datatype occurs check25seq add axiom2mk bool var158arith assert upper15decisions22seq num reductions2propagations43datatype accessor ax28datatype constructor ax4num allocs1842887098final checks1added eqs93del clause1arith eq adapter6memory42.710000max memory43.380000

0,1
num checks,1.0
arith assert lower,7.0
arith pivots,3.0
rlimit count,2733.0
mk clause,65.0
datatype occurs check,25.0
seq add axiom,2.0
mk bool var,158.0
arith assert upper,15.0
decisions,22.0

0,1
into,(if <=.  (+.  (+.  (if <=. :var_0:.concavity_mean 40089141/500000000  then  if <=. :var_0:.radius_mean 7527903347/500000000  then  if <=. :var_0:.concave_points_worst 8636759/78125000 then …  else …  else if <=. :var_0:.radius_se 636920371/2500000000 then … else …  else  if <=. :var_0:.concave_points_worst 302648569/2500000000  then  if <=. :var_0:.concave_points_worst 1033944901/10000000000 then …  else …  else  if <=. :var_0:.concavity_mean 750968109/2500000000 then … else …).0  (if <=. :var_0:.concavity_worst 1300460631/5000000000  then  if <=. :var_0:.radius_mean 8271055011/500000000  then  if <=. :var_0:.compactness_mean 30206321/500000000 then …  else …  else if <=. :var_0:.radius_se 166049107/312500000 then … else …  else  if <=. :var_0:.radius_mean 6734710173/500000000  then  if <=. :var_0:.concave_points_worst 721645811/5000000000 then …  else …  else  if <=. :var_0:.concave_points_worst 1146040843/10000000000 then …  else …).0)  (if <=. :var_0:.concavity_mean 242844787/2500000000  then  if <=. :var_0:.concavity_worst 192883107/1000000000  then  if <=. :var_0:.concavity_worst 728215971/5000000000 then …  else …  else  if <=. :var_0:.concave_points_worst 1093995829/10000000000 then …  else …  else  if <=. :var_0:.concave_points_worst 181786853/1250000000  then if … then … else … else …).0)  …  then … else …) = …
expansions,[]
rewrite_steps,
forward_chaining,


In [7]:
(* Show the witness produced by the preceding [instance] call. *)
CX.x;;

- : rf_input =
{radius_mean = -8082.94419331; compactness_mean = -280.939587358;
 concavity_mean = -1141.91982172; radius_se = 5853.; compactness_worst = 9.;
 concavity_worst = -448.854356806; concave_points_worst = -608.889449485}


This looks a bit funny however; notice how the unspecified input variables are unbounded in a way that doesn't make sense with respect to the data. In general we might only care about the performance of our model when some reasonable bounds are placed on the input (for example, the mean radius can't be negative, and if the maximum value in our dataset is 28.11 we wouldn't really expect any value greater than, say, 35). Using the description of each variable in the dataset we can form a condition describing valid and reasonable inputs to our model. In machine learning more generally, we are typically only interested in the performance and quality of a model over some particular distribution of data, which we often have particular prior beliefs about.

In [6]:
(* True exactly when every field of the input lies within its closed
   [lower, upper] range of plausible values. *)
let is_valid_rf input =
  5.0 <=. input.radius_mean && input.radius_mean <=. 35.0
  && 0.0 <=. input.compactness_mean && input.compactness_mean <=. 0.4
  && 0.0 <=. input.concavity_mean && input.concavity_mean <=. 0.5
  && 0.0 <=. input.radius_se && input.radius_se <=. 3.5
  && 0.0 <=. input.compactness_worst && input.compactness_worst <=. 1.2
  && 0.0 <=. input.concavity_worst && input.concavity_worst <=. 1.5
  && 0.0 <=. input.concave_points_worst && input.concave_points_worst <=. 0.35;;
    
(* Same query as before, now restricted to inputs that pass [is_valid_rf]. *)
instance (fun x -> rf_model x = "benign" && is_valid_rf x);;

val is_valid_rf : rf_input -> bool = <fun>
- : rf_input -> bool = <fun>
module CX : sig val x : rf_input end


0,1
ground_instances,0
definitions,0
inductions,0
search_time,0.027s
details,"Expandsmt_statsnum checks1arith assert lower14arith pivots3rlimit count3215mk clause65datatype occurs check25seq add axiom2mk bool var172arith assert upper22decisions22seq num reductions2propagations43datatype accessor ax28datatype constructor ax4num allocs979101837final checks1added eqs93del clause1arith eq adapter6memory25.910000max memory26.570000 require(['nbextensions/nbimandra/fold'], function (fold) {  var target = '#fold-dd887eeb-7ac7-4ea2-a539-0a01420dc803';  fold.hydrate(target); });"

0,1
smt_stats,num checks1arith assert lower14arith pivots3rlimit count3215mk clause65datatype occurs check25seq add axiom2mk bool var172arith assert upper22decisions22seq num reductions2propagations43datatype accessor ax28datatype constructor ax4num allocs979101837final checks1added eqs93del clause1arith eq adapter6memory25.910000max memory26.570000

0,1
num checks,1.0
arith assert lower,14.0
arith pivots,3.0
rlimit count,3215.0
mk clause,65.0
datatype occurs check,25.0
seq add axiom,2.0
mk bool var,172.0
arith assert upper,22.0
decisions,22.0

0,1
into,((((((((((((((if <=.  (+.  (+.  (if <=. :var_0:.concavity_mean 40089141/500000000  then  if <=. :var_0:.radius_mean 7527903347/500000000  then  if <=. :var_0:.concave_points_worst 8636759/78125000  then … else …  else  if <=. :var_0:.radius_se 636920371/2500000000 then …  else …  else  if <=. :var_0:.concave_points_worst 302648569/2500000000  then  if <=. :var_0:.concave_points_worst  1033944901/10000000000  then … else …  else  if <=. :var_0:.concavity_mean 750968109/2500000000  then … else …).0  (if <=. :var_0:.concavity_worst 1300460631/5000000000  then  if <=. :var_0:.radius_mean 8271055011/500000000  then  if <=. :var_0:.compactness_mean 30206321/500000000  then … else …  else  if <=. :var_0:.radius_se 166049107/312500000 then …  else …  else  if <=. :var_0:.radius_mean 6734710173/500000000  then  if <=. :var_0:.concave_points_worst  721645811/5000000000  then … else …  else  if <=. :var_0:.concave_points_worst  1146040843/10000000000  then … else …).0)  (if <=. :var_0:.concavity_mean 242844787/2500000000  then  if <=. :var_0:.concavity_worst 192883107/1000000000  then  if <=. :var_0:.concavity_worst 728215971/5000000000  then … else …  else  if <=. :var_0:.concave_points_worst  1093995829/10000000000  then … else …  else  if <=. :var_0:.concave_points_worst 181786853/1250000000  then if … then … else … else …).0)  …  then … else …)  = … && …)  && …)  && …)  && …)  && …)  && …)  && …)  && …)  && …)  && …)  && …)  && …)  && …) && …
expansions,[]
rewrite_steps,
forward_chaining,


In [33]:
(* Show the witness produced by the bounded [instance] query. *)
CX.x

- : rf_input =
{radius_mean = 5.61239862766; compactness_mean = 0.0016975952402;
 concavity_mean = 0.0649283727636; radius_se = 2.04855;
 compactness_worst = 0.0642; concavity_worst = 0.0166324527776;
 concave_points_worst = 0.00496371813248}


This looks much better. Now let's move on to reasoning about our model in more interesting ways. One thing we can do is check the validity of certain constraints we might want our model to satisfy. For example, if the surface of a cell nucleus has many, large concave sections then this is a particularly negative sign, indicating that the cancer is likely to be malignant. We can use Imandra to easily verify that our model always classifies a sample of highly concave cells as `malignant`.

In [7]:
(* Property: every valid input whose three concavity measurements are at or
   above the given thresholds is classified as "malignant". *)
verify (fun x ->
  (is_valid_rf x
   && x.concavity_mean >=. 0.4
   && x.concavity_worst >=. 1.0
   && x.concave_points_worst >=. 0.25)
  ==> (rf_model x = "malignant"))

- : rf_input -> bool = <fun>


0,1
ground_instances,0
definitions,0
inductions,0
search_time,0.030s
details,"Expandsmt_statsnum checks1arith assert lower30arith pivots2rlimit count3220mk clause56seq add axiom2mk bool var181arith assert upper16decisions9seq num reductions2propagations28conflicts2datatype accessor ax28arith conflicts2datatype constructor ax4num allocs1043846647added eqs99del clause36arith eq adapter8memory29.030000max memory29.620000 require(['nbextensions/nbimandra/fold'], function (fold) {  var target = '#fold-1a5ec2ce-4460-4459-8648-7960db78b3cc';  fold.hydrate(target); });"

0,1
smt_stats,num checks1arith assert lower30arith pivots2rlimit count3220mk clause56seq add axiom2mk bool var181arith assert upper16decisions9seq num reductions2propagations28conflicts2datatype accessor ax28arith conflicts2datatype constructor ax4num allocs1043846647added eqs99del clause36arith eq adapter8memory29.030000max memory29.620000

0,1
num checks,1.0
arith assert lower,30.0
arith pivots,2.0
rlimit count,3220.0
mk clause,56.0
seq add axiom,2.0
mk bool var,181.0
arith assert upper,16.0
decisions,9.0
seq num reductions,2.0

0,1
into,not ((((((((((((((((<=. 5 :var_0:.radius_mean && <=. :var_0:.radius_mean 35)  && <=. 0 :var_0:.compactness_mean)  && <=. :var_0:.compactness_mean 2/5)  && <=. 0 :var_0:.concavity_mean)  && <=. :var_0:.concavity_mean 1/2)  && <=. 0 :var_0:.radius_se)  && <=. :var_0:.radius_se 7/2)  && <=. 0 :var_0:.compactness_worst)  && <=. :var_0:.compactness_worst 6/5)  && <=. 0 :var_0:.concavity_worst)  && <=. :var_0:.concavity_worst 3/2)  && <=. 0 :var_0:.concave_points_worst)  && <=. :var_0:.concave_points_worst 7/20)  && >=. :var_0:.concavity_mean 2/5)  && >=. :var_0:.concavity_worst 1)  && >=. :var_0:.concave_points_worst 1/4) || (if <=.  (+.  (+.  (if <=. :var_0:.concavity_mean 40089141/500000000  then  if <=. :var_0:.radius_mean 7527903347/500000000  then  if <=. :var_0:.concave_points_worst 8636759/78125000 then …  else …  else  if <=. :var_0:.radius_se 636920371/2500000000 then … else …  else  if <=. :var_0:.concave_points_worst 302648569/2500000000  then  if <=. :var_0:.concave_points_worst 1033944901/10000000000  then … else …  else  if <=. :var_0:.concavity_mean 750968109/2500000000 then …  else …).0  (if <=. :var_0:.concavity_worst 1300460631/5000000000  then  if <=. :var_0:.radius_mean 8271055011/500000000  then  if <=. :var_0:.compactness_mean 30206321/500000000 then …  else …  else  if <=. :var_0:.radius_se 166049107/312500000 then … else …  else  if <=. :var_0:.radius_mean 6734710173/500000000  then  if <=. :var_0:.concave_points_worst 721645811/5000000000 then … else …  else  if <=. :var_0:.concave_points_worst 1146040843/10000000000 then … else …).0)  (if <=. :var_0:.concavity_mean 242844787/2500000000  then  if <=. :var_0:.concavity_worst 192883107/1000000000  then  if <=. :var_0:.concavity_worst 728215971/5000000000 then …  else …  else  if <=. :var_0:.concave_points_worst 1093995829/10000000000  then … else …  else  if <=. :var_0:.concave_points_worst 181786853/1250000000  then if … then … else … else …).0)  …  then … else …)  = …
expansions,[]
rewrite_steps,
forward_chaining,


The nested `if ... then ... else` statements in how the trees are defined mean that they are a prime candidate for Imandra's region decomposition functionality. As well as the total model we can of course also decompose the individual trees making up the ensemble.

In [43]:
(* Region decomposition of the full model, under the assumption that the
   input satisfies [is_valid_rf]. *)
Decompose.top ~assuming:"is_valid_rf" "rf_model"

- : Imandra_interactive.Decompose.t list =
[<region>; <region>; <region>; <region>; <region>; <region>; <region>;
 <region>; <region>; <region>; <region>; <region>; <region>; <region>;
 <region>; <region>; <region>; <region>; <region>; <region>; <region>;
 <region>; <region>; <region>; <region>; <region>; <region>; <region>;
 <region>; <region>; <region>; <region>; <region>; <region>; <region>;
 <region>; <region>; <region>; <region>; <region>; <region>; <region>;
 <region>; <region>; <region>; <region>; <region>; <region>; <region>;
 <region>; <region>; <region>; <region>; <region>; <region>; <region>;
 <region>; <region>; <region>; <region>; <region>; <region>; <region>;
 <region>; <region>; <region>; <region>; <region>; <region>; <region>;
 <region>; <region>; <region>; <region>; <region>; <region>; <region>;
 <region>; <region>; <region>; <region>; <region>; <region>; <region>;
 <region>; <region>; <region>; <region>; <region>; <region>; <region>;
 <region>; <region>; <region>; <re

Constraints,Invariant
not (<=. input.concavity_mean 40089141/500000000)not (<=. input.concave_points_worst 1033944901/10000000000)not (<=. input.concavity_worst 1300460631/5000000000)not (<=. input.concave_points_worst 721645811/5000000000)not (<=. input.concavity_mean 242844787/2500000000)not (<=. input.concave_points_worst 181786853/1250000000)not (<=. input.concave_points_worst 302648569/2500000000)not (<=. input.concavity_mean 750968109/2500000000)not (<=. input.radius_mean 6734710173/500000000)not (<=. input.concave_points_worst 1146040843/10000000000)not (<=. input.concavity_mean 569433711/2500000000),"""malignant"""
not (<=. input.concavity_mean 40089141/500000000)not (<=. input.concave_points_worst 1033944901/10000000000)not (<=. input.concavity_worst 1300460631/5000000000)not (<=. input.concave_points_worst 721645811/5000000000)not (<=. input.concavity_mean 242844787/2500000000)not (<=. input.concave_points_worst 181786853/1250000000)not (<=. input.concave_points_worst 302648569/2500000000)not (<=. input.concavity_mean 750968109/2500000000)<=. input.radius_mean 6734710173/500000000not (<=. input.concavity_mean 569433711/2500000000),"""malignant"""
not (<=. input.concavity_mean 40089141/500000000)not (<=. input.concave_points_worst 1033944901/10000000000)not (<=. input.concavity_worst 1300460631/5000000000)not (<=. input.concave_points_worst 721645811/5000000000)not (<=. input.concavity_mean 242844787/2500000000)not (<=. input.concave_points_worst 181786853/1250000000)not (<=. input.concave_points_worst 302648569/2500000000)<=. input.concavity_mean 750968109/2500000000not (<=. input.radius_mean 6734710173/500000000)not (<=. input.concave_points_worst 1146040843/10000000000)not (<=. input.concavity_mean 569433711/2500000000),"""malignant"""
not (<=. input.concavity_mean 40089141/500000000)not (<=. input.concave_points_worst 1033944901/10000000000)not (<=. input.concavity_worst 1300460631/5000000000)not (<=. input.concave_points_worst 721645811/5000000000)not (<=. input.concavity_mean 242844787/2500000000)not (<=. input.concave_points_worst 181786853/1250000000)not (<=. input.concave_points_worst 302648569/2500000000)<=. input.concavity_mean 750968109/2500000000not (<=. input.radius_mean 6734710173/500000000)not (<=. input.concave_points_worst 1146040843/10000000000)<=. input.concavity_mean 569433711/2500000000,"""malignant"""
not (<=. input.concavity_mean 40089141/500000000)not (<=. input.concave_points_worst 1033944901/10000000000)not (<=. input.concavity_worst 1300460631/5000000000)not (<=. input.concave_points_worst 721645811/5000000000)not (<=. input.concavity_mean 242844787/2500000000)not (<=. input.concave_points_worst 181786853/1250000000)not (<=. input.concave_points_worst 302648569/2500000000)<=. input.concavity_mean 750968109/2500000000<=. input.radius_mean 6734710173/500000000not (<=. input.concavity_mean 569433711/2500000000),"""malignant"""
not (<=. input.concavity_mean 40089141/500000000)not (<=. input.concave_points_worst 1033944901/10000000000)not (<=. input.concavity_worst 1300460631/5000000000)not (<=. input.concave_points_worst 721645811/5000000000)not (<=. input.concavity_mean 242844787/2500000000)not (<=. input.concave_points_worst 181786853/1250000000)not (<=. input.concave_points_worst 302648569/2500000000)<=. input.concavity_mean 750968109/2500000000<=. input.radius_mean 6734710173/500000000<=. input.concavity_mean 569433711/2500000000,"""malignant"""
not (<=. input.concavity_mean 40089141/500000000)not (<=. input.concave_points_worst 1033944901/10000000000)not (<=. input.concavity_worst 1300460631/5000000000)not (<=. input.concave_points_worst 721645811/5000000000)not (<=. input.concavity_mean 242844787/2500000000)<=. input.concave_points_worst 181786853/1250000000not (<=. input.radius_se 2603237151/5000000000)not (<=. input.concave_points_worst 302648569/2500000000)not (<=. input.concavity_mean 750968109/2500000000)not (<=. input.radius_mean 6734710173/500000000)not (<=. input.concave_points_worst 1146040843/10000000000),"""malignant"""
not (<=. input.concavity_mean 40089141/500000000)not (<=. input.concave_points_worst 1033944901/10000000000)not (<=. input.concavity_worst 1300460631/5000000000)not (<=. input.concave_points_worst 721645811/5000000000)not (<=. input.concavity_mean 242844787/2500000000)<=. input.concave_points_worst 181786853/1250000000not (<=. input.radius_se 2603237151/5000000000)not (<=. input.concave_points_worst 302648569/2500000000)not (<=. input.concavity_mean 750968109/2500000000)<=. input.radius_mean 6734710173/500000000,"""malignant"""
not (<=. input.concavity_mean 40089141/500000000)not (<=. input.concave_points_worst 1033944901/10000000000)not (<=. input.concavity_worst 1300460631/5000000000)not (<=. input.concave_points_worst 721645811/5000000000)not (<=. input.concavity_mean 242844787/2500000000)<=. input.concave_points_worst 181786853/1250000000not (<=. input.radius_se 2603237151/5000000000)not (<=. input.concave_points_worst 302648569/2500000000)<=. input.concavity_mean 750968109/2500000000not (<=. input.radius_mean 6734710173/500000000)not (<=. input.concave_points_worst 1146040843/10000000000),"""malignant"""
not (<=. input.concavity_mean 40089141/500000000)not (<=. input.concave_points_worst 1033944901/10000000000)not (<=. input.concavity_worst 1300460631/5000000000)not (<=. input.concave_points_worst 721645811/5000000000)not (<=. input.concavity_mean 242844787/2500000000)<=. input.concave_points_worst 181786853/1250000000not (<=. input.radius_se 2603237151/5000000000)not (<=. input.concave_points_worst 302648569/2500000000)<=. input.concavity_mean 750968109/2500000000<=. input.radius_mean 6734710173/500000000,"""malignant"""


In [1]:
(* Region decomposition of a single tree of the ensemble.
   NOTE(review): the recorded output of this cell is a [Not_found] error
   raised after a lost connection, and the cell is numbered In [1] --
   presumably [tree_0] was not yet defined in the restarted session;
   re-run after the definitions. *)
Decompose.top "tree_0";;

Connection lost, attempting reconnection.. Error: uncaught exception:
Not_found
Raised at file "src/interactive/backend.ml", line 51, characters 46-82
Called from file "src/interactive/backend.ml", line 148, characters 10-63
Called from file "src/interactive/decompose.ml", line 148, characters 10-38
Called from file "toplevel/toploop.ml", line 180, characters 17-56


We can also use side conditions on the region decomposition of our model. One application here is in simulating partial observability. Perhaps we know some of the measurements for a particular set of cells and we'd like to see how the classification of the input depends on the remaining features. Let's imagine that we only have the concavity measurements for a particular patient and we'd like to see how the output of our model depends on the values of the other features.

In [8]:
(* Side condition for the decomposition: the input is valid and its three
   concavity fields are pinned to fixed observed values; the remaining
   fields are left free. *)
let partial_observation x =
  is_valid_rf x
  && x.concavity_mean = 0.04295
  && x.concavity_worst = 0.26000
  && x.concave_points_worst = 0.11460;;

(* Region decomposition of the full model under [partial_observation].
   NOTE(review): [~ctx_asm_simp:true] presumably simplifies the region
   constraints using the assumption (the recorded regions mention only the
   unpinned fields) -- confirm against the Imandra documentation. *)
Decompose.top ~ctx_asm_simp:true ~assuming:"partial_observation" "rf_model";;

val partial_observation : rf_input -> bool = <fun>
- : Imandra_interactive.Decompose.t list =
[<region>; <region>; <region>; <region>; <region>; <region>; <region>;
 <region>; <region>]


Constraints,Invariant
not (<=. input.radius_mean 7527903347/500000000)not (<=. input.radius_se 636920371/2500000000)not (<=. input.radius_mean 8271055011/500000000)not (<=. input.radius_se 166049107/312500000),"""malignant"""
not (<=. input.radius_mean 7527903347/500000000)not (<=. input.radius_se 636920371/2500000000)not (<=. input.radius_mean 8271055011/500000000)<=. input.radius_se 166049107/312500000,"""malignant"""
not (<=. input.radius_mean 7527903347/500000000)not (<=. input.radius_se 636920371/2500000000)<=. input.radius_mean 8271055011/500000000not (<=. input.compactness_mean 30206321/500000000),"""benign"""
not (<=. input.radius_mean 7527903347/500000000)not (<=. input.radius_se 636920371/2500000000)<=. input.radius_mean 8271055011/500000000<=. input.compactness_mean 30206321/500000000,"""benign"""
not (<=. input.radius_mean 7527903347/500000000)<=. input.radius_se 636920371/2500000000not (<=. input.radius_mean 8271055011/500000000)<=. input.radius_se 166049107/312500000,"""benign"""
not (<=. input.radius_mean 7527903347/500000000)<=. input.radius_se 636920371/2500000000<=. input.radius_mean 8271055011/500000000not (<=. input.compactness_mean 30206321/500000000),"""benign"""
not (<=. input.radius_mean 7527903347/500000000)<=. input.radius_se 636920371/2500000000<=. input.radius_mean 8271055011/500000000<=. input.compactness_mean 30206321/500000000,"""benign"""
<=. input.radius_mean 7527903347/500000000<=. input.radius_mean 8271055011/500000000not (<=. input.compactness_mean 30206321/500000000),"""benign"""
<=. input.radius_mean 7527903347/500000000<=. input.radius_mean 8271055011/500000000<=. input.compactness_mean 30206321/500000000,"""benign"""


## Regression

In a regression task we want to learn to predict the value(s) of some variable(s) based on previous data. In the commonly used [Forest Fires dataset](https://archive.ics.uci.edu/ml/datasets/forest+fires) the aim is to predict the area burned by forest fires, in the northeast region of Portugal, by using meteorological and other data. This is a fairly difficult task and while the neural network below doesn't achieve state-of-the-art performance, it's enough to demonstrate how we can analyse relatively simple models in Imandra. In the dataset we have the following variables:

```
1. X-axis spatial coordinate (within the Montesinho park map)
2. Y-axis spatial coordinate (within the Montesinho park map)
3. Month
4. Day
5. FFMC index (from the FWI system)
6. DMC index (from the FWI system)
7. DC index (from the FWI system)
8. ISI index (from the FWI system)
9. Temperature (Celsius)
10. Relative percentage humidity
11. Wind speed
12. Outside rain (mm/m^2)
13. The burned area of the forest
```

We again pre-process the data before learning by first transforming the month and day variables into a numerical value and applying a `sin` transformation (so similar times are close in value), as well as removing outliers and applying an approximate log transformation to the area variable (as recommended in the dataset description). Each variable is scaled to lie between 0 and 1, and those with high correlations and/or low mutual information with respect to the target variable are removed. We then split the data into training (80%) and test (20%) sets and use Keras to learn a simple feed-forward neural network with one (6 neuron) hidden layer, ReLU activation functions, and stochastic gradient descent to optimise the mean squared error. After saving our model as an `.h5` file we use a short script to extract the network into an IML file and reason about it using Imandra.

In [6]:
(* Rectified linear unit: returns [x] when it is strictly positive and 0
   otherwise. *)
let relu x =
  let open Real in
  if x > 0.0 then x else 0.0;;

(* Linear (identity) activation: returns its argument unchanged. *)
let linear x = x

(* Hidden layer of the network: six units, each computing [relu] of a
   weighted sum of the six inputs plus a constant term.  [@@] has lower
   precedence than [+], so [relu] is applied to the whole sum. *)
let layer_0 (x_0, x_1, x_2, x_3, x_4, x_5) = let open Real in
let y_0 = relu @@ (0.20124)*x_0 + (-0.15722)*x_1 + (-0.19063)*x_2 + (-0.54562)*x_3 + (0.03425)*x_4 + (0.50104)*x_5 + -0.02768 in
let y_1 = relu @@ (0.29103)*x_0 + (0.03180)*x_1 + (-0.16336)*x_2 + (0.17919)*x_3 + (0.32971)*x_4 + (-0.43206)*x_5 + -0.02620 in
let y_2 = relu @@ (0.66419)*x_0 + (0.25399)*x_1 + (0.00449)*x_2 + (0.03841)*x_3 + (-0.51482)*x_4 + (0.58299)*x_5 + 0.11858 in
let y_3 = relu @@ (0.47598)*x_0 + (-0.36142)*x_1 + (0.38981)*x_2 + (0.27632)*x_3 + (-0.61231)*x_4 + (-0.03662)*x_5 + -0.02890 in
let y_4 = relu @@ (0.10277)*x_0 + (-0.28841)*x_1 + (0.04637)*x_2 + (0.28808)*x_3 + (0.05957)*x_4 + (-0.22041)*x_5 + 0.18270 in
let y_5 = relu @@ (0.55604)*x_0 + (-0.04015)*x_1 + (0.10557)*x_2 + (0.60757)*x_3 + (-0.32314)*x_4 + (0.47933)*x_5 + -0.24876 in
(y_0, y_1, y_2, y_3, y_4, y_5);;

(* Output layer of the network: a single unit computing [linear] (the
   identity) of a weighted sum of the six hidden values plus a constant
   term.  Returns one real, not a tuple. *)
let layer_1 (x_0, x_1, x_2, x_3, x_4, x_5) = let open Real in
let y_0 = linear @@ (0.28248)*x_0 + (-0.25208)*x_1 + (-0.50075)*x_2 + (-0.07092)*x_3 + (-0.43189)*x_4 + (0.60065)*x_5 + 0.47136 in
(y_0);;

(* The network: feeds the six inputs through [layer_0] and then
   [layer_1], yielding a single real. *)
let nn (x_0, x_1, x_2, x_3, x_4, x_5) =
  layer_1 (layer_0 (x_0, x_1, x_2, x_3, x_4, x_5));;

val relu : real -> real = <fun>
val linear : 'a -> 'a = <fun>
val layer_0 :
  real * real * real * real * real * real ->
  real * real * real * real * real * real = <fun>
val layer_1 : real * real * real * real * real * real -> real = <fun>
val nn : real * real * real * real * real * real -> real = <fun>


Given the description of the dataset above we can again create some custom input types in Imandra for our model:

In [5]:
type month = Jan | Feb | Mar | Apr | May | Jun| Jul | Aug | Sep | Oct | Nov | Dec;;
(* Day of the week, used as a categorical model input. *)
type day =
  | Mon | Tue | Wed | Thu | Fri | Sat | Sun;;
 
(* One raw (unscaled) observation; process_nn_input turns it into the six
   scaled reals the network consumes. *)
type nn_input = {
  month : month;  (* month of the observation *)
  day   : day;    (* day of the week *)
  dmc   : real;   (* NOTE(review): presumably the DMC fire-weather index - confirm *)
  temp  : real;   (* temperature; the notebook text speaks of degrees Celsius *)
  rh    : real;   (* NOTE(review): presumably relative humidity in percent - confirm *)
  rain  : real;   (* NOTE(review): rainfall; units are not shown here - confirm *)
};;

type month =
    Jan
  | Feb
  | Mar
  | Apr
  | May
  | Jun
  | Jul
  | Aug
  | Sep
  | Oct
  | Nov
  | Dec
type day = Mon | Tue | Wed | Thu | Fri | Sat | Sun
type nn_input = {
  month : month;
  day : day;
  dmc : real;
  temp : real;
  rh : real;
  rain : real;
}


As before, because we pre-processed our data, we'll add in a function applying this transform to each input variable. Equally, we'll need to convert back to hectares for our output variable. Here we simply use some minimum and maximum values extracted during our data pre-processing stage. After that we define a full model which combines these pre/post-processing steps and the network above.

In [7]:
(* Numeric encoding of a month. The values rise from Jan to a peak of 2.0
   in Jun and fall back to 0.0 in Dec, so neighbouring months get nearby
   values.
   NOTE(review): Nov (0.133) differs from Jan (0.134) although every other
   pair is symmetric about Jun - possibly a rounding artefact of the
   extraction; confirm against the pre-processing script before changing. *)
let month_2_num month =
  let open Real in
  match month with
  | Jan -> 0.134
  | Feb -> 0.500
  | Mar -> 1.000
  | Apr -> 1.500
  | May -> 1.866
  | Jun -> 2.000
  | Jul -> 1.866
  | Aug -> 1.500
  | Sep -> 1.000
  | Oct -> 0.500
  | Nov -> 0.133
  | Dec -> 0.000;;

(* Numeric encoding of a weekday. The values peak at 1.901 for Wed and Thu
   and fall to 0.0 for Sun, so neighbouring days get nearby values. *)
let day_2_num day =
  let open Real in
  match day with
  | Mon -> 0.377
  | Tue -> 1.223
  | Wed -> 1.901
  | Thu -> 1.901
  | Fri -> 1.223
  | Sat -> 0.377
  | Sun -> 0.000;;

(* Converts a raw nn_input record into the 6-tuple fed to nn. Each component
   is rescaled as (value - lo) / (hi - lo), with lo and hi being the constants
   written in each line (per the notebook text, minima and maxima taken from
   the pre-processing stage). Inputs outside [lo, hi] are not clamped. *)
let process_nn_input input =
  let open Real in
  let month_s = (month_2_num input.month - 0.0) / (2.0 - 0.0) in
  let day_s   = (day_2_num input.day - 0.0) / (1.901 - 0.0) in
  let dmc_s   = (input.dmc - 1.1) / (291.3 - 1.1) in
  let temp_s  = (input.temp - 2.2) / (33.3 - 2.2) in
  let rh_s    = (input.rh - 15.0) / (100.0 - 15.0) in
  let rain_s  = (input.rain - 0.0) / (6.40 - 0.0) in
  (month_s, day_s, dmc_s, temp_s, rh_s, rain_s);;

(* Maps the raw network output back to an area (hectares, per the notebook
   text): the output is multiplied by 4.44323 and then passed through a
   piecewise-linear function with breakpoints at 1, 2, 3 and 4.
   NOTE(review): the segments look like a linear interpolation of
   exp(y) - 1 at integer points, i.e. an approximate inverse of the log
   transform applied to the area during pre-processing - confirm. *)
let process_nn_output y_0 =
  let open Real in
  let scaled = 4.44323 * y_0 in
  if scaled > 4.0 then (scaled - 3.42868) * 93.81501 else
  if scaled > 3.0 then (scaled - 2.44700) * 34.51261 else
  if scaled > 2.0 then (scaled - 1.49679) * 12.69648 else
  if scaled > 1.0 then (scaled - 0.63212) * 4.67077 else
  (scaled - 0.00000) * 1.71828;;
(* Finer segments above 4.0 that were left disabled in the original; every
   value above 4.0 currently uses the 93.81501 slope:
     scaled <= 5.0 : (scaled - 3.42868) * 93.81501
     scaled <= 6.0 : (scaled - 4.42194) * 255.01563
     otherwise     : (scaled - 5.41946) * 693.20436 *)

(* End-to-end model: scale the raw record, run the network, then convert the
   network output back to an area. *)
let nn_model input =
  process_nn_output (nn (process_nn_input input));;

val month_2_num : month -> Q.t = <fun>
val day_2_num : day -> Q.t = <fun>
val process_nn_input : nn_input -> real * real * real * real * real * real =
  <fun>
val process_nn_output : real -> real = <fun>
val nn_model : nn_input -> real = <fun>


As our model is fully executable we can both query it as well as find counterexamples, prove properties, apply logical side-conditions to the input, decompose its regions, and more. As a quick sanity check to make sure everything is working, let's run a datum from our dataset through the model. In particular, we'll input  `x = (Aug, Sat, 231.1, 26.9, 31.0, 0.0)` which has an area of `y = 4.96` hectares in the data.

In [8]:
(* Sanity check: a datum from the dataset (recorded area 4.96 hectares
   according to the notebook text) pushed through the full model. *)
let x =
  { month = Aug; day = Sat; dmc = 231.1; temp = 26.9; rh = 31.0; rain = 0.0 };;

let y = nn_model x;;

val x : nn_input =
  {month = Aug; day = Sat; dmc = 2311/10; temp = 269/10; rh = 31; rain = 0}
val y : real =
  31163034026685970872774265972147/14583401737000000000000000000000


Our answer is both roughly similar to the recorded datapoint value and also to the value we get from our original Keras model, `2.13683266556`. The small disparity here is due to our rounding the weight values in our network to 5 decimal places when we extracted them to IML, though it wasn't necessary to do so. Now we'll use Imandra to generate an example for us with some particular side conditions.

In [9]:
(* Ask Imandra for an input in May at 20 degrees whose predicted area
   exceeds 20; no other field is constrained. The binder must stay named x
   because the witness is exposed as CX.x. *)
instance (fun x ->
  nn_model x >. 20.0
  && x.temp = 20.0
  && x.month = May);;

- : nn_input -> bool = <fun>
module CX : sig val x : nn_input end


0,1
ground_instances,0
definitions,0
inductions,0
search_time,0.034s
details,"Expandsmt_statsnum checks1arith assert lower57arith pivots22rlimit count16256mk clause68datatype occurs check7mk bool var141arith assert upper35datatype splits4decisions74arith add rows137arith bound prop1propagations92conflicts7arith fixed eqs13datatype accessor ax10arith conflicts2arith assert diseq1datatype constructor ax7num allocs976978033final checks1added eqs90del clause23arith eq adapter30memory33.010000max memory36.090000 require(['nbextensions/nbimandra/fold'], function (fold) {  var target = '#fold-13b76923-b09e-43cb-93f4-457c239399a4';  fold.hydrate(target); });"

0,1
smt_stats,num checks1arith assert lower57arith pivots22rlimit count16256mk clause68datatype occurs check7mk bool var141arith assert upper35datatype splits4decisions74arith add rows137arith bound prop1propagations92conflicts7arith fixed eqs13datatype accessor ax10arith conflicts2arith assert diseq1datatype constructor ax7num allocs976978033final checks1added eqs90del clause23arith eq adapter30memory33.010000max memory36.090000

0,1
num checks,1.0
arith assert lower,57.0
arith pivots,22.0
rlimit count,16256.0
mk clause,68.0
datatype occurs check,7.0
mk bool var,141.0
arith assert upper,35.0
datatype splits,4.0
decisions,74.0

0,1
into,(not  (.( <=. )  (if .( <=. )  (.( +. )  (.( +. )  (.( +. )  (.( +. )  (.( +. )  (.( *. ) 1568904513/1250000000  (if .( <=. )  (.( +. )  (.( +. )  (.( +. )  (.( +. )  (.( +. )  (.( *. ) 5031/50000  (if :var_0:.month = Jan then 67/500  else  if :var_0:.month = Feb then 1/2  else  if :var_0:.month = Mar then 1  else if :var_0:.month = Apr then 3/2 else …))  (.( *. ) -7861/95050  (if :var_0:.day = Mon then 377/1000  else  if :var_0:.day = Tue then 1223/1000  else  if :var_0:.day = Wed then 1901/1000  else if :var_0:.day = Thu then 1901/1000 else …)))  (.( *. ) -19063/29020000 :var_0:.dmc))  (.( *. ) -27281/1555000 :var_0:.temp))  (.( *. ) 137/340000 :var_0:.rh))  (.( *. ) 6263/80000 :var_0:.rain))  -1716983137/306857480000  then 0  else  (.( +. )  (.( +. )  (.( +. )  (.( +. )  (.( +. )  (.( +. ) 1716983137/306857480000  (.( *. ) 5031/50000  (if :var_0:.month = Jan then 67/500  else  if :var_0:.month = Feb then 1/2  else  if :var_0:.month = Mar then 1  else if :var_0:.month = Apr then 3/2 else …)))  (.( *. ) -7861/95050  (if :var_0:.day = Mon then 377/1000  else  if :var_0:.day = Tue then 1223/1000  else  if :var_0:.day = Wed then 1901/1000  else if :var_0:.day = Thu then 1901/1000 else …)))  (.( *. ) -19063/29020000 :var_0:.dmc))  (.( *. ) -27281/1555000 :var_0:.temp))  (.( *. ) 137/340000 :var_0:.rh))  (.( *. ) 6263/80000 :var_0:.rain))))  (.( *. ) -1400061773/1250000000  (if .( <=. )  (.( +. )  (.( +. )  (.( +. )  (.( +. )  (.( +. )  (.( *. ) 29103/200000  (if :var_0:.month = Jan then 67/500  else if :var_0:.month = Feb then 1/2 else …))  …)  …)  …)  …)  …)  73983893263/767143700000  then 0 else …)))  …)  …)  …)  …)  -341987779/312500000  then … else …)  20)  && …) && …
expansions,[]
rewrite_steps,
forward_chaining,


In [10]:
(* Show the witness that the preceding instance call bound to CX.x. *)
CX.x;;

- : nn_input =
{month = May; day = Wed;
 dmc =
  7597491365091646983788244502461274105063710103/1285912235995896196222304684839556138615000;
 temp = 20;
 rh =
  116321417146729453300761696944820485610293761/51436489439835847848892187393582245544600;
 rain =
  2495451056678694134139316951586229474021508/32147805899897404905557617120988903465375}


Notice how the unspecified input variables are unbounded, just as in our original classification instances. Using the description of each variable in the data (plus some reasonable assumptions about Portugal's climate) we can form the following condition describing valid inputs to the network.

In [11]:
(* True exactly when every continuous field lies inside its permitted closed
   interval: dmc in [0, 500], temp in [0, 40], rh in [0, 100] and rain in
   [0, 15]. The month and day fields are not constrained. *)
let is_valid_nn input =
  0.0 <=. input.dmc && input.dmc <=. 500.0
  && 0.0 <=. input.temp && input.temp <=. 40.0
  && 0.0 <=. input.rh && input.rh <=. 100.0
  && 0.0 <=. input.rain && input.rain <=. 15.0;;
    
(* Same query as before, now restricted to inputs accepted by is_valid_nn.
   The binder must stay named x because the witness is exposed as CX.x. *)
instance (fun x ->
  nn_model x >. 20.0
  && x.temp = 20.0
  && x.month = May
  && is_valid_nn x);;

(* Show the witness found for the constrained query. *)
CX.x

val is_valid_nn : nn_input -> bool = <fun>
- : nn_input -> bool = <fun>
module CX : sig val x : nn_input end
- : nn_input =
{month = May; day = Sat; dmc = 500; temp = 20;
 rh = 171464336140233292471484482076009419/2360873552944263234940969123509600;
 rain =
  1931685368286694583093022598772257507/138859415446610125537264590188566875}


0,1
ground_instances,0
definitions,0
inductions,0
search_time,0.210s
details,"Expandsmt_statsarith offset eqs3num checks1arith assert lower223arith pivots193rlimit count272125mk clause256datatype occurs check7mk bool var389arith assert upper204datatype splits6decisions307arith add rows2230arith bound prop15propagations846conflicts69arith fixed eqs89datatype accessor ax10minimized lits34arith conflicts30arith assert diseq70datatype constructor ax13num allocs1247144691final checks1added eqs546del clause152arith eq adapter173memory12.180000max memory36.090000 require(['nbextensions/nbimandra/fold'], function (fold) {  var target = '#fold-33b38da5-508e-4e3f-a564-e517d19c92c7';  fold.hydrate(target); });"

0,1
smt_stats,arith offset eqs3num checks1arith assert lower223arith pivots193rlimit count272125mk clause256datatype occurs check7mk bool var389arith assert upper204datatype splits6decisions307arith add rows2230arith bound prop15propagations846conflicts69arith fixed eqs89datatype accessor ax10minimized lits34arith conflicts30arith assert diseq70datatype constructor ax13num allocs1247144691final checks1added eqs546del clause152arith eq adapter173memory12.180000max memory36.090000

0,1
arith offset eqs,3.0
num checks,1.0
arith assert lower,223.0
arith pivots,193.0
rlimit count,272125.0
mk clause,256.0
datatype occurs check,7.0
mk bool var,389.0
arith assert upper,204.0
datatype splits,6.0

0,1
into,(((((((((not  (.( <=. )  (if .( <=. )  (.( +. )  (.( +. )  (.( +. )  (.( +. )  (.( +. )  (.( *. ) 1568904513/1250000000  (if .( <=. )  (.( +. )  (.( +. )  (.( +. )  (.( +. )  (.( +. )  (.( *. ) 5031/50000  (if :var_0:.month = Jan then 67/500  else  if :var_0:.month = Feb then 1/2  else  if :var_0:.month = Mar then 1  else if :var_0:.month = Apr then 3/2 else …))  (.( *. ) -7861/95050  (if :var_0:.day = Mon then 377/1000  else  if :var_0:.day = Tue then 1223/1000  else  if :var_0:.day = Wed then 1901/1000  else  if :var_0:.day = Thu then 1901/1000 else …)))  (.( *. ) -19063/29020000 :var_0:.dmc))  (.( *. ) -27281/1555000 :var_0:.temp))  (.( *. ) 137/340000 :var_0:.rh))  (.( *. ) 6263/80000 :var_0:.rain))  -1716983137/306857480000  then 0  else  (.( +. )  (.( +. )  (.( +. )  (.( +. )  (.( +. )  (.( +. ) 1716983137/306857480000  (.( *. ) 5031/50000  (if :var_0:.month = Jan then 67/500  else  if :var_0:.month = Feb then 1/2  else  if :var_0:.month = Mar then 1  else if :var_0:.month = Apr then 3/2 else …)))  (.( *. ) -7861/95050  (if :var_0:.day = Mon then 377/1000  else  if :var_0:.day = Tue then 1223/1000  else  if :var_0:.day = Wed then 1901/1000  else  if :var_0:.day = Thu then 1901/1000 else …)))  (.( *. ) -19063/29020000 :var_0:.dmc))  (.( *. ) -27281/1555000 :var_0:.temp))  (.( *. ) 137/340000 :var_0:.rh))  (.( *. ) 6263/80000 :var_0:.rain))))  (.( *. ) -1400061773/1250000000  (if .( <=. )  (.( +. )  (.( +. )  (.( +. )  (.( +. )  (.( +. )  (.( *. ) 29103/200000  (if :var_0:.month = Jan then 67/500  else if :var_0:.month = Feb then 1/2 else …))  …)  …)  …)  …)  …)  73983893263/767143700000  then 0 else …)))  …)  …)  …)  …)  -341987779/312500000  then … else …)  20)  && …)  && …)  && …)  && …)  && …)  && …)  && …)  && …)  && …) && …
expansions,[]
rewrite_steps,
forward_chaining,


In [25]:
(* Show the current witness, then evaluate the model on it to see the
   predicted area for that input. *)
CX.x;;
nn_model CX.x

- : nn_input =
{month = May; day = Sun; dmc = 0.; temp = 20.; rh = 75.4957226187;
 rain = 11.9383864415}
- : real = 21.4241505245


These constraints mean it is slightly harder for Imandra to find a particular instance satisfying our original demand, but nonetheless it's possible. Now let's try something a bit more interesting. First of all let's check for one desirable property of the model, namely that it never outputs a negative area as a prediction.

In [16]:
(* Property: for every input accepted by is_valid_nn the predicted area is
   non-negative. The parentheses spell out the grouping that the recorded
   goal already shows (premise implies comparison). *)
verify (fun x ->
  is_valid_nn x ==> (nn_model x >=. 0.0))

- : nn_input -> bool = <fun>


0,1
ground_instances,0
definitions,0
inductions,0
search_time,0.929s
details,"Expandsmt_statsarith offset eqs19num checks1arith assert lower446arith pivots582rlimit count1302885mk clause614mk bool var537restarts1arith assert upper366datatype splits49decisions776arith add rows7090arith bound prop24propagations2561conflicts135arith fixed eqs85datatype accessor ax18minimized lits131arith conflicts68arith assert diseq177datatype constructor ax15added eqs1098del clause367arith eq adapter241memory58.740000max memory165.470000num allocs9206223118.000000 require(['nbextensions/nbimandra/fold'], function (fold) {  var target = '#fold-f704cbb0-af7f-4c34-8cf8-c471f2a8f68b';  fold.hydrate(target); });"

0,1
smt_stats,arith offset eqs19num checks1arith assert lower446arith pivots582rlimit count1302885mk clause614mk bool var537restarts1arith assert upper366datatype splits49decisions776arith add rows7090arith bound prop24propagations2561conflicts135arith fixed eqs85datatype accessor ax18minimized lits131arith conflicts68arith assert diseq177datatype constructor ax15added eqs1098del clause367arith eq adapter241memory58.740000max memory165.470000num allocs9206223118.000000

0,1
arith offset eqs,19.0
num checks,1.0
arith assert lower,446.0
arith pivots,582.0
rlimit count,1302885.0
mk clause,614.0
mk bool var,537.0
restarts,1.0
arith assert upper,366.0
datatype splits,49.0

0,1
into,not (((((((<=. 0 :var_0:.dmc && <=. :var_0:.dmc 500) && <=. 0 :var_0:.temp)  && <=. :var_0:.temp 40)  && <=. 0 :var_0:.rh)  && <=. :var_0:.rh 100)  && <=. 0 :var_0:.rain)  && <=. :var_0:.rain 15) || >=.  (if <=.  (+.  (+.  (+.  (+.  (+.  (*. 1568904513/1250000000  (if <=.  (+.  (+.  (+.  (+.  (+.  (*. 5031/50000  (if :var_0:.month = Jan then 67/500  else  if :var_0:.month = Feb then 1/2  else  if :var_0:.month = Mar then 1  else if :var_0:.month = Apr then 3/2 else …))  (*. -7861/95050  (if :var_0:.day = Mon then 377/1000  else  if :var_0:.day = Tue then 1223/1000  else  if :var_0:.day = Wed then 1901/1000  else if :var_0:.day = Thu then 1901/1000 else …)))  (*. -19063/29020000 :var_0:.dmc))  (*. -27281/1555000 :var_0:.temp))  (*. 137/340000 :var_0:.rh))  (*. 6263/80000 :var_0:.rain))  -1716983137/306857480000  then 0  else  (+.  (+.  (+.  (+.  (+.  (+. 1716983137/306857480000  (*. 5031/50000  (if :var_0:.month = Jan then 67/500  else  if :var_0:.month = Feb then 1/2  else  if :var_0:.month = Mar then 1  else if :var_0:.month = Apr then 3/2 else …)))  (*. -7861/95050  (if :var_0:.day = Mon then 377/1000  else  if :var_0:.day = Tue then 1223/1000  else  if :var_0:.day = Wed then 1901/1000  else if :var_0:.day = Thu then 1901/1000 else …)))  (*. -19063/29020000 :var_0:.dmc))  (*. -27281/1555000 :var_0:.temp))  (*. 137/340000 :var_0:.rh))  (*. 6263/80000 :var_0:.rain))))  (*. -1400061773/1250000000  (if <=.  (+.  (+.  (+.  (+.  (+.  (*. 29103/200000  (if :var_0:.month = Jan then 67/500  else if :var_0:.month = Feb then 1/2 else …))  …)  …)  …)  …)  …)  73983893263/767143700000  then 0 else …)))  …)  …)  …)  …)  -341987779/312500000  then … else …)  0
expansions,[]
rewrite_steps,
forward_chaining,


Now let's test a hypothesis. All other things remaining equal, we would assume that the higher the temperature, the larger the area that would be burned. Due to the imperfections of our model (because of limited data, stochasticity in training, the complicated patterns present in natural physical phenomena, and so on) this assertion is in fact easily falsifiable by Imandra. Let's restrict our setting in a sensible way to see if we can prove something slightly weaker:

* There is very little data from winter months, and so the model is unlikely to generalise well here, hence we'll only consider non-winter months (March to September).
* We'll only compare pairs of inputs whose temperatures differ by at least 10 degrees Celsius.
* We'll allow the predicted area for the hotter input to be up to 25 hectares smaller than the prediction for the cooler one.

In [13]:
(* Months treated as winter by the property below: October to February. *)
let winter month =
  match month with
  | Oct | Nov | Dec | Jan | Feb -> true
  | Mar | Apr | May | Jun | Jul | Aug | Sep -> false;;

(* Weak monotonicity in temperature. For two valid inputs that agree on
   every field except temp, in a non-winter month, if a is at least 10
   degrees hotter than b then the prediction for a is at most 25 below the
   prediction for b. The outer parentheses spell out the premise/conclusion
   grouping that the recorded goal already shows. *)
verify (fun a b ->
  ( is_valid_nn a
    && is_valid_nn b
    && a.month = b.month
    && not (winter a.month)
    && a.day = b.day
    && a.dmc = b.dmc
    && a.rh = b.rh
    && a.rain = b.rain
    && (a.temp -. 10.0) >=. b.temp )
  ==> ((nn_model a +. 25.0) >=. nn_model b));;

val winter : month -> bool = <fun>
- : nn_input -> nn_input -> bool = <fun>


0,1
ground_instances,0
definitions,0
inductions,0
search_time,23.596s
details,"Expandsmt_statsarith offset eqs69num checks1arith assert lower3446arith pivots4624rlimit count36520942mk clause1161mk bool var792restarts4arith assert upper2580datatype splits12decisions2022arith add rows138180arith bound prop125propagations16324conflicts487arith fixed eqs1001datatype accessor ax20minimized lits780arith conflicts332arith assert diseq1256datatype constructor ax36num allocs3749408129added eqs6284del clause492arith eq adapter403memory21.080000max memory36.090000 require(['nbextensions/nbimandra/fold'], function (fold) {  var target = '#fold-f2fda566-46e4-4cf0-a99c-71287b76b5ee';  fold.hydrate(target); });"

0,1
smt_stats,arith offset eqs69num checks1arith assert lower3446arith pivots4624rlimit count36520942mk clause1161mk bool var792restarts4arith assert upper2580datatype splits12decisions2022arith add rows138180arith bound prop125propagations16324conflicts487arith fixed eqs1001datatype accessor ax20minimized lits780arith conflicts332arith assert diseq1256datatype constructor ax36num allocs3749408129added eqs6284del clause492arith eq adapter403memory21.080000max memory36.090000

0,1
arith offset eqs,69.0
num checks,1.0
arith assert lower,3446.0
arith pivots,4624.0
rlimit count,36520942.0
mk clause,1161.0
mk bool var,792.0
restarts,4.0
arith assert upper,2580.0
datatype splits,12.0

0,1
into,not ((((((((((((((((((((((.( <=. ) 0 :var_0:.dmc && .( <=. ) :var_0:.dmc 500)  && .( <=. ) 0 :var_0:.temp)  && .( <=. ) :var_0:.temp 40)  && .( <=. ) 0 :var_0:.rh)  && .( <=. ) :var_0:.rh 100)  && .( <=. ) 0 :var_0:.rain)  && .( <=. ) :var_0:.rain 15)  && .( <=. ) 0 :var_1:.dmc)  && .( <=. ) :var_1:.dmc 500)  && .( <=. ) 0 :var_1:.temp)  && .( <=. ) :var_1:.temp 40)  && .( <=. ) 0 :var_1:.rh)  && .( <=. ) :var_1:.rh 100)  && .( <=. ) 0 :var_1:.rain)  && .( <=. ) :var_1:.rain 15)  && :var_0:.month = :var_1:.month)  && not  ((((:var_0:.month = Oct || :var_0:.month = Nov)  || :var_0:.month = Dec)  || :var_0:.month = Jan)  || :var_0:.month = Feb))  && :var_0:.day = :var_1:.day)  && :var_0:.dmc = :var_1:.dmc)  && :var_0:.rh = :var_1:.rh)  && :var_0:.rain = :var_1:.rain)  && .( >=. ) :var_0:.temp (.( +. ) 10 :var_1:.temp)) || .( >=. )  (if .( <=. )  (.( +. )  (.( +. )  (.( +. )  (.( +. )  (.( +. )  (.( *. ) 1568904513/1250000000  (if .( <=. )  (.( +. )  (.( +. )  (.( +. )  (.( +. )  (.( +. )  (.( *. ) 5031/50000  (if :var_0:.month = Jan then 67/500  else  if :var_0:.month = Feb then 1/2  else  if :var_0:.month = Mar then 1  else if :var_0:.month = Apr then 3/2 else …))  (.( *. ) -7861/95050  (if :var_0:.day = Mon then 377/1000  else  if :var_0:.day = Tue then 1223/1000  else  if :var_0:.day = Wed then 1901/1000  else if :var_0:.day = Thu then 1901/1000 else …)))  …)  …)  …)  …)  -1716983137/306857480000  then 0 else …))  …)  …)  …)  …)  …)  -341987779/312500000  then … else …)  …
expansions,[]
rewrite_steps,
forward_chaining,


In [14]:
(* Maps the raw network output back to an area: the output is multiplied by
   4.44323 and then passed through a piecewise-linear function with
   breakpoints at 1, 2, 3 and 4.
   The previous version of this cell wrote the comparisons as patterns of a
   function expression; patterns cannot hold boolean tests, which caused the
   syntax error. Every case here is a range test, so a plain if/else chain
   is used rather than a match in which every arm is guarded. *)
let process_nn_output y_0 = let open Real in
  let y = 4.44323 * y_0 in
  if y <= 1.0 then (y - 0.00000) * 1.71828
  else if y <= 2.0 then (y - 0.63212) * 4.67077
  else if y <= 3.0 then (y - 1.49679) * 12.69648
  else if y <= 4.0 then (y - 2.44700) * 34.51261
  else (y - 3.42868) * 93.81501

val process_nn_output : real -> real = <fun>



In [15]:
(* Numeric encoding of a weekday, written as an exhaustive pattern match.
   The previous version of this cell both named a day parameter and used a
   function expression, which introduced a second, anonymous argument and
   produced the type 'a -> day -> Q.t. Matching directly on the named
   parameter gives day -> Q.t, the type of the earlier if/else definition. *)
let day_2_num day = let open Real in
  match day with
  | Mon -> 0.377
  | Tue -> 1.223
  | Wed -> 1.901
  | Thu -> 1.901
  | Fri -> 1.223
  | Sat -> 0.377
  | Sun -> 0.000

val day_2_num : day -> Q.t = <fun>
