# 4.4.4 Polynomial Regression
We can now explore these concepts interactively by fitting polynomials to data.

In [24]:
##import math
##from mxnet import gluon, np, npx
##from mxnet.gluon import nn
##from d2l import mxnet as d2l
##npx.set_np()

use strict;
use warnings;
use Data::Dump qw(dump);
use AI::MXNet qw(mx);
use AI::MXNet::Gluon qw(gluon);
use List::Util qw(min max shuffle);
use d2l;
use d2l::Accumulator;
use d2l::Animator;

### Generating the Dataset
First we need data. Given x, we will use the following cubic polynomial to generate the labels on
training and test data:

$$ y = 5 + 1.2x - 3.4\frac{x^2}{2!} + 5.6\frac{x^3}{3!} + \epsilon $$
$$ \text{where } \epsilon \sim \mathcal{N}(0, 0.1^2) $$
The noise term ϵ obeys a normal distribution with a mean of 0 and a standard deviation of 0.1. For
optimization, we typically want to avoid very large values of gradients or losses. This is why the
features are rescaled from $x^i$ to $\frac{x^i}{i!}$. It allows us to avoid very large values for large exponents $i$. We
will synthesize 100 samples each for the training set and test set.

In [25]:
sub gamma {
    ## Gamma function for a strictly positive real argument.
    ## Ported from https://hewgill.com/picomath/perl/gamma.pl.html
    ##
    ## Argument: $x, a number > 0.
    ## Returns:  the value of Gamma($x); undef when $x > 171.624.
    ## Dies:     when $x <= 0.
    my ($x) = @_;

    die "Invalid input argument $x. Argument must be positive" if $x <= 0.0;

    # Euler's gamma constant
    my $euler_gamma = 0.577215664901532860606512090;

    # Very small arguments use a closed-form expression directly.
    return 1.0/($x*(1.0 + $euler_gamma*$x)) if $x < 0.001;

    # Large arguments: give up past 171.624, otherwise go through log_gamma.
    if ($x >= 12.0) {
        return undef if $x > 171.624;
        return exp(log_gamma($x));
    }

    # Remaining range 0.001 <= x < 12: move the argument into [1, 2),
    # evaluate a ratio of two polynomials there, then undo the shift.
    my $below_one = ($x < 1.0);
    my $steps     = 0;       # how many unit steps were removed from x
    my $reduced   = $x;
    if ($below_one) {
        $reduced += 1.0;
    }
    else {
        $steps = int($reduced) - 1;
        $reduced -= $steps;
    }

    my @numer_coef = (
        -1.71618513886549492533811E+0,
         2.47656508055759199108314E+1,
        -3.79804256470945635097577E+2,
         6.29331155312818442661052E+2,
         8.66966202790413211295064E+2,
        -3.14512729688483675254357E+4,
        -3.61444134186911729807069E+4,
         6.64561438202405440627855E+4
    );
    my @denom_coef = (
        -3.08402300119738975254353E+1,
         3.15350626979604161529144E+2,
        -1.01515636749021914166146E+3,
        -3.10777167157231109440444E+3,
         2.25381184209801510330112E+4,
         4.75584627752788110767815E+3,
        -1.34659959864969306392456E+5,
        -1.15132259675553483497211E+5
    );

    my $offset = $reduced - 1;
    my ($numer, $denom) = (0.0, 1.0);
    for my $k (0 .. 7) {
        $numer = ($numer + $numer_coef[$k])*$offset;
        $denom = $denom*$offset + $denom_coef[$k];
    }
    my $value = $numer/$denom + 1.0;

    if ($below_one) {
        # The argument was raised by one above; divide it back out.
        $value /= ($reduced - 1.0);
        return $value;
    }

    # Multiply back up, one unit step at a time, to reach the original x.
    for (1 .. $steps) {
        $value *= $reduced;
        $reduced += 1;
    }
    return $value;
}

sub log_gamma {
    ## Natural logarithm of the gamma function for a positive argument.
    ##
    ## Argument: $x, a number > 0.
    ## Returns:  log(|gamma($x)|) via gamma() when $x < 12, otherwise the
    ##           value of an asymptotic series.
    ## Dies:     when $x <= 0.
    my ($x) = @_;

    die "Invalid input argument $x. Argument must be positive" if $x <= 0.0;

    return log(abs(gamma($x))) if $x < 12.0;

    # Abramowitz and Stegun 6.1.41
    # Asymptotic series should be good to at least 11 or 12 figures
    # For error analysis, see Whittiker and Watson
    # A Course in Modern Analysis (1927), page 252
    my @coef = (
         1.0/12.0,
        -1.0/360.0,
         1.0/1260.0,
        -1.0/1680.0,
         1.0/1188.0,
        -691.0/360360.0,
         1.0/156.0,
        -3617.0/122400.0
    );

    # Horner evaluation in 1/x^2, starting from the highest-order coefficient.
    my $inv_sq = 1.0/($x*$x);
    my $acc    = $coef[7];
    for my $term (reverse @coef[0 .. 6]) {
        $acc *= $inv_sq;
        $acc += $term;
    }
    my $correction = $acc/$x;

    my $half_log_two_pi = 0.91893853320467274178032973640562;
    return ($x - 0.5)*log($x) - $x + $half_log_two_pi + $correction;
}

Warning: Subroutine gamma redefined at reply input line 1.

Subroutine log_gamma redefined at reply input line 79.


In [26]:
## Emulate a Python-style slice, e.g. arr[0:8, 0:8]
##
## Takes a single hash ref with the keys:
##   data         - the array to slice (required)
##   row_end      - exclusive end row (required)
##   row_start    - first row, falls back to 0
##   column_end   - exclusive end column (required when data has a 2nd axis)
##   column_start - first column, falls back to 0
## On a missing required key it prints an "ERR: ..." message and returns 1.
## The defaults for row_start / column_start are written back into the hash.
sub getSubND{
    my ($input) = @_;

    unless ($input->{data}) {
        print "ERR: No ha colocado el array";
        return 1;
    }
    unless ($input->{row_end}) {
        print "ERR: No ha colocado el valor de row_end";
        return 1;
    }
    $input->{row_start} ||= 0;

    # Size of the second axis; false when data has no second axis.
    my $n_cols = $input->{data}->shape->[1];

    # No second axis: a plain row slice is all that is needed.
    return $input->{data}->_slice($input->{row_start}, $input->{row_end})
        unless $n_cols;

    unless ($input->{column_end}) {
        print "ERR: No ha colocado el valor de column_end";
        return 1;
    }
    $input->{column_start} ||= 0;

    # Slice each selected row by column and rebuild an array from the pieces.
    my @rows = map {
        $input->{data}->[$_]->_slice($input->{column_start}, $input->{column_end})->asarray
    } $input->{row_start} .. $input->{row_end}-1;

    return mx->nd->array(\@rows);
}

Warning: Subroutine getSubND redefined at reply input line 2.


In [27]:
## Build the synthetic dataset: random inputs, their factorial-scaled
## monomials, and noisy labels from a fixed cubic polynomial.
## NOTE(review): the accompanying text speaks of 100 samples per split, but
## $n below produces 10 per split -- confirm which one is intended.
my $max_degree = 5; ## Number of polynomial terms; the exponents used below are 0 .. $max_degree-1
my $n = 10;
my ($n_train, $n_test) = ($n, $n); ## Training and test dataset sizes
my $true_w = mx->nd->zeros([$max_degree]); ## Start with all coefficients at zero
## Round-trip through a plain Perl array so the first four entries can be set.
$true_w = $true_w->asarray;
@$true_w[0..3] = (5, 1.2, -3.4, 5.6);
$true_w = mx->nd->array($true_w);

## One normally distributed input per example, then shuffled.
my $features = mx->nd->random->normal(shape=>[$n_train + $n_test, 1]);
$features = mx->nd->shuffle($features);

## Raise every input to each exponent 0 .. $max_degree-1.
## Presumably the (N, 1) column broadcasts against the (1, $max_degree) row -- TODO confirm.
my $poly_features = $features ** mx->nd->arange(stop=>$max_degree)->reshape([1,-1]);

## Divide column $i by gamma($i + 1); done on a plain Perl array, element by element.
my $len = $poly_features->shape->[0];
$poly_features = $poly_features->asarray;
for my $i (0..$max_degree-1){
    my $gamma = gamma($i + 1);
    for my $j (0..$len-1){
        $$poly_features[$j][$i] = $$poly_features[$j][$i] / $gamma;
    }
}
$poly_features = mx->nd->array($poly_features);

## Labels: dot product with the true weights plus normal noise of scale 0.1.
my $labels = mx->nd->dot($poly_features, $true_w);
$labels += mx->nd->random->normal(scale=>0.1, shape=>$labels->shape);

<AI::MXNet::NDArray 20 @cpu(0)>

Again, monomials stored in poly_features are rescaled by the gamma function, where Γ(n) =
(n − 1)!. Take a look at the first 2 samples from the generated dataset. The value 1 is technically a
feature, namely the constant feature corresponding to the bias.

In [28]:
## Python reference: features[:2], poly_features[:2, :], labels[:2]
## Dump the first two rows of the raw inputs, of the polynomial features
## (all columns), and of the labels, one dump per line.
print dump(getSubND({data=>$features, row_start=>0, row_end=>2, column_end=>1})->asarray);
print "\n";
print dump((getSubND({data=>$poly_features, row_start=>0, column_start=>0, row_end=>2, column_end=>$poly_features->shape->[1]}))->asarray);
print "\n";
print dump((getSubND({data=>$labels, row_start=>0, row_end=>2}))->asarray);
print "\n";

[[0.891265451908112], [3.2904531955719]]
[
  [
    1,
    0.891265451908112,
    0.397177040576935,
    0.117996729910374,
    0.0262916013598442,
  ],
  [
    1,
    3.2904531955719,
    5.41354131698608,
    5.93766784667969,
    4.88440465927124,
  ],
]
[5.48033285140991, 23.7382698059082]


1

### Training and Testing the Model
Let us first implement a function to evaluate the loss on a given dataset.

In [29]:
sub evaluate_loss{
    ## Evaluate the loss of a model on the given dataset.
    ##
    ## Arguments:
    ##   $net       - callable model, invoked as $net->($X)
    ##   $data_iter - batch iterator read with <$data_iter> until it yields undef
    ##   $loss      - callable loss, invoked as $loss->($prediction, $label)
    ## Returns the summed loss divided by the number of loss elements, or 0
    ## when the iterator produced no examples.
    my ($net, $data_iter, $loss) = @_;
    my $metric = Accumulator->new(2); # Sum of losses, no. of examples
    while (defined(my $batch = <$data_iter>)){
        # Batches are read positionally ([features, labels]), the same way
        # train_epoch_ch3 reads them from the same kind of loader; the former
        # $batch->{data} / $batch->{label} access dies on an array-ref batch.
        my $X = $batch->[0];
        my $y = $batch->[1]->astype('float32');

        my $batch_loss = $loss->($net->($X), $y);
        $metric->add([ $batch_loss->sum->asscalar, $batch_loss->size ]);
    }

    my $n_examples = $metric->getitem(1);
    # Guard against dividing by zero on an empty dataset.
    return 0 if $n_examples == 0;
    return $metric->getitem(0) / $n_examples;
}

Warning: Subroutine evaluate_loss redefined at reply input line 1.


Now define the training function.

In [30]:
sub load_array{
    ## Build a Gluon DataLoader over paired feature and label arrays.
    ##
    ## Arguments: $X (features), $y (labels), $batch_size, and $is_train,
    ## which is forwarded unchanged as the loader's shuffle option.
    my ($X, $y, $batch_size, $is_train) = @_;

    my @loader_options = (
        batch_size => $batch_size,
        shuffle    => $is_train,
    );
    my $paired = gluon->data->ArrayDataset(data=>$X, label=>$y);

    return gluon->data->DataLoader($paired, @loader_options);
}

Warning: Subroutine load_array redefined at reply input line 1.


In [31]:
sub train_epoch_ch3{ #@save
  # Train a model for one epoch (defined in Chapter 3).
  #
  # Arguments:
  #   $net        - callable model, invoked as $net->($X)
  #   $train_iter - batch iterator read with <$train_iter>; each batch is
  #                 indexed as [features, labels]
  #   $loss       - callable loss, invoked as $loss->($y_hat, $y)
  #   $updater    - an AI::MXNet::Gluon::Trainer (its step method is used) or
  #                 a plain code ref called with the batch size
  #   $batch_size - value handed to the updater after each batch; when undef
  #                 the size of the batch's first axis is used instead
  # Returns (mean training loss, mean training accuracy); (0, 0) when the
  # iterator yields no examples.
  my ($net, $train_iter, $loss, $updater, $batch_size) = @_;

  require Scalar::Util;
  # Sum of training loss, sum of training accuracy, no. of examples
  my $metric = Accumulator->new(3);

  # Accept Trainer subclasses too, not only the exact class name.
  my $is_trainer = Scalar::Util::blessed($updater)
    && $updater->isa('AI::MXNet::Gluon::Trainer');

  # No updater step is taken before the loop: at that point no gradient has
  # been computed for this epoch, so there is nothing meaningful to apply.

  while(defined(my $batch = <$train_iter>)){
    # Compute gradients and update parameters
    my $X = $batch->[0];
    my $y = $batch->[1]->astype('float32');

    my ($y_hat, $batch_loss);
    autograd->record(sub {
      $y_hat = $net->($X);
      $batch_loss = $loss->($y_hat, $y);
    });
    $batch_loss->backward();

    my $step_size = defined $batch_size ? $batch_size : $X->shape->[0];
    if ($is_trainer){
      $updater->step($step_size);
    }else{
      $updater->($step_size);
    }

    $metric->add([ $batch_loss->sum->asscalar, accuracy($y_hat, $y), $y->size ]);
  }

  # Guard against dividing by zero when the iterator was empty.
  my $n_examples = $metric->getitem(2);
  return (0, 0) if !$n_examples;

  # Return training loss and training accuracy
  return ($metric->getitem(0) / $n_examples, $metric->getitem(1) / $n_examples);
}

Warning: Subroutine train_epoch_ch3 redefined at reply input line 1.


In [32]:
sub train{
    ## Fit a bias-free linear model on polynomial features and report progress.
    ##
    ## Arguments:
    ##   $train_features, $test_features - feature arrays; the second axis of
    ##                                     $train_features sets the input width
    ##   $train_labels, $test_labels     - matching label arrays
    ##   $num_epochs                     - number of training epochs, default 400
    ## Side effects: feeds train/test loss to an Animator after the first epoch
    ## and after every 20th epoch, then prints the learned weights.
    my ($train_features, $test_features, $train_labels, $test_labels, $num_epochs) = @_;
    # The previous default of 2 was too few epochs for the fit to converge.
    if(!$num_epochs) {$num_epochs = 400;}

    my $batch_size = min(10, $train_labels->shape->[0]);
    my $loss = gluon->loss->L2Loss();
    my $net = gluon->nn->Sequential();
    # Switch off the bias since we already catered for it in the polynomial features
    $net->name_scope(sub {
        $net->add(gluon->nn->Dense(1, use_bias=>0, in_units=>$train_features->shape->[1]))
        });
    $net->initialize();

    my $train_iter = load_array($train_features, $train_labels, $batch_size, 1);
    my $test_iter = load_array($test_features, $test_labels, $batch_size, 0);

    my $trainer = gluon->Trainer($net->collect_params(), 'sgd', {learning_rate => 0.01});

    my $animator = Animator->new(xlabel=>'epoch', ylabel=>'loss', yscale=>'logarithm', xlim=>[1, $num_epochs], ylim=>[1e-3, 1e2], legend=>['train', 'test']);

    # Run every requested epoch; a leftover `last` used to stop the loop after
    # the first one, which made $num_epochs meaningless.
    for my $epoch (0..$num_epochs-1){
        train_epoch_ch3($net, $train_iter, $loss, $trainer, $batch_size);
        if( $epoch == 0 || ($epoch + 1) % 20 == 0){
            # NOTE(review): the inner parentheses flatten, so add() receives
            # three scalars (x, train loss, test loss) -- confirm that this is
            # the calling convention Animator->add expects.
            $animator->add($epoch + 1, (evaluate_loss($net, $train_iter, $loss), evaluate_loss($net, $test_iter, $loss)));
        }
    }

    # Printed the same way the other cells print arrays: asarray + dump.
    print 'weight: ', dump($net->[0]->weight->data()->asarray), "\n";
}

Warning: Subroutine train redefined at reply input line 1.


### Third-Order Polynomial Function Fitting (Normal)
We will begin by first using a third-order polynomial function, which is the same order as that
of the data generation function. The results show that this model's training and test losses can
be both effectively reduced. The learned model parameters are also close to the true values w =
[5, 1.2, −3.4, 5.6].

In [None]:
# Pick the first four dimensions, i.e., 1, x, x^2/2!, x^3/3! from the
# polynomial features.
# Row ranges are half-open (row_start inclusive, row_end exclusive), so the
# test split starts at $n_train; starting at $n_train+1 would leave row
# $n_train out of both splits.
train(
    getSubND({data=>$poly_features, row_start=>0, column_start=>0, row_end=>$n_train, column_end=>4}),
    getSubND({data=>$poly_features, row_start=>$n_train, column_start=>0, row_end=>$poly_features->shape->[0], column_end=>4}),
    getSubND({data=>$labels, row_start=>0, row_end=>$n_train}),
    getSubND({data=>$labels, row_start=>$n_train, row_end=>$labels->shape->[0]})
)

### Linear Function Fitting (Underfitting)
Let us take another look at linear function fitting. After the decline in early epochs, it becomes
difficult to further decrease this model's training loss. After the last epoch iteration has been
completed, the training loss is still high. When used to fit nonlinear patterns (like the third-order
polynomial function here) linear models are liable to underfit.

In [20]:
# Pick the first two dimensions, i.e., 1, x, from the polynomial features.
# Row ranges are half-open (row_start inclusive, row_end exclusive), so the
# test split starts at $n_train; starting at $n_train+1 would leave row
# $n_train out of both splits.
train(
    getSubND({data=>$poly_features, row_start=>0, column_start=>0, row_end=>$n_train, column_end=>2}),
    getSubND({data=>$poly_features, row_start=>$n_train, column_start=>0, row_end=>$poly_features->shape->[0], column_end=>2}),
    getSubND({data=>$labels, row_start=>0, row_end=>$n_train}),
    getSubND({data=>$labels, row_start=>$n_train, row_end=>$labels->shape->[0]})
)


Error: Undefined subroutine &main::net called at reply input line 16.
 at /usr/local/lib/perl5/site_perl/5.32.1/AI/MXNet/AutoGrad.pm line 408.
	AI::MXNet::AutoGrad::record("autograd", CODE(0xb89ae50)) called at reply input line 18
	main::train_epoch_ch3(CODE(0xb9a84e0), AI::MXNet::Gluon::Data::Loader::DataLoader=HASH(0xb99f6d0), AI::MXNet::Gluon::L2Loss=HASH(0xb9964f0), REF(0xba0d718)) called at reply input line 23
	main::train(AI::MXNet::NDArray=HASH(0xb886910), AI::MXNet::NDArray=HASH(0xb9f7af0), AI::MXNet::NDArray=HASH(0x948d680), AI::MXNet::NDArray=HASH(0xb995c68)) called at reply input line 2
	Eval::Closure::Sandbox_1288::__ANON__() called at /usr/local/lib/perl5/site_perl/5.32.1/Reply/Plugin/Defaults.pm line 71
	Reply::Plugin::Defaults::execute(Reply::Plugin::Defaults=HASH(0x4ffc8b0), CODE(0xb886460), CODE(0xb892c70)) called at /usr/local/lib/perl5/site_perl/5.32.1/Reply.pm line 217
	Reply::_wrapped_plugin(Reply=HASH(0x5084558), ARRAY(0x572d7e0), "execute", CODE(0xb892c70)) called at /usr/local/lib/perl5/site_perl/5.32.1/Reply.pm line 215
	Reply::__ANON__(CODE(0xb892c70)) called at /usr/local/lib/perl5/site_perl/5.32.1/Reply/Plugin/IPerl.pm line 28
	Reply::Plugin::IPerl::__ANON__() called at /usr/local/lib/perl5/site_perl/5.32.1/Capture/Tiny.pm line 382
	eval {...} called at /usr/local/lib/perl5/site_perl/5.32.1/Capture/Tiny.pm line 382
	Capture::Tiny::_capture_tee(1, 1, 0, 0, CODE(0x2b15468)) called at /usr/local/lib/perl5/site_perl/5.32.1/Reply/Plugin/IPerl.pm line 29
	Reply::Plugin::IPerl::execute(Reply::Plugin::IPerl=HASH(0x50b0fb0), CODE(0x572d468), CODE(0xb892c70)) called at /usr/local/lib/perl5/site_perl/5.32.1/Reply.pm line 217
	Reply::_wrapped_plugin(Reply=HASH(0x5084558), "execute", CODE(0xb892c70)) called at /usr/local/lib/perl5/site_perl/5.32.1/Reply.pm line 174
	Reply::_eval(Reply=HASH(0x5084558), "\x{a}#line 1 \"reply input\"\x{a}# Pick the first two dimensions, i.e.,"...) called at /usr/local/lib/perl5/site_perl/5.32.1/Reply.pm line 66
	Reply::try {...} () called at /usr/local/lib/perl5/site_perl/5.32.1/Try/Tiny.pm line 102
	eval {...} called at /usr/local/lib/perl5/site_perl/5.32.1/Try/Tiny.pm line 93
	Try::Tiny::try(CODE(0x57189c8), Try::Tiny::Catch=REF(0xb988cd0)) called at /usr/local/lib/perl5/site_perl/5.32.1/Reply.pm line 71
	Reply::step(Reply=HASH(0x5084558), "# Pick the first two dimensions, i.e., 1, x, from the polynom"..., 0) called at /usr/local/lib/perl5/site_perl/5.32.1/Devel/IPerl/Kernel/Backend/Reply.pm line 48
	Devel::IPerl::Kernel::Backend::Reply::__ANON__() called at /usr/local/lib/perl5/site_perl/5.32.1/Capture/Tiny.pm line 382
	eval {...} called at /usr/local/lib/perl5/site_perl/5.32.1/Capture/Tiny.pm line 382
	Capture::Tiny::_capture_tee(1, 1, 0, 0, CODE(0xb86ead8)) called at /usr/local/lib/perl5/site_perl/5.32.1/Devel/IPerl/Kernel/Backend/Reply.pm line 49
	Devel::IPerl::Kernel::Backend::Reply::run_line(Devel::IPerl::Kernel::Backend::Reply=HASH(0x29aab28), "# Pick the first two dimensions, i.e., 1, x, from the polynom"...) called at /usr/local/lib/perl5/site_perl/5.32.1/Devel/IPerl/Kernel/Callback/REPL.pm line 42
	Devel::IPerl::Kernel::Callback::REPL::execute(Devel::IPerl::Kernel::Callback::REPL=HASH(0x2a70a18), Devel::IPerl::Kernel=HASH(0x1e58f80), Devel::IPerl::Message::ZMQ=HASH(0x572d5d0)) called at (eval 30) line 6
	Devel::IPerl::Kernel::Callback::REPL::execute(Devel::IPerl::Kernel::Callback::REPL=HASH(0x2a70a18), Devel::IPerl::Kernel=HASH(0x1e58f80), Devel::IPerl::Message::ZMQ=HASH(0x572d5d0)) called at /usr/local/lib/perl5/site_perl/5.32.1/Devel/IPerl/Kernel/Callback/REPL.pm line 156
	Devel::IPerl::Kernel::Callback::REPL::msg_execute_request(Devel::IPerl::Kernel::Callback::REPL=HASH(0x2a70a18), Devel::IPerl::Kernel=HASH(0x1e58f80), Devel::IPerl::Message::ZMQ=HASH(0x572d5d0), ZMQ::LibZMQ3::Socket=HASH(0x5705168)) called at /usr/local/lib/perl5/site_perl/5.32.1/Devel/IPerl/Kernel.pm line 245
	Devel::IPerl::Kernel::route_message(Devel::IPerl::Kernel=HASH(0x1e58f80), ARRAY(0x5705288), ZMQ::LibZMQ3::Socket=HASH(0x5705168)) called at /usr/local/lib/perl5/site_perl/5.32.1/Devel/IPerl/Kernel.pm line 215
	Devel::IPerl::Kernel::__ANON__(Net::Async::ZMQ::Socket=HASH(0x5705318)) called at /usr/local/lib/perl5/site_perl/5.32.1/IO/Async/Loop/Poll.pm line 172
	IO::Async::Loop::Poll::post_poll(IO::Async::Loop::Poll=HASH(0x5653b50)) called at /usr/local/lib/perl5/site_perl/5.32.1/IO/Async/Loop/Poll.pm line 292
	IO::Async::Loop::Poll::loop_once(IO::Async::Loop::Poll=HASH(0x5653b50), undef) called at /usr/local/lib/perl5/site_perl/5.32.1/IO/Async/Loop.pm line 538
	IO::Async::Loop::run(IO::Async::Loop::Poll=HASH(0x5653b50)) called at /usr/local/lib/perl5/site_perl/5.32.1/IO/Async/Loop.pm line 575
	IO::Async::Loop::loop_forever(IO::Async::Loop::Poll=HASH(0x5653b50)) called at /usr/local/lib/perl5/site_perl/5.32.1/Devel/IPerl/Kernel.pm line 225
	Devel::IPerl::Kernel::run(Devel::IPerl::Kernel=HASH(0x1e58f80)) called at /usr/local/lib/perl5/site_perl/5.32.1/Devel/IPerl.pm line 14
	Devel::IPerl::main() called at -e line 1


### Higher-Order Polynomial Function Fitting (Overfitting)
Now let us try to train the model using a polynomial of too high degree. Here, there are insufficient
data to learn that the higher-degree coefficients should have values close to zero. As a result, our
overly-complex model is so susceptible that it is being influenced by noise in the training data.
Though the training loss can be effectively reduced, the test loss is still much higher. It shows that
the complex model overfits the data.

In [None]:
# Pick all the dimensions from the polynomial features.
# Row ranges are half-open (row_start inclusive, row_end exclusive), so the
# test split starts at $n_train; starting at $n_train+1 would leave row
# $n_train out of both splits.
train(
    getSubND({data=>$poly_features, row_start=>0, column_start=>0, row_end=>$n_train, column_end=>$poly_features->shape->[1]}),
    getSubND({data=>$poly_features, row_start=>$n_train, column_start=>0, row_end=>$poly_features->shape->[0], column_end=>$poly_features->shape->[1]}),
    getSubND({data=>$labels, row_start=>0, row_end=>$n_train}),
    getSubND({data=>$labels, row_start=>$n_train, row_end=>$labels->shape->[0]}),
    1500
)

In the subsequent sections, we will continue to discuss overfitting problems and methods for dealing with them, such as weight decay and dropout.

### Summary
* Since the generalization error cannot be estimated based on the training error, simply minimizing the training error will not necessarily mean a reduction in the generalization error. <br> Machine learning models need to be careful to safeguard against overfitting so as to minimize the generalization error.
* A validation set can be used for model selection, provided that it is not used too liberally.
* Underfitting means that a model is not able to reduce the training error. When training error is much lower than validation error, there is overfitting.
* We should choose an appropriately complex model and avoid using insufficient training samples.