test/RunTests

#!/usr/bin/env perl
#
# Test suite for vw:
#
# You may add arbitrary (train/test/varying-options) tests
# by adding data files and their expected reference STDOUT and STDERR
#
# See __DATA__ below for how to add more tests
#
require 5.008;
use warnings;

use Getopt::Std;
use File::Basename;

use vars qw($opt_d $opt_D $opt_c $opt_e $opt_f
            $opt_E $opt_o $opt_w $opt_y $opt_t
            $opt_v $opt_V $opt_F $opt_O);

# Default tolerance for fuzzy (-f) numeric comparisons; -E overrides it.
my $Epsilon = 1e-4;

# The vw executable under test; assigned in init() from which_vw().
my $VW;

# External utilities we use. See init() for Windows specific actions.
my $Diff = 'diff';
my $Cat = 'cat';

# Prepend the usual build/test locations so helper programs and vw resolve.
$ENV{'PATH'} = '../build/vowpalwabbit:test:../vowpalwabbit:vowpalwabbit:.:' . $ENV{'PATH'};

# -V prefixes valgrind like this, we should adjust the default
# options over time to what looks most useful.
# NOTE: --error-exitcode=100 is what run_tests() checks for (status == 100).
my $Valgrind = 'valgrind --quiet --error-exitcode=100 --track-origins=yes --leak-check=full';

# -- timeout is part of GNU coreutils, some systems may not have it
# $TimeOut stays empty unless init() finds a 'timeout' executable.
my $TimeOut = '';
my $TimeOutSec = 300;    # max allowed time for single vw command run

# By default, we run all tests in the list
my $FullRun = 1;
# Number of failures seen so far; used as the script's exit status.
my $ErrorCount = 0;

# These --side-by-side diff opts are used to make the
# fuzzy-compare easier: just split on '|' and compare numeric values
# word by word:
# NOTE: -W 160 used to be sufficient for most outputs
#       (--bfgs prints 134 chars-per-line), but newer tests have VERY
#       wide outputs, so the width needs to be huge.
my $DiffOpts = '-N --minimal --suppress-common-lines --ignore-all-space --strip-trailing-cr --side-by-side -W 1500';
# Regex (as a string) used to split a line into words for fuzzy comparison.
$WordSplit = "[ \t:,@]+";

# These diff options are used for the diff we want to show the user
# The intent is to make them easier to parse (and compare values) by a human
my $DisplayDiffOpts = '-u --minimal';

# Directories searched for 'vw' (before $PATH) by which_vw().
my @PathAdd = qw(. .. ../vowpalwabbit);

# Sorted list of explicitly requested test numbers (empty: run everything).
my @ToTest = ();

# __DATA__ test counter
my $TestNo = 0;

#
# v($level, @message)
#   Verbose logging to STDERR: emits the message only when the -v
#   verbosity ($opt_v) is at least $level.  A single message argument is
#   printed verbatim; otherwise the arguments are handed to printf
#   (format first).
#
sub v($;@) {
    my ($min_level, @message) = @_;
    return if ($opt_v < $min_level);

    if (@message == 1) {
        return print STDERR @message;
    }
    return printf STDERR @message;
}

#
# usage(@reason)
#   Print the optional @reason (followed by a newline) to STDERR, then
#   die with the command-line help text.  Never returns.
#
sub usage(@) {
    my @reason = @_;
    if (@reason) {
        print STDERR @reason, "\n";
    }

    my $help = <<"END_USAGE";
Usage: $0 [options] [testno...] [vw-executable]
    By default will run against the 1st 'vw' executable found in:
        @PathAdd  \$PATH

    Options:
        -c      print test-suite commands before running them
        -d      print diff output on significant diff failure
        -D      print diff output even if it is not significant
        -e      exit with non-zero status on first error
        -w      Ignore white-space differences (diff --ignore-space-change)
        -f      Ignore small (< $Epsilon) floating-point differences (fuzzy compare)
        -E<e>   Tolerance epsilon <e> for fuzzy compares (default $Epsilon)
        -o      Overwrite reference file with new/different result
        -y      On error, copy bad files to (eg stderr.test21) for later comparison
        -v<L>   Verbosity <L> (small integer) is verbosity level
        -V      apply valgrind to vw commands
        -F      include flatbuffer tests
        -t<T>   Apply timeout <T> (default $TimeOutSec) secs to individual tests
                (will only work where GNU coreutils 'timeout' is present)
        -O<O>   Add <O> option(s) to all vw commands

    [testno...]   Optional integer args: explicit test numbers (skip others)
END_USAGE

    die $help;
}

#
# mysystem($command)
#   Run $command through system(), echoing it first at verbosity >= 1.
#   Returns whatever system() returns; $? is left set for the caller.
#
sub mysystem {
    my ($shell_cmd) = @_;
    v(1, "%s\n", $shell_cmd);
    return system($shell_cmd);
}

#
# command_failed($cmd)
#   Inspect $? (as left by the last system() call) for $cmd and report
#   crashes, time-outs and plain failures on STDERR, one message per case.
#   Returns 0 when the command succeeded, 1 when it died from a signal,
#   otherwise the command's exit code (124 is reported as a time-out).
#
sub command_failed($) {
    my ($command) = @_;

    # Snapshot the wait status before anything else can overwrite it.
    my $wait_status = $?;
    return 0 unless ($wait_status);

    my $exit_status = $wait_status >> 8;
    my $sig         = $wait_status & 127;
    my $core_note   = ($wait_status & 128) ? ' (core dumped)' : '';

    if ($sig) {
        printf STDERR
            "$0: test $TestNo: '%s' died from signal $sig$core_note\n", $command;
        return 1;
    }

    if ($exit_status == 124) {
        printf STDERR
            "$0: test $TestNo: '%s' timed-out (exitcode=$exit_status)\n" .
            "$0: test $TestNo: you may increase the imposed time-out: \$TimeOutSec=%d\n",
            $command, $TimeOutSec;
    }
    elsif ($exit_status) {
        printf STDERR
            "$0: test $TestNo: '%s' failed (exitcode=$exit_status)\n", $command;
    }

    # Non-zero only if $command failed
    return $exit_status;
}

#
# valgrind_errfile($testno)
#   Name of the file that receives valgrind's log for test $testno.
#
sub valgrind_errfile($) {
    my ($test_id) = @_;
    return sprintf('Test-%s.valgrind-err', $test_id);
}

#
# which vw executable to test against
#
sub which_vw() {
    # Returns the path of the vw executable to test:
    #   - the first remaining command-line argument, if there is one
    #     (it must be an executable plain file, else usage() dies), or
    #   - the first executable plain file named 'vw' found in @PathAdd
    #     followed by the directories of $PATH.
    # Dies via usage() when nothing suitable is found.
    if (@ARGV > 0) {
        my $exe = $ARGV[0];
        if (-f $exe && -x $exe) {
            printf STDERR "Testing vw: %s\n", $exe;
            return $exe;
        }
        usage("$0: argument $exe: not an executable file");
    }

    foreach my $dir (@PathAdd, split(':', $ENV{PATH})) {
        my $exe = "$dir/vw";
        # -x alone is also true for searchable directories, so a directory
        # called 'vw' must not be mistaken for the executable.
        if (-f $exe && -x $exe) {
            printf STDERR "Testing vw: %s\n", $exe;
            return $exe;
        }
    }
    usage("can't find a 'vw' executable to test on");
}


#
# init()
#   One-time setup: parses command-line options, adapts external tool
#   paths on Windows/cygwin, separates numeric test-number arguments from
#   the optional vw-executable argument, locates vw and the optional
#   'timeout' utility, and validates -t.
#
sub init() {
    $0 =~ s{.*/}{};
    getopts('wcdDefyE:ov:VFt:O:') || usage();
    $opt_v = 0 unless (defined $opt_v and $opt_v);

    # Extra vw options are spliced right after the executable name,
    # hence the leading space.
    $opt_O = (defined $opt_O) ? " $opt_O" : '';

    my $hostname = `hostname`;
    chomp($hostname);
    printf STDERR "Testing on: hostname=%s OS=%s\n", $hostname, $^O;

    if ($^O =~ /MSWin/i) {
        v(1, "OS is $^O\n");
        # On MS Windows we need to change paths to external executables
        # Assumes cygwin is installed
        $ENV{'PATH'} .= ':/cygdrive/c/cygwin/bin';
        # And just to be safe (probably not needed):
        $Diff = 'c:\cygwin\bin\diff.exe';
        $Cat  = 'c:\cygwin\bin\cat.exe';
    }
    elsif ($^O =~ /cygwin/i) {
        v(1,"OS is $^O\n");
        # Under cygwin only the PATH is extended; $Diff and $Cat keep
        # their default names.
        $ENV{'PATH'} .= ':/cygdrive/c/cygwin/bin';
    }

    $Epsilon = $opt_E if ($opt_E);
    $Diff .= ' --ignore-space-change' if ($opt_w);

    # Pure-integer arguments are test numbers; anything else is left in
    # @ARGV for which_vw().
    my @test_numbers = grep {  /^\d+$/ } @ARGV;
    my @other_args   = grep { !/^\d+$/ } @ARGV;
    if (@test_numbers) {
        @ToTest = sort { $a <=> $b } @test_numbers;
        # add dummy element so we don't become empty on last test
        push(@ToTest, -1);
        $FullRun = 0;
    }
    @ARGV = @other_args;

    $VW = which_vw();

    my $timeout_path = `which timeout 2>/dev/null`;
    if ($timeout_path =~ /timeout$/) {
        chomp($timeout_path);
        $TimeOut = $timeout_path;
        v(1,"timeout is: %s\n", $TimeOut);
    }

    if ($opt_t) {
        usage("-t $opt_t: -t can only accept integer seconds")
            unless ($opt_t =~ /^\d+$/);
        $TimeOutSec = $opt_t;
        warn "-t passed but this env doesn't have timeout installed\n"
            unless ($TimeOut);
    }
}

#
# copy_file($src_file, $dst_file)
#   Announce on STDERR and copy $src_file to $dst_file (File::Copy).
#   Returns the result of copy().
#
sub copy_file {
    use File::Copy;
    my ($from, $to) = @_;
    print STDERR "\t\t-> copying output to $to\n";
    return copy($from, $to);
}

#
# trim_spaces($str)
#   Return a copy of $str without leading and trailing white-space
#   (newlines included).
#
sub trim_spaces($) {
    my ($text) = @_;
    $text =~ s/\A\s+//;
    $text =~ s/\s+\z//;
    return $text;
}

#
# ref_file($default_name)
#   Reference file existence: if we're on Windows, AND
#   an alternate reference-file exists, give precedence
#   to the alternate file (file with a '-mswin' suffix.)
#
sub ref_file($) {
    my ($default_ref) = @_;

    # Only Windows-like platforms ever look for an alternate reference.
    return $default_ref unless ($^O =~ /MSWin/i or $^O =~ /cygwin/i);

    my $windows_ref = $default_ref . '-mswin';
    return (-e $windows_ref) ? $windows_ref : $default_ref;
}

#
# next_paragraph()
#   Read the next test paragraph (a run of non-blank, non-comment lines)
#   from the DATA section.
#
#   Comment lines are skipped; a comment of the form '# Test <N>' sets the
#   test id of the paragraph that follows.  A trailing backslash joins a
#   line with the next one.
#
#   Returns ($testid, $paragraph) with the paragraph trimmed, or an empty
#   list once DATA holds no more paragraphs.
#
sub next_paragraph {
    my $paragraph = '';
    my $testid = '<unknown id>';

    while (my $line = <DATA>) {
        if ($line =~ /^\s*#/) {       # skip comment lines
          if ($line =~ /^# Test (\d+)/) {
            $testid = $1;
          }
          next;
        }
        if ($line =~ /\\$/) {           # support line continuation
            $line =~ s/\\\n/ /;
        }
        $paragraph .= $line if $line =~ /\w/;

        if ($paragraph and ($line =~ /^\s*$/ || eof(DATA))) {
            # end of paragraph
            chomp $paragraph;
            $paragraph = trim_spaces($paragraph);
            return ($testid, $paragraph);
        }
    }

    # DATA ended on comment line(s) directly after a paragraph: the loop
    # never saw a terminating blank line, so hand the paragraph out here
    # rather than silently dropping the last test.
    if ($paragraph) {
        chomp $paragraph;
        return ($testid, trim_spaces($paragraph));
    }
    return;
}

#
# next_test()
#   Parse the next runnable test out of the DATA section.
#
#   Sets the global $TestNo and returns
#       ($cmd, $out_ref, $err_ref, @other_ref)
#   where $cmd is the shell command ({VW} substituted, wrapped in
#   valgrind or timeout as configured), $out_ref / $err_ref are the
#   *.stdout / *.stderr reference files ($err_ref defaults to /dev/null)
#   and @other_ref lists any further existing reference files.
#
#   Tests using --flatbuffer are skipped unless -F was given.
#   Returns four undefs when there are no more tests.
#
sub next_test() {
    # Loop here (rather than leaving the sub through the caller's loop)
    # so that a skipped test simply moves on to the next paragraph.
    PARAGRAPH:
    while (1) {
        my ($cmd, $out_ref, $err_ref, @other_ref);

        my $paragraph = '';
        ($TestNo, $paragraph) = next_paragraph();
        return (undef, undef, undef, undef) if !defined $paragraph;
        my @lines = split("\n", $paragraph);

        # The command line must be first
        $cmd = shift @lines;
        foreach my $line (@lines) {
            if ($line =~ m/\.stdout\b/) {
                $out_ref = ref_file(trim_spaces($line));
                next;
            }
            if ($line =~ /\.stderr\b/) {
                $err_ref = ref_file(trim_spaces($line));
                next;
            }

            # any other reference file
            $line = ref_file(trim_spaces($line));
            if (-e $line) {
                push(@other_ref, $line);
            } else {
                unless ($opt_y) {
                    printf STDERR "__DATA__: line $.: " .
                              "non-existent reference file: %s\n", $line;
                }
                next;
            }
        }

        if (eof(DATA) && !defined $cmd) {
            return (undef, undef, undef, undef);
        }

        # Must be checked before $cmd is used in any pattern match.
        unless (defined $cmd) {
            die "$0: test $TestNo: command is undefined\n";
        }

        if ($cmd =~ /\{VW\}/) {
             $cmd = trim_spaces($cmd);
             $cmd =~ s/\{VW\}/$VW$opt_O/g;
        }

        if ($cmd =~ /--flatbuffer/ && !$opt_F) {
            printf STDERR "Skipping test %s. Run with -F to include\n", $TestNo;
            next PARAGRAPH;
        }

        unless (defined $err_ref) {
            v(2, "%s: test %s: stderr ref: undefined\n", $0, $TestNo);
            $err_ref = '/dev/null';
        }

        if ($opt_V) {
            $cmd = sprintf("%s --log-file='%s' %s",
                            $Valgrind, valgrind_errfile($TestNo), $cmd);
        } elsif ($TimeOut) {
            $cmd = sprintf("%s %u %s", $TimeOut, $TimeOutSec, $cmd);
        }
        return ($cmd, $out_ref, $err_ref, @other_ref);
    }
}

#
# If the difference is small (least significant digits of numbers)
# treat it as ok. It may be a result of 32 vs 64 bit calculations.
#
use Scalar::Util qw(looks_like_number);

sub lenient_array_compare($$) {
    # lenient_array_compare(\@words1, \@words2)
    #   Word-by-word fuzzy comparison of two tokenized lines.
    #   Returns 0 when the lines are identical or differ only in numeric
    #   words by an insignificant amount, 1 otherwise:
    #     - a different number of words is always significant
    #     - differing non-numeric words are always significant
    #     - a NaN on exactly one side is always significant
    #     - numbers within $Epsilon (absolute) are equal; numbers with
    #       abs(word2) > 1 are also equal when their ratio is within
    #       $Epsilon of 1.0
    my ($w1_ref, $w2_ref) = @_;
    my (@w1) = @$w1_ref;
    my (@w2) = @$w2_ref;

    if ($#w1 != $#w2) { # arrays not of same size
        if ($opt_v > 3) {
            v(4, "#-of-words in two arrays are different: %d != %d\n", scalar(@w1), scalar(@w2));
            v(4, "line1: "); for (my $i=0; $i <= $#w1; $i++) { v(4, " word[%d]='%s'", $i, $w1[$i]) }; v(4, "\n");
            v(4, "line2: "); for (my $i=0; $i <= $#w2; $i++) { v(4, " word[%d]='%s'", $i, $w2[$i]) }; v(4, "\n");
        }
        return 1;
    }
    my $nelem = scalar @w1;
    for (my $i = 0; $i < $nelem; $i++) {
        my ($word1, $word2) = ($w1[$i], $w2[$i]);
        next if ($word1 eq $word2);

        # Some output contains '...', remove this for comparison.
        $word1 =~ s/\.\.\.//;
        $word2 =~ s/\.\.\.//;

        # There's some difference, is it significant?
        unless (looks_like_number($word1)) {
            v(4, "$word1 vs $word2: word1=$word1 is not a number!\n");
            return 1;
        }
        unless (looks_like_number($word2)) {
            v(4, "$word1 vs $word2: word2=$word2 is not a number!\n");
            return 1;
        }

        # looks_like_number() accepts "nan".  Every ordered comparison
        # with NaN is false, so without this check a NaN on one side
        # would slip through the ($delta > $Epsilon) test below and be
        # reported as "no meaningful difference".
        my $is_nan1 = (($word1 + 0) != ($word1 + 0));
        my $is_nan2 = (($word2 + 0) != ($word2 + 0));
        if ($is_nan1 || $is_nan2) {
            # Differently spelled NaNs on both sides are still equal.
            next if ($is_nan1 && $is_nan2);
            v(4, "$word1 vs $word2: NaN on one side only\n");
            return 1;
        }

        my $delta = abs($word1 - $word2);

        if ($delta > $Epsilon) {
            # We have a 'big enough' difference, but this difference
            # may still not be meaningful in all contexts:

            # Big numbers should be compared by ratio rather than
            # by difference

            # Must ensure we can divide (avoid div-by-0)
            if (abs($word2) <= 1.0) {
                # If numbers are so small (close to zero),
                # ($delta > $Epsilon) suffices for deciding that
                # the numbers are meaningfully different
                v(4, "$word1 vs $word2: delta=$delta > Epsilon=$Epsilon\n");
                return 1;
            }
            # Now we can safely divide (since abs($word2) > 0)
            # and determine the ratio difference from 1.0
            my $ratio_delta = abs($word1/$word2 - 1.0);
            if ($ratio_delta > $Epsilon) {
                v(4, "$word1 vs $word2: ratio_delta=$ratio_delta > Epsilon=$Epsilon\n");
                return 1;
            }
        }
    }
    return 0; # no meaningful difference
}

#
# diff_lenient_float($reffile, $outfile)
#   Fuzzy file comparison: runs a side-by-side diff of the two files into
#   a temporary file and compares every differing line pair word by word
#   with lenient_array_compare().
#
#   Returns 0 when there is no difference or only insignificant numeric
#   differences, non-zero otherwise.  When a diff line has data on one
#   side only, the diff is kept as 'test-<N>.lenient-diff' for inspection
#   and 1 is returned.
#
sub diff_lenient_float($$) {
    my ($reffile, $outfile) = @_;
    my $status = 0;

    my $tmpf = 'lenient-diff.tmp';
    mysystem("$Diff $DiffOpts $reffile $outfile >$tmpf");
    $status = $? >> 8;
    v(2, "diff produced $tmpf: status=$status\n");
    if (-s $tmpf) {
        # The diff has something in it.
        my $fuzzy_status = 0;   # assume innocent till proven guilty
        open(my $sdiff, '<', $tmpf)
            or die "$0: diff_lenient_float: $tmpf: $!\n";
        while (<$sdiff>) {
            chomp;
            # Side-by-side diff separates changed line pairs with '|'
            my ($line1, $line2) = split(/\s*\|\s*/, $_);
            unless (defined($line1) && defined($line2)) {
                my $save_diff_file = "test-$TestNo.lenient-diff";
                warn "$0: test $TestNo: $tmpf: line $.: fuzzy-match missing data on one of the sides. Can't compare\n$_\n";
                warn "$0: test $TestNo: saving lenient diff in '$save_diff_file' for later inspection\n";
                close $sdiff;
                rename($tmpf, $save_diff_file)
                    or warn "$0: test $TestNo: rename($tmpf, $save_diff_file): $!\n";
                return 1;
            }
            # strip leading spaces if any (happens with --bfgs)
            $line1 =~ s/^\s+//;
            $line2 =~ s/^\s+//;
            v(3, "line1: %s\n", $line1);
            v(3, "line2: %s\n", $line2);

            # Break lines into tokens/words
            my (@w1) = split(/$WordSplit/o, $line1);
            my (@w2) = split(/$WordSplit/o, $line2);
            if (lenient_array_compare(\@w1, \@w2) != 0) {
                $fuzzy_status = 1;
                last;
            }
        }
        close $sdiff;
        $status = $fuzzy_status;
    }
    # The temporary diff is kept on failure for later inspection.
    unlink($tmpf) if ($status == 0);
    $status;
}

#
# perl internal way to emulate 'touch'
#
sub touch(@) {
    # Set access and modification times of all given files to "now";
    # returns utime's result.
    my @files = @_;
    my $stamp = time;
    return utime($stamp, $stamp, @files);
}

#
# display_diff($reference_file, $actual_file)
#   Show the user a human-readable diff of the two files: the diff
#   command itself is announced on STDERR, then run via mysystem().
#
sub display_diff($$) {
    my ($expected, $actual) = @_;
    my $command = join(' ', $Diff, $DisplayDiffOpts, $expected, $actual);

    printf STDERR "--- %s\n", $command;
    return mysystem($command);
}

#
# diff($reffile, $outfile)
#   Compare an actual output file against its reference file.
#
#   Returns 0 when the files match (or differ only insignificantly while
#   -f is in effect, or when the reference was overwritten under -o
#   without -e), and a non-zero status otherwise.  A missing reference
#   counts as a failure (status 2) only if the output is non-empty.
#
#   Side effects: may print diffs (-d / -D), may replace the reference
#   file with the output, keeping the old one as '<ref>.prev' (-o), and
#   exits immediately under -e when a non-empty output has no reference.
#
sub diff($$) {
    my ($reffile, $outfile) = @_;
    my $status = 0;
    # An undefined reference is treated like a non-existent file name
    $reffile = '' unless (defined $reffile);

    # Special case, empty file w/o reference is not considered a failure.
    # This is a most common case with stdout.
    unless (-e $reffile) {
        if (-s $outfile > 0) {
            warn "$0: test $TestNo: stdout ref: $reffile: $!\n";
            exit 1 if ($opt_e);
            # With -o we fall through to the comparison/overwrite below
            return 2 unless ($opt_o);
        } else {
            # Empty output without a ref is not considered a failure
            v(1, "$0: test $TestNo: empty output w/o reference: ignored.\n");
            return 0;
        }
    }

    # Actually run the diff
    my $diff_cmd = "$Diff $DiffOpts $reffile $outfile";
    my $diftmp = 'diff.tmp';
    mysystem("$diff_cmd >$diftmp");
    $status = $? >> 8;
    v(2, "$diff_cmd >$diftmp: status=$status\n");

    if (-s "$diftmp") {
        # There's some difference
        v(2, "$diftmp has something in it. Is it meaningful?\n");

        # -f: a difference that is only numeric noise is forgiven
        if ($opt_f && -e $reffile && -e $outfile &&
            diff_lenient_float($reffile, $outfile) == 0) {

            print STDERR "$0: test $TestNo: minor (<$Epsilon) precision differences ignored\n";
            $status = 0;
        }
        if ($opt_D or ($opt_d && $status)) {
            # Print the diff only iff:
            #   1) -D is in effect  OR
            #   2) -d is in effect and diff is significant
            display_diff($reffile, $outfile);
        }
        if ($opt_o) {
            print STDERR "-o: overwriting reference:\n";

            # Keep the previous reference around as <ref>.prev
            if (-e $reffile) {
                print STDERR "\t$reffile -> $reffile.prev\n";
                rename($reffile, "$reffile.prev") ||
                    die "FATAL: rename($reffile, $reffile.prev): $!\n";
            }
            print STDERR "\t$outfile -> $reffile\n";
            rename($outfile, $reffile) ||
                die "FATAL: rename($outfile, $reffile): $!\n";

            # Once overwritten the difference no longer counts as a
            # failure, unless -e asked to stop on the first error.
            unless ($opt_e) {
                $status = 0;
            }
        }
    }
    # The temporary diff is kept on failure for later inspection.
    unlink($diftmp) if ($status == 0);
    $status;
}

#
# check_for_time_regression()
#   Compare last overall time to run to current to catch
#   performance regressions
#
# File (in the current directory) holding the times of the previous full run.
my $LastTimeFile = 'RunTests.last.times';

#
# write_times($file, @times)
#   Overwrite $file with a single line holding @times, space separated.
#   Dies if the file cannot be opened or fully written.
#
sub write_times($@) {
    my ($file, @times) = @_;
    open(my $fh, '>', $file) or die "$0: can't open(>$file): $!\n";
    print {$fh} join(' ', @times), "\n";
    # Buffered write errors only surface when the handle is closed.
    close($fh) or die "$0: can't close($file): $!\n";
}
#
# read_times($file)
#   Return the white-space separated fields of the first line of $file
#   as a list (empty list if the file is empty).
#   Dies if the file cannot be opened.
#
sub read_times($) {
    my ($file) = @_;
    open(my $fh, '<', $file) or die "$0: can't open($file): $!\n";
    my $line = <$fh>;
    close $fh;
    # An empty (e.g. truncated) file yields no line at all.
    return () unless (defined $line);
    chomp $line;
    return (split(' ', $line));
}

sub check_for_time_regression() {
    # Compares the CPU time spent in child processes during this run
    # (child-user + child-system, from times()) with the figures saved by
    # the previous run in $LastTimeFile, reports a slowdown of more than
    # 2%, then saves the current figures for the next run.
    my $max_slowdown_ratio = 1.02;
    my $percent_delta = 0.0;

    # times() yields (user, system, child-user, child-system)
    my @now = times;
    my ($child_user_now, $child_sys_now) = @now[2, 3];
    my $total_now = $child_user_now + $child_sys_now;

    if (-e $LastTimeFile) {
        my @before = read_times($LastTimeFile);
        my ($child_user_before, $child_sys_before) = @before[2, 3];
        if (!(defined $child_sys_before) or !(defined $child_user_before)) {
            die "$0: undefined times in saved times file: $LastTimeFile," .
                    " try removing it\n"
        }
        my $total_before = $child_user_before + $child_sys_before;
        # The small constant keeps the percentage computable for a zero total
        $percent_delta =
            100 * ($total_now - $total_before) / (1e-4+$total_before);

        if ($total_before == 0) {
            die "$0: Bad times in saved times file: $LastTimeFile," .
                    " try removing it\n"
        }
        if ($total_now/$total_before > $max_slowdown_ratio) {
            printf STDERR "$0: RUNTIME REGRESSION: " .
                    "%.2f sec vs last time %.2f sec. (%.2f%% worse)\n",
                    $total_now, $total_before, $percent_delta;
        }
    }

    write_times($LastTimeFile, @now[0 .. 3]);
    printf STDERR
        "$0 runtime: user %g, system %g, total %g sec (%+.2f%% vs. last)\n",
                $child_user_now, $child_sys_now, $total_now, $percent_delta;
}

# only unlink relative path, plain files
# e.g. avoids trying to unlink /dev/null when running as root
sub safe_unlink($) {
    # Removes $path only when it is a relative path naming a plain file;
    # returns 0 without touching anything otherwise.
    my ($path) = @_;
    if (index($path, '/') == 0 or not -f $path) {
        return 0;
    }
    return unlink($path);
}

#
# run_tests()
#   Main loop: runs every test returned by next_test() (or only those
#   listed in @ToTest), compares stdout, stderr and any additional output
#   files against their reference files, and counts failures in
#   $ErrorCount.
#
#   Under -e the script exits on the first failure.  After a complete,
#   error-free, non-valgrind run the overall run time is checked against
#   the previous run.
#
sub run_tests() {
    print STDERR "$0: '-D' to see any diff output\n"
        unless ($opt_D);
    print STDERR "$0: '-d' to see only significant diff output\n"
        unless ($opt_d);
    print STDERR "$0: '-o' to force overwrite references\n"
        unless ($opt_o);
    print STDERR "$0: '-e' to abort/exit on first failure\n"
        unless ($opt_e);

    my ($cmd, $out_ref, $err_ref, @more_refs);
    my ($outf, $errf);

    mkdir('models', 0755) unless (-d 'models');

    # Start from a clean slate: no stale temporary or cache files
    unlink(glob('*.tmp'));
    unlink(glob('*.cache'));
    unlink(glob('*/*.cache'));

    while (($cmd, $out_ref, $err_ref, @more_refs) = next_test()) {
        last unless (defined $cmd);
        if (@ToTest) {
            if ($ToTest[0] != $TestNo) {
                # not one of the requested tests
                next;
            } else {
                shift(@ToTest);
            }
        }

        # Outputs are captured under the basename of their reference
        # file; without an existing reference they are discarded.
        $outf = (defined($out_ref) && -f $out_ref)
                    ? basename($out_ref)
                    : '/dev/null';

        $errf = (defined($err_ref) && -f $err_ref)
                    ? basename($err_ref)
                    : '/dev/null';

        # Run the test
        print STDERR "Test $TestNo: ($cmd) >$outf 2>$errf\n" if ($opt_c);
        mysystem("($cmd) >$outf 2>$errf");
        my $full_status = $?;
        my $status = $full_status >> 8;
        unless ($opt_V) {
            # command_failed() inspects $?, so nothing that runs another
            # process may come between mysystem() and this call.
            if (my $failure = command_failed($cmd)) {
                # A crashed, timed-out or failing command is a test failure
                $ErrorCount++;
                print STDERR `$Cat $errf`
                    unless ($failure == 124);
                if ($opt_e) {
                    printf STDERR "$0: exiting with status=$failure\n";
                    exit $failure;
                }
                next;
            }
        }
        if ($status) {
            $ErrorCount++;
            if ($opt_V && $status == 100) {
                # If the failing test was a script or Python file, ignore the failure because Valgrind
                # doesn't really work there.
                if (index($cmd, ".sh") != -1 || index($cmd, ".py") != -1) {
                    print "$0: test $TestNo: Valgrind failure in script ignored\n";
                    # Undo the error count change, as this failure is okay.
                    $ErrorCount--;
                    next;
                }
                else {
                    my $errfile = valgrind_errfile($TestNo);
                    warn "$0: test $TestNo: FAILED: valgrind errors in $errfile\n";
                }
            } elsif ($TimeOut && $status == 124) {
                warn "$0: test $TestNo: FAILED: timeout $TimeOutSec exceeded\n";
            } else {
                warn "$0: test $TestNo: '$cmd' failed: status=$status\n";
            }
            if ($opt_e) {
                printf STDERR "exiting with full status $full_status\n";
                exit 1;
            }
            next;
        }

        # command succeeded
        # -- compare stdout
        $status = diff($out_ref, $outf);
        if ($status) {
            $ErrorCount++;
            # $cmd is passed as an argument: it may contain '%' characters
            printf STDERR "%s: test %d: FAILED: ref(%s) != stdout(%s)\n\tcmd: %s\n",
                $0, $TestNo, $out_ref, $outf, $cmd;

            copy_file($outf, "$outf.test$TestNo") if ($opt_y);
            exit $status if ($opt_e);
        } else {
            if (defined $out_ref) {
                print STDERR "$0: test $TestNo: stdout OK\n";
                safe_unlink($outf);
            } else {
                v(1, "$0: test $TestNo: stdout OK (no reference)\n");
            }
        }

        # -- compare stderr
        if (! -e $err_ref  and  ! $opt_o) {
            $ErrorCount++;
            print STDERR "$0: test $TestNo: FAILED: stderr ref: $err_ref: $!\n\tcmd: $cmd\n";
            exit 1 if ($opt_e);
            next;
        }
        $status = diff($err_ref, $errf);
        if ($status) {
            $ErrorCount++;
            printf STDERR "%s: test %d: FAILED: ref(%s) != stderr(%s)\n\tcmd: %s\n",
                $0, $TestNo, $err_ref, $errf, $cmd;

            copy_file($errf, "$errf.test$TestNo") if ($opt_y);
            exit $status if ($opt_e);
        } else {
            print STDERR "$0: test $TestNo: stderr OK\n";
            safe_unlink($errf);
        }

        # -- compare all other reference files
        if (@more_refs) {
            foreach my $ref_path (@more_refs) {
                my $ref_base = basename($ref_path);
                # Verify that it exists on the shell line (literal match:
                # file names contain regex meta-characters such as '.')
                unless ($cmd =~ /\Q$ref_base\E/) {
                    printf STDERR "%s: test %d: FAILED: " .
                        "no match for '%s' in command: '%s'\n" .
                        "Unable to compare output to reference file\n",
                                $0, $TestNo, $ref_base, $cmd;
                    $ErrorCount++;
                    # $status still holds the result of an earlier
                    # comparison (possibly 0), so exit explicitly non-zero.
                    exit 1 if ($opt_e);
                    next;
                }
                $status = diff($ref_path, $ref_base);
                if ($status) {
                    $ErrorCount++;
                    printf STDERR "%s: test %d: FAILED: ref(%s) != (%s)\n\tcmd: %s\n",
                    $0, $TestNo, $ref_path, $ref_base, $cmd;
                    # NOTE(review): unlike stdout/stderr above, -y copies
                    # the new output over the reference file itself here;
                    # confirm this is intended.
                    copy_file($ref_base, $ref_path) if ($opt_y);
                    exit $status if ($opt_e);
                } else {
                    print STDERR "$0: test $TestNo: $ref_base OK\n";
                    unlink($ref_base);
                }
            }
        }
    }
    if ($FullRun == 0) {
        v(1, "Partial run: not recording overall time\n");
    } elsif ($ErrorCount > 0) {
        v(1, "Errors found: not recording overall time\n");
    } elsif ($opt_V) {
        v(1, "valgrind run: not recording overall time\n");
    } else {
        check_for_time_regression();
    }
}

# --- main
init();
run_tests();
# The exit status is the number of failed tests.  Exit statuses are
# truncated to 8 bits by the OS, so cap the value: otherwise exactly 256
# failures would be reported as success (status 0).
exit($ErrorCount > 255 ? 255 : $ErrorCount);

#
# Add tests below the __DATA__ line
#
# Each test is a sequence of non-blank lines, terminated
# by an empty line (or EOF), essentially a paragraph.
#
# Each paragraph/test should look like:
#
#   1st line: shell command to run.
#
#   2nd-to-Nth line: one-or-more reference files to compare outputs to.
#   ONE reference file per line
#   (Note: we indent these lines just for readability.)
#
#   You may break very long lines using \ at EOL.
#
#   # -------------------------------------
#   # Test <test_id>: ...
#   shell command which may include {VW} ...
#       reference/file1
#       reference/file2
#       ...more reference files...
#
#   # -------------------------------------
#
#   shell-command can be anything accepted by bash, including pipes,
#   redirections, etc., even a sequence of shell-commands separated by ';'
#
#   Inside any shell command, all (optional) appearances of {VW}
#   will be substituted by the vw executable under test.
#
#   By default, the first 'vw' executable found in '.', '..',
#   '../vowpalwabbit' or $PATH is tested.
#   To run against a different executable, pass the
#   wanted executable as an argument to RunTests
#
# The output line-items are reference files to compare outputs to:
#   - *.stdout: expected (reference file) standard output
#   - *.stderr: expected (reference file) standard error
#   - Any other relative path, pointing to a reference file to compare
#     to, this allows adding references to any explicitly named file
#     appearing on the shell-line, the only requirement is that the
#     _basename_ (path stripped of directory) of the reference file
#     would exactly match its respective file in the shell-command.
#
# For example:
#
#   #-------------------------------------------------------
#   # Test 237: readable_model
#   {VW} ... -p test75.predict --readable_model test75.rmodel
#       test/train-sets/ref/test75.stderr
#       test/pred-sets/ref/test75.predict
#       test/whatever/ref/test75.rmodel
#
#   #-------------------------------------------------------
#
# All reference filenames are relative to this (test) directory
#
# Only the STDOUT and STDERR streams in the shell command
# are implicit (so only their reference files need to be specified):
# The implicit names would be matched only by their extension
# as opposed to the full basename of the file.
#
# The two implicit names are:
#       TestXXX.stdout
#       TestXXX.stderr
#
# Windows note:
#
#   Due to differences in Random-Number-Generators in Windows,
#   floating-point outputs may differ in some tests (not all).
#
#   To minimize the need for changes (leverage existing tests and
#   reference files as much as possible), on Windows we check for
#   existence of files with '-mswin' suffix:
#       *.stderr-mswin
#       *.stdout-mswin
#   and if any of them exists, we use it instead.
#
__DATA__

# Test 1:
{VW} -k -l 20 --initial_t 128000 --power_t 1 -d train-sets/0001.dat \
    -f models/0001_1.model -c --passes 8 --invariant \
    --ngram 3 --skips 1 --holdout_off
        train-sets/ref/0001.stderr

# Test 2: checking predictions as well
{VW} -k -t -d train-sets/0001.dat -i models/0001_1.model -p 0001.predict --invariant
    test-sets/ref/0001.stderr
    pred-sets/ref/0001.predict

# Test 3: without -d, training only
{VW} -k -d train-sets/0002.dat -f models/0002.model --invariant
    train-sets/ref/0002.stderr

# Test 4: same, with -d
{VW} -k -d train-sets/0002.dat -f models/0002.model --invariant
    train-sets/ref/0002.stdout
    train-sets/ref/0002.stderr

# Test 5: add -q .., adaptive, and more (same input, different outputs)
{VW} -k --initial_t 1 --adaptive --invariant -q Tf -q ff -f models/0002a.model -d train-sets/0002.dat
    train-sets/ref/0002a.stderr

# Test 6: run predictions on Test 4 model
# Pretending the labels aren't there
{VW} -k -t -i models/0002.model -d train-sets/0002.dat -p 0002b.predict
    test-sets/ref/0002b.stderr
    pred-sets/ref/0002b.predict

# Test 7: using normalized adaptive updates and a low --power_t
{VW} -k --power_t 0.45 -f models/0002c.model -d train-sets/0002.dat
    train-sets/ref/0002c.stderr

# Test 8: predicts on test 7 model
{VW} -k -t -i models/0002c.model -d train-sets/0002.dat -p 0002c.predict
    test-sets/ref/0002c.stderr
    pred-sets/ref/0002c.predict

# Test 9: label-dependent features with csoaa_ldf
{VW} -k -c -d train-sets/cs_test.ldf -p cs_test.ldf.csoaa.predict --passes 10 --invariant --csoaa_ldf multiline --holdout_off --noconstant
    train-sets/ref/cs_test.ldf.csoaa.stderr
    train-sets/ref/cs_test.ldf.csoaa.predict

# Test 10: label-dependent features with wap_ldf
{VW} -k -c -d train-sets/cs_test.ldf -p cs_test.ldf.wap.predict --passes 10 --invariant --wap_ldf multiline --holdout_off --noconstant
    train-sets/ref/cs_test.ldf.wap.stderr
    train-sets/ref/cs_test.ldf.wap.predict

# Test 11: one-against-all
{VW} -k --oaa 10 -c --passes 10 -d train-sets/multiclass --holdout_off
    train-sets/ref/oaa.stderr

# Test 12: Error Correcting Tournament
{VW} -k --ect 10 --error 3 -c --passes 10 --invariant -d train-sets/multiclass --holdout_off
    train-sets/ref/multiclass.stderr

# Test 13: LBFGS on zero derivative input
{VW} -k -c -d train-sets/zero.dat --loss_function=squared -b 20 --bfgs --mem 7 --passes 5 --l2 1.0 --holdout_off
    train-sets/ref/zero.stdout
    train-sets/ref/zero.stderr

# Test 14: LBFGS early termination
{VW} -k -c -d train-sets/rcv1_small.dat --loss_function=logistic --bfgs --mem 7 --passes 20 --termination 0.001 --l2 1.0 --holdout_off
    train-sets/ref/rcv1_small.stdout
    train-sets/ref/rcv1_small.stderr

# Test 15: Run LDA with 100 topics on 1000 Wikipedia articles
{VW} -k --lda 100 --lda_alpha 0.01 --lda_rho 0.01 --lda_D 1000 -l 1 -b 13 --minibatch 128 -d train-sets/wiki256.dat
    train-sets/ref/wiki1K.stderr

# Test 16: neural network 3-parity with 2 hidden units
{VW} -k -c -d train-sets/3parity --hash all --passes 3000 -b 16 --nn 2 -l 10 --invariant -f models/0021.model --random_seed 19 --quiet --holdout_off
    train-sets/ref/3parity.stderr

# Test 17: neural network 3-parity with 2 hidden units (predict)
{VW} -d train-sets/3parity -t -i models/0021.model -p 0022.predict
    pred-sets/ref/0022.stderr
    pred-sets/ref/0022.predict

# Test 18: cubic features -- on a parity test case
{VW} -k -c -f models/xxor.model -d train-sets/xxor.dat --cubic abc --passes 100 --holdout_off --progress 1.33333
    train-sets/ref/xxor.stderr

# Test 19: matrix factorization -- training
{VW} -k -d train-sets/ml100k_small_train -b 16 -q ui --rank 10 \
    --l2 2e-6 --learning_rate 0.05 --passes 2 \
    --decay_learning_rate 0.97 --power_t 0 -f models/movielens.reg \
    -c --loss_function classic --holdout_off
        train-sets/ref/ml100k_small.stdout
        train-sets/ref/ml100k_small.stderr

# Test 20: matrix factorization -- testing
{VW} -i models/movielens.reg -t -d test-sets/ml100k_small_test
    test-sets/ref/ml100k_small.stdout
    test-sets/ref/ml100k_small.stderr

# Test 21: active-learning -- training
{VW} -k --active --simulation --mellowness 0.000001 -d train-sets/rcv1_small.dat -l 10 --initial_t 10 --random_seed 3
    train-sets/ref/active-simulation.t24.stderr

# Test 22: bagging -- training regressor
{VW} -k -d train-sets/0002.dat -f models/bs.reg.model --bootstrap 4 -p bs.reg.predict
    train-sets/ref/bs.reg.stderr
    train-sets/ref/bs.reg.predict

# Test 23: bagging -- predicting with bagged regressor
{VW} -d train-sets/0002.dat -i models/bs.reg.model -p bs.prreg.predict -t
    train-sets/ref/bs.prreg.stderr
    train-sets/ref/bs.prreg.predict

# Test 24: bagging -- binary classifiers
{VW} -d train-sets/0001.dat -f models/bs.vote.model --bootstrap 4 --bs_type vote -p bs.vote.predict
    train-sets/ref/bs.vote.stderr
    train-sets/ref/bs.vote.predict

# Test 25: bagging -- predict with bagged classifier
{VW} -d train-sets/0001.dat -i models/bs.vote.model -p bs.prvote.predict -t
    train-sets/ref/bs.prvote.stderr
    train-sets/ref/bs.prvote.predict

# Test 26: affix features
{VW} -d train-sets/affix_test.dat -k -c --passes 10 --holdout_off --affix -2
    train-sets/ref/affix_test.stderr

# Test 27: train --l1 regularized model
{VW} -d train-sets/0001.dat -f models/mask.model --invert_hash mask.predict --l1 0.01
    train-sets/ref/mask.stderr

# Test 28: train model using --feature_mask
{VW} -d train-sets/0001.dat --invert_hash remask.predict --feature_mask models/mask.model -f models/remask.model
    train-sets/ref/remask.stderr

# Test 29: train model using --feature_mask and --initial_regressor
{VW} -d train-sets/0001.dat --feature_mask models/mask.model -i models/remask.model
    train-sets/ref/remask.final.stderr

# Test 30: train model for topk recommender
{VW} -d train-sets/topk.vw -f topk.model -q MF --passes 100 --cache_file topk-train.cache -k --holdout_off
    train-sets/ref/topk-train.stderr

# Test 31: predict top-2 recommendations with the topk recommender model
{VW} -P 1 -d train-sets/topk.vw -i topk.model --top 2 -p topk-rec.predict
    train-sets/ref/topk-rec.stderr
    train-sets/ref/topk-rec.predict

# Test 32: non-centered data-set where constant >> 0
#   To test the new --constant option without which performance is very weak
{VW} -k --passes 100 -c --holdout_off --constant 1000 -d train-sets/big-constant.dat
    train-sets/ref/big-constant.stderr

# Test 33: new option: --progress w/ integer arg
{VW} -k -d train-sets/0001.dat --progress 10
    train-sets/ref/progress-10.stderr

# Test 34: new-option: --progress w/ floating-point arg
#           + alternate short form (-P)
{VW} -k -d train-sets/0001.dat -P 0.5
    train-sets/ref/progress-0.5.stderr

# Test 35: --nn without --quiet to avoid nn regressions
#   (Needs to be a simple test, not one sensitive to symmetry breaking)
{VW} -k -d train-sets/0001.dat --nn 1
    train-sets/ref/nn-1-noquiet.stderr

# Test 36: cb with dr
{VW} -d train-sets/rcv1_raw_cb_small.vw --cb 2 --cb_type dr --ngram 2 --skips 4 -b 24 -l 0.25
    train-sets/ref/rcv1_raw_new_cb_dr.stderr

# Test 37: cb with ips
{VW} -d train-sets/rcv1_raw_cb_small.vw --cb 2 --cb_type ips --ngram 2 --skips 4 -b 24 -l 0.125
    train-sets/ref/rcv1_raw_new_cb_ips.stderr

# Test 38: cb with dm
{VW} -d train-sets/rcv1_raw_cb_small.vw --cb 2 --cb_type dm --ngram 2 --skips 4 -b 24 -l 0.125 -f cb_dm.reg
    train-sets/ref/rcv1_raw_new_cb_dm.stderr

# Test 39: --lda --passes 2 hang regression
{VW} -k -d train-sets/lda-2pass-hang.dat --lda 10 -c --passes 2 --holdout_off
    train-sets/ref/lda-2pass-hang.stderr

# Test 40: (holdout-broken regression)
# ensure we have no holdout loss of '0 h'
{VW} -k -c --passes 2 -d train-sets/0001.dat
    train-sets/ref/holdout-loss-not-zero.stderr

# Test 41: stagewise poly with exponent 0.25
#### In the following stage_poly tests there are minute differences in losses, which are not fuzzy-diffed;
#### thus stderr is silenced (--quiet) and only the (fuzzy-diffed) predictions are compared.
{VW} --stage_poly --sched_exponent 0.25 --batch_sz 1000 --batch_sz_no_doubling -d train-sets/rcv1_small.dat -p stage_poly.s025.predict --quiet
    train-sets/ref/stage_poly.s025.stderr
    train-sets/ref/stage_poly.s025.predict

# Test 42: stagewise poly with exponent 1.0
{VW} --stage_poly --sched_exponent 1.0 --batch_sz 1000 --batch_sz_no_doubling -d train-sets/rcv1_small.dat --quiet
    train-sets/ref/stage_poly.s100.stderr

# Test 43: stagewise poly with exponent 0.25 and doubling batches
{VW} --stage_poly --sched_exponent 0.25 --batch_sz 1000 -d train-sets/rcv1_small.dat -p stage_poly.s025.doubling.predict --quiet
    train-sets/ref/stage_poly.s025.doubling.stderr
    train-sets/ref/stage_poly.s025.doubling.predict

# Test 44: stagewise poly with exponent 1.0 and doubling batches
{VW} --stage_poly --sched_exponent 1.0 --batch_sz 1000 -d train-sets/rcv1_small.dat -p stage_poly.s100.doubling.predict --quiet
    train-sets/ref/stage_poly.s100.doubling.stderr
    train-sets/ref/stage_poly.s100.doubling.predict

# Test 45: library test, train the initial model
{VW} -c -k -d train-sets/library_train -f models/library_train.w -q st --passes 100 --hash all --noconstant --csoaa_ldf m --holdout_off
    train-sets/ref/library_train.stdout
    train-sets/ref/library_train.stderr

# Test 46: cb_adf, sharedfeatures
{VW}  --dsjson --chain_hash --cb_adf -d train-sets/no_shared_features.json
    train-sets/ref/no_shared_features.stderr

# Test 47: empty test, bad builds (without make clean)
# sometimes cause a SEGV even on empty input
echo "" | {VW}
    train-sets/ref/empty-set.stderr

# Test 48: daemon test
./daemon-test.sh --port 54249
    test-sets/ref/vw-daemon.stdout

# Test 49: SVM linear kernel
{VW} --ksvm --l2 1 --reprocess 5 -b 18 -p ksvm_train.linear.predict -d train-sets/rcv1_smaller.dat
    train-sets/ref/ksvm_train.linear.stderr
    train-sets/ref/ksvm_train.linear.predict

# Test 50: SVM polynomial kernel
{VW} --ksvm --l2 1 --reprocess 5 -b 18 --kernel poly -p ksvm_train.poly.predict -d train-sets/rcv1_smaller.dat
    train-sets/ref/ksvm_train.poly.stderr
    train-sets/ref/ksvm_train.poly.predict

# Test 51: SVM rbf kernel
{VW} --ksvm --l2 1 --reprocess 5 -b 18 --kernel rbf -p ksvm_train.rbf.predict -d train-sets/rcv1_smaller.dat
    train-sets/ref/ksvm_train.rbf.stderr
    train-sets/ref/ksvm_train.rbf.predict

# Test 52: classification with data from dictionaries
# (eg embeddings or gazetteers) -- note that this is impossible without
# dictionaries because --ignore w; also test to make sure gzipped dicts
# work and dictionary redundancy checking works
{VW} -k -c -d train-sets/dictionary_test.dat --binary --ignore w --holdout_off --passes 32 --dictionary w:dictionary_test.dict --dictionary w:dictionary_test.dict.gz --dictionary_path train-sets
    train-sets/ref/dictionary_test.stderr

# Test 53: autolink
{VW} -d train-sets/0002.dat --autolink 1 --examples 100 -p 0002.autolink.predict
    train-sets/ref/0002.autolink.stderr
    train-sets/ref/0002.autolink.predict

# Test 54: train FTRL-Proximal
{VW} -k -d train-sets/0001.dat -f models/0001_ftrl.model --passes 1 --ftrl --ftrl_alpha 0.01 --ftrl_beta 0 --l1 2
    train-sets/ref/0001_ftrl.stderr

# Test 55: test FTRL-Proximal
{VW} -k -t -d train-sets/0001.dat -i models/0001_ftrl.model -p 0001_ftrl.predict
    test-sets/ref/0001_ftrl.stderr
    pred-sets/ref/0001_ftrl.predict

# Test 56: Log_multi
{VW} --log_multi 10 -d train-sets/multiclass
    train-sets/ref/log_multi.stderr

# Test 57: cbify, epsilon-greedy
{VW} --cbify 10 --epsilon 0.05 -d train-sets/multiclass
    train-sets/ref/cbify_epsilon.stderr

# Test 58: cbify, tau first
{VW} --cbify 10 --first 5 -d train-sets/multiclass
    train-sets/ref/cbify_first.stderr

# Test 59: cbify, bag
{VW} --cbify 10 --bag 7 -d train-sets/multiclass
    train-sets/ref/cbify_bag.stderr

# Test 60: cbify, cover
{VW} --cbify 10 --cover 3 -d train-sets/multiclass --nounif
    train-sets/ref/cbify_cover.stderr

# Test 61: train FTRL-PiSTOL
{VW} -k -d train-sets/0001.dat -f models/ftrl_pistol.model --passes 1 --pistol
    train-sets/ref/ftrl_pistol.stderr

# Test 62: test FTRL-PiSTOL
{VW} -k -t -d train-sets/0001.dat -i models/ftrl_pistol.model -p ftrl_pistol.predict
    test-sets/ref/ftrl_pistol.stderr
    pred-sets/ref/ftrl_pistol.predict

# Test 63: check redefine functionality
{VW} -k -d train-sets/0080.dat --redefine := --redefine y:=: --redefine x:=arma --ignore x -q yy
    train-sets/ref/redefine.stderr

# Test 64: check cb_adf
{VW} --cb_adf -d train-sets/cb_test.ldf --noconstant
    train-sets/ref/cb_adf_mtr.stderr

# Test 65: check multilabel_oaa
{VW} --multilabel_oaa 10 -d train-sets/multilabel -p multilabel.predict
    train-sets/ref/multilabel.stderr
    pred-sets/ref/multilabel.predict

# Test 66: check --csoaa_rank on csoaa_ldf
{VW} --csoaa_ldf multiline --csoaa_rank -d train-sets/cs_test_multilabel.ldf -p multilabel_ldf.predict --noconstant
    train-sets/ref/multilabel_ldf.stderr
    pred-sets/ref/multilabel_ldf.predict

# Test 67: check --rank_all on cb_adf
{VW} --cb_adf --rank_all -d train-sets/cb_test.ldf -p cb_adf_rank.predict --noconstant
    train-sets/ref/cb_adf_rank.stderr
    pred-sets/ref/cb_adf_rank.predict

# Test 68: named labels at training time
{VW} --named_labels det,noun,verb --oaa 3 -d train-sets/test_named  -k -c --passes 10 --holdout_off -f models/test_named.model
    train-sets/ref/test_named_train.stderr

# Test 69: named labels at prediction
{VW} -i models/test_named.model -t -d train-sets/test_named -p test_named.predict
    train-sets/ref/test_named_test.stderr
    pred-sets/ref/test_named.predict

# Test 70: named labels at training time (csoaa)
{VW} --named_labels det,noun,verb --csoaa 3 -d train-sets/test_named_csoaa  -k -c --passes 10 --holdout_off -f models/test_named_csoaa.model
    train-sets/ref/test_named_csoaa_train.stderr

# Test 71: named labels at prediction (csoaa)
{VW} -i models/test_named_csoaa.model -t -d train-sets/test_named_csoaa -p test_named_csoaa.predict
    train-sets/ref/test_named_csoaa_test.stderr
    pred-sets/ref/test_named_csoaa.predict

# Test 72: check -q :: and --oaa inverse hash
printf '3 |f a b c |e x y z\n2 |f a y c |e x\n' | \
    {VW} --oaa 3 -q :: --invert_hash inv_hash.cmp && \
        tail -n +2 inv_hash.cmp > inv_hash.cmp.new && \
            rm inv_hash.cmp && \
                mv inv_hash.cmp.new inv_hash.cmp
    train-sets/ref/inv_hash.stderr
    pred-sets/ref/inv_hash.cmp

# Test 73:  check cb_adf with doubly robust option
{VW} --cb_adf --rank_all -d train-sets/cb_test.ldf -p cb_adf_dr.predict --cb_type dr
    train-sets/ref/cb_adf_dr.stderr
    pred-sets/ref/cb_adf_dr.predict

# Test 74: experience replay version of test 1
{VW} -k -l 20 --initial_t 128000 --power_t 1 -d train-sets/0001.dat \
    -c --passes 8 --invariant \
    --ngram 3 --skips 1 --holdout_off --replay_b 100
        train-sets/ref/0001-replay.stderr

# Test 75: named labels at training time (csoaa) with experience replay
{VW} --named_labels det,noun,verb --csoaa 3 \
    -d train-sets/test_named_csoaa -k -c --passes 10 --holdout_off \
    -f models/test_named_csoaa.model --replay_c 100
        train-sets/ref/test_named_csoaa_train-replay.stderr

# Test 76: backwards compatibility
printf '3 |f a b c |e x y z\n2 |f a y c |e x\n' | \
    {VW} -i simple_model --invert_hash inv_hash.cmp && \
        tail -n +2 inv_hash.cmp
   test-sets/ref/backwards.stderr
   test-sets/ref/backwards.stdout

# Test 77: train and save a model with --save_resume
{VW} -d train-sets/0001.dat -f models/0097.model --save_resume
        train-sets/ref/0097.stderr

# Test 78: checking predictions as well
{VW} --preserve_performance_counters -d train-sets/0001.dat -i models/0097.model -p 0098.predict
    test-sets/ref/0098.stderr
    pred-sets/ref/0098.predict

# Test 79: checking predictions with testing
{VW} -d train-sets/0001.dat -i models/0097.model -p 0099.predict
    test-sets/ref/0099.stderr
    pred-sets/ref/0099.predict

# Test 80: active cover
{VW} --loss_function logistic --binary --active_cover -d train-sets/rcv1_mini.dat -f models/active_cover.model
    train-sets/ref/active_cover.stderr

# Test 81: active cover (predict)
{VW} -i models/active_cover.model -t -d test-sets/rcv1_small_test.data -p active_cover.predict
    test-sets/ref/active_cover.stderr
    pred-sets/ref/active_cover.predict

# Test 82: active cover oracular
{VW} --loss_function logistic --binary --active_cover --oracular -d ./train-sets/rcv1_small.dat
    train-sets/ref/active_cover_oracular.stderr

# Test 83: check cb_adf with explicit --cb_type mtr
{VW} --cb_adf -d train-sets/cb_test.ldf --cb_type mtr --noconstant
    train-sets/ref/cb_adf_mtr.stderr

# Test 84: train FTRL-Proximal early stopping
{VW} -k -d train-sets/0001.dat -f models/0001_ftrl.model --passes 10 --ftrl --ftrl_alpha 3.0 --ftrl_beta 0 --l1 0.9 --cache
    train-sets/ref/0001_ftrl_holdout.stderr

# Test 85: test FTRL-Proximal early stopping prediction
{VW} -k -t -d train-sets/0001.dat -i models/0001_ftrl.model -p 0001_ftrl_holdout.predict
    test-sets/ref/0001_ftrl_holdout_106.stderr
    pred-sets/ref/0001_ftrl_holdout.predict

# Test 86: train FTRL-Proximal no early stopping
{VW} -k -d train-sets/0001.dat -f models/0001_ftrl.model --passes 10 --ftrl --ftrl_alpha 0.01 --ftrl_beta 0 --l1 2 --cache --holdout_off
    train-sets/ref/0001_ftrl_holdout_off.stderr

# Test 87: test FTRL-Proximal no early stopping
{VW} -k -t -d train-sets/0001.dat -i models/0001_ftrl.model -p 0001_ftrl_holdout_off.predict --holdout_off
    test-sets/ref/0001_ftrl_holdout_off.stderr
    pred-sets/ref/0001_ftrl_holdout_off.predict

# Test 88: --probabilities --oaa
{VW} -d train-sets/probabilities.dat --probabilities --oaa=4 --loss_function=logistic -p oaa_probabilities.predict
   train-sets/ref/oaa_probabilities.stderr
   pred-sets/ref/oaa_probabilities.predict

# Test 89: --probabilities --csoaa_ldf=mc
{VW} -d train-sets/cs_test.ldf --probabilities --csoaa_ldf=mc --loss_function=logistic -p csoaa_ldf_probabilities.predict
   train-sets/ref/csoaa_ldf_probabilities.stderr
   pred-sets/ref/csoaa_ldf_probabilities.predict

# Test 90: Predictions with confidences
{VW} --confidence -d ./train-sets/rcv1_micro.dat --initial_t 0.1 -p confidence.preds
    train-sets/ref/confidence.stderr
    pred-sets/ref/confidence.preds

# Test 91: Over size example test
{VW} -d train-sets/x.txt
    train-sets/ref/oversize.stderr

# Test 92: Long Line test
{VW} -d train-sets/long_line -c -k
    train-sets/ref/long_line.stderr

# Test 93: MWT test
{VW} -d train-sets/cb_eval --multiworld_test f -p cb_eval.preds
    train-sets/ref/cb_eval.stderr
    pred-sets/ref/cb_eval.preds

# Test 94: Audit regressor of ftrl model (from test #86)
{VW} -d train-sets/0001.dat -i models/0001_ftrl.model  --audit_regressor ftrl.audit_regr
    train-sets/ref/ftrl_audit_regr.stderr
    train-sets/ref/ftrl.audit_regr

# Test 95: Audit regressor of csoaa model (from test #75, which overwrites the model saved by test #70)
{VW} -d train-sets/test_named_csoaa -i models/test_named_csoaa.model --audit_regressor csoaa.audit_regr
    train-sets/ref/csoaa_audit_regr.stderr
    train-sets/ref/csoaa.audit_regr

# Test 96: MWT learn test
{VW} -d train-sets/cb_eval --multiworld_test f --learn 2 -p mwt_learn.preds
    train-sets/ref/mwt_learn.stderr
    pred-sets/ref/mwt_learn.preds

# Test 97: MWT learn exclude test
{VW} -d train-sets/cb_eval --multiworld_test f --learn 2 --exclude_eval -p mwt_learn_exclude.preds
    train-sets/ref/mwt_learn_exclude.stderr
    pred-sets/ref/mwt_learn_exclude.preds

# Test 98: cb_explore
{VW} -d train-sets/rcv1_raw_cb_small.vw --cb_explore 2 --ngram 2 --skips 4 -b 24 -l 0.25 -p rcv1_raw_cb_explore.preds
    train-sets/ref/rcv1_raw_cb_explore.stderr
    pred-sets/ref/rcv1_raw_cb_explore.preds

# Test 99: Predictions with confidences after training
{VW} --confidence --confidence_after_training --initial_t 0.1 -d ./train-sets/rcv1_small.dat -p confidence_after_training.preds
    train-sets/ref/confidence_after_training.stderr
    pred-sets/ref/confidence_after_training.preds

# Test 100: cb_eval save/load #1
{VW} -d train-sets/cb_eval1 --multiworld_test f -f mwt.model -p cb_eval1.preds
    train-sets/ref/cb_eval1.stderr
    pred-sets/ref/cb_eval1.preds

# Test 101: cb_eval save/load #2
{VW} -d train-sets/cb_eval2 -i mwt.model -p cb_eval2.preds
    train-sets/ref/cb_eval2.stderr
    pred-sets/ref/cb_eval2.preds

# Test 102: recall tree hello world
{VW} --quiet -d train-sets/gauss1k.dat.gz -f models/recall_tree_g100.model --recall_tree 100 -b 20 --loss_function logistic

# Test 103: recall_tree hello world predict-from-saved-model
{VW} -t -d train-sets/gauss1k.dat.gz -i models/recall_tree_g100.model
    train-sets/ref/recall_tree_gauss1k.stderr
    train-sets/ref/recall_tree_gauss1k.stdout

# Test 104: cb_explore_adf with epsilon-greedy exploration
{VW} --cb_explore_adf --epsilon 0.1 -d train-sets/cb_test.ldf --noconstant -p cbe_adf_epsilon.predict
    train-sets/ref/cbe_adf_epsilon.stderr
    pred-sets/ref/cbe_adf_epsilon.predict

# Test 105: cb_explore_adf with softmax exploration
{VW} --cb_explore_adf --softmax --lambda 1 -d train-sets/cb_test.ldf --noconstant -p cbe_adf_softmax.predict
    train-sets/ref/cbe_adf_softmax.stderr
    pred-sets/ref/cbe_adf_softmax.predict

# Test 106: cb_explore_adf with bagging exploration
{VW} --cb_explore_adf --bag 3 -d train-sets/cb_test.ldf --noconstant -p cbe_adf_bag.predict
    train-sets/ref/cbe_adf_bag.stderr
    pred-sets/ref/cbe_adf_bag.predict

# Test 107: cb_explore_adf with explore-first exploration
{VW} --cb_explore_adf --first 2 -d train-sets/cb_test.ldf --noconstant -p cbe_adf_first.predict
    train-sets/ref/cbe_adf_first.stderr
    pred-sets/ref/cbe_adf_first.predict

# Test 108: train a poisson model
{VW} --quiet -d train-sets/poisson.dat -f models/poisson.model --loss_function poisson --link poisson -b 2 -p poisson.train.predict
    train-sets/ref/poisson.train.stderr
    pred-sets/ref/poisson.train.predict

# Test 109: train a poisson model without invariant updates
{VW} --quiet -d train-sets/poisson.dat -f models/poisson.normalized.model --normalized --loss_function poisson --link poisson -b 2 -l 0.1 -p poisson.train.normalized.predict
    train-sets/ref/poisson.train.normalized.stderr
    pred-sets/ref/poisson.train.normalized.predict

# Test 110: second order online learning
{VW} --OjaNewton -d train-sets/0001.dat -f models/second_order.model -p second_order.predict
    train-sets/ref/second_order.stderr
    pred-sets/ref/second_order.predict

# Test 111: cb explore adf
{VW} -d train-sets/cb_adf_crash_1.data -f models/cb_adf_crash.model --cb_explore_adf --epsilon 0.05
    train-sets/ref/cb_adf_crash1.stderr

# Test 112: cb explore adf predict
{VW} -d train-sets/cb_adf_crash_2.data -i models/cb_adf_crash.model -t
    train-sets/ref/cb_adf_crash2.stderr

# Test 113: Fix for regression introduced by badeedb.
# Ensure audit output continues to work correctly in the presence of anon features.
# Github issue 1038 (https://github.com/JohnLangford/vowpal_wabbit/issues/1038)
{VW} --audit -d train-sets/audit.dat --noconstant
    train-sets/ref/audit.stderr
    train-sets/ref/audit.stdout

# Test 114: cb_explore_adf with cover exploration
{VW} --cb_explore_adf --cover 3 -d train-sets/cb_test.ldf --noconstant -p cbe_adf_cover.predict
    train-sets/ref/cbe_adf_cover.stderr
    pred-sets/ref/cbe_adf_cover.predict

# Test 115: cb_explore_adf with cover exploration + double robust
{VW} --cb_explore_adf --cover 3 --cb_type dr -d train-sets/cb_test.ldf --noconstant -p cbe_adf_cover_dr.predict
    train-sets/ref/cbe_adf_cover_dr.stderr
    pred-sets/ref/cbe_adf_cover_dr.predict

# Test 116: marginal features
{VW} --marginal f  -d train-sets/marginal_features --noconstant --initial_numerator 0.5 --initial_denominator 1.0 --decay 0.001 --holdout_off -c -k --passes 100 -f marginal_model
    train-sets/ref/marginal.stderr

# Test 117: marginal features test
{VW} -i marginal_model  -d train-sets/marginal_features --noconstant -t
    train-sets/ref/marginal_test.stderr

# Test 118: Evaluate exploration on contextual bandit data
{VW} --explore_eval --epsilon 0.2 -d train-sets/cb_test.ldf --noconstant -p explore_eval.predict
    train-sets/ref/explore_eval.stderr
    pred-sets/ref/explore_eval.predict

# Test 119: Test 1 using JSON
{VW} -k -l 20 --initial_t 128000 --power_t 1 -d train-sets/0001.json --json --chain_hash \
    -c --passes 8 --invariant \
    --ngram 3 --skips 1 --holdout_off
        train-sets/ref/0001.json.stderr

# Test 120: cb_explore_adf with cover exploration + double robust (JSON input)
{VW} --cb_explore_adf --cover 3 --cb_type dr -d train-sets/cb_test.json --json --chain_hash --noconstant -p cbe_adf_cover_dr.predict
    train-sets/ref/cbe_adf_cover_dr.json.stderr
    pred-sets/ref/cbe_adf_cover_dr.predict

# Test 121: mix labeled and unlabeled examples with --bootstrap bug:
# https://github.com/JohnLangford/vowpal_wabbit/issues/1111
{VW} --bootstrap 2 -d train-sets/labeled-unlabeled-mix.dat
    train-sets/ref/labeled-unlabeled-mix.stderr

# Test 122: cb_explore_adf with cover exploration + double robust (using more than 256 examples)
{VW} --cb_explore_adf --cover 3 --cb_type dr -d train-sets/cb_test256.json --json --chain_hash --noconstant -p cbe_adf_cover_dr256.predict
    train-sets/ref/cbe_adf_cover_dr256.json.stderr
    pred-sets/ref/cbe_adf_cover_dr256.predict

# Test 123: --scores --oaa
{VW} -d train-sets/probabilities.dat --scores --oaa=4 -p oaa_scores.predict
   train-sets/ref/oaa_scores.stderr
   pred-sets/ref/oaa_scores.predict

# Test 124:  check cb_adf with direct method option
{VW} --cb_adf -d train-sets/cb_test.ldf -p cb_adf_dm.predict --cb_type dm
    train-sets/ref/cb_adf_dm.stderr
    pred-sets/ref/cb_adf_dm.predict

# Test 125: initial_weight option is used
echo "1 | feature:1" | {VW} -a --initial_weight 0.1 --initial_t 0.3
    train-sets/ref/initial_weight.stderr
    train-sets/ref/initial_weight.stdout

# Test 126: test --sparse_weights with the cb_adf direct-method command from test #124
{VW} --cb_adf -d train-sets/cb_test.ldf -p cb_adf_dm.predict --cb_type dm --sparse_weights
    train-sets/ref/sparse.stderr

# Test 127: daemon on the foreground test
./daemon-test.sh --foreground --port 54250
    test-sets/ref/vw-daemon.stdout

# Test 128: marginal features with --compete
{VW} --marginal f  -d train-sets/marginal_features --noconstant --initial_numerator 0.5 --initial_denominator 1.0 --decay 0.001 --holdout_off -c -k --passes 100  --compete
    train-sets/ref/marginal_compete.stderr

# Test 129: ignore linear
{VW} -k --cache_file ignore_linear.cache --passes 10000 --holdout_off -d train-sets/0154.dat --noconstant --ignore_linear x -q xx
    train-sets/ref/ignore_linear.stderr

# Test 130: checking audit_regressor with --save_resume model
{VW} -d train-sets/0001.dat -i models/0097.model --save_resume --audit_regressor 0097.audit_regr
    train-sets/ref/0097.audit_regr.stderr
    train-sets/ref/0097.audit_regr

# Test 131: --cubic regression verification
./cubic-test.sh {VW}

# Test 132: save_resume without --preserve_performance_counters does not alter performance counters over multiple passes
{VW} -d train-sets/0001.dat -f models/sr.model  --passes 2 -c -k  -P 50 --save_resume
    train-sets/ref/157.stderr

# Test 133: test decision service json parsing
{VW} -d train-sets/decisionservice.json --dsjson --cb_explore_adf --epsilon 0.2 --quadratic GT -P 1 -p cbe_adf_dsjson.predict
    train-sets/ref/cbe_adf_dsjson.stderr
    pred-sets/ref/cbe_adf_dsjson.predict

# Test 134: test --bootstrap & --binary interaction
{VW} -d train-sets/rcv1_mini.dat --bootstrap 5 --binary -c -k --passes 2
    train-sets/ref/bootstrap_and_binary.stderr

# Test 135: test --bootstrap & --oaa interaction
# (Also adds -q :: and -P1 to get & verify perfect predictions in 2nd pass)
{VW} -d train-sets/multiclass --bootstrap 4 --oaa 10 -q :: --leave_duplicate_interactions  -c -k --passes 2 --holdout_off -P1
    train-sets/ref/bootstrap_and_oaa.stderr

# Test 136: --classweight
{VW} -d train-sets/0001.dat --classweight 1:2,0:3.1,-1:5
    train-sets/ref/classweight.stderr

# Test 137: --classweight with multiclass
{VW} --oaa 10 -d train-sets/multiclass --classweight 4:0,7:0.1,2:10 --classweight 10:3
    train-sets/ref/classweight_multiclass.stderr

# Test 138: --classweight with recall_tree
{VW} --recall_tree 10 -d train-sets/multiclass --classweight 4:0,7:0.1 --classweight 2:10,10:3
    train-sets/ref/classweight_recall_tree.stderr

# Test 139: cs_active low mellowness
{VW} --cs_active 3 -d train-sets/cs_test --cost_max 2 --mellowness 0.01 --simulation --adax
    train-sets/ref/cs_active_0.01.stderr

# Test 140: cs_active high mellowness
{VW} --cs_active 3 -d train-sets/cs_test --cost_max 2 --mellowness 1.0 --simulation --adax
    train-sets/ref/cs_active_1.0.stderr

# Test 141: hash_seed train
{VW} --hash_seed 5 -d train-sets/rcv1_mini.dat --holdout_off --passes 2 -f hash_seed5.model -c -k --ngram 2 -q ::
    train-sets/ref/hash_seed_train.stderr

# Test 142: hash_seed test
{VW} -d train-sets/rcv1_mini.dat -i hash_seed5.model -t
    train-sets/ref/hash_seed_test.stderr

# Test 143: test cb with dm
{VW} -d train-sets/rcv1_raw_cb_small.vw -t -i cb_dm.reg
    train-sets/ref/rcv1_raw_cb_dm_test.stderr

# Test 144: test cbify large
{VW} -d train-sets/rcv1_multiclass.dat --cbify 2 --epsilon 0.05
    train-sets/ref/rcv1_multiclass.stderr

# Test 145: cbify adf, epsilon-greedy
{VW} --cbify 10 --cb_explore_adf --epsilon 0.05 -d train-sets/multiclass
    train-sets/ref/cbify_epsilon_adf.stderr

# Test 146: cbify cs, epsilon-greedy
{VW} --cbify 3 --cbify_cs --epsilon 0.05 -d train-sets/cs_cb
    train-sets/ref/cbify_epsilon_cs.stderr

# Test 147: cbify adf cs, epsilon-greedy
{VW} --cbify 3 --cbify_cs --cb_explore_adf --epsilon 0.05 -d train-sets/cs_cb
    train-sets/ref/cbify_epsilon_cs_adf.stderr

# Test 148: cbify adf, regcb
{VW} --cbify 10 --cb_explore_adf --cb_type mtr --regcb --mellowness 0.01 -d train-sets/multiclass
    train-sets/ref/cbify_regcb.stderr

# Test 149: cbify adf, regcbopt
{VW} --cbify 10 --cb_explore_adf --cb_type mtr --regcbopt --mellowness 0.01 -d train-sets/multiclass
    train-sets/ref/cbify_regcbopt.stderr

# Test 150: cbify ldf, regcbopt
{VW} -d train-sets/cs_test.ldf --cbify_ldf --cb_type mtr --regcbopt --mellowness 0.01
    train-sets/ref/cbify_ldf_regcbopt.stderr

# Test 151: same model on cluster mode
./same-model-test.sh

# Test 152: check --audit output is reproducible
printf '3 |f a b c |e x y z\n2 |f a y c |e x\n' | {VW} --oaa 3 -q ef --audit
    train-sets/ref/audit2.stdout

# Test 153: cb_adf, sharedfeatures
{VW}  --dsjson --chain_hash --cb_adf -d train-sets/no_shared_features.json
    train-sets/ref/no_shared_features.stderr

# Test 154: warm_cb warm start
{VW} --warm_cb 10 --cb_explore_adf --cb_type mtr --epsilon 0.05 --warm_start 3 --interaction 7 --choices_lambda 8 --warm_start_update --interaction_update -d train-sets/multiclass
    train-sets/ref/warm_cb.stderr

# Test 155: warm_cb warm start with lambda set containing 0/1
{VW} --warm_cb 10 --cb_explore_adf --cb_type mtr --epsilon 0.05 --warm_start 3 --interaction 7 --choices_lambda 8 --lambda_scheme 2 --warm_start_update --interaction_update -d train-sets/multiclass
    train-sets/ref/warm_cb_lambda_zeroone.stderr

# Test 156: warm_cb warm start with warm start update turned off
{VW} --warm_cb 10 --cb_explore_adf --cb_type mtr --epsilon 0.05 --warm_start 3 --interaction 7 --choices_lambda 8 --interaction_update -d train-sets/multiclass
    train-sets/ref/warm_cb_no_ws_upd.stderr

# Test 157: warm_cb warm start with interaction update turned off
{VW} --warm_cb 10 --cb_explore_adf --cb_type mtr --epsilon 0.0 --warm_start 3 --interaction 7 --choices_lambda 8 --warm_start_update -d train-sets/multiclass
    train-sets/ref/warm_cb_no_int_upd.stderr

# Test 158: warm_cb warm start with bandit warm start type (Sim-Bandit)
{VW} --warm_cb 10 --cb_explore_adf --cb_type mtr --epsilon 0.05 --warm_start 3 --interaction 7 --choices_lambda 1 --warm_start_update --interaction_update --sim_bandit -d train-sets/multiclass
    train-sets/ref/warm_cb_simbandit.stderr

# Test 159: warm_cb warm start with CYC supervised corruption
{VW} --warm_cb 10 --cb_explore_adf --cb_type mtr --epsilon 0.05 --warm_start 3 --interaction 7 --choices_lambda 8 --warm_start_update --interaction_update --corrupt_type_warm_start 2 --corrupt_prob_warm_start 0.5 -d train-sets/multiclass
    train-sets/ref/warm_cb_cyc.stderr

# Test 160: warm_cb warm start with input cost-sensitive examples
{VW} --warm_cb 3 --cb_explore_adf --cb_type mtr --epsilon 0.05 --warm_start 1 --interaction 2 --choices_lambda 8 --warm_start_update --interaction_update --warm_cb_cs -d train-sets/cs_cb
    train-sets/ref/warm_cb_cs.stderr

# Test 161: test counting examples with holdout_after option
{VW} -k -P 100 --holdout_after 500 -d train-sets/0002.dat
    train-sets/ref/holdout_after.stderr

# Test 162: test counting examples with holdout_after option with 2 passes on the training set
{VW} -k -P 100 --holdout_after 500 -d train-sets/0002.dat -c --passes 2
    train-sets/ref/holdout_after_2passes.stderr

# Test 163: test cb_adf with softmax
{VW} --cb_adf --rank_all -d train-sets/cb_adf_sm.data -p cb_adf_sm.predict --cb_type sm
    train-sets/ref/cb_adf_sm.stderr
    pred-sets/ref/cb_adf_sm.predict

# Test 164: test dsjson parser correctly processes checkpoint and dangling observation lines
{VW} -d train-sets/b1848_dsjson_parser_regression.txt --dsjson --chain_hash --cb_explore_adf -P 1
    train-sets/ref/b1848_dsjson_parser_regression.stderr

# Test 165: one-against-all with subsampling
{VW} -k --oaa 10 --oaa_subsample 5 -c --passes 10 -d train-sets/multiclass --holdout_off
    train-sets/ref/oaa_subsample.stderr

# Test 166: train coin betting
{VW} -k -d train-sets/0001.dat -f models/ftrl_coin.model --passes 1 --coin
    train-sets/ref/ftrl_coin.stderr

# Test 167: test coin betting
{VW} -k -t -d train-sets/0001.dat -i models/ftrl_coin.model -p ftrl_coin.predict
    test-sets/ref/ftrl_coin.stderr
    pred-sets/ref/ftrl_coin.predict

# Test 168: malformed examples, onethread, strict_parse failure
./negative-test.sh {VW} -d train-sets/malformed.dat --onethread --strict_parse
    train-sets/ref/malformed-onethread-strict_parse.stderr

# Test 169: malformed examples, strict_parse failure
./negative-test.sh {VW} -d train-sets/malformed.dat --strict_parse
    train-sets/ref/malformed-strict_parse.stderr

# Test 170: malformed examples success
{VW} -d train-sets/malformed.dat --onethread
    train-sets/ref/malformed.stderr

# Test 171: online contextual memory tree
{VW} -d train-sets/rcv1_smaller.dat --memory_tree 10 --learn_at_leaf --max_number_of_labels 2 --dream_at_update 0 --dream_repeats 3 --online --leaf_example_multiplier 10 --alpha 0.1 -l 0.001 -b 15 --passes 1 --loss_function squared --holdout_off
    train-sets/ref/cmt_rcv1_smaller_online.stderr

# Test 172: offline contextual memory tree
{VW} -d train-sets/rcv1_smaller.dat -k --memory_tree 10 --learn_at_leaf --max_number_of_labels 2 --dream_at_update 0 --dream_repeats 3 --leaf_example_multiplier 10 --alpha 0.1 -l 0.001 -b 15 -c --passes 2 --loss_function squared --holdout_off
    train-sets/ref/cmt_rcv1_smaller_offline.stderr

# Test 173: test cb_sample
{VW} --cb_sample --cb_explore_adf -d test-sets/cb_sample_seed.data -p cb_sample_seed.predict --random_seed 1234
    pred-sets/ref/cb_sample_seed.predict

# Test 174: CCB train then test
{VW} -d train-sets/ccb_test.dat --ccb_explore_adf -p ccb_test.predict
    train-sets/ref/ccb_test.stderr
    train-sets/ref/ccb_test.predict

# Test 175: cb_explore_adf with huge lambda softmax exploration
{VW} --cb_explore_adf --softmax --lambda 100000 -d train-sets/cb_test.ldf --noconstant -p cbe_adf_softmax_biglambda.predict
    train-sets/ref/cbe_adf_softmax_biglambda.stderr
    pred-sets/ref/cbe_adf_softmax_biglambda.predict

# Test 176: Test memory corruption issue in ccb_explore_adf where mtr was leaving a prediction behind
{VW} --ccb_explore_adf --ring_size 7 -d train-sets/ccb_reuse_small.data
    train-sets/ref/ccb_reuse_small.stderr

# Test 177: Test memory corruption issue in ccb_explore_adf where mtr was leaving a prediction behind
{VW} --ccb_explore_adf --ring_size 20 --dsjson --chain_hash -d train-sets/ccb_reuse_medium.dsjson
    train-sets/ref/ccb_reuse_medium.stderr

# Test 178: Basic test of cluster. Can't use the VW replacer as it will think this is a VW command and append things like --onethread
python3 ./cluster_test.py --vw ../build/vowpalwabbit/vw --spanning_tree ../build/cluster/spanning_tree \
    --test_file test-sets/0001.dat --data_files train-sets/0001.dat train-sets/0002.dat \
    --prediction_file cluster.predict
        test-sets/ref/cluster.stderr
        test-sets/ref/cluster.stdout
        pred-sets/ref/cluster.predict

# Test 179: Test if options that are negative numbers are handled correctly
{VW} --classweight -1:0.5 --no_stdin
    test-sets/ref/negative-num-option.stderr

# Test 180: test cb_dro with softmax
{VW} --cb_dro --cb_adf --rank_all -d train-sets/cb_adf_sm.data -p cb_dro_adf_sm.predict --cb_type sm
    train-sets/ref/cb_dro_adf_sm.stderr
    pred-sets/ref/cb_dro_adf_sm.predict

# Test 181: Tests segfault that used to happen when audit, cache and interactions were combined.
{VW} -c -k --passes 2 -d train-sets/cache_interaction_audit.txt -q st --audit
    train-sets/ref/cache_interaction_audit.stdout
    train-sets/ref/cache_interaction_audit.stderr

# Test 182: Enable chain hash option for json example
{VW} --audit --json --chain_hash -d train-sets/chain_hash_json_test.json --invert_hash chain_hash_json_result.cmp --chain_hash && \
    tail -n +2 chain_hash_json_result.cmp > chain_hash_json_result.cmp.new && \
        rm chain_hash_json_result.cmp && \
            mv chain_hash_json_result.cmp.new chain_hash_json_result.cmp
    test-sets/ref/chain_hash_json_test.stderr
    test-sets/ref/chain_hash_json_test.stdout
    test-sets/ref/chain_hash_json_result.cmp

# Test 183: Enable chain hash option for text example
{VW} --audit -d train-sets/chain_hash_text_test.dat --invert_hash chain_hash_text_result.cmp --chain_hash && \
    tail -n +2 chain_hash_text_result.cmp > chain_hash_text_result.cmp.new && \
        rm chain_hash_text_result.cmp && \
            mv chain_hash_text_result.cmp.new chain_hash_text_result.cmp
    test-sets/ref/chain_hash_text_result.stderr
    test-sets/ref/chain_hash_text_result.stdout
    test-sets/ref/chain_hash_text_result.cmp

# Test 184: Test override epsilon value saved in a model
{VW} -i model-sets/epsilon.model -d train-sets/override_epsilon.txt --epsilon 0.3 -p override_epsilon.preds
    pred-sets/ref/override_epsilon.stderr
    pred-sets/ref/override_epsilon.preds

# Test 185: Ensure that all weights that exist in the model are present in the invert_hash output, even if audit did not see them.
# SkipC# - Do not remove this - this test breaks test generation by creating an infinite sized list containing this test case (many times)
{VW} -d train-sets/inv_hash_load_model_data1.txt -f inv_hash_load_model.vw --noconstant \
    && {VW} -d train-sets/inv_hash_load_model_data2.txt -i inv_hash_load_model.vw --noconstant --readable_model inv_hash_load_model.readable.txt --invert_hash inv_hash_load_model.invert.txt
    train-sets/ref/inv_hash_load_model.invert.txt
    train-sets/ref/inv_hash_load_model.readable.txt

# Test 186: cb_explore_adf with rnd exploration
{VW} --cb_explore_adf --rnd 1 -d train-sets/cb_test.ldf --noconstant -p cbe_adf_rnd.predict
    train-sets/ref/cbe_adf_rnd.stderr
    pred-sets/ref/cbe_adf_rnd.predict

# Test 187: Slates sanity check
{VW} --slates -d train-sets/slates_simple.txt -p slates_simple.predict
    train-sets/ref/slates_simple.stderr
    pred-sets/ref/slates_simple.predict

# Test 188: offset_tree, 2 actions
{VW} --ot 2 -k -d train-sets/offset_tree_000.dat -p offset_tree_000.pred -P 1
    test-sets/ref/offset_tree_000.stderr
    pred-sets/ref/offset_tree_000.pred

# Test 189: offset_tree, 3 actions
{VW} --ot 3 -k -d train-sets/offset_tree_001.dat -p offset_tree_001.pred -P 1
    test-sets/ref/offset_tree_001.stderr
    pred-sets/ref/offset_tree_001.pred

# Test 190: offset_tree, 4 actions
{VW} --ot 4 -k -d train-sets/offset_tree_002.dat -p offset_tree_002.pred -P 1
    test-sets/ref/offset_tree_002.stderr
    pred-sets/ref/offset_tree_002.pred

# Test 191: Regression test for crash on unlabelled data
{VW} --dsjson --chain_hash --slates -d train-sets/slates_simple_unlabeled.dsjson
    train-sets/ref/slates_simple_unlabeled.stderr

# Test 192: check plt training
{VW} -d train-sets/multilabel -f plt.model --plt 10 --sgd
    train-sets/ref/plt_multilabel.stderr

# Test 193: check default plt prediction
{VW} -t -d train-sets/multilabel -i plt.model -p plt_multilabel.predict
    train-sets/ref/plt_multilabel_predict.stderr
    pred-sets/ref/plt_multilabel.predict

# Test 194: check plt top-1 prediction
{VW} -t -d train-sets/multilabel -i plt.model -p plt_top1_multilabel.predict --top_k 1
    train-sets/ref/plt_top1_multilabel_predict.stderr
    pred-sets/ref/plt_top1_multilabel.predict

# Test 195: daemon test with json
./daemon-test.sh --json --port 54251
    test-sets/ref/vw-daemon.stdout

# Test 196: cbify adf, squarecb
{VW} --cbify 10 --cb_explore_adf --cb_type mtr --squarecb --gamma_scale 500 -d train-sets/multiclass
    train-sets/ref/cbify_squarecb.stderr

# Test 197: cbify adf, squarecb-elim
{VW} --cbify 10 --cb_explore_adf --cb_type mtr --squarecb --elim --gamma_scale 10 --mellowness 0.001 -d train-sets/multiclass
    train-sets/ref/cbify_squarecb_elim.stderr

# Test 198: cbify ldf, squarecb
{VW} -d train-sets/cs_test.ldf --cbify_ldf --cb_type mtr --squarecb --gamma_scale 500
    train-sets/ref/cbify_ldf_squarecb.stderr

# Test 199: cbify ldf, squarecb-elim
{VW} -d train-sets/cs_test.ldf --cbify_ldf --cb_type mtr --squarecb --elim --gamma_scale 10 --mellowness 0.001
    train-sets/ref/cbify_ldf_squarecb_elim.stderr

# Test 200: cbify regression dataset.  Use it with cats.
{VW} --cbify 4 --cbify_reg --min_value=185 --max_value=23959 --bandwidth 3000 -d train-sets/regression/cbify-reg.dat --passes 1 -b 18 --coin --loss_option 1
    train-sets/ref/cbify-reg-cats.stderr

# Test 201: cats train
{VW} --cats 4 --min_value=185 --max_value=23959 --bandwidth 3000 -d train-sets/cats.acpx --passes 1 -b 18 --coin --loss_option 1 -f cats.model
    train-sets/ref/cats-train.stderr

# Test 202: cats predict
{VW} -d train-sets/cats.acpx -i cats.model -p cats.predict
    train-sets/ref/cats-predict.stderr
    pred-sets/ref/cats.predict

# Test 203: cats-pdf train
{VW} --cats_pdf 4 --min_value=185 --max_value=23959 --bandwidth 2000 -d train-sets/cats.acpx --passes 1 -b 18 --coin --loss_option 1 -f cats-pdf.model
    train-sets/ref/cats-pdf-train.stderr

# Test 204: cats-pdf predict
{VW} -d train-sets/cats.acpx -i cats-pdf.model -p cats-pdf.predict
    train-sets/ref/cats-pdf-predict.stderr
    pred-sets/ref/cats-pdf.predict

# Test 205: cbify-reg
{VW} --cbify 2048 --cbify_reg --min_value=185 --max_value=23959 --bandwidth 10000 -d train-sets/regression/cbify-reg.dat --coin --loss_option 1
    train-sets/ref/cbify_reg.stderr

# Test 206: cbify-reg cb_discrete
{VW} --cbify 2048 --cbify_reg --cb_discrete --min_value=185 --max_value=23959 -d train-sets/regression/cbify-reg.dat --coin --loss_option 1
    train-sets/ref/cbify_reg_discrete.stderr

# Test 207: cbify-reg discrete cats_tree
{VW} --cbify 2048 --cbify_reg --cb_discrete --cats_tree 2048 --min_value=185 --max_value=23959 -d train-sets/regression/cbify-reg.dat --coin --loss_option 1
    train-sets/ref/cbify_reg_discrete_cats.stderr

# Test 208: CCB first slot loss
{VW} -d train-sets/ccb_losses.txt --ccb_explore_adf --epsilon 0 --cb_type ips
    train-sets/ref/ccb_1slot_loss.stderr

# Test 209: CCB all slots loss
{VW} -d train-sets/ccb_losses.txt --ccb_explore_adf --epsilon 0 --cb_type ips --all_slots_loss
    train-sets/ref/ccb_allslots_loss.stderr

# Test 210: big feature poison test 1
{VW} -d train-sets/big_feature_poison.dat --interactions aaaaa --noconstant
    train-sets/ref/big_feature_poison.stderr
    train-sets/ref/big_feature_poison.stdout

# Test 211: big feature poison test 2
{VW} -d train-sets/big_feature_poison.dat --interactions aaaaa --noconstant --power_t 0
    train-sets/ref/big_feature_poison_2.stderr
    train-sets/ref/big_feature_poison_2.stdout

# Test 212: test decision service json parsing including chain hashing
{VW} -d train-sets/decisionservice.json --dsjson --chain_hash --cb_explore_adf --epsilon 0.2 --quadratic GT -P 1 -p cbe_adf_dsjson_chain_hash.predict
    train-sets/ref/cbe_adf_dsjson_chain_hash.stderr
    pred-sets/ref/cbe_adf_dsjson_chain_hash.predict

# Test 213: same as test 142 but with empty shared features
{VW} --explore_eval --epsilon 0.2 -d train-sets/cb_test_with_empty_shared_feature.ldf --noconstant -p explore_eval.predict
    train-sets/ref/explore_eval_with_empty_shared_feature.stderr
    pred-sets/ref/explore_eval.predict

# Test 214: Flatbuffer Simple Label Test
{VW} -k -l 20 --initial_t 128000 --power_t 1 -d train-sets/0001.fb \
    -f models/0001_1.model --invariant --flatbuffer\
    --ngram 3 --skips 1 --holdout_off
        train-sets/ref/0001_fb.stderr

# Test 215: Flatbuffer CB Label Test
{VW} --cb_force_legacy --cb 2 -d train-sets/rcv1_raw_cb_small.fb \
     --flatbuffer
        train-sets/ref/rcv1_raw_cb_fb.stderr

# Test 216: Flatbuffer Multilabel Test
{VW} --multilabel_oaa 10 -d train-sets/multilabel.fb \
     --flatbuffer
        train-sets/ref/multilabel_fb.stderr

# Test 217: Flatbuffer Multiclass Test
{VW} -d train-sets/multiclass.fb -k --ect 10 \
     --flatbuffer
         train-sets/ref/multiclass_fb.stderr

# Test 218: Flatbuffer CS Test
{VW} -k -d train-sets/cs.fb --invariant \
     --csoaa_ldf multiline --flatbuffer
         train-sets/ref/cs_fb.stderr

# Test 219: Flatbuffer CB_eval test
{VW} -d train-sets/rcv1_cb_eval.fb --cb 2 --eval \
     --flatbuffer
         train-sets/ref/rcv1_cb_eval_fb.stderr

# Test 220: Flatbuffer no label Test (LDA)
{VW} -k --lda 100 --lda_alpha 0.01 --lda_rho 0.01 --lda_D 1000 -l 1 -b 13 --minibatch 128 -d train-sets/wiki256_no_label.fb --flatbuffer
         train-sets/ref/no_label_fb.stderr

# Test 221: Flatbuffer CCB Label Test
{VW} --ccb_explore_adf -d train-sets/ccb.fb \
     --flatbuffer
         train-sets/ref/ccb_fb.stderr

# Test 222: cb_explore with cover epsilon decaying
{VW} --cb_explore 2 --cover 3 -d train-sets/cb_explore_cover.dat -f models/cover_e_dec.model -p cover_e_dec_train.pred
    train-sets/ref/cbe_cover_e_dec.stderr
    train-sets/ref/cover_e_dec_train.pred

# Test 223: cb_explore with cover epsilon decaying predict
{VW} --cb_explore 2 --cover 3 -d train-sets/cb_explore_cover.dat -i models/cover_e_dec.model -t -p cbe_cover_e_dec.predict
    train-sets/ref/cbe_cover_e_dec_predict.stderr
    pred-sets/ref/cbe_cover_e_dec.predict

# Test 224: cb_explore with cover epsilon fixed
{VW} --cb_explore 2 --cover 3 -d train-sets/cb_explore_cover.dat -f models/cover_e_fixed.model --epsilon 0.5
    train-sets/ref/cbe_cover_e_fixed.stderr

# Test 225: cb_explore with cover epsilon fixed predict
{VW} --cb_explore 2 --cover 3 -d train-sets/cb_explore_cover.dat -i models/cover_e_fixed.model --epsilon 0.5 -t -p cbe_cover_e_fixed.predict
    train-sets/ref/cbe_cover_e_fixed_predict.stderr
    pred-sets/ref/cbe_cover_e_fixed.predict

# Test 226: cb_explore_adf with cover exploration epsilon decaying
{VW} --cb_explore_adf --cover 3 -d train-sets/cb_test_medium.ldf --noconstant -f models/cover_adf_e_dec.model
    train-sets/ref/cbe_adf_cover_e_dec.stderr

# Test 227: cb_explore_adf with cover exploration epsilon decaying predict only
{VW} --cb_explore_adf --cover 3 -d train-sets/cb_test_medium.ldf --noconstant -p cbe_adf_cover_e_dec.predict -i models/cover_adf_e_dec.model -t
    train-sets/ref/cbe_adf_cover_e_dec_predict.stderr
    pred-sets/ref/cbe_adf_cover_e_dec.predict

# Test 228: cb_explore_adf with cover epsilon fixed
{VW} --cb_explore_adf --cover 3 -d train-sets/cb_test_medium.ldf -f models/cover_adf_e_fixed.model --epsilon 0.5
    train-sets/ref/cbe_adf_cover_e_fixed.stderr

# Test 229: cb_explore_adf with cover epsilon fixed predict
{VW} --cb_explore_adf --cover 3 -d train-sets/cb_test_medium.ldf -i models/cover_adf_e_fixed.model --epsilon 0.5 -t -p cbe_adf_cover_e_fixed.predict
    train-sets/ref/cbe_adf_cover_e_fixed_predict.stderr
    pred-sets/ref/cbe_adf_cover_e_fixed.predict

# Test 230: cb_explore_adf with synthcover exploration
{VW} --cb_explore_adf --synthcover --epsilon 0.01 -d train-sets/cb_test.ldf --noconstant -p cbe_adf_synthcover.predict
    train-sets/ref/cbe_adf_synthcover.stderr
    pred-sets/ref/cbe_adf_synthcover.predict

# Test 231: cb data consumed by ccb_explore_adf reduction
{VW} --ccb_explore_adf --dsjson --epsilon 0.2 -d train-sets/cb_as_ccb.json
    train-sets/ref/cb_as_ccb.stderr

# Test 232: CCB interactions with slot with default namespace
{VW} -d train-sets/ccb_test_interactions.dat --ccb_explore_adf --invert_hash w_out_slot_ns.interactions -q ::
    train-sets/ref/ccb_test_interactions.stderr
    train-sets/ref/w_out_slot_ns.interactions

# Test 233: vw --help
{VW} --help
    train-sets/ref/help.stderr
    train-sets/ref/help.stdout

# Test 234: cb with dr (see test 39)
{VW} --cb_force_legacy -d train-sets/rcv1_raw_cb_small.vw --cb 2 --cb_type dr --ngram 2 --skips 4 -b 24 -l 0.25
    train-sets/ref/rcv1_raw_cb_dr.stderr

# Test 235: cb with ips (see test 40)
{VW} --cb_force_legacy -d train-sets/rcv1_raw_cb_small.vw --cb 2 --cb_type ips --ngram 2 --skips 4 -b 24 -l 0.125
    train-sets/ref/rcv1_raw_cb_ips.stderr

# Test 236: cb with dm (see test 41)
{VW} --cb_force_legacy -d train-sets/rcv1_raw_cb_small.vw --cb 2 --cb_type dm --ngram 2 --skips 4 -b 24 -l 0.125 -f cb_dm.reg
    train-sets/ref/rcv1_raw_cb_dm.stderr

# Test 237: cb redirection when --eval (see test 74)
{VW} -d train-sets/rcv1_cb_eval --cb 2 --eval
    train-sets/ref/rcv1_cb_eval.stderr

# Test 238: cbify, epsilon-greedy (see test 76)
{VW} --cb_force_legacy --cbify 10 --epsilon 0.05 -d train-sets/multiclass
    train-sets/ref/cbify_epsilon_legacy.stderr

# Test 239: cbify, tau first (see test 77)
{VW} --cb_force_legacy --cbify 10 --first 5 -d train-sets/multiclass
    train-sets/ref/cbify_first_legacy.stderr

# Test 240: cbify, bag (see test 78)
{VW} --cb_force_legacy --cbify 10 --bag 7 -d train-sets/multiclass
    train-sets/ref/cbify_bag_legacy.stderr

# Test 241: cbify, cover (see test 79)
{VW} --cb_force_legacy --cbify 10 --cover 3 -d train-sets/multiclass --nounif
    train-sets/ref/cbify_cover_legacy.stderr

# Test 242: cb_explore (see test 121)
{VW} --cb_force_legacy -d train-sets/rcv1_raw_cb_small.vw --cb_explore 2 --ngram 2 --skips 4 -b 24 -l 0.25 -p rcv1_raw_cb_explore_legacy.preds
    train-sets/ref/rcv1_raw_cb_explore_legacy.stderr
    pred-sets/ref/rcv1_raw_cb_explore_legacy.preds

# Test 243: test cbify large (see test 169)
{VW} --cb_force_legacy -d train-sets/rcv1_multiclass.dat --cbify 2 --epsilon 0.05
    train-sets/ref/rcv1_multiclass_legacy.stderr

# Test 244: cbify cs, epsilon-greedy (see test 171)
{VW} --cb_force_legacy --cbify 3 --cbify_cs --epsilon 0.05 -d train-sets/cs_cb
    train-sets/ref/cbify_epsilon_cs_legacy.stderr

# Test 245: cb_explore with cover epsilon decaying (see test 222)
{VW} --cb_force_legacy --cb_explore 2 --cover 3 -d train-sets/cb_explore_cover.dat -f models/cover_e_dec.model
    train-sets/ref/cbe_cover_e_dec_legacy.stderr

# Test 246: cb_explore with cover epsilon decaying predict (see test 223)
{VW} --cb_explore 2 --cover 3 -d train-sets/cb_explore_cover.dat -i models/cover_e_dec.model -t -p cbe_cover_e_dec_legacy.predict
    train-sets/ref/cbe_cover_e_dec_predict_legacy.stderr
    pred-sets/ref/cbe_cover_e_dec_legacy.predict

# Test 247: cb_explore with cover epsilon fixed (see test 224)
{VW} --cb_force_legacy --cb_explore 2 --cover 3 -d train-sets/cb_explore_cover.dat -f models/cover_e_fixed.model --epsilon 0.5
    train-sets/ref/cbe_cover_e_fixed_legacy.stderr

# Test 248: cb_explore with cover epsilon fixed predict (see test 225)
{VW} --cb_explore 2 --cover 3 -d train-sets/cb_explore_cover.dat -i models/cover_e_fixed.model --epsilon 0.5 -t -p cbe_cover_e_fixed_legacy.predict
    train-sets/ref/cbe_cover_e_fixed_predict_legacy.stderr
    pred-sets/ref/cbe_cover_e_fixed_legacy.predict

# Test 249: cb evaluation (see test 74)
{VW} -d train-sets/rcv1_cb_eval --cb 2 --eval
    train-sets/ref/rcv1_cb_eval.stderr

# Test 250: 8.8.0 old model, test cb compat
{VW} -i model-sets/cb_compat_test.vwmodel -d train-sets/cb_compat_test.dat -p cb_compat_test.predict
    train-sets/ref/cb_old_model.compat.stderr
    pred-sets/ref/cb_compat_test.predict


# Test 251: cbzo: learn and save constant template model
# Note: Changing hyperparameters (-l, --radius etc.) or any options affecting the learning algorithm will require
# preparing the dataset again
{VW} -d train-sets/cbzo_constant.dat --holdout_off --cbzo --policy constant -l 0.001 --radius 0.1 -f models/cbzo_constant.model
    train-sets/ref/cbzo_constant.stderr
    train-sets/ref/cbzo_constant.stdout

# Test 252: cbzo: verify predictions of Test 251 model.
# Also ends up testing if important cmd-line options (--cbzo, --policy) are saved with the model.
{VW} -d train-sets/cbzo_constant.dat --holdout_off -t --radius 0.1 -i models/cbzo_constant.model -p cbzo_constant.preds
    pred-sets/ref/cbzo_constant.preds

# Test 253: cbzo: verify predictions without the intervention of model saving
{VW} -d train-sets/cbzo_constant.dat --holdout_off --cbzo --policy constant -l 0.001 --radius 0.1 -p cbzo_constant_online.preds
    pred-sets/ref/cbzo_constant_online.preds

# Test 254: cbzo: verify --readable_model file contents
{VW} -d train-sets/cbzo_constant.dat --holdout_off --cbzo --policy constant -l 0.001 --radius 0.1 --readable_model cbzo_constant_readable_model.txt
    train-sets/ref/cbzo_constant_readable_model.txt

# Test 255: cbzo: verify --invert_hash file contents
{VW} -d train-sets/cbzo_constant.dat --holdout_off --cbzo --policy constant -l 0.001 --radius 0.1 --invert_hash cbzo_constant_invert_hash.txt
    train-sets/ref/cbzo_constant_invert_hash.txt

# Test 256: cbzo: learn and save linear template model
# Note: Changing hyperparameters (-l, --l1, --radius etc.) or any options affecting the learning algorithm will require
# preparing the dataset again
{VW} -d train-sets/cbzo_linear.dat --holdout_off --cbzo --policy linear -l 0.0001 --radius 0.1 --l1 0.2 --l2 0.3 --no_bias_regularization -f models/cbzo_linear.model
    train-sets/ref/cbzo_linear.stderr
    train-sets/ref/cbzo_linear.stdout

# Test 257: cbzo: verify predictions of Test 256 model.
# Also ends up testing if important cmd-line options (--cbzo, --policy) are saved with the model.
{VW} -d train-sets/cbzo_linear.dat --holdout_off -t --radius 0.1 -i models/cbzo_linear.model -p cbzo_linear.preds
    pred-sets/ref/cbzo_linear.preds

# Test 258: cbzo: verify predictions without the intervention of model saving
{VW} -d train-sets/cbzo_linear.dat --holdout_off --cbzo --policy linear -l 0.0001 --radius 0.1 --l1 0.2 --l2 0.3 --no_bias_regularization -p cbzo_linear_online.preds
    pred-sets/ref/cbzo_linear_online.preds

# Test 259: cbzo: verify --readable_model file contents
{VW} -d train-sets/cbzo_linear.dat --holdout_off --cbzo --policy linear -l 0.0001 --radius 0.1 --l1 0.2 --l2 0.3 --no_bias_regularization --readable_model cbzo_linear_readable_model.txt
    train-sets/ref/cbzo_linear_readable_model.txt

# Test 260: cbzo: verify --invert_hash file contents
{VW} -d train-sets/cbzo_linear.dat --holdout_off --cbzo --policy linear -l 0.0001 --radius 0.1 --l1 0.2 --l2 0.3 --no_bias_regularization --invert_hash cbzo_linear_invert_hash.txt
    train-sets/ref/cbzo_linear_invert_hash.txt

# Test 261: vw --help with filtering
{VW} --cb_adf --help
    train-sets/ref/help_cbadf.stderr
    train-sets/ref/help_cbadf.stdout

# Test 262: vw test used in brew script
{VW} -d train-sets/houses --audit --nn 1
    train-sets/ref/houses.stderr
    train-sets/ref/houses.stdout

# Test 263: creating model using --save_resume to be tested with -q :: and --invert_hash
{VW} -d train-sets/ccb_test_interactions.dat --ccb_explore_adf -q :: -f models/288.model --save_resume --invert_hash ccb_quad.inv
    train-sets/ref/ccb_quad.stderr
    pred-sets/ref/ccb_quad.inv

# Test 264: checking invert_hash for ccb with --save_resume model and -q ::
{VW} -d train-sets/ccb_test_interactions.dat -i models/288.model --save_resume --invert_hash ccb_quad_save_resume.inv
    train-sets/ref/ccb_quad_save_resume.stderr
    pred-sets/ref/ccb_quad_save_resume.inv

# Test 265: Slates sanity check with interactions and invert hash
{VW} --slates -d train-sets/slates_simple_w_interactions.txt -p slates_simple_w_interactions.predict -q :: --invert_hash slates_w_interactions.inv
    train-sets/ref/slates_simple_w_interactions.stderr
    pred-sets/ref/slates_simple_w_interactions.predict
    pred-sets/ref/slates_w_interactions.inv

# Test 266: test -q :: with many interactions
{VW} -d train-sets/ccb_lots_of_interactions.dat --ccb_explore_adf -q :: --invert_hash ccb_lots_of_interactions.inv
    train-sets/ref/ccb_lots_of_interactions.stderr
    pred-sets/ref/ccb_lots_of_interactions.inv

# Test 267: test -q :: with explicit interactions
{VW} -d train-sets/ccb_lots_of_interactions_mini.dat --ccb_explore_adf -q :: -q AB --interactions AAA --interactions ::: --invert_hash ccb_implicit_and_explicit_interactions.inv
    train-sets/ref/ccb_implicit_and_explicit_interactions.stderr
    pred-sets/ref/ccb_implicit_and_explicit_interactions.inv

# Test 268: test -q :: with explicit interactions and ignore
{VW} -d train-sets/ccb_lots_of_interactions_mini.dat --ignore C --ccb_explore_adf -q :: -q AB --interactions AAA --interactions ::: --invert_hash ccb_implicit_explicit_ignore_interactions.inv
    train-sets/ref/ccb_implicit_explicit_ignore_interactions.stderr
    pred-sets/ref/ccb_implicit_explicit_ignore_interactions.inv

# Test 269: Test interact reduction
{VW} -d train-sets/interact.dat -p t288.predict --interact ab --readable_model t288.readable
    test-sets/ref/t288.stderr
    pred-sets/ref/t288.predict
    pred-sets/ref/t288.readable

# Test 270: FTRL readable model test
# TODO: investigate slow convergence
{VW} -d train-sets/regression_simple.txt --noconstant --ftrl --invert_hash ftrl.readable
    train-sets/ref/ftrl.readable

# Test 271: pistol readable model test
{VW} -d train-sets/regression_simple.txt --noconstant --pistol --invert_hash pistol.readable
    train-sets/ref/pistol.readable

# Test 272: coin readable model test
{VW} -d train-sets/regression_simple.txt --noconstant --coin --invert_hash coin.readable
    train-sets/ref/coin.readable

# Test 273: generate cb model
{VW} -d train-sets/cb_as_ccb.json --dsjson --cb_explore_adf --save_resume -f models/cb_model.bin
    train-sets/ref/cb_as_ccb.cb.stderr

# Test 274: load cb model into ccb learner
{VW} -d train-sets/cb_as_ccb.json --dsjson --ccb_explore_adf --save_resume -i models/cb_model.bin
    train-sets/ref/cb_as_ccb.ccb.stderr

# Test 275: memory_tree create model
{VW} -d train-sets/aloi_short_train.dat --memory_tree 1204 --learn_at_leaf --max_number_of_labels 1000 --dream_at_update 0 \
    --dream_repeats 20 --online --leaf_example_multiplier 10 -f cmt.model
    train-sets/ref/cmt_train_model.stderr

# Test 276: memory_tree load model
{VW} -d test-sets/aloi_short_test.dat -i cmt.model
    train-sets/ref/cmt_test_model.stderr

# Test 277: cb file with no extra newline at the end
{VW} --cb_explore_adf --epsilon 0.1 -d train-sets/cb_test_nonewline.ldf --noconstant -p cbe_adf_nonewline.predict
    train-sets/ref/cbe_adf_nonewline.stderr
    pred-sets/ref/cbe_adf_nonewline.predict

# Test 278: ccb file with no extra newline at the end
{VW} --ccb_explore_adf  -d train-sets/ccb_test_nonewline.dat -p ccb_test_nonewline.predict
    train-sets/ref/ccb_test_nonewline.stderr
    train-sets/ref/ccb_test_nonewline.predict

# Test 279: slates file with no extra newline at the end
{VW} --slates -d train-sets/slates_simple_nonewline.txt -p slates_simple_nonewline.predict
    train-sets/ref/slates_simple_nonewline.stderr
    pred-sets/ref/slates_simple_nonewline.predict

# Test 280: cb json dataset with zero feature values
{VW} --cb_explore_adf --dsjson -d train-sets/cb_features_w_zero_vals.dsjson -p cb_zero_feature_vals_dsjson.predict
    train-sets/ref/cb_zero_feature_vals_dsjson.stderr
    pred-sets/ref/cb_zero_feature_vals_dsjson.predict

# Test 281: cb text dataset with zero feature values
{VW} --cb_explore_adf -d train-sets/cb_features_w_zero_vals.dat -p cb_zero_feature_vals.predict
    train-sets/ref/cb_zero_feature_vals.stderr
    pred-sets/ref/cb_zero_feature_vals.predict

# Test 282: ccb text dataset with zero feature values in actions/slot/shared feature
{VW} --ccb_explore_adf -d train-sets/ccb_zero_value_features.dat
    train-sets/ref/ccb_zero_value_features.stderr

# Test 283: slate text dataset with zero feature values in actions/slot/shared feature
{VW} --slates -d train-sets/slates_zero_value_features.txt
    train-sets/ref/slates_zero_value_features.stderr

# Test 284: cats predict room temperature learn
{VW} -d train-sets/cats_room_temp.json --cats 32 --bandwidth 3 --min_value 0 --max_value 100 -f models/cats_room_temp.model --coin --json --chain_hash --epsilon 0.5
    train-sets/ref/cats_room_temp.stderr

# Test 285: cats predict room temperature, predict with the learned model
{VW} -d train-sets/cats_room_temp.json -i models/cats_room_temp.model --coin --json --chain_hash -p cats_room_temp.predict
    train-sets/ref/cats_room_temp_pred.stderr
    pred-sets/ref/cats_room_temp.predict

# Test 286: test extra_metrics
{VW} -d train-sets/decisionservice.json --dsjson --cb_explore_adf --epsilon 0.2 --quadratic GT -P 1 -p cbe_adf_dsjson.predict --extra_metrics metrics.json
    train-sets/ref/cbe_adf_dsjson_metrics.stderr
    pred-sets/ref/cbe_adf_dsjson.predict
    test-sets/ref/metrics.json

# Test 287: test extra_metrics
{VW}  -d train-sets/rcv1_raw_cb_small.vw --cb 2 --cb_type dr --ngram 2 --skips 4 -b 24 -l 0.25 --extra_metrics metrics_2.json
    train-sets/ref/rcv1_raw_cb_dr_metrics.stderr
    test-sets/ref/metrics_2.json