# Reproduction of Various Tribuo Models

## Setup
You'll need to get a copy of the Iris dataset. If you are using Docker, it should already be downloaded.

`wget https://archive.ics.uci.edu/ml/machine-learning-databases/iris/bezdekIris.data`

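It's Java, so first we load in the necessary Tribuo jars. Here we're using the classification experiments jar and the json interop jar (to read and write the provenance information), plus the regression, clustering, anomaly detection, ONNX and reproducibility jars used by the later sections.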

In [13]:
%jars ./tribuo-classification-experiments-4.2.0-SNAPSHOT-jar-with-dependencies.jar
%jars ./tribuo-json-4.2.0-SNAPSHOT-jar-with-dependencies.jar
%jars ./tribuo-regression-sgd-4.2.0-SNAPSHOT-jar-with-dependencies.jar
%jars ./tribuo-regression-xgboost-4.2.0-SNAPSHOT-jar-with-dependencies.jar
%jars ./tribuo-regression-tree-4.2.0-SNAPSHOT-jar-with-dependencies.jar
%jars ./tribuo-clustering-kmeans-4.2.0-SNAPSHOT-jar-with-dependencies.jar
%jars ./tribuo-anomaly-libsvm-4.2.0-SNAPSHOT-jar-with-dependencies.jar
%jars ./tribuo-onnx-4.2.0-SNAPSHOT-jar-with-dependencies.jar

%jars ./tribuo-reproducibility-4.2.0-SNAPSHOT-jar-with-dependencies.jar

In [14]:
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.Files;

In [15]:
import org.tribuo.*;
import org.tribuo.evaluation.TrainTestSplitter;
import org.tribuo.data.csv.CSVLoader;
import org.tribuo.datasource.ListDataSource;
import org.tribuo.evaluation.TrainTestSplitter;
import org.tribuo.classification.*;
import org.tribuo.classification.evaluation.*;
import org.tribuo.classification.sgd.linear.LogisticRegressionTrainer;
import org.tribuo.classification.sgd.linear.LinearSGDModel;
import org.tribuo.math.optimisers.*;
import org.tribuo.regression.*;
import org.tribuo.regression.evaluation.*;
import org.tribuo.regression.sgd.RegressionObjective;
import org.tribuo.regression.sgd.linear.LinearSGDTrainer;
import org.tribuo.regression.sgd.objectives.SquaredLoss;
import org.tribuo.regression.rtree.CARTRegressionTrainer;
import org.tribuo.regression.rtree.impurity.MeanSquaredError;
import org.tribuo.regression.xgboost.XGBoostRegressionTrainer;
import org.tribuo.util.Util;

import org.tribuo.provenance.DatasetProvenance;

import org.tribuo.reproducibility.ReproUtil;

In [16]:
import com.fasterxml.jackson.databind.*;
import com.oracle.labs.mlrg.olcut.provenance.ProvenanceUtil;
import com.oracle.labs.mlrg.olcut.config.json.*;

# Setup 

In [17]:
// Start a fresh ./results.csv containing only the header row; addToCSV appends to it later.
// try-with-resources closes (and thereby flushes) the writer even if the append fails.
try (FileWriter fw = new FileWriter("./results.csv")) {
    fw.append("Model, EqualEval, Diff, Dataset, Note\n");
}

/**
 * Prepares a value for use as a single CSV field.
 *
 * Every line break (any sequence matched by the regex {@code \R}) is replaced by one space.
 * If the flattened value contains a comma, a double quote or a single quote, embedded
 * double quotes are doubled and the whole value is wrapped in double quotes.
 *
 * @param data the raw field value; must not be null
 * @return the flattened and, where needed, quoted field value
 */
public String escapeSpecialCharacters(String data) {
    String flattened = data.replaceAll("\\R", " ");
    boolean needsQuoting = flattened.contains(",")
            || flattened.contains("\"")
            || flattened.contains("'");
    if (!needsQuoting) {
        return flattened;
    }
    // Quote the flattened text, not the raw input, so the line-break removal is kept.
    return "\"" + flattened.replace("\"", "\"\"") + "\"";
}

/**
 * Appends one result row to ./results.csv.
 *
 * All five columns are passed through escapeSpecialCharacters, so a comma, quote or
 * line break in any of them cannot break the row apart.
 *
 * @param model   value for the Model column
 * @param equal   value for the EqualEval column
 * @param diff    value for the Diff column
 * @param dataset value for the Dataset column
 * @param note    value for the Note column
 * @throws Exception if the file cannot be opened or written
 */
public void addToCSV(String model, String equal, String diff, String dataset, String note) throws Exception{
    String row = String.join(",",
            escapeSpecialCharacters(model),
            escapeSpecialCharacters(equal),
            escapeSpecialCharacters(diff),
            escapeSpecialCharacters(dataset),
            escapeSpecialCharacters(note)) + "\n";
    // Open in append mode; try-with-resources closes (and flushes) the writer even if append throws.
    try (FileWriter fw = new FileWriter("./results.csv", true)) {
        fw.append(row);
    }
}



## Irises Reproduction

In [18]:
// Loader that turns each CSV row's response value into a classification Label.
var labelFactory = new LabelFactory();
var csvLoader = new CSVLoader<>(labelFactory);

// Column names are supplied explicitly; "species" is named as the response column.
var irisHeaders = new String[]{"sepalLength", "sepalWidth", "petalLength", "petalWidth", "species"};
var irisesSource = csvLoader.loadDataSource(Paths.get("bezdekIris.data"),"species",irisHeaders);
// Split with a 0.7 training proportion and RNG seed 1.
var irisSplitter = new TrainTestSplitter<>(irisesSource,0.7,1L);

var trainingDataset = new MutableDataset<>(irisSplitter.getTrain());
var testingDataset = new MutableDataset<>(irisSplitter.getTest());

// Logistic regression trainer constructed with no arguments.
Trainer<Label> trainer = new LogisticRegressionTrainer();

// This is the "original" model that the following cells reproduce from its provenance.
Model<Label> irisModel = trainer.train(trainingDataset);

In [19]:
// Rebuild the iris model using only the provenance recorded on the original model.
var repro = new ReproUtil(irisModel.getProvenance());
Model<Label> newModel = repro.reproduceFromProvenance();

// Evaluate the original and the reproduction on the same held-out split.
var evaluator = new LabelEvaluator();
var oldEvaluation = evaluator.evaluate(irisModel,testingDataset);
var newEvaluation = evaluator.evaluate(newModel,testingDataset);

// Compare the textual evaluation summaries once and reuse the result for the CSV row.
boolean irisEvaluationsMatch = oldEvaluation.toString().equals(newEvaluation.toString());
addToCSV(irisModel.getProvenance().getClassName(),
         String.valueOf(irisEvaluationsMatch),
         ReproUtil.diffProvenance(irisModel.getProvenance(), newModel.getProvenance()),"Irises", "");

### Example Diff of a Reproduced Model

In [20]:
// Print the provenance diff between the original and reproduced iris models, followed by an extra newline.
System.out.println(ReproUtil.diffProvenance(irisModel.getProvenance(), newModel.getProvenance()) + '\n');

{
  "dataset" : {
    "datasource" : {
      "source" : {
        "datasource-creation-time" : {
          "original" : "2021-09-30T19:54:01.994181189Z",
          "reproduced" : "2021-09-30T19:54:03.831371998Z"
        }
      }
    }
  },
  "trained-at" : {
    "original" : "2021-09-30T19:54:02.095293115Z",
    "reproduced" : "2021-09-30T19:54:03.834169995Z"
  }
}



# Regression Wine Quality

In [10]:
/**
 * Trains a regressor, rebuilds it from its own provenance, and appends a row to
 * results.csv stating whether both models produce the same evaluation text on the
 * test data, together with the provenance diff.
 *
 * @param trainData data the original model is trained on
 * @param testData  data both models are evaluated on
 * @param trainer   trainer used to build the original model
 * @param note      free-text value for the Note column of the results row
 * @return the model reproduced from provenance (not the originally trained one)
 * @throws Exception propagated from reproduction or from writing the results row
 */
public Model<Regressor> reproduceRegressor(Dataset<Regressor> trainData, Dataset<Regressor> testData, Trainer<Regressor> trainer, String note) throws Exception{
    Model<Regressor> original = trainer.train(trainData);
    var originalProvenance = original.getProvenance();

    // Reproduction is driven by the provenance alone.
    ReproUtil reproducer = new ReproUtil(originalProvenance);
    Model<Regressor> reproduced = reproducer.reproduceFromProvenance();

    RegressionEvaluator evaluator = new RegressionEvaluator();
    var originalEval = evaluator.evaluate(original, testData);
    var reproducedEval = evaluator.evaluate(reproduced, testData);

    // Assemble the row fields in the same order they are written.
    String modelClass = originalProvenance.getClassName();
    String sameEvaluation = String.valueOf(originalEval.toString().equals(reproducedEval.toString()));
    String provenanceDiff = ReproUtil.diffProvenance(originalProvenance, reproduced.getProvenance());

    addToCSV(modelClass, sameEvaluation, provenanceDiff, "Wine Quality", note);
    return reproduced;
}

// Regression loader; ';' is passed as the CSV separator character.
var regressionFactory = new RegressionFactory();
var csvLoader = new CSVLoader<>(';',regressionFactory);

// "quality" is named as the response column; split with a 0.7 training proportion and seed 0.
var wineSource = csvLoader.loadDataSource(Paths.get("winequality-red.csv"),"quality");
var splitter = new TrainTestSplitter<>(wineSource, 0.7f, 0L);
Dataset<Regressor> trainData = new MutableDataset<>(splitter.getTrain());
Dataset<Regressor> testData = new MutableDataset<>(splitter.getTest());

// Two linear SGD trainers that differ only in the gradient optimiser.
var lrsgd = new LinearSGDTrainer(
    new SquaredLoss(), // loss function
    SGD.getLinearDecaySGD(0.01), // gradient descent algorithm
    10,                // number of training epochs
    trainData.size()/4,// logging interval
    1,                 // minibatch size
    1L                 // RNG seed
);
var lrada = new LinearSGDTrainer(
    new SquaredLoss(),
    new AdaGrad(0.01),
    10,
    trainData.size()/4,
    1,
    1L 
);
var cart = new CARTRegressionTrainer(6);
// NOTE(review): xgb is constructed here but never passed to reproduceRegressor in this cell.
var xgb = new XGBoostRegressionTrainer(50);

// Each call trains, reproduces, logs a results row, and returns the reproduced model.
Model<Regressor> lrsgdModel = reproduceRegressor(trainData, testData, lrsgd, "LinearDecaySGD");
Model<Regressor> lradaModel = reproduceRegressor(trainData, testData, lrada, "AdaGrad");
Model<Regressor> cartModel = reproduceRegressor(trainData, testData, cart, "");

# Example Diff with Same Trainer but Different Optimizers

In [23]:
// Diff the provenance of the two reproduced linear regressors (LinearDecaySGD run vs AdaGrad run).
System.out.println(ReproUtil.diffProvenance(lrsgdModel.getProvenance(), lradaModel.getProvenance()));

{
  "dataset" : {
    "datasource" : {
      "source" : {
        "datasource-creation-time" : {
          "original" : "2021-09-30T19:52:57.390025140Z",
          "reproduced" : "2021-09-30T19:52:57.444953394Z"
        }
      }
    }
  },
  "trained-at" : {
    "original" : "2021-09-30T19:52:57.416278131Z",
    "reproduced" : "2021-09-30T19:52:57.464902493Z"
  },
  "trainer" : {
    "optimiser" : {
      "class-name" : {
        "original" : "org.tribuo.math.optimisers.LinearDecaySGD",
        "reproduced" : "org.tribuo.math.optimisers.AdaGrad"
      },
      "rho" : {
        "original" : "0.0"
      },
      "useMomentum" : {
        "original" : "NONE"
      },
      "epsilon" : {
        "reproduced" : "1.0E-6"
      },
      "initialValue" : {
        "reproduced" : "0.0"
      }
    }
  }
}


# Diff of Classification LinearSGDModel to Regression LinearSGDModel

In [24]:
// Diff the regression model's provenance (first argument) against the iris classification model's (second argument).
System.out.println(ReproUtil.diffProvenance(lrsgdModel.getProvenance(), irisModel.getProvenance()));

{
  "class-name" : {
    "original" : "org.tribuo.regression.sgd.linear.LinearSGDModel",
    "reproduced" : "org.tribuo.classification.sgd.linear.LinearSGDModel"
  },
  "dataset" : {
    "datasource" : {
      "seed" : {
        "original" : "0",
        "reproduced" : "1"
      },
      "size" : {
        "original" : "1599",
        "reproduced" : "150"
      },
      "source" : {
        "dataPath" : {
          "original" : "/home/jupyter/results/winequality-red.csv",
          "reproduced" : "/home/jupyter/results/bezdekIris.data"
        },
        "datasource-creation-time" : {
          "original" : "2021-09-30T19:52:57.390025140Z",
          "reproduced" : "2021-09-30T19:54:01.994181189Z"
        },
        "file-modified-time" : {
          "original" : "2009-10-16T21:36:50Z",
          "reproduced" : "1999-12-14T20:12:39Z"
        },
        "resource-hash" : {
          "original" : "4A402CF041B025D4566D954C3B9BA8635A3A8A01E039005D97D6A710278CF05E",
          "reproduced" :

# Configuration 

In [9]:
import org.tribuo.transform.*;
import org.tribuo.transform.transformations.LinearScalingTransformation;
import org.tribuo.classification.*;
import org.tribuo.classification.evaluation.*;
import com.oracle.labs.mlrg.olcut.config.Configurable;
import com.oracle.labs.mlrg.olcut.config.ConfigurationManager;
import com.oracle.labs.mlrg.olcut.config.DescribeConfigurable;
import com.oracle.labs.mlrg.olcut.provenance.*;
import com.oracle.labs.mlrg.olcut.provenance.primitives.*;
import com.oracle.labs.mlrg.olcut.config.json.JsonConfigFactory;

// Register the JSON config factory before any ConfigurationManager is built from a .json file below.
ConfigurationManager.addFileFormatFactory(new JsonConfigFactory());

### Transform Trainer directly

In [10]:
// Load the example configuration and look up the trainer registered under the name "logistic".
var configPath = Paths.get("..", "tribuo","tutorials", "configuration","example-config.json");
var cm = new ConfigurationManager(configPath.toString());
var logistic = (Trainer<Label>) cm.lookup("logistic");

// The train and test data sources are also defined in the configuration file.
DataSource<Label> mnistTrain = (DataSource<Label>) cm.lookup("mnist-train");
DataSource<Label> mnistTest = (DataSource<Label>) cm.lookup("mnist-test");

var trainData = new MutableDataset<>(mnistTrain);
var testData = new MutableDataset<>(mnistTest);
// Wrap the trainer so a LinearScalingTransformation(0,1) is applied before training
// (presumably rescaling features into [0,1] — TODO confirm argument meaning).
var transformations = new TransformationMap(List.of(new LinearScalingTransformation(0,1)));
var transformed = new TransformTrainer(logistic,transformations);
var transformedModel = transformed.train(trainData);


// Reproduce the transformed model from its provenance.
var repro = new ReproUtil(transformedModel.getProvenance());
var newModel = repro.reproduceFromProvenance();



In [11]:
// Evaluate the original and reproduced models on the mnistTest data source.
// The casts are needed because both models were declared with var from raw-typed calls.
var eval = new LabelEvaluator();
var oldEvaluation = eval.evaluate((Model<Label>)transformedModel, mnistTest);
var newEvaluation = eval.evaluate((Model<Label>)newModel, mnistTest);

// Record whether the two evaluation summaries are textually identical, plus the provenance diff.
addToCSV(newModel.getProvenance().getClassName(),
            String.valueOf(oldEvaluation.toString().equals(newEvaluation.toString())),
            ReproUtil.diffProvenance(transformedModel.getProvenance(), newModel.getProvenance()),"MNIST", "");

### Transform Trainer with TrainTestSplitter

In [12]:
// Same setup as the previous section, except the training data comes from a TrainTestSplitter
// over mnist-train, so the splitter becomes part of the provenance that must be reproduced.
var configPath = Paths.get("..", "tribuo","tutorials", "configuration","example-config.json");
var cm = new ConfigurationManager(configPath.toString());
var logistic = (Trainer<Label>) cm.lookup("logistic");

DataSource<Label> mnistTrain = (DataSource<Label>) cm.lookup("mnist-train");
// Raw-typed splitter using the single-argument constructor (default split settings).
TrainTestSplitter splitter = new TrainTestSplitter(mnistTrain);

var trainData = new MutableDataset<>(splitter.getTrain());
var testData = new MutableDataset<>(splitter.getTest());
var transformations = new TransformationMap(List.of(new LinearScalingTransformation(0,1)));
var transformed = new TransformTrainer(logistic,transformations);
var transformedModel = transformed.train(trainData);


// Reproduce the transformed model from its provenance.
var repro = new ReproUtil(transformedModel.getProvenance());
var newModel = repro.reproduceFromProvenance();



In [13]:
// Evaluate the original and reproduced models.
// NOTE(review): this uses mnistTest, which is defined in the earlier "Transform Trainer directly" cell,
// not the testData split created in the cell above — confirm that is intended.
var eval = new LabelEvaluator();
var oldEvaluation = eval.evaluate((Model<Label>)transformedModel, mnistTest);
var newEvaluation = eval.evaluate((Model<Label>)newModel, mnistTest);

// Record whether the two evaluation summaries are textually identical, plus the provenance diff.
addToCSV(newModel.getProvenance().getClassName(),
            String.valueOf(oldEvaluation.toString().equals(newEvaluation.toString())),
            ReproUtil.diffProvenance(transformedModel.getProvenance(), newModel.getProvenance()), "MNIST", "With TrainTestSplitter");

# Clustering

In [14]:
import org.tribuo.clustering.*;
import org.tribuo.clustering.evaluation.*;
import org.tribuo.clustering.example.GaussianClusterDataSource;
import org.tribuo.clustering.kmeans.*;
import org.tribuo.clustering.kmeans.KMeansTrainer.Distance;
import org.tribuo.clustering.kmeans.KMeansTrainer.Initialisation;

In [15]:
// Redefines the notebook-level eval variable as a clustering evaluator for the cells below.
var eval = new ClusteringEvaluator();

In [16]:
// Two generated clustering datasets built with different second arguments (1L and 2L).
var data = new MutableDataset<>(new GaussianClusterDataSource(500, 1L));
var test = new MutableDataset<>(new GaussianClusterDataSource(500, 2L));

var trainer = new KMeansTrainer(5, /* centroids */
                                10, /* iterations */
                                Distance.EUCLIDEAN, /* distance function */
                                1, /* number of compute threads */
                                1 /* RNG seed */
                               );
var kmeansModel = trainer.train(data);
// Here ReproUtil is constructed from the model itself rather than from its provenance.
var kmRepro = new ReproUtil(kmeansModel);
var newKmeans = kmRepro.reproduceFromProvenance();

// The reproduced model is cast to KMeansModel before being evaluated.
var kmeansEval = eval.evaluate(kmeansModel, test);
var newKmeansEval = eval.evaluate((KMeansModel)newKmeans, test);

// Record whether the two evaluation summaries are textually identical, plus the provenance diff.
addToCSV(kmeansModel.getProvenance().getClassName(),
        String.valueOf(kmeansEval.toString().equals(newKmeansEval.toString())),
        ReproUtil.diffProvenance(kmeansModel.getProvenance(), newKmeans.getProvenance()), "Generated", "")


### KMeans++ 

In [17]:
// Same numeric settings as the previous KMeans trainer, with Initialisation.PLUSPLUS passed explicitly.
var plusplusTrainer = new KMeansTrainer(5,10,Distance.EUCLIDEAN,Initialisation.PLUSPLUS,1,1);
// Reuses the data and test datasets created in the previous cell.
var plusplusModel = plusplusTrainer.train(data);

var plusplusRepro = new ReproUtil(plusplusModel);
var newPlusPlus = plusplusRepro.reproduceFromProvenance();

var plusPlusEval = eval.evaluate(plusplusModel, test);
var newPlusPlusEval = eval.evaluate((KMeansModel)newPlusPlus, test);

// Record whether the two evaluation summaries are textually identical, plus the provenance diff.
addToCSV(plusplusModel.getProvenance().getClassName(),
        String.valueOf(plusPlusEval.toString().equals(newPlusPlusEval.toString())),
        ReproUtil.diffProvenance(plusplusModel.getProvenance(), newPlusPlus.getProvenance()),"Generated", "KMeans++")


# Anomaly Detection with LibSVM

In [18]:
import org.tribuo.anomaly.*;
import org.tribuo.anomaly.evaluation.*;
import org.tribuo.anomaly.example.GaussianAnomalyDataSource;
import org.tribuo.anomaly.libsvm.*;
import org.tribuo.common.libsvm.*;

// Redefines the notebook-level eval variable as an anomaly evaluator for the cell below.
var eval = new AnomalyEvaluator();

In [19]:
// Training data is generated with an anomalous fraction of 0.0; the test data uses 0.2 and a different seed.
var data = new MutableDataset<>(new GaussianAnomalyDataSource(2000,/* number of examples */
                                                              0.0f,/*fraction anomalous */
                                                              1L/* RNG seed */));
var test = new MutableDataset<>(new GaussianAnomalyDataSource(2000,0.2f,2L));

// One-class SVM with an RBF kernel; gamma and nu are set explicitly.
var params = new SVMParameters<>(new SVMAnomalyType(SVMAnomalyType.SVMMode.ONE_CLASS), KernelType.RBF);
params.setGamma(1.0);
params.setNu(0.1); 
var trainer = new LibSVMAnomalyTrainer(params);

var anomModel = trainer.train(data);

// Reproduce from the model; the result is cast to LibSVMModel before evaluation.
var anomRepro = new ReproUtil(anomModel);
var newAnom = anomRepro.reproduceFromProvenance();

var testEvaluation = eval.evaluate(anomModel,test);
var newEvaluation = eval.evaluate((LibSVMModel)newAnom,test);

// Record whether the two evaluation summaries are textually identical, plus the provenance diff.
addToCSV(anomModel.getProvenance().getClassName(),
        String.valueOf(testEvaluation.toString().equals(newEvaluation.toString())),
        ReproUtil.diffProvenance(anomModel.getProvenance(), newAnom.getProvenance()), "Generated", "")

*
optimization finished, #iter = 653
obj = 289.5926348816893, rho = 3.144570476807895
nSV = 296, nBSV = 114
*
optimization finished, #iter = 653
obj = 289.5926348816893, rho = 3.144570476807895
nSV = 296, nBSV = 114


# Loading Columnar Data

In [20]:
import java.nio.charset.StandardCharsets;
import java.util.Locale;
import java.util.stream.*;

import com.oracle.labs.mlrg.olcut.config.ConfigurationManager;
import com.oracle.labs.mlrg.olcut.provenance.ProvenanceUtil;

import org.tribuo.*;
import org.tribuo.data.columnar.*;
import org.tribuo.data.columnar.processors.field.*;
import org.tribuo.data.columnar.processors.response.*;
import org.tribuo.data.columnar.extractors.*;
import org.tribuo.data.csv.CSVDataSource;
import org.tribuo.data.text.impl.BasicPipeline;
import org.tribuo.json.JsonDataSource;
import org.tribuo.classification.*;
import org.tribuo.classification.sgd.linear.LogisticRegressionTrainer;
import org.tribuo.util.tokens.impl.BreakIteratorTokenizer;

In [21]:
// NOTE(review): csvLines (and jsonLines below) are read but not used in the cells visible here.
var csvPath = Paths.get("..", "tribuo", "tutorials", "columnar-data","columnar-example.csv");
var csvLines = Files.readAllLines(csvPath, StandardCharsets.UTF_8);

// Text pipeline used for the "description" field (second argument presumably the n-gram size — TODO confirm).
BasicPipeline textPipeline = new BasicPipeline(new BreakIteratorTokenizer(Locale.US),2);
HashMap<String, FieldProcessor> fieldProcessors = new HashMap<String, FieldProcessor>();

// One processor per named input field.
fieldProcessors.put("height",new DoubleFieldProcessor("height"));
fieldProcessors.put("description",new TextFieldProcessor("description",textPipeline));
fieldProcessors.put("transport",new IdentityProcessor("transport"));

// Processors keyed by a regex instead of an exact field name.
HashMap<String,FieldProcessor> regexMappingProcessors = new HashMap<String,FieldProcessor>();
regexMappingProcessors.put("extra.*", new DoubleFieldProcessor("extra.*"));

// The response is read from the "disposition" field ("UNK" is presumably the default value — TODO confirm).
FieldResponseProcessor responseProcessor = new FieldResponseProcessor("disposition","UNK",new LabelFactory());

// Metadata extractors for the "id" and "timestamp" fields.
ArrayList<FieldExtractor<?>> metadataExtractors = new ArrayList<FieldExtractor<?>>();
metadataExtractors.add(new IntExtractor("id"));
metadataExtractors.add(new DateExtractor("timestamp","timestamp","dd/MM/yyyy HH:mm"));

FloatExtractor weightExtractor = new FloatExtractor("example-weight");

// A single RowProcessor is shared by the CSV and JSON data sources below.
RowProcessor<Label> rowProcessor = new RowProcessor<Label>(metadataExtractors,weightExtractor,responseProcessor,fieldProcessors,regexMappingProcessors, Collections.emptySet());

var jsonPath = Paths.get("..", "tribuo", "tutorials", "columnar-data","columnar-example.json");
var jsonLines = Files.readAllLines(jsonPath, StandardCharsets.UTF_8);

var jsonSource = new JsonDataSource<>(jsonPath,rowProcessor,true);
CSVDataSource csvSource = new CSVDataSource<Label>(csvPath,rowProcessor,true);

// Raw-typed splitters using the single-argument constructor (default split settings).
var csvSplitter = new TrainTestSplitter(csvSource);
var jsonSplitter = new TrainTestSplitter(jsonSource);


// Only the training portions are materialised here; the test portions are built in the next cell.
MutableDataset<Label> datasetFromJson = new MutableDataset<Label>(jsonSplitter.getTrain());
MutableDataset<Label> datasetFromCSV = new MutableDataset<Label>(csvSplitter.getTrain());

In [22]:
// Train one logistic regression model per data source format.
var csvModel = new LogisticRegressionTrainer().train(datasetFromCSV);
var jsonModel = new LogisticRegressionTrainer().train(datasetFromJson);

var csvRepro = new ReproUtil(csvModel);
var jsonRepro = new ReproUtil(jsonModel);

var newCSV = csvRepro.reproduceFromProvenance();
var newJson = jsonRepro.reproduceFromProvenance();

var evaluator = new LabelEvaluator();

// Each evaluation builds its own raw-typed MutableDataset from the splitter's test portion.
var csvEval = evaluator.evaluate(csvModel, new MutableDataset(csvSplitter.getTest()));
var jsonEval = evaluator.evaluate(jsonModel, new MutableDataset(jsonSplitter.getTest()));

// The reproduced models are cast to LinearSGDModel before evaluation.
var newCsvEval = evaluator.evaluate((LinearSGDModel) newCSV, new MutableDataset(csvSplitter.getTest()));
var newJsonEval = evaluator.evaluate((LinearSGDModel) newJson, new MutableDataset(jsonSplitter.getTest()));

// One results row per format: evaluation-text equality plus the provenance diff.
addToCSV(csvModel.getProvenance().getClassName(),
        String.valueOf(csvEval.toString().equals(newCsvEval.toString())),
        ReproUtil.diffProvenance(csvModel.getProvenance(), newCSV.getProvenance()), "Generated", "CSV Columnar with TrainTest");
addToCSV(jsonModel.getProvenance().getClassName(),
        String.valueOf(jsonEval.toString().equals(newJsonEval.toString())),
        ReproUtil.diffProvenance(jsonModel.getProvenance(), newJson.getProvenance()), "Generated", "JSON Columnar with TrainTest")

# Feature Extraction

In [23]:
import java.util.Collections;
import java.nio.file.Paths;
import com.oracle.labs.mlrg.olcut.provenance.ProvenanceUtil;
import com.oracle.labs.mlrg.olcut.util.Pair;
import org.tribuo.*;
import org.tribuo.data.text.*;
import org.tribuo.data.text.impl.*;
import org.tribuo.dataset.MinimumCardinalityDataset;
import org.tribuo.classification.*;
import org.tribuo.classification.evaluation.*;
import org.tribuo.classification.sgd.linear.LinearSGDTrainer;
import org.tribuo.classification.sgd.objectives.LogMulticlass;
import org.tribuo.interop.onnx.extractors.BERTFeatureExtractor;
import org.tribuo.math.optimisers.AdaGrad;
import org.tribuo.transform.*;
import org.tribuo.transform.transformations.IDFTransformation;
import org.tribuo.util.tokens.universal.UniversalTokenizer;
import org.tribuo.util.Util;

In [24]:
// Shared objects for the 20 newsgroups experiments; mkDatasets below reads these notebook-level variables.
var labelFactory = new LabelFactory();
var labelEvaluator = new LabelEvaluator();
var trainPath = Paths.get(".","20news","20news-bydate-train");
var testPath = Paths.get(".","20news","20news-bydate-test");
var tokenizer = new UniversalTokenizer();
// Bag-of-words extractor built on a BasicPipeline (second argument presumably the n-gram size — TODO confirm).
var bowPipeline = new BasicPipeline(tokenizer,1);
var bowExtractor = new TextFeatureExtractorImpl<Label>(bowPipeline);

// Preprocessors handed to every DirectoryFileSource: a news-specific one and a lowercasing one.
var newsProc = new NewsPreprocessor();
var lowercase = new CasingPreprocessor(CasingPreprocessor.CasingOperation.LOWERCASE);

/**
 * Builds the 20 newsgroups train and test datasets with the given feature extractor
 * and prints a one-line size summary for each.
 *
 * The test dataset is an ImmutableDataset constructed with the training dataset's
 * feature ID map and output ID info. Reads the notebook-level trainPath, testPath,
 * labelFactory, newsProc and lowercase variables.
 *
 * @param name      label printed at the start of each summary line
 * @param extractor feature extractor applied to every document
 * @return a pair holding the training dataset (A) and the test dataset (B)
 */
public Pair<Dataset<Label>,Dataset<Label>> mkDatasets(String name, TextFeatureExtractor<Label> extractor) {
    var trainSource = new DirectoryFileSource<>(trainPath,labelFactory,extractor,newsProc,lowercase);
    var testSource = new DirectoryFileSource<>(testPath,labelFactory,extractor,newsProc,lowercase);
    var trainDS = new MutableDataset<>(trainSource);
    var testDS = new ImmutableDataset<>(testSource,trainDS.getFeatureIDMap(),trainDS.getOutputIDInfo(),true);
    // name is passed as a %s argument rather than concatenated into the format string,
    // so a '%' in the name cannot be misread as a format specifier.
    String summaryFormat = "%s %s data size = %d, number of features = %d, number of classes = %d";
    System.out.println(String.format(summaryFormat,name,"training",trainDS.size(),trainDS.getFeatureMap().size(),trainDS.getOutputInfo().size()));
    System.out.println(String.format(summaryFormat,name,"testing",testDS.size(),testDS.getFeatureMap().size(),testDS.getOutputInfo().size()));
    return new Pair<>(trainDS,testDS);
}

// Build the bag-of-words train/test pair; getA() is the training set and getB() the test set.
var bowPair = mkDatasets("bow",bowExtractor);


bow training data size = 11314, number of features = 122024, number of classes = 20
bow testing data size = 7532, number of features = 122024, number of classes = 20


### Simple bag of words

In [25]:
// Linear SGD trainer with a LogMulticlass objective and AdaGrad optimiser
// (the trailing 5 and 42 are presumably epochs and RNG seed — TODO confirm).
// lrTrainer is reused by the unigram and bigram cells below.
var lrTrainer = new LinearSGDTrainer(new LogMulticlass(),new AdaGrad(0.1,0.001),5,42);
var bowModel = lrTrainer.train(bowPair.getA());

// Reproduce the bag-of-words model.
var bowRepro = new ReproUtil(bowModel);
var newBow = bowRepro.reproduceFromProvenance();

In [26]:
// Evaluate the original and reproduced bag-of-words models on the bag-of-words test set.
var bowEval = labelEvaluator.evaluate(bowModel,bowPair.getB());
var newBowEval = labelEvaluator.evaluate((LinearSGDModel)newBow,bowPair.getB());

// Record whether the two evaluation summaries are textually identical, plus the provenance diff.
addToCSV(bowModel.getProvenance().getClassName(),
        String.valueOf(bowEval.toString().equals(newBowEval.toString())),
        ReproUtil.diffProvenance(bowModel.getProvenance(), newBow.getProvenance()), "20 News", "BoW Simple Logistic");

### Unigram 

In [27]:
// Token pipeline with n-gram size 1 (the boolean presumably enables term counting — TODO confirm).
var unigramPipeline = new TokenPipeline(tokenizer, 1, true);
var unigramExtractor = new TextFeatureExtractorImpl<Label>(unigramPipeline);
var unigramPair = mkDatasets("unigram",unigramExtractor);

// Train with the shared lrTrainer, then reproduce the model.
var model = lrTrainer.train(unigramPair.getA());
var repro = new ReproUtil(model);
var newModel = repro.reproduceFromProvenance();

unigram training data size = 11314, number of features = 122024, number of classes = 20
unigram testing data size = 7532, number of features = 122024, number of classes = 20


In [28]:
// Evaluate the original and reproduced unigram models on the unigram test set.
var oldEval = labelEvaluator.evaluate(model,unigramPair.getB());
var newEval = labelEvaluator.evaluate((LinearSGDModel)newModel,unigramPair.getB());

// Record whether the two evaluation summaries are textually identical, plus the provenance diff.
addToCSV(model.getProvenance().getClassName(),
        String.valueOf(oldEval.toString().equals(newEval.toString())),
        ReproUtil.diffProvenance(model.getProvenance(), newModel.getProvenance()), "20 News", "Unigram Logistic");

### Bigrams

In [29]:
// Same as the unigram setup but with n-gram size 2.
var bigramPipeline = new TokenPipeline(tokenizer, 2, true);
var bigramExtractor = new TextFeatureExtractorImpl<Label>(bigramPipeline);
var bigramPair = mkDatasets("bigram",bigramExtractor);

// Redefines model, repro and newModel from the unigram cells.
var model = lrTrainer.train(bigramPair.getA());
var repro = new ReproUtil(model);
var newModel = repro.reproduceFromProvenance(); 

bigram training data size = 11314, number of features = 1143035, number of classes = 20
bigram testing data size = 7532, number of features = 1143035, number of classes = 20


In [30]:
// Evaluate the original and reproduced bigram models on the bigram test set,
// i.e. the test half of the same pair whose training half the model was fitted on.
var oldEval = labelEvaluator.evaluate(model,bigramPair.getB());
var newEval = labelEvaluator.evaluate((LinearSGDModel)newModel,bigramPair.getB());

// Record whether the two evaluation summaries are textually identical, plus the provenance diff.
addToCSV(model.getProvenance().getClassName(),
        String.valueOf(oldEval.toString().equals(newEval.toString())),
        ReproUtil.diffProvenance(model.getProvenance(), newModel.getProvenance()),"20 News", "Bigram Logistic");

# Random Forest Trainer

In [31]:
import org.tribuo.common.tree.RandomForestTrainer;
import org.tribuo.regression.ensemble.AveragingCombiner;


In [32]:
// Base tree for the forest: unlimited depth (Integer.MAX_VALUE), MeanSquaredError impurity and the default seed.
// The 0.5f is presumably the fraction of features considered at each split — TODO confirm argument order.
CARTRegressionTrainer subsamplingTree = new CARTRegressionTrainer(Integer.MAX_VALUE,
                CARTRegressionTrainer.MIN_EXAMPLES, 0.0f, 0.5f, false, new MeanSquaredError(), Trainer.DEFAULT_SEED);
    
// Forest built from the tree trainer with an AveragingCombiner (the 10 is presumably the number of trees).
RandomForestTrainer<Regressor> rfT = new RandomForestTrainer<>(subsamplingTree,new AveragingCombiner(),10);

// Reload the wine quality data; ';' is passed as the CSV separator and "quality" as the response column.
var regressionFactory = new RegressionFactory();
var csvLoader = new CSVLoader<>(';',regressionFactory);

var wineSource = csvLoader.loadDataSource(Paths.get("winequality-red.csv"),"quality");

// Raw-typed splitter using the single-argument constructor (default split settings).
var wineSplitter = new TrainTestSplitter(wineSource);

Dataset<Regressor> trainData = new MutableDataset<>(wineSplitter.getTrain());
Dataset<Regressor> testData = new MutableDataset<>(wineSplitter.getTest());

Model<Regressor> model = rfT.train(trainData);

// Reproduce the forest from its provenance.
ReproUtil reproUtil = new ReproUtil(model.getProvenance());
Model<Regressor> newModel = reproUtil.reproduceFromProvenance();

RegressionEvaluator eval = new RegressionEvaluator();

var oldEvaluation = eval.evaluate(model,testData);
var newEvaluation = eval.evaluate(newModel,testData);

// Record whether the two evaluation summaries are textually identical, plus the provenance diff.
addToCSV(model.getProvenance().getClassName(),
        String.valueOf(oldEvaluation.toString().equals(newEvaluation.toString())),
        ReproUtil.diffProvenance(model.getProvenance(), newModel.getProvenance()), "Wine Quality", "Random Forest Trainer");