In [1]:
%jars ../tribuo/Classification/Experiments/target/tribuo-classification-experiments-4.2.0-SNAPSHOT-jar-with-dependencies.jar
%jars ../tribuo/Classification/SGD/target/tribuo-classification-sgd-4.2.0-SNAPSHOT-jar-with-dependencies.jar
%jars ../tribuo/Common/NearestNeighbour/target/tribuo-common-nearest-neighbour-4.2.0-SNAPSHOT.jar
%jars ../tribuo/Json/target/tribuo-json-4.2.0-SNAPSHOT-jar-with-dependencies.jar
%jars ../tribuo/MultiLabel/SGD/target/tribuo-multilabel-sgd-4.2.0-SNAPSHOT-jar-with-dependencies.jar
%jars ../tribuo/Regression/SGD/target/tribuo-regression-sgd-4.2.0-SNAPSHOT-jar-with-dependencies.jar
%jars ../tribuo/Regression/SLM/target/tribuo-regression-slm-4.2.0-SNAPSHOT-jar-with-dependencies.jar
%jars ../tribuo/Regression/XGBoost/target/tribuo-regression-xgboost-4.2.0-SNAPSHOT-jar-with-dependencies.jar
%jars ../tribuo/Regression/LibLinear/target/tribuo-regression-liblinear-4.2.0-SNAPSHOT-jar-with-dependencies.jar
%jars ../tribuo/Regression/LibSVM/target/tribuo-regression-libsvm-4.2.0-SNAPSHOT-jar-with-dependencies.jar
%jars ../tribuo/Regression/RegressionTree/target/tribuo-regression-tree-4.2.0-SNAPSHOT-jar-with-dependencies.jar

%jars ../tribuo/Reproducibility/target/tribuo-reproducibility-4.2.0-SNAPSHOT-jar-with-dependencies.jar

In [2]:
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Map;

In [3]:
import org.tribuo.*;
import org.tribuo.evaluation.TrainTestSplitter;
import org.tribuo.data.csv.CSVLoader;
import org.tribuo.datasource.ListDataSource;
import org.tribuo.evaluation.TrainTestSplitter;
import org.tribuo.classification.*;
import org.tribuo.classification.evaluation.*;
import org.tribuo.classification.sgd.linear.LogisticRegressionTrainer;
import org.tribuo.classification.sgd.linear.LinearSGDModel;
import org.tribuo.math.optimisers.*;
import org.tribuo.regression.*;
import org.tribuo.regression.evaluation.*;
import org.tribuo.regression.sgd.RegressionObjective;
import org.tribuo.regression.sgd.linear.LinearSGDTrainer;
import org.tribuo.regression.sgd.objectives.SquaredLoss;
import org.tribuo.regression.rtree.CARTRegressionTrainer;
import org.tribuo.regression.rtree.impurity.MeanSquaredError;
import org.tribuo.regression.xgboost.XGBoostRegressionTrainer;
import org.tribuo.util.Util;

import org.tribuo.provenance.DatasetProvenance;

import org.tribuo.reproducibility.ReproUtil;

In [4]:
import org.tribuo.transform.*;
import org.tribuo.transform.transformations.LinearScalingTransformation;
import org.tribuo.classification.*;
import org.tribuo.classification.evaluation.*;
import com.oracle.labs.mlrg.olcut.config.Configurable;
import com.oracle.labs.mlrg.olcut.config.ConfigurationManager;
import com.oracle.labs.mlrg.olcut.config.DescribeConfigurable;
import com.oracle.labs.mlrg.olcut.provenance.*;
import com.oracle.labs.mlrg.olcut.provenance.primitives.*;
import com.oracle.labs.mlrg.olcut.config.json.JsonConfigFactory;

# Setup

In [5]:
// Start ./results/configResults.csv afresh (non-append mode truncates any existing file)
// and write only the header row. try-with-resources flushes and closes the writer even
// if the write fails, so the file handle is never leaked.
try (FileWriter fw = new FileWriter("./results/configResults.csv")) {
    fw.append("Task, Trainer, Model, Equivalent Evaluation, Model Prov Diff, Dataset Name, Datasource\n");
}

/**
 * Makes a single value safe to write as one field of a CSV row.
 * <p>
 * Every line-break sequence (regex {@code \R}) is replaced by a single space so the value
 * stays on one physical line. If the resulting text contains a comma, a double quote or a
 * single quote, embedded double quotes are doubled and the whole value is wrapped in
 * double quotes.
 *
 * @param data the raw field value; must not be {@code null}
 * @return the field text ready to be joined into a CSV row
 */
public String escapeSpecialCharacters(String data) {
    // Normalise line breaks first, then quote the *normalised* text. Quoting the raw
    // input instead would silently put the line breaks back into the output.
    String escapedData = data.replaceAll("\\R", " ");
    if (escapedData.contains(",") || escapedData.contains("\"") || escapedData.contains("'")) {
        escapedData = "\"" + escapedData.replace("\"", "\"\"") + "\"";
    }
    return escapedData;
}

/**
 * Appends one result row to {@code ./results/configResults.csv}.
 * <p>
 * Every column is passed through {@code escapeSpecialCharacters} before being joined with
 * commas, so a value containing a comma, quote or line break cannot shift the columns.
 * The file is opened in append mode for each call and closed again before returning.
 *
 * @param task     value for the "Task" column
 * @param trainer  value for the "Trainer" column
 * @param model    value for the "Model" column
 * @param equal    value for the "Equivalent Evaluation" column
 * @param diff     value for the "Model Prov Diff" column
 * @param dataset  value for the "Dataset Name" column
 * @param datatype value for the "Datasource" column
 * @throws Exception if the file cannot be opened or written
 */
public void addToCSV(String task, String trainer, String model, String equal, String diff, String dataset, String datatype) throws Exception{
    String row = String.join(",",
            escapeSpecialCharacters(task),
            escapeSpecialCharacters(trainer),
            escapeSpecialCharacters(model),
            escapeSpecialCharacters(equal),
            escapeSpecialCharacters(diff),
            escapeSpecialCharacters(dataset),
            escapeSpecialCharacters(datatype)) + "\n";
    // try-with-resources flushes and closes the writer even when the append fails.
    try (FileWriter fw = new FileWriter("./results/configResults.csv", true)) {
        fw.append(row);
    }
}

# Regression Trainers 

In [6]:
// Look up every Trainer declared in configs/all-reg-config.xml, keyed by component name.
var configPath = Paths.get("configs","all-reg-config.xml");
var cm = new ConfigurationManager(configPath.toString());
// Held as Map: only keySet()/get()/entrySet() style access is needed, so the lookup
// result is not downcast to a concrete HashMap (which would fail for any other Map type).
Map<String,Trainer> regressionTrainers = cm.lookupAllMap(Trainer.class);

// CSV loader producing Regressor outputs; the file is read with ';' as the separator.
var regressionFactory = new RegressionFactory();
var csvLoader = new CSVLoader<>(';',regressionFactory);

// "quality" is passed as the response column name.
var wineSource = csvLoader.loadDataSource(Paths.get("data/winequality-red.csv"),"quality");
// Split the source with 0.7f and 0L as the splitter arguments
// (presumably train fraction and RNG seed -- confirm against TrainTestSplitter docs).
var splitter = new TrainTestSplitter<>(wineSource, 0.7f, 0L);
Dataset<Regressor> trainData = new MutableDataset<>(splitter.getTrain());
Dataset<Regressor> testData = new MutableDataset<>(splitter.getTest());

In [7]:
for (String trainerKey : regressionTrainers.keySet()){
    Model<Regressor> model = regressionTrainers.get(trainerKey).train(trainData);
    ReproUtil repro = new ReproUtil(model);
    Model<Regressor> newModel = repro.reproduceFromProvenance();
    
    RegressionEvaluator eval = new RegressionEvaluator();
    var oldEvaluation = eval.evaluate(model,testData);
    var newEvaluation = eval.evaluate(newModel,testData);
    try {
        FileOutputStream fileOut =
            new FileOutputStream("./provenance/regression/" + trainerKey + ".prov", false);
        ObjectOutputStream out = new ObjectOutputStream(fileOut);
        out.writeObject(model.getProvenance());
        out.close();
        fileOut.close();
        System.out.println("Serialized data is saved in /provenance/" + trainerKey + ".prov");
    } catch (IOException i) {
         i.printStackTrace();
    }
    
    addToCSV("Regression", 
             regressionTrainers.get(trainerKey).getClass().toString(),
             model.getProvenance().getClassName(), 
             String.valueOf(oldEvaluation.toString().equals(newEvaluation.toString())), 
             ReproUtil.diffProvenance(model.getProvenance(), newModel.getProvenance()),
             "Wine Quality", "CSV");
}

Serialized data is saved in /provenance/extra-reg.prov
Serialized data is saved in /provenance/linear.prov
.*
optimization finished, #iter = 1916
epsilon = 0.05858504392616126
obj = -230.79094690365463, rho = -5.626991935990098
nSV = 833, nBSV = 152
.*
optimization finished, #iter = 1808
epsilon = 0.07793527107924003
obj = -243.15211945947448, rho = -5.627112057086192
nSV = 837, nBSV = 162
.*
optimization finished, #iter = 1755
epsilon = 0.07251134900140022
obj = -236.37515174534084, rho = -5.607265191604213
nSV = 827, nBSV = 155
.*
optimization finished, #iter = 1747
epsilon = 0.08102216879836943
obj = -234.3816170911098, rho = -5.603957730574681
nSV = 833, nBSV = 151
.*
optimization finished, #iter = 1745
epsilon = 0.08276264571544178
obj = -239.56419636505967, rho = -5.593597188837265
nSV = 840, nBSV = 154
Prob. model for test data: target value = predicted value + z,
z: Laplace distribution e^(-|z|/sigma)/(2sigma),sigma=0.5444889322242289
..*
optimization finished, #iter = 2340
eps

# Classification Trainers

In [8]:
// CSV loader producing Label outputs (this redefines csvLoader for the classification task).
var labelFactory = new LabelFactory();
var csvLoader = new CSVLoader<>(labelFactory);

// Column names are supplied explicitly via irisHeaders; "species" is passed as the
// response column name.
var irisHeaders = new String[]{"sepalLength", "sepalWidth", "petalLength", "petalWidth", "species"};
var irisesSource = csvLoader.loadDataSource(Paths.get("data/bezdekIris.data"),"species",irisHeaders);
// Split the source with 0.7 and 1L as the splitter arguments
// (presumably train fraction and RNG seed -- confirm against TrainTestSplitter docs).
var irisSplitter = new TrainTestSplitter<>(irisesSource,0.7,1L);

var trainingDataset = new MutableDataset<>(irisSplitter.getTrain());
var testingDataset = new MutableDataset<>(irisSplitter.getTest());

// Look up every Trainer declared in configs/all-classification-config.xml, keyed by
// component name (this redefines cm for the classification task).
var configTrainersPath = Paths.get("configs","all-classification-config.xml");
var cm = new ConfigurationManager(configTrainersPath.toString());
// Held as Map: only keySet()/get()/entrySet() style access is needed, so the lookup
// result is not downcast to a concrete HashMap (which would fail for any other Map type).
Map<String,Trainer> classificationTrainers = cm.lookupAllMap(Trainer.class);

In [9]:
for (String trainerKey : classificationTrainers.keySet()){
    Model<Label> model = classificationTrainers.get(trainerKey).train(trainingDataset);
    ReproUtil repro = new ReproUtil(model);
    Model<Label> newModel = repro.reproduceFromProvenance();
    
    var eval = new LabelEvaluator();
    var oldEvaluation = eval.evaluate((Model<Label>)model, testingDataset);
    var newEvaluation = eval.evaluate((Model<Label>)newModel, testingDataset);
    
    try {
        FileOutputStream fileOut =
            new FileOutputStream("./provenance/classification/" + trainerKey + ".prov", false);
        ObjectOutputStream out = new ObjectOutputStream(fileOut);
        out.writeObject(model.getProvenance());
        out.close();
        fileOut.close();
        System.out.println("Serialized data is saved in /provenance/" + trainerKey + ".prov");
    } catch (IOException i) {
         i.printStackTrace();
    }
    
    addToCSV("Classification", 
            classificationTrainers.get(trainerKey).getClass().toString(),
             model.getProvenance().getClassName(), 
             String.valueOf(oldEvaluation.toString().equals(newEvaluation.toString())), 
             ReproUtil.diffProvenance(model.getProvenance(), newModel.getProvenance()),
             "Irises", "CSV");
}

Serialized data is saved in /provenance/mnb.prov
*
optimization finished, #iter = 36
C = 0.450676519301985
obj = 4.060713540406142, rho = -0.15557661251414256
nSV = 33, nBSV = 23
Total nSV = 33
*
optimization finished, #iter = 34
C = 0.43377339533652987
obj = 4.204548460300719, rho = -0.18441634078532076
nSV = 32, nBSV = 24
Total nSV = 32
*
optimization finished, #iter = 50
C = 0.4092944865965712
obj = 3.8225298467191338, rho = -0.034279929410371404
nSV = 34, nBSV = 21
Total nSV = 34
*
optimization finished, #iter = 53
C = 0.32645358973724875
obj = 3.1283710053227174, rho = -0.17054925617865305
nSV = 35, nBSV = 23
Total nSV = 35
*
optimization finished, #iter = 44
C = 0.4191082423024166
obj = 3.6510602759484465, rho = -0.23177429322622808
nSV = 35, nBSV = 23
Total nSV = 35
*
optimization finished, #iter = 44
C = 0.3336332759961269
obj = 3.847052253190065, rho = -0.15095910731550125
nSV = 42, nBSV = 28
*
optimization finished, #iter = 30
C = 0.1563558336968146
obj = 2.0040132419155365, 