
### This Interactive Notebook was generated by ML.NET Tooling.

The code below demonstrates how to

1. Define the model input and output schema
1. Load data from a text file into an IDataView
1. Set up the training pipeline with data transforms
1. Choose an algorithm and append it to the pipeline
1. Train the model
1. Evaluate the model
1. Consume the model


## Install the necessary NuGet packages for training an ML.NET model and plotting:

In [1]:

/* ML.NET Model Builder generated Notebook file. Notebook files contain both code snippets and rich text elements.
Use the "run" button in the left margin to execute each code snippet and explore ML.NET. */

#i "nuget:https://pkgs.dev.azure.com/dnceng/public/_packaging/dotnet5/nuget/v3/index.json" 
#i "nuget:https://pkgs.dev.azure.com/dnceng/public/_packaging/dotnet-tools/nuget/v3/index.json" 

#r "nuget:Microsoft.ML,1.5.5"
#r "nuget:Microsoft.Data.Analysis,0.4.0"
#r "nuget:Microsoft.ML.AutoML,0.17.1"
#r "nuget:XPlot.Plotly.Interactive, 4.0.1"


In [1]:
using static Microsoft.DotNet.Interactive.Formatting.PocketViewTags;
using Microsoft.DotNet.Interactive.Formatting;
using Microsoft.Data.Analysis;

In [1]:
// Register your dataset into a dataframe to nicely display metrics

using Microsoft.AspNetCore.Html;
using Microsoft.DotNet.Interactive.Formatting;
using static Microsoft.DotNet.Interactive.Formatting.PocketViewTags;

// Render a DataFrame as an HTML table (an index column plus one column per
// DataFrame column) whenever the notebook displays one as "text/html".
Formatter.Register<DataFrame>((df, writer) =>
{
    // Only the first rows are rendered so that large frames stay readable.
    const int maxPreviewRows = 20;

    // Header: an italic "index" cell followed by one cell per column name.
    var headerCells = new List<IHtmlContent>();
    headerCells.Add(th(i("index")));
    headerCells.AddRange(df.Columns.Select(column => (IHtmlContent) th(column.Name)));

    // Body: one list of cells per rendered row, starting with the row index.
    var bodyRows = new List<List<IHtmlContent>>();
    var rowLimit = Math.Min(maxPreviewRows, df.Rows.Count);
    for (var rowIndex = 0; rowIndex < rowLimit; rowIndex++)
    {
        var rowCells = new List<IHtmlContent>();
        rowCells.Add(td(rowIndex));
        foreach (var value in df.Rows[rowIndex])
        {
            rowCells.Add(td(value));
        }

        bodyRows.Add(rowCells);
    }

    var html = table(
        thead(headerCells),
        tbody(bodyRows.Select(rowCells => tr(rowCells))));

    writer.Write(html);
}, "text/html");

In [1]:
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using Microsoft.ML;
using Microsoft.ML.Data;

## Define the model input and output schemas:

In [1]:
/// <summary>
/// Model input schema: the columns loaded from the data file for training.
/// Each property is tied to a file column index through <c>LoadColumn</c> and is
/// exposed to the pipeline under the name given in <c>ColumnName</c>.
/// </summary>
public class ModelInput
{
    /// <summary>File column 0, exposed as "vendor_id".</summary>
    [LoadColumn(0)]
    [ColumnName("vendor_id")]
    public string Vendor_id { get; set; }

    /// <summary>File column 1, exposed as "rate_code".</summary>
    [LoadColumn(1)]
    [ColumnName("rate_code")]
    public float Rate_code { get; set; }

    /// <summary>File column 2, exposed as "passenger_count".</summary>
    [LoadColumn(2)]
    [ColumnName("passenger_count")]
    public float Passenger_count { get; set; }

    /// <summary>File column 3, exposed as "trip_time_in_secs".</summary>
    [LoadColumn(3)]
    [ColumnName("trip_time_in_secs")]
    public float Trip_time_in_secs { get; set; }

    /// <summary>File column 4, exposed as "trip_distance".</summary>
    [LoadColumn(4)]
    [ColumnName("trip_distance")]
    public float Trip_distance { get; set; }

    /// <summary>File column 5, exposed as "payment_type".</summary>
    [LoadColumn(5)]
    [ColumnName("payment_type")]
    public string Payment_type { get; set; }

    /// <summary>File column 6, exposed as "fare_amount".</summary>
    [LoadColumn(6)]
    [ColumnName("fare_amount")]
    public float Fare_amount { get; set; }
}


In [1]:
// Define the model output schema (what the model will return)
/// <summary>
/// Model output schema: the value read back from the model for one prediction.
/// </summary>
public class ModelOutput
{
    /// <summary>
    /// Predicted value. The column name is spelled out explicitly; it matches the
    /// property name, so the mapping is the same as the implicit default.
    /// </summary>
    [ColumnName("Score")]
    public float Score { get; set; }
}



## Create MLContext and load training data:

In [1]:
// Define path to training data
// NOTE(review): this is an absolute path on the original author's machine; point it at
// your own copy of the CSV before running the notebook.
// NOTE(review): the file name says "test" although the variable holds the training data
// path — confirm this is the intended file.
string trainDataPath = @"C:\Users\t-jhoughton\source\repos\UserStudy\Explainability-Sample\Task1\Task1\Data\taxi-fare-test.csv";


In [1]:
// The MLContext is the entry point for every ML.NET operation in this notebook
var mlContext = new MLContext();

// Read the comma-separated file (first line is a header row) into an IDataView,
// using the ModelInput class to describe the column layout
IDataView trainData = mlContext.Data.LoadFromTextFile<ModelInput>(
    trainDataPath,
    separatorChar: ',',
    hasHeader: true,
    allowQuoting: true,
    allowSparse: false);

// Show the schema of the loaded data
display(trainData.Schema);



In [1]:
// Returns the first `numberOfRows` rows of `dataView`, materialized as ModelInput objects.
// reuseRowObject is false, so every returned row is a separate object.
public static List<ModelInput> Head(MLContext mlContext, IDataView dataView, int numberOfRows = 4) =>
    mlContext.Data
        .CreateEnumerable<ModelInput>(dataView, reuseRowObject: false)
        .Take(numberOfRows)
        .ToList();

// Show 5 rows of the loaded training data
display(h4("Showing 5 rows from training DataView:"));

var fewRows = Head(mlContext, trainData, 5);
display(fewRows);

## Create the training pipeline, choose an algorithm, and train the model:

In [1]:
using Microsoft.ML.Data;
using Microsoft.ML.Trainers.LightGbm;
using Microsoft.ML.Trainers;
using Microsoft.ML;


In [1]:
// Build the training pipeline: data transforms first, then the trainer appended last.
//   1. One-hot encode the two string columns.
//   2. Replace missing values in the four numeric input columns.
//   3. Concatenate all inputs into a single "Features" column.
//   4. Train a LightGBM regressor that predicts "fare_amount" from "Features".
var pipeline = mlContext.Transforms.Categorical.OneHotEncoding(new[]
    {
        new InputOutputColumnPair(@"vendor_id", @"vendor_id"),
        new InputOutputColumnPair(@"payment_type", @"payment_type"),
    })
    .Append(mlContext.Transforms.ReplaceMissingValues(new[]
    {
        new InputOutputColumnPair(@"rate_code", @"rate_code"),
        new InputOutputColumnPair(@"passenger_count", @"passenger_count"),
        new InputOutputColumnPair(@"trip_time_in_secs", @"trip_time_in_secs"),
        new InputOutputColumnPair(@"trip_distance", @"trip_distance"),
    }))
    .Append(mlContext.Transforms.Concatenate(@"Features", new[]
    {
        @"vendor_id",
        @"payment_type",
        @"rate_code",
        @"passenger_count",
        @"trip_time_in_secs",
        @"trip_distance",
    }))
    .Append(mlContext.Regression.Trainers.LightGbm(new LightGbmRegressionTrainer.Options
    {
        NumberOfLeaves = 26,
        MinimumExampleCountPerLeaf = 18,
        NumberOfIterations = 49,
        MaximumBinCountPerFeature = 59,
        LearningRate = 0.000155930943504775F,
        LabelColumnName = @"fare_amount",
        FeatureColumnName = @"Features",
        Booster = new GradientBooster.Options
        {
            SubsampleFraction = 0.437835793792913F,
            FeatureFraction = 0.991583386423179F,
            L1Regularization = 2E-10F,
            L2Regularization = 0.000343571254284139F,
        },
    }));

// Fit the whole pipeline (transforms + trainer) to the training data
var model = pipeline.Fit(trainData);



## Consume the model

In [1]:
// Define sample model input
// A single hand-written trip; Fare_amount is left unset because it is the value being predicted
var sampleData = new ModelInput
{
    Vendor_id = "VTS",
    Rate_code = 1F,
    Passenger_count = 1F,
    Trip_time_in_secs = 1140F,
    Trip_distance = 3.75F,
    Payment_type = "CRD",
};

// A prediction engine wraps the trained model for one-at-a-time predictions
var predEngine = mlContext.Model.CreatePredictionEngine<ModelInput, ModelOutput>(model);

// Run the sample through the model
var predictionResult = predEngine.Predict(sampleData);

Console.WriteLine("Using model to make single prediction -- Comparing actual Fare_amount with predicted Fare_amount from sample data...\n\n");

// Echo the inputs that were fed to the model
Console.WriteLine($"Vendor_id: {sampleData.Vendor_id}");
Console.WriteLine($"Rate_code: {sampleData.Rate_code}");
Console.WriteLine($"Passenger_count: {sampleData.Passenger_count}");
Console.WriteLine($"Trip_time_in_secs: {sampleData.Trip_time_in_secs}");
Console.WriteLine($"Trip_distance: {sampleData.Trip_distance}");
Console.WriteLine($"Payment_type: {sampleData.Payment_type}");

// And the model's answer
Console.WriteLine($"\n\nPredicted Fare_amount: {predictionResult.Score}\n\n");


## Evaluate the model:

In [1]:
// Evaluate the pipeline with 5-fold cross validation against the "fare_amount" label
// Learn more about cross validation at https://aka.ms/mlnet-cross-validation
var crossValidationResults = mlContext.Regression.CrossValidate(
    trainData,
    pipeline,
    numberOfFolds: 5,
    labelColumnName: "fare_amount");

// One value per fold for each metric of interest
var L1 = crossValidationResults.Select(fold => fold.Metrics.MeanAbsoluteError);
var L2 = crossValidationResults.Select(fold => fold.Metrics.MeanSquaredError);
var RMS = crossValidationResults.Select(fold => fold.Metrics.RootMeanSquaredError);
var lossFunction = crossValidationResults.Select(fold => fold.Metrics.LossFunction);
var R2 = crossValidationResults.Select(fold => fold.Metrics.RSquared);

// Two-column summary table: metric name, and its average across folds (up to three decimals)
var metricNames = new StringDataFrameColumn(
    "Metric Name",
    new[]
    {
        "Average L1 Loss",
        "Average L2 Loss",
        "Average RMS",
        "Average Loss Function",
        "Average R-Squared",
    });
var metricValues = new StringDataFrameColumn(
    "Value",
    new[]
    {
        L1.Average().ToString("0.###"),
        L2.Average().ToString("0.###"),
        RMS.Average().ToString("0.###"),
        lossFunction.Average().ToString("0.###"),
        R2.Average().ToString("0.###"),
    });
var stats = new DataFrame(metricNames, metricValues);

// Trailing expression (no semicolon): the notebook displays it as the cell result
stats



### This Interactive Notebook was generated by ML.NET Tooling.

The code below demonstrates several methods to explain your model, including how to get and display

1. A Histogram of the distribution of number of instances
1. A Scatter Plot
1. A scatter plot comparing actual values to predicted values
1. The importance of different features

In [1]:
using System;
using System.Collections.Generic;
using System.Collections.Immutable;
using System.IO;
using System.Linq;
using XPlot.Plotly;

## Compare Distribution of Number of Instances

In [1]:
// Pull a sample of the data into an array for plotting
int numberOfRows = 5000;

// The column name comes from the loaded dataset
float[] fare_amount = trainData
    .GetColumn<float>("fare_amount")
    .Take(numberOfRows)
    .ToArray();

// Histogram of the sampled fare_amount values
var histogram = Chart.Plot(
    new Histogram
    {
        x = fare_amount,
        autobinx = false,
        nbinsx = 20,
    });

var layout = new Layout.Layout { title = "fare_amount vs Number of Instances" };
histogram.WithLayout(layout);
histogram.WithXTitle("fare_amount");
histogram.WithYTitle("Number of Instances");

display(histogram);



In [1]:
// Sample size for the scatter plot
int numberOfRows = 2000;

// Same rows from both columns, so the arrays line up point by point
float[] fare_amount = trainData
    .GetColumn<float>("fare_amount")
    .Take(numberOfRows)
    .ToArray();
float[] rate_code = trainData
    .GetColumn<float>("rate_code")
    .Take(numberOfRows)
    .ToArray();

// Scatter of fare_amount (y) against rate_code (x).
// Note: despite its name, this chart plots rate_code, not passenger counts.
var chartFareVsPassengers = Chart.Plot(
    new Scatter { x = rate_code, y = fare_amount, mode = "markers" });

var layout = new Layout.Layout { title = "Plot fare_amount depending on rate_code" };
chartFareVsPassengers.WithLayout(layout);
chartFareVsPassengers.Width = 500;
chartFareVsPassengers.Height = 500;
chartFareVsPassengers.WithXTitle("rate_code");
chartFareVsPassengers.WithYTitle("fare_amount");
chartFareVsPassengers.WithLegend(false);

display(chartFareVsPassengers);



## Compare actual values to predicted values in a scatter plot

In [1]:
// Use the model to make batch predictions on training data
var testResults = model.Transform(trainData);

// Get the actual values from the dataset.
// Materialized into an array because the values are read twice below
// (once for Max() and once when the chart is rendered).
var trueValues = testResults.GetColumn<float>("fare_amount").ToArray();

// Get the predicted values from the test results.
// The prediction is read from the "Score" column (the column ModelOutput.Score maps to),
// not from the "rate_code" input feature.
var predictedValues = testResults.GetColumn<float>("Score").ToArray();

// Create scatter plot of actual vs predicted values
var predictedVsTrue = new Scattergl()
{
    x = trueValues,
    y = predictedValues,
    mode = "markers",
};

// Upper end of the reference diagonal: the largest value on either axis
var maximumValue = Math.Max(trueValues.Max(), predictedValues.Max());

// y = x reference line; points on this line are perfect predictions
var perfectLine = new Scattergl()
{
    x = new[] {0f, maximumValue},
    y = new[] {0f, maximumValue},
    mode = "lines",
};

var chart = Chart.Plot(new[] {predictedVsTrue, perfectLine });
chart.WithXTitle("Actual Values");
chart.WithYTitle("Predicted Values");
chart.WithLegend(false);
chart.Width = 600;
chart.Height = 600;
display(chart);



## Calculate and graph the Permutation Feature Importance (PFI)

In [1]:
// Calculate PFI
// The last transformer in the fitted chain is used as the predictor
var predictor = (ISingleFeaturePredictionTransformer<object>) ((IEnumerable<ITransformer>)model).Last();
var preprocessedTrainData = model.Transform(trainData);

// Read the per-slot names of the "Features" vector from the column annotations.
// NOTE: "Features" must match the featureColumnName used in the trainer; the
// annotation name "SlotNames" is always the same regardless of trainer.
VBuffer<ReadOnlyMemory<char>> nameBuffer = default;
preprocessedTrainData.Schema["Features"].Annotations.GetValue("SlotNames", ref nameBuffer);
var featureColumnNames = nameBuffer.DenseValues().ToList();

// One metrics-statistics entry per feature slot
ImmutableArray<RegressionMetricsStatistics> permutationFeatureImportance =
    mlContext.Regression.PermutationFeatureImportance(
        predictor,
        preprocessedTrainData,
        permutationCount: 1,
        labelColumnName: "fare_amount");

// Rank the slots by the absolute mean R-Squared value, largest first,
// remembering each slot's original index so it can be mapped back to a name
var featureImportanceMetrics = permutationFeatureImportance
    .Select((metric, index) => new { index, metric.RSquared })
    .OrderByDescending(entry => Math.Abs(entry.RSquared.Mean));

// Parallel lists in ranked order: slot name (left-aligned, padded to 20 characters) and mean R-Squared
var featureNames = featureImportanceMetrics
    .Select(entry => $"{featureColumnNames[entry.index],-20}")
    .ToList();
var featurePFI = featureImportanceMetrics
    .Select(entry => entry.RSquared.Mean)
    .ToList();

var featureImportance = new DataFrame(
    new StringDataFrameColumn("Feature", featureNames.ToArray()),
    new DoubleDataFrameColumn("R-Squared Impact", featurePFI.ToArray()));

// Trailing expression (no semicolon): the notebook displays it as the cell result
featureImportance



In [1]:
// Graph the PFI results as a bar chart: one bar per feature slot, in ranked order
var pfiBar = new Bar
{
    x = featureNames,
    y = featurePFI,
    dy = featurePFI[0] / 100,
};

var pfiChart = Chart.Plot(pfiBar);

// Fixed 600x600 canvas
pfiChart.Width = 600;
pfiChart.Height = 600;

// Axis captions
pfiChart.WithXTitle("Feature");
pfiChart.WithYTitle("Contribution (delta R-Squared)");

display(pfiChart);