# Multivariate Linear Regression

In [1]:
import $ivy.`com.github.tototoshi::scala-csv:1.3.5`
import $file.^.datasmarts.ml.toy.scripts.SimpleLinearRegression, SimpleLinearRegression._
import scala.util.Random

[32mimport [39m[36m$ivy.$                                      
[39m
[32mimport [39m[36m$file.$                                                 , SimpleLinearRegression._
[39m
[32mimport [39m[36mscala.util.Random[39m

## Data

On this occasion we will use the [Wine Quality Dataset](https://raw.githubusercontent.com/jesus-a-martinez-v/toy-ml/master/src/main/resources/data/8/winequality-white.csv). Let's load it:

In [4]:
// Relative location of the dataset files (presumably resolved against the notebook's working directory).
val BASE_DATA_PATH = "../../resources/data"
val wineQualityDataPath = s"$BASE_DATA_PATH/8/winequality-white.csv"

// Load the CSV as-is; the cell output shows every cell arriving wrapped as Text(...).
val rawData = loadCsv(wineQualityDataPath)

// Dataset dimensions; the column count is taken from the first row.
val numberOfRows = rawData.size
val numberOfColumns = rawData.head.size
println(s"Number of rows in dataset: $numberOfRows")
println(s"Number of columns in dataset: $numberOfColumns")

// Convert the columns one at a time, threading the partially converted dataset through the fold.
val data = (0 until numberOfColumns).foldLeft(rawData) { (dataset, columnIndex) =>
  textColumnToNumeric(dataset, columnIndex)
}

Number of rows in dataset: 4898
Number of columns in dataset: 12


[36mBASE_DATA_PATH[39m: [32mString[39m = [32m"../../resources/data"[39m
[36mwineQualityDataPath[39m: [32mString[39m = [32m"../../resources/data/8/winequality-white.csv"[39m
[36mrawData[39m: [32mVector[39m[[32mVector[39m[[32mData[39m]] = [33mVector[39m(
  [33mVector[39m(
    Text(7),
    Text(0.27),
    Text(0.36),
    Text(20.7),
    Text(0.045),
    Text(45),
    Text(170),
    Text(1.001),
    Text(3),
    Text(0.45),
[33m...[39m
[36mnumberOfRows[39m: [32mInt[39m = [32m4898[39m
[36mnumberOfColumns[39m: [32mInt[39m = [32m12[39m
[36mdata[39m: [32mVector[39m[[32mVector[39m[[32mData[39m]] = [33mVector[39m(
  [33mVector[39m(
    Numeric(7.0),
    Numeric(0.27),
    Numeric(0.36),
    Numeric(20.7),
    Numeric(0.045),
    Numeric(45.0),
    Numeric(170.0),
    Numeric(1.001),
    Numeric(3.0),
    Numeric(0.45),
[33m...[39m

In [5]:
/**
 * Returns a copy of `vector` in which the element at `index` is replaced by `newValue`.
 *
 * Delegates to the standard `Vector.updated`, so an out-of-range index (negative, or
 * greater than or equal to the vector's length) fails with an `IndexOutOfBoundsException`
 * instead of silently overwriting a different element.
 *
 * @param vector   the vector to copy; it is not modified
 * @param newValue the value to store at `index`
 * @param index    zero-based position to replace; must satisfy `0 <= index < vector.length`
 * @tparam T       element type
 * @return a new vector identical to `vector` except at position `index`
 */
def updatedVector[T](vector: Vector[T], newValue: T, index: Int): Vector[T] =
  vector.updated(index, newValue)

defined [32mfunction[39m [36mupdatedVector[39m

In [6]:
/**
 * Computes the linear-regression prediction for a single row.
 *
 * The last column of `row` is excluded from the computation; every other column is
 * multiplied by its coefficient and the products are summed. `coefficients.head` is added
 * at the end as the intercept, so column `i` pairs with `coefficients(i + 1)`.
 *
 * @param row          one dataset row; must be non-empty, and `.get` is called on the result
 *                     of `getNumericValue` for every column but the last, so a cell for which
 *                     that helper yields no value makes the call fail
 * @param coefficients intercept followed by one weight per feature column
 * @return intercept plus the weighted sum of the feature columns
 */
def predictLinearRegression(row: Vector[Data], coefficients: Vector[Double]): Double = {
  // Every column except the last one takes part in the weighted sum.
  val features = row.init

  val weightedSum = features.zipWithIndex.foldLeft(0.0) { case (runningTotal, (feature, position)) =>
    // Offset by one because slot 0 of `coefficients` holds the intercept.
    runningTotal + coefficients(position + 1) * getNumericValue(feature).get
  }

  weightedSum + coefficients.head
}

defined [32mfunction[39m [36mpredictLinearRegression[39m

In [7]:
/**
 * Estimates linear-regression coefficients with stochastic gradient descent.
 *
 * The result has one entry per column of the first training row: slot 0 is the
 * intercept and slot `i + 1` is the weight of feature column `i`. The last column of
 * each row is used as the target value. All coefficients start at 0.0 and are updated
 * once per row, for `numberOfEpochs` passes over `train`.
 *
 * @param train          training rows; `train.head` is read, so it must be non-empty, and
 *                       every row is expected to have as many columns as the first one
 * @param learningRate   step size applied to every per-row update
 * @param numberOfEpochs number of complete passes over `train`
 * @return the fitted coefficients, intercept first; always `train.head.length` entries
 */
def coefficientsLinearRegressionSgd(train: Dataset, learningRate: Double, numberOfEpochs: Int): Vector[Double] = {
  val initialCoefficients = Vector.fill(train.head.length)(0.0)

  // Folds (rather than a for-comprehension with value definitions between generators) make
  // sure each row's error is measured against the coefficients left by the previous row;
  // the comprehension form would evaluate every prediction of an epoch before any update.
  (1 to numberOfEpochs).foldLeft(initialCoefficients) { (epochStart, _) =>
    train.foldLeft(epochStart) { (coefficients, row) =>
      val predicted = predictLinearRegression(row, coefficients)
      val actual = getNumericValue(row.last).get
      val step = learningRate * (predicted - actual)

      // Map over the existing coefficients so the vector keeps its length: the intercept
      // moves by the plain step, each weight by the step scaled by its feature value.
      coefficients.zipWithIndex.map {
        case (intercept, 0) => intercept - step
        case (weight, position) => weight - step * getNumericValue(row(position - 1)).get
      }
    }
  }
}

defined [32mfunction[39m [36mcoefficientsLinearRegressionSgd[39m

In [8]:
/**
 * Trains a linear-regression model on `train` and predicts a value for every row of `test`.
 *
 * @param train      rows used to fit the coefficients
 * @param test       rows to predict; one prediction is produced per row
 * @param parameters must contain "learningRate" (cast to Double) and "numberOfEpochs"
 *                   (cast to Int); a missing key or a value of another type makes the call fail
 * @return the predictions for `test`, one per row
 */
def linearRegressionSgd(train: Dataset, test: Dataset, parameters: Parameters) = {
  // Hyper-parameters are read and cast in place, learning rate first.
  val fittedCoefficients = coefficientsLinearRegressionSgd(
    train,
    parameters("learningRate").asInstanceOf[Double],
    parameters("numberOfEpochs").asInstanceOf[Int]
  )

  test.map(predictLinearRegression(_, fittedCoefficients))
}

defined [32mfunction[39m [36mlinearRegressionSgd[39m