<a href="https://colab.research.google.com/github/lezwon/Data-Science-from-Scratch/blob/master/chapter_5_statistics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
/// A one-dimensional list of single-precision values.
typealias Vector = [Float]

In [0]:
extension Vector {
    /// Adds the elements from first to last, starting from 0.
    ///
    /// - Returns: The total of all elements; 0 for an empty vector.
    func sum() -> Float {
        var total: Float = 0
        for value in self {
            total += value
        }
        return total
    }

    /// Arithmetic mean: the sum of the elements divided by their count.
    ///
    /// - Returns: The average value. An empty vector divides 0 by 0 and
    ///   therefore yields NaN.
    func mean() -> Float {
        let element_count = Float(self.count)
        return self.sum() / element_count
    }
}

In [0]:
/// Dot product: the sum of the pairwise products of `v` and `w`.
///
/// - Parameters:
///   - v: First vector.
///   - w: Second vector; asserted to have the same length as `v`.
/// - Returns: `v[0] * w[0] + v[1] * w[1] + ...`, or 0 for empty vectors.
func dot(_ v: Vector, _ w: Vector) -> Float {
  assert(v.count == w.count, "Vector lengths do not match.")
  var total: Float = 0
  for (left, right) in zip(v, w) {
    total += left * right
  }
  return total
}

In [0]:
/// A list of row vectors.
typealias Matrix = [Vector]

In [0]:
extension Matrix {
  /// Dimensions of the matrix as `(rows, columns)`.
  ///
  /// The column count is read from the first row only; other rows are not
  /// checked. A matrix with no rows reports 0 columns.
  func shape() -> (Int, Int) {
    let row_count = self.count
    guard let first_row = self.first else {
      return (row_count, 0)
    }
    return (row_count, first_row.count)
  }

  /// Row `i` of the matrix (zero-based).
  func get_row(_ i: Int) -> Vector {
    let row = self[i]
    return row
  }

  /// Column `j` of the matrix (zero-based): element `j` of every row,
  /// collected from the first row to the last.
  func get_column(_ j: Int) -> Vector {
    var column = Vector()
    column.reserveCapacity(self.count)
    for row in self {
      column.append(row[j])
    }
    return column
  }
}

In [0]:
// Sample matrix with three rows of three values each.
let m: Matrix = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]

In [7]:
// Row and column counts of m, returned as a tuple.
m.shape()

▿ 2 elements
  - .0 : 3
  - .1 : 3


In [8]:
// Row at index 1 of m (indices are zero-based, so this is the second row).
m.get_row(1)

▿ 3 elements
  - 0 : 4.0
  - 1 : 5.0
  - 2 : 6.0


In [9]:
// Mean of the row at index 1 of m.
m.get_row(1).mean()

5.0


In [10]:
// Column at index 1 of m: element 1 of every row.
m.get_column(1)

▿ 3 elements
  - 0 : 2.0
  - 1 : 5.0
  - 2 : 8.0


In [0]:
/// Builds an `i` x `j` matrix whose entry at (row, column) is
/// `entry_fn(row, column)`.
///
/// - Parameters:
///   - i: Number of rows.
///   - j: Number of columns.
///   - entry_fn: Called once per cell, row by row, with the zero-based row
///     and column indices; its result becomes that cell's value.
/// - Returns: The generated matrix.
func create_matrix(_ i: Int,_ j: Int,_ entry_fn: (Int, Int) -> Float) -> Matrix{
  var rows = Matrix()
  for row_index in 0..<i {
    var row = Vector()
    for column_index in 0..<j {
      row.append(entry_fn(row_index, column_index))
    }
    rows.append(row)
  }
  return rows
}

In [0]:
// Seven sample values (an odd count) used by the median and quantile cells.
var median_array: Vector = [ 2, 4, 6, 1, 7, 9, 4 ]

In [0]:
extension Vector {
  /// Middle element of the sorted values; meant for vectors with an odd count.
  ///
  /// Traps on an empty vector. Prefer `median()`, which picks the right
  /// helper and handles empty input.
  func median_odd() -> Float {
    return self.sorted()[self.count / 2]
  }

  /// Average of the two middle elements of the sorted values; meant for
  /// vectors with an even count.
  ///
  /// Traps when the vector has fewer than two elements. Prefer `median()`.
  func median_even() -> Float {
    let mid_pt = self.count / 2
    let sorted_array = self.sorted()
    return (sorted_array[mid_pt - 1] + sorted_array[mid_pt]) / 2
  }

  /// Median of the values.
  ///
  /// - Returns: The middle value for an odd count, the average of the two
  ///   middle values for an even count, or NaN for an empty vector (instead
  ///   of trapping on an out-of-range index).
  func median() -> Float {
    guard !self.isEmpty else { return .nan }
    return self.count % 2 == 0 ? median_even() : median_odd()
  }

  /// Value found at fraction `p` of the way through the sorted values.
  ///
  /// - Parameter p: Fraction of the data lying below the result. Values
  ///   outside 0...1 are clamped to that range.
  /// - Returns: The element at index `Int(count * p)` of the sorted values,
  ///   capped at the last element so that `p == 1` yields the maximum rather
  ///   than indexing past the end. NaN if the vector is empty or `p` is NaN.
  func quantile(_ p: Float) -> Float {
    guard !self.isEmpty, !p.isNaN else { return .nan }
    let ordered = self.sorted()
    // `Swift.` is required: inside an Array extension the bare names resolve
    // to the instance methods `min()` / `max()`.
    let fraction = Swift.min(Swift.max(p, 0), 1)
    let raw_index = Int(Float(ordered.count) * fraction)
    let i = Swift.min(raw_index, ordered.count - 1)
    return ordered[i]
  }

  /// Most frequent value(s).
  ///
  /// - Returns: Every value that occurs the maximum number of times, in
  ///   ascending order, or an empty vector when there are no elements.
  func mode() -> Vector {
    var counter: [Float: Int] = [:]
    for value in self {
      counter[value, default: 0] += 1
    }
    guard let max_frequency = counter.values.max() else {
      return []
    }
    // Dictionary iteration order is unspecified, so sort the tied keys to
    // make the result reproducible from run to run.
    return counter.filter({ $0.value == max_frequency }).keys.sorted()
  }
}

In [14]:
// Median of the sample values.
median_array.median()

4.0


In [15]:
// Value at the 0.2 quantile of the sample values.
median_array.quantile(0.2)

2.0


In [0]:
// Redeclares median_array with one more 2 appended, so 2 and 4 each occur twice.
var median_array: Vector = [ 2, 4, 6, 1, 7, 9, 4, 2 ]

In [17]:
// Most frequent value(s) in median_array.
median_array.mode()

▿ 2 elements
  - 0 : 2.0
  - 1 : 4.0


In [0]:
extension Vector {
  /// Spread between the largest and smallest element; 0 for an empty vector.
  func data_range() -> Float {
    guard let largest = self.max(), let smallest = self.min() else {
      return 0
    }
    return largest - smallest
  }

  /// Each element's deviation from the mean (`element - mean`), in the
  /// original element order.
  func mean_distance() -> Vector {
    let center = self.mean()
    var deviations = Vector()
    deviations.reserveCapacity(self.count)
    for value in self {
      deviations.append(value - center)
    }
    return deviations
  }

  /// Each element multiplied by itself.
  func squared() -> Vector {
    var squares = Vector()
    squares.reserveCapacity(self.count)
    for value in self {
      squares.append(value * value)
    }
    return squares
  }

  /// Sample variance: the sum of squared deviations from the mean divided
  /// by `count - 1`.
  func variance() -> Float {
    let sum_of_squares = self.mean_distance().squared().sum()
    let degrees_of_freedom = Float(self.count - 1)
    return sum_of_squares / degrees_of_freedom
  }

  /// Sample standard deviation: the square root of `variance()`.
  func std_dev() -> Float {
    let spread = self.variance()
    return spread.squareRoot()
  }

  /// Difference between the 0.75 quantile and the 0.25 quantile.
  func interquantile_range() -> Float {
    let upper = self.quantile(0.75)
    let lower = self.quantile(0.25)
    return upper - lower
  }
}

In [19]:
// Largest value minus smallest value in median_array.
median_array.data_range()

8.0


In [20]:
// Deviation of every element of median_array from the mean.
median_array.mean_distance()

▿ 8 elements
  - 0 : -2.375
  - 1 : -0.375
  - 2 : 1.625
  - 3 : -3.375
  - 4 : 2.625
  - 5 : 4.625
  - 6 : -0.375
  - 7 : -2.375


In [21]:
// Sample variance of median_array (divides by count - 1).
median_array.variance()

7.696429


In [22]:
// Standard deviation of median_array: the square root of its variance.
median_array.std_dev()

2.7742438


In [23]:
// Difference between the 0.75 and 0.25 quantiles of median_array.
median_array.interquantile_range()

5.0


In [0]:
/// Sample covariance of two equal-length vectors.
///
/// - Parameters:
///   - w: First vector.
///   - v: Second vector; asserted to have the same length as `w`.
/// - Returns: The dot product of the two vectors' deviations from their
///   means, divided by `w.count - 1`.
func covariance(_ w: Vector, _ v: Vector) -> Float {
  assert(w.count == v.count, "Vector lengths do not match")
  let w_deviations = w.mean_distance()
  let v_deviations = v.mean_distance()
  let degrees_of_freedom = Float(w.count - 1)
  return dot(w_deviations, v_deviations) / degrees_of_freedom
}

In [0]:
// Two five-element sample vectors used by the covariance and correlation cells.
let x: Vector = [1, 2, 3, 4, 5]
let y: Vector = [6, 7, 8, 9, 0]

In [26]:
// Sample covariance of y and x.
covariance(y, x)

-2.5


In [0]:
/// Correlation of two equal-length vectors: their covariance divided by the
/// product of their standard deviations.
///
/// - Parameters:
///   - w: First vector.
///   - v: Second vector; asserted to have the same length as `w`.
/// - Returns: `covariance(w, v) / (w.std_dev() * v.std_dev())`.
func correlation(_ w: Vector, _ v: Vector) -> Float {
  assert(w.count == v.count, "Vector lengths do not match")
  let spread_product = w.std_dev() * v.std_dev()
  return covariance(w, v) / spread_product
}

In [28]:
// Correlation of y and x.
correlation(y, x)

-0.4472136
