In [1]:
%matplotlib inline

In [2]:
import numpy as np
import matplotlib.pyplot as plt


# Basic Statistics

## Variance and Standard Deviation

In [3]:
# One-dimensional sample used for the variance / standard deviation walkthrough.
dataset = np.array(
    [12, 23, 34, 44, 59, 70, 98]
)
dataset

array([12, 23, 34, 44, 59, 70, 98])

In [4]:
# Arithmetic mean of the sample.
mean = np.mean(dataset)
mean

48.57142857142857

In [8]:
# Signed deviation of every observation from the sample mean.
distances_to_mean = dataset - mean
# Rounded to one decimal for display only; the stored values keep full precision.
distances_to_mean.round(decimals=1)

array([-36.6, -25.6, -14.6,  -4.6,  10.4,  21.4,  49.4])

In [9]:
# Sum of squared deviations from the mean (the numerator of the variance).
# np.sum keeps the reduction inside NumPy instead of iterating the array
# element by element with the built-in sum().
sq_distances = np.sum(distances_to_mean ** 2)
sq_distances

5235.714285714286

In [10]:
# Number of observations in the sample (length of the first axis).
n = len(dataset)
n

7

In [11]:
# Sample variance: the sum of squared deviations divided by n - 1
# (one less than the number of observations).
degrees_of_freedom = n - 1
v = sq_distances / degrees_of_freedom
v

872.6190476190477

In [12]:
# Sample standard deviation: the square root of the sample variance `v`.
s = np.sqrt(v)  # standard deviation (std)
s

29.54012605963366

In [13]:
# Validate: the hand-computed `s` should equal NumPy's standard deviation with ddof=1
# (the same n - 1 divisor that was used for `v`).
s, dataset.std(ddof=1)

(29.54012605963366, 29.54012605963366)

## CoVariance

###  2-dimensional data set and covariance calculation

<div>
<img src="attachment:a72df0e1-5aa1-4fd5-8af9-8c4371969004.png" width="300">
</div>

<div>
<img src="attachment:54f247b8-2f63-429e-8ae4-4c9162ea9ecb.png" width="300">
</div>

In [56]:
# Each row is one observation.
# Column 0: hours studied (hrs)  |  Column 1: mark on the exam (%)
dataset = np.array(
    [
        [9, 39],
        [15, 56],
        [25, 93],
        [14, 61],
        [10, 50],
        [18, 75],
        [0, 32],
        [16, 85],
        [5, 42],
        [19, 70],
        [16, 66],
        [20, 80],
    ]
)
dataset

array([[ 9, 39],
       [15, 56],
       [25, 93],
       [14, 61],
       [10, 50],
       [18, 75],
       [ 0, 32],
       [16, 85],
       [ 5, 42],
       [19, 70],
       [16, 66],
       [20, 80]])

In [57]:
# Column-wise means: axis=0 averages over the rows, giving one mean per column.
mean_hours, mean_mark = dataset.mean(axis=0)
mean_hours, mean_mark

(13.916666666666666, 62.416666666666664)

In [58]:
# Deviation of each observation's hours (column 0) from the mean hours.
hours_studied = dataset[:, 0]
hour_differences_to_mean = hours_studied - mean_hours
hour_differences_to_mean

array([ -4.91666667,   1.08333333,  11.08333333,   0.08333333,
        -3.91666667,   4.08333333, -13.91666667,   2.08333333,
        -8.91666667,   5.08333333,   2.08333333,   6.08333333])

In [59]:
# Deviation of each observation's mark (column 1) from the mean mark.
exam_marks = dataset[:, 1]
mark_differences_to_mean = exam_marks - mean_mark
mark_differences_to_mean

array([-23.41666667,  -6.41666667,  30.58333333,  -1.41666667,
       -12.41666667,  12.58333333, -30.41666667,  22.58333333,
       -20.41666667,   7.58333333,   3.58333333,  17.58333333])

In [60]:
# Element-wise product of the paired deviations (hours x marks) for every observation.
# Uses `hour_differences_to_mean`, the name actually defined in the earlier cell;
# the previous spelling `hour_difference_to_mean` raises NameError on a fresh kernel.
product = hour_differences_to_mean * mark_differences_to_mean
product

array([ 1.15131944e+02, -6.95138889e+00,  3.38965278e+02, -1.18055556e-01,
        4.86319444e+01,  5.13819444e+01,  4.23298611e+02,  4.70486111e+01,
        1.82048611e+02,  3.85486111e+01,  7.46527778e+00,  1.06965278e+02])

In [61]:
# Total of the deviation products (the numerator of the covariance).
product.sum()

1352.4166666666667

In [62]:
# Number of measurements, i.e. rows of the data set.
n = len(dataset)
n

12

In [63]:
# Sample covariance: sum of the products of paired deviations divided by n - 1.
# np.sum is used (rather than the built-in sum) so the reduction stays in NumPy.
cov = np.sum(product) / (n - 1)
cov

122.9469696969697

<div>
<img src="attachment:54f247b8-2f63-429e-8ae4-4c9162ea9ecb.png" width="300">
</div>

Verify calculation is correct - compare to `np.cov()`.

In [66]:
# np.cov expects one variable per row, so the (observations x variables) data set is transposed.
# ddof=1 spells out the n - 1 divisor that np.cov already applies by default.
np.cov(dataset.T, ddof=1)

array([[ 47.71969697, 122.9469697 ],
       [122.9469697 , 370.08333333]])

In [71]:
# Sample variances (ddof=1) of each column, for comparison with the diagonal of the covariance matrix.
hours_variation = dataset[:, 0].var(ddof=1)
marks_variation = dataset[:, 1].var(ddof=1)

hours_variation, marks_variation

(47.71969696969697, 370.08333333333337)

The variances are on the diagonal (the covariance of a column with itself).

The covariance is in the off-diagonal entries; the matrix is symmetric because cov(H, M) = cov(M, H).

**Conclusions:**
* The off-diagonal values of the covariance matrix are > 0. Therefore the two dimensions **increase together**: the more hours studied, the higher the mark.
* The magnitude of the value (≈ 122.9) depends on the scale and units of what is measured, so on its own it does **not** tell us how strong the relationship is.


###  3-dimensional data set and covariance matrix

`x, y, z` are vectors and are the columns of matrix `a`.

In [90]:
# Columns are the vectors x, y, z; each row is one observation.
a = np.array(
    [
        [1, 2, 1],
        [-1, 1, 3],
        [4, 3, -1],
    ]
)

By default `np.cov` treats each **row** as a variable and each column as an observation. Since our variables `x, y, z` are the columns of `a`, we need to transpose the matrix: `a.T`.

In [91]:
# The variables x, y, z are the columns of `a`, so the transpose is passed to np.cov;
# ddof=1 requests the n - 1 divisor.
np.cov(a.T, ddof=1)

array([[ 6.33333333,  2.5       , -5.        ],
       [ 2.5       ,  1.        , -2.        ],
       [-5.        , -2.        ,  4.        ]])