# Statistics

In [51]:
import pandas as pd
import numpy as np
import scipy
from scipy import stats
import statistics

## Descriptive Statistics

Descriptive statistics summarizes the main features of a dataset. It is usually approached in two ways:
* Quantitative approach
* Visual approach

### Quantitative Approach

#### Types of Measures

##### Central Tendency

###### Mean

\begin{equation}
\large \mu = \frac{1}{N}\sum_{i=1}^N x_i
\end{equation}

In [19]:
numbers = [1, 2, 3, 4, 5, 6]
numbers

[1, 2, 3, 4, 5, 6]

In [8]:
import statistics
import numpy as np

In [15]:
%%timeit -n 100000 -r 10
statistics.mean(numbers)

14.1 µs ± 1.26 µs per loop (mean ± std. dev. of 10 runs, 100000 loops each)


In [16]:
%%timeit -n 100000 -r 10
statistics.fmean(numbers)

478 ns ± 20.4 ns per loop (mean ± std. dev. of 10 runs, 100000 loops each)


In [17]:
%%timeit -n 100000 -r 10
np.mean(numbers)

14.5 µs ± 1.19 µs per loop (mean ± std. dev. of 10 runs, 100000 loops each)


In [18]:
%%timeit -n 100000 -r 10
# Arithmetic mean written out by hand: total of the values divided by their count.
# An empty `numbers` makes len() return 0 and raises ZeroDivisionError.
sum(numbers)/len(numbers)

389 ns ± 20.7 ns per loop (mean ± std. dev. of 10 runs, 100000 loops each)


###### Weighted Mean

\begin{equation}
\large \bar {x} = \frac{\sum_{i=1}^N x_i w_i}{\sum_{i=1}^N w_i}
\end{equation}

In [29]:
numbers = [1, 2, 3, 4, 5, 6]
numbers

[1, 2, 3, 4, 5, 6]

In [30]:
weights = [0.1, 0.2, 0.5, 0.3, 0.7, 0.2]
weights

[0.1, 0.2, 0.5, 0.3, 0.7, 0.2]

In [34]:
%%timeit -n 100000 -r 10
# Weighted mean with positional indexing: numbers[i] is multiplied by weights[i],
# the products are summed, and the total is divided by the sum of the weights.
sum([numbers[i]*weights[i] for i in range(len(numbers))])/sum(weights)

1.53 µs ± 63 ns per loop (mean ± std. dev. of 10 runs, 100000 loops each)


In [35]:
%%timeit -n 100000 -r 10
# Weighted mean pairing each value with its weight via zip(); the sum of the
# products is divided by the sum of the weights.
sum([n*w for n, w in zip(numbers, weights)])/sum(weights)

1.08 µs ± 28.7 ns per loop (mean ± std. dev. of 10 runs, 100000 loops each)


In [36]:
%%timeit -n 100000 -r 10
np.average(numbers, weights=weights)

21.8 µs ± 511 ns per loop (mean ± std. dev. of 10 runs, 100000 loops each)


In [40]:
# Build the NumPy arrays in their own cell so the list-to-array conversion is not
# part of whatever a later cell measures with these names.
w = np.array(weights)
n = np.array(numbers)

In [41]:
%%timeit -n 100000 -r 10
(w * n).sum() / w.sum()

5.14 µs ± 116 ns per loop (mean ± std. dev. of 10 runs, 100000 loops each)


###### Geometric Mean

\begin{equation}
\large GM = \left (\prod_{i=1}^N x_i \right)^\frac{1}{N}
\end{equation}

In [42]:
numbers = [1, 2, 3, 4, 5, 6]
numbers

[1, 2, 3, 4, 5, 6]

In [46]:
%%timeit -n 100000 -r 10
# Geometric mean by definition: multiply every value together, then take the
# N-th root of the product (N = number of values).
gmean = 1
for value in numbers:
    gmean = gmean * value
gmean = gmean ** (1 / len(numbers))
gmean

493 ns ± 26.4 ns per loop (mean ± std. dev. of 10 runs, 100000 loops each)


In [48]:
%%timeit -n 100000 -r 10
statistics.geometric_mean(numbers)

2.65 µs ± 114 ns per loop (mean ± std. dev. of 10 runs, 100000 loops each)


In [52]:
%%timeit -n 100000 -r 10
scipy.stats.gmean(numbers)

12.4 µs ± 1.84 µs per loop (mean ± std. dev. of 10 runs, 100000 loops each)


###### Harmonic Mean

\begin{equation}
\large H = \left (\frac{\sum_{i=1}^N x_i^{-1}}{N} \right)^{-1}
\end{equation}

In [60]:
numbers = [1, 2, 3, 4, 5, 6]
numbers

[1, 2, 3, 4, 5, 6]

In [61]:
%%timeit -n 100000 -r 10
# Harmonic mean by definition: the count divided by the sum of the reciprocals.
# A zero in `numbers` raises ZeroDivisionError when 1/item is evaluated.
len(numbers)/sum(1/item for item in numbers)

769 ns ± 29.4 ns per loop (mean ± std. dev. of 10 runs, 100000 loops each)


In [62]:
%%timeit -n 100000 -r 10
statistics.harmonic_mean(numbers)

42.5 µs ± 5.17 µs per loop (mean ± std. dev. of 10 runs, 100000 loops each)


In [64]:
%%timeit -n 100000 -r 10
scipy.stats.hmean(numbers)

21.9 µs ± 380 ns per loop (mean ± std. dev. of 10 runs, 100000 loops each)


###### Median

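\begin{equation}
\large \text{median} = \frac{1}{2}\left( x_{(\lfloor{(N+1)/2}\rfloor)} + x_{(\lceil{(N+1)/2}\rceil)} \right)
\end{equation}

where $x_{(k)}$ denotes the $k$-th smallest of the $N$ values (1-based), i.e. the data must be sorted first.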

In [65]:
numbers = [1, 2, 3, 4, 5, 6]
numbers

[1, 2, 3, 4, 5, 6]

In [68]:
%%timeit -n 100000 -r 10
# Median of `numbers`: the middle value of the sorted data, or the average of the
# two middle values when the number of values is even.
sorted_numbers = sorted(numbers)
n = len(sorted_numbers)
# Python lists are 0-indexed, so the 1-based middle position (n+1)/2 corresponds
# to index n//2 for odd n, and to indices n//2 - 1 and n//2 for even n.
mid = n // 2
if n % 2 == 0:
    median = 0.5 * (sorted_numbers[mid - 1] + sorted_numbers[mid])
else:
    median = sorted_numbers[mid]

689 ns ± 47.1 ns per loop (mean ± std. dev. of 10 runs, 100000 loops each)


In [69]:
%%timeit -n 100000 -r 10
statistics.median(numbers)

501 ns ± 32.2 ns per loop (mean ± std. dev. of 10 runs, 100000 loops each)


In [70]:
%%timeit -n 100000 -r 10
statistics.median_low(numbers)

434 ns ± 55.1 ns per loop (mean ± std. dev. of 10 runs, 100000 loops each)


In [71]:
%%timeit -n 100000 -r 10
statistics.median_high(numbers)

370 ns ± 19.6 ns per loop (mean ± std. dev. of 10 runs, 100000 loops each)


In [72]:
%%timeit -n 100000 -r 10
np.median(numbers)

25 µs ± 818 ns per loop (mean ± std. dev. of 10 runs, 100000 loops each)


###### Mode

In [97]:
numbers = [1, 2, 2, 3, 3, 3, 4, 5, 5, 6, 7, 8, 8, 8, 9, 0, 10]

In [98]:
%%timeit -n 100000 -r 10
# Most frequent value in `numbers`. dict.fromkeys() yields each distinct value
# once, in first-seen order, and max() returns the first maximal candidate, so
# when several values share the top count the one appearing first in the data
# wins -- the same tie-breaking rule statistics.mode() uses.
max(dict.fromkeys(numbers), key=numbers.count)

5.33 µs ± 390 ns per loop (mean ± std. dev. of 10 runs, 100000 loops each)


In [99]:
%%timeit -n 100000 -r 10
statistics.mode(numbers)

6.53 µs ± 684 ns per loop (mean ± std. dev. of 10 runs, 100000 loops each)


In [100]:
%%timeit -n 100000 -r 10
statistics.multimode(numbers)

5.14 µs ± 152 ns per loop (mean ± std. dev. of 10 runs, 100000 loops each)


In [108]:
%%timeit -n 100000 -r 10
scipy.stats.mode(numbers)

102 µs ± 16.3 µs per loop (mean ± std. dev. of 10 runs, 100000 loops each)


##### Variability

##### Correlation or Joint Variability