# Sesi 5

- Descriptive Statistics

In [1]:
import math
import statistics
import numpy as np
import scipy.stats
import pandas as pd

In [3]:
# Base sample, plus a copy with a missing value (NaN) inserted at index 3.
x = [8.0, 1, 2.5, 4, 28.0]
x_with_nan = x[:3] + [math.nan] + x[3:]

# Show both lists, one per line.
print(x, x_with_nan, sep="\n")

[8.0, 1, 2.5, 4, 28.0]
[8.0, 1, 2.5, nan, 4, 28.0]


In [4]:
# NumPy-array views of both samples.
y = np.array(x)
y_with_nan = np.array(x_with_nan)

# pandas-Series views of both samples.
z = pd.Series(x)
z_with_nan = pd.Series(x_with_nan)

# Show the NaN-free array and Series, one after the other.
print(y, z, sep="\n")

[ 8.   1.   2.5  4.  28. ]
0     8.0
1     1.0
2     2.5
3     4.0
4    28.0
dtype: float64


## Measures of Central Tendency

### Mean

In [5]:
# Arithmetic mean in pure Python: total of the values divided by their count.
sum(x) / len(x)

8.7

In [6]:
# Same mean via the standard-library statistics module.
statistics.mean(x)

8.7

In [7]:
# The NaN is not skipped: a single missing value turns the whole mean into NaN.
statistics.mean(x_with_nan)

nan

In [8]:
# Mean of the NumPy array.
np.mean(y)

8.7

In [9]:
# np.mean propagates NaN, so one missing value makes the result NaN.
np.mean(y_with_nan)

nan

In [10]:
# np.nanmean ignores NaN entries and averages the remaining values.
np.nanmean(y_with_nan)

8.7

In [11]:
# Mean of the pandas Series.
z.mean()

8.7

In [12]:
# Series.mean() skips NaN by default (skipna=True), so this matches the
# NaN-free mean instead of returning NaN.
z_with_nan.mean()

8.7

### Weighted Mean

In [13]:
# Echo the sample that the weights below are applied to.
x

[8.0, 1, 2.5, 4, 28.0]

In [14]:
# One weight per element of x; the five weights sum to 1.
w = [0.1, 0.2, 0.3, 0.25, 0.15]

In [15]:
# Convert the weight list to a NumPy array (rebinds w).
w = np.array(w)

In [16]:
# Weighted mean: sum(w * y) / sum(w).
np.average(y, weights=w)

6.95

In [18]:
# np.average accepts the pandas Series as well and gives the same weighted mean.
np.average(z, weights=w)

6.95

### Harmonic Mean

In [19]:
# Harmonic mean: n / sum(1 / x_i).
statistics.harmonic_mean(x)

2.7613412228796843

In [20]:
# SciPy equivalent on the NumPy array; same value as statistics.harmonic_mean.
scipy.stats.hmean(y)

2.7613412228796843

### Geometric Mean

In [21]:
# Geometric mean: the n-th root of the product of the n values.
scipy.stats.gmean(y)

4.67788567485604

### Median

In [22]:
# Echo the sample used for the median examples.
x

[8.0, 1, 2.5, 4, 28.0]

In [23]:
# Drop the last element to get an even-length sample (4 values), where the
# low, high and interpolated medians differ.
x[:-1]

[8.0, 1, 2.5, 4]

In [24]:
# For an even number of values, median_low returns the smaller of the two
# middle values (2.5 and 4 here).
statistics.median_low(x[:-1])

2.5

In [25]:
# median_high returns the larger of the two middle values.
statistics.median_high(x[:-1])

4

In [26]:
# median averages the two middle values: (2.5 + 4) / 2.
statistics.median(x[:-1])

3.25

In [27]:
# Echo the sample that contains a NaN.
x_with_nan

[8.0, 1, 2.5, nan, 4, 28.0]

In [28]:
# The median functions sort their input, and NaN compares false against
# everything, so calling them on data that still contains NaN gives an
# order-dependent (undefined) result. Strip the NaN values first, as the
# statistics documentation advises, so the median is computed on real data only.
statistics.median_low([v for v in x_with_nan if not math.isnan(v)])

4

In [29]:
# Median of the NaN-free NumPy array (odd length, so it is the middle value).
np.median(y)

4.0

### Mode

In [30]:
# u has a single most frequent value (2 appears twice).
u = [2, 3, 2, 8, 12]
# v has two most frequent values (12 and 15 each appear three times).
v = [12, 15, 12, 15, 21, 15, 12]

In [31]:
# Most frequent value in u.
statistics.mode(u)

2

In [32]:
# Rebind u and v from lists to NumPy arrays (same names, new type).
u, v = np.array(u), np.array(v)

# SciPy reports the mode together with how many times it occurs.
# NOTE(review): the recorded array-valued output looks like it comes from an
# older SciPy release; the result shape may differ on newer versions - confirm.
scipy.stats.mode(u)

ModeResult(mode=array([2]), count=array([2]))

In [33]:
# Rebind u and v once more, this time as pandas Series.
u, v = pd.Series(u), pd.Series(v)

In [34]:
# Series.mode() returns a Series holding the most frequent value(s).
u.mode()

0    2
dtype: int32

In [35]:
# v is bimodal, so both modes (12 and 15) are returned.
v.mode()

0    12
1    15
dtype: int32

## Measures of Variability

### Variance

In [36]:
# Sample variance (divides by n - 1).
statistics.variance(x)

123.2

In [37]:
# ddof=1 makes NumPy divide by n - 1 (sample variance); NumPy's default
# ddof=0 would give the population variance instead.
np.var(y, ddof=1)

123.19999999999999

In [38]:
# pandas sample variance; ddof=1 is spelled out to mirror the NumPy call.
z.var(ddof=1)

123.19999999999999

### Standard Deviation

In [39]:
# Sample standard deviation: square root of the sample variance.
statistics.stdev(x)

11.099549540409287

In [40]:
# ddof=1 gives the sample standard deviation (n - 1 in the denominator).
np.std(y, ddof=1)

11.099549540409285

In [41]:
# pandas sample standard deviation; ddof=1 is spelled out to mirror the NumPy call.
z.std(ddof=1)

11.099549540409285

### Skewness

In [42]:
# Echo the array used for the skewness examples.
y

array([ 8. ,  1. ,  2.5,  4. , 28. ])

In [43]:
# bias=False applies the sample-size correction, which gives the same value
# as pandas' Series.skew() up to floating-point rounding.
scipy.stats.skew(y, bias=False)

1.9470432273905927

In [44]:
# pandas skewness of the same data; agrees with scipy.stats.skew(y, bias=False).
z.skew()

1.9470432273905924

### Percentiles

In [45]:
# New nine-point sample for the percentile examples.
# NOTE: this rebinds x, so earlier cells re-run after this one would use this
# data instead of the original five-point sample.
x = [-5.0, -1.1, 0.1, 0.2, 8.0, 12.8, 21.0, 25.8, 41.0]

In [46]:
# Rebind y to a NumPy array of the new sample.
y = np.array(x)

In [47]:
# Echo the array used for the percentile examples.
y

array([-5. , -1.1,  0.1,  0.2,  8. , 12.8, 21. , 25.8, 41. ])

In [48]:
# 5th percentile; falls between the two smallest points and is linearly
# interpolated: -5 + 0.4 * (-1.1 - (-5)) = -3.44.
np.percentile(y, 5)

-3.44

In [49]:
# 50th percentile, i.e. the median (middle of the nine sorted values).
np.percentile(y, 50)

8.0

In [50]:
# 95th percentile; interpolated between the two largest points.
np.percentile(y, 95)

34.919999999999995

In [51]:
# Several percentiles in one call: the three quartiles.
np.percentile(y, [25, 50, 75])

array([ 0.1,  8. , 21. ])

In [52]:
# np.quantile takes a fraction in [0, 1] instead of a percentage; 0.5 is the median.
np.quantile(y, 0.5)

8.0

In [53]:
# Rebind z to a pandas Series of the new sample.
z = pd.Series(y)

In [54]:
# pandas quantile (fraction in [0, 1]); same value as np.percentile(y, 95).
z.quantile(0.95)

34.919999999999995

### Ranges

In [55]:
# Range ("peak to peak"): max - min = 41 - (-5).
np.ptp(y)

46.0

In [56]:
# np.ptp also accepts the pandas Series and gives the same range.
np.ptp(z)

46.0

## Summary of Descriptive Stats

In [57]:
# One-call summary: number of observations, (min, max), mean, variance,
# skewness and kurtosis. ddof=1 selects the sample variance; bias=False
# applies the bias correction to skewness and kurtosis.
scipy.stats.describe(y, ddof=1, bias=False)

DescribeResult(nobs=9, minmax=(-5.0, 41.0), mean=11.422222222222222, variance=233.44194444444446, skewness=0.9206597142483607, kurtosis=0.07966042430381837)

In [58]:
# pandas summary: count, mean, std, min, the three quartiles and max.
z.describe()

count     9.000000
mean     11.422222
std      15.278807
min      -5.000000
25%       0.100000
50%       8.000000
75%      21.000000
max      41.000000
dtype: float64

## Measures of Correlation

In [59]:
# Paired sample for the correlation examples: x holds the integers -10..10
# and y holds one observed value per x.
x = [*range(-10, 11)]
y = [0, 2, 2, 2, 2, 3, 3, 6, 7, 4, 7, 6, 6, 9, 4, 5, 5, 10, 11, 12, 14]

# NumPy arrays (x_, y_) and pandas Series (x__, y__) built from the same data.
x_ = np.array(x)
y_ = np.array(y)
x__ = pd.Series(x_)
y__ = pd.Series(y_)

### Covariance

In [60]:
# 2x2 covariance matrix: the variances of x_ and y_ on the diagonal and
# their covariance in the off-diagonal entries.
np.cov(x_, y_)

array([[38.5       , 19.95      ],
       [19.95      , 13.91428571]])

In [61]:
# pandas returns just the scalar covariance between the two Series.
x__.cov(y__)

19.95

### Correlation Coefficient

In [62]:
# Pearson correlation coefficient r together with its p-value.
scipy.stats.pearsonr(x_, y_)

(0.8619500056316061, 5.122760847201135e-07)

In [63]:
# Correlation matrix; the off-diagonal entry is the same Pearson r as above.
np.corrcoef(x_, y_)

array([[1.        , 0.86195001],
       [0.86195001, 1.        ]])