# Calculating Descriptive Statistics

In [17]:
import math
import statistics
import numpy as np
import scipy.stats
import pandas as pd

In [18]:
# Membuat lits
x = [8.0, 1, 2.5, 4, 28.0]
x_with_nan = [8.0, 1, 2.5, math.nan, 4, 28.0]
print(x)
print(x_with_nan)

[8.0, 1, 2.5, 4, 28.0]
[8.0, 1, 2.5, nan, 4, 28.0]


In [19]:
# Membuat objek
y, y_with_nan = np.array(x), np.array(x_with_nan)
z, z_with_nan = pd.Series(x), pd.Series(x_with_nan)
print(y_with_nan)
print(z_with_nan)

[ 8.   1.   2.5  nan  4.  28. ]
0     8.0
1     1.0
2     2.5
3     NaN
4     4.0
5    28.0
dtype: float64


 ini merupakan sequence nilai 1D

# Measures of Central Tendency

## Mean

In [20]:
mean_ = sum(x) / len(x)
mean_

8.7

In [21]:
# menerapkan fungsi built-in python

mean_ = statistics.mean(x)
print(mean_)

8.7


In [22]:
# menggunakan numpy

mean_ = np.mean(y)
mean_

8.7

In [23]:
mean_ = y.mean()
mean_

8.7

In [24]:
print(np.mean(y_with_nan))
print(y_with_nan.mean())

nan
nan


In [25]:
np.nanmean(y_with_nan)

8.7

In [26]:
# menggunakan pd.series

mean_ = z.mean()
mean_

8.7

In [27]:
z_with_nan.mean()


8.7

## Weighted Mean

In [19]:
0.2 * 2 + 0.5 * 4 + 0.3 * 8

4.8

In [20]:
x = [8.0, 1, 2.5, 4, 28.0]
w = [0.1, 0.2, 0.3, 0.25, 0.15]

wmean = sum(w[i] * x[i] for i in range(len(x))) / sum(w)
print(wmean)

wmean = sum(x_ * w_ for (x_, w_) in zip(x, w)) / sum(w)
print(wmean)

6.95
6.95


In [23]:
# menggunakan np.average

y, z, w = np.array(x), pd.Series(x), np.array(w)

wmean = np.average(y, weights=w)
print(wmean)

wmean = np.average(z, weights=w)
print(wmean)

6.95
6.95


Kode tersebut digunakan untuk menghitung weighted mean (rata-rata tertimbang) dari sebuah array/list x dengan bobot yang ditentukan dalam array/list w menggunakan fungsi np.average dari library NumPy.

Pertama, array x diubah menjadi numpy array y menggunakan np.array(). Kemudian, x juga diubah menjadi pandas series z menggunakan pd.Series().

Kemudian, nilai weighted mean dihitung menggunakan np.average(). Dua parameter utama dari fungsi ini adalah array/list x dan bobot w. Dalam hal ini, bobot yang diberikan adalah w. Hasil perhitungan weighted mean disimpan dalam variabel wmean.

Pada baris selanjutnya, hasil perhitungan weighted mean yang menggunakan numpy array y disimpan dalam variabel wmean dan kemudian dicetak. Pada baris berikutnya, hasil perhitungan weighted mean yang menggunakan pandas series z disimpan dalam variabel wmean dan kemudian dicetak.

Dalam penghitungan weighted mean ini, setiap angka dalam array x atau pandas series z dikalikan dengan bobotnya masing-masing dari array/list w, kemudian hasil perkalian tersebut dijumlahkan dan dibagi dengan total bobot untuk menghasilkan weighted mean.

In [24]:
# impelemtasi python pure

(w * y).sum() / w.sum()

6.95

## Harmonic Mean

In [9]:
hmean = len(x) / sum(1/item for item in x)
hmean

2.7613412228796843

In [12]:
# Menggunakan statistik harmonic
hmean = statistics.harmonic_mean(x)
hmean

2.7613412228796843

In [13]:
scipy.stats.hmean(y)

2.7613412228796843

In [28]:
scipy.stats.hmean(z)

2.7613412228796843

In [29]:
speed = [60,20]
hmean_speed = len(speed)/sum(1/item for item in speed)
hmean_speed

30.0

In [30]:
# hmean dengan library 1
hmean_speed_stats = statistics.harmonic_mean(speed)
hmean_speed_stats

30.0

In [31]:
# hmean dengan library 1
hmean_speed_stats2 = scipy.stats.hmean(speed)
hmean_speed_stats2

30.0

In [32]:
x

[8.0, 1, 2.5, 4, 28.0]

In [33]:
hmean = len(x) / sum(1/item for item in x)
hmean

2.7613412228796843

In [34]:
arit_mean = np.mean(x)
arit_mean

8.7

# Quiz

In [15]:
x = [1,2,3,6,7,8,30,40,50]
std_dev = statistics.stdev(x)
print("Standar Deviasi dari x adalah:", std_dev)

variance = statistics.variance(x)
print("Variansi dari x adalah:", variance)

Standar Deviasi dari x adalah: 18.58090417606205
Variansi dari x adalah: 345.25000000000006


In [16]:
#Kuis 1 tim 4
x = [1,2,3,6,7,8,30,40,50]
n = len(x)
mean = sum(x)/n
var_ = sum((item - mean)**2 for item in x)/(n-1)
Sd_ = var_**(1/2)

print(var_)
print(Sd_)

345.25000000000006
18.58090417606205


# Geometric Mean

In [35]:
x = 1000
x*=1.3 #1300
x*=1.5 #1950
x*=1.4 #2730
x

2730.0

In [36]:
# Arithmetic Mean
(30+50+40)/3

40.0

In [37]:
x = 1000
x*=1.4
x*=1.4
x*=1.4
x

2743.9999999999995

In [38]:
#Geometric Mean
gmean = ((30*50*40)**(1/3))
gmean

39.148676411688626

In [39]:
x = 1000
x*=1.3915
x*=1.3915
x*=1.3915
x

2694.322835875

In [40]:
scipy.stats.gmean([30,50,40]) #geometric mean menggunakan library scipy

39.14867641168864

# Median

In [41]:
# Ganjil
a = [1,2,4,8,9]
n = len(a)
median = a[2]
median

4

In [42]:
# Genap
a = [1,2,4,8]
n = len(a)
(n+1)/2
median = (2+4)/2
median

3.0

In [43]:
# Menggunakan numpy
x = [8, 1, 2.5, 4, 28]
c = [2,3,4,5]
np.median(c)

3.5

In [44]:
# Pembuktian manual
def median_function(x):
    n = len(x)
    if n%2 == 1: #Ganjil
        median_ = sorted(x)[round(0.5*(n-1))] #ambil median
    else: #Genap
        x_ord, index = sorted(x) , round(0.5*n)
        median_=0.5 * (x_ord[index-1]+x_ord[index])
    return median_

In [45]:
median_function(x)

4

In [46]:
median_function(c)

3.5

In [47]:
sorted(x)

[1, 2.5, 4, 8, 28]

In [48]:
n = len([1,2,3,4,5,6,7])
round(0.5*(n-1))

3

In [49]:
n

7

# Mode/Modus

Nilai atau data yang sering muncul

In [51]:
u = [2,3,2,8,12]

mode_ = max((u.count(i),i)for i in set(u))[1]

In [52]:
statistics.mode(u)

2

In [53]:
scipy.stats.mode(u)[0][0]

  scipy.stats.mode(u)[0][0]


2

In [54]:
mode_lib = scipy.stats.mode(u)

  mode_lib = scipy.stats.mode(u)


In [55]:
print(mode_lib)
print(mode_lib.mode)
print(mode_lib.count)

ModeResult(mode=array([2]), count=array([2]))
[2]
[2]


In [56]:
u

[2, 3, 2, 8, 12]

In [57]:
u_series = pd.Series(u)
u_series.mode()

0    2
dtype: int64

In [58]:
u_array = np.array(u)
statistics.mode(u_array)

2

In [59]:
v = pd.Series([1,2,3,3,3,np.nan,np.nan,np.nan,np.nan])
v.mode()

0    3.0
dtype: float64

In [60]:
statistics.mode(v)

3.0

In [61]:
h = [1,2,3,3,3,np.nan,np.nan,np.nan,np.nan]
statistics.mode(h)

nan

Series akan mengabaikan NaN dan list akan mempertimbangkan NaN

# Measures of Variability

# Variance

In [62]:
x

[8, 1, 2.5, 4, 28]

In [63]:
n  = len(x)
mean_var = np.mean(x)
var_ = sum((item-mean_var)**2 for item in x ) / (n-1)
var_

123.19999999999999

In [64]:
# Variance w/ statistics library
var_stat = statistics.variance(x)
var_stat

123.2

In [67]:
# Variance w/ numpy
var_np = np.var(x,ddof=1)
var_np

123.19999999999999

In [66]:
# Variance using Series function
z = pd.Series(x)
z.var()

123.19999999999999

## Standar Deviasi

In [68]:
x = [8, 1, 2.5, 4, 28]

In [69]:
stdev = var_np**(1/2)

In [70]:
stdev

11.099549540409285

In [71]:
stdev = np.sqrt(var_np) 
stdev

11.099549540409285

In [72]:
# stdev with statistics library
stdev = statistics.stdev(x)
stdev

11.099549540409287

In [73]:
stdev = np.std(x,ddof=1)
stdev

11.099549540409285

## Skewness

In [74]:
x

[8, 1, 2.5, 4, 28]

In [75]:
n = len(x)
mean = np.mean(x)
stdev = statistics.stdev(x)
skew_ = (sum((item - mean)**3 for item in x))*n / ((n-1)*(n-2)*(stdev**3))
skew_

1.947043227390592

In [76]:
scipy.stats.skew(x,bias=False)

1.9470432273905927

In [77]:
z = pd.Series(x)
z.skew()

1.9470432273905924

In [78]:
scipy.stats.skew(x_with_nan)

nan

In [79]:
z_with_nan = pd.Series(x_with_nan)
z_with_nan.skew()

1.9470432273905924

In [80]:
scipy.stats.skew(z_with_nan)

nan

## Percentiles

In [81]:
x = [-5,-1.1,0.1,2,8,12.8,21,25.8,41]
np.percentile(x,5) #Percentile 5

-3.44

In [82]:
np.percentile(x,0)

-5.0

In [83]:
np.percentile(x,100)

41.0

In [84]:
np.percentile(x,90)

28.840000000000003

In [85]:
statistics.quantiles(x,n=2)

[8.0]

In [86]:
statistics.quantiles(x,n=4)

[-0.5, 8.0, 23.4]

In [87]:
statistics.quantiles(x,n=4,method='inclusive')

[0.1, 8.0, 21.0]

In [88]:
x

[-5, -1.1, 0.1, 2, 8, 12.8, 21, 25.8, 41]

In [89]:
np.quantile(x,0)

-5.0

In [90]:
np.quantile(x,1)

41.0

In [91]:
np.quantile(x,0.5)

8.0

In [92]:
x

[-5, -1.1, 0.1, 2, 8, 12.8, 21, 25.8, 41]

In [93]:
x_with_nan = [-5, -1.1,np.nan, 0.1, 2, 8, 12.8, 21, 25.8, 41]
x_with_nan

[-5, -1.1, nan, 0.1, 2, 8, 12.8, 21, 25.8, 41]

In [94]:
np.quantile(x_with_nan,0)

nan

In [95]:
np.nanquantile(x_with_nan,0.5)

8.0

In [96]:
x

[-5, -1.1, 0.1, 2, 8, 12.8, 21, 25.8, 41]

In [97]:
np.quantile(x,[0.25,0.5,0.75])

array([ 0.1,  8. , 21. ])

In [98]:
statistics.quantiles(x,n=4,method='inclusive')

[0.1, 8.0, 21.0]

In [99]:
np.percentile(x,[25,50,75])

array([ 0.1,  8. , 21. ])

## Ranges (Nilai Max - Nilai Min)

In [100]:
x # Ranges  = 41 - (-5) = 46

[-5, -1.1, 0.1, 2, 8, 12.8, 21, 25.8, 41]

In [101]:
# np.ptp untuk menghitung range
np.ptp(x)

46.0

In [102]:
np.amax(x)

41.0

In [103]:
np.amin(x)

-5.0

In [104]:
np.amax(x) - np.amin(x)

46.0

In [105]:
x

[-5, -1.1, 0.1, 2, 8, 12.8, 21, 25.8, 41]

In [106]:
quartiles = np.quantile(x,[0.25,0.75])
IQR = quartiles[1] - quartiles[0]
IQR

20.9

## Summary of Desc Stats

In [107]:
x

[-5, -1.1, 0.1, 2, 8, 12.8, 21, 25.8, 41]

In [108]:
result = scipy.stats.describe(x,ddof=1,bias=False)
result

DescribeResult(nobs=9, minmax=(-5.0, 41.0), mean=11.622222222222222, variance=228.75194444444446, skewness=0.9249043136685094, kurtosis=0.14770623629658886)

In [109]:
(result.variance)**(1/2)

15.12454774346805

In [110]:
result.minmax[1]-result.minmax[0]

46.0

In [111]:
result.skewness

0.9249043136685094

In [112]:
z = pd.Series(x)
z

0    -5.0
1    -1.1
2     0.1
3     2.0
4     8.0
5    12.8
6    21.0
7    25.8
8    41.0
dtype: float64

In [113]:
z.describe()

count     9.000000
mean     11.622222
std      15.124548
min      -5.000000
25%       0.100000
50%       8.000000
75%      21.000000
max      41.000000
dtype: float64

## Correlation

In [114]:
x = list(range(-10,11))
y = [0 , 2, 2, 2, 2, 3, 3, 6, 7, 4, 7, 6, 6, 9, 4, 5, 5, 10, 11, 12, 14]
x_ar, y_ar = np.array(x),np.array(y)
x_s,y_s = pd.Series(x),pd.Series(y)

In [115]:
len(x)==len(y)

True

## Covariance

In [116]:
n = len(x)
mean_x = np.mean(x)
mean_y = np.mean(y)

cov_xy = (sum((x[item]-mean_x)*(y[item]-mean_y) for item in range(n)))/(n-1)
cov_xy

19.95

In [117]:
cov_matrix = np.cov(x,y)
cov_matrix

array([[38.5       , 19.95      ],
       [19.95      , 13.91428571]])

In [118]:
np.var(x,ddof=1)

38.5

In [119]:
np.var(y,ddof=1)

13.914285714285711

In [120]:
cov_xy = cov_matrix[0,1]
cov_xy

19.95

In [121]:
cov_xy = cov_matrix[1,0]
cov_xy

19.95

## Correlation Coefficient

In [122]:
var_x = np.var(x,ddof=1)
var_y = np.var(y,ddof=1)
cov_xy
std_x = var_x**(1/2)
std_y = var_y**(1/2)
r = cov_xy / (std_x*std_y)
r

0.861950005631606

r  = 1 / r= -1 atau mendekati merupakan tanda kedua variabel mengalami multicollinearity

sehingga Korelasi sangat kuat karena sangat mendekati 1

In [123]:
r , p = scipy.stats.pearsonr(x,y)
r

0.8619500056316061

In [124]:
p

5.122760847201135e-07