# Chapter 4: Functional Programming: Rudimentary Statistics and Analytics

In [1]:
# The general form of a function
# def function_name(object1, object2, ... , objectn):
#    <operations>

### Total
$\sum_{i=0}^{n-1} x_{i}$

In [2]:
n = 0
total = 0
values = [i for i in range(10)]

print("total\t","value")
for value in values:
    total += value
    print(total,"\t", value)

total	 value
0 	 0
1 	 1
3 	 2
6 	 3
10 	 4
15 	 5
21 	 6
28 	 7
36 	 8
45 	 9


This way of coding is inefficient becuase it must be rewritten each time it needs to be used.
Instead, the code can be stored as a function, dramatically shortening the amount of code that needs to be written.

In [3]:
def total(lst):
    total_ = 0
    # in original, I used the index of the list
    # n = len(lst)
    # for i in range(n)
    for val in lst:
        total_ += val
    return total_
total(values)

45

In [4]:
total([i for i in range(-1000, 10000, 53)])

932984

In [5]:
import random
x1 = [random.randint(0,100) for i in range(10)]
total(x1)

433

### Mean
$\bar{X} = \frac{\sum_{i=0}^{n-1} x_{i}} {n}$

To calculate the mean, sum all the numbers and divide by the quantity of numbers

In [6]:
def mean(lst):
    n = len(lst)
    mean_ = total(lst) / n
    return mean_

#more consice
def meanc(lst):
    return total(lst) / len(lst)

In [7]:
mean(x1)

43.3

In [8]:
meanc(x1)

43.3

Now I will build Median, Mode, Var, SD, SE, Cov, and Corr

### Median

In [9]:
def median(lst):
    n = len(lst)
    lst = sorted(lst)
    # if length is odd (not divisible by two)
    if n % 2 != 0:
        middle_index = int((n - 1) / 2)
        median_ = lst[middle_index]
    # if length is evem
    else:
        upper_middle_index = int(n/2)
        lower_middle_index = upper_middle_index - 1
        median_ = mean(lst[lower_middle_index : upper_middle_index + 1])
   
    return median_

In [10]:
median1 = median(x1)
median1

44.5

In [11]:
median2 = median([random.randint(0,100) for i in range(11)])
median2

83

### Mode

In [12]:
def mode(lst):
    count_dct = {}
    #create entries for each val with zero
    for key in lst:
        count_dct[key] = 0
    #increment entry w/ each occurance
    for key in lst:
        count_dct[key] += 1
    #calc max count upfront    
    max_count = max(count_dct.values())
    #compare each val
    mode_ = []
    for key, count in count_dct.items():
        if count == max_count:
            mode_.append(key)
        
    return mode_

In [13]:
lst = [1,2,3,4,5,5,5,5,5,5,5,5]
mode(lst)

[5]

In [27]:
lst = [1,1,2,2,3,3,4,4,5,5]
mode(lst)

[1, 2, 3, 4, 5]

### Variance

We define population variance as:

$$ \sigma^2 = \frac{\sum_{i=1}^n (X_i - \bar{X})^2}{n}$$

and the sample variance as:

$$ S^2 = \frac{\sum_{i=1}^n (X_i - \bar{X})^2}{n-1}$$


In [28]:
def variance(lst, sample = True):
    mean_ = mean(lst)
    n = len(lst)
    DoF = n - 1
    sum_sq_diff = 0
    
    for val in lst:
        sum_sq_diff += (val - mean_) ** 2
    if sample:
        variance_ = sum_sq_diff / (n-1)
    else:
        variance_ = sum_sq_diff / (n)
    return variance_

In [29]:
variance(lst), variance(lst, sample = False)

(2.2222222222222223, 2.0)

### Standard Deviation
The standard deviation is square root of the variance

In [30]:
def SD(lst, sample = True):
    SD_ = variance(lst, sample) ** (1/2)
    return SD_

In [31]:
SD(lst), SD(lst, False)

(1.4907119849998598, 1.4142135623730951)

### Standard Error
It is the standard deviation of the probability distribution for the random variable $\bar{X}$, which represents all possible samples of a single given sample size $n$.

In [32]:
def STE(lst, sample = True):
    n = len(lst)
    se = SD(lst, sample) / (n ** 0.5)
    return se

In [33]:
STE(lst), STE(lst, False)

(0.4714045207910317, 0.4472135954999579)

To calculate covariance, we multiply the sum of the product of the difference between the observed value and the mean of each list for value _i = 1_ through _n = number of observations_:

$cov_{pop}(x,y) = \frac{\sum_{i=0}^{n-1} (x_{i} - x_{mean})(y_{i} - y_{mean})} {n}$

$cov_{sample}(x,y) = \frac{\sum_{i=0}^{n-1} (x_{i} - x_{mean})(y_{i} - y_{mean})} {n - 1}$


In [70]:
def covariance(lst1, lst2, sample = False):
    # determine means
    mean1 = mean(lst1)
    mean2 = mean(lst2)
    # make cov 0 so can increment later
    cov = 0
    n1 = len(lst1)
    n2 = len(lst2)
    # lists need be same length
    if n1 == n2:
        n = n1
        for i in range(n):
            cov += (lst1[i] - mean1) * (lst2[i] - mean2)
        if sample == False:
            cov = cov / n
        # different denom for sample
        else:
            cov = cov / (n - 1)
        return cov
    else:
        print("List lengths are not equal")
        print("List1:", n1)
        print("List2:", n2)

In [60]:
covariance(x1, lst)

1.3

### Correlation
We can transform the covariance into a correlation value by dividing by the product of the standard deviations. 

$corr_{pop}(x,y) = \frac{cov_{pop}(x, y)} {\sigma_x \sigma_y}$

In [61]:
def correlation(lst1, lst2):
    cov = covariance(lst1, lst2)
    SD1 = SD(lst1)
    SD2 = SD(lst2)
    corr = cov / (SD1 * SD2)
    return corr

In [62]:
x1 = [random.randint(0,100) for i in range(10)]
x2 = [random.randint(0,100) for i in range(10)]
correlation(x1, x2)

0.0376723973495927

### Skewness
Skewness is a measure of asymmetry of a population of data about the mean. It is the expected value of the cube of the standard deviation.

$skew_{pop}(x,y) = \frac{\sum_{i=0}^{n-1}{(x_{i} - x_{mean})^3}} {n\sigma^3}$


$skew_{sample}(x,y) = \frac{\sum_{i=0}^{n-1}{(x_{i} - x_{mean})^3}} {(n-1)(n-2)\sigma^3}$

In [63]:
def skewness(lst, sample = True):
    mean_ = mean(lst)
    SD_ = SD(lst, sample)
    skew = 0
    n = len(lst)
    for val in lst:
        skew += (val - mean_) ** 3
    skew = skew / (n * SD_**3) if not sample else n * skew / ((n - 1)*(n - 1) * SD_ ** 3)
    return skew

In [64]:
skewness(x1), skewness(x1, False)

(-1.3187112459037245, -1.2510393339402472)

In [65]:
skewness(x2), skewness(x2, False)

(-0.06380741276716845, -0.06053302678402782)

### Kurtosis
Kurtosis is an absolute measure of the weight of outliers. While skewness describes the ‘lean’ of a distribution, kurtosis describes the weight of a distribution that is held in the tails.

$kurt_{pop} = \frac{\sum_{i=0}^{n-1} (x_{i} - x_{mean})^4} {n\sigma^4}$

$kurt_{sample} = \frac{n(n+1)\sum_{i=0}^{n-1} (x_{i} - x_{mean})^4} {(n - 1)(n - 2)( n - 3)\sigma^4} - \frac{3(n - 1)^2}{(n - 2)(n - 3)}$


In [68]:
def kurtosis(lst, sample = True):
    mean_ = mean(lst)
    kurt = 0
    SD_ = SD(lst, sample)
    n = len(lst)
    for x in lst:
        kurt += (x - mean_) ** 4
    kurt = kurt / (n * SD_ ** 4) if  sample == False else  n * (n + 1) * kurt / \
    ((n - 1) * (n - 2) * (n - 3) * (SD_ ** 4)) - (3 *(n - 1) ** 2) / ((n - 2) * (n - 3))
    
    return kurt

In [71]:
kurtosis(x1), kurtosis(x1, False)

(2.0036371917350966, 3.5879159872440947)

## Using A Nested Dictionary To Organize Statistics


In [74]:
import pandas as pd
def gather_statistics(df, sample = False, round_dig = 3):
    dct = {key:{} for key in df}
    for key, val in df.items():
        val = val.dropna(axis=0)
        dct[key]["mean"] = round(mean(val), round_dig)
        dct[key]["median"] = round(median(val), round_dig)
        dct[key]["variance"] = round(variance(val, sample), round_dig)
        dct[key]["S.D."] = round(SD(val, sample), round_dig)
        dct[key]["skewness"] = round(skewness(val, sample), round_dig)
        dct[key]["kurtosis"] = round(kurtosis(val, sample), round_dig)
    stats_df = pd.DataFrame(dct)
    return stats_df
data = pd.DataFrame([x1, x2], index = ["List1", "List2"]).T
gather_statistics(data)

Unnamed: 0,List1,List2
mean,67.4,57.4
median,79.0,59.0
variance,747.44,980.84
S.D.,27.339,31.318
skewness,-1.251,-0.061
kurtosis,3.588,1.739


In [75]:
gather_statistics(data, False, 5)

Unnamed: 0,List1,List2
mean,67.4,57.4
median,79.0,59.0
variance,747.44,980.84
S.D.,27.33935,31.31837
skewness,-1.25104,-0.06053
kurtosis,3.58792,1.73884


## Fraser Economic Freedom of the World

In [76]:
import numpy as np
import stats as stat

ModuleNotFoundError: No module named 'stats'

In [81]:
filename = "EFW.xlsx"
data = pd.read_excel(filename, index_col = [1,3], header = [4])
data

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 0,ISO Code 2,Countries,Economic Freedom Summary Index,Rank,Quartile,1A Government Consumption,data,1B Transfers and subsidies,data.1,...,Unnamed: 101,Unnamed: 102,Unnamed: 103,Unnamed: 104,Unnamed: 105,Unnamed: 106,Unnamed: 107,Unnamed: 108,Unnamed: 109,Unnamed: 110
Year,ISO Code 3,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2020,ALB,,AL,Albania,7.64,26.0,1.0,8.026471,12.710000,6.978202,11.590000,...,2011.00,2012.00,2013.00,2014.00,2015.00,2016.0,2017.0,2018.0,2019.00,2020.00
2020,DZA,,DZ,Algeria,5.12,157.0,4.0,3.102941,29.450000,7.817129,8.511137,...,153.00,153.00,157.00,159.00,159.00,162.0,162.0,162.0,165.00,165.00
2020,AGO,,AO,Angola,5.91,138.0,4.0,7.700000,13.820000,9.702997,1.590000,...,38.25,38.25,39.25,39.75,39.75,40.5,40.5,40.5,41.25,41.25
2020,ARG,,AR,Argentina,4.87,161.0,4.0,5.985294,19.650000,6.493188,13.370000,...,114.75,114.75,117.75,119.25,119.25,121.5,121.5,121.5,123.75,123.75
2020,ARM,,AM,Armenia,7.84,11.0,1.0,6.605882,17.540000,7.223433,10.690000,...,76.50,76.50,78.50,79.50,79.50,81.0,81.0,81.0,82.50,82.50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1970,VEN,,VE,"Venezuela, RB",7.19,13.0,1.0,6.602003,17.553191,9.827430,1.133333,...,,,,,,,,,,
1970,VNM,,VN,Vietnam,,,,,,,,...,,,,,,,,,,
1970,YEM,,YE,"Yemen, Rep.",,,,,,,,...,,,,,,,,,,
1970,ZMB,,ZM,Zambia,5.33,54.0,3.0,3.448131,28.276353,9.105430,3.783070,...,,,,,,,,,,


In [82]:
data = pd.read_excel(filename, sheet_name = "EFW Panel Data 2022 Report")
data

Unnamed: 0,Year,ISO_Code_2,ISO_Code_3,World Bank Region,"World Bank Current Income Classification, 1990-present (L=Low income, LM=Lower middle income, UM=Upper middle income, H=High income)",Countries,Panel Data Summary Index,Area 1,Area 2,Area 3,Area 4,Area 5,Standard Deviation of the 5 EFW Areas
0,2020,AL,ALB,Europe & Central Asia,UM,Albania,7.640000,7.817077,5.260351,9.788269,8.222499,7.112958,1.652742
1,2020,DZ,DZA,Middle East & North Africa,LM,Algeria,5.120000,4.409943,4.131760,7.630287,3.639507,5.778953,1.613103
2,2020,AO,AGO,Sub-Saharan Africa,LM,Angola,5.910000,8.133385,3.705161,6.087996,5.373190,6.227545,1.598854
3,2020,AR,ARG,Latin America & the Caribbean,UM,Argentina,4.870000,6.483768,4.796454,4.516018,3.086907,5.490538,1.254924
4,2020,AM,ARM,Europe & Central Asia,UM,Armenia,7.840000,7.975292,6.236215,9.553009,7.692708,7.756333,1.178292
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4450,1970,VE,VEN,Latin America & the Caribbean,,"Venezuela, RB",7.242943,8.349529,5.003088,9.621851,7.895993,5.209592,2.028426
4451,1970,VN,VNM,East Asia & Pacific,,Vietnam,,,,,,,
4452,1970,YE,YEM,Middle East & North Africa,,"Yemen, Rep.",,,,,,,
4453,1970,ZM,ZMB,Sub-Saharan Africa,,Zambia,4.498763,5.374545,4.472812,5.137395,,5.307952,0.412514


In [85]:
rename = {"Panel Data Summary Index": "Summary",
         "Area 1":"Size of Government",
         "Area 2":"Legal System and Property Rights",
         "Area 3":"Sound Money",
         "Area 4":"Freedom to Trade Internationally",
         "Area 5":"Regulation"}
data = data.rename(columns = rename)
data

TypeError: 'dict' object is not callable