In [154]:
# functions
from collections import Counter
import linear_algebra as la
import math

# DESCRIPTIVE STATISTICS
def mean(vec):
    """Return the arithmetic mean of the values in ``vec``.

    Raises ZeroDivisionError when ``vec`` is empty.
    """
    total = sum(vec)
    count = len(vec)
    return total / count

def median(vec):
    """Return the median of the values in ``vec``.

    The input is not modified; a sorted copy is used. For an odd number of
    values the middle value is returned; for an even number the two middle
    values are averaged.

    Raises IndexError when ``vec`` is empty.
    """
    # sorted() returns a new list, so its result has to be kept.
    ordered = sorted(vec)
    size = len(ordered)
    middle = size // 2
    if size % 2 != 0:
        # Odd length: `middle` is exactly the central index.
        return ordered[middle]
    # Even length: average the two values around the centre.
    return (ordered[middle] + ordered[middle - 1]) / 2

def mode(vec):
    """Return a list with every value that occurs most often in ``vec``.

    Ties are all returned, in the order the Counter yields them.
    Raises ValueError when ``vec`` is empty (max() of an empty sequence).
    """
    tally = Counter(vec)
    highest = max(tally.values())
    winners = []
    for value, occurrences in tally.items():
        if occurrences == highest:
            winners.append(value)
    return winners

def data_range(vec):
    """Return the range of ``vec``: the largest value minus the smallest.

    Raises ValueError when ``vec`` is empty (max()/min() of an empty sequence).
    """
    return max(vec) - min(vec)

def deviation(vec):
    """Return a list with each value's signed distance from the mean of ``vec``."""
    centre = mean(vec)
    offsets = []
    for value in vec:
        offsets.append(value - centre)
    return offsets

def squared_deviation(vec):
    """Return a list with each value's squared distance from the mean of ``vec``."""
    centre = mean(vec)
    squares = []
    for value in vec:
        squares.append((value - centre) ** 2)
    return squares

def variance(vec):
    """Return the population variance of ``vec``: the mean of its squared deviations."""
    squares = squared_deviation(vec)
    return mean(squares)

def standard_deviation(vec):
    """Return the population standard deviation: the square root of ``variance(vec)``."""
    spread = variance(vec)
    return math.sqrt(spread)

def sample_standard_deviation(vec):
    """Return the sample standard deviation of ``vec``.

    The sum of squared deviations is divided by ``len(vec) - 1`` before the
    square root is taken, so a single-element ``vec`` raises ZeroDivisionError.
    """
    total_squares = sum(squared_deviation(vec))
    degrees_of_freedom = len(vec) - 1
    return math.sqrt(total_squares / degrees_of_freedom)

def covariance(vec1, vec2):
    """Return the covariance of ``vec1`` and ``vec2`` using an ``n - 1`` divisor.

    The deviations of both vectors are combined with ``la.dot_product`` from
    the local ``linear_algebra`` module (presumably the sum of element-wise
    products -- verify against that module). ``n`` is taken from ``vec1``.
    """
    degrees_of_freedom = len(vec1) - 1
    cross_products = la.dot_product(deviation(vec1), deviation(vec2))
    return cross_products / degrees_of_freedom

def standard_error(sample_size, pop_standard_deviation):
    """Return the standard error: ``pop_standard_deviation / sqrt(sample_size)``."""
    root_n = math.sqrt(sample_size)
    return pop_standard_deviation / root_n

def z_score_sample(value, pop_mean, standard_error):
    """Return how many standard errors ``value`` lies from ``pop_mean``.

    Note: the ``standard_error`` parameter is a number and shadows the
    module-level function of the same name inside this body.
    """
    distance = value - pop_mean
    return distance / standard_error

def z_score_pop(value, pop_vec):
    """Return the z-score of ``value`` within the population ``pop_vec``.

    Uses the mean and the population standard deviation of ``pop_vec``.
    """
    distance = value - mean(pop_vec)
    spread = standard_deviation(pop_vec)
    return distance / spread
    
# INFERENTIAL STATISTICS

## Estimation

def margin_error(z_or_t_score, sample_size, std_dev):
    """Return the margin of error: ``z_or_t_score * std_dev / sqrt(sample_size)``."""
    std_err = std_dev / math.sqrt(sample_size)
    return z_or_t_score * std_err

def confidence_interval(z_or_t_score, mean, std_dev, sample_size):
    """Return ``(lower, upper)``: ``mean`` minus/plus the margin of error.

    Note: the ``mean`` parameter is a number and shadows the module-level
    ``mean`` function inside this body.
    """
    half_width = margin_error(z_or_t_score, sample_size, std_dev)
    return (mean - half_width, mean + half_width)

## t Tests
def t_statistic_(xbar, value, sample_std, sample_size):
    """Return the t statistic from summary numbers.

    Computes ``(xbar - value)`` divided by the standard error obtained from
    ``sample_std`` and ``sample_size``.
    """
    gap = xbar - value
    std_err = standard_error(sample_size, sample_std)
    return gap / std_err

def t_statistic(value, sample):
    """Return the one-sample t statistic of ``sample`` against ``value``.

    The sample mean, sample standard deviation and size are derived from
    ``sample`` and handed to ``t_statistic_``.
    """
    return t_statistic_(
        mean(sample),
        value,
        sample_standard_deviation(sample),
        len(sample),
    )

def t_statistic_paired(sample1, sample2, value):
    """Return the t statistic for two paired samples against ``value``.

    The point estimate is the difference of the two sample means, the spread
    is the sample standard deviation of the pairwise differences, and the
    sample size is taken from ``sample1``.
    """
    pair_count = len(sample1)
    mean_gap = point_estimate(sample1, sample2)
    diff_spread = std_dev_difference_sample(sample1, sample2)
    return t_statistic_(mean_gap, value, diff_spread, pair_count)

def point_estimate(vec1, vec2):
    """Return the mean of ``vec1`` minus the mean of ``vec2``."""
    first_mean = mean(vec1)
    second_mean = mean(vec2)
    return first_mean - second_mean

def cohens_d(mean, value, std_dev):
    """Return Cohen's d: ``(mean - value)`` expressed in units of ``std_dev``.

    Note: the ``mean`` parameter is a number and shadows the module-level
    ``mean`` function inside this body.
    """
    effect = mean - value
    return effect / std_dev

def cohens_d_paired(sample1, sample2, value):
    """Return Cohen's d for paired samples against ``value``.

    Uses the mean of the pairwise differences and their sample standard
    deviation.
    """
    mean_of_diffs = mean(difference(sample1, sample2))
    spread_of_diffs = std_dev_difference_sample(sample1, sample2)
    return cohens_d(mean_of_diffs, value, spread_of_diffs)

def z_scores_sample_interval(interval, pop_mean, std_error):
    """Return the z-scores of both ends of ``interval`` as a ``(z1, z2)`` tuple.

    Only ``interval[0]`` and ``interval[1]`` are read.
    """
    lower_z = z_score_sample(interval[0], pop_mean, std_error)
    upper_z = z_score_sample(interval[1], pop_mean, std_error)
    return lower_z, upper_z
    
def std_dev_difference(vec1, vec2):
    """Return the population standard deviation of the pairwise differences."""
    return standard_deviation(difference(vec1, vec2))

def std_dev_difference_sample(vec1, vec2):
    """Return the sample standard deviation of the pairwise differences."""
    return sample_standard_deviation(difference(vec1, vec2))

def difference(vec1, vec2):
    """Return the element-wise difference ``vec1[i] - vec2[i]`` as a list.

    Both inputs must have the same length: the values are treated as pairs,
    so a length mismatch would otherwise either fail mid-way or silently
    ignore trailing values of the longer input.

    Raises ValueError when the lengths differ.
    """
    if len(vec1) != len(vec2):
        raise ValueError(
            "difference() needs inputs of equal length, got %d and %d"
            % (len(vec1), len(vec2))
        )
    return [first - second for first, second in zip(vec1, vec2)]

def std_error_difference(vec1, vec2, n):
    """Return the standard error of the pairwise differences for sample size ``n``.

    The spread is the sample standard deviation of ``vec1 - vec2``; ``n`` is
    supplied by the caller rather than derived from the vectors.
    """
    return standard_error(n, std_dev_difference_sample(vec1, vec2))

def r_squared(t, degrees_of_freedom):
    """Return ``t**2 / (t**2 + degrees_of_freedom)``."""
    squared = pow(t, 2)
    denominator = squared + degrees_of_freedom
    return squared / denominator

def correlation(vec1, vec2):
    """Return the Pearson correlation coefficient of ``vec1`` and ``vec2``.

    ``covariance`` divides by ``n - 1``, so it is normalised by the *sample*
    standard deviations (which also divide by ``n - 1``). Mixing it with the
    population standard deviations would inflate the result by ``n / (n - 1)``
    and allow values outside ``[-1, 1]``.

    Returns 0 when either vector has no spread.
    """
    # The population standard deviation is used only for the zero-spread
    # check: it is defined for a single-element vector, whereas the sample
    # version divides by zero there.
    if standard_deviation(vec1) > 0 and standard_deviation(vec2) > 0:
        stdev_1 = sample_standard_deviation(vec1)
        stdev_2 = sample_standard_deviation(vec2)
        return covariance(vec1, vec2) / stdev_1 / stdev_2
    # If there is no spread, the correlation is reported as zero.
    return 0
    
class ReportResult(object):
    """Plain-text report for a one-sample t-test on ``sample``.

    Parameters
    ----------
    sample : sequence of numbers
        Observations the report describes.
    alpha : number
        Printed verbatim on the "Alpha Level" line, followed by ``%``.
    p_value_for_alpha : number
        Passed to ``confidence_interval`` as its ``z_or_t_score`` multiplier
        and printed on the "P-value" line.
        NOTE(review): the confidence-interval usage suggests this is a
        critical t value rather than a p-value -- confirm and consider
        renaming the parameter and the printed label.
    null_mean : number, optional
        Hypothesised population mean the sample mean is tested against.
        When omitted, ``p_value_for_alpha`` is used as the reference value,
        which is what this class did before ``null_mean`` existed; that
        fallback is kept only so existing callers see unchanged numbers.
    """

    def __init__(self, sample, alpha, p_value_for_alpha, null_mean=None):
        self.__sample = sample
        self.__alpha = alpha
        self.__p_value_for_alpha = p_value_for_alpha
        self.__null_mean = null_mean

    def get_mean(self):
        """Return the mean of the sample."""
        return mean(self.__sample)

    def get_std_dev(self):
        """Return the sample standard deviation (``n - 1`` divisor)."""
        return sample_standard_deviation(self.__sample)

    def get_t_statistic(self):
        """Return the t statistic of the sample against the reference mean."""
        reference = self.__null_mean
        if reference is None:
            # Legacy behaviour: no hypothesised mean was supplied, so fall
            # back to the value older callers relied on.
            reference = self.__p_value_for_alpha
        return t_statistic(reference, self.__sample)

    def get_ci(self):
        """Return the ``(lower, upper)`` confidence interval around the sample mean."""
        return confidence_interval(
            self.__p_value_for_alpha,
            self.get_mean(),
            self.get_std_dev(),
            len(self.__sample),
        )

    def show(self):
        """Print the descriptive and inferential sections of the report."""
        sample_size = len(self.__sample)
        print('\n--------- Report Results --------- ')
        print('\n1 - Descriptive Statistics')
        print('Sample Size: ' , str(sample_size))
        print('Mean: ' , str(self.get_mean()))
        print('Standard Deviation: ' , str(self.get_std_dev()))
        print('\n2 - Inferential Statistics')
        print('Alpha Level: ' , str(self.__alpha) , '%')
        print('Test Type: t-Test')
        if self.__null_mean is not None:
            # Only shown when a hypothesised mean was given, so reports
            # produced by existing callers keep their exact layout.
            print('Null Hypothesis Mean: ' , str(self.__null_mean))
        print('T Statistic: ' , str(self.get_t_statistic()))
        print('Degrees of Freedom: ' , str(sample_size - 1))
        print('P-value: ' , str(self.__p_value_for_alpha))
        print('Test Direction: Bi-Directional')
        print('Confidence Interval: ' , str(self.get_ci()))
        

In [103]:
# BELOW ARE THE EXERCISE CODES

In [104]:
# Standard error for samples of size 20 with a population standard deviation of 0.64.
standard_error(20, 0.64)

0.14310835055998652

In [105]:
# z-score of the value 8.94 against a population mean of 7.5, using the
# standard error for n = 20 and a standard deviation of 0.64.
z_score_sample(8.94,7.5,standard_error(20, 0.64))

10.06230589874905

In [106]:
# z-score of the value 8.35 against a population mean of 8.2, using the
# standard error for n = 20 and a standard deviation of 0.73.
z_score_sample(8.35,8.2,standard_error(20, 0.73))

0.918932045547861

In [107]:
# Standard error for samples of size 25 with a standard deviation of 10.
standard_error(25, 10)

2.0

In [108]:
# Margin of error for a score of 1.95, sample size 25 and standard deviation 10.
margin_error(1.95,25,10)

3.9

In [109]:
# Standard error for samples of size 9 with a standard deviation of 18.
standard_error(9,18)

6.0

In [110]:
# z-score of the value 7.13 against a population mean of 7.5, using the
# standard error for n = 20 and a standard deviation of 0.64.
z_score_sample(7.13,7.5, standard_error(20,0.64))

-2.5854535989841327

In [111]:
# z-score of the value 8.3 against a population mean of 7.47, using the
# standard error for n = 30 and a standard deviation of 2.41.
z_score_sample(8.3,7.47, standard_error(30,2.41))

1.8863473972169642

In [112]:
# z-score of a hypothesised mean (8.3) relative to the population mean,
# measured in standard errors for samples of size 50.
pop_mean = 7.47
pop_std = 2.41
hip_mean = 8.3
sample_size = 50
std_error = standard_error(sample_size, pop_std)
where_hip_mean_falls_in_samples_mean = z_score_sample(hip_mean,pop_mean, std_error)
print("where hiposys mean falls in samples mean: ", str(where_hip_mean_falls_in_samples_mean))

where hiposys mean falls in samples mean:  2.4352640181943364


In [113]:
# Same computation as the previous cell with the hypothesised mean changed to 7.8.
# NOTE(review): this cell is a copy of the one above with one value edited;
# a small helper taking the four inputs would avoid the duplication.
pop_mean = 7.47
pop_std = 2.41
hip_mean = 7.8
sample_size = 50
std_error = standard_error(sample_size, pop_std)
where_hip_mean_falls_in_samples_mean = z_score_sample(hip_mean,pop_mean, std_error)
print("where hiposys mean falls in samples mean: ", str(where_hip_mean_falls_in_samples_mean))

where hiposys mean falls in samples mean:  0.9682375012097956


In [114]:
# Standard error for samples of size 36 with a standard deviation of 6.
standard_error(36,6)

1.0

In [115]:
# z-score of the value 28 against a population mean of 25, using the
# standard error for n = 36 and a standard deviation of 6.
z_score_sample(28,25,standard_error(36,6))

3.0

In [116]:
# Standard error for samples of size 16 with a standard deviation of 0.36.
standard_error(16,0.36)

0.09

In [117]:
# z-score of the value 22.793 against a population mean of 22.965; the 0.09
# is the standard error shown as the output of the previous cell.
z_score_sample(22.793,22.965, 0.09)

-1.9111111111111179

In [118]:
# z-score of a hypothesised mean (9640) relative to the population mean (7895),
# measured in standard errors for samples of size 5.
pop_mean = 7895
pop_std = 230
hip_mean = 9640
sample_size = 5
std_error = standard_error(sample_size, pop_std)
where_hip_mean_falls_in_samples_mean = z_score_sample(hip_mean,pop_mean, std_error)
print("where hiposys mean falls in samples mean: ", str(where_hip_mean_falls_in_samples_mean))

where hiposys mean falls in samples mean:  16.964950524944058


In [119]:
#LESSON 6 QUIZ 15
finches = [6.5,6.2,6.6,6,6.6,6.1,6.2,6.3,6.2,6,6.7,7.2,6.6,7,6.4,6.4,6,6.5,6.2,6.5,5.8,7.2,6.5,6.2,6.1,6.1,6.7,7.3,6.4,6.1,6.4,6.2,6.1,6.6,6.4,7.2,6.4,6.4,6.7,6.7,6.4,6,6.1,6,7.1,6.6,7,6.2,6.6,6.6,6.7,6.3,6.7,6.8,6.1,6.2,6,6.7,5.9,7.1,6.4,6.4,7.6,6.4,5.9,6.5,6.7,6.4,6.9,7,6.7,7,5.8,6.5,6.2,6.7,6.2,5.8,6.1,6.9,6.9,6.8,6,7.2,6.2,7,6.5,6.8,6,6.9,6.6,7,6.1,6.8,6.4,6.6,6.5,6.7,6.4,5.9,6.6,6.5,6.5,6.7,6,6.5,7.3,6.6,6.2,6.4,6.2,6.4,6.7,7,5.8,6.8,6.7,6.6,6.1,6.7,6.3,7,6.7,6.8,6.5,6.5,6.3,6.6,7.1,6.7,6.8,6.9,6,6.7,6.9,6.3,6.6,5.8,6.3,5.7,6.4,7,6.4,6.3,6.5,6.8,6.4,6.4,6.3,5.9,6.9,5.8,6.3,6.2,6.4,6.8,6.5,6.2,6.5,5.9,6,5.8,7.3,6.3,6.4,6.5,6.4,6,6.9,6.7,6.8,7,6.1,6.7,6.5,6.4,6.2,6.1,6.1,6.9,6.7,6.8,6.1,6.5,6.4,5.4,5.9,6.5,6.4,6.1,6.7,6.8,6.4,6.7,6.2,6,6.6,6.2,6.5,6.6,5.9,6,6.4,6.5,6.5,5.7,5.9,6.5,6.8,6.5,6.3,6.5,7,7.1,6,6.3,6.6,6.5,6,6.2,6.7,6.5,6.8,6.4,6,6.3,6.6,6.9,7.1,6.9,7.2,6.7,6.2,6.3,6.7,6.5,6.5,6.2,6.7,6.3,6.6,6.3,6.5,6.3,6.5,7,7,6.6,6.8,6.9,6.6,6.4,5.7,6.5,7,7,6,5.8,6.2,6.5,6.5,6.7,6.9,6.9,6.7,7.1,6.3,6,6,6.3,6.9,6.6,5.7,6.9,7,6.9,7.4,6.7,6.9,6.4,6.6,6.4,6.1,6.2,6.3,6.3,6.8,6.1,7.1,7.1,6.3,6,6.7,7.1,6.5,6.1,6,6.5,6.3,6.2,5.8,6.5,7,6.6,6.2,6.1,6.7,6.2,6.3,6,5.7,6.9,6.9,6.1,7,7.5,6,6.5,6.5,6.4,6.7,6.4,7.3,6.7,6,6,6.7,6.3,6.3,6.6,5.3,6.5,6.2,7.1,7,6.4,7.2,6,6.5,6.3,5.8,5.9,6.5,6.8,5.7,6.3,6.1,6.1,6.8,6.5,6.8,6.3,6.8,7.1,6.1,6.2,6.4,6.2,6.1,7.3,6.1,6.4,6.8,6.5,5.8,6.9,6.5,6.2,6.7,6.3,6.3,7.2,6.3,6.5,5.9,7.1,5.9,6.1,5.8,6.1,6.9,5.8,6.4,6.2,6.9,6.5,7,6.6,6.2,6.7,6,6.3,5.9,6.6,6.4,6.1,6.4,6.5,6.2,7.2,6.3,6.8,6.4,5.6,6.4,6.4,6.4,7,6.4,6.4,6.1,7.2,6.3,5.3,6.6,6.1,6,6.4,6.5,6,6.4,6.1,6.5,6.6,6.4,7,6.9,6.5,6.1,7.1,6.5,6.4,6.1,6,6,7.1,6,6,6.7,6.7,6.6,6.5,6.2,6.5,6.9,5.9,6.4,6.6,7,6.3,6,6.9,7.6,6.6,6,7,6.7,6,6.4,6.9,6,6.4,7.5,6.6,5.9,6.7,6.3,6.7,7.1,7.4,7,6.9,6.5,7.2,5.9,5.9,7.1,6.2,7.3,6.4,7,6.6,6.2,6.2,6.3,6.6,5.8,6.7,6,6.7,6.9,6.4,6.6,6.1,6,5.9,7,6.3,6.8,6.7]
# Do finches today have different-sized beak widths than before?
# Compare the earlier mean (6.07) with the mean of the `finches` sample.
mean_from_years_before = 6.07
finches_mean = mean(finches)
print('Mean: ', str(mean_from_years_before), ' Xbar: ', str(finches_mean))

Mean:  6.07  Xbar:  6.4696000000000025


In [120]:
#LESSON 6 QUIZ 16
# Sample size of `finches` and the degrees of freedom (n - 1).
sample_size = len(finches)
degrees_of_freedom = sample_size -1
print('Sample Size: ', str(sample_size), ' Degrees of Freedom: ', str(degrees_of_freedom))

Sample Size:  500  Degrees of Freedom:  499


In [121]:
#LESSON 6 QUIZ 17
# Sample mean (reused from the earlier cell) and sample standard deviation of `finches`.
xbar = finches_mean
sample_std = sample_standard_deviation(finches)
print('Xbar: ', str(xbar), 'Sample Std: ', str(sample_std))

Xbar:  6.4696000000000025 Sample Std:  0.39611823906333427


In [122]:
#LESSON 6 QUIZ 18
# One-sample t statistic of `finches` against the earlier mean.
t = t_statistic(mean_from_years_before, finches)
print('t: ', str(t))

t:  22.557223467462105


In [123]:
#LESSON 6 QUIZ 18
# Critical t value used as the rejection threshold for the `t` computed in
# the previous cell.
t_95perct_499df= 1.96472941
# Two-sided test: reject only when t lies beyond the critical value on either
# side, i.e. t > +critical or t < -critical. Comparing against the same
# positive value on both sides would be true for almost any t.
reject_null = abs(t) > t_95perct_499df
print('Do we reject de null? ', str(reject_null))

Do we reject de null?  True


In [124]:
#LESSON 6 QUIZ 18
# One-sample t statistic of `nro_sample` against the value m = 10.
nro_sample = [5,19,11,23,12,7,3,21]
m = 10
# `sig` is assigned but not used in this cell.
sig = 0.05
t = t_statistic(m, nro_sample)
print('t: ', str(t))

t:  0.977461894333816


In [125]:
#LESSON 6 QUIZ 24
# t statistic from summary numbers: `m` (1700) is passed as the sample mean
# and `mean_rent` (1830) as the value it is compared against, with sample
# standard deviation 200 and sample size 25.
m = 1700
s = 200
sample_size = 25
mean_rent = 1830
t = t_statistic_(m,mean_rent,s,sample_size)
print('t: ', str(t))

t:  -3.25


In [126]:
#LESSON 6 QUIZ 26
# Cohen's d for the same numbers; `m`, `mean_rent` and `s` come from the previous cell.
cd = cohens_d(m,mean_rent, s)
print('cohens d : ', str(cd))

cohens d :  -0.65


In [127]:
#LESSON 6 QUIZ 28
# Confidence interval around `m` with 2.064 passed as the t score;
# `m`, `s` and `sample_size` come from the QUIZ 24 cell.
confidence_interval(2.064,m, s, sample_size)

(1617.44, 1782.56)

In [128]:
#LESSON 6 QUIZ 30
# NOTE(review): margin_error takes (z_or_t_score, sample_size, std_dev), so
# here `s` (200, used as a standard deviation in the QUIZ 24 cell) is passed
# as the sample size and 100 as the standard deviation -- confirm that this
# argument order is intended.
me = margin_error(1.984,s,100)
print('Margin Error: ', str(me))

Margin Error:  14.028998538741101


In [129]:
# keyboards
# Paired comparison of two equally long lists of measurements (25 values each).
qwerty = [6,6,2,7,8,8,2,3,5,7,10,5,4,7,5,7,4,5,2,5,3,4,4,4,4]
abcd = [6,11,8,5,11,8,10,7,4,3,7,6,10,10,6,5,10,11,13,8,5,11,7,8,5]
mean_qwerty = mean(qwerty)
mean_abcd = mean(abcd)
print('xbar qwerty: ', str(mean_qwerty))
print('xbar abcd: ', str(mean_abcd))

# Difference of the two sample means (qwerty minus abcd).
pe = point_estimate(qwerty,abcd)
print('Point Estimate: ' , str(pe))

# Sample standard deviation of the pairwise differences.
std_dev_diff = std_dev_difference_sample(qwerty, abcd)
print('Standard Deviation Difference: ' , str(std_dev_diff))

# Paired t statistic against a reference difference of 0.
t = t_statistic_paired(qwerty,abcd,0)
print('T Statitic: ', str(t))

# Cohen's d of the pairwise differences against 0.
cd = cohens_d_paired(qwerty,abcd,0)
print('Cohens D: ', str(cd))

# Confidence interval around the mean pairwise difference; 2.064 is passed as
# the t score (presumably the two-sided 95% critical value for 24 degrees of
# freedom -- confirm).
ci = confidence_interval(2.064,mean(difference(qwerty,abcd)),std_dev_diff,len(qwerty))
print('Cnofidence Interval: ', str(ci))

xbar qwerty:  5.08
xbar abcd:  7.8
Point Estimate:  -2.7199999999999998
Standard Deviation Difference:  3.6914315199752337
T Statitic:  -3.6842075835369266
Cohens D:  -0.7368415167073854
Cnofidence Interval:  (-4.243822931445777, -1.1961770685542237)


In [130]:
# Vocabularies
# Square root of the sum of the squares of 1.2 and 2.7 (presumably two
# standard deviations being combined -- confirm against the exercise).
math.sqrt((1.2**2)+(2.7**2))

2.9546573405388314

In [131]:
# Divides 9 by (previous cell's output / sqrt(1000)); the long literal is the
# value shown as the output of the previous cell.
9/(2.9546573405388314/math.sqrt(1000))

96.32419486019033

In [132]:
## r squared lesson 8
# r squared for t = 2 with 20 degrees of freedom.
r_squared(2,20)

0.16666666666666666

In [133]:
# Standard error for samples of size 25 with a standard deviation of 50.
standard_error(25,50)

10.0

In [134]:
# t statistic for a sample mean of 151 against 126, with sample standard
# deviation 50 and sample size 25.
t_statistic_(151,126,50,25)

2.5

In [135]:
# Cohen's d for a mean of 151 against 126 with a standard deviation of 50.
cohens_d(151,126,50)

0.5

In [136]:
# r squared from the t statistic above, with 24 passed as the degrees of freedom.
r_squared(t_statistic_(151,126,50,25), 24)

0.2066115702479339

In [138]:
# Margin of error for a score of 2.71, sample size 25 and standard deviation 50.
margin_error(2.71,25,50)

27.1

In [140]:
# NOTE(review): margin_error takes (z_or_t_score, sample_size, std_dev), so
# here the standard error (10.0) is passed as the sample size and 25 as the
# standard deviation -- confirm that this argument order is intended.
margin_error(t_statistic_(151,126,50,25),standard_error(25,50),25)

19.76423537605237

In [155]:
# Report for a 25-value sample (the same values as the `abcd` list in the
# keyboards cell).
# NOTE(review): ReportResult uses the third argument (1.61) both as the
# reference value for the t statistic and as the multiplier for the
# confidence interval, and prints 95 as the alpha level -- confirm these are
# the intended inputs.
sample = [6,11,8,5,11,8,10,7,4,3,7,6,10,10,6,5,10,11,13,8,5,11,7,8,5]
report = ReportResult(sample, 95, 1.61)
report.show()


--------- Report Results --------- 

1 - Descriptive Statistics
Sample Size:  25
Mean:  7.8
Standard Deviation:  2.6457513110645903

2 - Inferential Statistics
Alpha Level:  95 %
Test Type: t-Test
T Statistic:  11.698000439635583
Degrees of Freedom:  24
P-value:  1.61
Test Direction: Bi-Directional
Confidence Interval:  (6.948068077837202, 8.651931922162799)
