# Inerval Estimation of the proportion

In [1]:
import numpy as np
import scipy.stats as st

In [2]:
n_yes = 55
n_total = 100

p = n_yes / n_total

SE = np.sqrt(p * (1 - p) / n_total)

st.norm.interval(0.95, loc=p, scale=SE)


(0.45249302291006066, 0.6475069770899394)

In [3]:
yes = 550
total = 1000
p = yes / total

# SE of proportion
se = np.sqrt(p * (1 - p) / total)

st.norm.interval(0.95, loc= p, scale=se)

(0.5191655864637935, 0.5808344135362066)

# Correlation

In [4]:
import pandas as pd
import numpy as np
import scipy.stats as st
import os

In [5]:
df = pd.read_csv('data_Iris.csv', header='infer')

In [6]:
df.head(3)

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa


In [7]:
df.shape

(150, 5)

In [8]:
x = df['Petal.Length']
y = df['Sepal.Length']

# Pearson

In [9]:
np.round(st.pearsonr(x, y), 3)

array([0.872, 0.   ])

In [10]:
x.corr(y)

0.8717541573048718

In [11]:
np.round(df[['Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width']].corr(), 3)

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width
Sepal.Length,1.0,-0.109,0.872,0.818
Sepal.Width,-0.109,1.0,-0.421,-0.357
Petal.Length,0.872,-0.421,1.0,0.963
Petal.Width,0.818,-0.357,0.963,1.0


# Spearman

In [12]:
np.round(st.spearmanr(x, y), 3)

array([0.881, 0.   ])

# Kendall

In [13]:
np.round(st.kendalltau(x, y), 3)

array([0.718, 0.   ])

In [14]:
df = pd.read_csv('data_studentlist.csv', header='infer')


In [15]:
df.head(3)

Unnamed: 0,name,gender,age,grade,absence,bloodtype,height,weight
0,Jared Diamond,M,23,3,Y,O,165.3,68.2
1,Sarah O'Donnel,F,22,2,N,AB,170.1,53.0
2,Brian Martin,M,24,4,N,B,175.0,80.1


In [16]:
x = df.height
y = df.weight

In [17]:
n = len(x)
r = x.corr(y)
z = np.arctanh(r)
std_error = 1 / np.sqrt(n-3)

In [18]:
{
    "Low": np.tanh(z - st.norm.ppf(0.975) * std_error), 
    "High": np.tanh(z + st.norm.ppf(0.995) * std_error)
}

{'Low': 0.26960396330592223, 'High': 0.9030812794020874}

# One sample t-test

In [19]:
n = 10
mu = 3
sigma = 2

np.random.seed(1234)
x = np.random.randn(n) * sigma + mu

two tail test:

In [20]:
mu_0 = 0
st.ttest_1samp(x, mu_0)

TtestResult(statistic=3.8682023385794677, pvalue=0.0037991794351627216, df=9)

In [21]:
# using the formula from the lecture
# compare the mean woth zero

mu_0 = 0
x_mean = x.mean()
SEM = x.std(ddof=1) / np.sqrt(n)
p_value = (1 - st.t.cdf((x_mean - mu_0) / SEM, df = n-1)) * 2
p_value


0.003799179435162836

In [22]:
# using the formula from the lecture
# compare the mean woth 1

mu_0 = 1
p_value = (1 - st.t.cdf((x_mean - mu_0) / SEM, df = n-1)) * 2
p_value


0.03722962101247096

In [23]:
# using the formula from the lecture
# compare the mean woth 2

mu_0 = 2
p_value = (1 - st.t.cdf((x_mean - mu_0) / SEM, df = n-1)) * 2
p_value

0.3360674694425527

Right tail test

In [24]:
# using the formula from the lecture
# compare the mean woth zero

mu_0 = 0
p_value = (1 - st.t.cdf((x_mean - mu_0) / SEM, df = n-1))
p_value

0.001899589717581418

In [25]:
# using the formula from the lecture
# compare the mean woth 1

mu_0 = 1
p_value = (1 - st.t.cdf((x_mean - mu_0) / SEM, df = n-1))
p_value

0.01861481050623548

In [26]:
# using the formula from the lecture
# compare the mean woth 2

mu_0 = 2
p_value = (1 - st.t.cdf((x_mean - mu_0) / SEM, df = n-1))
p_value

0.16803373472127636

Left tail test

In [27]:
# using the formula from the lecture
# compare the mean woth zero

mu_0 = 0
p_value = st.t.cdf((x_mean - mu_0) / SEM, df = n-1)
p_value

0.9981004102824186

In [28]:
# using the formula from the lecture
# compare the mean woth 1

mu_0 = 1
p_value = st.t.cdf((x_mean - mu_0) / SEM, df = n-1)
p_value

0.9813851894937645

In [29]:
# using the formula from the lecture
# compare the mean woth 2

mu_0 = 2
p_value = st.t.cdf((x_mean - mu_0) / SEM, df = n-1)
p_value

0.8319662652787236

Independent two sample t-test

In [30]:
n1 = 10
n2 = 20
mu1 = 3
mu2 = 5
sigma1 = 2
sigma2 = 4

x1 = np.random.randn(n1)* sigma1 + mu1
x2 = np.random.randn(n2)* sigma2 + mu2

In [31]:
st.ttest_ind(x1, x2, equal_var=False)


TtestResult(statistic=-2.786689415148105, pvalue=0.009768625607692464, df=26.230953494301154)

In [32]:
x1_mean = x1.mean()
x2_mean = x2.mean()

s1 = x1.std(ddof=1)
s2 = x2.std(ddof=1)

st.ttest_ind_from_stats(x1_mean, s1, n1, x2_mean, s2, n2, equal_var=False)

Ttest_indResult(statistic=-2.786689415148105, pvalue=0.009768625607692467)

Paired two sample test

In [33]:
# Simulate two samples with Normal distribution.
# Notice that the x2 is created by adding some 'noise' to the x1.
# So,'one to one' relation is established.

n = 10
x1 = np.random.randn(n) * 5 + 3
x2 = x1 + np.random.randn(n) * 1 + 1

In [34]:
# Using the ttest_rel() function from the SciPy library. 
# Two tail test.
st.ttest_rel(x1, x2)

TtestResult(statistic=-3.2467018903823366, pvalue=0.0100502680325701, df=9)

In [35]:
# Compare with the indepdent two sample t-test on the same dataset. 
# The p-value is completely diffetent!
st.ttest_ind(x1, x2, equal_var=False)

TtestResult(statistic=-0.523719559914853, pvalue=0.6069034633660737, df=17.88383285037342)

# Annova

In [36]:
x1 = np.random.randint(50, 100, 7)
x2 = np.random.randint(70, 85, 7)
x3 = np.random.randint(60, 90, 7)

In [37]:
# Using the f_onway() function from the SciPy library.
res = st.f_oneway(x1, x2, x3)
print("Test statistic: %f " % res[0])
print("P value: %f " % res[1])

Test statistic: 1.646161 
P value: 0.220519 


# Summarizing the categorical values

In [38]:
import pandas as pd
import numpy as np 
import os

In [39]:
df = pd.read_csv('data_studentlist.csv', header='infer')

Frequency table

In [40]:
bloodType = df['bloodtype']
gender = df['gender']

In [41]:
bloodType.value_counts()

bloodtype
O     5
B     5
A     4
AB    3
Name: count, dtype: int64

In [42]:
gender.value_counts()

gender
M    10
F     7
Name: count, dtype: int64

In [43]:
bloodType.value_counts() / bloodType.size

bloodtype
O     0.294118
B     0.294118
A     0.235294
AB    0.176471
Name: count, dtype: float64

In [44]:
gender.value_counts() / gender.size

gender
M    0.588235
F    0.411765
Name: count, dtype: float64

Contingency table

In [45]:
# Contingency table.
pd.crosstab(bloodType, gender)

gender,F,M
bloodtype,Unnamed: 1_level_1,Unnamed: 2_level_1
A,2,2
AB,1,2
B,1,4
O,3,2


In [46]:
# Contingency table with marginalization.
pd.crosstab(bloodType, gender, margins=True)

gender,F,M,All
bloodtype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,2,2,4
AB,1,2,3
B,1,4,5
O,3,2,5
All,7,10,17


# CHI SQUARE

In [47]:
data = np.array([220, 215, 93, 23])
eye = pd.Series(data, index=['Brown','Blue','Hazel','Green'])
eye

Brown    220
Blue     215
Hazel     93
Green     23
dtype: int64

In [48]:
eye / eye.sum()

Brown    0.399274
Blue     0.390200
Hazel    0.168784
Green    0.041742
dtype: float64

In [49]:
# Compare with the default model of equal frequencies.
# We conclide that the data and the provided model do not coincide.
st.chisquare(eye)

Power_divergenceResult(statistic=202.56079854809437, pvalue=1.1798521738452924e-43)

In [50]:
# Compare with the model provide by the user.
# Again, this model corresponds to the equal frequencies.
# We conclide that the data and the provided model do not coincide.

model = len(eye) * [np.mean(eye)]
st.chisquare(f_obs=eye, f_exp=model)


Power_divergenceResult(statistic=202.56079854809437, pvalue=1.1798521738452924e-43)

In [51]:
# Compare with the model provide by the user.
# A more realistic model is provided.
# We conclide that the data and the provided model coincide.
model = eye.sum() * np.array([0.40, 0.33,0.15,0.12])
st.chisquare(f_obs=eye, f_exp=model)

Power_divergenceResult(statistic=35.46840455370401, pvalue=9.699714943948286e-08)

For two way table

In [52]:
# Data.
# Row label    = hair color.
# Column label = eye color. 

data_eye = {
    'Brown': [68,119,26,7], 
    'Blue':[20,84,17,94], 
    'Hazel':[15,54,14,10], 
    'Green':[5, 29, 14,16]
}
data = pd.DataFrame(data_eye, index=['Black', 'Brown', 'Red', 'Blonde'])
data

Unnamed: 0,Brown,Blue,Hazel,Green
Black,68,20,15,5
Brown,119,84,54,29
Red,26,17,14,14
Blonde,7,94,10,16


In [53]:
# Test statistic, p-value, degree of freedom, expected frequency table.
# We conclude that the eye color and hair color are not independent.
st.chi2_contingency(data)

Chi2ContingencyResult(statistic=138.28984162600824, pvalue=2.325286787098839e-25, dof=9, expected_freq=array([[ 40.13513514,  39.22297297,  16.96621622,  11.67567568],
       [106.28378378, 103.86824324,  44.92905405,  30.91891892],
       [ 26.38513514,  25.78547297,  11.15371622,   7.67567568],
       [ 47.19594595,  46.12331081,  19.95101351,  13.72972973]]))

In [54]:
data_survived = {
    'Yes': [123, 123, 123, 343],
    'No': [302, 323, 239, 231]
}
data = pd.DataFrame(data_survived, index=['1st', '2nd', '3rd', 'Crew'])
data

Unnamed: 0,Yes,No
1st,123,302
2nd,123,323
3rd,123,239
Crew,343,231


In [55]:
# Test statistic, p-value, degree of freedom, expected frequency table.
# We conclude that the room class and survived are not independent.

st.chi2_contingency(data)

Chi2ContingencyResult(statistic=149.6461684287804, pvalue=3.141187542584634e-32, dof=3, expected_freq=array([[167.45987825, 257.54012175],
       [175.73436635, 270.26563365],
       [142.63641395, 219.36358605],
       [226.16934145, 347.83065855]]))

# CHI SQUARED test for variance

In [56]:
np.random.seed(1234)
n = 20
mu = 10
sigma = 2
x = np.random.randn(n) * sigma + mu
ssq = x.var(ddof=1)

Use the acceptance range

In [57]:
sigma0sq = 2 ** 2

test_start = (n-1) * ssq / sigma0sq
print("Test statistic: %f" % test_start)
# obtain the acceptance range
acceptance_range = {
    'low': st.chi2.ppf(0.025, n-1), 
    'high': st.chi2.ppf(0.975,n-1)
}
acceptance_range

Test statistic: 23.067557


{'low': 8.906516481987971, 'high': 32.85232686172969}

using the p value

In [58]:
p_value = 1 - st.chi2.cdf(test_start, n-1)
p_value

0.23437934469820776

In [59]:
# Left tail
p_value2 = st.chi2.cdf(test_start, n-1)
p_value2

0.7656206553017922

In [60]:
p_value3 = 2 * p_value
p_value3

0.4687586893964155

# F-TEST

In [61]:
np.random.seed(1234)
n1 = 25
n2 = 20
n3 = 30

x1 = np.random.randn(n1) * 2 +10
x2 = np.random.randn(n2) * 4 -5
x3 = np.random.randn(n3) * 2 + 50

ssq_1 = x1.var(ddof=1)
ssq_2 = x2.var(ddof=1)
ssq_3 = x3.var(ddof=1)

In [62]:
# Print out the sample variance
print("s1^2 = %f" % ssq_1)
print("s2^2 = %f" % ssq_2)
print("s3^2 = %f" % ssq_3)

s1^2 = 4.264041
s2^2 = 16.174581
s3^2 = 3.543988


using the p value

In [63]:
test_stat = ssq_2 / ssq_1
p_value = 2 * (1 - st.f.cdf(test_stat, n2-1, n1-1))
p_value

0.002514298584073593

In [64]:
p_val = 1 - st.f.cdf(test_stat, n2-1, n1-1)
p_val

0.0012571492920367966

In [70]:
test_start = ssq_1 / ssq_3
_value = 2 * (1 - st.f.cdf(test_start, n1 - 1, n3 - 1))
_value

0.6291925258409772

In [72]:
p_value = 1 - st.f.cdf(test_start, n1-1, n3-1)
p_value

0.3145962629204886