In [None]:
# Statistical testing
# The Lady Tasting Tea problem was posed by Ronald A. Fisher:
# a lady claims she can tell whether the milk went into the cup first or second.
# How many different selections can the lady make from 8 cups of tea?
# Each cup is identified by its index, 0 through 7.
cups = [*range(8)]
cups

In [None]:
# Picking 4 cups out of 8 at random can be done in 70 different ways,
# so a random selection matches any one particular arrangement 1 time in 70.
# 70 comes from: (8*7*6*5)/(4*3*2*1) = 70

# The itertools package from the Python standard library
# generates every one of those selections explicitly.
import itertools

cups = [cup for cup in range(8)]

# combinations() takes the list (cups) and a number (4) and yields every
# possible way of selecting 4 cups from the list, each exactly once.
poss = [*itertools.combinations(cups, 4)]
poss

In [None]:
# Null Hypothesis
# The null hypothesis is that the test subject cannot tell whether the cup had milk first or second,
# i.e. her selection is no better than a random guess.

# Alternative Hypothesis
# The test subject can tell the difference.
# If she chooses the 4 milk-first cups correctly, a random guess would only
# have achieved this with a 1 in 70 chance, or approx 1.4%.

(1/70)*100

In [None]:
# Exercise 1:
# With 8 cups, randomly selecting the correct 4 milk-first cups happens 1 time in 70, about 1.43%.
# Calculate the minimum number of cups of tea required to ensure the
# probability of randomly selecting the correct cups of tea is less than or equal to 1%

# Bonus: How many would be required if you were to allow the taster to get one cup wrong whilst maintaining the 1% threshold?

# Assumptions: as in the 8-cup experiment, the number of cups is even and exactly
# half of them are milk-first. "One cup wrong" is read as missing one milk-first
# cup (and therefore picking one milk-second cup in its place).
# Note: this cell deliberately does not rebind `cups` or `poss`, so later cells
# keep working with the 8-cup experiment.

import math


def guess_probability(total_cups, allowed_errors = 0):
    """Return the chance that a random guess misses at most `allowed_errors` cups.

    total_cups     -- even number of cups, half of which are milk-first
    allowed_errors -- how many milk-first cups the guess may miss (default 0)
    """
    half = total_cups // 2
    # Missing exactly k milk-first cups means leaving out k of the `half` correct
    # cups and picking k of the `half` incorrect cups instead: C(half, k) ** 2 ways.
    favourable = sum(math.comb(half, k) ** 2 for k in range(allowed_errors + 1))
    return favourable / math.comb(total_cups, half)


def min_cups(threshold, allowed_errors = 0):
    """Return the smallest even number of cups whose guess probability is <= threshold.

    Raises ValueError if threshold is not positive, because the probability of a
    lucky guess never reaches zero and the search would not terminate.
    """
    if threshold <= 0:
        raise ValueError("threshold must be greater than 0")
    total_cups = 2
    while guess_probability(total_cups, allowed_errors) > threshold:
        total_cups += 2
    return total_cups


# Probability that a lucky guess must not exceed (1%)
threshold = 0.01

for allowed_errors in (0, 1):
    needed = min_cups(threshold, allowed_errors)
    chance = 100 * guess_probability(needed, allowed_errors)
    print(f"Cups allowed wrong: {allowed_errors}\tCups required: {needed}\tChance of a lucky guess: {chance: 0.2f}%")

In [None]:
# Distribution
# Pick one selection at random to act as the "true" milk-first cups, then count,
# for every possible 4-cup selection, how many cups it shares with the true set.
import itertools
import random
import seaborn

# Rebuild the 8-cup experiment here so the result does not depend on whatever
# another cell last bound to `cups` and `poss`.
cups = list(range(8))
poss = list(itertools.combinations(cups, 4))

milkfirst = set(random.choice(poss))

# Overlap (0 to 4 cups) between the true set and each of the 70 selections
counts = [len(milkfirst & set(i)) for i in poss]

seaborn.countplot(x = counts)

In [None]:
# Exercise 2: Use scipy's version of Fisher's exact test to simulate the Lady Tasting Tea problem

# https://www.statology.org/fishers-exact-test-python/
# Fisher's Exact Test is used to examine whether or not there is a statistically significant association between two categorical variables.

# Research on reference link TBC soon

In [None]:
# T-tests & simulated data
# Fake data sets can be created with specific properties to investigate numerical methods

# numpy and pandas are imported here because this cell is the first to use them;
# without these imports the cell raises NameError on a fresh kernel.
import numpy as np
import pandas as pd

# Parameters for two different lists of numbers: mean and standard deviation of each
m_a, s_a, m_b, s_b = 1.0, 0.4, 2.0, 0.4
# Sample size
N = 40

# Creating two lists of numbers based on bell-shaped probability curves
a = np.random.normal(loc = m_a, scale = s_a, size = N)
b = np.random.normal(loc = m_b, scale = s_b, size = N)

# Placing both samples into one data frame: one row per value, labelled by its sample
df = pd.DataFrame({'Category': ['A'] * len(a) + ['B'] * len(b), 'Value': np.hstack([a, b])})
df

In [None]:
# Importing various useful packages for Python
# Efficient numerical arrays.
import numpy as np

# Data frames.
import pandas as pd

# Alternative statistics package.
import statsmodels.stats.weightstats as stat

# Main statistics package.
import scipy.stats as ss

# Plotting.
import matplotlib.pyplot as plt

# Fancier plotting.
import seaborn as sns

# Default figure size for every plot that follows.
plt.rcParams.update({'figure.figsize': (12, 8)})

# The "ggplot" style sheet gives nicer colours and styles.
plt.style.use("ggplot")

# Visualising data: one column of points per category, with no horizontal jitter
sns.catplot(data = df, x = 'Category', y = 'Value', jitter = False)

In [None]:
# Independent two-sample t-test on a and b, using scipy.stats
import scipy.stats as ss

# The result carries the t statistic and the p-value
result_ss = ss.ttest_ind(a, b)
t_ss, p_ss = result_ss.statistic, result_ss.pvalue

# Full precision first, then the p-value rounded to two decimals
print(f"t-value: {t_ss}\tp-value: {p_ss}")
print(f"P_scipy: {p_ss: 0.2f}")

In [None]:
# The same independent two-sample t-test, this time via statsmodels

# statsmodels returns three values: t statistic, p-value and degrees of freedom
result_sm = stat.ttest_ind(a, b)
t_sm, p_sm, d_sm = result_sm

# Full precision first, then the p-value rounded to two decimals
print(f"t-value: {t_sm}\tp-value: {p_sm}\tDeg Free: {d_sm}")
print(f"P_statsmodels: {p_sm: 0.2f}")

In [None]:
# Calculating t statistic "by hand"
# https://en.wikipedia.org/wiki/Test_statistic

# Length of the arrays
n1 = len(a)
n2 = len(b)

# Means of the samples
m1 = np.sum(a) / n1
m2 = np.sum(b) / n2

# Sample standard deviations (dividing by n - 1)
s1 = np.sqrt(np.sum((a - m1) ** 2) / (n1 - 1))
s2 = np.sqrt(np.sum((b - m2) ** 2) / (n2 - 1))

# Degrees of freedom of the pooled estimate.
# Named `dof` rather than `df` so the data frame `df` holding both samples is not overwritten.
dof = n1 + n2 - 2
# Pooled variance: the two sample variances weighted by their own degrees of freedom
sp2 = ((n1 - 1) * s1**2 + (n2 - 1) * s2**2) / dof
# Difference of the means divided by its estimated standard error
t = (m1 - m2) / (np.sqrt(sp2) * np.sqrt(1.0/n1 + 1.0/n2))
t

In [None]:
# Populations
# Setting x values: from 5 (largest) standard deviations below the smaller mean
# to 5 above the larger mean, so both curves fit on the axis

spread = 5.0 * max(s_a, s_b)
min_x = min(m_a, m_b) - spread
max_x = max(m_a, m_b) + spread
x = np.linspace(min_x, max_x, num = 1000)

# Normal distribution plots of two different populations
# https://en.wikipedia.org/wiki/Normal_distribution

y_a = ss.norm.pdf(x, loc = m_a, scale = s_a)
y_b = ss.norm.pdf(x, loc = m_b, scale = s_b)

# One curve per population on a shared set of axes
fig, ax = plt.subplots(figsize = (10, 6))
for density in (y_a, y_b):
    ax.plot(x, density)
plt.show()

In [None]:
# Critical Value: used to make a decision in relation to the calculated t statistic from samples

# The critical probability value
critical = 0.05

# Creating the figure
fig, ax = plt.subplots(figsize = (10, 6))

# Range of x values, which represent the t statistic
min_x, max_x = -5.0, 5.0
x = np.linspace(min_x, max_x, num = 1000)

# The probability density function of the t statistic,
# evaluated with the degrees of freedom d_sm and drawn as a red curve
t = ss.t.pdf(x, d_sm)
ax.plot(x, t, color = 'red')

# Getting the tails: the critical probability is split evenly between both ends,
# so the cut-off is the t value with critical / 2 of the probability beyond it
tf = pd.DataFrame({'x': x, 't': t})
tcrit = abs(ss.t.ppf(critical / 2.0, d_sm))
tail_one = tf.loc[tf['x'] >= tcrit]
tail_two = tf.loc[tf['x'] <= -tcrit]

# Shading the area under the curve in each tail
for tail in (tail_one, tail_two):
    ax.fill_between(tail['x'], tail['t'], 0, facecolor = "red")
plt.show()


In [None]:
# Type I Errors: False Positives
# Running 10,000 t-tests where the population means are equal
# We should make the wrong decision (reject the hypothesis) (100 * critical) percent of the time

# Number of t-tests to run
trials = 10000
# Number of values per sample
N = 100
# Population 1 mean, Population 2 mean, Standard Deviation shared by both
mean1, mean2, stddev = 2.0, 2.0, 0.3
# Critical probability value
critical = 0.05

# Running total of type I errors committed
rejects = 0

for i in range(trials):
    # Sample 1 is drawn first, then Sample 2, each from its own population
    sample1, sample2 = (
        np.random.normal(loc = mu, scale = stddev, size = N)
        for mu in (mean1, mean2)
    )
    # Running t-test
    t, p = ss.ttest_ind(sample1, sample2)
    # A p-value at or below critical counts as a (wrong) rejection
    rejects += int(p <= critical)

# Printing the rejection rate as a percentage
typei = 100.0 * (rejects / trials)
print(f"{typei: 0.2f}%")

In [None]:
# Type II Errors: False Negatives
# Running 10,000 t-tests where the population means are not equal

# Number of t-tests to run
trials = 10000
# Number of values per sample
N = 100
# Population 1 mean, Population 2 mean, Standard Deviation shared by both
mean1, mean2, stddev = 2.0, 2.1, 0.3
# Critical probability value
critical = 0.05

# Running total of type II errors committed (failures to reject)
notrejects = 0

for i in range(trials):
    # Sample 1 is drawn first, then Sample 2, each from its own population
    sample1, sample2 = (
        np.random.normal(loc = mu, scale = stddev, size = N)
        for mu in (mean1, mean2)
    )
    # Running t-test
    t, p = ss.ttest_ind(sample1, sample2)
    # A p-value above critical means the hypothesis is (wrongly) not rejected
    notrejects += int(p > critical)

# Printing the non-rejection rate as a percentage
typeii = 100.0 * (notrejects / trials)
print(f"{typeii: 0.2f}%")

In [None]:
# Paired Samples
# The "sleep" data set from Vincent Arel-Bundock's R datasets list
dfsleep = pd.read_csv("https://vincentarelbundock.github.io/Rdatasets/csv/datasets/sleep.csv")

# First sample: rows of group 1, ordered by the ID column,
# keeping only the "extra" values as a numpy array
drugA = (
    dfsleep[dfsleep["group"] == 1]
    .sort_values("ID")["extra"]
    .to_numpy()
)
drugA

In [None]:
# Second sample: rows of group 2, ordered by the ID column,
# keeping only the "extra" values as a numpy array
drugB = (
    dfsleep[dfsleep["group"] == 2]
    .sort_values("ID")["extra"]
    .to_numpy()
)
drugB

In [None]:
# Three equivalent ways of testing the paired samples.
# Each result is printed explicitly: a notebook cell only displays its last
# expression, so bare calls would silently discard the first two results.

# Per-pair differences, shared by the one sample formulations below
differences = drugB - drugA

# Running a paired samples t-test
# (tests drugA - drugB, so its t statistic has the opposite sign to the two below)
paired = ss.ttest_rel(drugA, drugB)

# Equivalent to a one sample t-test of the differences against a mean of 0
one_sample = ss.ttest_1samp(differences, 0)

# Suggestion from statsmodels for running the t-test
descriptive = stat.DescrStatsW(differences).ttest_mean(0)

print(f"scipy ttest_rel:\t{paired}")
print(f"scipy ttest_1samp:\t{one_sample}")
print(f"statsmodels ttest_mean:\t{descriptive}")

In [None]:
# Problems with multiple t-tests
# Suppose we want to compare three groups, with the null hypothesis that all population means are equal.
# Can three t-tests run in parallel?

# Size of each sample
N = 100

# Creating three samples: A and B share a mean of 1.0, C has a mean of 2.0
sampA = np.random.normal(loc = 1.0, scale = 0.2, size = N)
sampB = np.random.normal(loc = 1.0, scale = 0.2, size = N)
sampC = np.random.normal(loc = 2.0, scale = 0.2, size = N)

# Put samples in a single data frame: N labels per sample, in the same order as the values
sample = [label for label in ('A', 'B', 'C') for _ in range(N)]
values = np.concatenate([sampA, sampB, sampC])
dfsamps = pd.DataFrame({'Sample': sample, 'Value': values})

# Visualising samples: one column of points per sample, with no horizontal jitter
sns.catplot(data = dfsamps, x = 'Sample', y = 'Value', jitter = False)

In [None]:
# One independent t-test per pair of samples: A-B, A-C and B-C
pairs = ((sampA, sampB), (sampA, sampC), (sampB, sampC))
(t_AB, p_AB), (t_AC, p_AC), (t_BC, p_BC) = (
    ss.ttest_ind(first, second) for first, second in pairs
)

# p-values of the three comparisons, rounded to two decimals
print(f"p_AB: {p_AB: .2f}\tp_AC: {p_AC: .2f}\tp_BC: {p_BC: .2f}")

In [None]:
# Running 10,000 rounds of three pairwise t-tests where the population means are equal
# We should make the wrong decision (reject the hypothesis) (100 * critical) percent of the time
# We expect to incorrectly reject the null hypothesis 5% of the time

# Number of rounds to run
trials = 10000
# Number of values per sample
N = 100
# Population 1 mean, Population 2 mean, Population 3 mean, Standard Deviation shared by all three
mean1, mean2, mean3, stddev = 2.0, 2.0, 2.0, 0.3
# Critical probability value
critical = 0.05

# Running total of type I errors committed
rejects = 0

for i in range(trials):
    # Samples 1, 2 and 3 are drawn in that order, each from its own population
    sample1, sample2, sample3 = (
        np.random.normal(loc = mu, scale = stddev, size = N)
        for mu in (mean1, mean2, mean3)
    )
    # Running the three pairwise t-tests
    t1, p1 = ss.ttest_ind(sample1, sample2)
    t2, p2 = ss.ttest_ind(sample1, sample3)
    t3, p3 = ss.ttest_ind(sample2, sample3)
    # A single p-value at or below critical is enough to count the round as a rejection
    rejects += int(any(pvalue <= critical for pvalue in (p1, p2, p3)))

# Printing the rejection rate as a percentage
typei = 100.0 * (rejects / trials)
print(f"{typei: 0.2f}%")

In [None]:
# Analysis of Variance (ANOVA)
# ANOVA may be used to avoid a higher Type I error rate: false positives

# A single one-way ANOVA across all three samples
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.f_oneway.html
anova = ss.f_oneway(sampA, sampB, sampC)
F, P = anova.statistic, anova.pvalue
print(f"F: {F: .2f} P: {P: .2f}")

In [None]:
# Running 10,000 ANOVAs where the population means are equal
# We should make the wrong decision (reject the hypothesis) (100 * critical) percent of the time
# We expect to incorrectly reject the null hypothesis 5% of the time

# Number of ANOVAs to run
trials = 10000
# Number of values per sample
N = 100
# Population 1 mean, Population 2 mean, Population 3 mean, Standard Deviation shared by all three
mean1, mean2, mean3, stddev = 2.0, 2.0, 2.0, 0.3
# Critical probability value
critical = 0.05

# Running total of type I errors committed
rejects = 0

for i in range(trials):
    # Samples 1, 2 and 3 are drawn in that order, each from its own population
    sample1, sample2, sample3 = (
        np.random.normal(loc = mu, scale = stddev, size = N)
        for mu in (mean1, mean2, mean3)
    )
    # Running ANOVA test across the three samples
    F, p = ss.f_oneway(sample1, sample2, sample3)
    # A p-value at or below critical counts as a (wrong) rejection
    rejects += int(p <= critical)

# Printing the rejection rate as a percentage
typei = 100.0 * (rejects / trials)
print(f"{typei: 0.2f}%")

In [None]:
# Exercise 3
# Take the code from the Examples section of the scipy stats
# documentation for independent samples t-tests,
# add it to your own notebook and explain how it works
# using MarkDown cells and code comments.
# Improve it in any way you think it could be improved.

# Research TBC
