In [6]:
import numpy as np
from scipy.stats import norm, t
import plotly.express as px
import plotly.graph_objects as go

In 1861, 10 essays appeared in the New Orleans Daily Crescent. They
were signed “Quintus Curtius Snodgrass” and some people suspected
they were actually written by Mark Twain. To investigate this, we will
consider the proportion of three letter words found in an author’s work.
From eight Twain essays we have:
.225 .262 .217 .240 .230 .229 .235 .217
From 10 Snodgrass essays we have:
.209 .205 .196 .210 .202 .207 .224 .223 .220 .201
(a) Perform a Wald test for equality of the means. Use the nonparametric
plug-in estimator. Report the p-value and a 95 per cent confidence
interval for the difference of means. What do you conclude?
(b) Now use a permutation test to avoid the use of large sample methods.
What is your conclusion? (Brinegar (1963)).

In [17]:
# similar to example 10.15
X = np.array([.225, .262, .217, .240, .230, .229, .235, .217])
Y = np.array([.209, .205, .196, .210, .202, .207, .224, .223, .220, .201])

mu_x_hat = np.mean(X)
mu_y_hat = np.mean(Y)
theta_hat = mu_x_hat - mu_y_hat
se_hat = np.sqrt(X.var(ddof = 1)/len(X) + Y.var(ddof = 1)/len(Y))

alpha = 0.05
confidence_interval = (theta_hat - norm.ppf(1-alpha/2) * se_hat,
                       theta_hat + norm.ppf(1-alpha/2) * se_hat)
print(f'Estimate for theta: {theta_hat}')
print(f'CI for theta: {confidence_interval}')

w = theta_hat / se_hat
pval = 2 * norm.cdf(-np.abs(w)) # Theorem 10.13
print(f'Wald stat = {w}, p-val = {pval}')

Estimate for theta: 0.022175
CI for theta: (0.010439729850154362, 0.03391027014984564)
Wald stat = 3.7035535443338206, p-val = 0.00021260028225810121


the p-value is 0.0002
this provides strong evidence to reject the null hypothesis that the means are identical;
tihs means that the distributions have different means;
this means that the writer is probably not Mark Twain

# Permutation test

In [20]:
X_concat_Y = np.concatenate([X,Y])
slice = len(X)
num_permutations_to_evaluate = 9999
num_perm_satisfied_indicator = 0
for _ in range(num_permutations_to_evaluate):
    permutation = np.random.permutation(X_concat_Y)
    X_perm = permutation[:slice]
    Y_perm = permutation[slice:]
    T_perm = X_perm.mean() - Y_perm.mean()
    indicator = int(T_perm > theta_hat)
    num_perm_satisfied_indicator += indicator

pval_permutation_test = num_perm_satisfied_indicator / num_permutations_to_evaluate
print(f'Permutation test p-val: {pval_permutation_test}')

Permutation test p-val: 0.00040004000400040005


the p-value is 0.0004
this provides strong evidence to reject the null hypothesis that the means are identical;
tihs means that the distributions have different means;
this means that the writer is probably not Mark Twain