In [60]:
import numpy as np
import pandas as pd
from scipy import stats
from IPython.display import Markdown

# Q4.1

Consider a population consisting of four subjects, A, B, C, and D. The values
for a random variable X under investigation are given in Table E4.1. Form the
sampling distribution for the sample mean of size n = 2 and verify that $\mu_{\bar{x}} = \mu$.
Then repeat the process with sample size of n = 3.



In [8]:
index = ['A', 'B', 'C', 'D']
columns = ['Value']
data = [1, 1, 0, 0]

df = pd.DataFrame(data=data, index=index, columns=columns)
df.index = df.index.rename('Subject')

df

Unnamed: 0_level_0,Value
Subject,Unnamed: 1_level_1
A,1
B,1
C,0
D,0


## A4.1

In [37]:
from itertools import combinations

for n in [2, 3]:
    print(f"For n = {n}:")
    sum_means = 0
    for comb in combinations('ABCD', n):
        mean = df.loc[comb, :].values.sum() / n
        sum_means += mean
        print(f"  {comb} mean = {mean :.2}")
    num_combs = len(list(combinations('ABCD', n)))
    print(f"  Mean of means = {sum_means / num_combs :.2}\n")

For n = 2:
  ('A', 'B') mean = 1.0
  ('A', 'C') mean = 0.5
  ('A', 'D') mean = 0.5
  ('B', 'C') mean = 0.5
  ('B', 'D') mean = 0.5
  ('C', 'D') mean = 0.0
  Mean of means = 0.5

For n = 3:
  ('A', 'B', 'C') mean = 0.67
  ('A', 'B', 'D') mean = 0.67
  ('A', 'C', 'D') mean = 0.33
  ('B', 'C', 'D') mean = 0.33
  Mean of means = 0.5



# Q4.2
The body mass index ($kg/m^2$) is calculated by dividing a person’s weight by the
square of his or her height and is used as a measure of the extent to which the
person is overweight. Suppose that the distribution of the body mass index for
men has a standard deviation of σ = 3 $kg/m^2$, and we wish to estimate the mean
μ using a sample of size n = 49. Find the probability that we would be correct
within 1 $kg/m^2$.

## A4.2

In [65]:
se = 3 / np.sqrt(49)
z = 1 / se
p = stats.norm.cdf(z) - stats.norm.cdf(-z)

md = f"""
$Pr(\mu-1 \le \overline{{x}} \le \mu+1)$
$= Pr({-z :.3} \le z \le {z :.3}) = {p :.4}$
"""

Markdown(md)


$Pr(\mu-1 \le \overline{x} \le \mu+1)$
$= Pr(-2.33 \le z \le 2.33) = 0.9804$


# Q4.5
A study was conducted to investigate drinking problems among college
students.
In 1983, a group of students was asked whether they had ever driven
an automobile while drinking. In 1987, after the legal drinking age was
raised, a different group of college students was asked the same question.
The results are given in Table E4.5. Calculate, separately for 1983 and 1987,
the 95% confidence interval for the proportion of students who had driven an
automobile while drinking.

In [67]:
index = ['Yes', 'No', 'Total']

columns = ['1983', '1987', 'Total']

data = [
    [1250, 991, 2241],
    [1387, 1666, 3053],
    [2637, 2657, 5294],
]

df = pd.DataFrame(data=data, index=index, columns=columns)
df.index = df.index.rename('Drove while drinking')

df

Unnamed: 0_level_0,1983,1987,Total
Drove while drinking,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Yes,1250,991,2241
No,1387,1666,3053
Total,2637,2657,5294


## A4.5

In [89]:
z = stats.norm.interval(0.95)[1]
for year in ['1983', '1987']:
    n = df.loc['Total', year]
    p = df.loc['Yes', year] / n
    s = np.sqrt(p * (1 - p))
    se = s / np.sqrt(n)
    print(f"For {year}, 95% CI for proportion = [{p - z * se :.3}, {p + z * se :.3}]")

For 1983, 95% CI for proportion = [0.455, 0.493]
For 1987, 95% CI for proportion = [0.355, 0.391]


# Q4.9
Sera from a T‐lymphotropic virus type (HTLV‐I) risk group (prostitute women)
were tested with two commercial research enzyme‐linked immunoabsorbent
assays (EIA) for HTLV‐I antibodies. These results were compared with a gold
standard, and the outcomes are shown in Table E4.9. Calculate the 95% confidence
intervals for the sensitivity and specificity separately for the two EIAs.



In [90]:
index = ['Positive', 'Negative']

columns = [
    ['Dupont’s EIA'] * 2 + ['Cellular product’s EIA'] * 2,
    ['Positive', 'Negative'] * 2,
]

data = [
    [15, 1, 16, 0],
    [2, 164, 7, 179],
]

df = pd.DataFrame(data=data, index=index, columns=columns)
df.index = df.index.rename('True')

df

Unnamed: 0_level_0,Dupont’s EIA,Dupont’s EIA,Cellular product’s EIA,Cellular product’s EIA
Unnamed: 0_level_1,Positive,Negative,Positive,Negative
True,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Positive,15,1,16,0
Negative,2,164,7,179


## A4.9

In [105]:
z = stats.norm.interval(0.95)[1]
for EIA in ['Dupont’s EIA', 'Cellular product’s EIA']:
    print(f"{EIA}:")
    
    n = df.loc['Positive', EIA].sum()
    sens = df.loc['Positive', (EIA, 'Positive')] / n
    se = np.sqrt((sens * (1 - sens)) / n)
    ci_left = sens - z * se if sens - z * se >= 0 else 0.0
    ci_right = sens + z * se if sens + z * se <= 1 else 1.0
    print(f"  Sensitivity (95% CI) = [{ci_left :.3}, {ci_right :.3}]")
    
    n = df.loc['Negative', EIA].sum()
    spec = df.loc['Negative', (EIA, 'Negative')] / n
    se = np.sqrt((spec * (1 - spec)) / n)
    ci_left = spec - z * se if spec - z * se >= 0 else 0.0
    ci_right = spec + z * se if spec + z * se <= 1 else 1.0
    print(f"  Specificity (95% CI) = [{ci_left :.3}, {ci_right :.3}]")

Dupont’s EIA:
  Sensitivity (95% CI) = [0.819, 1.0]
  Specificity (95% CI) = [0.971, 1.0]
Cellular product’s EIA:
  Sensitivity (95% CI) = [1.0, 1.0]
  Specificity (95% CI) = [0.935, 0.99]


Note: Cellular product’s EIA's sensitivity is unreliable to due the zero false negatives.

# Q4.13
A case–control study was conducted in Auckland, New Zealand, to investigate
the effects of alcohol consumption on both nonfatal myocardial infarction
and coronary death in the 24 hours after drinking, among regular drinkers.
Data were tabulated separately for men and women (Table E4.13).

(a) Refer to the myocardial infarction data and calculate separately for men
and women the 95% confidence interval for the odds ratio associated with
drinking.

(b) Refer to coronary death data and calculate separately for men and women
the 95% confidence interval for the odds ratio associated with drinking.

(c) From the results in parts (a) and/or (b), is there any indication that gender
may act as an effect modifier?

In [108]:
index = [
    ['Men'] * 2 + ['Women'] * 2,
    ['No', 'Yes'] * 2,
]

columns = [
    ['Myocardial infarction'] * 2 + ['Coronary death'] * 2,
    ['Controls', 'Cases'] * 2,
]

data = [
    [197, 142, 135, 103],
    [201, 136, 159, 69],
    [144, 41, 89, 12],
    [122, 19, 76, 4],
]

df = pd.DataFrame(data=data, index=index, columns=columns)
df.index = df.index.rename(('', 'Drink in the last 24 h'))

df

Unnamed: 0_level_0,Unnamed: 1_level_0,Myocardial infarction,Myocardial infarction,Coronary death,Coronary death
Unnamed: 0_level_1,Unnamed: 1_level_1,Controls,Cases,Controls,Cases
Unnamed: 0_level_2,Drink in the last 24 h,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Men,No,197,142,135,103
Men,Yes,201,136,159,69
Women,No,144,41,89,12
Women,Yes,122,19,76,4


## A4.13(a)(b)

In [130]:
z = stats.norm.interval(0.95)[1]
for data in ['Myocardial infarction', 'Coronary death']:
    print(f"{data}:")
    for gender in ['Men', 'Women']:
        a = df.loc[(gender, 'Yes'), (data, 'Cases')]
        b = df.loc[(gender, 'Yes'), (data, 'Controls')]
        c = df.loc[(gender, 'No'), (data, 'Cases')]
        d = df.loc[(gender, 'No'), (data, 'Controls')]

        log_OR = np.log((a/b) / (c/d))
        se = np.sqrt(1/a + 1/b + 1/c + 1/d)
        ci_left = np.exp(log_OR - z * se)
        ci_right = np.exp(log_OR + z * se)

        print(f"  {gender :5}: odds ratio (95% CI) = [{ci_left :.3}, {ci_right :.3}]")

Myocardial infarction:
  Men  : odds ratio (95% CI) = [0.691, 1.28]
  Women: odds ratio (95% CI) = [0.302, 0.992]
Coronary death:
  Men  : odds ratio (95% CI) = [0.388, 0.833]
  Women: odds ratio (95% CI) = [0.121, 1.26]


## A4.13(c)
CIs for gender have substantial overlap; gender is unlikely an effect modifier.

# Q4.19
Consider the following measurements of forced expiratory volume (liters) for
10 subjects taken from a study that examines the response to ozone and sulfur
dioxide among adolescents suffering from asthma:

In [201]:
data = np.array([3.50, 2.60, 2.75, 2.82, 4.05, 2.25, 2.68, 3.00, 4.02, 2.85])

Calculate the 95% confidence interval for the (population) mean of forced
expiratory volume (liters).

## A4.19

In [202]:
z = stats.norm.interval(0.95)[1]
mean = data.mean()
n = len(data)
se = data.std(ddof=1) / np.sqrt(n)
print(f"Mean (95% CI) = [{mean - z * se :.4}, {mean + z * se :.4}]")

Mean (95% CI) = [2.676, 3.428]


# Q4.25
The systolic blood pressures (mmHg) of 12 women between the ages of 20
and 35 were measured before and after administration of a newly developed
oral contraceptive (Table E4.25).

(a) Calculate the 95% confidence interval for the mean systolic blood pressure
change. Does the oral contraceptive seem to change the mean systolic
blood pressure?

(b) Calculate a 95% confidence interval for Pearson’s correlation coefficient
representing a possible relationship between systolic blood pressures
measured before and after the administration of oral contraceptive. What
does it mean that these measurements are correlated (if confirmed)?

In [180]:
index = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

columns = ['Before', 'After', 'After–before difference']

data = [
    [122, 127, 5],
    [126, 128, 2],
    [132, 140, 8],
    [120, 119, -1],
    [142, 145, 3],
    [130, 130, 0],
    [142, 148, 6],
    [137, 135, -2],
    [128, 129, 1],
    [132, 137, 5],
    [128, 128, 0],
    [129, 133, 4],
]

df = pd.DataFrame(data=data, index=index, columns=columns)
df.index = df.index.rename('Subject')

df

Unnamed: 0_level_0,Before,After,After–before difference
Subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,122,127,5
2,126,128,2
3,132,140,8
4,120,119,-1
5,142,145,3
6,130,130,0
7,142,148,6
8,137,135,-2
9,128,129,1
10,132,137,5


## A4.25(a)

In [198]:
n = len(df['After–before difference'])
mean = df['After–before difference'].mean()
se = df['After–before difference'].std(ddof=1) / np.sqrt(n)
z = stats.t(n-1).interval(0.95)[1]
print(f"Mean (95% CI) = [{mean - z * se :.4}, {mean + z * se :.4}]")

Mean (95% CI) = [0.6211, 4.546]


## A4.25(b)

In [199]:
coef = stats.norm.interval(0.95)[1]
n = len(df)
r = df.corr().iloc[0, 1]
z = 0.5 * np.log((1+r) / (1-r))
se = np.sqrt(1 / (n - 3))

z_left = z - coef * se
z_right = z + coef * se
r_left = (np.exp(2 * z_left) - 1) / (np.exp(2 * z_left) + 1)
r_right = (np.exp(2 * z_right) - 1) / (np.exp(2 * z_right) + 1)

print(f"Pearson correlation (95% CI) = [{r_left :.3}, {r_right :.3}]")

Pearson correlation (95% CI) = [0.767, 0.981]


# Q4.26
Suppose that we are interested in studying patients with systemic cancer who
subsequently develop a brain metastasis; our ultimate goal is to prolong their
lives by controlling the disease. A sample of 23 such patients, all of whom
were treated with radiotherapy, were followed from the first day of their
treatment until recurrence of the original tumor. Recurrence is defined as the
reappearance of a metastasis in exactly the same site, or in the case of patients
whose tumor never completely disappeared, enlargement of the original
lesion. Times to recurrence (in weeks) for the 23 patients were:

In [14]:
data = np.array([2, 2, 2, 3, 4, 5, 5, 6, 7, 8, 9, 10, 14, 14, 18, 19, 20, 22, 22, 31, 33, 39, 195])

First, calculate
the 95% confidence interval for the mean time to recurrence on the log scale;
then convert the endpoints to weeks.

# Q4.29
A study was undertaken to clarify the relationship between heart disease and
occupational carbon disulfide exposure along with another important factor,
elevated diastolic blood pressure (DBP), in a data set obtained from a 10‐year
prospective follow‐up of two cohorts of over 340 male industrial workers in
Finland. Carbon disulfide is an industrial solvent that is used all over the
world in the production of viscose rayon fibers. Table E4.29 gives the mean
and standard deviation (SD) of serum cholesterol (mg/100 mL) among
exposed and nonexposed cohorts, by diastolic blood pressure (DBP). Compare
serum cholesterol levels between exposed and nonexposed cohorts at each
level of DBP by calculating the two 95% confidence intervals for the means
(exposed and nonexposed groups).

In [15]:
index = ['<95', '95–100', '≥100']

columns = [
    ['Exposed'] * 3 + ['Nonexposed'] * 3,
    ['n', 'Mean', 'SD'] * 2,
]

data = [
    [205, 220, 50, 271, 221, 42],
    [92, 227, 57, 53, 236, 46],
    [20, 233, 41, 10, 216, 48],
]

df = pd.DataFrame(data=data, index=index, columns=columns)
df.index = df.index.rename('DBP (mmHg)')

df

Unnamed: 0_level_0,Exposed,Exposed,Exposed,Nonexposed,Nonexposed,Nonexposed
Unnamed: 0_level_1,n,Mean,SD,n,Mean,SD
DBP (mmHg),Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
<95,205,220,50,271,221,42
95–100,92,227,57,53,236,46
≥100,20,233,41,10,216,48


# Q4.32
In an assay of heparin, a standard preparation is compared with a test preparation
by observing the log clotting times (y, in seconds) of blood containing
different doses of heparin (x is the log dose; Table E4.32). Replicate readings
are made at each dose level. Calculate separately for the standard preparation
and the test preparation the 95% confidence interval for Pearson’s correlation
coefficient between the log clotting times and log dose.

In [19]:
columns = ['Standard', 'Test', 'Log dose']

data = np.array([
    [1.806, 1.756, 1.851, 1.785, 1.954, 1.929, 2.124, 1.996, 2.262, 2.161],
    [1.799, 1.763, 1.826, 1.832, 1.898, 1.875, 1.973, 1.982, 2.14, 2.1],
    [0.72, 0.72, 0.87, 0.87, 1.02, 1.02, 1.17, 1.17, 1.32, 1.32],
]).T

df = pd.DataFrame(data=data, columns=columns)

df

Unnamed: 0,Standard,Test,Log dose
0,1.806,1.799,0.72
1,1.756,1.763,0.72
2,1.851,1.826,0.87
3,1.785,1.832,0.87
4,1.954,1.898,1.02
5,1.929,1.875,1.02
6,2.124,1.973,1.17
7,1.996,1.982,1.17
8,2.262,2.14,1.32
9,2.161,2.1,1.32


# Q4.35
Data are shown in Table E4.35 for two groups of patients who died of acute
myelogenous leukemia. Patients were classified into the two groups according
to the presence or absence of a morphologic characteristic of white cells.
Patients termed AG positive were identified by the presence of Auer rods and/
or significant granulature of the leukemic cells in the bone marrow at diagnosis.
For AG‐negative patients these factors were absent. Leukemia is a cancer
characterized by an overproliferation of white blood cells; the higher the
white blood count (WBC), the more severe the disease. Calculate separately
for the AG‐positive and AG‐negative patients the 95% confidence interval for
Pearson’s correlation coefficient between survival time and white blood count
(both on a log scale). Is there any indication that the two population correlation
coefficients are different?

In [20]:
columns = [
    ['AG positive, N = 17', 'AG positive, N = 17', 'AG negative, N = 16', 'AG negative, N = 16'],
    ['WBC', 'Survival time (weeks)', 'WBC', 'Survival time (weeks)']
]

data = [
    [2300, 65, 4400, 56],
    [750, 156, 3000, 65],
    [4300, 100, 4000, 17],
    [2600, 134, 1500, 7],
    [6000, 16, 9000, 16],
    [10500, 108, 5300, 22],
    [10000, 121, 10000, 3],
    [17000, 4, 19000, 4],
    [5400, 39, 27000, 2],
    [7000, 143, 28000, 3],
    [9400, 56, 31000, 8],
    [32000, 26, 26000, 4],
    [35000, 22, 21000, 3],
    [100000, 1, 79000, 30],
    [100000, 1, 100000, 4],
    [52000, 5, 100000, 43],
    [100000, 65, np.nan, np.nan],
]

df = pd.DataFrame(data=data, columns=columns)

df

Unnamed: 0_level_0,"AG positive, N = 17","AG positive, N = 17","AG negative, N = 16","AG negative, N = 16"
Unnamed: 0_level_1,WBC,Survival time (weeks),WBC,Survival time (weeks)
0,2300,65,4400.0,56.0
1,750,156,3000.0,65.0
2,4300,100,4000.0,17.0
3,2600,134,1500.0,7.0
4,6000,16,9000.0,16.0
5,10500,108,5300.0,22.0
6,10000,121,10000.0,3.0
7,17000,4,19000.0,4.0
8,5400,39,27000.0,2.0
9,7000,143,28000.0,3.0
