In [1]:
import numpy as np
import pandas as pd
from scipy import stats

# Q3.1
Although cervical cancer is not a leading cause of death among women in the
United States, it has been suggested that virtually all such deaths are preventable
(5166 American women died from cervical cancer in 1977). In an effort to
find out who is being or not being screened for cervical cancer (Pap testing),
data were collected from a certain community (Table E3.1). Is there a statistical
relationship here? (Try a few different methods: calculation of odds ratio,
comparison of conditional and unconditional probabilities, and comparison of
conditional probabilities.)

In [2]:
index = ['No', 'Yes', 'Total']

columns = ['White', 'Black', 'Total']

data = [
    [5244, 785, 6029],
    [25117, 2348, 27465],
    [30361, 3133, 33494],
]

df = pd.DataFrame(data=data, index=index, columns=columns)
df.index = df.index.rename('Pap test')

df

Unnamed: 0_level_0,White,Black,Total
Pap test,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
No,5244,785,6029
Yes,25117,2348,27465
Total,30361,3133,33494


## A3.1

In [3]:
odds_b = df.loc['Yes', 'Black'] / df.loc['No', 'Black']
odds_w = df.loc['Yes', 'White'] / df.loc['No', 'White']
odds_ratio = odds_b / odds_w

print(f"OR for black vs. white = {odds_ratio :.3f}")
print(f"P(Pap = Yes)           = {df.loc['Yes', 'Total'] / df.loc['Total', 'Total'] :.3f}")
print(f"P(Pap = Yes | Black)   = {df.loc['Yes', 'Black'] / df.loc['Total', 'Black'] :.3f}")
print(f"P(Pap = Yes | White)   = {df.loc['Yes', 'White'] / df.loc['Total', 'White'] :.3f}")

OR for black vs. white = 0.624
P(Pap = Yes)           = 0.820
P(Pap = Yes | Black)   = 0.749
P(Pap = Yes | White)   = 0.827


According to the data above, the black community is less likely to receive a screening for cervical cancer.

# Q3.3
From the intraobserver variability study, find:

(a) The probability that abnormal squamous cells were found to be absent in
both screenings.

(b) The probability of an absence in the second screening given that abnormal
cells were found in the first screening.

(c) The probability of an abnormal presence in the second screening given that
no abnormal cells were found in the first screening.

(d) The probability that the screenings disagree.

In [4]:
index = ['Present', 'Absent', 'Total']

columns = [
    ['Second screening', 'Second screening', 'Total'],
    ['Present', 'Absent', '']
]

data = [
    [1763, 489, 2252],
    [403, 670, 1073],
    [2166, 1159, 3325],
]

df = pd.DataFrame(data=data, index=index, columns=columns)
df.index = df.index.rename('First screening')

df

Unnamed: 0_level_0,Second screening,Second screening,Total
Unnamed: 0_level_1,Present,Absent,Unnamed: 3_level_1
First screening,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Present,1763,489,2252
Absent,403,670,1073
Total,2166,1159,3325


## A3.3

In [5]:
a = df.loc['Absent', ('Second screening', 'Absent')] / df.loc['Total', ('Total', '')]
b = df.loc['Present', ('Second screening', 'Absent')] / df.loc['Present', ('Total', '')]
c = df.loc['Absent', ('Second screening', 'Present')] / df.loc['Absent', ('Total', '')]
d = (df.loc['Present', ('Second screening', 'Absent')] + \
     df.loc['Absent', ('Second screening', 'Present')]) / \
     df.loc['Total', ('Total', '')]

print(f"(a) {a :.3f}")
print(f"(b) {b :.3f}")
print(f"(c) {c :.3f}")
print(f"(d) {d :.3f}")

(a) 0.202
(b) 0.217
(c) 0.376
(d) 0.268


# Q3.4
Given the screening test of Example 1.4, where:
- Sensitivity = 0.406
- Specificity = 0.985

Calculate the positive predictive values when the test is applied to the following
populations:
- Population A: 80% prevalence
- Population B: 25% prevalence

## A3.4

In [6]:
sens = 0.406
spec = 0.985
for prev in [0.80, 0.25]:
    pos_pred = (prev * sens) / (prev * sens + (1 - prev) * (1 - spec))
    print(f"For population A, positive predictivity = {pos_pred :.3f}")

For population A, positive predictivity = 0.991
For population A, positive predictivity = 0.900


# Q3.5
Consider the data shown in Table E3.5 on the use of x‐ray as a screening test
for tuberculosis:

(a) Calculate the sensitivity and specificity.

(b) Find the disease prevalence.

(c) Calculate the positive predictive value both directly and indirectly using
Bayes’ theorem.

In [7]:
index = ['Negative', 'Positive', 'Total']

columns = [
    ['Tuberculosis', 'Tuberculosis'],
    ['No', 'Yes']
]

data = [
    [1739, 8],
    [51, 22],
    [1790, 30],
]

df = pd.DataFrame(data=data, index=index, columns=columns)
df.index = df.index.rename('X-ray')

df

Unnamed: 0_level_0,Tuberculosis,Tuberculosis
Unnamed: 0_level_1,No,Yes
X-ray,Unnamed: 1_level_2,Unnamed: 2_level_2
Negative,1739,8
Positive,51,22
Total,1790,30


## A3.5(a)

In [8]:
sens = df.loc['Positive', ('Tuberculosis', 'Yes')] / df.loc['Total', ('Tuberculosis', 'Yes')]
spec = df.loc['Negative', ('Tuberculosis', 'No')] / df.loc['Total', ('Tuberculosis', 'No')]
print(f"Sensitivity = {sens :.3f}")
print(f"Specificity = {spec :.3f}")

Sensitivity = 0.733
Specificity = 0.972


## A3.5(b)

In [9]:
prev = df.loc['Total', ('Tuberculosis', 'Yes')] / df.loc['Total'].sum()
print(f"Disease prevalence = {prev :.3f}")

Disease prevalence = 0.016


## A3.5(c)

In [10]:
ppv_direct = (prev * sens) / (prev * sens + (1 - prev) * (1 - spec))
ppv_indirect = df.loc['Positive', ('Tuberculosis', 'Yes')] / df.loc['Positive'].sum()
print(f"PPV (direct)   = {ppv_direct :.3f}")
print(f"PPV (indirect) = {ppv_indirect :.3f}")

PPV (direct)   = 0.301
PPV (indirect) = 0.301


# Q3.6
From the sensitivity and specificity of x‐rays found in Exercise 3.5, compute
the positive predictive value corresponding to these prevalences: 0.2, 0.4, 0.6,
0.7, 0.8, and 0.9. Can we find a prevalence when the positive predictive value
is preset at 0.8 or 80%?

## A3.6

In [11]:
for prev in [0.2, 0.4, 0.6, 0.7, 0.8, 0.9]:
    ppv = (prev * sens) / (prev * sens + (1 - prev) * (1 - spec))
    print(f"For prevalence = {prev}, PPV = {ppv :.3f}")

For prevalence = 0.2, PPV = 0.865
For prevalence = 0.4, PPV = 0.945
For prevalence = 0.6, PPV = 0.975
For prevalence = 0.7, PPV = 0.984
For prevalence = 0.8, PPV = 0.990
For prevalence = 0.9, PPV = 0.996


To find prevalence, we must rearrange the given equation:

$\large{PPV = \frac{(prev)(sens)}{(prev)(sens) + (1-prev)(1-spec)}}$

$\large{\frac{(prev)(sens)}{PPV} = (prev)(sens) + 1 - spec - prev + (prev)(spec)}$

$\large{\frac{(prev)(sens)}{PPV} - (prev)(sens) + prev - (prev)(spec) = 1 - spec}$

$\large{prev(\frac{sens}{PPV} - sens + 1 - spec) = 1 - spec}$

$\large{prev = \frac{1 - spec}{\frac{sens}{PPV} - sens + 1 - spec}}$

In [12]:
ppv = 0.8
numer = 1 - spec
denom = sens / ppv - sens + 1 - spec
prev = numer / denom
print(f"When PPV = {ppv}, prevalence = {prev :.3f}")

When PPV = 0.8, prevalence = 0.135


# Q3.7
Refer to the standard normal distribution. What is the probability of obtaining
a z value of:  
(a) At least 1.25?  
(b) At least −0.84?

## A3.7

In [13]:
print(f"(a) {stats.norm.sf(1.25) :.4f}")
print(f"(b) {stats.norm.sf(-0.84) :.4f}")

(a) 0.1056
(b) 0.7995


# Q3.8
Refer to the standard normal distribution. What is the probability of obtaining
a z value:  
(a) Between −1.96 and 1.96?  
(b) Between 1.22 and 1.85?  
(c) Between −0.84 and 1.28?

## A3.8

In [14]:
print(f"(a) {stats.norm.cdf(1.96) - stats.norm.cdf(-1.96) :.4f}")
print(f"(b) {stats.norm.cdf(1.85) - stats.norm.cdf(1.22) :.4f}")
print(f"(c) {stats.norm.cdf(1.28) - stats.norm.cdf(-0.84) :.4f}")

(a) 0.9500
(b) 0.0791
(c) 0.6993


# Q3.10
Refer to the standard normal distribution. Find a z value such that the probability
of obtaining a larger z value is:  
(a) 0.05  
(b) 0.025  
(c) 0.2

## A3.10

In [15]:
print(f"(a) {stats.norm.ppf(1 - 0.05) :.4f}")
print(f"(b) {stats.norm.ppf(1 - 0.025) :.4f}")
print(f"(c) {stats.norm.ppf(1 - 0.2) :.4f}")

(a) 1.6449
(b) 1.9600
(c) 0.8416


# Q3.13
Medical research has concluded that people experience a common cold
roughly two times per year. Assume that the time between colds is normally
distributed with a mean of 160 days and a standard deviation of 40 days.

(a) What is the probability of going 200 or more days between colds? Of
going 365 or more days?  
(b) What is the probability of getting a cold within 80 days of a previous cold?

## A3.13

In [16]:
N = stats.norm(160, 40)
print(f"(a) 200 or more = {N.sf(200) :.4}, 365 or more = {N.sf(365) :.4}")
print(f"(b) {N.cdf(80) :.3}")

(a) 200 or more = 0.1587, 365 or more = 1.488e-07
(b) 0.0228


# Q3.15
Intelligence test scores, referred to as intelligence quotient or IQ scores, are
based on characteristics such as verbal skills, abstract reasoning power,
numerical ability, and spatial visualization. If plotted on a graph, the distribution
of IQ scores approximates a normal curve with a mean of about 100. An
IQ score above 115 is considered superior. Studies of “intellectually gifted”
children have generally defined the lower limit of their IQ scores at 140;
approximately 1% of the population have IQ scores above this limit (based on
Biracree, 1984).

(a) Find the standard deviation of this distribution.  
(b) What percent are in the “superior” range of 115 or above?  
(c) What percent of the population have IQ scores of 70 or below?

## A3.15

In [17]:
mean = 100
std = (140 - mean) / stats.norm.ppf(0.99)
N = stats.norm(100, std)

print(f"(a) {std :.3}")
print(f"(b) {N.sf(115) :.3}")
print(f"(c) {N.cdf(70) :.3}")

(a) 17.2
(b) 0.192
(c) 0.0405


# Q3.17
Suppose it is known that the probability of recovery for a certain disease is
0.4. If 35 people are stricken with the disease, what is the probability that:

(a) 25 or more will recover?  
(b) Fewer than five will recover?

(Use the normal approximation.)

## A3.17

In [18]:
n = 35
pi = 0.4

x = 25
z = (x - n * pi) / np.sqrt(n * pi * (1 - pi))
print(f"(a) {stats.norm.sf(z) :8.2e} (z = {z :.4})")

x = 5
z = (x - n * pi) / np.sqrt(n * pi * (1 - pi))
print(f"(b) {stats.norm.cdf(z) :8.2e} (z = {z :.4})")

(a) 7.37e-05 (z = 3.795)
(b) 9.50e-04 (z = -3.105)


# Q3.19
Many samples of water, all the same size, are taken from a river suspected of
having been polluted by irresponsible operators at a sewage treatment plant.
The number of coliform organisms in each sample was counted; the average
number of organisms per sample was 15. Assuming the number of organisms
to be Poisson distributed, find the probability that:

(a) The next sample will contain at least 20 organisms.  
(b) The next sample will contain no more than five organisms.

## A3.19

In [19]:
P = stats.poisson(15)
print(f"(a) {P.sf(19) :.4}")  # Choose sf(19) to get pdf(20) + pdf(21) + pdf(22)...
print(f"(b) {P.cdf(5) :.4}")

(a) 0.1248
(b) 0.002792


# Q3.20
For the year 1981 (see Example 3.9), we also have the following data for the
South Atlantic states (Delaware, Florida, Georgia, Maryland, North and South
Carolina, Virginia, and West Virginia, and the District of Columbia):
- d = 7643 infant deaths
- N = 550300 live births

Find the infant mortality rate, and compare it to the national average (11.9 per 1000 live births) using the
method of Example 3.9.

## A3.20

In [20]:
d = 7643
N = 550300

IMR = d / N
print(f"IMR = {1000 * IMR :.1f} per 1000 live births")

theta = (11.9 / 1000) * N
z = (d - theta) / np.sqrt(theta)
print(f"z-score = {z :.3}")

IMR = 13.9 per 1000 live births
z-score = 13.5


# Q3.21
For a t curve with 20 df, find the areas:

(a) To the left of 2.086 and of 2.845.  
(b) To the right of 1.725 and of 2.528.  
(c) Beyond ± 2.086 and beyond ± 2.845.

## A3.21

In [21]:
t = stats.t(20)
print(f"(a) {t.cdf(2.086) :.3}, {t.cdf(2.845) :.3}")
print(f"(b) {t.sf(1.725) :.3}, {t.sf(2.528) :.3}")
print(f"(c) {t.cdf(-2.086) + t.sf(2.086) :.3}, {t.cdf(-2.845) + t.sf(2.845) :.3}")

(a) 0.975, 0.995
(b) 0.05, 0.01
(c) 0.05, 0.01


# Q3.22
For a chi‐square distribution with 2 df, find the areas:

(a) To the right of 5.991 and of 9.21.  
(b) To the right of 6.348.  
(c) Between 5.991 and 9.21.

## A3.22

In [22]:
chi2 = stats.chi2(2)
print(f"(a) {chi2.sf(5.991) :.3}, {chi2.sf(9.21) :.3}")
print(f"(b) {chi2.sf(6.348) :.3}")
print(f"(a) {chi2.cdf(9.21) - chi2.cdf(5.991) :.3}")

(a) 0.05, 0.01
(b) 0.0418
(a) 0.04


# Q3.23
For an F distribution with 2 numerator dfs and 30 denominator dfs, find the
areas:

(a) To the right of 3.32 and of 5.39.  
(b) To the right of 2.61.  
(c) Between 3.32 and 5.39.

## A3.23

In [23]:
f = stats.f(2, 30)
print(f"(a) {f.sf(3.32) :.3}, {f.sf(5.39) :.3}")
print(f"(b) {f.sf(2.61) :.3}")
print(f"(c) {f.cdf(5.39) - f.cdf(3.32) :.3}")

(a) 0.0498, 0.01
(b) 0.0902
(c) 0.0398


# Q3.24
In a study of intraobserver variability in assessing cervical smears, 3325
slides were screened for the presence or absence of abnormal squamous
cells. Each slide was screened by a particular observer and then rescreened
six months later by the same observer. The results are shown in Table E3.2.
Calculate the kappa statistic representing the agreement between the two
screenings.

In [24]:
index = ['Present', 'Absent', 'Total']

columns = [
    ['Second screening', 'Second screening', 'Total'],
    ['Present', 'Absent', '']
]

data = [
    [1763, 489, 2252],
    [403, 670, 1073],
    [2166, 1159, 3325],
]

df = pd.DataFrame(data=data, index=index, columns=columns)
df.index = df.index.rename('First screening')

df

Unnamed: 0_level_0,Second screening,Second screening,Total
Unnamed: 0_level_1,Present,Absent,Unnamed: 3_level_1
First screening,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Present,1763,489,2252
Absent,403,670,1073
Total,2166,1159,3325


## A3.24

In [25]:
n11 = df.loc['Present', ('Second screening', 'Present')]
n22 = df.loc['Absent', ('Second screening', 'Absent')]
n12 = df.loc['Present', ('Second screening', 'Absent')]
n21 = df.loc['Absent', ('Second screening', 'Present')]

n1t = df.loc['Present', ('Total', '')]
n2t = df.loc['Absent', ('Total', '')]
nt1 = df.loc['Total', ('Second screening', 'Present')]
nt2 = df.loc['Total', ('Second screening', 'Absent')]

kappa = 2 * (n11 * n22 - n12 * n21) / (n1t * n2t + nt1 * nt2)
print(f"Kappa statistic = {kappa :.4} (somewhat decent agreement)")

Kappa statistic = 0.3995 (somewhat decent agreement)


# Q3.25
Ninety‐eight heterosexual couples, at least one of whom was HIV‐infected,
were enrolled in an HIV transmission study and interviewed about sexual
behavior. Table E3.25 provides a summary of condom use reported by heterosexual
partners. How strongly do the couples agree?

In [26]:
index = ['Ever', 'Never', 'Total']

columns = [
    ['Man', 'Man', 'Total'],
    ['Ever', 'Never', '']
]

data = [
    [45, 6, 51],
    [7, 40, 47],
    [52, 46, 98],
]

df = pd.DataFrame(data=data, index=index, columns=columns)
df.index = df.index.rename('Woman')

df

Unnamed: 0_level_0,Man,Man,Total
Unnamed: 0_level_1,Ever,Never,Unnamed: 3_level_1
Woman,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Ever,45,6,51
Never,7,40,47
Total,52,46,98


## A3.25

In [27]:
n11 = df.loc['Ever', ('Man', 'Ever')]
n22 = df.loc['Never', ('Man', 'Never')]
n12 = df.loc['Ever', ('Man', 'Never')]
n21 = df.loc['Never', ('Man', 'Ever')]

n1t = df.loc['Ever', ('Total', '')]
n2t = df.loc['Never', ('Total', '')]
nt1 = df.loc['Total', ('Man', 'Ever')]
nt2 = df.loc['Total', ('Man', 'Never')]

kappa = 2 * (n11 * n22 - n12 * n21) / (n1t * n2t + nt1 * nt2)
print(f"Kappa statistic = {kappa :.4} (good agreement)")

Kappa statistic = 0.7342 (good agreement)
