In [12]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from scipy.stats import t
from statsmodels.formula.api import ols



In [13]:
# Load data
playbill = pd.read_csv("playbill.csv")
print(playbill.head())

         Production  CurrentWeek  LastWeek
0       42nd Street       684966    695437
1          Avenue Q       502367    498969
2  Beauty and Beast       594474    598576
3     Bombay Dreams       529298    528994
4           Chicago       570254    562964


In [14]:
# Fit linear model CurrentWeek ~ LastWeek
X = sm.add_constant(playbill['LastWeek'])  # add intercept
y = playbill['CurrentWeek']
model = ols('CurrentWeek ~ LastWeek', data=playbill).fit()
print(model.summary())

# ANOVA table
anova_results = sm.stats.anova_lm(model)
print(anova_results)

                            OLS Regression Results                            
Dep. Variable:            CurrentWeek   R-squared:                       0.997
Model:                            OLS   Adj. R-squared:                  0.996
Method:                 Least Squares   F-statistic:                     4634.
Date:                Sat, 06 Sep 2025   Prob (F-statistic):           3.87e-21
Time:                        16:19:09   Log-Likelihood:                -200.85
No. Observations:                  18   AIC:                             405.7
Df Residuals:                      16   BIC:                             407.5
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept   6804.8860   9929.318      0.685      0.5

  return hypotest_fun_in(*args, **kwds)


In [15]:
# Question 2a: 95% CI for slope b1
b1 = model.params['LastWeek']
b1_std_error = model.bse['LastWeek']
df = model.df_resid
t_star = t.ppf(0.975, df)
b1_ci = b1 + np.array([-1, 1]) * t_star * b1_std_error

print("95% CI for slope b1:", b1_ci)

95% CI for slope b1: [0.95149713 1.01266583]


In [17]:
# t-test for intercept b0
b0 = model.params['Intercept']  # use 'Intercept' when using formula API
b0_std_error = model.bse['Intercept']
b0_hyp = 10000
df = model.df_resid

t_stat = (b0 - b0_hyp) / b0_std_error

# two-tailed p-value
if t_stat > 0:
    p_val = 2 * (1 - t.cdf(t_stat, df))
else:
    p_val = 2 * t.cdf(t_stat, df)

print("Two-tailed p-value for intercept test:", p_val)

Two-tailed p-value for intercept test: 0.7517807298578961


In [18]:
# Question 2c: Prediction interval for x = 400000
x_new = 400000
y_pred = b0 + b1 * x_new

n = len(playbill)
s = 18010  # residual standard deviation (given)
x_mean = playbill['LastWeek'].mean()
s_x = playbill['LastWeek'].std(ddof=1)
SXX = (n - 1) * s_x**2

PI_width = t_star * s * np.sqrt(1 + 1/n + ((x_new - x_mean)**2 / SXX))
PI = y_pred + np.array([-1, 1]) * PI_width
print("Prediction interval for x=400000:", PI)


Prediction interval for x=400000: [359827.35182064 439447.60350672]


In [19]:
# Question 3a: 95% CI for b0
b0_val = 0.6417099
b0_se = 0.1222707
df3 = 28
t_star3 = t.ppf(0.975, df3)
b0_ci = b0_val + np.array([-1, 1]) * t_star3 * b0_se
print("95% CI for b0:", b0_ci)

95% CI for b0: [0.39124972 0.89217008]


In [20]:
# Question 3b: t-test
t_stat3b = (0.0112916 - 0.01) / 0.0008184
p_val3b = t.cdf(t_stat3b, df3)
print("p-value for t-test:", p_val3b)

p-value for t-test: 0.9371241529830511


In [21]:
# Question 3c: Prediction interval
y_pred3c = 0.6417 + 0.0113 * 130
s3c = 0.3298
n3c = 30  # assuming sample size
PI3c = y_pred3c + np.array([-1, 1]) * t_star3 * s3c * np.sqrt(1 + 1/n3c)
print("Prediction interval for Question 3c:", PI3c)

Prediction interval for Question 3c: [1.42396821 2.79743179]
