In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

%precision 3
%matplotlib inline

In [None]:
# Read the score table and pull the '点数' column out as a NumPy array;
# later cells treat these values as the population (see p_mean / p_var).
df = pd.read_csv('./data/ch4_scores400.csv')
score_column = df['点数']
scores = np.array(score_column)

In [None]:
# Population mean and population variance (ddof=0, NumPy's default) of the
# full score array.
p_mean = scores.mean()
p_var = scores.var()

p_mean, p_var

In [None]:
# Histogram of the population scores overlaid with the density of a normal
# distribution that has the same mean and variance.
fig, ax = plt.subplots(figsize=(10, 6))

xs = np.arange(101)
rv = stats.norm(loc=p_mean, scale=np.sqrt(p_var))
density = rv.pdf(xs)
ax.plot(xs, density, color='gray')
ax.hist(scores, bins=100, range=(0, 100), density=True)

plt.show()

In [None]:
# Fix the global seed, then draw one sample of size n from the population
# (np.random.choice draws with replacement by default).
np.random.seed(0)
n = 20
sample = np.random.choice(scores, size=n)
sample

In [None]:
# Re-seed and draw n_samples samples of size n in a single call; each row of
# `samples` is one sample.
np.random.seed(1111)
n_samples = 10_000
samples = np.random.choice(scores, size=(n_samples, n))
samples[:50]

In [None]:
# Display one individual sample (row index 5 of the sample matrix).
samples[5, :]

In [None]:
# Print the sample mean of each of the first five samples.
for i, row in enumerate(samples[:5]):
    s_mean = np.mean(row)
    print(f'{i+1}回目の標本平均：{s_mean:.3f}')

In [None]:
# One mean per row (per sample), then the average of those means, for
# comparison with p_mean.
sample_means = samples.mean(axis=1)
sample_means.mean()

In [None]:
# Mean of a single very large sample (one million draws with replacement).
# Consumes the global NumPy random state.
np.random.choice(scores, size=1_000_000).mean()

In [None]:
# Mean of the single size-n sample drawn earlier.
s_mean = sample.mean()
s_mean

In [None]:
# Print the ddof=0 (biased) sample variance of each of the first five samples.
for i, row in enumerate(samples[:5]):
    s_var = np.var(row)
    print(f'{i+1}回目の標本分散：{s_var:.3f}')

In [None]:
# ddof=0 variance of every sample (one per row), then their average, for
# comparison with p_var.
sample_vars = samples.var(axis=1)
sample_vars.mean()

In [None]:
# Compute the variance straight from its definition: for each sample, the
# mean squared deviation from that sample's own mean (divide by n), then
# average the result over all samples.
total = 0

for i in range(n_samples):
    deviations = samples[i] - np.mean(samples[i])
    total += sum(deviations**2) / n

s_var2 = total / n_samples
print(s_var2)

In [None]:
# Check that the variance computed around the population mean (instead of the
# sample mean) behaves as an unbiased estimator: average, over all samples,
# the mean squared deviation from p_mean.
total = 0

for i in range(n_samples):
    deviations = samples[i] - p_mean
    total += sum(deviations**2) / n

s_var3 = total / n_samples
print(s_var3)

In [None]:
# Unbiased (ddof=1) variance of every sample, then their average, for
# comparison with p_var.
sample_u_vars = samples.var(axis=1, ddof=1)
sample_u_vars.mean()

In [None]:
# Unbiased variance of a single very large sample (one million draws with
# replacement). Consumes the global NumPy random state.
np.random.choice(scores, size=1_000_000).var(ddof=1)

In [None]:
# Unbiased (ddof=1) variance of the single size-n sample drawn earlier.
u_var = sample.var(ddof=1)
u_var

In [None]:
# 95% confidence interval for the population mean, using the population
# variance p_var as if it were known (standard normal quantiles).
rv = stats.norm()
std_err = np.sqrt(p_var / n)
# isf(0.025) is the upper 2.5% point (positive); isf(0.975) is its negative
# counterpart, so subtracting it yields the upper limit.
lcl = s_mean - rv.isf(0.025) * std_err
ucl = s_mean - rv.isf(0.975) * std_err

lcl, ucl

In [None]:
# Draw the 95% confidence intervals (known population variance) computed from
# the first 20 samples against the population mean. Intervals that contain
# the population mean are gray; those that miss it are blue.
fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(111)

rv = stats.norm()
# Number of intervals to draw. It has its own name so that the notebook-wide
# n_samples (the number of rows in `samples`) is not overwritten; the earlier
# cells that loop over range(n_samples) then stay correct when re-run.
n_shown = 20
# The interval half-width does not depend on the sample, so compute the two
# offsets once outside the loop.
std_err = np.sqrt(p_var / n)
lower_offset = rv.isf(0.025) * std_err
upper_offset = rv.isf(0.975) * std_err

# Vertical reference line at the population mean, spanning all plotted rows.
ax.vlines(p_mean, 0, n_shown + 1)
for i in range(n_shown):
    sample_ = samples[i]
    s_mean_ = np.mean(sample_)
    lcl = s_mean_ - lower_offset
    ucl = s_mean_ - upper_offset
    color = 'gray' if lcl <= p_mean <= ucl else 'b'
    y = n_shown - i  # first sample is drawn at the top
    ax.scatter(s_mean_, y, color=color)
    ax.hlines(y, lcl, ucl, color=color)
ax.set_xticks([p_mean])
ax.set_xticklabels(['母平均'])

plt.show()

In [None]:
# Fraction of all simulated samples whose 95% interval (known population
# variance) contains the population mean.
rv = stats.norm()
std_err = np.sqrt(p_var / n)
cnt = 0
for sample_ in samples:
    s_mean_ = np.mean(sample_)
    lcl = s_mean_ - rv.isf(0.025) * std_err
    ucl = s_mean_ - rv.isf(0.975) * std_err
    covered = lcl <= p_mean <= ucl
    if covered:
        cnt += 1

cnt / len(samples)

In [None]:
# Rescale every unbiased sample variance by (n - 1) / p_var. The following
# plot compares these values with the chi-square density with n - 1 degrees
# of freedom.
dof = n - 1
sample_y = sample_u_vars * dof / p_var
sample_y

In [None]:
# Histogram of the rescaled sample variances overlaid with the chi-square
# density with n - 1 degrees of freedom.
fig, ax = plt.subplots(figsize=(10, 6))

xs = np.linspace(0, 40, 100)
rv = stats.chi2(df=n - 1)
chi2_density = rv.pdf(xs)
ax.plot(xs, chi2_density, color='gray')
hist, _, _ = ax.hist(sample_y, bins=100, range=(0, 40), density=True)

plt.show()

In [None]:
# 95% confidence interval for the population variance from the single sample,
# based on chi-square quantiles with n - 1 degrees of freedom.
rv = stats.chi2(df=n - 1)
scaled_var = (n - 1) * u_var
# Dividing by the larger quantile isf(0.025) gives the lower limit; dividing
# by the smaller quantile isf(0.975) gives the upper limit.
lcl = scaled_var / rv.isf(0.025)
ucl = scaled_var / rv.isf(0.975)

lcl, ucl

In [None]:
# Draw the 95% confidence intervals for the population variance computed from
# the first 20 samples. Intervals that contain the population variance are
# gray; those that miss it are blue.
fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(111)

rv = stats.chi2(df=n-1)
# Number of intervals to draw. It has its own name so that the notebook-wide
# n_samples (the number of rows in `samples`) is not overwritten; the earlier
# cells that loop over range(n_samples) then stay correct when re-run.
n_shown = 20
# The chi-square quantiles do not depend on the sample; look them up once.
upper_q = rv.isf(0.025)
lower_q = rv.isf(0.975)

# Vertical reference line at the population variance, spanning all rows.
ax.vlines(p_var, 0, n_shown + 1)
for i in range(n_shown):
    sample_ = samples[i]
    u_var_ = np.var(sample_, ddof=1)
    lcl = (n - 1) * u_var_ / upper_q
    ucl = (n - 1) * u_var_ / lower_q
    color = 'gray' if lcl <= p_var <= ucl else 'b'
    y = n_shown - i  # first sample is drawn at the top
    ax.scatter(u_var_, y, color=color)
    ax.hlines(y, lcl, ucl, color=color)
ax.set_xticks([p_var])
ax.set_xticklabels(['母分散'])

plt.show()

In [None]:
# Fraction of all simulated samples whose 95% chi-square interval contains
# the population variance.
rv = stats.chi2(df=n - 1)
upper_q = rv.isf(0.025)
lower_q = rv.isf(0.975)
cnt = 0
for sample_ in samples:
    u_var_ = np.var(sample_, ddof=1)
    lcl = (n - 1) * u_var_ / upper_q
    ucl = (n - 1) * u_var_ / lower_q
    covered = lcl <= p_var <= ucl
    if covered:
        cnt += 1

cnt / len(samples)

In [None]:
# 95% confidence interval for the population mean when the population
# variance is not used: the standard error is estimated from the sample's
# unbiased variance and the quantiles come from Student's t distribution with
# n - 1 degrees of freedom.
rv = stats.t(df=n-1)
# Use u_var (ddof=1 variance of `sample`, the same sample s_mean comes from).
# The previous code used s_var, which is only a leftover of an earlier print
# loop and holds the ddof=0 variance of samples[4], a different sample.
std_err = np.sqrt(u_var / n)
lcl = s_mean - rv.isf(0.025) * std_err
ucl = s_mean - rv.isf(0.975) * std_err

lcl, ucl

In [None]:
# Load the survey data and display the resulting frame.
enquete_path = './data/ch10_enquete.csv'
enquete_df = pd.read_csv(enquete_path)
enquete_df

In [None]:
# Extract the '知っている' column as an array and rebind n to its length.
# NOTE(review): the later interval uses s_mean * (1 - s_mean), so the column
# is presumably a 0/1 indicator - confirm against the data file.
answers = enquete_df['知っている']
enquete = np.array(answers)
n = enquete.shape[0]
enquete[:10]

In [None]:
# Sample mean of the survey column (rebinds s_mean for the cells below).
s_mean = np.mean(enquete)
s_mean

In [None]:
# Approximate 95% confidence interval for the mean of the survey column,
# using standard normal quantiles and s_mean * (1 - s_mean) as the variance
# estimate.
rv = stats.norm()
std_err = np.sqrt(s_mean*(1 - s_mean) / n)
lcl = s_mean - rv.isf(0.025) * std_err
ucl = s_mean - rv.isf(0.975) * std_err

lcl, ucl

In [None]:
# Load the access-count data and display the resulting frame.
access_path = './data/ch10_access.csv'
n_access_df = pd.read_csv(access_path)
n_access_df

In [None]:
# Extract the 'アクセス数' column as an array and rebind n to its length.
access_column = n_access_df['アクセス数']
n_access = np.array(access_column)
n = n_access.shape[0]
n_access[:10]

In [None]:
# Sample mean of the access counts (rebinds s_mean for the cells below).
s_mean = np.mean(n_access)
s_mean

In [None]:
# Same sample mean again, computed through the array method and stored under
# a second name.
s_mean2 = n_access.mean()
s_mean2

In [None]:
# Approximate 95% confidence interval for the mean access count, using
# standard normal quantiles and the sample mean itself as the variance
# estimate.
# NOTE(review): variance == mean matches a Poisson model for the counts; the
# code does not state that assumption - confirm.
rv = stats.norm()
std_err = np.sqrt(s_mean / n)
lcl = s_mean - rv.isf(0.025) * std_err
ucl = s_mean - rv.isf(0.975) * std_err

lcl, ucl