In [2]:
import scipy.stats as ss
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

##### 1 task

In [3]:
def sample_size(p, rel_tol) -> int:
    n = 0
    error = rel_tol + 1
    while error > rel_tol:
        n += 1
        sigma_phi = np.sqrt((1-p)/(n*p))
        error = sigma_phi
    return n


p = 0.025
rel_tol = 0.05

n = sample_size(p, rel_tol)
print(n)

15600


##### 2 task

In [4]:
def sample_size(gamma, alpha, delta):
    error = delta + 1
    n = 1
    while error >= delta:
        z_quantile = ss.norm.ppf(1 - alpha)
        phi_z_quantile = ss.norm.pdf(z_quantile)
        error = (gamma/(6*np.sqrt(n)))*(2*z_quantile**2+1)*phi_z_quantile
        n+=1
    return n-1

result = sample_size(2, 0.025, 0.025 * 0.1)
print(result)

4579


##### 3 task

In [18]:
def pivot_interval(n, k, gamma):
    sample_mean = k/n
    z = ss.norm.ppf((1-gamma)/2)
    a = z**2 + n
    b = (2*n*sample_mean + z**2)
    c = n*sample_mean**2
    p_R = -(-b - np.sqrt(b**2-4*a*c))/(2*a)
    p_L = -(-b + np.sqrt(b**2-4*a*c))/(2*a)
    return p_L, p_R


n = 20
k = 3
gamma = 0.95

p_alpha, p_beta = pivot_interval(n, k, gamma)
print("Доверительный интервал для p:", (p_alpha, p_beta))

Доверительный интервал для p: (0.052368745896216595, 0.36041886474075696)


##### 4 task

In [6]:
def var_interval(sample, gamma):
    n = len(sample)
    alpha = 1 - gamma
    g_1 = ss.chi2.ppf(alpha / 2, n - 1)
    g_2 = ss.chi2.ppf(1 - alpha / 2, n - 1)
    sample_variance = np.var(sample, ddof=1)
    lower_bound = (n - 1) * sample_variance / g_2
    upper_bound = (n - 1) * sample_variance / g_1
    return (lower_bound, upper_bound)


print(var_interval([0, 1], 0.95))

(0.09952454760270645, 509.12913485983375)


##### 5 task

In [7]:
def loc_interval(sample, gamma, number_of_trials):
    # MONTE-CARLO for quantiles
    n = len(sample)
    loc = 1
    scale = 5
    sample_monte_carlo = ss.cauchy(loc=loc, scale=scale).rvs((number_of_trials, n))
    med = np.median(sample_monte_carlo, axis=1)
    MAD = np.median(abs(sample_monte_carlo - np.resize(med, (number_of_trials, n))), axis=1)
    g = (med - loc)/MAD
    alpha = 1 - gamma
    quantile_1 = np.quantile(g, alpha/2)
    quantile_2 = np.quantile(g, 1 - alpha/2)
    # CONFIDENCE INTERVAL
    med_loc = np.median(sample)
    MAD_loc = np.median(abs(sample - np.resize(med_loc, (1, n))))
    loc_2 = med_loc - MAD_loc*quantile_1
    loc_1 = med_loc - MAD_loc*quantile_2
    return loc_1, loc_2


sample = [-0.93, -1.84, -0.84, -0.13, -0.63, 0.06, -0.93, 13.29, 0.9, -2.64,
          -0.37, 0.43, -2.41, 19.33, -0.18, 1.29, 1.32, -0.47, -0.27, 0.27,
          1.07, -1.49, -0.78, 0.59, -0.0, -1.59, -0.28, -1.38, 0.1, 1.72]
print(loc_interval(sample, 0.95, 10000))

(-0.6044365545271958, 0.16065599615307222)


##### 6 task

In [78]:
from scipy.optimize import root_scalar
def ratio(eps):
    sigma_1 = 1
    sigma_2 = 5
    sigma_mean = (1-eps)*sigma_1**2 + eps*sigma_2**2
    alpha = 0.5
    x_alpha = 0
    mixture_pdf = (1-eps)*(1/sigma_1/np.sqrt(2*np.pi))*np.exp(-x_alpha**2/2/sigma_1**2) + \
        eps*(1/sigma_2/np.sqrt(2*np.pi))*np.exp(-x_alpha**2/2/sigma_2**2)
    sigma_median = alpha*(1-alpha)/mixture_pdf**2
    ratio = sigma_mean/sigma_median
    return ratio


print(root_scalar(lambda x: ratio(x) - 1, bracket=[0, 0.5]).root)
eps = 0
print(ratio(eps))

0.026667806686548504
0.6366197723675814


##### 7 task

In [None]:
def corr_interval(sample, gamma):
    n = len(sample)
    x, y = zip(*sample)
    rho = np.corrcoef(x, y)[1, 0]
    z_conf = ss.norm.ppf(1 - (1 - gamma) / 2)
    ci_left = np.tanh(np.arctanh(rho) - z_conf/np.sqrt(n))
    ci_right = np.tanh(np.arctanh(rho) + z_conf/np.sqrt(n))
    return [ci_left, ci_right]


data = np.array([(576, 3.39), (635, 3.30), (558, 2.81), (578, 3.03), (666, 3.44),
                 (580, 3.07), (555, 3.0), (661, 3.43), (651, 3.36), (605, 3.13),
                 (653, 3.12), (575, 2.74), (545, 2.76), (572, 2.88), (594, 2.96)])

print(corr_interval(data, 0.95))

[0.4854712460301556, 0.9124958350575495]


##### 8 task

In [79]:
def corr_interval(sample, gamma):
    n = len(sample)
    x, y = zip(*sample)
    rho = np.corrcoef(x, y)[1, 0]
    z_conf = ss.norm.ppf(1 - (1 - gamma) / 2)
    ci_left = np.tanh(np.arctanh(rho) - z_conf/np.sqrt(n))
    ci_right = np.tanh(np.arctanh(rho) + z_conf/np.sqrt(n))
    return [ci_left, ci_right]

def true_proba(n, rho, gamma, number_of_trials):
    alpha = np.sqrt(rho / (1 - rho))
    z = ss.expon.rvs(size=(number_of_trials, n))
    x = ss.expon.rvs(size=(number_of_trials, n)) + alpha * z
    y = ss.expon.rvs(size=(number_of_trials, n)) + alpha * z
    samples = np.stack([x, y], axis=-1)
    corr_interval_arr = [corr_interval(sample, gamma) for sample in samples]
    left, right = zip(*corr_interval_arr)
    left = np.array(left)
    right = np.array(right)
    prob_left = np.sum(rho > right)/number_of_trials
    prob_right = np.sum(rho < left)/number_of_trials
    return prob_right, prob_left

print(true_proba(30, 0.6, 0.95, 10000))

(0.1023, 0.0757)


##### 9 task

In [71]:
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt


def efron_true_proba_slow(n, rho, gamma, number_of_trials, n_of_resamples):
    alpha = np.sqrt(rho / (1 - rho))
    efron_borders = []
    rng = np.random.default_rng()
    for i in tqdm(range(number_of_trials)):
        z = ss.expon.rvs(size=n)
        x = ss.expon.rvs(size=n) + alpha * z
        y = ss.expon.rvs(size=n) + alpha * z
        sample = np.stack([x, y], axis=-1)

        corr_coeff_arr = []
        for _ in range(n_of_resamples):
            bootstrap_sample = rng.choice(sample, size=n)
            # print(bootstrap_sample.shape)
            corr_coeff = np.corrcoef(bootstrap_sample, rowvar=False)[0,1]
            corr_coeff_arr.append(corr_coeff)
        efron_borders.append([np.quantile(corr_coeff_arr, (1-gamma)/2),
                              np.quantile(corr_coeff_arr, 1-(1-gamma)/2)])

    left, right = zip(*efron_borders)
    plt.plot(left)
    plt.plot(right)
    left = np.array(left)
    right = np.array(right)
    prob_left = np.sum(rho > right)/number_of_trials
    prob_right = np.sum(rho < left)/number_of_trials
    return prob_right, prob_left


def calculate_correlation_matrices(a):
    # Reshape 'a' to (m, 2, n) to perform calculations along the correct axes
    reshaped_a = a.swapaxes(1, 2)

    # Calculate correlation matrix for each subarray using vectorized operations
    mean_a = reshaped_a.mean(axis=2, keepdims=True)
    std_a = reshaped_a.std(axis=2, keepdims=True)
    centered_a = reshaped_a - mean_a
    normalized_a = centered_a / std_a

    # Transpose 'normalized_a' to (m, n, 2) and reshape it back to (m, 2, n) to align dimensions
    normalized_a = normalized_a.transpose(0, 2, 1)

    # Calculate correlation matrix
    corr_matrices = np.matmul(
        normalized_a, normalized_a.transpose(0, 2, 1)) / len(a[0])
    corr_arr = np.array([corr_matrice[0, 1] for corr_matrice in corr_matrices])
    return corr_arr

def efron_true_proba(n, rho, gamma, number_of_trials, n_of_resamples):
    alpha = np.sqrt(rho / (1 - rho))
    efron_borders = []
    for i in tqdm(range(number_of_trials)):
        z = ss.expon.rvs(size=n)
        x = ss.expon.rvs(size=n) + alpha * z
        y = ss.expon.rvs(size=n) + alpha * z
        sample = np.stack([x, y], axis=-1)
        # print(sample.shape)

        indices = np.random.randint(0, n, size=(n_of_resamples, n))
        resamples = sample[indices]
        # print(resamples.shape)
        
        corr_coeff_arr = calculate_correlation_matrices(resamples)
        
        
        efron_borders.append([np.quantile(corr_coeff_arr, (1-gamma)/2), 
                              np.quantile(corr_coeff_arr, 1-(1-gamma)/2)])

    left, right = zip(*efron_borders)
    plt.plot(left)
    plt.plot(right)
    left = np.array(left)
    right = np.array(right)
    prob_left = np.sum(rho > right)/number_of_trials
    prob_right = np.sum(rho < left)/number_of_trials
    return prob_right, prob_left


# print(efron_true_proba(30, 0.6, 0.95, 100, 1000))
print(efron_true_proba_slow(30, 0.6, 0.95, 10000, 1000))

  0%|          | 0/10000 [00:00<?, ?it/s]

 14%|█▍        | 1398/10000 [02:33<15:45,  9.10it/s]


KeyboardInterrupt: 

In [14]:
rho = 0.6
alpha = np.sqrt(rho / (1 - rho))
n_of_resamples = 3
n = 5
z = ss.expon.rvs(size=(n_of_resamples, n), random_state=1)
x = ss.expon.rvs(size=(n_of_resamples, n), random_state=1) + alpha * z
y = ss.expon.rvs(size=(n_of_resamples, n), random_state=1) + alpha * z
sample = np.stack([x, y], axis=-1)
# print(sample)
np.reshape(sample, (n_of_resamples, -1, 1))

a = [[[1, 2], [10, 20], [100, 200]], [[3, 4], [30, 30], [300, 400]]]
b = [[[1,10,100], [2, 20, 200]], [[3, 30, 300], [4, 40, 400]]]
# a = np.reshape(a, (2, 2, -1), )
# a
x, y = zip(*b)

def calculate_correlation(sample):
    sample = np.reshape(sample, (-1, 2))
    x, y = zip(*sample)
    mean_x = np.mean(x, axis=-1)
    mean_y = np.mean(y, axis=-1)
    var_x = np.var(x, axis=-1)
    var_y = np.var(y, axis=-1)
    print((x - mean_x) * (y - mean_y))
    covariance = np.mean((x - mean_x[..., 0]) * (y - mean_y[..., 0]), axis=-1)
    print(covariance)
    pass
    # correlation = covariance / np.sqrt(var_x * var_y)
    # return correlation

[ 7835.33333333  5717.33333333  2357.33333333  7478.66666667
  3490.66666667 65690.66666667]


IndexError: invalid index to scalar variable.

In [37]:
a = [[[1, 2], [5, -3], [-3, -7]], [[3, 4], [30, 30], [300, 400]]]
print(np.corrcoef(a[0]))
x,y = zip(*a[0])
x = np.array(x)
y = np.array(y)

[[ 1. -1. -1.]
 [-1.  1.  1.]
 [-1.  1.  1.]]


In [46]:
import numpy as np

# Assuming 'a' is your array with shape (m, n, 2)
# Create some random data for demonstration
a = np.array([[[1, 2], [5, -3], [-3, -7]], [[3, 4], [30, 30], [300, 400]]])

def calculate_correlation_matrices(a):
    # Reshape 'a' to (m, 2, n) to perform calculations along the correct axes
    reshaped_a = a.swapaxes(1, 2)

    # Calculate correlation matrix for each subarray using vectorized operations
    mean_a = reshaped_a.mean(axis=2, keepdims=True)
    std_a = reshaped_a.std(axis=2, keepdims=True)
    centered_a = reshaped_a - mean_a
    normalized_a = centered_a / std_a

    # Transpose 'normalized_a' to (m, n, 2) and reshape it back to (m, 2, n) to align dimensions
    normalized_a = normalized_a.transpose(0, 2, 1)

    # Calculate correlation matrix
    corr_matrices = np.matmul(normalized_a, normalized_a.transpose(0, 2, 1)) / n
    corr_arr = np.array([corr_matrice[0,1] for corr_matrice in corr_matrices])
    return corr_arr

calculate_correlation_matrices(a)
# np.corrcoef(a[0])

array([-0.02868852,  0.24489943])