In [1]:
import numpy as np
import pandas as pd

In [93]:
# Read the two tab-separated bootstrap datasets from the working directory.
# NOTE(review): the numbering is crossed -- dataset1 is read from ds-boot-2.csv
# and dataset2 from ds-boot-1.csv; confirm this is intentional.
dataset1 = pd.read_csv("./ds-boot-2.csv", delimiter='\t')
dataset2 = pd.read_csv("./ds-boot-1.csv", delimiter='\t')

In [94]:
# Display the first rows of dataset1 to check how the file was parsed.
dataset1.head()

Unnamed: 0,id,p1,p2,p3,p4,p5,p6,p7,p8,p9,...,p24,p25,p26,p27,p28,y1,y2,y3,y4,y5
0,11,6.48148,3.0,5.0,7.75,0.0,7.16667,8.16667,9.66667,6.16667,...,1.66667,3.16667,0.0,0.0,0.0,1.0,5.0,7.44,1.18,4.38
1,12,5.74074,4.0,8.0,7.33333,8.0,8.83333,9.75,9.66667,9.0,...,2.5,5.5,5.0,8.66667,8.0,4.5,4.25,8.93,2.0,6.03
2,25,7.59259,7.0,8.0,7.66667,8.0,9.66667,9.5,6.16667,9.66667,...,3.5,3.5,9.0,6.5,7.0,7.5,11.0,8.97,2.0,9.12
3,31,5.96297,4.0,8.0,9.33333,10.0,9.33333,7.0,8.5,9.66667,...,0.0,0.0,0.0,0.0,0.0,4.0,6.25,8.93,1.82,6.41
4,48,5.44444,1.0,3.5,6.41667,9.0,8.5,7.08333,6.33333,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3.0,8.08,1.36,3.67


In [95]:
# Split dataset1 into the regression targets (y1..y5) and the predictor columns.
target_names = ['y' + str(i) for i in range(1, 6)]

# Columns that must not be used as predictors: the targets themselves, the
# record id, and 'Unnamed: 0', which in the preview holds 0, 1, 2, ... and so
# appears to be a row index saved into the CSV rather than a real feature.
non_feature_columns = target_names + ['id', 'Unnamed: 0']

# Only (re)extract the targets while they are still present, so running this
# cell a second time is a no-op instead of a KeyError.
if all(name in dataset1.columns for name in target_names):
    targets = dataset1[target_names].copy()

# Rebind instead of dropping in place; errors='ignore' keeps the cell
# re-runnable after the columns have already been removed.
dataset1 = dataset1.drop(non_feature_columns, axis=1, errors='ignore')

In [34]:
def bootstrap_estimate(func, data, B=1000, random_state=None):
    """
    Estimate a statistic and its spread with the non-parametric bootstrap.

    B resamples of len(data) values are drawn from data with replacement,
    func is applied to every resample, and the resulting B statistics are
    summarised.

    Args:
        func - statistic to estimate, applied to each resample (one row)
        data - vector of values (1-D sequence / array / Series)
        B - number of bootstrap samples
        random_state - optional seed for reproducible resampling; when None
            the global np.random state is used (the previous behaviour)

    Returns:
        estimate, std - mean and standard deviation (np.std, ddof=0) of the
        B bootstrap statistics

    Raises:
        ValueError - if data is empty or B is smaller than 1
    """
    values = np.asarray(data)
    if values.size == 0:
        raise ValueError("data must contain at least one value")
    if B < 1:
        raise ValueError("B must be a positive integer")

    # A private RandomState seeded by the caller leaves the global stream
    # untouched; the np.random module itself is the unseeded fallback.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # One row per bootstrap replicate, each as long as the original sample.
    samples = rng.choice(values, size=(B, len(values)))
    statistics = np.apply_along_axis(func, axis=1, arr=samples)
    return np.mean(statistics), np.std(statistics)

In [35]:
# For every target column, print bootstrap estimates of the mean and of the
# median in the form "(estimate +- 1.96 * std)".
summary_statistics = (("Mean:", np.mean), ("Median:", np.median))

for target_name in target_names:
    target = targets[target_name]
    print("Evaluating", target_name)

    for heading, statistic in summary_statistics:
        print(heading)
        estimate, spread = bootstrap_estimate(statistic, target)
        print("({} +- 1.96 * {}) for 95% confidence".format(estimate, spread))
        print()
    # One extra blank line separates consecutive targets.
    print()

Evaluating y1
Mean:
(4.025525 +- 1.96 * 0.46388209641567324) for 95% confidence

Median:
(4.00625 +- 1.96 * 0.573714595857557) for 95% confidence


Evaluating y2
Mean:
(5.49970625 +- 1.96 * 0.52201355911359) for 95% confidence

Median:
(6.172875 +- 1.96 * 0.7124814800224635) for 95% confidence


Evaluating y3
Mean:
(7.33241775 +- 1.96 * 0.45930786807155555) for 95% confidence

Median:
(8.309085 +- 1.96 * 0.2750371570806391) for 95% confidence


Evaluating y4
Mean:
(1.5212154999999998 +- 1.96 * 0.11332830983364219) for 95% confidence

Median:
(1.85524 +- 1.96 * 0.13428623309930174) for 95% confidence


Evaluating y5
Mean:
(5.607217 +- 1.96 * 0.4242976871089448) for 95% confidence

Median:
(6.195824999999999 +- 1.96 * 0.4247392074850167) for 95% confidence




In [70]:
# Draw a 1000 x 40 matrix of values sampled with replacement from 0..39.
# NOTE(review): `test` is not referenced by any other cell visible here; this
# looks like a leftover shape experiment -- confirm before keeping it.
test = np.random.choice(np.arange(40), size=(1000, 40))

In [100]:
def bootstrap_estimate_regressor(regressor_class, X, y, B=1000, random_state=None):
    """
    Bootstrap the fitted coefficients of a regressor.

    Rows of (X, y) are resampled with replacement B times; a fresh
    regressor_class() is fitted on every resample and its coef_ and
    intercept_ are recorded.

    Args:
        regressor_class - class of the regressor (example: LinearRegression);
            instantiated with no arguments, must expose fit(), coef_ and
            intercept_
        X - data, 2-D np.array or DataFrame (rows are observations)
        y - target, 1-D np.array or Series aligned with the rows of X
        B - number of bootstrap samples
        random_state - optional seed for reproducible resampling; when None
            the global np.random state is used (the previous behaviour)

    Returns:
        means, stds: tuple of lists of values, one entry per coefficient in
        coef_ order followed by a final entry for the intercept

    Raises:
        ValueError - if X is empty, X and y differ in length, or B < 1
    """
    # Work on plain arrays so that X and y are both indexed by position.
    # Indexing a Series with y[indices] is label-based and picks the wrong
    # rows (or raises) whenever its index is not 0..n-1.
    features = np.asarray(X)
    response = np.asarray(y)

    n_samples = features.shape[0]
    if n_samples == 0:
        raise ValueError("X must contain at least one row")
    if response.shape[0] != n_samples:
        raise ValueError("X and y must have the same number of rows")
    if B < 1:
        raise ValueError("B must be a positive integer")

    rng = np.random if random_state is None else np.random.RandomState(random_state)
    resample_indices = rng.choice(n_samples, size=(B, n_samples))
    coefficients = np.empty((B, features.shape[1] + 1))  # + 1 for the intercept

    for b, rows in enumerate(resample_indices):
        regressor = regressor_class()
        regressor.fit(features[rows], response[rows])
        coefficients[b] = np.concatenate((regressor.coef_, [regressor.intercept_]))

    # Summarise every coefficient (column) across the B replicates.
    coef_means = list(coefficients.mean(axis=0))
    coef_stds = list(coefficients.std(axis=0))
    return coef_means, coef_stds

In [106]:
# Print bootstrap estimates "(mean +- 1.96 * std)" for every regression
# coefficient; the last entry returned by the helper is the intercept.
# Only the first target is processed, exactly as the original loop did by
# exiting after its first pass.
# NOTE(review): confirm whether the remaining targets were meant to be
# evaluated too. Also, LinearRegression is not imported in any cell visible
# here -- verify it is imported before this cell runs.
for target_name in target_names[:1]:
    target = targets[target_name]
    print("Evaluating", target_name, "confidence intervals for coeffs for 95% confidence")

    coef_means, coef_stds = bootstrap_estimate_regressor(LinearRegression, dataset1, target)
    intercept_position = len(coef_means) - 1
    for position, (center, spread) in enumerate(zip(coef_means, coef_stds)):
        if position == intercept_position:
            label = "the intercept"
        else:
            label = "coefficient_" + str(position)
        print("Estimating", label)
        print("({} +- 1.96 * {})".format(center, spread))

Evaluating y1 confidence intervals for coeffs for 95% confidence
Estimating coefficient_0
(0.30561763092188843 +- 1.96 * 0.28629314340667844)
Estimating coefficient_1
(-0.01718694201154082 +- 1.96 * 0.2512012182572404)
Estimating coefficient_2
(-0.08628920917886876 +- 1.96 * 0.32576866311491076)
Estimating coefficient_3
(0.5812261095873973 +- 1.96 * 0.3900368359664032)
Estimating coefficient_4
(-0.2509984338468361 +- 1.96 * 0.31884929791192834)
Estimating coefficient_5
(0.25209688794552904 +- 1.96 * 0.5069444065123211)
Estimating coefficient_6
(-0.3885723975919868 +- 1.96 * 0.40987195206611277)
Estimating coefficient_7
(-0.09653747663312122 +- 1.96 * 0.31714953728681966)
Estimating coefficient_8
(-0.039353800339190434 +- 1.96 * 0.3517686739308529)
Estimating coefficient_9
(-0.20557575518219762 +- 1.96 * 0.3076490700055431)
Estimating coefficient_10
(0.3822948801529185 +- 1.96 * 0.36491580956110503)
Estimating coefficient_11
(0.05261601717619038 +- 1.96 * 0.36343251931176745)
Estimating