In [381]:
import numpy as np
import pandas as pd
from scipy import stats

In [383]:
def exact_MSE_regression_est_weighted(var_vec = [stats.beta(a=5, b=7).var(), stats.beta(a=5, b=3).var()], 
                                      mue = stats.beta(a=7, b=5).mean(), 
                                      n_list = [10, 20, 30], w_list = [0.2, 0.4, 0.6], 
                                      v_mat = [[0.1, 0.6, 0.3], [0.2, 0.8, 0.0], [0.3, 0.5, 0.2]], 
                                      x_vec = [1.0, 1.0]):
    '''
    Calculate exact error for linear regression, in special case where x-distribution is 0-mean multivariate 
    normal. Assumes the parameters have 0 correlation (parameter in 1st dimension is independent of parameter
    in 2nd dimension, for example). 

    Every formula divides by (n - D - 1), so each player needs more than D + 1 samples for the result to be 
    meaningful. The list defaults are only read, never mutated.

    Args:
        var_vec: a list (length D) with variance of true parameter values
        mue: mean of true error distribution. 
        n_list: a list of length M (number of players) with the number of samples each has. 
        w_list: a list of w-weights each player uses (in [0, 1]) for coarse-grained federation. 
        v_mat: a matrix (list of lists) of weights each player uses in fine-grained federation: the rows sum up 
               to 1.
        x_vec: a list of E[X_d^2] for d in dimension D for input distribution of X. 
    Returns:
        dataframe (rows 'local', 'uniform', 'coarse', 'fine'; one column per player) with average error for 
        each player under local learning and uniform, coarse-grained, and fine-grained federation.  
    '''
    # dataframe for storing error
    player_error = pd.DataFrame(data = 0.0, index = ['local', 'uniform', 'coarse', 'fine'], 
                                columns = range(len(n_list)))
    D = len(var_vec)
    n_arr = np.asarray(n_list, dtype = float)
    N = n_arr.sum()

    # quantities shared by every player, computed once
    var_prod = float(np.dot(x_vec, var_vec))          # sum_d E[X_d^2] * Var(theta_d)
    err_scale = mue * D
    inv_dof = 1.0 / (n_arr - D - 1)                   # 1 / (n_j - D - 1) for each player j
    n_sq_total = np.sum(n_arr**2)
    pooled_err_term = err_scale * np.sum(n_arr**2 * inv_dof) / N**2

    # for each player, calculate their true error 
    for i in range(len(w_list)):
        w = w_list[i]
        n = n_arr[i]
        v = np.asarray(v_mat[i], dtype = float)

        # term driven by the spread of the true parameters across players
        pooled_param_term = var_prod * (n_sq_total - n**2 + (N - n)**2) / N**2

        # NOTE: write with .loc[row, col]; chained `.loc[row][col] = ...` assigns into a temporary 
        # under pandas copy-on-write and would leave the frame at zero.
        player_error.loc['local', i] = err_scale * inv_dof[i]

        player_error.loc['uniform', i] = pooled_err_term + pooled_param_term

        player_error.loc['coarse', i] = ((1 - w)**2 * pooled_err_term + 
                                         err_scale * (w**2 + 2 * (1 - w) * w * n / N) * inv_dof[i] + 
                                         (1 - w)**2 * pooled_param_term)

        player_error.loc['fine', i] = (err_scale * np.sum(v**2 * inv_dof) + 
                                       var_prod * (np.sum(v**2) - v[i]**2 + (1 - v[i])**2))

    return player_error

In [384]:
# Closed-form errors under the default parameters (baseline for the sanity checks in the following cells).
exact_MSE_regression_est_weighted()

Unnamed: 0,0,1,2
local,0.166667,0.068627,0.04321
uniform,0.070281,0.055368,0.040456
coarse,0.060535,0.041893,0.032399
fine,0.086632,0.054167,0.077729


In [385]:
# Sanity check, mimicking local learning: w = 1 and an identity v_mat put all of each player's
# weight on their own estimate, so the 'coarse' and 'fine' rows should reproduce the 'local' row.
exact_MSE_regression_est_weighted(w_list = [1.0, 1.0, 1.0], 
                                  v_mat = [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]])

Unnamed: 0,0,1,2
local,0.166667,0.068627,0.04321
uniform,0.070281,0.055368,0.040456
coarse,0.166667,0.068627,0.04321
fine,0.166667,0.068627,0.04321


In [386]:
# Sanity check, mimicking uniform federation: w = 0 and every v_mat row equal to the sample shares
# n_j / N = [10, 20, 30] / 60, so the 'coarse' and 'fine' rows should reproduce the 'uniform' row.
exact_MSE_regression_est_weighted(w_list = [0.0, 0.0, 0.0], 
                                  v_mat = [[1.0/6, 2.0/6, 0.5], [1.0/6, 2.0/6, 0.5], [1.0/6, 2.0/6, 0.5]])

Unnamed: 0,0,1,2
local,0.166667,0.068627,0.04321
uniform,0.070281,0.055368,0.040456
coarse,0.070281,0.055368,0.040456
fine,0.070281,0.055368,0.040456


In [387]:
def simulate_regression(params_dists = [stats.beta(a=5, b=7), stats.beta(a=5, b=3)], 
                        err_dist = stats.beta(a=7, b=5), draws_dist = stats.norm, 
                        n_list = [10, 20, 30], w_list = [0.2, 0.4, 0.6], 
                        v_mat = [[0.1, 0.6, 0.3], [0.2, 0.8, 0.0], [0.3, 0.5, 0.2]], 
                        world_nrun = 100, sample_nrun = 1, test_nrun = 10,
                        x_cov = [[1.0, 0.0], [0.0, 1.0]]):
    
    '''
    Simulate federated linear regression and estimate each player's expected squared prediction error.

    For every "world", true parameters and noise levels are drawn per player; each player then fits OLS 
    on its own samples, the local fits are combined into uniform, coarse-grained and fine-grained 
    federated estimates, and every estimate is scored against the player's noiseless true response on 
    freshly drawn test inputs. Random draws use scipy's default (global numpy) random state, so call 
    np.random.seed beforehand for reproducible output.

    Args:
        params_dists: length-D list of distributions to draw parameters from.  
        err_dist: distribution to draw true error parameters from (err = epsilon^2) (scalar)
        draws_dist: distribution each player draws from; called as draws_dist(loc, scale) with the 
                    noiseless response as loc and sqrt(err) as scale. 
        n_list: a list of length M (number of players) with the number of samples each has. 
        w_list: a list of w-weights each player uses for coarse-grained federation.  
        v_mat: a matrix (list of lists) of weights each player uses in fine-grained federation: the rows sum up 
               to 1.
        world_nrun: number of times where means and errors are re-drawn
        sample_nrun: for each worldrun, number of times samples are re-drawn
        test_nrun: for each sample_nrun, number of points we use to calculate expected test error
        x_cov: matrix representing covariance of input x distribution (inputs are drawn with zero mean)
        
    Returns:
        dataframe (rows 'local', 'uniform', 'coarse', 'fine'; one column per player) with average error 
        for each player under local learning and uniform, coarse-grained, and fine-grained federation.  
    '''
    methods = ['local', 'uniform', 'coarse', 'fine']
    M = len(w_list)
    D = len(x_cov)

    n_arr = np.asarray(n_list, dtype = float)
    w_col = np.asarray(w_list, dtype = float).reshape(M, 1)    # column so it scales each player's row
    v_arr = np.asarray(v_mat, dtype = float)

    x_dist = stats.multivariate_normal(mean = np.zeros(D), cov = x_cov)

    def draw_inputs(count):
        # rvs squeezes singleton axes (count == 1 or D == 1); always hand back a (count, D) matrix
        return np.reshape(x_dist.rvs(count), (count, D))

    # float accumulators of summed squared error, one entry per player
    sq_err_total = {method: np.zeros(M) for method in methods}

    for _ in range(world_nrun):
        # draw means and errors: means is M x D (row i holds player i's true parameters)
        means = np.column_stack([dist.rvs(M) for dist in params_dists])
        errors = err_dist.rvs(M)

        for _ in range(sample_nrun):

            # draw samples for each player, calculate local estimates
            local_est = np.zeros((M, D))
            for i in range(M):
                # draw X values
                X = draw_inputs(n_list[i])

                # draw Y values noisily around the noiseless response (computed once per player)
                noiseless = X.dot(means[i])
                noise_sd = np.sqrt(errors[i])
                Y = np.array([draws_dist(mu, noise_sd).rvs(1)[0] for mu in noiseless])

                # calculate local estimates through OLS (pseudo-inverse keeps rank-deficient draws usable)
                local_est[i] = np.linalg.pinv(X.T.dot(X)).dot(X.T).dot(Y)

            # calculate federated estimates
            uniform_est = n_arr.dot(local_est) / n_arr.sum()            # sample-size weighted average
            uniform_rows = np.tile(uniform_est, (M, 1))                 # same estimate for every player
            coarse_est = w_col * local_est + (1 - w_col) * uniform_rows
            fine_est = v_arr.dot(local_est)                             # row i = sum_j v[i][j] * local_est[j]

            # calculate expected MSE against each player's noiseless true response
            X_test = draw_inputs(test_nrun)
            target = X_test.dot(means.T)                                # test_nrun x M
            estimates = {'local': local_est, 'uniform': uniform_rows, 
                         'coarse': coarse_est, 'fine': fine_est}
            for method in methods:
                residual = target - X_test.dot(estimates[method].T)
                sq_err_total[method] += (residual**2).sum(axis = 0)

    player_error = pd.DataFrame([sq_err_total[method] for method in methods], 
                                index = methods, columns = range(len(n_list)))
    player_error = player_error/(world_nrun * sample_nrun * test_nrun)

    return player_error

Compare simulations to exact values. 

In [388]:
# Monte Carlo estimate with the default settings (100 worlds x 1 sample draw x 10 test points).
# No seed is set in this cell, so the numbers shift from run to run.
simulate_regression()

Unnamed: 0,0,1,2
local,0.157821,0.069628,0.050731
uniform,0.079156,0.057901,0.035837
coarse,0.063528,0.042912,0.034229
fine,0.10266,0.049946,0.068901


In [389]:
# Closed-form values for the same default parameters, repeated for side-by-side comparison
# with the simulated table above.
exact_MSE_regression_est_weighted()

Unnamed: 0,0,1,2
local,0.166667,0.068627,0.04321
uniform,0.070281,0.055368,0.040456
coarse,0.060535,0.041893,0.032399
fine,0.086632,0.054167,0.077729


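Investigation of the correct scaling factor for the expected value of the inverse Gram matrix $(X^\top X / n)^{-1}$ that appears in the OLS estimate; the expected value of $X^\top X$ itself is checked alongside.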

In [390]:
n_sim = 1000
n_test = 5
n_dim = 2

# x_dist is otherwise only a local inside simulate_regression; define it here (zero mean, identity
# covariance, matching that function's defaults) so the cell runs on a fresh kernel.
x_dist = stats.multivariate_normal(mean = np.zeros(n_dim), cov = np.eye(n_dim))

# accumulate pinv(X^T X / n_test) over n_sim independent draws of an (n_test x n_dim) input matrix
inv_gram_total = np.zeros((n_dim, n_dim))
for _ in range(n_sim):
    X = np.reshape(x_dist.rvs(n_test), (n_test, n_dim))
    inv_gram_total += np.linalg.pinv(X.T.dot(X) / n_test)

# summed (not yet averaged) inverse Gram matrices; divide by n_sim to get the Monte Carlo mean
store = pd.DataFrame(inv_gram_total)

In [391]:
# Monte Carlo average of pinv(X^T X / n_test) over the n_sim draws accumulated in `store`.
store/n_sim

Unnamed: 0,0,1
0,2.409684,0.107336
1,0.107336,2.490382


In [392]:
# Candidate closed form for the simulated average: identity * (n_test - 1) / (n_test - 2 - 2).
# NOTE(review): with n_test = 5 this evaluates to 4.0, which does not match the simulated diagonal
# (about 2.4-2.5). identity * n_test / (n_test - D - 1) with D = 2 gives 2.5 and agrees with the
# 1/(n - D - 1) factor used in exact_MSE_regression_est_weighted -- confirm which factor is intended.
pd.DataFrame([[1.0, 0.0], [0.0, 1.0]]) * (n_test - 1)/(n_test - 2-2)

Unnamed: 0,0,1
0,4.0,0.0
1,0.0,4.0


In [393]:
n_sim = 1000
n_test = 5
n_dim = 2

# x_dist is otherwise only a local inside simulate_regression; define it here (zero mean, identity
# covariance, matching that function's defaults) so the cell runs on a fresh kernel.
x_dist = stats.multivariate_normal(mean = np.zeros(n_dim), cov = np.eye(n_dim))

# accumulate the Gram matrix X^T X over n_sim independent draws of an (n_test x n_dim) input matrix
gram_total = np.zeros((n_dim, n_dim))
for _ in range(n_sim):
    X = np.reshape(x_dist.rvs(n_test), (n_test, n_dim))
    gram_total += X.T.dot(X)

# summed (not yet averaged) Gram matrices; divide by n_sim to get the Monte Carlo mean
store = pd.DataFrame(gram_total)

In [394]:
# Monte Carlo average of X^T X over the n_sim draws accumulated in `store`.
store/n_sim

Unnamed: 0,0,1
0,5.05708,-0.075001
1,-0.075001,4.915169
