In [1]:
import numpy as np
import pandas as pd
import scipy.stats

In [2]:
np.set_printoptions(precision=4, suppress=False)

In [3]:
pd.options.display.float_format = '{:,.4g}'.format  # show 4 digits of precision

In [4]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("notebook", font_scale=1.25)

In [5]:
class ZeroInflatedDist(object):
    """Wrap a distribution so that each draw is replaced by zero with a fixed probability.

    Parameters
    ----------
    dist : object
        Any object exposing ``rvs(size=..., random_state=...)``.
    zero_proba : float
        Probability that an individual draw is overwritten with zero.
    """

    def __init__(self, dist, zero_proba):
        self.dist = dist
        self.zero_proba = float(zero_proba)

    def rvs(self, size=1, random_state=np.random):
        """Draw rounded, non-negative samples with a random subset forced to zero.

        Parameters
        ----------
        size : int or tuple of int, optional
            Requested output shape, forwarded to the wrapped distribution.
        random_state : module, RandomState, Generator, int or None, optional
            Source of randomness. ``None`` means the global ``np.random``
            state; an int is used as the seed of a fresh ``RandomState``.

        Returns
        -------
        numpy.ndarray
            At least 1-D array of rounded draws, floored at zero.
        """
        if random_state is None:
            random_state = np.random
        elif isinstance(random_state, (int, np.integer)):
            # Build the generator once so the value draws and the mask draws
            # share a single stream instead of both restarting from the seed.
            random_state = np.random.RandomState(random_state)

        vals = np.atleast_1d(
            np.round(self.dist.rvs(size=size, random_state=random_state)))

        # Draw one uniform per sampled value. Using the shape of ``vals``
        # (rather than ``rand(size)``) also supports tuple-valued ``size``;
        # for an int ``size`` the consumed random stream is unchanged.
        draw_uniform = getattr(random_state, 'random_sample', None)
        if draw_uniform is None:
            # np.random.Generator exposes ``random`` instead of ``random_sample``.
            draw_uniform = random_state.random
        zmask = draw_uniform(vals.shape) < self.zero_proba
        vals[zmask] = 0
        return np.maximum(0, vals)

In [6]:
class QuantizedNormal(object):
    """Normal distribution whose draws are rounded to whole numbers and floored at zero."""

    def __init__(self, loc, scale):
        # Frozen scipy normal with the given mean (loc) and standard deviation (scale).
        self.dist = scipy.stats.norm(loc=loc, scale=scale)

    def rvs(self, *args, **kwargs):
        """Sample from the underlying normal, forwarding all arguments to its ``rvs``.

        Returns an array that is at least 1-D, holding the rounded draws with
        negative values replaced by zero.
        """
        raw_draws = self.dist.rvs(*args, **kwargs)
        rounded = np.atleast_1d(np.round(raw_draws))
        return np.maximum(0, rounded)

In [7]:
# Three groups of three candidate distributions each.

# Rounded normal around 7 with a small spread (scale 0.1).
consistent_3 = [QuantizedNormal(7, 0.1) for _ in range(3)]

# Zero with probability 0.3, otherwise a rounded normal around 10.
highvar_3 = [ZeroInflatedDist(QuantizedNormal(10, 0.1), 0.3) for _ in range(3)]

# Zero with probability 0.9, otherwise a rounded normal around 100.
powerball_3 = [ZeroInflatedDist(QuantizedNormal(100, 0.1), 0.9) for _ in range(3)]

dist_N = consistent_3 + highvar_3 + powerball_3

''

In [1]:
consistent_3[0].rvs(size=100)

NameError: name 'consistent_3' is not defined

In [9]:
highvar_3[0].rvs(size=1000).mean()

6.85

In [10]:
powerball_3[0].rvs(size=1000)

array([  0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0., 100.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0., 100.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0., 100.,   0.,   0.,   0.,   0., 100.,   0.,   0.,   0.,   0.,
       100.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
       100.,   0.,   0.,   0.,   0.,   0., 100.,   0.,   0.,   0.,   0.,
         0., 100.,   0.,   0.,   0., 100., 100.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0., 10

In [11]:
consistent_3[0].rvs(size=10000).mean()

7.0

In [12]:
powerball_3[0].rvs(size=10000).mean()

10.71

In [13]:
def _summarize_samples(y_S):
    """Format the 0/10/50/90/100th percentiles of ``y_S`` as a space-separated string."""
    return " ".join(['%.1f' % np.percentile(y_S, p)
                     for p in [0, 10, 50, 90, 100]])


def _iter_seeded_draws(dist_N, n_samples, seed):
    """Yield ``(n, draws)`` with ``n_samples`` draws from each distribution in turn.

    Distribution ``n`` always uses ``RandomState(10000 * seed + n)``.
    """
    for n, dist in enumerate(dist_N):
        random_state = np.random.RandomState(10000 * seed + n)
        yield n, dist.rvs(size=n_samples, random_state=random_state)


def _mean_share_N(y_SN, denom_S1):
    """Column means of ``y_SN / denom_S1``; rows whose denominator is zero count as 0."""
    share_SN = np.divide(
        y_SN, denom_S1, out=np.zeros_like(y_SN), where=(denom_S1 != 0))
    return np.mean(share_SN, axis=0)


def calc_bpr_many_trials(
        dist_N, K=3, n_trials=10000, seed=101,
        strategy='pick_mean',
        percentile_as_frac=0.95):
    """Score a K-candidate selection strategy over many simulated trials.

    Each trial draws one value from every distribution in ``dist_N``. The
    returned score for a trial is the sum of the draws of the K selected
    candidates divided by the sum of the K largest draws in that trial.

    Parameters
    ----------
    dist_N : sequence
        Objects exposing ``rvs(size=..., random_state=...)``. Evaluation draws
        are stored as int32, so draws are assumed to be integer-valued.
    K : int
        Number of candidates to select.
    n_trials : int
        Number of evaluation trials. Sample-based strategies score the
        candidates on ``100 * n_trials`` separate draws.
    seed : int
        Base seed; distribution ``n`` uses ``RandomState(10000 * seed + n)``.
    strategy : str
        One of:

        * ``'cross_ratio'``: rank by mean share of the row total.
        * ``'cross_ratio_topk'``: rank by mean draw divided by the row's
          top-K total.
        * ``'pick_mean'``, ``'pick_mean_of_squares'``, ``'pick_mean_of_sqrt'``,
          ``'pick_max'``, ``'pick_percentile'``, ``'pick_median'``: rank by
          that per-candidate statistic. Any other name containing ``'pick'``
          falls back to the median, as before.
        * ``'guess_random'``: an independent random K-subset per trial.
    percentile_as_frac : float
        Percentile used by ``'pick_percentile'``. Values in ``[0, 1]`` are
        treated as fractions (0.95 means the 95th percentile). Values above 1
        are treated as already being on the 0-100 scale, so existing callers
        passing e.g. ``90`` keep working.

    Returns
    -------
    numpy.ndarray
        Shape ``(n_trials,)``. A trial whose K largest draws sum to zero
        yields ``nan`` (0/0).

    Raises
    ------
    ValueError
        If ``strategy`` is not recognized.
    """
    N = len(dist_N)
    is_cross = strategy in ('cross_ratio', 'cross_ratio_topk')
    is_pick = 'pick' in strategy
    if not (is_cross or is_pick or strategy == 'guess_random'):
        # Fail fast with a clear message; this used to surface much later as
        # an unbound local variable.
        raise ValueError("Unrecognized strategy: %r" % (strategy,))

    # Evaluation draws: one row per trial, one column per candidate.
    y_RN = np.zeros((n_trials, N), dtype=np.int32)
    for n, y_R in _iter_seeded_draws(dist_N, n_trials, seed):
        y_RN[:, n] = y_R

    if strategy == 'guess_random':
        random_state = np.random.RandomState(10000 * seed)
        selected_ids_RK = np.zeros((n_trials, K), dtype=np.int32)
        for trial in range(n_trials):
            selected_ids_RK[trial, :] = random_state.permutation(N)[:K]
    else:
        # NOTE(review): the scoring draws reuse the evaluation seeds, so the
        # two sample sets may overlap for some distributions. Confirm whether
        # independent streams were intended.
        n_scoring = 100 * n_trials
        sum_str_N = [None for _ in range(N)]

        if is_cross:
            y_SN = np.zeros((n_scoring, N))
            for n, y_S in _iter_seeded_draws(dist_N, n_scoring, seed):
                y_SN[:, n] = y_S
                sum_str_N[n] = _summarize_samples(y_S)
            if strategy == 'cross_ratio':
                denom_S1 = np.sum(y_SN, axis=1, keepdims=True)
            else:
                topk_ids_SK = np.argsort(-1 * y_SN, axis=1)[:, :K]
                topk_y_SK = np.take_along_axis(y_SN, topk_ids_SK, axis=1)
                denom_S1 = np.sum(topk_y_SK, axis=1, keepdims=True)
            # Rows with a zero denominator contribute 0 instead of nan, which
            # would otherwise poison every candidate's mean.
            score_N = _mean_share_N(y_SN, denom_S1)
            assert score_N.shape == (N,)
        else:
            # np.percentile expects a 0-100 scale; convert fractional inputs.
            pct = float(percentile_as_frac)
            if pct <= 1.0:
                pct *= 100.0
            scorers = {
                'pick_mean': np.mean,
                'pick_mean_of_squares': lambda y: np.mean(np.square(y)),
                'pick_mean_of_sqrt': lambda y: np.mean(np.sqrt(y)),
                'pick_max': np.max,
                'pick_percentile': lambda y: np.percentile(y, pct),
                'pick_median': np.median,
            }
            score_fn = scorers.get(strategy, np.median)
            score_N = np.zeros(N)
            # One candidate at a time, so only a single sample vector is held
            # in memory.
            for n, y_S in _iter_seeded_draws(dist_N, n_scoring, seed):
                sum_str_N[n] = _summarize_samples(y_S)
                score_N[n] = score_fn(y_S)

        selected_ids_K = np.argsort(-1 * score_N)[:K]
        for kk in selected_ids_K:
            print(sum_str_N[kk])
        # The same K candidates are used in every trial.
        selected_ids_RK = np.tile(selected_ids_K, (n_trials, 1))

    yselect_RK = np.take_along_axis(y_RN, selected_ids_RK, axis=1)
    topk_ids_RK = np.argsort(-1 * y_RN, axis=1)[:, :K]
    ytop_RK = np.take_along_axis(y_RN, topk_ids_RK, axis=1)

    numer_R = np.sum(yselect_RK, axis=1)
    denom_R = np.sum(ytop_RK, axis=1)

    # No selection can beat the K largest draws of the trial.
    assert np.all(numer_R <= denom_R + 1e-10)

    return numer_R / denom_R

In [14]:
y_RN = np.random.poisson(5, size=40).reshape(10, 4)

In [15]:
'topk_cross_ratio'.count('cross_ratio')

1

In [16]:
y_RN

array([[ 3,  5,  7,  4],
       [ 5,  7,  3,  4],
       [ 4,  4,  4,  5],
       [ 8,  4,  3,  3],
       [ 5,  2,  5,  3],
       [ 4, 12,  4,  3],
       [ 7,  2,  4,  5],
       [ 4,  8,  2,  2],
       [ 7,  7,  8,  6],
       [ 3,  9,  3,  3]])

In [17]:
K = 2
topk_ids_RK = np.argsort(-1 * y_RN, axis=1)[:, :K]

In [18]:
topk_ids_RK
np.take_along_axis(y_RN, topk_ids_RK, axis=1)

array([[ 7,  5],
       [ 7,  5],
       [ 5,  4],
       [ 8,  4],
       [ 5,  5],
       [12,  4],
       [ 7,  5],
       [ 8,  4],
       [ 8,  7],
       [ 9,  3]])

In [19]:
np.mean(calc_bpr_many_trials(dist_N, K=3, n_trials=100000, strategy='pick_mean'))

0.0 0.0 0.0 100.0 100.0
0.0 0.0 0.0 100.0 100.0
0.0 0.0 0.0 0.0 101.0


0.22923552078356427

In [20]:
np.mean(calc_bpr_many_trials(dist_N, K=3, n_trials=100000, strategy='guess_random'))

0.4780103310589275

In [21]:
np.mean(calc_bpr_many_trials(dist_N, K=3, n_trials=100000, strategy='cross_ratio'))

6.0 7.0 7.0 7.0 8.0
6.0 7.0 7.0 7.0 8.0
6.0 7.0 7.0 7.0 8.0


0.6116739632292829

In [22]:
np.mean(calc_bpr_many_trials(dist_N, K=3, n_trials=100000, strategy='cross_ratio_topk'))

6.0 7.0 7.0 7.0 8.0
6.0 7.0 7.0 7.0 8.0
6.0 7.0 7.0 7.0 8.0
HEREEE


0.6116739632292829

In [23]:
# Sweep the percentile used by the 'pick_percentile' strategy and print the
# mean score for each setting. The values are on the 0-100 scale (10 means the
# 10th percentile), despite the "_as_frac" suffix of the parameter name.
for perc in [10, 20, 30, 40, 50, 60, 70, 80, 90]:
    print(np.mean(calc_bpr_many_trials(
        dist_N, K=3, strategy='pick_percentile',
        percentile_as_frac=perc)))

7.0 7.0 7.0 7.0 7.0
6.0 7.0 7.0 7.0 7.0
7.0 7.0 7.0 7.0 8.0
0.6155752892062741
7.0 7.0 7.0 7.0 7.0
6.0 7.0 7.0 7.0 7.0
7.0 7.0 7.0 7.0 8.0
0.6155752892062741
0.0 0.0 10.0 10.0 10.0
0.0 0.0 10.0 10.0 11.0
0.0 0.0 10.0 10.0 10.0
0.593449708994709
0.0 0.0 10.0 10.0 10.0
0.0 0.0 10.0 10.0 11.0
0.0 0.0 10.0 10.0 10.0
0.593449708994709
0.0 0.0 10.0 10.0 10.0
0.0 0.0 10.0 10.0 11.0
0.0 0.0 10.0 10.0 10.0
0.593449708994709
0.0 0.0 10.0 10.0 10.0
0.0 0.0 10.0 10.0 11.0
0.0 0.0 10.0 10.0 10.0
0.593449708994709
0.0 0.0 10.0 10.0 10.0
0.0 0.0 10.0 10.0 11.0
0.0 0.0 10.0 10.0 10.0
0.593449708994709
0.0 0.0 10.0 10.0 10.0
0.0 0.0 10.0 10.0 11.0
0.0 0.0 10.0 10.0 10.0
0.593449708994709
0.0 0.0 0.0 100.0 100.0
0.0 0.0 10.0 10.0 10.0
0.0 0.0 10.0 10.0 10.0
0.46919357508573


In [19]:
np.mean(calc_bpr_many_trials(dist_N, K=3, strategy='pick_mean_of_sqrt'))

7.0 7.0 7.0 7.0 8.0
7.0 7.0 7.0 7.0 7.0
6.0 7.0 7.0 7.0 7.0


0.5943714824065128

In [20]:
a = np.asarray([1,2,3])
np.square(np.linalg.norm(a))

14.0

In [21]:
np.sum(np.square(a))

14

In [22]:
np.identity(3)

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [24]:
mnames = ['cross_ratio','cross_ratio_topk', 'pick_mean', 'pick_median', 'guess_random']
R = 10000

# One row of per-trial scores per method. The row count follows mnames so that
# adding or removing a method cannot desynchronize the array shape.
scores_MR = np.zeros((len(mnames), R))
for mm, method in enumerate(mnames):
    scores_MR[mm] = calc_bpr_many_trials(
        dist_N, K=3, n_trials=R, strategy=method)

7.0 7.0 7.0 7.0 8.0
7.0 7.0 7.0 7.0 7.0
6.0 7.0 7.0 7.0 7.0
7.0 7.0 7.0 7.0 8.0
7.0 7.0 7.0 7.0 7.0
6.0 7.0 7.0 7.0 7.0
HEREEE
0.0 0.0 0.0 100.0 100.0
0.0 0.0 0.0 0.0 100.0
0.0 0.0 0.0 0.0 100.0
0.0 0.0 10.0 10.0 10.0
0.0 0.0 10.0 10.0 11.0
0.0 0.0 10.0 10.0 10.0


In [26]:
np.set_printoptions(precision=4, linewidth=120)
scores_MR[:, :10]

array([[0.7778, 0.7778, 0.7   , 0.7778, 0.7   , 0.7   , 0.7   , 0.7778, 0.175 , 0.175 ],
       [0.7778, 0.7778, 0.7   , 0.7778, 0.7   , 0.7   , 0.7   , 0.7778, 0.175 , 0.175 ],
       [0.    , 0.    , 0.    , 0.    , 0.    , 0.    , 0.    , 0.    , 0.8333, 0.8333],
       [0.7407, 0.7407, 1.    , 0.7407, 1.    , 1.    , 1.    , 0.7407, 0.1667, 0.1667],
       [0.3704, 0.6296, 0.    , 0.5185, 0.8   , 0.6667, 0.9   , 0.2593, 0.1417, 0.1417]])

In [27]:
# Best score reached by any method in each trial; kept 2-D so it broadcasts
# against scores_MR.
winscore_1R = np.max(scores_MR, axis=0, keepdims=True)

# A method counts as a winner of a trial when it is within 0.02 of that
# trial's best score, so several methods can win the same trial.
winners_MR = np.abs(winscore_1R - scores_MR) < 0.02

In [28]:
# Pad every name to the longest one so the "won" columns line up; the previous
# fixed width of 13 was shorter than 'cross_ratio_topk'.
name_width = max(len(mname) for mname in mnames)
for mm, mname in enumerate(mnames):
    print("%*s won % 6d/%d trials" % (name_width, mname, winners_MR[mm].sum(), R))

  cross_ratio won   4453/10000 trials
cross_ratio_topk won   4453/10000 trials
    pick_mean won   1953/10000 trials
  pick_median won   2514/10000 trials
 guess_random won   1270/10000 trials


In [31]:
np.percentile(scores_MR / winscore_1R, 20, axis=1)

array([0.21  , 0.    , 0.25  , 0.2333])