# Valuation Under Independent t-Distributed Assumptions

This notebook contains the empirical extensions described in Section 7 of the paper.

> We extend the model to incorporate the heavy tail in the distribution of uplifts shown in related work by modelling the true value of the propositions, as well as the estimation noise, with Generalized Student’s t-distributions.

In [None]:
import normal_normal_model as nnm
import t_t_model as ttm
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd

In [None]:
def print_run_setting(count, N, M, mu_X, sigma_X, mu_eps, sigma_1, sigma_2):
    """Print a one-line summary of the parameters used for one cycle.

    ``count``, ``N`` and ``M`` are printed as given; the remaining five
    parameters are rounded to four decimal places with ``np.round`` first.
    """
    template = (
        "Cycle {}: N = {}, M = {}, mu_X = {}, sigma_X = {}, "
        "mu_epsilon = {}, sigma_1 = {}, sigma_2 = {}"
    )
    # Round only the real-valued settings, in the order the template expects.
    rounded = [
        np.round(value, 4)
        for value in (mu_X, sigma_X, mu_eps, sigma_1, sigma_2)
    ]
    print(template.format(count, N, M, *rounded))

def calculate_norm_and_t_improvements(cycles=500):
    """Simulate improvement samples under normal and t-distributed models.

    For every cycle a random configuration (``N``, ``M``, means and standard
    deviations) is drawn with ``np.random``. Improvement samples are then
    requested three times for that configuration:

    1. from ``nnm.get_prioritisation_value_samples`` (normal model),
    2. from ``ttm.get_prioritisation_value_samples`` with ``T_NU`` degrees of
       freedom and the same squared sigmas,
    3. from ``ttm.get_prioritisation_value_samples`` with rescaled squared
       sigmas (intended to match the variance of the normal case).

    Each set of samples is summarised by its mean and its 5th / 95th
    percentiles. The configuration and the three means are printed per cycle.

    Args:
        cycles: Number of random configurations to simulate.

    Returns:
        A 9-tuple of lists with one entry per cycle, in this order:
        normal means, normal CI highs, normal CI lows,
        t means, t CI highs, t CI lows,
        adjusted-t means, adjusted-t CI highs, adjusted-t CI lows.
    """
    # Constants
    NUM_SAMPLES = 1000
    T_NU = 3

    def process_improvement_samples(
            improvement_samples, mean_improvements, CI_lows, CI_highs):
        """Summarise one set of samples and append the summary in place.

        Returns ``(mean, CI_low, CI_high)`` where the CI bounds are the 5th
        and 95th percentiles of ``improvement_samples``.
        """
        mean_improvement = np.mean(improvement_samples)
        CI_low = np.percentile(improvement_samples, 5)
        CI_high = np.percentile(improvement_samples, 95)

        mean_improvements.append(mean_improvement)
        CI_lows.append(CI_low)
        CI_highs.append(CI_high)

        return mean_improvement, CI_low, CI_high

    norm_mean_improvements = []
    norm_CI_highs = []
    norm_CI_lows = []

    t_mean_improvements = []
    t_CI_highs = []
    t_CI_lows = []

    t_adjusted_mean_improvements = []
    t_adjusted_CI_highs = []
    t_adjusted_CI_lows = []

    # Multiplier applied to the squared sigmas for the "adjusted" t run.
    # NOTE(review): a Student's t with scale s and nu degrees of freedom has
    # variance s**2 * nu / (nu - 2). If ttm treats these arguments as squared
    # scales, matching the normal variance would need (nu - 2) / nu without
    # the square root. Confirm against ttm before changing; the original
    # factor is kept here so that results are unchanged.
    adjustment = np.sqrt((T_NU - 2)/T_NU)

    for cycle in range(1, cycles + 1):
        # The order of these draws is kept fixed so that seeded runs
        # reproduce the same configurations.
        N = int(10 ** np.random.uniform(1, 3.5))
        M = int(N * np.random.uniform(0.02, 0.6)) + 2
        sigma_X = np.sqrt(np.random.chisquare(3))
        sigma_1 = np.sqrt(np.random.chisquare(3))
        sigma_2 = sigma_1 * np.random.uniform(0.02, 0.9999)
        mu_X = np.random.normal(0, 3)
        mu_epsilon = np.random.normal(0, 3)

        # Reconciling the use of sigmas by numpy
        # and sigma_sqs in the theoretical calculations
        sigma_sq_1 = sigma_1 ** 2
        sigma_sq_2 = sigma_2 ** 2
        sigma_sq_X = sigma_X ** 2

        print_run_setting(cycle, N, M, mu_X, sigma_X,
                          mu_epsilon, sigma_1, sigma_2)

        # Normal distribution
        _, _, norm_improvements = (
            nnm.get_prioritisation_value_samples(
                NUM_SAMPLES, N, M, mu_X, mu_epsilon,
                sigma_sq_X, sigma_sq_1, sigma_sq_2, verbose=False)
        )

        # The helper returns (mean, CI_low, CI_high); take the mean, which is
        # what the "E(D)" label in the summary line below refers to.
        norm_mean_improvement, _, _ = (
            process_improvement_samples(
                norm_improvements, norm_mean_improvements,
                norm_CI_lows, norm_CI_highs)
        )

        # Student's t-distribution
        _, _, t_improvements = (
            ttm.get_prioritisation_value_samples(
                NUM_SAMPLES, N, M, mu_X, mu_epsilon,
                sigma_sq_X, sigma_sq_1, sigma_sq_2, T_NU, verbose=False)
        )

        t_mean_improvement, _, _ = (
            process_improvement_samples(
                t_improvements, t_mean_improvements,
                t_CI_lows, t_CI_highs)
        )

        # Student's t-distribution with adjusted scale
        # so that it has the same variance as the normal distributions
        _, _, t_adjusted_improvements = (
            ttm.get_prioritisation_value_samples(
                NUM_SAMPLES, N, M, mu_X, mu_epsilon,
                sigma_sq_X * adjustment,
                sigma_sq_1 * adjustment,
                sigma_sq_2 * adjustment,
                T_NU, verbose=False)
        )

        t_adjusted_mean_improvement, _, _ = (
            process_improvement_samples(
                t_adjusted_improvements, t_adjusted_mean_improvements,
                t_adjusted_CI_lows, t_adjusted_CI_highs)
        )

        print("Cycle {}: Normal E(D) = {}; t E(D) = {}; Adjusted t E(D) = {}"
              .format(cycle,
                      norm_mean_improvement,
                      t_mean_improvement,
                      t_adjusted_mean_improvement))

    return (
        norm_mean_improvements, norm_CI_highs, norm_CI_lows,
        t_mean_improvements, t_CI_highs, t_CI_lows,
        t_adjusted_mean_improvements, t_adjusted_CI_highs, t_adjusted_CI_lows
    )

In [None]:
# Accumulators across all batches; each list gains one entry per cycle.
norm_improvements, norm_CI_highs, norm_CI_lows = [], [], []
t_improvements, t_CI_highs, t_CI_lows = [], [], []
t_adjusted_improvements, t_adjusted_CI_highs, t_adjusted_CI_lows = [], [], []

# The improvements under the independent Gaussian and t-distributed
# assumptions are computed in batches of ten cycles. Results from finished
# batches are already stored in the lists above, so they are kept if the
# (deliberately endless) loop has to be interrupted by hand.

while True:
    (
        norm_EDs, norm_highs, norm_lows,
        t_EDs, t_highs, t_lows,
        t_adjusted_EDs, t_adjusted_highs, t_adjusted_lows,
    ) = calculate_norm_and_t_improvements(10)

    norm_improvements += norm_EDs
    norm_CI_highs += norm_highs
    norm_CI_lows += norm_lows

    t_improvements += t_EDs
    t_CI_highs += t_highs
    t_CI_lows += t_lows

    t_adjusted_improvements += t_adjusted_EDs
    t_adjusted_CI_highs += t_adjusted_highs
    t_adjusted_CI_lows += t_adjusted_lows

In [None]:
# Number of cycles accumulated so far (one mean improvement is stored per cycle).
len(norm_improvements)

In [None]:
# Histogram of the element-wise ratio of normal-model to t-model mean
# improvements, with the x-axis restricted to [0, 2].
norm_over_t = np.asarray(norm_improvements) / np.asarray(t_improvements)
plt.hist(norm_over_t, bins=500)
plt.xlim(0, 2)
plt.show()

In [None]:
# Histogram of the element-wise ratio of normal-model to adjusted-t-model
# mean improvements, with the x-axis restricted to [0, 2].
norm_over_t_adjusted = (
    np.asarray(norm_improvements) / np.asarray(t_adjusted_improvements)
)
plt.hist(norm_over_t_adjusted, bins=100)
plt.xlim(0, 2)
plt.show()

In [None]:
# Summary statistics of the element-wise ratio of t-model to normal-model
# mean improvements (note: the inverse of the ratio plotted in the histogram).
t_over_norm = np.asarray(t_improvements) / np.asarray(norm_improvements)
pd.DataFrame(t_over_norm).describe()

In [None]:
# Summary statistics of the element-wise ratio of adjusted-t-model to
# normal-model mean improvements.
t_adjusted_over_norm = (
    np.asarray(t_adjusted_improvements) / np.asarray(norm_improvements)
)
pd.DataFrame(t_adjusted_over_norm).describe()

In [None]:
# Summary statistics of the element-wise ratio of the upper CI bounds:
# adjusted-t model over normal model.
ci_high_ratio = np.asarray(t_adjusted_CI_highs) / np.asarray(norm_CI_highs)
pd.DataFrame(ci_high_ratio).describe()