# Setup

In [1]:
# Mount Google Drive inside the Colab runtime; the later cells read the
# project from paths under /content/gdrive/.
from google.colab import drive
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [2]:
# Switch the working directory to the Experiment 1 folder on the mounted Drive.
%cd "/content/gdrive/My Drive/PerfPred/Experiment 1"

/content/gdrive/.shortcut-targets-by-id/1vr6Z8seuUA0zoWaHuosCSZMv_H2Go5KR/PerfPred/Experiment 1


In [3]:
import sys

# Make the experiment's `src` directory importable.
# The membership check keeps this cell idempotent: re-running it does not
# append duplicate entries to sys.path.
SRC_DIR = '/content/gdrive/My Drive/PerfPred/Experiment 1/src'
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

# Trial Vars

## Expr 1A: Size
- $D_1$ (`[TRAIN1_SIZE]`)
- $D_2$ (`[TRAIN2_SIZE]`)
- $D_1, D_2$ (`[TRAIN1_SIZE, TRAIN2_SIZE]`)

## Expr 1B: Domain Relatedness
- $j_1$ (`[TRAIN1_JSD]`)
- $j_2$ (`[TRAIN2_JSD]`)
- $j_1, j_2$ (`[TRAIN1_JSD, TRAIN2_JSD]`)

## Expr 1C: Language Relatedness (Dataset Independent)
- $d_\text{fea}$ (`[FEA_DIST]`)
- $d_\text{inv}$ (`[INV_DIST]`)
- $d_\text{pho}$ (`[PHO_DIST]`)
- $d_\text{syn}$ (`[SYN_DIST]`)
- $d_\text{gen}$ (`[GEN_DIST]`)
- $d_\text{geo}$ (`[GEO_DIST]`)
- $d_\text{inv}, d_\text{pho}$ (`[INV_DIST, PHO_DIST]`)
- $d_\text{inv}, d_\text{syn}$ (`[INV_DIST, SYN_DIST]`)
- $d_\text{pho}, d_\text{syn}$ (`[PHO_DIST, SYN_DIST]`)
- $d_\text{gen}, d_\text{geo}$ (`[GEN_DIST, GEO_DIST]`)
- $d_\text{inv}, d_\text{pho}, d_\text{syn}$ (`[INV_DIST, PHO_DIST, SYN_DIST]`)
- $d_\text{fea}, d_\text{gen}, d_\text{geo}$ (`[FEA_DIST, GEN_DIST, GEO_DIST]`)
- $d_\text{fea}, d_\text{inv}, d_\text{pho}, d_\text{syn}$ (`[FEA_DIST, INV_DIST, PHO_DIST, SYN_DIST]`)
- $d_\text{inv}, d_\text{pho}, d_\text{syn}, d_\text{gen}, d_\text{geo}$ (`[INV_DIST, PHO_DIST, SYN_DIST, GEN_DIST, GEO_DIST]`)
- $d_\text{fea}, d_\text{inv}, d_\text{pho}, d_\text{syn}, d_\text{gen}, d_\text{geo}$ (`[FEA_DIST, INV_DIST, PHO_DIST, SYN_DIST, GEN_DIST, GEO_DIST]`)


# Trial Functions

## General

### Linear
$$\text{linear}(x_1, \dots, x_n) = c_0 + \sum_{i=1}^n c_ix_i$$


### Polynomial
$$\text{polynomial}(x_1, \dots, x_n) = c_0 + \sum_{i=1}^n \sum_{j=1}^k c_{i, j} x_i^j$$
*Note:* You can look at $c$ as a constant $c_0$ and an $n \times k$ matrix $\{c_{i, j}\}$, but it's actually stored as a vector of length $1 + n \times k$ with $c_{i, j}$ stored at index $n(i - 1) + j$.


### Exponential
$$\text{exponential}(x_1, \dots, x_n) = c_0\exp\left(\sum_{i=1}^n c_ix_i\right)$$


### Logarithmic
$$\text{logarithmic}(x_1, \dots, x_n) = c_0 + \sum_{i=1}^n c_i\log(x_i)$$


### Power
$$\text{power}(x_1, \dots, x_n) = c_0 \sum_{i=1}^n x_i^{c_i}$$


### Multiplicative
$$\text{multiplicative}(x_1, \dots, x_n) = c_0 \prod_{i=1}^n x_i^{c_i}$$


### Hybrid Multiplicative
$$\text{hybrid-multiplicative}(x_1, \dots, x_n) = c_0 + \prod_{i=1}^n x_i^{c_i}$$


### Arithmetic Mean Linear
$$\text{arithmetic-mean-linear}(x_1, \dots, x_n) = c_0 + c_1\frac{\sum_{i=1}^n x_i}{n}$$


### Geometric Mean Linear
$$\text{geometric-mean-linear}(x_1, \dots, x_n) = c_0 + c_1\left(\prod_{i=1}^n x_i\right)^{\frac1n}$$


### Harmonic Mean Linear
$$\text{harmonic-mean-linear}(x_1, \dots, x_n) = c_0 + c_1\frac{n}{\sum_{i=1}^n \frac{1}{x_i}}$$


Code TODO:

### Sigmoid Linear
$$\text{sigmoid-linear}(x_1, \dots, x_n) = \frac{c_0}{1 + \exp\left(\sum_{i=1}^{n} c_ix_i\right)}$$

# Expr 1A: Size

### Trial 2: Log D1
$$ \text{sp-BLEU} (D_1) = C \log (\alpha D_1) + \beta$$
where $\alpha > 0$. \\
Idea from [Srinivasan's paper, p.4](https://arxiv.org/pdf/2110.08875.pdf).


In [None]:
# Trial 2 (single size): logarithmic law with parameters C, alpha, beta.
log_d1_model = Model(
    func.log_single,
    np.array([0.1, 0.1, 0.1]),
    # Only the second parameter (alpha) is bounded below by 0; C and beta are free.
    bounds=([-np.inf, 0, -np.inf], [np.inf, np.inf, np.inf]),
    pars=["C", "alpha", "beta"],
)
# NOTE(review): the leading 1 presumably selects D_1 -- confirm against SingleSizeTrial.
expr = SingleSizeTrial(1, log_d1_model, trial="trial2")

fits, costs = expr.fit_all()
# Alternative to re-fitting (kept disabled):
# fits, costs = expr.read_all_fits()
expr.plot_all()
expr.analyze_all()

### Trial 3: Google paper law D1
$$\text{sp-BLEU} (D_1) = \alpha \left(\frac{1}{D_1} + C \right) ^{p}$$
where $C > 0$. \\
Idea from [Bansal's paper](https://arxiv.org/pdf/2202.01994.pdf), p.3.

In [None]:
# Trial 3 (single size): reciprocal power law with parameters alpha, C, p.
recip_d1_model = Model(
    func.recip_single,
    np.array([0, 0, -1]),
    # Only the second parameter (C) is bounded below by 0; alpha and p are free.
    bounds=([-np.inf, 0, -np.inf], [np.inf, np.inf, np.inf]),
    pars=["alpha", "C", "p"],
)
# NOTE(review): the leading 1 presumably selects D_1 -- confirm against SingleSizeTrial.
expr = SingleSizeTrial(1, recip_d1_model, trial="trial3")

fits, costs = expr.fit_all()
# Alternative to re-fitting (kept disabled):
# fits, costs = expr.read_all_fits()
expr.plot_all()
expr.analyze_all()

### Trial 2: Product
$$\text{sp-BLEU} (D_1, D_2) = \alpha (D_1)^{-p_1} \cdot (D_2)^{-p_2} + C $$
where $\alpha < 0, p_1, p_2, C > 0$  \\
See curve-fitting > equation in [Anthony's work](https://colab.research.google.com/drive/1Rx6sExWQ9RsNQeoHwBSmzIP2D-XvtMRy#scrollTo=aC47KqM31nLO).

In [None]:
# Trial 2 (double size): product law with parameters alpha, p1, p2, C.
product_model = Model(
    func.product_double,
    np.zeros(4),
    # alpha is constrained to (-inf, 0]; p1, p2 and C to [0, inf).
    bounds=([-np.inf, 0, 0, 0], [0, np.inf, np.inf, np.inf]),
    pars=["alpha", "p1", "p2", "C"],
)
expr = DoubleSizeTrial(product_model, trial="trial2")

fits, costs = expr.fit_all()
# Alternative to re-fitting (kept disabled):
# fits, costs = expr.read_all_fits()
expr.plot_all()
expr.analyze_all()

### Trial 3: Anthony's Paper Law D1 D2
$$\text{sp-BLEU}(D_1,D_2) = \alpha_1 (D_1D_2)^{-p_1} + \alpha_2 D_2 ^ {-p_2} + C$$
where $\alpha_1, \alpha_2 < 0, p_1, p_2, C > 0$.
See curve-fitting -> equation in [Anthony's work](https://colab.research.google.com/drive/1Rx6sExWQ9RsNQeoHwBSmzIP2D-XvtMRy#scrollTo=aC47KqM31nLO).


In [None]:
# Trial 3 (double size): dependent law with parameters alpha1, alpha2, p1, p2, C.
depend_model = Model(
    func.depend_double,
    np.zeros(5),
    # alpha1 and alpha2 are constrained to (-inf, 0]; p1, p2 and C to [0, inf).
    bounds=([-np.inf, -np.inf, 0, 0, 0], [0, 0, np.inf, np.inf, np.inf]),
    pars=["alpha1", "alpha2", "p1", "p2", "C"],
)
expr = DoubleSizeTrial(depend_model, trial="trial3")

fits, costs = expr.fit_all()
# Alternative to re-fitting (kept disabled):
# fits, costs = expr.read_all_fits()
expr.plot_all()
expr.analyze_all()

### Trial 4: Simple Decision D1D2
$$
\text{sp-BLEU}(D_1, D_2) = \begin{cases}
  c_1 D_1 + c_2 D_2 + C &, D_1 > 10k \\
  c_2 D_2 + C &, \text{otherwise}
\end{cases}
$$


In [None]:
def simple_decision_size(c, x, threshold=10):
  """Piecewise-linear size model, evaluated independently for each sample.

  For every row (D1, D2) of `x`:
    c1 * D1 + c2 * D2 + C   if D1 > threshold
    c2 * D2 + C             otherwise

  c: Array with dim 3, corresponding to c1, c2, and C
  x: Array of dim (n,2); column 0 is D1, column 1 is D2
  threshold: D1 cut-off above which the c1 * D1 term is included (default 10).
    NOTE(review): the accompanying formula says "10k", so sizes are presumably
    expressed in thousands -- confirm against the data loader.

  Returns: Array with dim n
  """
  x = np.asarray(x)
  d1, d2 = x[:, 0], x[:, 1]
  # Decide per row whether D1 contributes; a single whole-batch decision would
  # drop the D1 term for every sample as soon as one sample is small.
  d1_term = np.where(d1 > threshold, c[0] * d1, 0.0)
  return d1_term + c[1] * d2 + c[2]

### Trial 8: Linear Regression with Divergence Difference
$$\text{sp-BLEU}(j_1, j_2) = \beta_0 + \beta_1 j_1 + \beta_2 j_2 + \beta_3 |j_1 - j_2|$$

# Expr 1B: Domain Relatedness

## Var = All Dataset Independent Language Features

### Trial 1: Simple Linear Regression
$$\text{sp-BLEU}(d_{geo}, d_{gen}, d_{inv}, d_{syn}, d_{pho}) = \beta_1 d_{geo} + \beta_2 d_{gen} + \beta_3 d_{inv} + \beta_4 d_{syn} + \beta_5 d_{pho} + C$$

### Trial 2: Stepwise Regression from Linear Single


```
candidate_factors = [geo, gen, inv, syn, pho]
candidate_factors.sort() # Ascending based on average RMSE of single var linear

selected_factors = []
MAX_FACTORS = 5

# Start with linear single with lowest RMSE
current_model = linear_reg(candidate_factors[0])
best_rmse = rmse(linear_reg(candidate_factors[0]))
selected_factors.append(candidate_factors.pop(0)) # The starting factor is now part of the model

# Perform stepwise regression
while len(selected_factors) < MAX_FACTORS:

  best_factor = None

  # Iterate over all remaining factors
  for factor in candidate_factors:

    # Add the candidate factor to current
    subset_factors = selected_factors.copy() # Copy so the trial does not modify selected_factors
    subset_factors.append(factor)
    updated_model = linear_reg(subset_factors)
    current_rmse = rmse(updated_model)

    if current_rmse < best_rmse:
      best_factor = factor
      best_rmse = current_rmse

  if best_factor is None:
    break

  selected_factors.append(best_factor)
  candidate_factors.remove(best_factor)

  current_model = add_factor(current_model, best_factor)

final_model = current_model # Do whatever analysis with this
final_rmse = best_rmse

```




### Trial 3: Reverse Stepwise Regression from Linear Single


```
candidate_factors = [geo, gen, inv, syn, pho]
selected_factors = candidate_factors.copy()
current_model = linear_reg(candidate_factors)
best_rmse = rmse(current_model)

for factor in candidate_factors:

  # Temporarily remove a factor
  subset_factors = selected_factors.copy()
  subset_factors.remove(factor)
  updated_model = linear_reg(subset_factors)
  current_rmse = rmse(updated_model)

  if current_rmse < best_rmse:
    best_rmse = current_rmse
    selected_factors = subset_factors
    current_model = updated_model


final_model = current_model # Do whatever analysis with this
final_rmse = best_rmse


```

