# Cut-replace rate: how much is a sentence cut vs. replaced

## 1 Setup

Flags

In [None]:
# None

Set up the database

In [None]:
import os
import sys

# Put the directory two levels above this notebook on the import path
# (presumably where the project's `analysis` package lives -- TODO confirm).
sys.path.insert(1, os.path.abspath('../..'))
import analysis

# The database name is 'spreadr_' followed by the name of the current
# working directory.
DB_NAME = 'spreadr_' + os.path.basename(os.path.abspath(os.path.curdir))
analysis.setup(DB_NAME)
print('Database:', DB_NAME)

In [None]:
import random
import itertools

import numpy as np
from scipy import optimize
from nltk.metrics import edit_distance
from frozendict import frozendict
import matplotlib.pyplot as plt
%matplotlib inline
from mpl_toolkits.mplot3d import Axes3D
from progressbar import ProgressBar

from gists.models import Sentence
from analysis import settings, transformations

## 2 Test optimising for known parameters

### 2.1 Framework setup

In [None]:
# Ids of every kept sentence that has a parent, materialised into a plain
# list so it can later be shuffled with random.sample() and turned into a set.
kept_sentence_ids = list(
    Sentence.objects.kept
    .filter(parent__isnull=False)
    .values_list('id', flat=True)
)

def sample_training_sentences(sample_size):
    """Randomly pick `sample_size` kept sentences that differ from their parent.

    Sentence ids are drawn in random order from `kept_sentence_ids`; a
    sentence is accepted only if `oc_distance` to its parent is strictly
    positive.

    Raises IndexError (from `list.pop`) if the id pool runs out before
    `sample_size` sentences have been accepted.
    """
    # Sampling the full length yields a shuffled copy; ids are consumed
    # from its end.
    id_pool = random.sample(kept_sentence_ids, len(kept_sentence_ids))
    chosen = []
    while len(chosen) < sample_size:
        sentence = Sentence.objects.get(id=id_pool.pop())
        if not (sentence.oc_distance(sentence.parent) > 0):
            # Identical to its parent by oc_distance: skip.
            continue
        chosen.append(sentence)
    assert len(chosen) == sample_size
    return chosen

def get_complement_test_sentences(sample_sentences):
    """Return the kept sentences not in `sample_sentences` that differ from their parent.

    Every id in `kept_sentence_ids` that does not belong to one of the
    given sentences is loaded, and the sentence is retained only when
    `oc_distance` to its parent is strictly positive.
    """
    training_ids = [sentence.id for sentence in sample_sentences]
    # Lazily load one candidate at a time, in the set's iteration order.
    candidates = (Sentence.objects.get(id=sid)
                  for sid in set(kept_sentence_ids).difference(training_ids))
    return [candidate for candidate in candidates
            if candidate.oc_distance(candidate.parent) > 0]

In [None]:
def alignments(sentences, parameters):
    """Align each sentence's parent tokens with its own tokens.

    `parameters` is converted to a `frozendict` once and passed to
    `transformations.align_lemmas` for every sentence. Returns a list with
    one `align_lemmas` result per sentence, in input order.
    """
    # NOTE(review): the parameters are presumably frozen so that
    # align_lemmas receives a hashable mapping -- confirm against its
    # implementation.
    frozen = frozendict(parameters)
    aligned = []
    for sentence in sentences:
        aligned.append(transformations.align_lemmas(
            sentence.parent.tokens, sentence.tokens, parameters=frozen))
    return aligned

In [None]:
def distance(alignment1, alignment2):
    """Mean edit distance between the two sequences of two alignments.

    Only the first two items of each alignment are used; each is a
    sequence. Items are compared by object identity (`id()`), not by
    value, so two equal-but-distinct objects count as different.

    Returns half the sum of the edit distance between the first sequences
    and the edit distance between the second sequences.
    """
    def identities(sequence):
        # Replace each item by its id() so edit_distance compares identities.
        return [id(item) for item in sequence]

    first_a, first_b = alignment1[:2]
    second_a, second_b = alignment2[:2]
    return (edit_distance(identities(first_a), identities(second_a))
            + edit_distance(identities(first_b), identities(second_b))) / 2

In [None]:
# Scale applied to every alignment parameter built from an x vector.
BASE_COMPARE_FACTOR = 1
def x2parameters(x):
    """Convert an optimisation vector `x` into an alignment parameter mapping.

    With F = BASE_COMPARE_FACTOR:
      COMPARE_ORIGIN = x[0] * F
      GAP_OPEN       = (x[1] + x[2]) * F
      GAP_EXTEND     = x[2] * F
    COMPARE_FACTOR is F itself and EXCHANGE is None. The result is a
    `frozendict`; `parameters2x` performs the inverse mapping.
    """
    scale = BASE_COMPARE_FACTOR
    compare_origin, gap_open_excess, gap_extend = x[0], x[1], x[2]
    return frozendict({
        'COMPARE_FACTOR': scale,
        'COMPARE_ORIGIN': compare_origin * scale,
        # x[1] is the amount by which GAP_OPEN differs from GAP_EXTEND.
        'GAP_OPEN': (gap_open_excess + gap_extend) * scale,
        'GAP_EXTEND': gap_extend * scale,
        'EXCHANGE': None,
    })

def parameters2x(parameters):
    """Convert an alignment parameter mapping into an optimisation vector.

    Returns a length-3 numpy array:
      [COMPARE_ORIGIN, GAP_OPEN - GAP_EXTEND, GAP_EXTEND] / COMPARE_FACTOR
    """
    compare_origin = parameters['COMPARE_ORIGIN']
    gap_open_excess = parameters['GAP_OPEN'] - parameters['GAP_EXTEND']
    gap_extend = parameters['GAP_EXTEND']
    unscaled = np.array([compare_origin, gap_open_excess, gap_extend])
    return unscaled / parameters['COMPARE_FACTOR']

In [None]:
def objective(x, sentences, ref_alignments):
    """Total mismatch between alignments produced by `x` and reference alignments.

    Each sentence is aligned with the parameters `x2parameters(x)`. The
    resulting alignments are paired positionally with that sentence's
    reference alignments; the shorter list is padded with empty alignments
    `([], [])`. The sentence's score is the largest `distance` over those
    pairs, and the return value is the sum of the scores over all sentences.
    """
    per_sentence_scores = []
    for ref_as, x_as in zip(ref_alignments,
                            alignments(sentences, x2parameters(x))):
        # With no alignment at all, compare against a single empty one.
        candidates = x_as if len(x_as) > 0 else [([], [])]
        pairs = itertools.zip_longest(ref_as, candidates, fillvalue=([], []))
        # Worst pair decides the score (alternative idea: use max + mean).
        per_sentence_scores.append(
            np.max([distance(ref_a, x_a) for ref_a, x_a in pairs]))
    return np.sum(per_sentence_scores)

In [None]:
# Known target parameters: alignments computed with these serve as the
# reference that the optimisation should recover.
reference_parameters = dict(
    COMPARE_FACTOR=BASE_COMPARE_FACTOR,
    COMPARE_ORIGIN=-0.5,
    GAP_OPEN=-0.5,
    GAP_EXTEND=-0.1,
    EXCHANGE=None,
)

# (lower, upper) search bounds for each component of the x vector:
#   x[0] = COMPARE_ORIGIN / COMPARE_FACTOR
#   x[1] = (GAP_OPEN - GAP_EXTEND) / COMPARE_FACTOR
#   x[2] = GAP_EXTEND / COMPARE_FACTOR
x_bounds = [
    (-1, -0.01),
    (-1, -0.01),
    (-1, -0.01),
]

### 2.2 Local optimisation

In [None]:
# Build reference alignments for 200 training sentences, then try to
# recover the reference parameters with a bounded local optimiser started
# from a uniformly random point inside x_bounds.
# NOTE(review): `objective` sums edit distances, so it is presumably
# piecewise constant in x; a gradient-based optimiser may stall at x0.
sample_sentences = sample_training_sentences(200)
reference_alignments = alignments(sample_sentences, reference_parameters)
for _ in range(1):
    x0 = [np.random.uniform(low, high) for (low, high) in x_bounds]
    # The solver is left to scipy's default choice; method='SLSQP' was
    # tried and is disabled.
    result = optimize.minimize(
        objective, x0,
        args=(sample_sentences, reference_alignments),
        bounds=x_bounds,
        callback=print,
        options=dict(disp=True, maxiter=500))
    print(result)
    print(x2parameters(result.x))
    # Two blank lines between restarts.
    print('\n')

### 2.3 Brute force

Try plotting one sample size

In [None]:
# Evaluate the objective on a regular grid over x_bounds
# (`discretization` points per dimension) for 20 training sentences.
discretization = 10
n_dims = len(x_bounds)
xs = [np.linspace(low, high, discretization) for (low, high) in x_bounds]
grids = np.meshgrid(*xs, indexing='ij')
values = np.zeros_like(grids[0])

sample_sentences = sample_training_sentences(20)
reference_alignments = alignments(sample_sentences, reference_parameters)
grid_indices = itertools.product(range(discretization), repeat=n_dims)
for k in ProgressBar(max_value=values.size)(grid_indices):
    # k is a full index tuple; grid[k] is that grid point's coordinate
    # along the grid's own dimension.
    values[k] = objective([grid[k] for grid in grids],
                          sample_sentences, reference_alignments)

# One 3D surface per value of x[0]: objective as a function of (x[1], x[2]).
fig, axes = plt.subplots(1, discretization,
                         figsize=(70, 5), subplot_kw={'projection': '3d'})
for i in range(discretization):
    axes[i].plot_surface(grids[1][i], grids[2][i], values[i])

Then evaluate several sizes

In [None]:
def evaluate_sample_size(sample_size, xref=parameters2x(reference_parameters),
                         n_runs=10, discretization=10):
    """Measure how well parameters fitted on `sample_size` sentences generalise.

    For each of `n_runs` runs:
      1. draw `sample_size` training sentences and compute their reference
         alignments with the parameters `x2parameters(xref)`;
      2. evaluate `objective` at every point of a regular grid over
         `x_bounds` (`discretization` points per dimension) and collect all
         grid points reaching the minimum training objective;
      3. evaluate each of those points on the complementary test sentences
         and record the worst (largest) test objective.

    Parameters:
      sample_size: number of training sentences drawn per run.
      xref: target x vector used to build the reference alignments. The
        default is computed once, when this function is defined, from
        `reference_parameters`.
      n_runs: number of independent runs.
      discretization: number of grid points per dimension.

    Returns the largest of the per-run worst test objectives. Progress and
    intermediate results are printed.
    """
    print()
    print('Sample size {}, doing {} runs'.format(sample_size, n_runs))

    # The search grid depends only on x_bounds and discretization, so it is
    # built once instead of once per run.
    n_dims = len(x_bounds)
    xs = [np.linspace(start, stop, discretization) for (start, stop) in x_bounds]
    grids = np.meshgrid(*xs, indexing='ij')

    worst_objectives = []
    for r in range(n_runs):
        print()
        print('Run', r)

        # Brute force the parameter fitting
        values = np.zeros_like(grids[0])
        sample_sentences = sample_training_sentences(sample_size)
        reference_alignments = alignments(sample_sentences, x2parameters(xref))
        for k in ProgressBar(max_value=values.size)(
                itertools.product(range(discretization), repeat=n_dims)):
            values[k] = objective([grid[k] for grid in grids],
                                  sample_sentences, reference_alignments)

        min_value = np.min(values)
        min_locations = np.where(values == min_value)
        print('Min training objective value {}, found in {} points'
              .format(min_value, len(min_locations[0])))

        # Test the best parameters found
        test_sentences = get_complement_test_sentences(sample_sentences)
        print('Testing on the remaining {} sentences'.format(len(test_sentences)))
        test_alignments = alignments(test_sentences, x2parameters(xref))
        objective_values = []
        for k in zip(*min_locations):
            # Build x from every grid dimension rather than a fixed three,
            # matching the n_dims-based training loop above.
            x = [grid[k] for grid in grids]
            objective_values.append(objective(x, test_sentences, test_alignments))
        print('Worst objective', np.max(objective_values))
        worst_objectives.append(np.max(objective_values))

    print()
    print('Overall worst objective for {} runs of sample size {} = {}'
          .format(n_runs, sample_size, np.max(worst_objectives)))
    return np.max(worst_objectives)

In [None]:
# Fit on 20 training sentences against the default reference parameters;
# the cell's value is the worst test objective over all runs.
evaluate_sample_size(20)

In [None]:
# Fit on 50 training sentences against the default reference parameters;
# the cell's value is the worst test objective over all runs.
evaluate_sample_size(50)

In [None]:
# Fit on 100 training sentences against the default reference parameters;
# the cell's value is the worst test objective over all runs.
evaluate_sample_size(100)

In [None]:
# Fit on 200 training sentences against the default reference parameters;
# the cell's value is the worst test objective over all runs.
evaluate_sample_size(200)

In [None]:
# Repeat the sample-size-100 evaluation for several random target parameter
# vectors drawn uniformly inside x_bounds, and report the worst objective
# seen across all of them.
gold100_worst_objectives = []
gold100_n_runs = 10
for i in range(gold100_n_runs):
    xref = [np.random.uniform(*bounds) for bounds in x_bounds]
    gold100_worst_objectives.append(evaluate_sample_size(100, xref))
# The message has two placeholders: the number of target parameter sets and
# the worst objective. Both must be passed, otherwise str.format raises
# IndexError after all the runs have completed.
print('Worst objective for {} random target parameter '
      'sets trained with sample size 100 = {}'
      .format(gold100_n_runs, np.max(gold100_worst_objectives)))

This tells me how many gold alignments I need to be able to recover a working set of parameters.

## Questions

Why are the objective values always the same (29, 261, ...)?