In [None]:
import os
import torch as tf
# Process-wide environment flags, applied in a single update.
# NOTE(review): these are assigned after `import torch` above; confirm that the
# TORCHINDUCTOR_* variables are only consulted lazily, otherwise move this
# statement above the torch import.
os.environ.update(
    {
        "TOKENIZERS_PARALLELISM": "false",
        "TORCHINDUCTOR_FX_GRAPH_CACHE": "1",
        "TORCHINDUCTOR_AUTOGRAD_CACHE": "1",
    }
)

In [None]:
import os
import cloudpickle as p
from pathlib import Path

# Directory (relative to the current working directory) that the checkpoint
# pickles "all_args.p" and "results.p" are read from and written to.
output_dir = Path("./results/trained_input_embeddings/random_embeddings")

def restore_checkpoint():
    """Load previously saved run state from ``output_dir``.

    Returns:
        A ``(all_args, results)`` tuple. Each element is the object unpickled
        from ``all_args.p`` / ``results.p`` respectively, or an empty dict when
        the corresponding file does not exist.
    """

    def _load_or_empty(filename):
        # Missing checkpoint file -> start from an empty mapping.
        checkpoint_path = output_dir / filename
        if os.path.exists(checkpoint_path):
            # NOTE: unpickling executes arbitrary code; only load checkpoints
            # that this notebook produced itself.
            with open(checkpoint_path, "rb") as fh:
                return p.load(fh)
        return {}

    # Same order as before: arguments first, then results.
    return _load_or_empty("all_args.p"), _load_or_empty("results.p")


def save_checkpoint(all_args, results):
    """Persist the run state to ``output_dir`` as ``all_args.p`` and ``results.p``.

    The output directory is created if it does not exist yet, so the first
    checkpoint of a fresh run does not fail after the expensive computation.
    Each file is written to a temporary sibling first and then moved into place
    with ``os.replace``, so an interruption mid-write cannot leave a truncated
    pickle behind for ``restore_checkpoint`` to choke on.

    Args:
        all_args: Object to pickle into ``all_args.p``.
        results: Object to pickle into ``results.p``.
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    for filename, payload in (("all_args.p", all_args), ("results.p", results)):
        final_path = output_dir / filename
        tmp_path = output_dir / (filename + ".tmp")
        with open(tmp_path, "wb") as f:
            p.dump(payload, f)
        # Atomic rename: readers see either the old file or the complete new one.
        os.replace(tmp_path, final_path)

In [None]:
from llama_experiments import experiments, HORSE_DISTRIBUTION
from llama_models import tokenizer
from compute_batch_llama_gradients_optimized import learn_embeddings, BatchArgs, read_sentences, tokenize_sentences
import scipy.stats 
import numpy as np

# Batch size, shared by `num_examples` and the number of times
# HORSE_DISTRIBUTION is tiled (presumably one target row per example), so the
# two can no longer drift apart.
NUM_EXAMPLES = 16

# Resume from saved progress; both are empty dicts when no checkpoint exists.
all_args, results = restore_checkpoint()

for offset in range(10):

    # Offsets already present in the restored results are not recomputed.
    if offset in results:
        print("Skipping - already done")
        continue

    args = BatchArgs(steps = 500,
            num_examples = NUM_EXAMPLES,
            examples_filepath='eng_sentences.tsv',
            example_stride=50,
            trim_input_ids=True,
            start_offset = offset,
            permutation_seed = offset * 42,
            randomize_input_embeds = True,
            target_probabilities=np.tile(HORSE_DISTRIBUTION, (NUM_EXAMPLES, 1))
    )

    # Gradient descent dog->horse; the step count is `steps` in the args above.
    all_args[offset] = args
    results[offset] = learn_embeddings(args)

    # Checkpoint after every offset so an interrupted run can be resumed.
    save_checkpoint(all_args, results)

In [None]:
# Collect, across every finished run, the L2 distance between each learned
# embedding vector and the vector it started from.
all_norms = []
for key, run_result in results.items():
    learned_embeddings = run_result.inputs_embeds
    initial_embeddings = all_args[key].inputs_embeds

    # Norm over the last axis, then flattened to a 1-D array of distances.
    displacement = learned_embeddings - initial_embeddings
    norms = np.linalg.norm(displacement, ord=2, axis=-1).flatten()

    # Keep only strictly positive distances (entries that did not move are dropped).
    norms = norms[norms > 0]
    all_norms += norms.tolist()

import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')

# Explicit figure/axes instead of the implicit pyplot state, with a labelled
# x-axis so the figure is readable on its own.
fig, ax = plt.subplots()
sns.kdeplot(all_norms, ax=ax)
ax.set_title("Travel Distance Of Learned Input Embeddings")
ax.set_xlabel("L2 distance between learned and initial embedding")
plt.show()


In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('ggplot')

from llama_models import np_vocab_embedding_no_special_tokens


# Number of random token pairs to sample, and the seed that makes the sample
# (and therefore the figure) reproducible across kernel restarts.
NUM_RANDOM_PAIRS = 1000
PAIR_SAMPLING_SEED = 0

print(type(np_vocab_embedding_no_special_tokens), np_vocab_embedding_no_special_tokens.shape)

# Draw row indices directly from [0, vocab_size) rather than materialising a
# Python list of every index first. Indices are drawn with replacement and the
# two draws are independent, so a pair may occasionally be a token with itself.
rng = np.random.default_rng(PAIR_SAMPLING_SEED)
vocab_size = len(np_vocab_embedding_no_special_tokens)
idxs_1 = rng.integers(0, vocab_size, size=NUM_RANDOM_PAIRS)
idxs_2 = rng.integers(0, vocab_size, size=NUM_RANDOM_PAIRS)

# L2 distance between the two embeddings of each sampled pair.
random_vocab_pair_norms = np.linalg.norm(
    np_vocab_embedding_no_special_tokens[idxs_1] - np_vocab_embedding_no_special_tokens[idxs_2],
    ord=2,
    axis=-1,
)

fig, ax = plt.subplots()
sns.kdeplot(random_vocab_pair_norms, ax=ax)
ax.set_title("Typical Inter-Token L2 Distances")
ax.set_xlabel("L2 distance between two randomly chosen token embeddings")
plt.show()
