In [None]:
import os

# These variables are assigned before the project modules below are imported,
# so keep this statement ahead of the imports.
os.environ.update(
    {
        "TOKENIZERS_PARALLELISM": "false",
        "TORCHINDUCTOR_FX_GRAPH_CACHE": "1",
        "TORCHINDUCTOR_AUTOGRAD_CACHE": "1",
    }
)

from llama_experiments import experiments
from compute_batch_llama_gradients import learn_embeddings

# Run a single gradient descent step and collect the gradients.
# The experiment config is copied so the shared `experiments` entry is not the
# object handed to `learn_embeddings`.
args = experiments["one-step"].copy()
result = learn_embeddings(args)

In [None]:
# From one-step run.
# Display sentences by gradient L2 norm.
import numpy as np
from llama_viz import generate_saliency_html, viz_sentences_for_input_embed

# L2 norm of the last gradient taken along axis 2.
saliencies = np.linalg.norm(
    result.last_gradient,
    ord=2,
    axis=2,
)

# Sentences are rendered from the first recorded input embedding.
sentences = viz_sentences_for_input_embed(
    args,
    input_embeds=result.inputs_embeds_list[0],
)

generate_saliency_html(args, sentences, saliencies)

In [None]:
# From one-step run.
# Display tokens that have the highest cosine similarity to the gradient.
import numpy as np
from llama_models import tokenizer, full_vocab_embedding_normalized
from llama_viz import generate_saliency_html, viz_sentences_for_input_embed

# Scale every gradient vector to unit length along the last (D) axis.
# NOTE(review): a row whose norm is exactly zero would yield NaN here -- confirm
# whether that can happen (e.g. for padded positions).
normalized_gradient = result.last_gradient / np.linalg.norm(
    result.last_gradient, axis=2, keepdims=True
)  # [B, N, D]

# Dot product of each normalized gradient with each row of the vocabulary matrix.
cosine_sims = np.einsum(
    'bnd,vd->bnv', normalized_gradient, full_vocab_embedding_normalized
)  # [B, N, V]

# Best-matching vocabulary index per position, then its token string.
closest_cosine_sim_vocab_token_idxs = np.argmax(cosine_sims, axis=-1)  # [B, N]
closest_cosine_sim_vocab_tokens = list(
    map(tokenizer.convert_ids_to_tokens, closest_cosine_sim_vocab_token_idxs)
)

# The winning similarity value at each position is used as the saliency.
saliencies = np.take_along_axis(
    cosine_sims,
    closest_cosine_sim_vocab_token_idxs[..., np.newaxis],
    axis=-1,
)
generate_saliency_html(args, closest_cosine_sim_vocab_tokens, saliencies)

In [None]:
# From one-step run
# Display tokens that are pointed to by the gradient from the initial input embedding
#
# Every name this cell uses is imported here, so it does not rely on an earlier
# cell having already imported `np` or `generate_saliency_html`.
import numpy as np
from llama_viz import generate_saliency_html, viz_sentences_for_input_embed
from llama_models import tokenizer, full_vocab_embedding_normalized

# NOTE(review): this subtracts the initial embedding from the raw gradient;
# confirm that this difference is the intended "direction" vector.
gradient_diff = result.last_gradient - result.inputs_embeds_list[0] # [B, N, D]
# Unit-normalize along the last (D) axis so the einsum below is compared against
# the (already normalized, per its name) vocabulary matrix.
gradient_diff_normalized = gradient_diff / np.linalg.norm(gradient_diff, axis=2, keepdims=True) # [B, N, D]
cosine_sims = np.einsum('bnd,vd->bnv', gradient_diff_normalized, full_vocab_embedding_normalized) # [B, N, V]
# Highest-scoring vocabulary index per position, converted to token strings row by row.
nearest_token_ids = np.argmax(cosine_sims,-1) # [B, N]
nearest_tokens = [ tokenizer.convert_ids_to_tokens(nearest_token_ids[i]) for i in range(len(nearest_token_ids)) ]
# Saliency is the similarity score of the selected token at each position.
saliencies = np.take_along_axis(cosine_sims, np.expand_dims(nearest_token_ids, axis=-1), axis=-1)
generate_saliency_html(args, nearest_tokens, saliencies)


In [None]:
from llama_experiments import experiments, HORSE_DISTRIBUTION
from compute_batch_llama_gradients import learn_embeddings, BatchArgs
from llama_util import collect_probability_path
import scipy.stats

# Entropy of the first entry of the target distribution.
target_entropy = scipy.stats.entropy(HORSE_DISTRIBUTION[0])
print(target_entropy)

# Single-example configuration built by hand instead of taken from `experiments`.
args = BatchArgs(
    steps=250,
    num_examples=1,
    examples_filepath=["The animal that says bark is a "],
    example_stride=1,
    learning_rate=1e-3,
    target_probabilities=HORSE_DISTRIBUTION,
)

# Gradient descent dog->horse, 250 steps
result = learn_embeddings(args)

# Probability path over every recorded set of input embeddings.
probs_path = collect_probability_path(args, result.inputs_embeds_list)

In [None]:

from llama_util import compute_divergence

# Report the divergence against the target probabilities for the first and the
# last recorded input embeddings. The second line evaluates index -1, so it is
# labelled as the end value rather than the start value.
print("Start KL div", compute_divergence(args, result.inputs_embeds_list[0], args.target_probabilities))
print("End KL div", compute_divergence(args, result.inputs_embeds_list[-1], args.target_probabilities))


In [None]:
# Animate sentence-level L2 distances over the recorded input embeddings.
# (An earlier, now commented-out variant imported the same name from bert_viz.)
from llama_animate import animate_sentence_level_L2_distances

animate_sentence_level_L2_distances(
    args,
    result.inputs_embeds_list,
)


In [None]:
# Animate token-level L2 distances for the sentence at index 0.
from llama_animate import animate_token_level_L2_distances

animate_token_level_L2_distances(
    args,
    result.inputs_embeds_list,
    sentence_idx=0,
)

In [None]:
from llama_experiments import experiments
from llama_util import collect_probability_path, compute_divergence
from compute_batch_llama_gradients import learn_embeddings

# Try 'eng-random-embedding' to help validate our "close to everywhere global minima" hypothesis
args = experiments['eng'].copy()
args.steps = 250  # override the step count on the copied config

result = learn_embeddings(args)
probs_path = collect_probability_path(args, result.inputs_embeds_list)

# Divergence against the target probabilities at the first and last recorded embeddings.
print(
    "Starting Divergence",
    compute_divergence(args, result.inputs_embeds_list[0], args.target_probabilities),
)
print(
    "Ending Divergence",
    compute_divergence(args, result.inputs_embeds_list[-1], args.target_probabilities),
)

In [None]:
from IPython.display import display
import importlib, llama_viz

# Reload the module before looking up the function, so edits made to llama_viz
# since it was first imported are picked up. Binding the name only after the
# reload avoids an ImportError from a stale cached module that does not yet
# define `viz_sentence_changes`.
importlib.reload(llama_viz)
viz_sentence_changes = llama_viz.viz_sentence_changes
display(viz_sentence_changes(args, result.inputs_embeds_list))


In [None]:
import importlib
import llama_animate

# Reload so the freshly loaded module's function is the one bound below.
importlib.reload(llama_animate)
animate_prob_distr_path = llama_animate.animate_prob_distr_path

animate_prob_distr_path(
    args,
    result.inputs_embeds_list,
    probs_path,
    SELECTED_IDX=0,
)

In [None]:
import importlib
import llama_viz

# Reload llama_viz before calling into it so the current module source is used.
importlib.reload(llama_viz)

llama_viz.display_gradient_displacement(
    args,
    result.inputs_embeds_list,
    1,
)



In [None]:
# Animate the KL divergences along the collected probability path.
from llama_animate import animate_kl_divergences

animate_kl_divergences(
    args,
    probs_path,
)

In [None]:
# Animate sentence-level L2 distances for the current `args` / `result`.
# (An earlier, now commented-out variant imported the same name from bert_viz.)
from llama_animate import animate_sentence_level_L2_distances

animate_sentence_level_L2_distances(
    args,
    result.inputs_embeds_list,
)

In [None]:
# Display sentences by gradient L2 norm.
import numpy as np
from llama_viz import generate_saliency_html, viz_sentences_for_input_embed

# L2 norm of the last gradient taken along axis 2.
saliencies = np.linalg.norm(
    result.last_gradient,
    ord=2,
    axis=2,
)

# Sentences are rendered from the first recorded input embedding.
sentences = viz_sentences_for_input_embed(
    args,
    input_embeds=result.inputs_embeds_list[0],
)

generate_saliency_html(args, sentences, saliencies)

In [None]:
# Display tokens that are pointed to by the gradient from the initial input embedding.
import numpy as np
from IPython.display import display
import importlib
import llama_viz as viz
from llama_models import tokenizer, full_vocab_embedding_normalized

# Reload the viz module so the call at the bottom uses its current source.
importlib.reload(viz)
# (A bert_viz import of viz_sentences_for_input_embed used to sit here, commented out.)

# NOTE(review): this subtracts the initial embedding from the raw gradient;
# confirm that this difference is the intended "direction" vector.
gradient_diff = result.last_gradient - result.inputs_embeds_list[0]  # [B, N, D]

# Unit-normalize along the last (D) axis.
gradient_diff_normalized = gradient_diff / np.linalg.norm(
    gradient_diff, axis=2, keepdims=True
)  # [B, N, D]

cosine_sims = np.einsum(
    'bnd,vd->bnv', gradient_diff_normalized, full_vocab_embedding_normalized
)  # [B, N, V]

# Highest-scoring vocabulary index per position, converted to token strings row by row.
nearest_token_ids = np.argmax(cosine_sims, axis=-1)  # [B, N]
nearest_tokens = [
    tokenizer.convert_ids_to_tokens(row_ids) for row_ids in nearest_token_ids
]

# Saliency is the similarity score of the selected token at each position.
saliencies = np.take_along_axis(
    cosine_sims,
    nearest_token_ids[..., np.newaxis],
    axis=-1,
)
display(viz.generate_saliency_html(args, nearest_tokens, saliencies))
