In [1]:
# 5_generation_examples.ipynb

# Cell 1: Import libraries
import numpy as np
import matplotlib.pyplot as plt
import torch
import os
import sys
from IPython.display import HTML
import random

# Add project root to path
sys.path.append('..')
from src.data.loader import MotionDataLoader
from src.visualization.animator import DanceAnimator
from src.model.encoders import DanceEncoder, SimpleTextEncoder
from src.generation.dance_from_text import retrieve_dance_by_text
from src.generation.text_from_dance import retrieve_text_by_dance, generate_composite_description



# Cell 2: Introduction
"""
# Dance and Text Generation Examples

This notebook demonstrates how to use our trained contrastive learning model for bidirectional generation:

1. **Text-to-Dance**: Given a text description, find the most similar dance sequence
2. **Dance-to-Text**: Given a dance sequence, generate an appropriate text description

These examples illustrate the practical applications of our multimodal embedding space, showing how the model can bridge the gap between movement and language.
"""


In [3]:
# Cell 3: Load trained model and data
# Load embeddings saved during training
try:
    test_dance_embeddings = np.load('../results/embeddings/test_dance_embeddings.npy')
    test_text_embeddings = np.load('../results/embeddings/test_text_embeddings.npy')
    print(f"Loaded {len(test_dance_embeddings)} test embeddings")
except:
    print("Test embeddings not found. Please run the model training notebook first.")
    test_dance_embeddings = None
    test_text_embeddings = None

# Load sequences and labels
try:
    # Try to load the labeled dataset
    dataset = np.load('../data/processed/labeled_dataset.npy', allow_pickle=True).item()
    sequences = dataset['sequences']
    all_labels = dataset['labels']
    print(f"Loaded dataset with {len(sequences)} sequences and {len(all_labels)} labels")
except:
    # Alternative: try loading the individual components
    try:
        sequences = np.load('../data/processed/dance_sequences.npy')
        
        # Try to load labels
        try:
            label_data = np.load('../data/processed/sequence_labels.npy', allow_pickle=True).item()
            all_labels = label_data['labels']
        except:
            print("Labels not found. Creating placeholder labels for demonstration.")
            # Create placeholder labels if needed
            all_labels = [f"Dance movement {i}" for i in range(len(sequences))]
        
        print(f"Loaded {len(sequences)} sequences and {len(all_labels)} labels")
    except:
        print("No dataset files found. Please run the previous notebooks first.")
        sequences = None
        all_labels = None

# Load the model (define architecture first)
if 'sequences' in locals() and sequences is not None:
    # Model parameters
    n_joints, seq_length, n_dims = sequences[0].shape
    embedding_dim = 128  # Same as during training
    
    # Try to determine the correct vocab size from the saved model
    model_path = '../results/models/contrastive_model.pt'
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    # First check if the model file exists
    if os.path.exists(model_path):
        try:
            # Load the checkpoint to inspect dimensions
            checkpoint = torch.load(model_path, map_location=device)
            # Get text encoder input size from saved weights
            if 'text_encoder' in checkpoint:
                first_layer_weight = checkpoint['text_encoder']['layers.0.weight']
                vocab_size = first_layer_weight.shape[1]
                print(f"Detected vocabulary size from saved model: {vocab_size}")
            else:
                # Fallback value
                vocab_size = 32
                print(f"Using default vocabulary size: {vocab_size}")
        except Exception as e:
            print(f"Error inspecting model file: {e}")
            vocab_size = 32
            print(f"Using default vocabulary size: {vocab_size}")
    else:
        # If no model file, use an estimate
        vocab_size = 32
        print(f"Model file not found. Using default vocabulary size: {vocab_size}")
    
    # Create encoders (we'll load the trained weights)
    dance_encoder = DanceEncoder(
        n_joints=n_joints,
        seq_length=seq_length,
        n_dims=n_dims,
        embedding_dim=embedding_dim
    )
    
    text_encoder = SimpleTextEncoder(
        input_dim=vocab_size,  # Now using the correct vocabulary size
        embedding_dim=embedding_dim
    )
    
    # Try to load model weights
    if os.path.exists(model_path):
        try:
            checkpoint = torch.load(model_path, map_location=device)
            dance_encoder.load_state_dict(checkpoint['dance_encoder'])
            text_encoder.load_state_dict(checkpoint['text_encoder'])
            print(f"Successfully loaded trained model from {model_path}")
            
            # Move to appropriate device
            dance_encoder.to(device)
            text_encoder.to(device)
        except Exception as e:
            print(f"Error loading model: {e}")
            print("Using untrained model for demonstration")
    else:
        print(f"Model not found at {model_path}, using untrained model for demonstration")

Loaded 304 test embeddings
Loaded dataset with 2022 sequences and 2022 labels
Detected vocabulary size from saved model: 32
Successfully loaded trained model from ../results/models/contrastive_model.pt


In [4]:
# Cell 4: Text-to-Dance Generation
if 'dance_encoder' in locals() and 'text_encoder' in locals() and 'sequences' in locals() and 'test_dance_embeddings' in locals():
    """
    ## Text to Dance Generation
    
    Given a text description, we can find dance sequences that match by:
    1. Encoding the text with our text encoder
    2. Computing similarity with all pre-encoded dance sequences
    3. Retrieving the closest dance sequence(s)
    
    Let's try some examples using our test set embeddings.
    """
    
    # Define a simple text tokenizer function
    def simple_tokenizer(text_list):
        """
        Convert text to bag-of-words representation
        
        Args:
            text_list: List of text descriptions
            
        Returns:
            Bag-of-words tensor and None for lengths
        """
        bow_vectors = torch.zeros(len(text_list), vocab_size, device=device)
        
        # For each text, create a BoW representation
        for i, text in enumerate(text_list):
            words = text.lower().strip().split()
            for word in words:
                # We don't have the actual vocabulary, so hash the words
                # This is just for demonstration
                word_idx = hash(word) % (vocab_size - 2) + 2  # +2 to avoid special tokens
                bow_vectors[i, word_idx] = 1
        
        return bow_vectors, None
    
    # Create animator for visualization
    animator = DanceAnimator(
        figsize=(10, 8),
        joint_color='blue',
        line_color='red',
        draw_floor=True,
        show_trajectory=True
    )
    
    # Test some example queries
    test_queries = [
        "smooth flowing movement with arms",
        "energetic jump with the whole body",
        "sharp kick with legs forward",
        "gentle turning motion"
    ]
    
    for query in test_queries:
        print(f"\nQuery: \"{query}\"")
        
        # Retrieve matching dance
        results = retrieve_dance_by_text(
            query,
            text_encoder,
            simple_tokenizer,
            sequences,
            test_dance_embeddings,
            top_k=1,
            device=device
        )
        
        # Display result
        if results:
            idx, score, seq = results[0]
            print(f"Found matching sequence {idx} with similarity score {score:.4f}")
            print(f"Original label: \"{all_labels[idx]}\"")
            
            # Create animation (but don't display in this code snippet)
            print("Animation would be displayed here in notebook")
            # ani = animator.animate_sequence(seq)
            # display(HTML(ani.to_jshtml()))
        else:
            print("No matching sequence found")

2025-04-06 02:33:20,412 - src.visualization.animator - INFO - DanceAnimator initialized with custom settings



Query: "smooth flowing movement with arms"
Found matching sequence 216 with similarity score 0.6455
Original label: "energetic leap with whole body moving high"
Animation would be displayed here in notebook

Query: "energetic jump with the whole body"
Found matching sequence 303 with similarity score 0.5965
Original label: "fluid stretch with arms moving diagonal"
Animation would be displayed here in notebook

Query: "sharp kick with legs forward"
Found matching sequence 303 with similarity score 0.6269
Original label: "fluid stretch with arms moving diagonal"
Animation would be displayed here in notebook

Query: "gentle turning motion"
Found matching sequence 218 with similarity score 0.6220
Original label: "energetic leap with whole body moving high"
Animation would be displayed here in notebook


  tokens = torch.tensor(tokens, device=device)


In [5]:
# Cell 5: Dance-to-Text Generation
if 'dance_encoder' in locals() and 'text_encoder' in locals() and 'sequences' in locals() and 'test_text_embeddings' in locals():
    """
    ## Dance to Text Generation
    
    Now let's demonstrate the reverse: given a dance sequence, generate an appropriate text description.
    We can:
    1. Encode the dance sequence with our dance encoder
    2. Find the closest text embeddings in our shared space
    3. Return the corresponding text descriptions
    
    For more natural-sounding descriptions, we can also create a composite description from multiple
    top matches.
    """
    # Choose random test sequences to describe
    n_samples = 3
    if 'test_dance_embeddings' in locals() and test_dance_embeddings is not None:
        sample_indices = random.sample(range(len(test_dance_embeddings)), k=min(n_samples, len(test_dance_embeddings)))
    else:
        sample_indices = random.sample(range(len(sequences)), k=min(n_samples, len(sequences)))
    
    for idx in sample_indices:
        print(f"\nDance Sequence {idx}:")
        
        # Get the sequence
        dance_seq = sequences[idx]
        
        # Retrieve matching text
        results = retrieve_text_by_dance(
            dance_seq,
            dance_encoder,
            test_text_embeddings,
            all_labels,
            top_k=3,  # Get top 3 matches
            device=device
        )
        # Display results
        if results:
            print(f"Top matches:")
            for i, (match_idx, score, text) in enumerate(results):
                print(f"  {i+1}. \"{text}\" (score: {score:.4f})")
            
            # Generate composite description
            composite = generate_composite_description(
                dance_seq,
                dance_encoder,
                test_text_embeddings,
                all_labels,
                num_components=3,
                device=device
            )
            
            print(f"\nComposite description: \"{composite}\"")
            
            # Create animation (but don't display in this code snippet)
            print("Animation would be displayed here in notebook")
            # ani = animator.animate_sequence(dance_seq)
            # display(HTML(ani.to_jshtml()))
        else:
            print("No matching text found")


Dance Sequence 231:
Top matches:
  1. "energetic leap with whole body moving high" (score: 0.6472)
  2. "energetic leap with whole body moving high" (score: 0.6472)
  3. "energetic leap with whole body moving high" (score: 0.6472)

Composite description: "leap"
Animation would be displayed here in notebook

Dance Sequence 6:
Top matches:
  1. "fluid stretch with arms moving diagonal" (score: 0.5948)
  2. "quick turn with torso moving circular" (score: 0.5948)
  3. "fluid stretch with arms moving diagonal" (score: 0.5948)

Composite description: "fluid stretch and turn"
Animation would be displayed here in notebook

Dance Sequence 238:
Top matches:
  1. "energetic leap with whole body moving high" (score: 0.6282)
  2. "energetic leap with whole body moving high" (score: 0.6282)
  3. "energetic leap with whole body moving high" (score: 0.6282)

Composite description: "leap"
Animation would be displayed here in notebook


In [6]:
# Cell 6: Interpolation in Embedding Space
if 'dance_encoder' in locals() and 'text_encoder' in locals() and 'sequences' in locals():
    """
    ## Creative Applications: Embedding Space Interpolation
    
    One interesting creative application is interpolating between different dance movements
    in the embedding space. This allows us to:
    1. Blend characteristics of different dance sequences
    2. Create smooth transitions between movements
    3. Explore the "space" between different choreographic elements
    
    We can do this by:
    1. Selecting two or more source dance sequences
    2. Finding their locations in the embedding space
    3. Interpolating between those points
    4. Finding the closest actual dance sequence to the interpolated point
    
    Alternatively, we can directly interpolate in the sequence space by blending joint positions.
    """
    # For demonstration, we'll use direct sequence interpolation
    from src.generation.dance_from_text import interpolate_dance_sequences
    
    # Select two sequences to interpolate between (random selection)
    if len(sequences) >= 2:
        idx1, idx2 = random.sample(range(len(sequences)), k=2)
        seq1 = sequences[idx1]
        seq2 = sequences[idx2]
        
        print(f"Interpolating between sequences:")
        print(f"Sequence {idx1} - Label: \"{all_labels[idx1]}\"")
        print(f"Sequence {idx2} - Label: \"{all_labels[idx2]}\"")
        
        # Create interpolated sequences
        weights = [
            [1.0, 0.0],    # 100% seq1, 0% seq2
            [0.75, 0.25],  # 75% seq1, 25% seq2
            [0.5, 0.5],    # 50% seq1, 50% seq2
            [0.25, 0.75],  # 25% seq1, 75% seq2
            [0.0, 1.0]     # 0% seq1, 100% seq2
        ]
        interpolated_seqs = []
        for w in weights:
            interp = interpolate_dance_sequences([seq1, seq2], w)
            interpolated_seqs.append(interp)
        
        print(f"\nCreated {len(interpolated_seqs)} interpolated sequences")
        print("Animations would be displayed here in notebook")
        
        # Generate text descriptions for interpolated sequences
        if 'test_text_embeddings' in locals() and test_text_embeddings is not None:
            print("\nText descriptions for interpolated sequences:")
            for i, seq in enumerate(interpolated_seqs):
                w = weights[i]
                desc = generate_composite_description(
                    seq,
                    dance_encoder,
                    test_text_embeddings,
                    all_labels,
                    num_components=2,
                    device=device
                )
                print(f"  {w[0]*100:.0f}% seq1, {w[1]*100:.0f}% seq2 → \"{desc}\"")


Interpolating between sequences:
Sequence 708 - Label: "sharp swing with feet moving high"
Sequence 1628 - Label: "heavy bend with back moving low"

Created 5 interpolated sequences
Animations would be displayed here in notebook

Text descriptions for interpolated sequences:
  100% seq1, 0% seq2 → "fluid stretch and turn"
  75% seq1, 25% seq2 → "fluid stretch and turn"
  50% seq1, 50% seq2 → "fluid stretch and turn"
  25% seq1, 75% seq2 → "fluid stretch and turn"
  0% seq1, 100% seq2 → "fluid stretch and turn"


# Cell 7: Summary and Future Directions
"""
## Summary and Conclusion

In this project, we've successfully:

1. **Visualized dance motion capture data** with interactive 3D animations that show the dancer's movement through time.

2. **Developed a labeling strategy** that uses semi-supervised learning to generate descriptive text for dance sequences.

3. **Trained a contrastive learning model** that embeds both dance movements and text descriptions in a shared space.

4. **Demonstrated bidirectional generation** by retrieving dance sequences from text and generating text descriptions for dance movements.

### Future Directions

There are several exciting directions for further development:

1. **Improved Dance Representation**:
   - Use more sophisticated temporal models (e.g., LSTMs, Transformers) to better capture the sequential nature of dance
   - Incorporate physical constraints and biomechanical principles into the encoding

2. **Enhanced Text Processing**:
   - Use pre-trained language models (e.g., BERT, RoBERTa) for more nuanced text understanding
   - Develop a more specialized vocabulary for dance movements

3. **Generative Models**:
   - Move beyond retrieval to truly generative models that can synthesize novel dance sequences
   - Incorporate music as a third modality for multimodal generation

4. **Interactive Applications**:
   - Develop creative tools that allow choreographers to explore the dance-text space
   - Create interactive installations where viewers can generate dance through language

This project demonstrates the potential of AI to bridge movement and language, opening new possibilities for both choreographic exploration and dance documentation.
"""