In [1]:
# 1
import io
import zipfile
import requests
import frontmatter

def read_repo_data(repo_owner, repo_name):
    """
    Download and parse all markdown files from a GitHub repository.
    
    Args:
        repo_owner: GitHub username or organization
        repo_name: Repository name
    
    Returns:
        List of dictionaries containing file content and metadata
    """
    prefix = 'https://codeload.github.com' 
    url = f'{prefix}/{repo_owner}/{repo_name}/zip/refs/heads/main'
    resp = requests.get(url)
    
    if resp.status_code != 200:
        raise Exception(f"Failed to download repository: {resp.status_code}")

    repository_data = []
    zf = zipfile.ZipFile(io.BytesIO(resp.content))
    
    for file_info in zf.infolist():
        filename = file_info.filename
        filename_lower = filename.lower()

        if not (filename_lower.endswith('.md') 
            or filename_lower.endswith('.mdx')):
            continue
    
        try:
            with zf.open(file_info) as f_in:
                content = f_in.read().decode('utf-8', errors='ignore')
                post = frontmatter.loads(content)
                data = post.to_dict()
                data['filename'] = filename
                repository_data.append(data)
        except Exception as e:
            print(f"Error processing {filename}: {e}")
            continue
    
    zf.close()
    return repository_data

workout_docs = read_repo_data('ilhamksyuriadi', 'workout-recommendation')
# dtc_faq = read_repo_data('DataTalksClub', 'faq')
# evidently_docs = read_repo_data('evidentlyai', 'docs')

print(f"workout documents: {len(workout_docs)}")
# print(f"FAQ documents: {len(dtc_faq)}")
# print(f"Evidently documents: {len(evidently_docs)}")

workout documents: 1


In [2]:
# 2

print(f"Number of documents: {len(workout_docs)}")
print(f"Sample document keys: {workout_docs[0].keys()}")
print(f"Sample content length: {len(workout_docs[0]['content'])}")

Number of documents: 1
Sample document keys: dict_keys(['content', 'filename'])
Sample content length: 12897


In [3]:
# 2.1 simple chunking
def sliding_window(seq, size, step):
    if size <= 0 or step <= 0:
        raise ValueError("size and step must be positive")

    n = len(seq)
    result = []
    for i in range(0, n, step):
        chunk = seq[i:i+size]
        result.append({'start': i, 'chunk': chunk})
        if i + size >= n:
            break

    return result

workout_docs_chunks = []

for doc in workout_docs:
    doc_copy = doc.copy()
    doc_content = doc_copy.pop('content')
    chunks = sliding_window(doc_content, 1500, 750)
    for chunk in chunks:
        chunk.update(doc_copy)
    workout_docs_chunks.extend(chunks)

print(f"Simple sliding window chunks: {len(workout_docs_chunks)}")

Simple sliding window chunks: 17


In [4]:
# 2.2 paragraph chungking + sliding windows
import re
def paragraph_chunking_with_sliding_window(text, max_paragraph_length=500, sliding_window_size=1000, sliding_window_step=500):
    """
    Hybrid approach: Split by paragraphs, then apply sliding window to long paragraphs
    
    Parameters:
    - text: The document content to chunk
    - max_paragraph_length: If paragraph is shorter than this, keep as-is
    - sliding_window_size: Size for sliding window (applied to long paragraphs)
    - sliding_window_step: Step for sliding window (creates overlap)
    
    Returns: List of chunks with metadata
    """
    # Step 1: Split into paragraphs
    paragraphs = re.split(r"\n\s*\n", text.strip())
    
    all_chunks = []
    chunk_counter = 0
    
    for para_idx, paragraph in enumerate(paragraphs):
        # Clean up the paragraph
        paragraph = paragraph.strip()
        if not paragraph:  # Skip empty paragraphs
            continue

        # Debug: Check if we're getting tiny paragraphs
        if len(paragraph) < 50:  # Very short paragraph
            # Option 1: Skip it (if it's just whitespace/formatting)
            # Option 2: Combine with next paragraph
            continue  # Let's skip for now
        
        # Step 2: Check paragraph length
        if len(paragraph) <= max_paragraph_length:
            # Short paragraph: keep as-is
            chunk_info = {
                'chunk_id': chunk_counter,
                'paragraph_index': para_idx,
                'chunk': paragraph,
                'chunk_type': 'whole_paragraph',
                'length': len(paragraph),
                'sliding_window_info': None  # Not applicable
            }
            all_chunks.append(chunk_info)
            chunk_counter += 1
        else:
            # Step 3: Long paragraph - apply sliding window
            window_chunks = sliding_window(
                paragraph, 
                sliding_window_size, 
                sliding_window_step
            )
            
            # Step 4: Format each window chunk
            for window_idx, window_chunk in enumerate(window_chunks):
                chunk_info = {
                    'chunk_id': chunk_counter,
                    'paragraph_index': para_idx,
                    'chunk': window_chunk['chunk'],
                    'chunk_type': 'sliding_window_segment',
                    'length': len(window_chunk['chunk']),
                    'sliding_window_info': {
                        'window_index': window_idx,
                        'total_windows': len(window_chunks),
                        'char_start': window_chunk['start'],
                        'char_end': window_chunk['start'] + len(window_chunk['chunk']),
                        'original_paragraph_length': len(paragraph)
                    }
                }
                all_chunks.append(chunk_info)
                chunk_counter += 1
    
    return all_chunks

def apply_paragraph_chunking_to_documents(documents):
    """Apply paragraph-based chunking to all documents - FIXED"""
    all_chunks = []
    
    for doc_idx, doc in enumerate(documents):
        doc_copy = doc.copy()
        doc_content = doc_copy.pop('content')
        
        # FIXED PARAMETERS:
        chunks = paragraph_chunking_with_sliding_window(
            doc_content,
            max_paragraph_length=500,        # Paragraphs under 500 chars stay whole
            sliding_window_size=1000,        # Reasonable chunk size
            sliding_window_step=500          # 50% overlap
        )
        
        # Add document metadata
        for chunk in chunks:
            chunk_with_metadata = doc_copy.copy()
            chunk_with_metadata.update(chunk)
            all_chunks.append(chunk_with_metadata)
    
    return all_chunks

workout_docs_chunks_2 = apply_paragraph_chunking_to_documents(workout_docs)
print(f"Paragraph+sliding chunks: {len(workout_docs_chunks_2)}")

Paragraph+sliding chunks: 67


In [5]:
# 2.3 section chunking
def split_markdown_by_level_improved(text, level=2, include_content_before_first_header=True):
    """
    Improved version that handles content before first header.
    
    Parameters:
    - text: Markdown text
    - level: Header level to split on (1 for #, 2 for ##, etc.)
    - include_content_before_first_header: Whether to include text before first header as a section
    
    Returns: List of (header, content) tuples
    """
    # Create the regex pattern for the specified level
    header_pattern = r'^(#{' + str(level) + r'} )(.+)$'
    pattern = re.compile(header_pattern, re.MULTILINE)
    
    # Find all header positions
    matches = list(pattern.finditer(text))
    
    if not matches:
        # No headers found at this level
        return [('No Header', text.strip())] if text.strip() else []
    
    sections = []
    
    # Handle content before first header
    first_match = matches[0]
    if include_content_before_first_header and first_match.start() > 0:
        before_content = text[:first_match.start()].strip()
        if before_content:
            sections.append(('Introduction', before_content))
    
    # Process each section
    for i, match in enumerate(matches):
        header_marker = match.group(1)  # e.g., "## "
        header_text = match.group(2)    # e.g., "Installation"
        full_header = header_marker + header_text
        
        # Determine the content for this section
        if i < len(matches) - 1:
            # Content is from after this header to before next header
            next_match = matches[i + 1]
            content = text[match.end():next_match.start()].strip()
        else:
            # Last section: content is from after header to end
            content = text[match.end():].strip()
        
        sections.append((full_header, content))
    
    return sections

def apply_section_chunking_to_documents(documents, level=2):
    all_chunks = []
    
    for doc_idx, doc in enumerate(documents):
        doc_copy = doc.copy()
        doc_content = doc_copy.pop('content')
        
        sections = split_markdown_by_level_improved(doc_content, level=level)
        
        for section_idx, (header, content) in enumerate(sections):
            if not content:
                continue
                
            chunk_info = {
                'chunk_id': f"doc_{doc_idx}_sec_{section_idx}",
                'header': header,
                'chunk': content,
                'chunk_type': f'section_level_{level}',
                'section_index': section_idx,
                'length': len(content),
                'has_header': header != 'No Header' and header != 'Introduction'
            }
            
            chunk_with_metadata = doc_copy.copy()
            chunk_with_metadata.update(chunk_info)
            all_chunks.append(chunk_with_metadata)
    
    return all_chunks

# Try multiple levels
workout_docs_chunks_3_level2 = apply_section_chunking_to_documents(workout_docs, level=2)
workout_docs_chunks_3_level3 = apply_section_chunking_to_documents(workout_docs, level=3)

print(f"Section chunks (level 2): {len(workout_docs_chunks_3_level2)}")
print(f"Section chunks (level 3): {len(workout_docs_chunks_3_level3)}")

Section chunks (level 2): 14
Section chunks (level 3): 28


In [6]:
def analyze_chunking_method(chunks, method_name):
    """
    Comprehensive analysis of a chunking method
    """
    if not chunks:
        return {}
    
    # Extract chunk contents
    chunk_contents = []
    for chunk in chunks:
        content = chunk.get('chunk') or chunk.get('section') or ''
        chunk_contents.append(content)
    
    # Basic statistics
    sizes = [len(content) for content in chunk_contents]
    char_counts = sizes
    
    # Token estimation (approx: 1 token ‚âà 4 chars for English)
    token_counts = [len(content) // 4 for content in chunk_contents]
    
    # Distribution analysis
    import statistics
    
    metrics = {
        'method': method_name,
        'total_chunks': len(chunks),
        'total_characters': sum(char_counts),
        'avg_chars_per_chunk': statistics.mean(char_counts),
        'median_chars_per_chunk': statistics.median(char_counts),
        'std_chars_per_chunk': statistics.stdev(char_counts) if len(char_counts) > 1 else 0,
        'min_chars': min(char_counts),
        'max_chars': max(char_counts),
        'avg_tokens_per_chunk': statistics.mean(token_counts),
        # Distribution percentiles
        'p25_chars': sorted(char_counts)[int(len(char_counts) * 0.25)] if char_counts else 0,
        'p75_chars': sorted(char_counts)[int(len(char_counts) * 0.75)] if char_counts else 0,
    }
    
    # Size categories (useful for visualization)
    small = len([c for c in char_counts if c < 500])
    medium = len([c for c in char_counts if 500 <= c < 2000])
    large = len([c for c in char_counts if c >= 2000])
    
    metrics['size_distribution'] = {
        'small_<500': small,
        'medium_500-2000': medium,
        'large_>=2000': large,
        'small_percent': (small / len(char_counts) * 100) if char_counts else 0
    }
    
    # Context preservation score (estimated)
    # Count how many chunks seem "complete"
    complete_chunks = 0
    for content in chunk_contents[:100]:  # Sample first 100 for speed
        # Simple heuristics for "completeness"
        if content.strip() and len(content) > 50:
            # Check if ends with sentence-ending punctuation
            # FIXED LINE: Removed the extra ')'
            if content[-1] in '.!?' or '\n\n' in content[-20:]:
                complete_chunks += 1
    
    metrics['estimated_completeness_score'] = (
        complete_chunks / min(100, len(chunks)) * 100 
        if chunks else 0
    )
    
    return metrics

sliding_analysis = analyze_chunking_method(workout_docs_chunks, 'sliding_window')
paragraph_analysis = analyze_chunking_method(workout_docs_chunks_2, 'paragraph_sliding')
section_analysis_2 = analyze_chunking_method(workout_docs_chunks_3_level2, 'section_level_2')
section_analysis_3 = analyze_chunking_method(workout_docs_chunks_3_level3, 'section_level_3')

In [7]:
def compare_analyses(all_analyses):
    """
    Compare all analysis results in a readable format
    """
    print("=" * 70)
    print("CHUNKING METHOD COMPARISON")
    print("=" * 70)
    
    # Define the metrics to compare (in order of importance)
    key_metrics = [
        'total_chunks',
        'avg_chars_per_chunk', 
        'min_chars',
        'max_chars',
        'estimated_completeness_score',
        'size_distribution.small_<500',
        'size_distribution.medium_500-2000',
        'size_distribution.large_>=2000'
    ]
    
    # Print header
    print(f"\n{'Metric':<35}", end="")
    for method_name in all_analyses.keys():
        print(f"{method_name:<15}", end="")
    print()
    print("-" * 80)
    
    # Print each metric
    for metric in key_metrics:
        print(f"{metric:<35}", end="")
        
        for method_name, analysis in all_analyses.items():
            # Handle nested metrics (size_distribution.small_<500)
            if '.' in metric:
                parts = metric.split('.')
                value = analysis.get(parts[0], {})
                if isinstance(value, dict):
                    value = value.get(parts[1], 'N/A')
            else:
                value = analysis.get(metric, 'N/A')
            
            # Format the value
            if isinstance(value, (int, float)):
                if metric == 'estimated_completeness_score':
                    print(f"{value:>13.1f}%  ", end="")
                elif metric in ['avg_chars_per_chunk']:
                    print(f"{value:>13.0f}    ", end="")
                else:
                    print(f"{value:>13}    ", end="")
            else:
                print(f"{str(value):>13}    ", end="")
        print()
    
    print("-" * 80)
    
    # Add interpretation
    print("\nüîç KEY INSIGHTS (Look for these patterns):")
    print("1. More chunks = better granularity for search")
    print("2. Higher completeness score = better context preservation")
    print("3. Balanced size distribution = good for most use cases")
    print("4. Large max_chars might indicate poor boundary detection")
    
    return all_analyses

# Run the comparison

# Compare all
all_analyses = {
    'sliding_window': sliding_analysis,
    'paragraph_sliding': paragraph_analysis,
    'section_level_2': section_analysis_2,
    'section_level_3': section_analysis_3
}

compare_analyses(all_analyses)

CHUNKING METHOD COMPARISON

Metric                             sliding_window paragraph_slidingsection_level_2section_level_3
--------------------------------------------------------------------------------
total_chunks                                  17               67               14               28    
avg_chars_per_chunk                         1465              171              901              436    
min_chars                                    897               50              161               29    
max_chars                                   1500              923             2864             1924    
estimated_completeness_score                35.3%           11.9%          100.0%           14.3%  
size_distribution.small_<500                   0               65                4               20    
size_distribution.medium_500-2000             17                2                9                8    
size_distribution.large_>=2000                 0                0    

{'sliding_window': {'method': 'sliding_window',
  'total_chunks': 17,
  'total_characters': 24897,
  'avg_chars_per_chunk': 1464.5294117647059,
  'median_chars_per_chunk': 1500,
  'std_chars_per_chunk': 146.2489818969088,
  'min_chars': 897,
  'max_chars': 1500,
  'avg_tokens_per_chunk': 366.11764705882354,
  'p25_chars': 1500,
  'p75_chars': 1500,
  'size_distribution': {'small_<500': 0,
   'medium_500-2000': 17,
   'large_>=2000': 0,
   'small_percent': 0.0},
  'estimated_completeness_score': 35.294117647058826},
 'paragraph_sliding': {'method': 'paragraph_sliding',
  'total_chunks': 67,
  'total_characters': 11456,
  'avg_chars_per_chunk': 170.98507462686567,
  'median_chars_per_chunk': 126,
  'std_chars_per_chunk': 162.42634624861867,
  'min_chars': 50,
  'max_chars': 923,
  'avg_tokens_per_chunk': 42.298507462686565,
  'p25_chars': 75,
  'p75_chars': 205,
  'size_distribution': {'small_<500': 65,
   'medium_500-2000': 2,
   'large_>=2000': 0,
   'small_percent': 97.01492537313433}

In [8]:
def create_visual_comparison(all_analyses):
    """
    Create simple bar charts for key metrics
    """
    import math
    
    print("\n" + "=" * 70)
    print("VISUAL COMPARISON (ASCII Charts)")
    print("=" * 70)
    
    # Chart 1: Total Chunks
    print("\nüìä TOTAL CHUNKS (More = finer granularity):")
    max_chunks = max([a.get('total_chunks', 0) for a in all_analyses.values()])
    
    for method_name, analysis in all_analyses.items():
        chunks = analysis.get('total_chunks', 0)
        bar_length = int((chunks / max_chunks) * 50) if max_chunks > 0 else 0
        bar = "‚ñà" * bar_length
        print(f"{method_name:<25} {chunks:>5} chunks {bar}")
    
    # Chart 2: Average Chunk Size
    print("\nüìè AVERAGE CHUNK SIZE (Chars):")
    max_avg = max([a.get('avg_chars_per_chunk', 0) for a in all_analyses.values()])
    
    for method_name, analysis in all_analyses.items():
        avg_size = analysis.get('avg_chars_per_chunk', 0)
        bar_length = int((avg_size / max_avg) * 50) if max_avg > 0 else 0
        bar = "‚ñà" * bar_length
        print(f"{method_name:<25} {avg_size:>5.0f} chars {bar}")
    
    # Chart 3: Completeness Score
    print("\n‚úÖ ESTIMATED COMPLETENESS SCORE (%):")
    
    for method_name, analysis in all_analyses.items():
        score = analysis.get('estimated_completeness_score', 0)
        bar_length = int(score / 2)  # 50 chars for 100%
        bar = "‚ñà" * bar_length
        print(f"{method_name:<25} {score:>5.1f}% {bar}")
    
    # Chart 4: Size Distribution
    print("\nüì¶ SIZE DISTRIBUTION:")
    for method_name, analysis in all_analyses.items():
        dist = analysis.get('size_distribution', {})
        small = dist.get('small_<500', 0)
        medium = dist.get('medium_500-2000', 0)
        large = dist.get('large_>=2000', 0)
        total = small + medium + large
        
        if total > 0:
            small_pct = (small / total) * 100
            medium_pct = (medium / total) * 100
            large_pct = (large / total) * 100
            
            print(f"\n{method_name}:")
            print(f"  Small (<500):  {'‚ñà' * int(small_pct/2)} {small_pct:.1f}%")
            print(f"  Medium (500-2k):{'‚ñà' * int(medium_pct/2)} {medium_pct:.1f}%") 
            print(f"  Large (‚â•2k):    {'‚ñà' * int(large_pct/2)} {large_pct:.1f}%")

create_visual_comparison(all_analyses)


VISUAL COMPARISON (ASCII Charts)

üìä TOTAL CHUNKS (More = finer granularity):
sliding_window               17 chunks ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
paragraph_sliding            67 chunks ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
section_level_2              14 chunks ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
section_level_3              28 chunks ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà

üìè AVERAGE CHUNK SIZE (Chars):
sliding_window             1465 chars ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
paragraph_sliding           171 chars ‚ñà‚ñà‚ñà‚ñà‚ñà
section_level_2             901 chars ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
section_level_3             436 chars ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà

In [9]:
def get_chunk_examples(chunks, method_name, num_examples=2):
    """Get example chunks for manual inspection"""
    print(f"\n{'='*60}")
    print(f"MANUAL INSPECTION: {method_name}")
    print(f"{'='*60}")
    
    if not chunks:
        print("No chunks available")
        return
    
    for i in range(min(num_examples, len(chunks))):
        chunk = chunks[i]
        content = chunk.get('chunk', '')
        
        print(f"\nüìÑ Example {i+1}/{len(chunks)}:")
        print(f"   Source: {chunk.get('title', 'Unknown')}")
        
        if 'header' in chunk:
            print(f"   Section: {chunk.get('header', 'N/A')}")
        
        print(f"   Length: {len(content)} characters")
        print(f"   Type: {chunk.get('chunk_type', 'unknown')}")
        
        # Boundary analysis
        if content:
            starts_capital = content[0].isupper()
            ends_punctuation = content[-1] in '.!?'
            print(f"   Starts with capital letter: {'‚úÖ' if starts_capital else '‚ùå'}")
            print(f"   Ends with punctuation: {'‚úÖ' if ends_punctuation else '‚ùå'}")
            
            # Check for mid-sentence cuts
            if not starts_capital:
                print(f"   ‚ö†Ô∏è  WARNING: Starts lowercase - likely cut mid-sentence")
            if not ends_punctuation:
                print(f"   ‚ö†Ô∏è  WARNING: No ending punctuation - might be incomplete")
        
        # Content preview (first and last 100 chars)
        print(f"\n   Preview (first 150 chars):")
        print(f"   \"{content[:150]}...\"")
        
        if len(content) > 200:
            print(f"\n   Ending (last 100 chars):")
            print(f"   \"...{content[-100:]}\"")
        
        print(f"\n   {'-'*50}")

# Run for ALL FOUR methods
print("üîç MANUAL CHUNK INSPECTION - ALL METHODS")
print("="*70)

# 1. Sliding Window
get_chunk_examples(workout_docs_chunks, "Sliding Window (size=1500, step=750)")

# 2. Paragraph + Sliding Window  
get_chunk_examples(workout_docs_chunks_2, "Paragraph + Sliding Window")

# 3. Section Level 2
get_chunk_examples(workout_docs_chunks_3_level2, "Section Chunking (Level 2)")

# 4. Section Level 3
get_chunk_examples(workout_docs_chunks_3_level3, "Section Chunking (Level 3)")

üîç MANUAL CHUNK INSPECTION - ALL METHODS

MANUAL INSPECTION: Sliding Window (size=1500, step=750)

üìÑ Example 1/17:
   Source: Unknown
   Length: 1500 characters
   Type: unknown
   Starts with capital letter: ‚ùå
   Ends with punctuation: ‚ùå

   Preview (first 150 chars):
   "# üèãÔ∏è Workout Type Recommendation System

A machine learning-based system that recommends workout types (Cardio, Strength, Yoga, or HIIT) based on user..."

   Ending (last 100 chars):
   "...get classes:
- Cardio
- Strength
- Yoga
- HIIT

---

## Dataset

**Source**: [Kaggle - Gym Members E"

   --------------------------------------------------

üìÑ Example 2/17:
   Source: Unknown
   Length: 1500 characters
   Type: unknown
   Starts with capital letter: ‚úÖ
   Ends with punctuation: ‚ùå

   Preview (first 150 chars):
   "The goal is to build a machine learning model that can predict which type of workout (Cardio, Strength, Yoga, or HIIT) would be most suitable for a pe..."

   Ending (last 100 chars

In [10]:
def assess_chunk_quality(chunks, method_name):
    """
    Comprehensive quality assessment with scoring
    """
    print(f"\nüî¨ QUALITY ASSESSMENT: {method_name}")
    print("-" * 50)
    
    if not chunks:
        print("No chunks to assess")
        return
    
    # Sample 5 chunks for assessment
    sample_size = min(5, len(chunks))
    sample_indices = [0, len(chunks)//4, len(chunks)//2, len(chunks)*3//4, -1]
    sample_indices = sample_indices[:sample_size]
    
    scores = {
        'completeness': 0,      # Complete sentences/thoughts
        'readability': 0,       # Can be understood alone
        'boundaries': 0,        # Natural start/end
        'relevance': 0          # Contains coherent topic
    }
    
    print(f"Assessing {sample_size} sample chunks...")
    
    for i, idx in enumerate(sample_indices):
        chunk = chunks[idx]
        content = chunk.get('chunk', '')
        
        print(f"\nSample {i+1} ({len(content)} chars):")
        
        # Score completeness
        starts_well = content and content[0].isupper()
        ends_well = content and content[-1] in '.!?'
        completeness = 1 if starts_well and ends_well else 0.5 if starts_well or ends_well else 0
        scores['completeness'] += completeness
        
        # Score readability (simple heuristic)
        word_count = len(content.split())
        readability = 1 if word_count > 50 and '.' in content else 0.5 if word_count > 20 else 0
        scores['readability'] += readability
        
        # Score boundaries
        boundaries = 1 if starts_well else 0.5 if ends_well else 0
        scores['boundaries'] += boundaries
        
        # Check for topic coherence (simple version)
        unique_words = len(set(content.lower().split()[:20]))
        relevance = 1 if unique_words < 15 else 0.5  # Fewer unique words = more focused
        scores['relevance'] += relevance
        
        print(f"  Complete: {'‚úÖ' if completeness == 1 else '‚ö†Ô∏è' if completeness == 0.5 else '‚ùå'}")
        print(f"  Readable: {'‚úÖ' if readability == 1 else '‚ö†Ô∏è' if readability == 0.5 else '‚ùå'}")
        print(f"  Preview: {content[:100]}...")
    
    # Calculate averages
    for key in scores:
        scores[key] = scores[key] / sample_size
    
    print(f"\nüìà FINAL SCORES (0-1 scale):")
    for key, score in scores.items():
        bar = "‚ñà" * int(score * 20)
        print(f"  {key.capitalize():<15}: {score:.2f} {bar}")
    
    overall = sum(scores.values()) / len(scores)
    print(f"\n  Overall score: {overall:.2f}/1.0")
    
    return scores

# Assess all four methods
print("\n" + "="*70)
print("COMPREHENSIVE QUALITY ASSESSMENT - ALL METHODS")
print("="*70)

quality_scores = {}
quality_scores['sliding'] = assess_chunk_quality(workout_docs_chunks, "Sliding Window")
quality_scores['paragraph'] = assess_chunk_quality(workout_docs_chunks_2, "Paragraph+Window")
quality_scores['section_l2'] = assess_chunk_quality(workout_docs_chunks_3_level2, "Section Level 2")
quality_scores['section_l3'] = assess_chunk_quality(workout_docs_chunks_3_level3, "Section Level 3")


COMPREHENSIVE QUALITY ASSESSMENT - ALL METHODS

üî¨ QUALITY ASSESSMENT: Sliding Window
--------------------------------------------------
Assessing 5 sample chunks...

Sample 1 (1500 chars):
  Complete: ‚ùå
  Readable: ‚úÖ
  Preview: # üèãÔ∏è Workout Type Recommendation System

A machine learning-based system that recommends workout typ...

Sample 2 (1500 chars):
  Complete: ‚ö†Ô∏è
  Readable: ‚úÖ
  Preview: gitignore                               # Git ignore file
‚îî‚îÄ‚îÄ README.md                             ...

Sample 3 (1500 chars):
  Complete: ‚ùå
  Readable: ‚úÖ
  Preview: th identical physical stats (same age, BMI, fitness level) can have completely different workout pre...

Sample 4 (1500 chars):
  Complete: ‚ùå
  Readable: ‚úÖ
  Preview: criptions:**

| Field | Type | Values | Description |
|-------|------|--------|-------------|
| age ...

Sample 5 (897 chars):
  Complete: ‚ùå
  Readable: ‚úÖ
  Preview: entication
   - Implement rate limiting
   - Add batch prediction e

In [11]:
import pandas as pd
from textwrap import wrap

def create_comprehensive_comparison(all_chunk_sets, all_analyses, quality_scores):
    """
    Create a complete dashboard showing ALL metrics
    """
    print("\n" + "="*100)
    print("COMPREHENSIVE CHUNKING ANALYSIS DASHBOARD")
    print("="*100)
    
    # Prepare data
    methods = list(all_analyses.keys())
    
    # Create summary table
    summary_data = []
    
    for method in methods:
        analysis = all_analyses.get(method, {})
        quality = quality_scores.get(method.replace('sliding_window', 'sliding')
                                     .replace('paragraph_sliding', 'paragraph')
                                     .replace('section_level_2', 'section_l2')
                                     .replace('section_level_3', 'section_l3'), {})
        
        # Extract metrics
        row = {
            'Method': method,
            'Total Chunks': analysis.get('total_chunks', 0),
            'Avg Chars': f"{analysis.get('avg_chars_per_chunk', 0):.0f}",
            'Min Chars': analysis.get('min_chars', 0),
            'Max Chars': analysis.get('max_chars', 0),
            'Auto Complete %': f"{analysis.get('estimated_completeness_score', 0):.1f}%",
            'Manual Overall': f"{quality.get('overall', 0):.2f}" if quality else "N/A",
            'Size Dist (S/M/L)': f"{analysis.get('size_distribution', {}).get('small_<500', 0)}/{analysis.get('size_distribution', {}).get('medium_500-2000', 0)}/{analysis.get('size_distribution', {}).get('large_>=2000', 0)}",
            'Small %': f"{analysis.get('size_distribution', {}).get('small_percent', 0):.1f}%",
        }
        
        # Add quality sub-scores if available
        if quality:
            row.update({
                'Comp Score': f"{quality.get('completeness', 0):.2f}",
                'Read Score': f"{quality.get('readability', 0):.2f}",
                'Bound Score': f"{quality.get('boundaries', 0):.2f}",
                'Rel Score': f"{quality.get('relevance', 0):.2f}",
            })
        
        summary_data.append(row)
    
    # Create DataFrame
    df = pd.DataFrame(summary_data)
    
    # Display as table
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', 120)
    
    print("\nüìà QUANTITATIVE METRICS TABLE:")
    print("-" * 120)
    print(df.to_string(index=False))
    print("-" * 120)
    
    return df

def create_visual_scoreboard(all_analyses, quality_scores):
    """
    Create a visual scoreboard with color coding
    """
    print("\n" + "="*100)
    print("VISUAL SCOREBOARD (Higher is Better)")
    print("="*100)
    
    methods = list(all_analyses.keys())
    
    # Define metrics to display
    metrics = [
        ('Total Chunks', 'total_chunks', 'numeric', False),  # Not always "higher is better"
        ('Avg Size', 'avg_chars_per_chunk', 'numeric', True),  # Optimal range is better
        ('Auto Complete', 'estimated_completeness_score', 'percent', True),
        ('Manual Overall', 'overall', 'score', True),
        ('Readability', 'readability', 'score', True),
        ('Boundaries', 'boundaries', 'score', True),
    ]
    
    # Get max values for scaling
    max_values = {}
    for metric_name, metric_key, metric_type, higher_is_better in metrics:
        values = []
        for method in methods:
            if metric_type == 'score':
                # Get from quality_scores
                method_key = method.replace('sliding_window', 'sliding')\
                                 .replace('paragraph_sliding', 'paragraph')\
                                 .replace('section_level_2', 'section_l2')\
                                 .replace('section_level_3', 'section_l3')
                val = quality_scores.get(method_key, {}).get(metric_key, 0)
            else:
                # Get from all_analyses
                val = all_analyses.get(method, {}).get(metric_key, 0)
            if val:
                values.append(float(str(val).replace('%', '')))
        max_values[metric_key] = max(values) if values else 1
    
    # Print header
    print(f"\n{'Method':<20}", end="")
    for metric_name, _, _, _ in metrics:
        print(f"{metric_name:<15}", end="")
    print("\n" + "-" * (20 + 15 * len(metrics)))
    
    # Print each method
    for method in methods:
        print(f"{method:<20}", end="")
        
        for metric_name, metric_key, metric_type, higher_is_better in metrics:
            if metric_type == 'score':
                method_key = method.replace('sliding_window', 'sliding')\
                                 .replace('paragraph_sliding', 'paragraph')\
                                 .replace('section_level_2', 'section_l2')\
                                 .replace('section_level_3', 'section_l3')
                val = quality_scores.get(method_key, {}).get(metric_key, 0)
                display_val = f"{val:.2f}" if val else "N/A"
            else:
                val = all_analyses.get(method, {}).get(metric_key, 0)
                if metric_type == 'percent':
                    display_val = f"{val:.1f}%" if val else "N/A"
                else:
                    display_val = f"{val:.0f}" if val else "N/A"
            
            # Add visual indicator
            if val and metric_type in ['percent', 'score']:
                normalized = float(str(val).replace('%', '')) / max_values[metric_key]
                bars = int(normalized * 10)
                visual = "‚ñà" * bars + "‚ñë" * (10 - bars)
                print(f"{display_val:>6} {visual:<8}", end="")
            else:
                print(f"{display_val:>6} {'N/A':<8}", end="")
        
        print()
    
    print("-" * (20 + 15 * len(metrics)))

def create_radar_chart_data(all_analyses, quality_scores):
    """
    Prepare data for a "text-based radar chart"
    """
    print("\n" + "="*100)
    print("PERFORMANCE RADAR (1-5 Scale)")
    print("="*100)
    
    methods = list(all_analyses.keys())
    criteria = ['Completeness', 'Readability', 'Size Balance', 'Usefulness', 'Boundary Quality']
    
    # Score each method on each criterion (1-5)
    scores = {}
    
    for method in methods:
        analysis = all_analyses.get(method, {})
        method_key = method.replace('sliding_window', 'sliding')\
                         .replace('paragraph_sliding', 'paragraph')\
                         .replace('section_level_2', 'section_l2')\
                         .replace('section_level_3', 'section_l3')
        quality = quality_scores.get(method_key, {})
        
        method_scores = []
        
        # 1. Completeness (auto + manual combined)
        auto_comp = analysis.get('estimated_completeness_score', 0) / 100
        manual_comp = quality.get('completeness', 0)
        completeness = (auto_comp + manual_comp) / 2 * 5
        method_scores.append(min(5, completeness))
        
        # 2. Readability
        readability = quality.get('readability', 0) * 5
        method_scores.append(min(5, readability))
        
        # 3. Size Balance (ideal: not too small, not too large)
        avg_size = analysis.get('avg_chars_per_chunk', 0)
        # Score: 5 if ~1000 chars, lower if far from ideal
        size_score = 5 * (1 - min(abs(avg_size - 1000) / 1000, 1))
        method_scores.append(min(5, size_score))
        
        # 4. Usefulness (estimated from relevance)
        usefulness = quality.get('relevance', 0) * 5
        method_scores.append(min(5, usefulness))
        
        # 5. Boundary Quality
        boundaries = quality.get('boundaries', 0) * 5
        method_scores.append(min(5, boundaries))
        
        scores[method] = method_scores
    
    # Print radar chart
    max_score = 5
    
    for criterion_idx, criterion in enumerate(criteria):
        print(f"\n{criterion:<15}", end="")
        for method in methods:
            score = scores[method][criterion_idx]
            bars = int(score)
            visual = "‚òÖ" * bars + "‚òÜ" * (5 - bars)
            print(f"{method:<25} {score:3.1f} {visual}", end="  ")
        print()
    
    # Calculate overall scores
    print("\n" + "-" * 100)
    print("OVERALL SCORES (Average of 5 criteria):")
    for method in methods:
        overall = sum(scores[method]) / len(scores[method])
        print(f"{method:<25}: {overall:.2f}/5.0 {'üèÜ' if overall == max([sum(scores[m])/len(scores[m]) for m in methods]) else ''}")

def show_chunk_samples_side_by_side(all_chunk_sets, num_samples=2):
    """
    Show actual chunk samples from each method side-by-side
    """
    print("\n" + "="*100)
    print(f"SAMPLE CHUNKS COMPARISON (First {num_samples} chunks)")
    print("="*100)
    
    methods = list(all_chunk_sets.keys())
    
    for sample_idx in range(num_samples):
        print(f"\n{'='*50} SAMPLE #{sample_idx+1} {'='*50}")
        
        for method in methods:
            chunks = all_chunk_sets[method]
            if sample_idx < len(chunks):
                chunk = chunks[sample_idx]
                content = chunk.get('chunk', 'No content')
                
                print(f"\nüìÑ {method.upper()}:")
                print(f"   Size: {len(content):<5} chars  |  Source: {chunk.get('title', 'Unknown'):<30}")
                if 'header' in chunk:
                    print(f"   Header: {chunk.get('header', '')}")
                
                # Show preview (first and last 100 chars)
                preview = content[:200] + "..." if len(content) > 200 else content
                wrapped = "\n                ".join(wrap(preview, width=80))
                print(f"   Preview: {wrapped}")
                
                # Quality indicators
                if content:
                    starts_ok = content[0].isupper()
                    ends_ok = content[-1] in '.!?'
                    print(f"   Quality: {'‚úÖ' if starts_ok else '‚ùå'} Start | {'‚úÖ' if ends_ok else '‚ùå'} End")
            else:
                print(f"\nüìÑ {method.upper()}: No chunk at index {sample_idx}")
        
        print("\n" + "-" * 100)

# ===== RUN EVERYTHING =====

print("üöÄ COMPREHENSIVE ANALYSIS - ALL METRICS")
print("="*100)

# Prepare your data
all_chunk_sets = {
    'sliding_window': workout_docs_chunks,
    'paragraph_sliding': workout_docs_chunks_2,
    'section_level_2': workout_docs_chunks_3_level2,
    'section_level_3': workout_docs_chunks_3_level3
}

all_analyses = {
    'sliding_window': sliding_analysis,
    'paragraph_sliding': paragraph_analysis,
    'section_level_2': section_analysis_2,
    'section_level_3': section_analysis_3
}

# You need to run analysis for section level 3
section_chunk_analysis_3 = analyze_chunking_method(workout_docs_chunks_3_level3, 'section_level_3')

# Run all visualizations
df = create_comprehensive_comparison(all_chunk_sets, all_analyses, quality_scores)
create_visual_scoreboard(all_analyses, quality_scores)
create_radar_chart_data(all_analyses, quality_scores)
show_chunk_samples_side_by_side(all_chunk_sets, num_samples=2)

# Final summary
print("\n" + "="*100)
print("üéØ EXECUTIVE SUMMARY")
print("="*100)

# Calculate winner based on multiple criteria
def determine_winner(all_analyses, quality_scores):
    methods = list(all_analyses.keys())
    weighted_scores = []
    
    for method in methods:
        analysis = all_analyses[method]
        method_key = method.replace('sliding_window', 'sliding')\
                         .replace('paragraph_sliding', 'paragraph')\
                         .replace('section_level_2', 'section_l2')\
                         .replace('section_level_3', 'section_l3')
        quality = quality_scores.get(method_key, {})
        
        # Weighted scoring (adjust weights based on your use case)
        score = (
            analysis.get('estimated_completeness_score', 0) * 0.3 +          # Auto complete
            quality.get('overall', 0) * 100 * 0.4 +                          # Manual overall
            (1 - analysis.get('size_distribution', {}).get('small_percent', 0)/100) * 100 * 0.2 +  # Not too small
            quality.get('readability', 0) * 100 * 0.1                        # Readability
        )
        
        weighted_scores.append((method, score))
    
    return sorted(weighted_scores, key=lambda x: x[1], reverse=True)

winners = determine_winner(all_analyses, quality_scores)

print("\nüèÜ RANKING (Weighted Score):")
for i, (method, score) in enumerate(winners, 1):
    medal = "ü•á" if i == 1 else "ü•à" if i == 2 else "ü•â" if i == 3 else f"{i}."
    print(f"{medal} {method:<20}: {score:.1f}/100")

print("\nüìã RECOMMENDATION:")
print(f"1. Use '{winners[0][0]}' for best overall performance")
print(f"2. Consider '{winners[1][0]}' as alternative")
print(f"3. Avoid '{winners[-1][0]}' (lowest score)")

print("\n" + "="*100)
print("ANALYSIS COMPLETE - All metrics displayed above")
print("="*100)

üöÄ COMPREHENSIVE ANALYSIS - ALL METRICS

COMPREHENSIVE CHUNKING ANALYSIS DASHBOARD

üìà QUANTITATIVE METRICS TABLE:
------------------------------------------------------------------------------------------------------------------------
           Method  Total Chunks Avg Chars  Min Chars  Max Chars Auto Complete % Manual Overall Size Dist (S/M/L) Small % Comp Score Read Score Bound Score Rel Score
   sliding_window            17      1465        897       1500           35.3%           0.00            0/17/0    0.0%       0.10       1.00        0.10      0.60
paragraph_sliding            67       171         50        923           11.9%           0.00            65/2/0   97.0%       0.40       0.30        0.60      0.70
  section_level_2            14       901        161       2864          100.0%           0.00             4/9/1   28.6%       0.10       0.80        0.10      0.60
  section_level_3            28       436         29       1924           14.3%           0.00      

In [12]:
# 3
# choosen chunking method: section level 2
from minsearch import Index
workout_chunks = workout_docs_chunks_3_level2  # Section chunking was best

def setup_text_search(chunks):
    """
    Setup text search index for your chunks - UPDATED VERSION
    """
    # Check what fields actually exist
    sample_chunk = chunks[0]
    available_fields = list(sample_chunk.keys())
    print(f"Available fields in chunks: {available_fields}")
    
    # Choose fields that contain searchable text
    # Your chunks have: ['filename', 'chunk_id', 'header', 'chunk', 'chunk_type', 'section_index', 'length', 'has_header']
    text_fields = ["chunk", "header", "filename"]  # These have searchable text
    
    print(f"Creating text search index with fields: {text_fields}")
    
    index = Index(
        text_fields=text_fields,
        keyword_fields=[]
    )
    
    index.fit(chunks)
    print(f"‚úÖ Text search index created with {len(chunks)} chunks")
    return index

# Recreate text index with correct fields
text_index = setup_text_search(workout_chunks)

Available fields in chunks: ['filename', 'chunk_id', 'header', 'chunk', 'chunk_type', 'section_index', 'length', 'has_header']
Creating text search index with fields: ['chunk', 'header', 'filename']
‚úÖ Text search index created with 14 chunks


In [13]:
from sentence_transformers import SentenceTransformer
from minsearch import VectorSearch
from tqdm.auto import tqdm
import numpy as np

def setup_vector_search(chunks, model_name='multi-qa-distilbert-cos-v1'):
    """
    Setup vector search with embeddings - UPDATED VERSION
    """
    # Load embedding model
    print(f"Loading embedding model: {model_name}")
    embedding_model = SentenceTransformer(model_name)
    
    # Create embeddings for all chunks
    print("Creating embeddings...")
    embeddings = []
    
    for chunk in tqdm(chunks):
        # Combine relevant fields for embedding
        # Your chunks don't have 'title', use 'header' instead
        text = chunk.get('chunk', '')
        
        # Add header for context (if exists)
        if 'header' in chunk:
            text = chunk['header'] + " " + text
        
        # Optionally add filename for more context
        if 'filename' in chunk:
            # Extract just the filename, not full path
            filename = chunk['filename'].split('/')[-1]
            text = filename + " " + text
            
        v = embedding_model.encode(text)
        embeddings.append(v)
    
    embeddings = np.array(embeddings)
    
    # Create vector search index
    vector_index = VectorSearch()
    vector_index.fit(embeddings, chunks)
    
    print(f"‚úÖ Vector search index created with {len(embeddings)} embeddings")
    return vector_index, embedding_model

# Create vector search index
vector_index, embedding_model = setup_vector_search(workout_chunks)

Loading embedding model: multi-qa-distilbert-cos-v1
Creating embeddings...


  0%|          | 0/14 [00:00<?, ?it/s]

‚úÖ Vector search index created with 14 embeddings


In [14]:
# Add these helper functions FIRST
def display_search_results(results, query="", max_preview=100):
    """Display search results nicely"""
    if not results:
        print(f"No results found for '{query}'")
        return
    
    print(f"Found {len(results)} results:")
    for i, result in enumerate(results, 1):
        filename = result.get('filename', 'Unknown').split('/')[-1]
        header = result.get('header', 'No section')
        
        print(f"\n{i}. üìÑ {filename}")
        print(f"   üìç {header}")
        
        # Show preview
        chunk = result.get('chunk', '')
        if chunk:
            preview = chunk[:max_preview] + "..." if len(chunk) > max_preview else chunk
            print(f"   üîç {preview}")

def hybrid_search_fixed(query, text_index, vector_index, embedding_model, top_k=5):
    """Balanced hybrid search - takes equal from both methods"""
    # Get results from both methods
    text_results = text_index.search(query, num_results=top_k*2)  # Get extra for flexibility
    query_embedding = embedding_model.encode(query)
    vector_results = vector_index.search(query_embedding, num_results=top_k*2)
    
    # Calculate how many to take from each (balanced)
    from_each = (top_k + 1) // 2  # e.g., 3 for top_k=5, 2 for top_k=3
    
    combined_results = []
    seen_chunks = set()
    
    # Function to add result with deduplication
    def add_result(result, source):
        chunk_id = result.get('chunk_id', '') or result.get('chunk', '')[:100]
        if chunk_id not in seen_chunks:
            seen_chunks.add(chunk_id)
            result = result.copy()  # Avoid modifying original
            result['_source'] = source
            combined_results.append(result)
            return True
        return False
    
    # First, take balanced from each method
    text_added = 0
    vector_added = 0
    
    # Interleave: text, vector, text, vector...
    max_attempts = max(len(text_results), len(vector_results))
    
    for i in range(max_attempts):
        # Try to add text result
        if text_added < from_each and i < len(text_results):
            if add_result(text_results[i], 'text'):
                text_added += 1
        
        # Try to add vector result
        if vector_added < from_each and i < len(vector_results):
            if add_result(vector_results[i], 'vector'):
                vector_added += 1
        
        # Stop if we have enough
        if len(combined_results) >= top_k:
            break
    
    # If still need more, take best remaining
    if len(combined_results) < top_k:
        all_results = text_results + vector_results
        for result in all_results:
            if len(combined_results) >= top_k:
                break
            add_result(result, 'mixed')
    
    return combined_results[:top_k]

def test_query(query, top_k=2):
    """Test all three methods with updated hybrid"""
    print(f"\nüîç QUERY: '{query}'")
    print("="*60)
    
    print("\nüìÑ TEXT SEARCH:")
    text_results = text_index.search(query, num_results=top_k)
    display_search_results(text_results, query)
    
    print("\nüß† VECTOR SEARCH:")
    query_embedding = embedding_model.encode(query)
    vector_results = vector_index.search(query_embedding, num_results=top_k)
    display_search_results(vector_results, query)
    
    print("\nü§ù HYBRID SEARCH (BALANCED):")
    hybrid_results = hybrid_search_fixed(query, text_index, vector_index, embedding_model, top_k=top_k)
    # Custom display to show sources
    if hybrid_results:
        print(f"Found {len(hybrid_results)} results:")
        for i, result in enumerate(hybrid_results, 1):
            filename = result.get('filename', 'Unknown').split('/')[-1]
            header = result.get('header', 'No section')
            source = result.get('_source', 'unknown')
            
            print(f"\n{i}. üìÑ {filename} [{source}]")
            print(f"   üìç {header}")
            
            # Show preview
            chunk = result.get('chunk', '')
            if chunk:
                preview = chunk[:100] + "..." if len(chunk) > 100 else chunk
                print(f"   üîç {preview}")
    else:
        print(f"No results found for '{query}'")
    
    return text_results, vector_results, hybrid_results

print("\n" + "="*80)
print("RE-TESTING WITH UPDATED BALANCED HYBRID SEARCH")
print("="*80)

# Re-test the same queries
all_results_updated = {}
test_queries = ["installation", "how to setup", "machine learning"]

for i, query in enumerate(test_queries, 1):
    print(f"\nüìä TEST {i}/3")
    text_res, vector_res, hybrid_res = test_query(query, top_k=3)
    all_results_updated[query] = {
        'text': text_res,
        'vector': vector_res,
        'hybrid': hybrid_res
    }
    print("\n" + "-"*60)


RE-TESTING WITH UPDATED BALANCED HYBRID SEARCH

üìä TEST 1/3

üîç QUERY: 'installation'

üìÑ TEXT SEARCH:
Found 2 results:

1. üìÑ README.md
   üìç ## Installation
   üîç ### Prerequisites

- Python 3.9+
- pip
- Docker (optional, for containerization)

### Option 1: Loca...

2. üìÑ README.md
   üìç ## Table of Contents
   üîç - [Problem Description](#problem-description)
- [Dataset](#dataset)
- [Project Structure](#project-s...

üß† VECTOR SEARCH:
Found 3 results:

1. üìÑ README.md
   üìç ## Installation
   üîç ### Prerequisites

- Python 3.9+
- pip
- Docker (optional, for containerization)

### Option 1: Loca...

2. üìÑ README.md
   üìç ## Deployment
   üîç ### Local Deployment with Docker

```bash
# Build the Docker image
docker build -t workout-recommend...

3. üìÑ README.md
   üìç ## Table of Contents
   üîç - [Problem Description](#problem-description)
- [Dataset](#dataset)
- [Project Structure](#project-s...

ü§ù HYBRID SEARCH (BALANCED):
Found 3 results:

1.

In [15]:
# 3 - evaluation
def create_updated_evaluation_summary(all_results):
    """
    Create evaluation summary with hybrid sources
    """
    print("\n" + "="*80)
    print("UPDATED EVALUATION WITH BALANCED HYBRID")
    print("="*80)
    
    print(f"\n{'Query':<20} {'Method':<12} {'Top Result':<25} {'Source':<10} {'Relevance':<10}")
    print("-" * 85)
    
    for query, results in all_results.items():
        # Text and Vector
        for method in ['text', 'vector']:
            method_results = results[method]
            if method_results:
                top_result = method_results[0]
                header = top_result.get('header', 'No section')
                if len(header) > 22:
                    header = header[:19] + "..."
                
                relevance = 5  # Based on your earlier scoring
                stars = "‚òÖ" * relevance
                
                print(f"{query:<20} {method:<12} {header:<25} {'N/A':<10} {stars:<10}")
        
        # Hybrid (show first result with source)
        hybrid_results = results['hybrid']
        if hybrid_results:
            top_hybrid = hybrid_results[0]
            header = top_hybrid.get('header', 'No section')
            if len(header) > 22:
                header = header[:19] + "..."
            source = top_hybrid.get('_source', 'unknown')
            
            relevance = 5
            stars = "‚òÖ" * relevance
            
            print(f"{query:<20} {'hybrid':<12} {header:<25} {source:<10} {stars:<10}")
    
    print("-" * 85)
    
    # Diversity analysis
    print("\nüîç DIVERSITY ANALYSIS:")
    print("-" * 40)
    
    for query, results in all_results.items():
        hybrid_results = results['hybrid']
        if hybrid_results:
            sources = [r.get('_source', 'unknown') for r in hybrid_results]
            text_count = sources.count('text')
            vector_count = sources.count('vector')
            mixed_count = sources.count('mixed')
            
            print(f"\n'{query}':")
            print(f"  Text results in hybrid: {text_count}")
            print(f"  Vector results in hybrid: {vector_count}")
            print(f"  Mixed/other: {mixed_count}")
            
            if text_count > 0 and vector_count > 0:
                print(f"  ‚úÖ Balanced! Contains both text and vector results")
            elif text_count > 0:
                print(f"  ‚ö†Ô∏è  Text-heavy (mostly text results)")
            elif vector_count > 0:
                print(f"  ‚ö†Ô∏è  Vector-heavy (mostly vector results)")
    
    # Coverage analysis
    print("\nüìä COVERAGE IMPROVEMENT:")
    print("-" * 40)
    
    for query, results in all_results.items():
        text_sections = set(r.get('header') for r in results['text'])
        vector_sections = set(r.get('header') for r in results['vector'])
        hybrid_sections = set(r.get('header') for r in results['hybrid'])
        
        all_possible = text_sections | vector_sections
        
        print(f"\n'{query}':")
        print(f"  Text unique: {len(text_sections)} sections")
        print(f"  Vector unique: {len(vector_sections)} sections")
        print(f"  Hybrid unique: {len(hybrid_sections)} sections")
        print(f"  All possible: {len(all_possible)} sections")
        
        if all_possible:
            coverage = len(hybrid_sections) / len(all_possible) * 100
            print(f"  Hybrid coverage: {coverage:.1f}% of possible sections")

# Run updated evaluation
create_updated_evaluation_summary(all_results_updated)


UPDATED EVALUATION WITH BALANCED HYBRID

Query                Method       Top Result                Source     Relevance 
-------------------------------------------------------------------------------------
installation         text         ## Installation           N/A        ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ     
installation         vector       ## Installation           N/A        ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ     
installation         hybrid       ## Installation           text       ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ     
how to setup         text         ## Installation           N/A        ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ     
how to setup         vector       ## Deployment             N/A        ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ     
how to setup         hybrid       ## Installation           text       ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ     
machine learning     text         Introduction              N/A        ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ     
machine learning     vector       ## Technologies Used      N/A        ‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ     
machine learning     hybrid       Introduction        

In [17]:
def create_comparison_matrix(all_results):
    """
    Create a visual comparison matrix
    """
    print("\n" + "="*80)
    print("SEARCH METHOD COMPARISON MATRIX")
    print("="*80)
    
    queries = list(all_results.keys())
    
    # Header
    print(f"\n{'Query':<20}", end="")
    for method in ['Text', 'Vector', 'Hybrid']:
        print(f"{method:<25}", end="")
    print()
    print("-" * 95)
    
    # Rows
    for query in queries:
        print(f"{query:<20}", end="")
        
        for method in ['text', 'vector', 'hybrid']:
            results = all_results[query][method]
            if results:
                top_result = results[0]
                header = top_result.get('header', '')
                # Shorten for display
                if len(header) > 20:
                    header = header[:17] + "..."
                print(f"{header:<25}", end="")
            else:
                print(f"{'No results':<25}", end="")
        
        print()
    
    print("-" * 95)
    
    # Key insights
    print("\nüîç KEY INSIGHTS:")
    print("1. Text and Vector often find DIFFERENT but relevant sections")
    print("2. Hybrid combines strengths of both methods")
    print("3. Vector search understands semantic relationships")

# Run comparison
create_comparison_matrix(all_results)

NameError: name 'all_results' is not defined

In [18]:
def deeper_analysis_fixed():
    """
    Updated analysis with balanced hybrid search
    """
    print("\n" + "="*80)
    print("UPDATED DEEPER ANALYSIS WITH BALANCED HYBRID")
    print("="*80)
    
    test_queries = ["installation", "how to setup", "machine learning"]
    
    diversity_scores = {}
    
    for query in test_queries:
        # Get fresh results with balanced hybrid
        text_results = text_index.search(query, num_results=5)
        query_embedding = embedding_model.encode(query)
        vector_results = vector_index.search(query_embedding, num_results=5)
        hybrid_results = hybrid_search_fixed(query, text_index, vector_index, embedding_model, top_k=5)
        
        # Count unique sections
        text_sections = set(r.get('header') for r in text_results)
        vector_sections = set(r.get('header') for r in vector_results)
        hybrid_sections = set(r.get('header') for r in hybrid_results)
        all_unique_sections = text_sections | vector_sections
        
        diversity_scores[query] = {
            'text_unique': len(text_sections),
            'vector_unique': len(vector_sections),
            'hybrid_unique': len(hybrid_sections),
            'total_unique': len(all_unique_sections)
        }
        
        # Show what each method found
        print(f"\nüîç Query: '{query}'")
        print(f"  Text search found: {text_sections}")
        print(f"  Vector search found: {vector_sections}")
        print(f"  Hybrid search found: {hybrid_sections}")
        print(f"  All possible: {all_unique_sections}")
    
    # Display table
    print(f"\n{'Query':<20} {'Text Unique':<15} {'Vector Unique':<15} {'Hybrid Unique':<15} {'Combined Unique':<15}")
    print("-" * 80)
    
    for query, scores in diversity_scores.items():
        print(f"{query:<20} {scores['text_unique']:<15} {scores['vector_unique']:<15} {scores['hybrid_unique']:<15} {scores['total_unique']:<15}")
    
    print("-" * 80)
    
    # Calculate coverage
    print("\nüìä COVERAGE ANALYSIS:")
    total_potential = sum(scores['total_unique'] for scores in diversity_scores.values())
    text_coverage = sum(scores['text_unique'] for scores in diversity_scores.values()) / total_potential * 100
    vector_coverage = sum(scores['vector_unique'] for scores in diversity_scores.values()) / total_potential * 100
    hybrid_coverage = sum(scores['hybrid_unique'] for scores in diversity_scores.values()) / total_potential * 100
    
    print(f"Text search covers:   {text_coverage:.1f}% of unique relevant sections")
    print(f"Vector search covers: {vector_coverage:.1f}% of unique relevant sections")
    print(f"Hybrid search covers: {hybrid_coverage:.1f}% of unique relevant sections")
    
    improvement = hybrid_coverage - max(text_coverage, vector_coverage)
    print(f"\nüí° Insight: Hybrid captures {improvement:.1f}% more unique content!")
    
    if improvement > 0:
        print("‚úÖ Hybrid search provides better coverage!")
    elif improvement == 0:
        print("‚ö†Ô∏è  Hybrid provides same coverage (may need more top_k)")
    else:
        print("‚ùå Something wrong - hybrid should not have less coverage")

# Run the fixed analysis
deeper_analysis_fixed()


UPDATED DEEPER ANALYSIS WITH BALANCED HYBRID

üîç Query: 'installation'
  Text search found: {'## Installation', '## Table of Contents'}
  Vector search found: {'## Table of Contents', '## Author', '## Technologies Used', '## Deployment', '## Installation'}
  Hybrid search found: {'## Table of Contents', '## Author', '## Technologies Used', '## Deployment', '## Installation'}
  All possible: {'## Technologies Used', '## Deployment', '## Installation', '## Table of Contents', '## Author'}

üîç Query: 'how to setup'
  Text search found: {'## Running the Project', '## Future Improvements', '## Problem Description', '## Model Performance', '## Installation'}
  Vector search found: {'## Running the Project', '## Table of Contents', '## Deployment', '## API Documentation', '## Installation'}
  Hybrid search found: {'## Running the Project', '## Table of Contents', '## Model Performance', '## Deployment', '## Installation'}
  All possible: {'## Running the Project', '## Future Improvements

In [19]:
def test_divergent_queries():
    """
    Test queries where text and vector should find different things
    """
    print("\n" + "="*80)
    print("TESTING DIVERGENT QUERIES")
    print("="*80)
    
    # Queries where methods should diverge
    divergent_queries = [
        # Text should find exact terms, vector should find concepts
        ("setup", "setup instructions"),  # Text: exact match, Vector: related concepts
        ("docker", "container"),          # Text: exact "docker", Vector: container concepts
        ("API", "endpoint"),              # Text: "API", Vector: "endpoint" concepts
        ("train model", "training"),      # Text: exact phrase, Vector: training concepts
        ("github repo", "repository"),    # Text: "github", Vector: repository concepts
    ]
    
    for query, expected_difference in divergent_queries:
        print(f"\nüîç Query: '{query}' (expected: {expected_difference})")
        print("-" * 60)
        
        # Get results
        text_results = text_index.search(query, num_results=3)
        query_embedding = embedding_model.encode(query)
        vector_results = vector_index.search(query_embedding, num_results=3)
        hybrid_results = hybrid_search_fixed(query, text_index, vector_index, embedding_model, top_k=6)
        
        # Get sections
        text_sections = set(r.get('header') for r in text_results)
        vector_sections = set(r.get('header') for r in vector_results)
        hybrid_sections = set(r.get('header') for r in hybrid_results)
        all_possible = text_sections | vector_sections
        
        # Calculate overlap
        overlap = text_sections & vector_sections
        text_only = text_sections - vector_sections
        vector_only = vector_sections - text_sections
        
        print(f"Text found: {text_sections}")
        print(f"Vector found: {vector_sections}")
        print(f"Overlap: {overlap}")
        print(f"Text only: {text_only}")
        print(f"Vector only: {vector_only}")
        print(f"Hybrid found: {hybrid_sections}")
        print(f"All possible: {all_possible}")
        
        # Calculate metrics
        if all_possible:
            text_coverage = len(text_sections) / len(all_possible) * 100
            vector_coverage = len(vector_sections) / len(all_possible) * 100
            hybrid_coverage = len(hybrid_sections) / len(all_possible) * 100
            
            print(f"\nüìä Coverage: Text={text_coverage:.0f}%, Vector={vector_coverage:.0f}%, Hybrid={hybrid_coverage:.0f}%")
            
            if len(text_only) > 0 and len(vector_only) > 0:
                print(f"‚úÖ DIVERGENT! Methods found different content")
                print(f"   Hybrid can combine {len(text_only)} text-only + {len(vector_only)} vector-only sections")
            elif len(text_only) > 0:
                print(f"‚ö†Ô∏è  Text found unique content, vector didn't")
            elif len(vector_only) > 0:
                print(f"‚ö†Ô∏è  Vector found unique content, text didn't")
            else:
                print(f"‚ö†Ô∏è  Methods found same content")

# Run divergent test
test_divergent_queries()


TESTING DIVERGENT QUERIES

üîç Query: 'setup' (expected: setup instructions)
------------------------------------------------------------
Text found: {'## Installation'}
Vector found: {'## Deployment', '## Installation', '## Table of Contents'}
Overlap: {'## Installation'}
Text only: set()
Vector only: {'## Deployment', '## Table of Contents'}
Hybrid found: {'## Table of Contents', '## Dataset', '## Deployment', '## Acknowledgments', '## Installation', '## API Documentation'}
All possible: {'## Deployment', '## Installation', '## Table of Contents'}

üìä Coverage: Text=33%, Vector=100%, Hybrid=200%
‚ö†Ô∏è  Vector found unique content, text didn't

üîç Query: 'docker' (expected: container)
------------------------------------------------------------
Text found: {'## Deployment', '## Installation', '## Project Structure'}
Vector found: {'## Deployment', '## Installation', '## Project Structure'}
Overlap: {'## Deployment', '## Installation', '## Project Structure'}
Text only: set()
Ve

In [None]:
def final_homework_insight():
    print("\n" + "="*80)
    print("FINAL HOMEWORK INSIGHT")
    print("="*80)
    
    print("""
üìä THE REAL STORY BEHIND "0.0% MORE":

1. Vector Search is Exceptionally Good:
   ‚Ä¢ Captures 78.9% of all unique content
   ‚Ä¢ Already does most of the work hybrid is meant to do

2. Hybrid's Value Revealed in Divergence:
   ‚Ä¢ For 'how to setup':
     - Text found 3 unique sections not found by vector
     - Vector found 2 unique sections not found by text
     - Total possible unique sections: 8
     - Individual methods: 5 each
     - Hybrid (with top_k=5): Also 5 (limited by display count)
     - Hybrid's potential: Could show up to 8 with larger top_k

3. The Top-K Limitation:
   ‚Ä¢ With top_k=5, hybrid physically can't show all 8 unique sections
   ‚Ä¢ Must choose which 5 to display
   ‚Ä¢ This creates the illusion of "0.0% improvement"

4. Real-World Evidence of Hybrid Value:
   Query: 'how to setup'
   ‚Ä¢ Text-only user sees: Installation, Future Improvements, Problem Description, Model Performance, Running Project
   ‚Ä¢ Vector-only user sees: Installation, API Documentation, Deployment, Running Project, Table of Contents
   ‚Ä¢ Hybrid user could see: ALL OF THE ABOVE (with sufficient top_k)

üéØ RECOMMENDATION WITH NUANCE:

Use Hybrid Search WITH sufficient top_k (8-10) because:

1. Vector search is great (78.9% coverage) but has blind spots
2. Text search fills those blind spots (21.1% unique content)
3. Hybrid combines both, but needs enough slots to show the combination
4. Real benefit: Users get COMPLETE answers, not just one perspective

‚ö†Ô∏è Implementation Note:
‚Ä¢ Set top_k=8-10 to actually see hybrid's advantage
‚Ä¢ Consider dynamic top_k based on query complexity
‚Ä¢ Monitor which method contributes most for different query types
""")

# Show final insight
final_homework_insight()

In [20]:
print("\n" + "="*80)
print("THE ULTIMATE PROOF: 'train model'")
print("="*80)

query = "train model"

print("\nüìÑ TEXT SEARCH USER sees:")
text_results = text_index.search(query, num_results=3)
for r in text_results:
    print(f"  ‚Ä¢ {r.get('header')}")

print("\nüß† VECTOR SEARCH USER sees:")
vector_results = vector_index.search(embedding_model.encode(query), num_results=3)
for r in vector_results:
    print(f"  ‚Ä¢ {r.get('header')}")

print("\nü§ù HYBRID SEARCH USER sees:")
hybrid_results = hybrid_search_fixed(query, text_index, vector_index, embedding_model, top_k=6)
for r in hybrid_results:
    print(f"  ‚Ä¢ {r.get('header')}")

print("\nüéØ CONCLUSION:")
print("Which user gets the most complete answer about training models?")
print("‚úÖ The HYBRID user!")


THE ULTIMATE PROOF: 'train model'

üìÑ TEXT SEARCH USER sees:
  ‚Ä¢ ## Model Performance
  ‚Ä¢ ## Running the Project
  ‚Ä¢ ## Project Structure

üß† VECTOR SEARCH USER sees:
  ‚Ä¢ ## Acknowledgments
  ‚Ä¢ ## Problem Description
  ‚Ä¢ ## Dataset

ü§ù HYBRID SEARCH USER sees:
  ‚Ä¢ ## Model Performance
  ‚Ä¢ ## Acknowledgments
  ‚Ä¢ ## Running the Project
  ‚Ä¢ ## Problem Description
  ‚Ä¢ ## Project Structure
  ‚Ä¢ ## Dataset

üéØ CONCLUSION:
Which user gets the most complete answer about training models?
‚úÖ The HYBRID user!


In [24]:
# 4
from pydantic_ai import Agent
import asyncio
from typing import List, Dict, Any

def search_workout_docs(query: str) -> list:
    """
    Search the workout recommendation system documentation for specific information.
    
    This function searches through all available documentation chunks (README, code files, etc.)
    to find relevant information about the repository.
    
    Args:
        query (str): The search query to use. Be specific with keywords.
                    Examples: "installation", "API endpoints", "machine learning model",
                    "docker deployment", "dataset source", "how to run tests"
    
    Returns:
        list: A list of search results, where each result contains:
            - 'section': The documentation section header
            - 'content': The relevant content from that section
            - 'file': The source filename
    
    Example:
        >>> results = search_workout_docs("installation")
        >>> print(f"Found {len(results)} installation-related sections")
    """
    print(f"üîç Searching for: '{query}'")
    
    results = hybrid_search_fixed(
        query=query,
        text_index=text_index,
        vector_index=vector_index,
        embedding_model=embedding_model,
        top_k=5
    )
    
    formatted = []
    for i, r in enumerate(results, 1):
        formatted.append({
            'number': i,
            'section': r.get('header', 'No section'),
            'content': r.get('chunk', '')[:400],
            'file': r.get('filename', '').split('/')[-1]
        })
    
    print(f"üìä Found {len(formatted)} results")
    return formatted

# Quick test
print("üß™ Testing search tool...")
test_results = search_workout_docs("installation")
print(f"‚úÖ Search tool works! Example result: {test_results[0]['section'] if test_results else 'No results'}")

üß™ Testing search tool...
üîç Searching for: 'installation'
üìä Found 5 results
‚úÖ Search tool works! Example result: ## Installation


In [22]:
# 4.1
system_prompt = """
You are a technical assistant specialized in understanding and explaining code repositories.

CRITICAL INSTRUCTION: When you need information, you MUST CALL the 'search_workout_docs' function.
DO NOT describe what you would search for - actually CALL the function.

How to use the tool:
1. When asked a question, determine the best search query
2. CALL search_workout_docs(query="your_search_terms_here")
3. Use the results to answer the question
4. Reference which sections you found the information in

Example of CORRECT behavior:
User: "How do I install this?"
Assistant: [CALLS search_workout_docs(query="installation")]
Assistant: "Based on the documentation, here are the installation steps..."

You have access to the 'search_workout_docs' function. USE IT.
"""

print(f"‚úÖ System prompt created ({len(system_prompt)} chars)")

‚úÖ System prompt created (767 chars)


In [None]:
from pydantic_ai.models.openai import OpenAIChatModel
from pydantic_ai.providers.openai import OpenAIProvider  # Import the Provider
from openai import AsyncOpenAI

# 1. Create an AsyncOpenAI client configured for OpenRouter
client = AsyncOpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key="",
)

# 2. Create an OpenAIProvider with the custom client [citation:2]
provider = OpenAIProvider(
    openai_client=client  # The client is passed to the Provider
)

# 3. Create the OpenRouter model, passing the provider [citation:2]
openrouter_model = OpenAIChatModel(
    model_name="google/gemma-3-27b-it:free",
    provider=provider  # Use the custom provider
)

# 4. Create the agent
workout_agent = Agent(
    name="RepoKnowledgeAssistant",
    instructions=system_prompt,
    tools=[search_workout_docs],
    model=openrouter_model,  # Use our custom model
    retries=1
)

print("‚úÖ Agent created successfully with custom OpenRouter provider!")
print(f"   Model: {openrouter_model.model_name} via OpenRouter")

‚úÖ Agent created successfully with custom OpenRouter provider!
   Model: google/gemma-3-27b-it:free via OpenRouter


In [51]:
async def quick_test():
    """Test the agent with one question"""
    question = "How do I install this system?"
    print(f"\nüë§ Question: {question}")
    
    print("ü§ñ Thinking...")
    result = await workout_agent.run(user_prompt=question)
    
    print(f"\nü§ñ Answer:")
    print("-" * 50)
    print(result)  # Just print the result object directly
    print("-" * 50)
    
    # Check if search was used
    for msg in result.new_messages():
        if hasattr(msg, 'tool_calls') and msg.tool_calls:
            print(f"‚úÖ Agent used search tool!")
            return True
    
    print("‚ö†Ô∏è  Agent did NOT use search tool")
    return False

print("Running quick test...")
search_used = await quick_test()
print(f"\nüéØ Test complete. Search used: {search_used}")

Running quick test...

üë§ Question: How do I install this system?
ü§ñ Thinking...

ü§ñ Answer:
--------------------------------------------------
AgentRunResult(output='```tool_code\n[search_workout_docs(query="installation")]\n```')
--------------------------------------------------
‚ö†Ô∏è  Agent did NOT use search tool

üéØ Test complete. Search used: False


In [55]:
async def simple_test():
    """Test with explicit instruction to use the tool"""
    question = "Please search for installation instructions and tell me what you find."
    
    print(f"üë§ Question: {question}")
    result = await workout_agent.run(user_prompt=question)
    
    print(f"\nü§ñ Response: {result}")
    
    # Check the conversation steps
    print("\nüîç Conversation steps:")
    for i, msg in enumerate(result.new_messages(), 1):
        print(f"\nStep {i}: {type(msg).__name__}")
        
        if hasattr(msg, 'tool_calls') and msg.tool_calls:
            print(f"  Tool called: {msg.tool_calls[0].name}")
            print(f"  Arguments: {msg.tool_calls[0].args}")
        
        if hasattr(msg, 'tool_return'):
            print(f"  Tool returned: {len(msg.tool_return)} items")

# Run the simple test
await simple_test()

üë§ Question: Please search for installation instructions and tell me what you find.

ü§ñ Response: AgentRunResult(output='```tool_code\n[search_workout_docs(query="installation")]\n```')

üîç Conversation steps:

Step 1: ModelRequest

Step 2: ModelResponse


In [29]:
async def simple_test():
    """Test with explicit instruction to use the tool"""
    question = "Please search for installation instructions and tell me what you find."
    
    print(f"üë§ Question: {question}")
    result = await workout_agent.run(user_prompt=question)
    
    print(f"\nü§ñ Response: {result}")
    
    # Check the conversation steps
    print("\nüîç Conversation steps:")
    for i, msg in enumerate(result.new_messages(), 1):
        print(f"\nStep {i}: {type(msg).__name__}")
        
        if hasattr(msg, 'tool_calls') and msg.tool_calls:
            # print(msg)
            # print(msg.tool_calls)
            print(f"  Tool called: {msg.tool_calls[0].tool_name}")
            print(f"  Arguments: {msg.tool_calls[0].args}")
        
        if hasattr(msg, 'tool_return'):
            print(f"  Tool returned: {len(msg.tool_return)} items")

# Run the simple test
await simple_test()

üë§ Question: Please search for installation instructions and tell me what you find.
üîç Searching for: 'installation'
üìä Found 5 results

ü§ñ Response: AgentRunResult(output='Based on the documentation, here are the installation steps:\n\n**Prerequisites:**\n\n*   Python 3.9+\n*   pip\n*   Docker (optional, for containerization)\n\n**Option 1: Local Setup with Virtual Environment**\n\n1.  Clone the repository: `git clone https://github.com/ilhamksyuriadi/workout-recommendation.git`\n2.  Navigate to the project directory: `cd workout-recommendation`\n3.  Create a virtual environment: `python -m venv venv`\n4.  Activate the virtual environment:\n    *   **Windows:** `venv\\Scripts\\activate`\n    *   **macOS/Linux:** `source venv`\n\n(Information found in section "## Installation" of the README.md file.)')

üîç Conversation steps:

Step 1: ModelRequest

Step 2: ModelResponse
  Tool called: search_workout_docs
  Arguments: {"query": "installation"}

Step 3: ModelRequest

Step 4: Mo

In [31]:
async def comprehensive_test():
    """Test various aspects of the project documentation"""
    test_scenarios = [
        ("Installation", "How do I set up this project from scratch?"),
        ("Configuration", "How do I configure the application?"),
        ("Usage", "How do I run the main application?"),
        ("Troubleshooting", "What should I do if I encounter errors?"),
        ("Architecture", "How is the project structured?"),
        ("Data", "What data does this project use and how is it processed?"),
    ]
    
    for category, question in test_scenarios:
        print(f"\n{'='*60}")
        print(f"Category: {category}")
        print(f"Question: {question}")
        print(f"{'='*60}")
        
        result = await workout_agent.run(user_prompt=question)
        print(f"\nAnswer: {result.output}")
        
        # Check if tool was used
        tool_used = any(
            hasattr(msg, 'tool_calls') and msg.tool_calls 
            for msg in result.new_messages()
        )
        print(f"\n‚úÖ Tool used: {tool_used}")

await comprehensive_test()


Category: Installation
Question: How do I set up this project from scratch?
üîç Searching for: 'setup project from scratch'
üìä Found 5 results

Answer: To set up this project from scratch, follow these steps based on the documentation:

1.  **Project Structure:** The project has a specific structure with `data` and `models` directories. The `data` directory contains the raw dataset (`gym_members_exercise_tracking.csv`), and the `models` directory will store preprocessed data (`data_prepared.pkl`), the best model from the notebook (`best_model.pkl`), and the final trained model (`final_model.pkl`) used by the prediction script. (Section: Project Structure, File: README.md)

2.  **Data Exploration (Optional):** You can start by exploring the data using a Jupyter notebook (`notebook.ipynb`). This notebook includes EDA, feature engineering, model training, evaluation, and visualizations. (Section: Running the Project, File: README.md)

3.  **Train the Model:**  Run the `train.py` scrip

In [36]:
# 5 evaluation
import json
import secrets
from pathlib import Path
from datetime import datetime

LOG_DIR = Path('logs')
LOG_DIR.mkdir(exist_ok=True)

async def test_simple_logging():
    """Just test logging with one question"""
    
    # Ask ONE simple question
    question = "How do I install this project?"
    print(f"üß™ Testing with: {question}")
    
    # Run agent
    result = await workout_agent.run(user_prompt=question)
    print(f"ü§ñ Agent response: {result.output[:100]}...")
    
    # Save to log file
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"test_log_{timestamp}.json"
    filepath = LOG_DIR / filename
    
    # Simple log (just save the basics)
    simple_log = {
        "timestamp": datetime.now().isoformat(),
        "question": question,
        "answer": result.output,
        "agent_name": "workout_agent_test"
    }
    
    with open(filepath, "w") as f:
        json.dump(simple_log, f, indent=2)
    
    print(f"‚úÖ Saved simple log to: {filename}")
    
    # Check file exists
    if filepath.exists():
        print(f"üìÅ File size: {filepath.stat().st_size} bytes")
        return True
    else:
        print("‚ùå File not created!")
        return False

# Run the test
print("=== STEP 1: TEST SIMPLE LOGGING ===")
logging_works = await test_simple_logging()

=== STEP 1: TEST SIMPLE LOGGING ===
üß™ Testing with: How do I install this project?
üîç Searching for: 'installation'
üìä Found 5 results
ü§ñ Agent response: Based on the documentation, here are the installation steps:

**Prerequisites:**

*   Python 3.9+
* ...
‚úÖ Saved simple log to: test_log_20251216_224214.json
üìÅ File size: 825 bytes


In [37]:
# Check logs directory
print("\n=== CHECKING LOGS DIRECTORY ===")
if LOG_DIR.exists():
    log_files = list(LOG_DIR.glob("*.json"))
    print(f"Found {len(log_files)} log files:")
    for file in log_files:
        print(f"  - {file.name}")
else:
    print("‚ùå Logs directory doesn't exist!")


=== CHECKING LOGS DIRECTORY ===
Found 2 log files:
  - test_log_20251216_224120.json
  - test_log_20251216_224214.json


In [39]:
from pydantic import BaseModel
from pydantic_ai import Agent

# Reuse the SAME model setup
eval_openrouter_model = OpenAIChatModel(
    model_name="google/gemma-3-27b-it:free",  # Same model
    provider=provider  # Same provider
)

# Simple evaluation schema
class SimpleEvaluation(BaseModel):
    is_relevant: bool
    used_tools: bool
    score: int  # 1-5

# Simple evaluation agent
simple_eval_agent = Agent(
    name='simple_evaluator',
    model=eval_openrouter_model,  # Use the same model setup
    instructions="""Judge if the answer is relevant to the question about a workout recommendation repository.
    
    IMPORTANT: The agent has access to a 'search_workout_docs' tool. 
    If the answer contains specific details from documentation (like "Based on the documentation", 
    "According to README.md", section references, file names, or specific technical details), 
    it likely used the search tool.
    
    Return true/false for relevance and tool usage, and a score from 1-5.""",
    output_type=SimpleEvaluation
)

print("‚úÖ Evaluation agent created with same OpenRouter setup")

‚úÖ Evaluation agent created with same OpenRouter setup


In [40]:
async def test_both_agents():
    """Test both agents work with OpenRouter"""
    
    print("=== TESTING BOTH AGENTS ===")
    
    # Test 1: Main agent
    print("\n1. Testing workout_agent...")
    try:
        result = await workout_agent.run(user_prompt="What is this project?")
        print(f"‚úÖ Main agent works: {result.output[:50]}...")
    except Exception as e:
        print(f"‚ùå Main agent error: {e}")
    
    # Test 2: Evaluation agent (with hardcoded example)
    print("\n2. Testing evaluation agent...")
    try:
        test_prompt = """
        Question: How do I install this project?
        
        Answer: To install, clone the repo and run pip install.
        
        Evaluate relevance, tool usage, and score.
        """
        
        eval_result = await simple_eval_agent.run(test_prompt)
        print(f"‚úÖ Evaluation agent works:")
        print(f"   Relevant: {eval_result.output.is_relevant}")
        print(f"   Used tools: {eval_result.output.used_tools}")
        print(f"   Score: {eval_result.output.score}/5")
    except Exception as e:
        print(f"‚ùå Evaluation agent error: {e}")

# Run test
await test_both_agents()

=== TESTING BOTH AGENTS ===

1. Testing workout_agent...
üîç Searching for: 'project overview'
üìä Found 5 results
‚úÖ Main agent works: This project is a workout recommendation system. I...

2. Testing evaluation agent...
‚úÖ Evaluation agent works:
   Relevant: True
   Used tools: False
   Score: 5/5


In [42]:
# Just 3 test questions to start
BASIC_TEST_QUESTIONS = [
    "How do I install this project?",
    "What dataset does this use?",
    "How do I run the API?"
]

async def test_with_3_questions_improved():
    """Test with actual tool usage checking"""
    
    print(f"\n=== TESTING WITH 3 QUESTIONS (IMPROVED) ===")
    
    for i, question in enumerate(BASIC_TEST_QUESTIONS, 1):
        print(f"\n[{i}/3] Q: {question}")
        
        # Run agent
        result = await workout_agent.run(user_prompt=question)
        print(f"   Response preview: {result.output[:80]}...")
        
        # ACTUALLY CHECK if tools were used
        tool_used = False
        for msg in result.new_messages():
            if hasattr(msg, 'tool_calls') and msg.tool_calls:
                tool_used = True
                print(f"   üîß ACTUAL tool calls: {len(msg.tool_calls)}")
        
        # Simple evaluation (just for relevance and score)
        eval_prompt = f"Question: {question}\nAnswer: {result.output[:300]}..."
        eval_result = await simple_eval_agent.run(eval_prompt)
        
        print(f"   ‚úÖ Relevant: {eval_result.output.is_relevant}")
        print(f"   ‚öôÔ∏è  Used tools (ACTUAL): {tool_used}")  # Real check!
        print(f"   ‚≠ê Score: {eval_result.output.score}/5")

# Test the improved version
print("\n=== STEP 4 IMPROVED: TEST WITH ACTUAL TOOL CHECKING ===")
await test_with_3_questions_improved()


=== STEP 4 IMPROVED: TEST WITH ACTUAL TOOL CHECKING ===

=== TESTING WITH 3 QUESTIONS (IMPROVED) ===

[1/3] Q: How do I install this project?
üîç Searching for: 'installation'
üìä Found 5 results
   Response preview: Based on the documentation, here are the installation steps:

**Prerequisites:**...
   üîß ACTUAL tool calls: 1
   ‚úÖ Relevant: True
   ‚öôÔ∏è  Used tools (ACTUAL): True
   ‚≠ê Score: 5/5

[2/3] Q: What dataset does this use?
üîç Searching for: 'dataset source'
üìä Found 5 results
   Response preview: This project uses the "Gym Members Exercise Dataset" from Kaggle ([https://www.k...
   üîß ACTUAL tool calls: 1
   ‚úÖ Relevant: True
   ‚öôÔ∏è  Used tools (ACTUAL): True
   ‚≠ê Score: 5/5

[3/3] Q: How do I run the API?
üîç Searching for: 'run API'
üìä Found 5 results
   Response preview: Based on the documentation, to run the API:

1.  **Base URL:** The API can be ac...
   üîß ACTUAL tool calls: 1
   ‚úÖ Relevant: True
   ‚öôÔ∏è  Used tools (ACTUAL): True
   ‚≠ê 

In [43]:
async def simple_evaluation_workflow_improved():
    """Evaluation workflow that actually checks tool usage"""
    
    print("\n" + "="*50)
    print("IMPROVED EVALUATION WORKFLOW")
    print("="*50)
    
    evaluations = []
    
    for i, question in enumerate(BASIC_TEST_QUESTIONS, 1):
        print(f"\n[{i}/{len(BASIC_TEST_QUESTIONS)}] Processing: {question}")
        
        # Run agent and get FULL result (with messages)
        result = await workout_agent.run(user_prompt=question)
        
        # ACTUAL tool usage check
        actual_tool_used = False
        tool_call_details = []
        
        for msg in result.new_messages():
            if hasattr(msg, 'tool_calls') and msg.tool_calls:
                actual_tool_used = True
                for tool_call in msg.tool_calls:
                    tool_name = getattr(tool_call, 'tool_name', 'unknown')
                    tool_call_details.append(f"{tool_name}({tool_call.args})")
        
        # Evaluation for relevance and quality
        eval_prompt = f"""
        Question: {question}
        Answer: {result.output[:400]}...
        
        Evaluate if this answer is relevant and helpful for someone asking about the workout repository.
        Score from 1-5.
        """
        
        eval_result = await simple_eval_agent.run(eval_prompt)
        
        evaluation = {
            "question": question,
            "score": eval_result.output.score,
            "relevant": eval_result.output.is_relevant,
            "used_tools_actual": actual_tool_used,  # REAL check
            "used_tools_guess": eval_result.output.used_tools,  # AI's guess
            "tool_calls": tool_call_details
        }
        evaluations.append(evaluation)
        
        print(f"   ‚úÖ Relevant: {evaluation['relevant']}")
        print(f"   üîß Tools used (ACTUAL): {evaluation['used_tools_actual']}")
        if evaluation['used_tools_actual']:
            print(f"   üìù Tool calls: {', '.join(evaluation['tool_calls'])}")
        print(f"   ‚≠ê Score: {evaluation['score']}/5")
    
    # Show results
    print("\n" + "="*50)
    print("FINAL RESULTS (WITH ACTUAL TOOL CHECKING)")
    print("="*50)
    
    avg_score = sum(e['score'] for e in evaluations) / len(evaluations)
    actual_tool_rate = sum(1 for e in evaluations if e['used_tools_actual']) / len(evaluations) * 100
    guessed_tool_rate = sum(1 for e in evaluations if e['used_tools_guess']) / len(evaluations) * 100
    
    print(f"\nüìä Metrics:")
    print(f"  Average Score: {avg_score:.2f}/5")
    print(f"  Actual Tool Usage Rate: {actual_tool_rate:.1f}%")
    print(f"  AI-Guessed Tool Usage: {guessed_tool_rate:.1f}%")
    
    print(f"\nüîç Tool Usage Accuracy:")
    correct_guesses = sum(1 for e in evaluations if e['used_tools_actual'] == e['used_tools_guess'])
    accuracy = correct_guesses / len(evaluations) * 100
    print(f"  AI correctly guessed tool usage: {accuracy:.1f}% of the time")
    
    return evaluations

# Run improved workflow
print("\n=== RUNNING IMPROVED WORKFLOW ===")
improved_results = await simple_evaluation_workflow_improved()


=== RUNNING IMPROVED WORKFLOW ===

IMPROVED EVALUATION WORKFLOW

[1/3] Processing: How do I install this project?
üîç Searching for: 'installation'
üìä Found 5 results


Traceback (most recent call last):
  File "D:\datatalksclub\AI Agent email course\aihero\course\.venv\Lib\site-packages\pydantic_ai\models\openai.py", line 556, in _completions_create
    return await self.client.chat.completions.create(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\datatalksclub\AI Agent email course\aihero\course\.venv\Lib\site-packages\openai\resources\chat\completions\completions.py", line 2678, in create
    return await self._post(
           ^^^^^^^^^^^^^^^^^
  File "D:\datatalksclub\AI Agent email course\aihero\course\.venv\Lib\site-packages\openai\_base_client.py", line 1794, in post
    return await self.request(cast_to, opts, stream=stream, stream_cls=stream_cls)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\datatalksclub\AI Agent email course\aihero\course\.venv\Lib\site-packages\openai\_base_client.py", line 1594, in request
    raise self._make_status_error_from_response(err.response) from

In [44]:
import os
from pathlib import Path

# Create app folder (as per docs)
app_folder = Path("app")
app_folder.mkdir(exist_ok=True)
print(f"‚úÖ Created: {app_folder}")

# Create pyproject.toml as per docs
pyproject_content = '''[project]
name = "doc-agent-app"
version = "0.1.0"
description = "Documentation Assistant Agent"
authors = [
    {name = "You", email = "you@example.com"}
]
dependencies = [
    "minsearch>=0.0.5",
    "openai>=1.108.2",
    "pydantic-ai==1.0.9",
    "python-frontmatter>=1.1.0",
    "requests>=2.32.5",
    "streamlit>=1.35.0",
]

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
'''

# Write pyproject.toml
(app_folder / "pyproject.toml").write_text(pyproject_content)
print(f"‚úÖ Created: app/pyproject.toml")

# Create other files as per docs
files_to_create = [
    "ingest.py",
    "search_tools.py",
    "search_agent.py",
    "logs.py",
    "app.py",
    "requirements.txt",
    "README.md"
]

print(f"\nüìã Files to create in app/ folder:")
for file in files_to_create:
    print(f"  - {file}")

‚úÖ Created: app
‚úÖ Created: app/pyproject.toml

üìã Files to create in app/ folder:
  - ingest.py
  - search_tools.py
  - search_agent.py
  - logs.py
  - app.py
  - requirements.txt
  - README.md
