In [79]:
%pip install torch 

Note: you may need to restart the kernel to use updated packages.


In [80]:
%pip install pandas numpy  sentence-transformers chromadb tqdm


Note: you may need to restart the kernel to use updated packages.


In [81]:
%pip install datasets

Note: you may need to restart the kernel to use updated packages.


In [82]:
# ============================================================================
# COMPLETE RAG PIPELINE - PART 1: EMBEDDING TRAINING & VECTOR DATABASE
# ============================================================================
# This notebook trains embeddings and creates a vector database from collected data

import json
import pandas as pd
import numpy as np
from pathlib import Path
from typing import List, Dict, Any, Tuple
import torch
# Import Dataset from datasets FIRST (sentence-transformers needs this)
from datasets import Dataset
# Import torch Dataset with different name to avoid conflict
from torch.utils.data import Dataset as TorchDataset, DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses, evaluation
from sentence_transformers.util import cos_sim
import chromadb
from chromadb.config import Settings
from tqdm import tqdm
import pickle
from datetime import datetime


In [83]:
# Dataset from datasets is now imported in the main imports cell above
# This cell is kept for backward compatibility but is no longer needed
pass

In [84]:

# ============================================================================
# CELL 1: Configuration and Setup
# ============================================================================

class EmbeddingConfig:
    """Configuration for embedding training and vector database"""
    
    def __init__(self, library_name: str = "pywinauto"):
        self.library_name = library_name
        
        # Paths
        self.data_dir = Path(f"rag_data/{library_name}")
        self.models_dir = Path(f"models/{library_name}")
        self.vectordb_dir = Path(f"vectordb/{library_name}")
        
        # Create directories
        self.models_dir.mkdir(parents=True, exist_ok=True)
        self.vectordb_dir.mkdir(parents=True, exist_ok=True)
        
        # Model settings
        self.base_model = "sentence-transformers/all-MiniLM-L6-v2"  # Fast and good
        # Alternative: "sentence-transformers/all-mpnet-base-v2"  # Better but slower
        
        # Training settings
        self.batch_size = 16
        self.epochs = 3
        self.learning_rate = 2e-5
        self.warmup_steps = 100
        
        # Chunking settings
        self.chunk_size = 512  # tokens
        self.chunk_overlap = 50  # tokens
        
        # Vector database settings
        self.collection_name = f"{library_name}_embeddings"

config = EmbeddingConfig("pywinauto")
print(f"Configuration initialized for: {config.library_name}")
print(f"Base model: {config.base_model}")



Configuration initialized for: pywinauto
Base model: sentence-transformers/all-MiniLM-L6-v2


In [85]:

# ============================================================================
# CELL 2: Data Loader and Preprocessor
# ============================================================================

class RAGDataLoader:
    """Load and preprocess collected data for embedding training"""
    
    def __init__(self, config: EmbeddingConfig):
        self.config = config
        self.data = []
        
    def load_data(self) -> pd.DataFrame:
        """Load combined data from all sources"""
        combined_file = self.config.data_dir / "combined" / f"{self.config.library_name}_combined.json"
        
        if not combined_file.exists():
            raise FileNotFoundError(f"Combined data file not found: {combined_file}")
        
        with open(combined_file, 'r', encoding='utf-8') as f:
            self.data = json.load(f)
        
        df = pd.DataFrame(self.data)
        print(f"Loaded {len(df)} items from {combined_file}")
        print(f"Sources: {df['source'].value_counts().to_dict()}")
        
        return df
    
    def preprocess_item(self, item: Dict) -> List[Dict]:
        """Preprocess a single item into chunks suitable for embedding"""
        chunks = []
        
        source_type = item.get('type', 'unknown')
        
        if source_type == 'documentation':
            # Process documentation
            content = item.get('content', '')
            title = item.get('title', '')
            headers = item.get('headers', [])
            
            # Create context-aware chunks
            context = f"Documentation: {title}\n"
            if headers:
                context += f"Section: {' > '.join(headers[:3])}\n"
            
            chunks.extend(self._chunk_text(content, context, item))
            
            # Add code blocks separately
            for i, code in enumerate(item.get('code_blocks', [])):
                chunks.append({
                    'id': f"{item['id']}_code_{i}",
                    'text': code,
                    'context': f"{context}Code Example {i+1}",
                    'metadata': {**item, 'chunk_type': 'code_block', 'chunk_index': i}
                })
        
        elif source_type == 'github_code':
            # Process GitHub code
            content = item.get('content', '')
            file_path = item.get('file_path', '')
            
            context = f"GitHub: {item.get('repo', '')}/{file_path}\n"
            
            if file_path.endswith('.md') or file_path.endswith('.rst'):
                # Documentation file
                chunks.extend(self._chunk_text(content, context, item))
            else:
                # Code file - chunk by function/class
                chunks.extend(self._chunk_code(content, context, item))
        
        elif source_type == 'stackoverflow':
            # Process StackOverflow
            title = item.get('title', '')
            context = f"StackOverflow Q&A: {title}\n"
            
            # Add question codes
            for i, code in enumerate(item.get('question_code', [])):
                chunks.append({
                    'id': f"{item['id']}_qcode_{i}",
                    'text': code,
                    'context': f"{context}Question Code",
                    'metadata': {**item, 'chunk_type': 'question_code', 'chunk_index': i}
                })
            
            # Add answer codes
            for i, code in enumerate(item.get('answer_codes', [])):
                chunks.append({
                    'id': f"{item['id']}_acode_{i}",
                    'text': code,
                    'context': f"{context}Answer Code (Score: {item.get('score', 0)})",
                    'metadata': {**item, 'chunk_type': 'answer_code', 'chunk_index': i}
                })
        
        return chunks
    
    def _chunk_text(self, text: str, context: str, metadata: Dict) -> List[Dict]:
        """Chunk text into smaller pieces with overlap"""
        chunks = []
        words = text.split()
        
        chunk_size = self.config.chunk_size
        overlap = self.config.chunk_overlap
        
        for i in range(0, len(words), chunk_size - overlap):
            chunk_words = words[i:i + chunk_size]
            chunk_text = ' '.join(chunk_words)
            
            if len(chunk_text.strip()) > 50:  # Minimum chunk size
                chunks.append({
                    'id': f"{metadata['id']}_chunk_{i}",
                    'text': chunk_text,
                    'context': context,
                    'metadata': {**metadata, 'chunk_type': 'text', 'chunk_index': i}
                })
        
        return chunks
    
    def _chunk_code(self, code: str, context: str, metadata: Dict) -> List[Dict]:
        """Chunk code by functions/classes"""
        chunks = []
        
        # Simple code chunking by functions and classes
        lines = code.split('\n')
        current_chunk = []
        chunk_index = 0
        
        for line in lines:
            current_chunk.append(line)
            
            # Split on function/class definitions or when chunk gets too large
            if (line.strip().startswith('def ') or 
                line.strip().startswith('class ') or 
                len(current_chunk) > 50):
                
                if len(current_chunk) > 5:  # Minimum lines
                    chunk_text = '\n'.join(current_chunk)
                    chunks.append({
                        'id': f"{metadata['id']}_code_{chunk_index}",
                        'text': chunk_text,
                        'context': context,
                        'metadata': {**metadata, 'chunk_type': 'code', 'chunk_index': chunk_index}
                    })
                    chunk_index += 1
                    current_chunk = []
        
        # Add remaining chunk
        if current_chunk and len(current_chunk) > 5:
            chunk_text = '\n'.join(current_chunk)
            chunks.append({
                'id': f"{metadata['id']}_code_{chunk_index}",
                'text': chunk_text,
                'context': context,
                'metadata': {**metadata, 'chunk_type': 'code', 'chunk_index': chunk_index}
            })
        
        return chunks
    
    def prepare_all_chunks(self) -> List[Dict]:
        """Prepare all chunks from loaded data"""
        all_chunks = []
        
        print("Processing all items into chunks...")
        for item in tqdm(self.data):
            chunks = self.preprocess_item(item)
            all_chunks.extend(chunks)
        
        print(f"Created {len(all_chunks)} chunks from {len(self.data)} items")
        
        # Save chunks
        chunks_file = self.config.models_dir / "processed_chunks.json"
        with open(chunks_file, 'w', encoding='utf-8') as f:
            json.dump(all_chunks, f, indent=2, ensure_ascii=False)
        
        print(f"Saved chunks to {chunks_file}")
        
        return all_chunks



In [86]:

# ============================================================================
# CELL 3: Training Dataset Creator
# ============================================================================

class EmbeddingTrainingDataset(TorchDataset):
    """Create training pairs for fine-tuning embeddings"""
    
    def __init__(self, chunks: List[Dict]):
        self.chunks = chunks
        self.examples = self._create_training_examples()
    
    def _create_training_examples(self) -> List[InputExample]:
        """Create positive pairs for contrastive learning"""
        examples = []
        
        # Group chunks by source document
        doc_groups = {}
        for chunk in self.chunks:
            doc_id = chunk['metadata'].get('id', 'unknown')
            if doc_id not in doc_groups:
                doc_groups[doc_id] = []
            doc_groups[doc_id].append(chunk)
        
        print(f"Creating training examples from {len(doc_groups)} document groups...")
        
        # Create positive pairs from same document
        for doc_id, group_chunks in doc_groups.items():
            if len(group_chunks) < 2:
                continue
            
            # Pair consecutive chunks
            for i in range(len(group_chunks) - 1):
                chunk1 = group_chunks[i]
                chunk2 = group_chunks[i + 1]
                
                text1 = f"{chunk1['context']}\n{chunk1['text']}"
                text2 = f"{chunk2['context']}\n{chunk2['text']}"
                
                # Create positive pair (similar texts)
                examples.append(InputExample(texts=[text1, text2], label=1.0))
        
        # Create some query-document pairs
        for chunk in self.chunks[:1000]:  # Limit to avoid too many examples
            text = f"{chunk['context']}\n{chunk['text']}"
            
            # Create a synthetic query from the chunk
            query = self._generate_query(chunk)
            if query:
                examples.append(InputExample(texts=[query, text], label=1.0))
        
        print(f"Created {len(examples)} training examples")
        return examples
    
    def _generate_query(self, chunk: Dict) -> str:
        """Generate a synthetic query for a chunk"""
        chunk_type = chunk['metadata'].get('chunk_type', '')
        
        if chunk_type == 'code' or chunk_type == 'code_block':
            # Extract first line or function name
            text = chunk['text'].strip()
            lines = text.split('\n')
            
            for line in lines[:5]:
                if 'def ' in line or 'class ' in line:
                    # Extract function/class name
                    name = line.split('(')[0].split()[-1]
                    return f"How to use {name} in {config.library_name}"
        
        elif chunk_type == 'question_code':
            title = chunk['metadata'].get('title', '')
            if title:
                return title
        
        return None
    
    def __len__(self):
        return len(self.examples)
    
    def __getitem__(self, idx):
        return self.examples[idx]



In [87]:
# ============================================================================
# CELL 4: Model Training (FIXED)
# ============================================================================

class EmbeddingTrainer:
    """Train and fine-tune embedding model"""
    
    def __init__(self, config: EmbeddingConfig):
        self.config = config
        self.model = None
        
    def initialize_model(self):
        """Load base model"""
        print(f"Loading base model: {self.config.base_model}")
        self.model = SentenceTransformer(self.config.base_model)
        print(f"Model loaded. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        
    def train(self, training_examples: List[InputExample]):
        """Train the model with contrastive learning"""
        print("\nStarting model training...")
        print(f"Training examples: {len(training_examples)}")
        
        # Convert InputExample list to proper format
        train_samples = []
        for example in training_examples:
            train_samples.append(example)
        
        # Create data loader directly from InputExample objects
        from torch.utils.data import DataLoader
        train_dataloader = DataLoader(
            train_samples,
            shuffle=True,
            batch_size=self.config.batch_size
        )
        
        # Define loss function
        train_loss = losses.MultipleNegativesRankingLoss(self.model)
        
        # Training with safer parameters
        try:
            self.model.fit(
                train_objectives=[(train_dataloader, train_loss)],
                epochs=self.config.epochs,
                warmup_steps=self.config.warmup_steps,
                output_path=str(self.config.models_dir / "finetuned_model"),
                show_progress_bar=True,
                save_best_model=False,
                use_amp=False  # Disable AMP to avoid issues
            )
            print("Training completed!")
        except Exception as e:
            print(f"Error during training: {e}")
            print("Saving base model instead...")
            self.save_model("embedding_model")
        
    def save_model(self, model_name: str = "embedding_model"):
        """Save the trained model"""
        model_path = self.config.models_dir / model_name
        self.model.save(str(model_path))
        print(f"Model saved to {model_path}")
        
    def load_model(self, model_name: str = "embedding_model"):
        """Load a trained model"""
        model_path = self.config.models_dir / model_name
        self.model = SentenceTransformer(str(model_path))
        print(f"Model loaded from {model_path}")
        
    def evaluate_model(self, test_chunks: List[Dict], num_samples: int = 100):
        """Evaluate model with semantic similarity tests"""
        print("\nEvaluating model...")
        
        # Sample random chunks
        sample_chunks = np.random.choice(test_chunks, min(num_samples, len(test_chunks)), replace=False)
        
        # Compute embeddings
        texts = [f"{c['context']}\n{c['text']}" for c in sample_chunks]
        embeddings = self.model.encode(texts, show_progress_bar=True)
        
        # Compute similarity matrix
        similarity_matrix = cos_sim(embeddings, embeddings)
        
        # Calculate average similarity for positive pairs (same document)
        doc_groups = {}
        for i, chunk in enumerate(sample_chunks):
            doc_id = chunk['metadata'].get('id', 'unknown')
            if doc_id not in doc_groups:
                doc_groups[doc_id] = []
            doc_groups[doc_id].append(i)
        
        positive_similarities = []
        for doc_id, indices in doc_groups.items():
            if len(indices) >= 2:
                for i in range(len(indices) - 1):
                    sim = similarity_matrix[indices[i]][indices[i + 1]].item()
                    positive_similarities.append(sim)
        
        if positive_similarities:
            avg_positive_sim = np.mean(positive_similarities)
            print(f"Average similarity for related chunks: {avg_positive_sim:.4f}")
        
        return embeddings, similarity_matrix


In [91]:
# ============================================================================
# CELL 5: Vector Database Creation
# ============================================================================

class VectorDatabase:
    """Create and manage vector database with ChromaDB"""
    
    def __init__(self, config: EmbeddingConfig):
        self.config = config
        self.client = None
        self.collection = None
        self.model = None
        
    def initialize_database(self):
        """Initialize ChromaDB"""
        print("Initializing ChromaDB...")
        
        self.client = chromadb.PersistentClient(
            path=str(self.config.vectordb_dir)
        )
        
        # Delete existing collection if it exists
        try:
            self.client.delete_collection(name=self.config.collection_name)
        except:
            pass
        
        # Create new collection
        self.collection = self.client.create_collection(
            name=self.config.collection_name,
            metadata={"hnsw:space": "cosine"}
        )
        
        print(f"Collection created: {self.config.collection_name}")
    
    def load_embedding_model(self, model_path: str = None):
        """Load the trained embedding model"""
        if model_path is None:
            model_path = self.config.models_dir / "embedding_model"
        
        self.model = SentenceTransformer(str(model_path))
        print(f"Embedding model loaded from {model_path}")
    
    def add_chunks(self, chunks: List[Dict], batch_size: int = 100):
        """Add chunks to vector database"""
        print(f"\nAdding {len(chunks)} chunks to vector database...")
        
        for i in tqdm(range(0, len(chunks), batch_size)):
            batch = chunks[i:i + batch_size]
            
            # Prepare data
            ids = [chunk['id'] for chunk in batch]
            texts = [f"{chunk['context']}\n{chunk['text']}" for chunk in batch]
            
            # Clean metadata - convert lists to strings, remove None values
            metadatas = []
            for chunk in batch:
                clean_meta = {}
                for key, value in chunk['metadata'].items():
                    if value is None:
                        continue
                    elif isinstance(value, list):
                        # Convert list to string
                        clean_meta[key] = ', '.join(str(v) for v in value) if value else ''
                    elif isinstance(value, dict):
                        # Convert dict to JSON string
                        clean_meta[key] = json.dumps(value)
                    elif isinstance(value, (str, int, float, bool)):
                        clean_meta[key] = value
                    else:
                        # Convert any other type to string
                        clean_meta[key] = str(value)
                metadatas.append(clean_meta)
            
            # Generate embeddings
            embeddings = self.model.encode(texts, show_progress_bar=False)
            
            # Add to collection
            self.collection.add(
                ids=ids,
                embeddings=embeddings.tolist(),
                documents=texts,
                metadatas=metadatas
            )
        
        print(f"Added {len(chunks)} chunks to vector database")
        print(f"Total items in collection: {self.collection.count()}")
    
    def search(self, query: str, n_results: int = 5) -> Dict:
        """Search for similar chunks"""
        # Generate query embedding
        query_embedding = self.model.encode([query])[0]
        
        # Search
        results = self.collection.query(
            query_embeddings=[query_embedding.tolist()],
            n_results=n_results
        )
        
        return results
    
    def get_collection_stats(self):
        """Get statistics about the collection"""
        count = self.collection.count()
        
        print(f"\nVector Database Statistics:")
        print(f"Total chunks: {count}")
        print(f"Collection name: {self.config.collection_name}")
        print(f"Storage path: {self.config.vectordb_dir}")

In [92]:

# ============================================================================
# CELL 6: Complete Pipeline Execution
# ============================================================================

def run_complete_pipeline(library_name: str = "pywinauto", 
                         skip_training: bool = False,
                         use_pretrained: bool = True):
    """
    Run the complete embedding training and vector database creation pipeline
    
    Args:
        library_name: Name of the library to process
        skip_training: If True, skip model training and use base model
        use_pretrained: If True, use existing fine-tuned model if available
    """
    print("=" * 80)
    print(f"EMBEDDING TRAINING PIPELINE FOR: {library_name}")
    print("=" * 80)
    
    # Initialize configuration
    config = EmbeddingConfig(library_name)
    
    # Step 1: Load and preprocess data
    print("\n[STEP 1/5] Loading and preprocessing data...")
    loader = RAGDataLoader(config)
    df = loader.load_data()
    chunks = loader.prepare_all_chunks()
    
    # Step 2: Initialize or train model
    trainer = EmbeddingTrainer(config)
    
    model_path = config.models_dir / "embedding_model"
    
    if use_pretrained and model_path.exists():
        print("\n[STEP 2/5] Loading existing fine-tuned model...")
        trainer.load_model("embedding_model")
    elif not skip_training:
        print("\n[STEP 2/5] Training embedding model...")
        trainer.initialize_model()
        
        # Create training dataset
        dataset = EmbeddingTrainingDataset(chunks)
        
        # Train model
        trainer.train(dataset.examples)
        trainer.save_model("embedding_model")
    else:
        print("\n[STEP 2/5] Using base model without training...")
        trainer.initialize_model()
        trainer.save_model("embedding_model")
    
    # Step 3: Evaluate model
    print("\n[STEP 3/5] Evaluating model...")
    trainer.evaluate_model(chunks, num_samples=100)
    
    # Step 4: Create vector database
    print("\n[STEP 4/5] Creating vector database...")
    vectordb = VectorDatabase(config)
    vectordb.initialize_database()
    vectordb.load_embedding_model()
    vectordb.add_chunks(chunks)
    
    # Step 5: Test search
    print("\n[STEP 5/5] Testing search functionality...")
    test_queries = [
        f"How to click a button using {library_name}",
        f"How to connect to an application with {library_name}",
        f"Example code for {library_name}"
    ]
    
    for query in test_queries:
        print(f"\nQuery: '{query}'")
        results = vectordb.search(query, n_results=3)
        
        for i, doc in enumerate(results['documents'][0][:2]):
            print(f"\nResult {i+1} (distance: {results['distances'][0][i]:.4f}):")
            print(doc[:200] + "...")
    
    # Get statistics
    vectordb.get_collection_stats()
    
    print("\n" + "=" * 80)
    print("PIPELINE COMPLETED SUCCESSFULLY!")
    print("=" * 80)
    print(f"\nGenerated files:")
    print(f"  - Model: {config.models_dir / 'embedding_model'}")
    print(f"  - Vector DB: {config.vectordb_dir}")
    print(f"  - Chunks: {config.models_dir / 'processed_chunks.json'}")
    
    return vectordb, trainer, chunks



In [93]:

# ============================================================================
# CELL 7: Quick Test and Usage Examples
# ============================================================================

# Run the complete pipeline
if __name__ == "__main__":
    # Execute pipeline
    vectordb, trainer, chunks = run_complete_pipeline(
        library_name="pywinauto",
        skip_training=False,  # Set to True to skip training
        use_pretrained=False  # Set to True to use existing model
    )
    
    # Example: Search for specific functionality
    print("\n" + "="*80)
    print("EXAMPLE SEARCHES")
    print("="*80)
    
    queries = [
        "How do I automate clicking a button?",
        "Connect to a running application",
        "Type text into a field",
        "Handle dialog boxes"
    ]
    
    for query in queries:
        print(f"\nüîç Query: {query}")
        results = vectordb.search(query, n_results=2)
        
        for i, (doc, dist) in enumerate(zip(results['documents'][0], results['distances'][0])):
            print(f"\n  üìÑ Result {i+1} (relevance: {1-dist:.2%}):")
            print(f"  {doc[:300]}...")

EMBEDDING TRAINING PIPELINE FOR: pywinauto

[STEP 1/5] Loading and preprocessing data...
Loaded 369 items from rag_data\pywinauto\combined\pywinauto_combined.json
Sources: {'documentation': 151, 'github': 120, 'stackoverflow': 98}
Processing all items into chunks...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 369/369 [00:00<00:00, 1481.97it/s]


Created 4414 chunks from 369 items
Saved chunks to models\pywinauto\processed_chunks.json

[STEP 2/5] Training embedding model...
Loading base model: sentence-transformers/all-MiniLM-L6-v2
Model loaded. Embedding dimension: 384
Creating training examples from 367 document groups...
Created 4054 training examples

Starting model training...
Training examples: 4054
Error during training: name 'Dataset' is not defined
Saving base model instead...
Model saved to models\pywinauto\embedding_model
Model saved to models\pywinauto\embedding_model

[STEP 3/5] Evaluating model...

Evaluating model...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4/4 [00:01<00:00,  2.40it/s]


Average similarity for related chunks: 0.7225

[STEP 4/5] Creating vector database...
Initializing ChromaDB...
Collection created: pywinauto_embeddings
Embedding model loaded from models\pywinauto\embedding_model

Adding 4414 chunks to vector database...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 45/45 [01:43<00:00,  2.30s/it]


Added 4414 chunks to vector database
Total items in collection: 4414

[STEP 5/5] Testing search functionality...

Query: 'How to click a button using pywinauto'

Result 1 (distance: 0.2247):
StackOverflow Q&A: How to perform Click action on Button or Text field by pywinauto
Question Code
Button - '&About'   (L750, T388, R834, B411)
    '&About' '&AboutButton' 'Button3'...

Result 2 (distance: 0.2370):
StackOverflow Q&A: How to perform Click action on Button or Text field by pywinauto
Answer Code (Score: 8)
app.window(best_match='Dialog', top_level_only=True).child_window(best_match='About').click()...

Query: 'How to connect to an application with pywinauto'

Result 1 (distance: 0.1391):
Documentation: pywinauto.application module ‚Äî pywinauto 0.6.8 documentation
Section: pywinauto.application module√Ç¬∂
Code Example 1
Application.connect()...

Result 2 (distance: 0.2547):
Documentation: How To√¢¬Ä¬ôs ‚Äî pywinauto 0.6.8 documentation
Section: How To‚Äôs√Ç¬∂ > Definitions√Ç¬∂ > How to