In [6]:
%pip install openai nltk yake textgrad rouge-score


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
from openai import OpenAI
from textgrad.engine.local_model_openai_api import ChatExternalClient
import textgrad as tg
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
import time
from datetime import datetime, timedelta
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
import itertools
from threading import Lock
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
import yake
import logging

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Ensure NLTK punkt is downloaded
def ensure_punkt():
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        logging.info("Downloading NLTK 'punkt' tokenizer...")
        nltk.download('punkt')
        

ensure_punkt()

nltk.download('punkt_tab')

# Rate limiting constants
RATE_LIMIT = 30  # requests per minute per API key
DELAY_BETWEEN_REQUESTS = 60 / RATE_LIMIT  # seconds between requests
BATCH_SIZE = 25  # Slightly less than rate limit to account for overhead
API_KEYS = [
    "2405b5bd-7cf3-4536-b632-5bb16db66f34",
    "1bac03be-d6ac-4b73-9aea-9ae7bd81a793",
    "9718ad80-530a-4719-a677-05f06d144eff",
    "7aad8b10-3d4e-4be0-b96e-5de389ae85b3"
]
api_key_cycle = itertools.cycle(API_KEYS)
key_lock = Lock()

def calculate_metrics(reference, candidate):
    metrics = {}
    
    # Ensure NLTK punkt is available
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        logging.info("Downloading NLTK 'punkt' tokenizer within thread...")
        nltk.download('punkt')
    
    # Content Similarity using TF-IDF Cosine Similarity
    vectorizer = TfidfVectorizer()
    try:
        tfidf_matrix = vectorizer.fit_transform([reference, candidate])
        metrics['content_similarity'] = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
    except Exception as e:
        logging.error(f"TF-IDF Cosine Similarity failed: {e}")
        metrics['content_similarity'] = 0

    # Word Overlap
    try:
        ref_words = set(reference.lower().split())
        cand_words = set(candidate.lower().split())
        metrics['word_overlap'] = len(ref_words.intersection(cand_words)) / len(ref_words) if len(ref_words) > 0 else 0
    except Exception as e:
        logging.error(f"Word Overlap calculation failed: {e}")
        metrics['word_overlap'] = 0

    # BLEU Score
    try:
        smoothie = SmoothingFunction().method4
        reference_tokens = nltk.word_tokenize(reference.lower())
        candidate_tokens = nltk.word_tokenize(candidate.lower())
        metrics['bleu_score'] = sentence_bleu([reference_tokens], candidate_tokens, smoothing_function=smoothie)
    except Exception as e:
        logging.error(f"BLEU Score calculation failed: {e}")
        metrics['bleu_score'] = 0

    # ROUGE Scores
    try:
        scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
        rouge_scores = scorer.score(reference, candidate)
        metrics['rouge1'] = rouge_scores['rouge1'].fmeasure
        metrics['rouge2'] = rouge_scores['rouge2'].fmeasure
        metrics['rougeL'] = rouge_scores['rougeL'].fmeasure
    except Exception as e:
        logging.error(f"ROUGE Scores calculation failed: {e}")
        metrics['rouge1'] = metrics['rouge2'] = metrics['rougeL'] = 0

    # Keyword Matching
    try:
        kw_extractor = yake.KeywordExtractor()
        ref_keywords = {kw[0] for kw in kw_extractor.extract_keywords(reference)}
        cand_keywords = {kw[0] for kw in kw_extractor.extract_keywords(candidate)}
        metrics['keyword_overlap'] = len(ref_keywords.intersection(cand_keywords)) / len(ref_keywords) if len(ref_keywords) > 0 else 0
    except Exception as e:
        logging.error(f"Keyword Matching calculation failed: {e}")
        metrics['keyword_overlap'] = 0

    return metrics

def get_gpt_response(client, prompt, retry_count=3):
    """Get GPT response with rate limiting and character limit"""
    system_prompt = """You are a medical expert assistant. Provide accurate, clear, and well-structured medical advice.
Focus on: accuracy, clear explanation, practical advice, and professional yet accessible language.
Please limit your response to a maximum of 1500 characters."""

    for attempt in range(retry_count):
        try:
            time.sleep(DELAY_BETWEEN_REQUESTS)  # Rate limiting
            response = client.chat.completions.create(
                model="Meta-Llama-3.1-70B-Instruct",
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.7,
                max_tokens=400  # Approximate to 1500 characters
            )
            return response.choices[0].message.content
        except Exception as e:
            logging.warning(f"Attempt {attempt + 1} failed: {e}")
            if attempt < retry_count - 1:
                time.sleep(5)  # Wait longer between retries
    logging.error("All retry attempts failed for GPT response.")
    return ""

def estimate_completion_time(total_samples):
    """Calculate estimated completion time based on rate limits"""
    # Each sample needs 2 API calls (GPT and TextGrad)
    total_requests = total_samples * 2

    # Calculate total minutes needed
    total_minutes = total_requests / RATE_LIMIT

    # Add 20% buffer for overhead and potential retries
    total_minutes *= 1.2

    return timedelta(minutes=total_minutes)

def limit_response(response, limit=1500):
    """Truncate the response to the specified character limit"""
    return response[:limit] if len(response) > limit else response

def process_sample(row, start_time):
    batch_results = {}
    try:
        with key_lock:
            api_key = next(api_key_cycle)
        client = OpenAI(base_url="https://api.sambanova.ai/v1", api_key=api_key)
        engine = ChatExternalClient(client=client, model_string='Meta-Llama-3.1-70B-Instruct')
        tg.set_backward_engine(engine, override=True)

        loss_system_prompt = tg.Variable(
        """Evaluate the medical response based on the following criteria:
        - Accuracy: Ensure all information is factually correct and evidence-based.
        - Clarity: The response should be clear, concise, and easy to understand.
        - Completeness: Address all aspects of the medical query comprehensively.
        - Practicality: Provide actionable and practical advice that can be implemented.
        - Professionalism: Maintain a professional tone and uphold medical ethical standards.
        - Relevance: Ensure all information provided is directly related to the query.
        - Consistency: Maintain consistency in terminology and presentation throughout the response.
        - No Questions: The response should not contain any questions or prompts for additional information.
       """,
            requires_grad=False,
            role_description="medical evaluation system"
        )
        # Get GPT response
        gpt_response = get_gpt_response(client, row['input'])

        if not gpt_response:
            logging.error(f"GPT response for index {row.name} is empty.")
            return {}

        # TextGrad optimization (includes its own API call)
        solution = tg.Variable(row['output'], requires_grad=True, role_description="medical response")
        loss_fn = tg.TextLoss(loss_system_prompt)
        optimizer = tg.TGD([solution])
        loss = loss_fn(solution)
        loss.backward()
        optimizer.step()
        textgrad_response = solution.value

        # Limit TextGrad response to 1500 characters
        textgrad_response = limit_response(textgrad_response, 1500)

        # Calculate metrics
        metrics_gpt = calculate_metrics(row['output'], gpt_response)
        metrics_textgrad = calculate_metrics(row['output'], textgrad_response)

        batch_results = {
            'index': row.name,
            'responses': {
                'gpt': gpt_response,
                'textgrad': textgrad_response
            },
            'metrics': {
                'gpt': metrics_gpt,
                'textgrad': metrics_textgrad
            }
        }

        # Show time progress
        elapsed = datetime.now() - start_time
        samples_processed = row.name + 1
        logging.info(f"Progress: {samples_processed} samples processed in {elapsed}")

    except Exception as e:
        logging.error(f"Error processing index {row.name}: {e}")

    return batch_results

def main():
    print("Loading dataset...")
    try:
        df = pd.read_parquet(r'/workspaces/codespaces-jupyter/data/train-00000-of-00001-5e7cb295b9cff0bf.parquet').head(200)
    except Exception as e:
        logging.error(f"Failed to load dataset: {e}")
        return

    total_samples = len(df)

    # Calculate and display time estimate
    estimated_duration = estimate_completion_time(total_samples)
    start_time = datetime.now()
    estimated_completion = start_time + estimated_duration

    print(f"\nProcessing {total_samples} samples:")
    print(f"Start time: {start_time.strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"Estimated completion: {estimated_completion.strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"Estimated duration: {estimated_duration}")

    all_results = []

    try:
        with ThreadPoolExecutor(max_workers=4) as executor:
            future_to_index = {executor.submit(process_sample, row, start_time): idx for idx, row in df.iterrows()}
            for future in tqdm(as_completed(future_to_index), total=total_samples, desc="Processing samples"):
                result = future.result()
                if result and result.get('metrics'):
                    all_results.append(result)
    except KeyboardInterrupt:
        print("\nProcess interrupted by user. Saving partial results...")
    except Exception as e:
        logging.error(f"An unexpected error occurred: {e}")
    finally:
        # Save final results
        final_results = {
            'metadata': {
                'total_samples_processed': len(all_results),
                'start_time': start_time.isoformat(),
                'end_time': datetime.now().isoformat(),
                'actual_duration': str(datetime.now() - start_time),
            },
            'results': all_results
        }

        try:
            with open('final_results.json', 'w') as f:
                json.dump(final_results, f, indent=2)
            logging.info("Results saved to 'final_results.json'")
        except Exception as e:
            logging.error(f"Failed to save results: {e}")

if __name__ == "__main__":
    main()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Loading dataset...

Processing 200 samples:
Start time: 2024-11-16 17:05:18
Estimated completion: 2024-11-16 17:21:18
Estimated duration: 0:16:00


Processing samples:   0%|          | 0/200 [00:00<?, ?it/s]2024-11-16 17:05:22,640 - INFO - HTTP Request: POST https://api.sambanova.ai/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-16 17:05:22,642 - INFO - LLMCall function forward
2024-11-16 17:05:22,643 - INFO - _backward_through_llm prompt
2024-11-16 17:05:22,644 - INFO - _backward_through_llm gradient
2024-11-16 17:05:22,645 - INFO - TextualGradientDescent prompt for update
2024-11-16 17:05:22,645 - INFO - TextualGradientDescent optimizer response
2024-11-16 17:05:22,646 - INFO - TextualGradientDescent updated text
2024-11-16 17:05:22,652 - INFO - Using default tokenizer.
2024-11-16 17:05:22,725 - INFO - Using default tokenizer.
2024-11-16 17:05:22,818 - INFO - Progress: 3 samples processed in 0:00:04.200147
2024-11-16 17:05:22,819 - INFO - HTTP Request: POST https://api.sambanova.ai/v1/chat/completions "HTTP/1.1 200 OK"
Processing samples:   0%|          | 1/200 [00:04<13:52,  4.18s/it]2024-11-16 17:05:22,822 - INFO - LLMCall func