In [1]:
import json
import pandas as pd
from typing import Dict, List, Optional, Union
from pydantic import BaseModel, Field
from datasets import Dataset
from bespokelabs import curator

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# ============================================================================
# Pydantic Models for Structured Outputs
# ============================================================================

class MolecularProperty(BaseModel):
    """Molecular property prediction with confidence and reasoning."""
    # Structured-output schema: PropertyPredictor assigns this model to its
    # `response_format`, and its `parse` method reads `predicted_value`,
    # `confidence`, `reasoning` and `key_structural_features` from the instance.
    # NOTE(review): the class docstring and the Field descriptions end up in the
    # JSON schema pydantic generates, so presumably they are shown to the model;
    # treat them as runtime text when editing -- TODO confirm how curator
    # forwards the schema.

    # Free-form name of the property the prediction refers to.
    property_name: str = Field(description="Name of the molecular property")
    # Numeric prediction; no unit or valid range is enforced by this model.
    predicted_value: float = Field(description="Predicted numerical value")
    # NOTE(review): typed as plain `str`, so "high/medium/low" is only suggested
    # by the description and is not validated.
    confidence: str = Field(description="Confidence level: high, medium, low")
    # Free-text justification accompanying the prediction.
    reasoning: str = Field(description="Chemical reasoning for the prediction")
    # List of short strings naming the structural features cited as relevant.
    key_structural_features: List[str] = Field(description="Important structural features affecting the property")


In [None]:
# ============================================================================
# Core Dataset Generators
# ============================================================================

class PropertyPredictor(curator.LLM):
    """Generates property prediction tasks with reasoning.

    Each input row is turned into a medicinal-chemistry prompt asking for a
    prediction of one property of one molecule. The structured response
    (``MolecularProperty``) is then flattened into a plain dict next to the
    reference value carried by the row.

    Row keys read by this class:
        canonical_smiles: required; inserted into the prompt as the SMILES.
        standard_type: optional; name of the property to predict. Falls back
            to ``'LogD'`` when missing, ``None`` or empty.
        standard_value: optional; passed through unchanged as ``actual_value``.
        compound_chembl_id: optional; defaults to ``''`` when missing.
    """

    response_format = MolecularProperty

    def prompt(self, input: Dict) -> str:
        """Build the prompt text for a single row.

        Args:
            input: Row mapping; must contain ``'canonical_smiles'``.

        Returns:
            The prompt asking for a prediction of the row's property type.

        Raises:
            KeyError: If ``'canonical_smiles'`` is not present in ``input``.
        """
        smiles = input['canonical_smiles']
        # `.get(key, default)` only covers a *missing* key. A row whose
        # `standard_type` is present but None/empty would otherwise be rendered
        # as "predict its None", so fall back on any falsy value.
        property_type = input.get('standard_type') or 'LogD'

        return f"""You are an expert medicinal chemistry research assistant. You have knowledge in areas including:

- Drug design and development - structure-activity relationships (SAR), lead optimization, pharmacophore modeling
- Synthetic chemistry - reaction mechanisms, synthetic routes, protecting group strategies
- Pharmacology - drug-target interactions, pharmacokinetics (ADME), pharmacodynamics
- Computational chemistry - molecular modeling, QSAR and ADME predictions
- Analytical techniques - NMR, MS, HPLC and other characterization methods
- Medicinal chemistry strategies - bioisosteres, prodrugs, fragment-based drug design
Analyze the following molecule and predict its {property_type}.

Molecule (SMILES): {smiles}

Provide a detailed prediction including:
1. The predicted {property_type} value
2. Your confidence level
3. Chemical reasoning based on structural features
4. Key structural features that influence this property

Consider factors like:
- Lipophilicity and hydrophilicity
- Molecular weight and size
- Functional groups present
- Aromatic systems
- Hydrogen bonding potential
- Charge distribution"""

    def parse(self, input: Dict, response: MolecularProperty) -> Dict:
        """Flatten one structured response into an output record.

        Args:
            input: The row that produced the prompt; must contain
                ``'canonical_smiles'``.
            response: The validated ``MolecularProperty`` for that row.

        Returns:
            A dict with the keys ``smiles``, ``actual_value``,
            ``predicted_value``, ``property_type``, ``confidence``,
            ``reasoning``, ``key_features`` and ``compound_id``.

        Raises:
            KeyError: If ``'canonical_smiles'`` is not present in ``input``.
        """
        return {
            "smiles": input['canonical_smiles'],
            # Reference value from the row; None when the row has none.
            "actual_value": input.get('standard_value'),
            "predicted_value": response.predicted_value,
            # Same fallback as in `prompt`, so the recorded property type
            # always matches the one the model was asked about.
            "property_type": input.get('standard_type') or 'LogD',
            "confidence": response.confidence,
            "reasoning": response.reasoning,
            "key_features": response.key_structural_features,
            "compound_id": input.get('compound_chembl_id', '')
        }
