In [12]:
import asyncio
import io
import json
import os
import re
import time
import zlib

# Import the new Google Generative AI SDK
import google.generativeai as genai
# Import nest_asyncio to allow running asyncio in environments with an existing event loop
import nest_asyncio
import pandas as pd

# Apply the patch to allow nested event loops. This has to run before the
# asyncio.run() call at the bottom of the notebook, which would otherwise be
# executed while a loop is (presumably) already running in the kernel.
nest_asyncio.apply()

In [13]:
# Inline CSV table: one row per gene (first column holds the gene name) and one
# column per cell ID. The numbers are presumably expression levels -- TODO
# confirm units/normalisation against the original data source.
pollen_data_csv = """
"","Hi_2338_1","Hi_GW21_1","Hi_2339_1"
"ADAM33",25.5,0.5,30.1
"CHI3L1",15.2,1.1,18.9
"GFAP",0.8,45.7,1.2
"SOX2",1.3,50.2,0.9
"KRT14",40.1,0.4,55.3
"COL1A1",35.6,0.7,42.8
"""

# Inline CSV table: one row per cell ID (matching the columns of the table
# above) with two annotation columns. load_and_process_data() reads
# `cell_type2` as the ground-truth label and discards `cell_type1`.
pollen_celldata_csv = """
"","cell_type1","cell_type2"
"Hi_2338_1","2338","dermal"
"Hi_GW21_1","GW21","neural"
"Hi_2339_1","2339","dermal"
"""

In [14]:
def load_and_process_data(gene_csv=None, celldata_csv=None):
    """
    Build the per-cell simulation records from the gene and cell-metadata CSVs.

    Args:
        gene_csv: CSV text with one row per gene and one column per cell ID.
            Defaults to the module-level ``pollen_data_csv``.
        celldata_csv: CSV text indexed by cell ID that provides the
            ``cell_type1`` and ``cell_type2`` columns. Defaults to the
            module-level ``pollen_celldata_csv``.

    Returns:
        A ``(dataset, cell_classes)`` tuple. ``dataset`` holds one dict per
        cell with the keys ``cell_id``, ``dataset_name``, ``marker_genes``
        (the two genes with the largest values for that cell),
        ``spatial_norm`` and ``true_cell_type``. ``cell_classes`` lists the
        distinct ``cell_type2`` values in order of first appearance.
    """
    if gene_csv is None:
        gene_csv = pollen_data_csv
    if celldata_csv is None:
        celldata_csv = pollen_celldata_csv

    # Read the CSV data into pandas DataFrames
    gene_df = pd.read_csv(io.StringIO(gene_csv), index_col=0)
    cell_df = pd.read_csv(io.StringIO(celldata_csv), index_col=0)

    # Transpose so that each row is a cell, then attach the annotations by cell ID
    full_df = gene_df.T.join(cell_df)

    dataset = []
    for cell_id, row in full_df.iterrows():
        # Find the top 2 genes for this cell
        values = row.drop(['cell_type1', 'cell_type2']).astype(float)
        top_genes = values.nlargest(2).index.tolist()

        # Simulate a spatial norm in [15.0, 20.0). The built-in hash() of a str
        # is salted per interpreter process, so it would give a different value
        # after every kernel restart; CRC32 keeps the simulation reproducible.
        norm_offset = zlib.crc32(str(cell_id).encode("utf-8")) % 5

        # Create the data entry for our simulation
        dataset.append({
            "cell_id": cell_id,
            "dataset_name": "PollenData",
            "marker_genes": top_genes,
            "spatial_norm": 15.0 + norm_offset,
            "true_cell_type": row['cell_type2']
        })
    return dataset, list(full_df['cell_type2'].unique())

In [15]:
# Read the Gemini API key from the environment rather than embedding it in the
# notebook: anything stored in source is visible to everyone who can read the file.
# NOTE(review): a key that has ever been committed here should be revoked and rotated.
API_KEY = os.environ.get("GOOGLE_API_KEY")
if not API_KEY:
    # Fail here with an actionable message instead of on the first API call.
    raise RuntimeError(
        "GOOGLE_API_KEY is not set. Export it before starting the kernel, "
        "e.g. `export GOOGLE_API_KEY=<your key>`."
    )
genai.configure(api_key=API_KEY)

In [16]:
async def call_gemini_with_sdk(prompt, json_schema=None, model_name='gemini-2.0-flash'):
    """
    Calls the Gemini API using the Python SDK.

    Args:
        prompt: Text passed to ``generate_content_async``.
        json_schema: Optional response schema. When truthy, the request asks
            for ``application/json`` output constrained by this schema and the
            reply is decoded with ``json.loads``.
        model_name: Name handed to ``genai.GenerativeModel``. Defaults to
            ``'gemini-2.0-flash'``.

    Returns:
        The decoded JSON value when ``json_schema`` is given, otherwise the
        raw ``response.text`` string. ``None`` is returned when any exception
        is raised while building the request, calling the API or decoding the
        reply; the error is printed rather than propagated.
    """
    try:
        # Initialize the model
        model = genai.GenerativeModel(model_name)

        # Set up generation config for JSON output if a schema is provided
        generation_config = None
        if json_schema:
            generation_config = genai.GenerationConfig(
                response_mime_type="application/json",
                response_schema=json_schema
            )

        print("--- Calling Gemini with SDK ---")
        # Generate content asynchronously
        response = await model.generate_content_async(
            prompt,
            generation_config=generation_config
        )

        # Even in JSON mode response.text is a plain string, so it still has
        # to be decoded here.
        if json_schema:
            return json.loads(response.text)
        return response.text

    except Exception as e:
        # Deliberate best-effort: callers treat None as "no usable answer".
        print(f"An error occurred with the Gemini SDK: {e}")
        return None

In [17]:
async def junior_bioinformatician(dataset_name, marker_genes, spatial_norm, cell_classes):
    """
    Ask Gemini, in the role of a junior computational biologist, for candidate cell types.

    Args:
        dataset_name: Dataset label interpolated into the prompt.
        marker_genes: Marker genes reported for the cell.
        spatial_norm: Number rendered with two decimals as the "GAE embedding norm".
        cell_classes: Candidate cell classes offered to the model.

    Returns:
        Whatever ``call_gemini_with_sdk`` returns for the structured request.
    """
    # Shape of one proposed label inside the reply.
    prediction_item = {
        "type": "object",
        "properties": {
            "cell_type": {"type": "string"},
            "confidence": {"type": "number"},
            "justification": {"type": "string"},
        },
        "required": ["cell_type", "confidence", "justification"],
    }

    # Top-level reply: an object wrapping the list of proposals.
    reply_schema = {
        "type": "object",
        "properties": {
            "predictions": {"type": "array", "items": prediction_item},
        },
        "required": ["predictions"],
    }

    prompt = f"""
    You are a junior computational biologist. Your goal is to generate the most probable cell type for a given cell.
    Provide a justification for each proposed label with clear biological reasoning.
    
    We are classifying a {dataset_name} cell using known marker genes and spatial context.

    Chain-of-thought:
    1. The cell expresses the following marker genes: {marker_genes}.
    2. GAE embedding norm is {spatial_norm:.2f}, indicating local neighborhood structure.
    3. Possible cell classes are: {cell_classes}

    Combining the marker genes, spatial distribution, and possible cell classes, generate the most probable cell class.
    Output your response in the required JSON format.
    """

    junior_reply = await call_gemini_with_sdk(prompt, json_schema=reply_schema)
    return junior_reply


In [18]:

def _resolve_cell_type(response, cell_classes):
    """Reduce the model's free-text reply to one label, preferring entries of ``cell_classes``."""
    if not response:
        return "Unclassified"

    # Same basic cleanup as before: trim whitespace and drop periods.
    cleaned = response.strip().replace(".", "")
    if not cleaned:
        # A reply made only of whitespace/periods carries no answer.
        return "Unclassified"

    # Exact (case-insensitive) match once quotes/markdown emphasis are removed.
    bare = cleaned.strip(" \t\r\n\"'`*").lower()
    for candidate in cell_classes:
        if str(candidate).lower() == bare:
            return str(candidate)

    # Otherwise accept a sentence that names exactly one known class as a whole word.
    mentioned = [
        str(candidate)
        for candidate in cell_classes
        if re.search(rf"(?<!\w){re.escape(str(candidate))}(?!\w)", cleaned, re.IGNORECASE)
    ]
    if len(mentioned) == 1:
        return mentioned[0]

    # Ambiguous or unknown answer: hand back the cleaned text unchanged.
    return cleaned


async def senior_bioinformatician(junior_output, cell_classes):
    """
    Simulates the Senior QA Bioinformatician by calling the Gemini LLM via the SDK.

    Args:
        junior_output: JSON-serialisable output of the junior agent; it is
            embedded in the prompt via ``json.dumps``.
        cell_classes: Candidate cell classes the final answer should come from.

    Returns:
        A string label. When the reply names exactly one entry of
        ``cell_classes`` (ignoring case, surrounding quotes/markdown and
        periods) that entry is returned; otherwise the trimmed reply with
        periods removed. ``"Unclassified"`` is returned when the model call
        produced nothing or only whitespace/periods.
    """
    junior_output_str = json.dumps(junior_output, indent=2)
    prompt = f"""
    You are a senior scientist and quality assurance expert with 15+ years of experience in transcriptomics.
    Your goal is to review and refine the output of the junior agent.
    
    Carefully evaluate the proposed cell types from the junior agent. Check the accuracy of gene marker claims, confidence score calibration, and biological plausibility.
    
    Junior Agent Output:
    {junior_output_str}

    Based on your expert review, provide the single, most accurate cell type classification as a one-word answer from the possible list: {cell_classes}.
    """

    response = await call_gemini_with_sdk(prompt)
    return _resolve_cell_type(response, cell_classes)


In [19]:

async def main():
    """
    Run the junior -> senior review loop over every cell and print each outcome.

    For each record produced by ``load_and_process_data`` the junior agent is
    asked for predictions; when it returns none the cell is skipped, otherwise
    the senior agent's label is printed next to the ground truth.
    """
    rule = '=' * 20

    # 1. Build the dataset and the list of candidate classes
    samples, class_names = load_and_process_data()
    cell_count = len(samples)
    print(f"Successfully loaded and processed data. Found {cell_count} cells and the following classes: {class_names}\n")

    # 2. Walk through the cells one at a time
    for cell in samples:
        print(f"\n{rule} Processing {cell['cell_id']} {rule}")

        # Junior agent proposes labels
        draft = await junior_bioinformatician(
            cell["dataset_name"],
            cell["marker_genes"],
            cell["spatial_norm"],
            class_names,
        )
        print("\n--- Junior Bioinformatician Output ---")
        print(json.dumps(draft, indent=2))

        # Nothing to review when the junior call failed or came back empty
        if not (draft and draft.get("predictions")):
            print("Junior agent failed to provide a valid prediction. Skipping.")
            continue

        # Senior agent settles on one label
        verdict = await senior_bioinformatician(draft, class_names)
        print("\n--- Senior Bioinformatician Final Decision ---")
        print(verdict)

        # 3. Report against the ground truth (case-insensitive comparison)
        print("\n--- Comparison ---")
        print(f"Ground Truth:     {cell['true_cell_type']}")
        print(f"Final Prediction: {verdict}")
        if cell['true_cell_type'].lower() == verdict.lower():
            mark = '✅'
        else:
            mark = '❌'
        print(f"Match: {mark}")

    print(f"\n{rule} Simulation Complete {'=' * 21}")

if __name__ == "__main__":
    # Use asyncio.run() to execute the main async function. Inside a notebook
    # this relies on the nest_asyncio.apply() call made right after the
    # imports, since an event loop is presumably already running in the kernel.
    asyncio.run(main())


Successfully loaded and processed data. Found 3 cells and the following classes: ['dermal', 'neural']


--- Calling Gemini with SDK ---

--- Junior Bioinformatician Output ---
{
  "predictions": [
    {
      "cell_type": "dermal",
      "confidence": 0.95,
      "justification": "The cell expresses KRT14 and COL1A1, which are characteristic markers for dermal cells, particularly keratinocytes and fibroblasts, respectively. KRT14 is a type I keratin expressed in the basal layer of the epidermis. COL1A1 encodes for collagen type I alpha 1 chain, a major component of the extracellular matrix in the dermis. Given these markers and the possible cell classes, 'dermal' is the most probable cell type."
    }
  ]
}
--- Calling Gemini with SDK ---

--- Senior Bioinformatician Final Decision ---
dermal

--- Comparison ---
Ground Truth:     dermal
Final Prediction: dermal
Match: ✅

--- Calling Gemini with SDK ---

--- Junior Bioinformatician Output ---
{
  "predictions": [
    {
      "cell_type"