In [1]:
import pandas as pd
import json
import torch
import re
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM

In [None]:
# ==========================================
# 1. SETUP & MODEL LOADING
# ==========================================
LOCAL_QWEN_PATH = "**/Phase2/models_phase2/Qwen2.5-1.5B-Instruct/Qwen2.5-1.5B-Instruct_downloaded"

# Defining DEVICE (Your function uses 'DEVICE' in caps)
DEVICE = "mps" if torch.backends.mps.is_available() else "cpu"
print(f"üöÄ Device: {DEVICE}")

tokenizer = AutoTokenizer.from_pretrained(LOCAL_QWEN_PATH, local_files_only=True)
if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    LOCAL_QWEN_PATH,
    device_map=DEVICE,
    torch_dtype=torch.float32,
    trust_remote_code=True,
    local_files_only=True
)


`torch_dtype` is deprecated! Use `dtype` instead!


üöÄ Device: mps


In [3]:
# ==========================================
# 1. TOP 20 MARKER GENES (EVIDENCE 0)
# ==========================================

TOP_20_MARKERS = {
    1: "IL7R, CCL19, CD2, CCR7, ITK, TRBC2, GZMK, CD3E, FCN1, CD40LG, TRAC, CTLA4, SELL, FOXP3, CD163, GIMAP7, TIGIT, CD4, CXCR4, CCL5",
    2: "ITGB6, KRT17, LCN2, MLPH, CTTNBP2, CTNND2, CXCL1, TACSTD2, DEFB1, AGR2, TSPAN13, KIF26B, ANXA13, RARRES1, TMC5, FGG, UGT1A1, RERG, CXCL3, SNAP25",
    3: "ADH4, SULT2A1, ARG1, SERPINC1, HAMP, AHSG, AGXT, CYP8B1, SERPINA11, AFM, HAO2, SPP2, HAO1, PCK1, ADH1A, ALDOB, APOF, F13B, F2, ITIH1",
    4: "APCDD1, CHST4, TACSTD2, ALKAL2, KIF12, SLC44A3, F5, FGFR1, FGFR2, CXCL8, NUAK2, PKHD1, FAM171A1, GLIS3, TESC, DCDC2, ANXA9, FXYD2, BICC1, ITGB8",
    5: "KIT, NGFR, ARHGAP24, MFAP4, DSEL, WNK2, FGFR3, EVC, CD44, CHST9, F5, SLC12A2, AQP1, ANXA4, ATP1A1, FTH1, ITGB1, TMSB10, HSP90B1, SOX4",
    6: "CFHR5, COCH, CFHR4, CALB2, GDF15, RGS4, F2, EPHB6, FGFR1, MMP15, CFH, CXCL8, FBLN1, F5, SLC12A2, TACSTD2, FGFR4, ANXA9, HOMER2, GJB1",
    7: "KCNJ16, PDGFRA, CTNND2, HNF1B, CX3CL1, APCDD1, UGT1A1, SOX6, CDH2, ANXA9, WNK2, CDH6, C5, PDGFD, NEO1, FXYD2, CCL2, ABCC4, CHST9, S100A10",
    8: "AKR1B10, TMC5, UGT1A1, GDF15, MUC20, GPX2, CXCL8, TSPAN13, CRYAB, AGR2, S100A13, MAP2, FGA, AKR1C1, TALDO1, TACSTD2, FGG, CHST4, SLPI, POSTN",
    9: "RERG, ADH6, ARSE, RGS4, COCH, CYP4A11, SMIM24, GCNT3, HUNK, NOVA1, GCNT4, KCNJ16, UNC5CL, CCL20, MAP2, OGDHL, FAM149A, BAMBI, SNAP25, CX3CL1",
    10: "IER5, CRYAB, NUAK2, PDGFRA, TSPAN1, FXYD2, C4BPA, CYP4A11, VEPH1, ANXA13, SLPI, CYP3A5, CYP3A4, S100A13, CX3CL1, ARSE, ANXA9, PAQR5, ANPEP, CHST9",
    11: "ACE2, FGB, CCL20, MLPH, FGA, LOXL4, TMEM156, FGG, ALKAL2, TFR2, CHST4, GPX2, SMIM24, MUC1, VEPH1, C5, HOMER2, CXCL2, TMC5, SERPINA6",
    12: "C9ORF152, CALB2, ANXA3, FGA, ALKAL2, TMEM156, ARHGAP24, C5, SLC12A2, CHST9, SH3YL1, WNK2, ABCC4, TMC4, HOMER2, DCDC2, VCAN, ALDH1A1, FGG, FGFR3",
    13: "FBLN1, TRIM45, CXCL3, TMC5, CRYAB, VEPH1, HUNK, HKDC1, SLC44A3, SEMA6A, SPNS2, CXCL2, TUT4, CXCL12, CXCL1, SLPI, SERPINA6, EVC, ELF3, SEMA4G",
    14: "ALDOB, SMIM24, TRIM45, RGS4, TFR2, OGDHL, GCNT4, G6PC, GPT, SLC5A9, AFF3, CDH6, MUC13, SMAD5, IFI27, SDCBP2, AKR1B10, RIC8B, GDF15, SLC17A4",
    15: "CALCA, AKR1B10, SPNS2, UGT1A1, COCH, RGS4, NCAM1, CYP4A11, ELOVL7, MYRF, ALDH1A1, SLC5A9, ANXA9, CTTNBP2, GCNT4, CX3CL1, DSEL, FXYD2, PDGFD, HKDC1",
    16: "HMGCS2, NCAM1, VSIG1, IL1RN, CAPN8, TFF1, APOA5, BCL2L15, DUOX2, HPGD, ACSL5, UGT2B10, G6PC, MMP15, RORC, TM4SF5, GJB2, BATF, PPARG, RAP1GAP",
    17: "CALB1, AGR3, MYB, SDR16C5, PLAC8, S100P, GALNT5, ST6GALNAC1, SLC2A1, AGR2, CAPN8, MSLN, TFF2, CDA, SLCO1B3, TM4SF5, GCNT3, C15ORF48, GJB2, IFI27",
    18: "MUC3A, CALCA, DUOX2, HPGD, GCNT3, ANXA13, KRT19, CDH6, ITGB1, BAMBI, VSIG1, IGFBP5, CFH, TSPAN1, MUC1, FTH1, HK2, TMC5, JPT1, TCN1",
    19: "CXCR4, TESC, CXCL8, FTH1, FGG, CDH1, FCGR3A, CD24, TACSTD2, VIM, C15ORF48, FGA, C1QC, CLU, FGFR2, FGB, IGFBP5, SERPINA6, ANPEP, CTSB",
    20: "DUOX2, LCN2, AGXT2, C4BPA, LIPC, APOD, PROM1, CYP4A11, ACE2, FCGBP, KCNJ16, MUC20, DSEL, CHST4, AKR1C3, CYP3A5, UNC5CL, ANPEP, FXYD2, CCL2",
    21: "ERN2, MS4A8, ST6GALNAC1, SLCO1B3, AGR3, S100P, MYB, KCNN4, TFF2, GALNT5, SDR16C5, TFF1, AGR2, GCNT3, DUOX2, RARRES1, FUT2, SLC2A1, TFF3, CAPN8",
    22: "MSLN, KRT17, ITGB6, VSIG1, KIF26B, PTGDS, MUC1, MRS2, CDH2, CFH, MAOB, GPX2, FAM171A1, S100A10, PMEPA1, CLU, CD24, PDGFRA, LAD1, KCNJ16",
    23: "C15ORF48, FAM83E, CANX, ATP1A1, ITGB1, PSAP, TMSB10, CTSB, CLU, VIM, CD63, ANXA4, MYL6, ANPEP, AQP1, TIMP1, DCDC2, LGALS3BP, CDH1, CD24",
}

In [4]:

# ==========================================
# 1. CLEANED CANDIDATE LIST (Top 20,)
# ==========================================
CANDIDATE_DICT = {
    1: ['T cells', 'B cells', 'Astrocytes', 'B cells naive', 'Cancer cell', 'Dendritic cells', 'Endothelial cells', 'Eosinophils', 'Macrophages', 'Monocytes', 'NK cells', 'Nuocytes', 'Plasmacytoid dendritic cells', 'Platelets', 'T helper cells', 'T regulatory cells'],
    2: ['Basal cells', 'Cholangiocytes', 'Hepatocytes', 'Acinar cells', 'Airway goblet cells', 'Astrocytes', 'Chromaffin cells', 'Epithelial cells', 'Epsilon cells', 'Fibroblasts', 'Germ cells', 'Mast cells', 'Neurons', 'Pulmonary alveolar type II cells'],
    3: ['Adipocytes', 'Dendritic cells', 'Acinar cells', 'Airway epithelial cells', 'Beta cells', 'Erythroid-like and erythroid precursor cells', 'Proximal tubule cells'],
    4: ['Beta cells', 'Cholangiocytes', 'Chondrocytes', 'Dendritic cells', 'Distal tubule cells', 'Ductal cells', 'Epithelial cells', 'Luminal epithelial cells', 'Neurons', 'Osteoblasts', 'Proximal tubule cells', 'T regulatory cells'],
    5: ['Airway epithelial cells', 'Astrocytes', 'Basophils', 'Bergmann glia', 'Cancer cell', 'Cardiac stem and precursor cells', 'Chromaffin cells', 'Ductal cells', 'Epithelial cells', 'Hepatic stellate cells', 'Podocytes', 'Retinal ganglion cells', 'Sertoli cells'],
    6: ['Cajal-Retzius cells', 'Acinar cells', 'Alpha cells', 'Cholangiocytes', 'Chromaffin cells', 'Dendritic cells', 'Embryonic stem cells', 'Epithelial cells', 'Fibroblasts', 'Hepatocytes', 'Osteoblasts', 'Stromal cells'],
    7: ['Ductal cells', 'Hematopoietic stem cells', 'Beta cells', 'Cholangiocytes', 'Chondrocytes', 'Distal tubule cells', 'Fibroblasts', 'Hepatocytes', 'Loop of Henle cells', 'Osteoblasts', 'Platelets', 'Pulmonary alveolar type II cells', 'Radial glia cells', 'Sertoli cells'],
    8: ['Hepatocytes', 'Airway goblet cells', 'Acinar cells', 'Adipocytes', 'Cholangiocytes', 'Dendritic cells', 'Ductal cells', 'Endothelial cells', 'Epithelial cells', 'Foveolar cells', 'Paneth cells', 'Pyramidal cells', 'Schwann cells'],
    9: ['Alpha cells', 'Acinar cells', 'Cajal-Retzius cells', 'Chromaffin cells', 'Distal tubule cells', 'Ductal cells', 'Epithelial cells', 'Oligodendrocytes', 'Osteoblasts', 'Pyramidal cells', 'Sebocytes', 'Transient cells'],
    10: ['Epithelial cells', 'Acinar cells', 'Beta cells', 'Cancer cell', 'Ductal cells', 'Endothelial cells', 'Enterocytes', 'Epsilon cells', 'Fibroblasts', 'Oligodendrocytes', 'Osteoblasts', 'Schwann cells', 'Sebocytes', 'Sertoli cells'],
    11: ['Hepatocytes', 'Alpha cells', 'Acinar cells', 'Alveolar macrophages', 'B cells memory', 'Basal cells', 'Cajal-Retzius cells', 'Epithelial cells', 'Erythroblasts', 'Gamma (PP) cells', 'Mast cells', 'Paneth cells'],
    12: ['Cajal-Retzius cells', 'Hepatocytes', 'Alpha cells', 'Astrocytes', 'B cells memory', 'Chromaffin cells', 'Dendritic cells', 'Ductal cells', 'Enterocytes', 'Epsilon cells', 'Germ cells', 'Platelets', 'Podocytes', 'Sertoli cells'],
    13: ['Cholangiocytes', 'Fibroblasts', 'Alveolar macrophages', 'Distal tubule cells', 'Ductal cells', 'Endothelial cells', 'Epiblast cells', 'Hepatocytes', 'Peri-islet Schwann cells', 'Retinal ganglion cells', 'Schwann cells', 'Sertoli cells'],
    14: ['Acinar cells', 'Alpha cells', 'Hepatocytes', 'Cajal-Retzius cells', 'Embryonic stem cells', 'Endothelial cells', 'Enterocytes', 'Epsilon cells', 'Erythroblasts', 'Foveolar cells', 'Loop of Henle cells', 'Olfactory epithelial cells', 'Reticulocytes', 'Transient cells'],
    15: ['Alpha cells', 'Oligodendrocytes', 'Sebocytes', 'Beta cells', 'Endothelial cells', 'Foveolar cells', 'Hepatocytes', 'Natural killer T cells', 'Neurons', 'Osteoblasts', 'Radial glia cells', 'Reticulocytes'],
    16: ['Enterocytes', 'Hepatocytes', 'Acinar cells', 'Adipocyte progenitor cells', 'Cholangiocytes', 'Dendritic cells', 'Epsilon cells', 'Erythroid-like and erythroid precursor cells', 'Fibroblasts', 'Germ cells', 'Macrophages', 'Monocytes', 'Natural killer T cells', 'Stromal cells', 'T helper cells'],
    17: ['Airway goblet cells', 'Dendritic cells', 'Endothelial cells', 'Acinar cells', 'Basophils', 'Distal tubule cells', 'Enteric glia cells', 'Enterocytes', 'Ependymal cells', 'Fibroblasts', 'Trophoblast cells'],
    18: ['Acinar cells', 'Bergmann glia', 'Cholangiocytes', 'Ductal cells', 'Epithelial cells', 'Epsilon cells', 'Erythroid-like and erythroid precursor cells', 'Gamma delta T cells', 'Leydig cells', 'Loop of Henle cells', 'Macrophages', 'Pluripotent stem cells', 'Sebocytes'],
    19: ['Hepatocytes', 'Acinar cells', 'B cells', 'Dendritic cells', 'Airway epithelial cells', 'Alpha cells', 'Astrocytes', 'Cholangiocytes', 'Gamma (PP) cells', 'Kupffer cells', 'Leydig cells', 'Luminal epithelial cells', 'M√ºller cells', 'Neurons'],
    20: ['Acinar cells', 'Airway goblet cells', 'Astrocytes', 'Basal cells', 'Beta cells', 'Cancer cell', 'Crypt cells', 'Distal tubule cells', 'Ductal cells', 'Goblet cells', 'Hepatocytes', 'Proximal tubule cells', 'Schwann cells', 'Sebocytes'],
    21: ['Airway goblet cells', 'Cholangiocytes', 'Acinar cells', 'Basal cells', 'Basophils', 'Delta cells', 'Endothelial cells', 'Enteric glia cells', 'Ependymal cells', 'Erythroid-like and erythroid precursor cells', 'Trophoblast cells'],
    22: ['Basal cells', 'Acinar cells', 'Adipocytes', 'Airway goblet cells', 'B cells', 'Distal tubule cells', 'Ductal cells', 'Endothelial cells', 'Enterocytes', 'Erythroid-like and erythroid precursor cells', 'Fibroblasts', 'Germ cells', 'Hematopoietic stem cells', 'Macrophages', 'M√ºller cells', 'Oligodendrocyte progenitor cells', 'Paneth cells'],
    23: ['Acinar cells', 'Airway epithelial cells', 'Astrocytes', 'Bergmann glia', 'Ductal cells', 'Adipocytes', 'Alpha cells', 'B cells', 'Embryonic stem cells', 'Monocytes', 'M√ºller cells']
}

In [5]:
import torch
import pandas as pd
import re
from tqdm import tqdm

# ==========================================
# 1. FIXED PROMPT (Logic Corrected)
# ==========================================
PROMPT_TEMPLATE = """
You are an expert Cell Biologist Annotator.
Your task is to identify the **specific cell type** of a cluster based on its functional terms and marker genes.

---

---
**EVIDENCE 0: TOP 20 MARKER GENES (Raw Data):**
{top_20_genes}

**EVIDENCE 1: DATA-DRIVEN CANDIDATES (From Database):**
{candidate_list}


**EVIDENCE 2: INPUT MARKER GENES:**
{input_data}

---
**ALLOWED TAXONOMY (You MUST output one of these exact strings):**
- Hepatocyte
- Cholangiocyte (Tumor)
- Cholangiocyte (Reactive/EMT-like)
- Fibroblast / Stroma
- Mesenchymal progenitors
- T Cell
- Macrophage / Monocyte

---

# **INSTRUCTIONS:**
# 1. Analyze the **Input Data** and cross-reference with the **Candidates**.
# 2. **Map the Candidate** to the **Allowed Taxonomy**:
#    - If Candidate is "Cancer cell" or "Cholangiocyte" AND genes show tumor markers -> Map to **Cholangiocyte (Tumor)**.
#    - If Candidate is "Hepatic Stellate Cell" -> Map to **Fibroblast / Stroma**.
# 3. **Immune Check:** If you see **CD3D, CD3E, CD4, CD8, TRAC**, you MUST label as **T Cell**.
# 4. **Specificity Rule:** If genes support **Hepatocyte** (FGA, FGB, FGG, ALB, CYP), choose that over generic "Normal cell".



---
**OUTPUT FORMAT (Strict JSON):**
{{
    "reasoning": "Explain why you chose this taxonomy label based on the evidence.",
    "label": "EXACT STRING FROM ALLOWED TAXONOMY"
}}
"""

In [6]:
# ==========================================
# 3. YOUR INFERENCE FUNCTION (Integrated)
# ==========================================
def ask_local_qwen(raw_prompt):
    try:
        # 1. Format input using Qwen's chat template
        messages = [{"role": "user", "content": raw_prompt}]
        text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        
        # 2. Prepare inputs
        model_inputs = tokenizer([text], return_tensors="pt").to(DEVICE)

        # 3. Generate Answer
        with torch.no_grad():
            generated_ids = model.generate(
                model_inputs.input_ids,
                max_new_tokens=512,      # Correctly increased length
                do_sample=False,         # Deterministic
                temperature=0.0,         # Greedy decoding
                repetition_penalty=1.1   # Prevents looping
            )

        # 4. Decode ONLY the answer
        input_length = model_inputs.input_ids.shape[1]
        new_tokens = generated_ids[0][input_length:]
        response = tokenizer.decode(new_tokens, skip_special_tokens=True)
        
        return response.strip()

    except Exception as e:
        return f"Error: {str(e)}"

In [7]:
# ==========================================
# 4. ROBUST PARSER (THIS WAS MISSING)
# ==========================================
def extract_fields_with_regex(text):
    """
    Extracts 'label' and 'reasoning' directly from text using Regex.
    Works even if JSON syntax is broken.
    """
    # Clean Markdown
    text = text.replace("```json", "").replace("```", "").strip()
    
    # Extract Label
    label_match = re.search(r'"label"\s*:\s*"([^"]+)"', text, re.IGNORECASE)
    label = label_match.group(1) if label_match else "Error"

    # Extract Reasoning
    reasoning_match = re.search(r'"reasoning"\s*:\s*"([^"]+)"', text, re.IGNORECASE | re.DOTALL)
    reasoning = reasoning_match.group(1) if reasoning_match else "No reasoning found"
    
    # Clean newlines for CSV safety
    reasoning = reasoning.replace("\n", " ").replace('"', "'")
    
    return {"label": label, "reasoning": reasoning}

In [8]:
# ==========================================
# 5. DATA PREPARATION (Clusters 1-23)
# ==========================================
cluster_inputs = {
    1: """
    Function: immune system process
    Supporting Genes: CCL19;CCL5;CCR7;CD2;CD3E;CD4;CD40LG;CTLA4;CXCR4;FCN1;FOXP3;IL7R;ITK;TRAC;TRBC2
    Function: cell motility
    Supporting Genes: CCL19;CCL5;CCR7;CXCR4
    Function: positive regulation of T cell proliferation
    Supporting Genes: CCL19;CCL5;CD3E;CD4;CD40LG
    Function: positive regulation of interleukin-12 production
    Supporting Genes: CCL19;CCR7;CD40LG
    Function: positive regulation of interleukin-4 production
    Supporting Genes: CD3E;CD40LG;FOXP3
    Function: response to virus
    Supporting Genes: CCL19;CCL5;CXCR4;FOXP3
    Function: cell adhesion
    Supporting Genes: CCL5;CD40LG;SELL
    Function: CCL19-activated CCR7 signaling pathway
    Supporting Genes: CCL19;CCR7
    """,
    2: """ 
    Function: Unresolved functional enrichment
    Supporting Genes: AGR2;ANXA13;CTNND2;CTTNBP2;CXCL1;CXCL3;DEFB1;FGG;ITGB6;KIF26B;KRT17;LCN2;MLPH;RARRES1;RERG;SNAP25;TACSTD2;TMC5;TSPAN13;UGT1A1 
    """,
    3: """ 
    Function: lipid metabolic process
    Supporting Genes: ADH1A;ADH4;APOF;CYP8B1;HAO1;HAO2;PCK1;SULT2A1
    Function: carbohydrate metabolic process
    Supporting Genes: ALDOB;PCK1
    Function: hemostasis
    Supporting Genes: F13B;F2;SERPINC1
    Function: regulation of blood coagulation
    Supporting Genes: F2;SERPINC1
    Function: blood coagulation
    Supporting Genes: F13B;F2;SERPINC1
    Function: response to nutrient levels
    Supporting Genes: AFM;CYP8B1
    Function: alcohol metabolic process
    Supporting Genes: ADH1A;ADH4
    Function: inflammatory response
    Supporting Genes: AHSG;F2
    """,
    4: """ 
    Function: anatomical structure development
    Supporting Genes: BICC1;FGFR1;FGFR2;PKHD1
    Function: branching involved in salivary gland morphogenesis
    Supporting Genes: FGFR1;FGFR2
    Function: cell differentiation
    Supporting Genes: FGFR1;FGFR2
    Function: fibroblast growth factor receptor signaling pathway involved in orbitofrontal cortex development
    Supporting Genes: FGFR1;FGFR2
    Function: ventricular zone neuroblast division
    Supporting Genes: FGFR1;FGFR2
    Function: cellular response to fibroblast growth factor stimulus
    Supporting Genes: CXCL8;FGFR1;FGFR2
    Function: positive regulation of phospholipase activity
    Supporting Genes: FGFR1;FGFR2
    Function: regulation of ERK1 and ERK2 cascade
    Supporting Genes: FGFR2;PKHD1
    """,
    5: """ 
    Function: cell motility
    Supporting Genes: ARHGAP24;CD44;ITGB1;KIT
    Function: transmembrane transport
    Supporting Genes: AQP1;ATP1A1;SLC12A2
    Function: camera-type eye morphogenesis
    Supporting Genes: AQP1;SOX4
    Function: hyperosmotic response
    Supporting Genes: AQP1;SLC12A2
    Function: regulation of spontaneous synaptic transmission
    Supporting Genes: ITGB1;SLC12A2
    Function: potassium ion transport
    Supporting Genes: AQP1;ATP1A1;SLC12A2
    Function: cell volume homeostasis
    Supporting Genes: AQP1;SLC12A2
    Function: endochondral bone growth
    Supporting Genes: EVC;FGFR3
    """,
    6: """ 
    Function: defense response to other organism
    Supporting Genes: CFH;CFHR5;F2
    Function: immune system process
    Supporting Genes: CFH;CFHR4;CFHR5
    Function: nervous system process
    Supporting Genes: COCH;FGFR1;HOMER2
    Function: negative regulation of substrate adhesion-dependent cell spreading
    Supporting Genes: FBLN1;TACSTD2
    Function: negative regulation of G protein-coupled receptor signaling pathway
    Supporting Genes: CXCL8;RGS4
    Function: protein maturation
    Supporting Genes: F2;F5
    Function: fibroblast growth factor receptor signaling pathway
    Supporting Genes: FGFR1;FGFR4
    Function: negative regulation of cell motility
    Supporting Genes: FBLN1;TACSTD2
    """,
    7: """ 
    Function: cell adhesion
    Supporting Genes: ANXA9;CCL2;CDH2;CDH6;CTNND2;CX3CL1;NEO1
    Function: cell motility
    Supporting Genes: APCDD1;C5;CCL2;CDH2;CDH6;CX3CL1;PDGFRA
    Function: platelet-derived growth factor receptor signaling pathway
    Supporting Genes: PDGFD;PDGFRA
    Function: chemotaxis
    Supporting Genes: C5;CCL2;CX3CL1;PDGFRA
    Function: positive regulation of cell migration
    Supporting Genes: CCL2;CX3CL1;PDGFD;PDGFRA
    Function: cell activation
    Supporting Genes: PDGFRA;UGT1A1
    Function: cell junction organization
    Supporting Genes: CDH2;CDH6
    Function: cellular response to transforming growth factor beta stimulus
    Supporting Genes: PDGFD;SOX6
    """,
    8: """
    Function: carbohydrate derivative metabolic process
    Supporting Genes: AKR1B10;AKR1C1
    Function: cell adhesion
    Supporting Genes: FGA;FGG
    Function: circulatory system process
    Supporting Genes: FGA;FGG
    Function: fibrinolysis
    Supporting Genes: FGA;FGG
    Function: positive regulation of exocytosis
    Supporting Genes: FGA;FGG
    Function: positive regulation of heterotypic cell-cell adhesion
    Supporting Genes: FGA;FGG
    Function: positive regulation of peptide hormone secretion
    Supporting Genes: FGA;FGG
    Function: positive regulation of protein secretion
    Supporting Genes: FGA;FGG
    """,
    9: """
    Function: tissue morphogenesis
    Supporting Genes: GCNT3;GCNT4
    Function: glycoprotein biosynthetic process
    Supporting Genes: GCNT3;GCNT4
    Function: kidney morphogenesis
    Supporting Genes: GCNT3;GCNT4
    Function: transmembrane transport
    Supporting Genes: KCNJ16;SNAP25
    Function: regulation of cell shape
    Supporting Genes: BAMBI;COCH
    Function: carbohydrate metabolic process
    Supporting Genes: GCNT3;GCNT4
    """,
    10: """
    Function: lipid metabolic process
    Supporting Genes: CYP3A4;CYP3A5;PDGFRA
    Function: alkaloid catabolic process
    Supporting Genes: CYP3A4;CYP3A5
    Function: aflatoxin metabolic process
    Supporting Genes: CYP3A4;CYP3A5
    Function: oxidative demethylation
    Supporting Genes: CYP3A4;CYP3A5
    Function: wound healing
    Supporting Genes: CX3CL1;PDGFRA
    Function: xenobiotic catabolic process
    Supporting Genes: CYP3A4;CYP3A5
    """,
    11: """
    Function: positive regulation of peptide hormone secretion
    Supporting Genes: FGA;FGB;FGG;TFR2
    Function: protein-containing complex assembly
    Supporting Genes: FGA;FGB;FGG
    Function: fibrinolysis
    Supporting Genes: FGA;FGB;FGG
    Function: protein maturation
    Supporting Genes: FGA;FGB;FGG
    Function: response to calcium ion
    Supporting Genes: FGA;FGB;FGG
    Function: cell adhesion
    Supporting Genes: FGA;FGB;FGG
    Function: circulatory system process
    Supporting Genes: FGA;FGB;FGG
    Function: positive regulation of exocytosis
    Supporting Genes: FGA;FGB;FGG
    """,
    12: """
    Function: circulatory system process
    Supporting Genes: ABCC4;FGA;FGG;SLC12A2
    Function: positive regulation of ERK1 and ERK2 cascade
    Supporting Genes: ALKAL2;FGA;FGFR3;FGG
    Function: transmembrane transport
    Supporting Genes: ABCC4;C5;SLC12A2;TMC4
    Function: cell adhesion
    Supporting Genes: FGA;FGG
    Function: fibrinolysis
    Supporting Genes: FGA;FGG
    Function: positive regulation of exocytosis
    Supporting Genes: FGA;FGG
    Function: positive regulation of heterotypic cell-cell adhesion
    Supporting Genes: FGA;FGG
    Function: positive regulation of peptide hormone secretion
    Supporting Genes: FGA;FGG
    """,
    13: """
    Function: defense response
    Supporting Genes: CXCL1;CXCL12;CXCL2;CXCL3
    Function: killing of cells of another organism
    Supporting Genes: CXCL1;CXCL12;CXCL2;CXCL3
    Function: negative chemotaxis
    Supporting Genes: SEMA4G;SEMA6A
    Function: semaphorin-plexin signaling pathway
    Supporting Genes: SEMA4G;SEMA6A
    Function: cell motility
    Supporting Genes: CXCL1;CXCL12;CXCL2;SEMA4G;SEMA6A
    Function: defense response to other organism
    Supporting Genes: CXCL1;CXCL12;CXCL2;CXCL3
    Function: axon guidance
    Supporting Genes: CXCL12;SEMA4G;SEMA6A
    Function: chemotaxis
    Supporting Genes: CXCL1;CXCL12;CXCL2;CXCL3
    """,
    14: """
    Function: protein catabolic process
    Supporting Genes: IFI27;TRIM45
    Function: nucleobase-containing small molecule metabolic process
    Supporting Genes: ALDOB;OGDHL
    Function: regulation of G protein-coupled receptor signaling pathway
    Supporting Genes: RGS4;RIC8B
    Function: transmembrane transport
    Supporting Genes: SLC17A4;SLC5A9
    Function: intracellular iron ion homeostasis
    Supporting Genes: SMAD5;TFR2
    """,
    15: """
    Function: lipid metabolic process
    Supporting Genes: AKR1B10;ALDH1A1;CYP4A11;ELOVL7;SPNS2;UGT1A1
    Function: detoxification
    Supporting Genes: AKR1B10;ALDH1A1
    Function: response to cocaine
    Supporting Genes: MYRF;RGS4
    Function: nervous system process
    Supporting Genes: COCH;SPNS2
    Function: positive regulation of smooth muscle cell proliferation
    Supporting Genes: CX3CL1;PDGFD
    Function: sodium ion transport
    Supporting Genes: FXYD2;SLC5A9
    """,
    16: """
    Function: lipid metabolic process
    Supporting Genes: ACSL5;APOA5;HMGCS2;HPGD;IL1RN;PPARG;UGT2B10
    Function: anatomical structure development
    Supporting Genes: HMGCS2;HPGD;RORC
    Function: cell differentiation
    Supporting Genes: BATF;RORC
    Function: cellular response to insulin stimulus
    Supporting Genes: HMGCS2;PPARG
    Function: digestive system process
    Supporting Genes: TFF1;VSIG1
    Function: intracellular receptor signaling pathway
    Supporting Genes: PPARG;RORC
    Function: response to cAMP
    Supporting Genes: DUOX2;HMGCS2
    Function: response to estradiol
    Supporting Genes: HPGD;MMP15
    """,
    17: """
    Function: glycoprotein biosynthetic process
    Supporting Genes: GALNT5;GCNT3
    Function: transmembrane transport
    Supporting Genes: GJB2;SLC2A1;SLCO1B3
    """,
    18: """
    Function: transforming growth factor beta receptor signaling pathway
    Supporting Genes: BAMBI;HPGD;ITGB1
    Function: cytoskeleton organization
    Supporting Genes: ITGB1;KRT19
    Function: cell adhesion
    Supporting Genes: CALCA;CDH6;ITGB1
    Function: negative regulation of osteoblast differentiation
    Supporting Genes: BAMBI;IGFBP5
    Function: positive regulation of wound healing
    Supporting Genes: DUOX2;ITGB1
    """,
    19: """
    Function: fibrinolysis
    Supporting Genes: FGA;FGB;FGG
    Function: positive regulation of peptide hormone secretion
    Supporting Genes: FGA;FGB;FGG
    Function: protein maturation
    Supporting Genes: FGA;FGB;FGG
    Function: cell adhesion
    Supporting Genes: FGA;FGB;FGG
    """,
    20: """
    Function: positive regulation of reactive oxygen species metabolic process
    Supporting Genes: ACE2;AKR1C3;LCN2
    """,
    21: """
    Function: carbohydrate metabolic process
    Supporting Genes: FUT2;GCNT3;ST6GALNAC1;TFF1
    Function: digestive system process
    Supporting Genes: TFF1;TFF2;TFF3
    Function: endoplasmic reticulum unfolded protein response
    Supporting Genes: AGR2;ERN2
    Function: glycoprotein biosynthetic process
    Supporting Genes: GALNT5;GCNT3
    Function: calcium ion transport
    Supporting Genes: KCNN4;MYB
    """,
    22: """
    Function: cell motility
    Supporting Genes: CD24;CDH2;ITGB6;PDGFRA
    Function: cell morphogenesis
    Supporting Genes: CDH2;CLU;ITGB6
    Function: cell activation
    Supporting Genes: CD24;PDGFRA
    Function: intrinsic apoptotic signaling pathway
    Supporting Genes: CD24;CLU
    Function: wound healing
    Supporting Genes: ITGB6;PDGFRA
    Function: immune system process
    Supporting Genes: CFH;CLU
    """,
    23: """
    Function: cellular defense response
    Supporting Genes: DCDC2;ITGB1;LGALS3BP
    Function: cell motility
    Supporting Genes: CD24;CD63;CDH1;ITGB1
    Function: cell differentiation
    Supporting Genes: ANXA4;CD63;CTSB
    Function: symbiont entry into host cell
    Supporting Genes: ANPEP;CTSB;ITGB1
    Function: dendrite/neuron projection development
    Supporting Genes: DCDC2;CDH1;ITGB1
    Function: protein folding
    Supporting Genes: CANX;CLU
    """

}


In [9]:
# ==========================================
# 3. MODIFIED EXECUTION LOOP
# ==========================================
results = []
print("‚è≥ Starting Inference (Evidence 0 + 1 + 2)...")

for cluster_id, input_block in tqdm(cluster_inputs.items()):
    
    # 1. Get Evidence 0 (Top 20 Genes)
    current_top20 = TOP_20_MARKERS.get(cluster_id, "No markers found")
    
    # 2. Get Evidence 1 (Candidates)
    current_candidates = CANDIDATE_DICT.get(cluster_id, ["Unknown"])
    candidate_str = str(current_candidates)
    
    # 3. Format Prompt (Inject Evidence 0, 1, and 2)
    # Evidence 2 is 'input_block' from the loop
    final_prompt = PROMPT_TEMPLATE.format(
        top_20_genes=current_top20,
        candidate_list=candidate_str, 
        input_data=input_block
    )
    
    # 4. Inference
    raw_output = ask_local_qwen(final_prompt)
    
    # 5. Parsing
    parsed = extract_fields_with_regex(raw_output)
    
    results.append({
        "Cluster": cluster_id,
        "Predicted_Label": parsed["label"],
        "Reasoning": parsed["reasoning"],
        "Candidates": candidate_str,
        "Top20_Genes": current_top20
    })

# ==========================================
# 4. SAVE RESULTS
# ==========================================
df_results = pd.DataFrame(results)
print("\n‚úÖ Final Results:")
print(df_results[['Cluster', 'Predicted_Label']].head())
df_results.to_csv("All_cluster_v5_baseline_knoweldge_A4_EV0_EV1_EV2_reso.csv", index=False)

‚è≥ Starting Inference (Evidence 0 + 1 + 2)...


  0%|          | 0/23 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 23/23 [10:49<00:00, 28.24s/it]


‚úÖ Final Results:
   Cluster                    Predicted_Label
0        1                             T Cell
1        2            Mesenchymal progenitors
2        3                          Adipocyte
3        4  Cholangiocyte (Reactive/EMT-like)
4        5            Airway epithelial cells





In [10]:
# ==========================================
# 7. FORMAT & SAVE FINAL OUTPUT
# ==========================================

# 1. Select only the columns you need
final_output = df_results[['Cluster', 'Predicted_Label']].copy()

# 2. Rename columns to match your desired format
final_output.columns = ['Cluster', 'Cell Type']

# 3. Clean up the 'Cell Type' column if needed (optional)
# For example, removing "Cholangiocyte (Tumor)" -> just "Tumor" if you wanted, 
# but keeping the full name is usually better for scientific accuracy.

# 4. Display the first few rows to verify
print("\nüìù Final Formatted Table:")
print(final_output.head())

# 5. Save to CSV
output_filename = "All_cluster_v5_baseline_knoweldge_A4_EV0_EV1_EV2.csv"
final_output.to_csv(output_filename, index=False)

print(f"\n‚úÖ Successfully saved results to: {output_filename}")


üìù Final Formatted Table:
   Cluster                          Cell Type
0        1                             T Cell
1        2            Mesenchymal progenitors
2        3                          Adipocyte
3        4  Cholangiocyte (Reactive/EMT-like)
4        5            Airway epithelial cells

‚úÖ Successfully saved results to: All_cluster_v5_baseline_knoweldge_A4_EV0_EV1_EV2.csv
