In [1]:
import pandas as pd

# 1. Load your files
# Make sure these files are in the same folder as your script
markers_df = pd.read_csv("top_20_marker_gene_per_cluster_hsnmf.csv")
knowledge_df = pd.read_csv("gene_knowledge_clean_phase1.csv")

In [2]:
# ==========================================
# 2. PRE-PROCESSING
# ==========================================
# Standardize gene names to uppercase to ensure matching works
markers_df['gene'] = markers_df['gene'].astype(str).str.upper().str.strip()
knowledge_df['gene'] = knowledge_df['gene'].astype(str).str.upper().str.strip()

# ==========================================
# 3. THE LOGIC: MERGE & COUNT
# ==========================================
# Step A: Link every marker gene to its known cell types
# We use an 'inner' join so we only keep genes that exist in both files
merged_df = pd.merge(markers_df, knowledge_df, on='gene', how='inner')
merged_df

Unnamed: 0,cluster,cluster_size,gene,p_val,p_adj,mean_in,mean_out,log2FC,pct_in,pct_out,source,match_type,score,matched_col,cell_type,description
0,0,29921,IL7R,0.000000e+00,0.000000e+00,2.064286,0.241346,3.096460,0.361619,0.046990,PanglaoDB,exact,100,symbol,Nuocytes,interleukin 7 receptor
1,0,29921,CCL19,0.000000e+00,0.000000e+00,0.520981,0.064259,3.019245,0.099863,0.013523,PanglaoDB,exact,100,symbol,Endothelial cells,C-C motif chemokine ligand 19
2,0,29921,CD2,0.000000e+00,0.000000e+00,0.490244,0.063933,2.938839,0.095518,0.013015,PanglaoDB,exact,100,symbol,B cells,CD2 molecule
3,0,29921,CCR7,0.000000e+00,0.000000e+00,0.407275,0.058994,2.787334,0.079008,0.013722,PanglaoDB,exact,100,symbol,Astrocytes,C-C motif chemokine receptor 7
4,0,29921,ITK,0.000000e+00,0.000000e+00,0.389937,0.058155,2.745241,0.076435,0.012587,PanglaoDB,exact,100,symbol,T cells,IL2 inducible T cell kinase
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
430,22,1886,TIMP1,3.438307e-231,1.031492e-229,0.132968,2.026496,-3.929829,0.025451,0.414656,PanglaoDB,exact,100,symbol,Adipocytes,TIMP metallopeptidase inhibitor 1
431,22,1886,DCDC2,4.252779e-230,1.200785e-228,0.361007,2.446550,-2.760647,0.062036,0.481703,PanglaoDB,exact,100,symbol,Ductal cells,doublecortin domain containing 2
432,22,1886,LGALS3BP,5.067424e-224,1.351313e-222,0.465804,2.421851,-2.378312,0.083245,0.512685,PanglaoDB,fuzzy,94,alias,Bergmann glia,BTB domain containing 17
433,22,1886,CDH1,4.775344e-219,1.206403e-217,0.260354,2.087029,-3.002900,0.047720,0.442293,PanglaoDB,exact,100,symbol,Airway epithelial cells,cadherin 1


In [3]:
# Step B: Count unique supporting genes for each (Cluster, Cell Type) pair
# Logic: "How many distinct genes in Cluster X support Cell Type Y?"
ranking_df = merged_df.groupby(['cluster', 'cell_type'])['gene'].nunique().reset_index()
ranking_df.rename(columns={'gene': 'gene_count'}, inplace=True)

# Step C: Sort the results
# Sort first by Cluster (ascending), then by Score (descending)
ranking_df = ranking_df.sort_values(by=['cluster', 'gene_count'], ascending=[True, False])

# ==========================================
# 4. EXTRACT TOP 5 CANDIDATES
# ==========================================
top_candidates = ranking_df.groupby('cluster').head(10)
top_candidates

Unnamed: 0,cluster,cell_type,gene_count
14,0,T cells,3
1,0,B cells,2
0,0,Astrocytes,1
2,0,B cells naive,1
3,0,Cancer cell,1
...,...,...,...
315,22,Ductal cells,2
309,22,Adipocytes,1
311,22,Alpha cells,1
313,22,B cells,1


In [4]:
# ==========================================
# 5. FORMAT FOR PAPER/PROMPT
# ==========================================
# Create a dictionary for easy printing or JSON use
cluster_candidate_dict = {}

print("--- CANDIDATE GENERATION LOGIC REPORT ---\n")

for cluster_id in sorted(top_candidates['cluster'].unique()):
    # Get the data for this cluster
    subset = top_candidates[top_candidates['cluster'] == cluster_id]
    
    # Format the list as strings: "CellType"
    # Note: We add +1 to cluster_id to match your 1-23 indexing preference
    display_id = cluster_id + 1 
    candidates = subset['cell_type'].tolist()
    
    # Store in dictionary
    cluster_candidate_dict[display_id] = candidates
    
    # Print for verification
    print(f"Cluster {display_id}: {candidates}")

# ==========================================
# 6. SAVE TO FILE
# ==========================================
# Save the detailed ranking table for your supplementary materials
top_candidates['cluster_display'] = top_candidates['cluster'] + 1
top_candidates.to_csv("Supplementary_Table_Candidate_Rankings_top10.csv", index=False)
print("\n✅ Logic applied. Detailed rankings saved to 'Supplementary_Table_Candidate_Rankings.csv'")

--- CANDIDATE GENERATION LOGIC REPORT ---

Cluster 1: ['T cells', 'B cells', 'Astrocytes', 'B cells naive', 'Cancer cell', 'Dendritic cells', 'Endothelial cells', 'Eosinophils', 'Macrophages', 'Monocytes']
Cluster 2: ['Basal cells', 'Cholangiocytes', 'Hepatocytes', 'Acinar cells', 'Airway goblet cells', 'Astrocytes', 'Chromaffin cells', 'Epithelial cells', 'Epsilon cells', 'Fibroblasts']
Cluster 3: ['Adipocytes', 'Dendritic cells', 'Normal cell', 'Acinar cells', 'Airway epithelial cells', 'Beta cells', 'Erythroid-like and erythroid precursor cells', 'Proximal tubule cells']
Cluster 4: ['Normal cell', 'Beta cells', 'Cholangiocytes', 'Chondrocytes', 'Dendritic cells', 'Distal tubule cells', 'Ductal cells', 'Epithelial cells', 'Luminal epithelial cells', 'Neurons']
Cluster 5: ['Normal cell', 'Airway epithelial cells', 'Astrocytes', 'Basophils', 'Bergmann glia', 'Cancer cell', 'Cardiac stem and precursor cells', 'Chromaffin cells', 'Ductal cells', 'Epithelial cells']
Cluster 6: ['Normal ce

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_candidates['cluster_display'] = top_candidates['cluster'] + 1


In [5]:
import pandas as pd

# 1. LOAD DATA
# Load your marker list and knowledge base
markers_df = pd.read_csv("top_20_marker_gene_per_cluster_hsnmf.csv")
knowledge_df = pd.read_csv("gene_knowledge_clean_phase1.csv")

# 2. STANDARDIZE TEXT
# Convert all gene names to Uppercase to ensure they match perfectly
markers_df['gene'] = markers_df['gene'].astype(str).str.upper().str.strip()
knowledge_df['gene'] = knowledge_df['gene'].astype(str).str.upper().str.strip()

# 3. MERGE (THE "VOTING" STEP)
# This links every marker in your cluster to its potential cell types
merged_df = pd.merge(markers_df, knowledge_df, on='gene', how='inner')

# 4. CALCULATE SCORES
# Logic: Count how many UNIQUE genes support each cell type per cluster
ranking_df = merged_df.groupby(['cluster', 'cell_type'])['gene'].nunique().reset_index()
ranking_df.rename(columns={'gene': 'support_score'}, inplace=True)

# 5. RANK AND SLICE
# Sort by Cluster (asc) and Score (desc)
ranking_df = ranking_df.sort_values(by=['cluster', 'support_score'], ascending=[True, False])

# Keep only the Top 5 for each cluster
top_5_df = ranking_df.groupby('cluster').head(5)

# 6. GENERATE THE DICTIONARY FOR YOUR PROMPT
candidate_dict = {}

print("--- TOP 5 CANDIDATES PER CLUSTER (Ranked by Gene Support) ---")
for cluster_id in sorted(top_5_df['cluster'].unique()):
    # Extract the list of cell types
    types = top_5_df[top_5_df['cluster'] == cluster_id]['cell_type'].tolist()
    
    # Adjust for 1-based indexing if your prompt uses Cluster 1-23
    display_id = cluster_id + 1
    candidate_dict[display_id] = types
    
    print(f"Cluster {display_id}: {types}")

# Optional: Save this "Evidence Table" for your paper
top_5_df.to_csv("Supplementary_Table_Top5_Candidates.csv", index=False)

--- TOP 5 CANDIDATES PER CLUSTER (Ranked by Gene Support) ---
Cluster 1: ['T cells', 'B cells', 'Astrocytes', 'B cells naive', 'Cancer cell']
Cluster 2: ['Basal cells', 'Cholangiocytes', 'Hepatocytes', 'Acinar cells', 'Airway goblet cells']
Cluster 3: ['Adipocytes', 'Dendritic cells', 'Normal cell', 'Acinar cells', 'Airway epithelial cells']
Cluster 4: ['Normal cell', 'Beta cells', 'Cholangiocytes', 'Chondrocytes', 'Dendritic cells']
Cluster 5: ['Normal cell', 'Airway epithelial cells', 'Astrocytes', 'Basophils', 'Bergmann glia']
Cluster 6: ['Normal cell', 'Cajal-Retzius cells', 'Acinar cells', 'Alpha cells', 'Cholangiocytes']
Cluster 7: ['Ductal cells', 'Hematopoietic stem cells', 'Normal cell', 'Beta cells', 'Cholangiocytes']
Cluster 8: ['Hepatocytes', 'Normal cell', 'Airway goblet cells', 'Acinar cells', 'Adipocytes']
Cluster 9: ['Normal cell', 'Alpha cells', 'Acinar cells', 'Cajal-Retzius cells', 'Chromaffin cells']
Cluster 10: ['Normal cell', 'Epithelial cells', 'Acinar cells', 'B

In [6]:
import pandas as pd

# ==========================================
# 1. LOAD DATA
# ==========================================
# Make sure these files are in the same directory as your script
markers_df = pd.read_csv("top_20_marker_gene_per_cluster_hsnmf.csv")
knowledge_df = pd.read_csv("gene_knowledge_clean_phase1.csv")

# ==========================================
# 2. STANDARDIZE TEXT
# ==========================================
# Convert to Uppercase to ensure perfect matching
markers_df['gene'] = markers_df['gene'].astype(str).str.upper().str.strip()
knowledge_df['gene'] = knowledge_df['gene'].astype(str).str.upper().str.strip()

# ==========================================
# 3. MERGE (THE "VOTING" STEP)
# ==========================================
# Link marker genes to potential cell types
merged_df = pd.merge(markers_df, knowledge_df, on='gene', how='inner')

# ==========================================
# 4. CALCULATE SCORES
# ==========================================
# Logic: Count UNIQUE genes supporting each cell type per cluster
ranking_df = merged_df.groupby(['cluster', 'cell_type'])['gene'].nunique().reset_index()
ranking_df.rename(columns={'gene': 'support_score'}, inplace=True)

# ==========================================
# 5. RANK AND SLICE (TOP 20)
# ==========================================
# Sort by Cluster (asc) and Score (desc)
ranking_df = ranking_df.sort_values(by=['cluster', 'support_score'], ascending=[True, False])

# Extract Top 20 Candidates (instead of Top 5)
top_20_df = ranking_df.groupby('cluster').head(20)

# ==========================================
# 6. GENERATE DICTIONARY
# ==========================================
candidate_dict = {}

print("--- CANDIDATE_DICT (Top 20 per Cluster) ---")
print("CANDIDATE_DICT = {")

for cluster_id in sorted(top_20_df['cluster'].unique()):
    # Get the list of cell types
    types = top_20_df[top_20_df['cluster'] == cluster_id]['cell_type'].tolist()
    
    # Optional: Filter out 'Normal cell' here if you want cleaner lists
    # types = [t for t in types if t != 'Normal cell']
    
    # Adjust for 1-based indexing (Cluster 0 -> Cluster 1)
    display_id = cluster_id + 1
    
    # Store and Print in Python Dictionary format
    candidate_dict[display_id] = types
    print(f"    {display_id}: {types},")

print("}")

# Save to CSV for your records
top_20_df.to_csv("Supplementary_Table_Top20_Candidates.csv", index=False)

--- CANDIDATE_DICT (Top 20 per Cluster) ---
CANDIDATE_DICT = {
    1: ['T cells', 'B cells', 'Astrocytes', 'B cells naive', 'Cancer cell', 'Dendritic cells', 'Endothelial cells', 'Eosinophils', 'Macrophages', 'Monocytes', 'NK cells', 'Normal cell', 'Nuocytes', 'Plasmacytoid dendritic cells', 'Platelets', 'T helper cells', 'T regulatory cells'],
    2: ['Basal cells', 'Cholangiocytes', 'Hepatocytes', 'Acinar cells', 'Airway goblet cells', 'Astrocytes', 'Chromaffin cells', 'Epithelial cells', 'Epsilon cells', 'Fibroblasts', 'Germ cells', 'Mast cells', 'Neurons', 'Normal cell', 'Pulmonary alveolar type II cells'],
    3: ['Adipocytes', 'Dendritic cells', 'Normal cell', 'Acinar cells', 'Airway epithelial cells', 'Beta cells', 'Erythroid-like and erythroid precursor cells', 'Proximal tubule cells'],
    4: ['Normal cell', 'Beta cells', 'Cholangiocytes', 'Chondrocytes', 'Dendritic cells', 'Distal tubule cells', 'Ductal cells', 'Epithelial cells', 'Luminal epithelial cells', 'Neurons', 'Osteo