# Purpose: take the top marker genes of each cluster and, for every gene, look up its "description" (gene product description) in the RAG knowledge file, then save the marker table with that description attached.

In [1]:
import pandas as pd

In [None]:
# Input locations used by the loading cell below.
# NOTE(review): the leading "**" looks like a placeholder for a redacted base
# directory rather than a real folder name - confirm and replace before running.
# Per-cluster marker-gene table (CSV).
marker_per_cluster = "**/Phase2/data_processingPhase2_hsnmf/marker_gene_percluster_hsnmf_output/top_20_marker_gene_per_cluster_hsnmf.csv"
# RAG gene-knowledge table (CSV).
# NOTE(review): the "ß" right after "**" looks like a stray keystroke - confirm
# against the real path.
RAG_file = "**ß/Data_processing/Data_processing_output/RAG_output/gene_knowledge_clean_phase1.csv"


In [3]:
# 2. Load the data
# Read both CSVs into DataFrames: the per-cluster marker table and the RAG
# gene-knowledge table, from the paths configured in the previous cell.
df_markers = pd.read_csv(marker_per_cluster)
df_rag = pd.read_csv(RAG_file)

# Echo the column names of each table so the join/description columns chosen
# further down can be checked against what was actually loaded.
print("--- Data Loaded ---")
print(f"Marker File Columns: {list(df_markers.columns)}")
print(f"RAG File Columns: {list(df_rag.columns)}")

--- Data Loaded ---
Marker File Columns: ['cluster', 'cluster_size', 'gene', 'p_val', 'p_adj', 'mean_in', 'mean_out', 'log2FC', 'pct_in', 'pct_out']
RAG File Columns: ['gene', 'source', 'match_type', 'score', 'matched_col', 'cell_type', 'description']


In [4]:
# ==========================================
# Column-name settings (adjust if the input files use different headers).

# Column holding the gene symbol in the marker file (e.g. 'Gene', 'gene', 'symbol').
marker_gene_col = "gene"

# Column holding the gene symbol in the RAG file.
rag_gene_col = "gene"

# Column holding the product description in the RAG file.
desc_col = "description"
# ==========================================

In [8]:
# 3. Clean Gene Names for Matching
def _normalize_gene_key(genes: pd.Series) -> pd.Series:
    """Build a case-insensitive join key from a column of gene symbols.

    Each symbol is upper-cased and stripped of leading/trailing whitespace
    (interior characters are left alone), so "GeneA " matches "geneA".

    Missing symbols stay missing (NaN) instead of becoming the literal text
    "NAN", so rows without a gene name can never be matched to each other.
    """
    key = genes.astype(str).str.upper().str.strip()
    return key.where(genes.notna())


# The gene/description column names come from the configuration cell above
# (marker_gene_col, rag_gene_col, desc_col) rather than being hard-coded here.
df_markers['join_key'] = _normalize_gene_key(df_markers[marker_gene_col])
df_rag['join_key'] = _normalize_gene_key(df_rag[rag_gene_col])

# 4. Handle Duplicate Genes in RAG File
# The RAG file may hold several rows for the same gene (e.g. for different
# cell types). Collapsing to one row per key guarantees the merge below cannot
# multiply marker rows. groupby() skips rows whose key is NaN.
#
# NOTE(review): 'first' keeps the first non-null description in file order and
# ignores the RAG table's 'match_type' / 'score' columns. In the saved output
# LGALS3BP is annotated "BTB domain containing 17", which looks like another
# gene's product name - consider ranking rows by match quality before taking
# the first one (TODO: confirm how 'score' should be ordered).
df_rag_unique = (
    df_rag.groupby('join_key')
    .agg({
        desc_col: 'first',  # first non-null description per gene
        # Combine all associated cell types; astype(str) keeps the join from
        # failing if the column was parsed as a non-string dtype.
        'cell_type': lambda x: '; '.join(x.dropna().astype(str).unique()),
    })
    .reset_index()
)

# 5. Merge (Left Join)
# Map the RAG description onto every marker row. validate='many_to_one' makes
# pandas raise if the right-hand keys are ever not unique, and the helper
# 'join_key' column is dropped from the result straight away.
print("Merging data...")
merged_df = pd.merge(
    df_markers,
    df_rag_unique[['join_key', desc_col]],  # Selecting columns to add
    on='join_key',
    how='left',
    validate='many_to_one',
).drop(columns=['join_key'])
assert len(merged_df) == len(df_markers), "left join must not add or drop marker rows"

# 6. Save
# NOTE(review): absolute, user-specific path - consider deriving it from a
# configurable base directory so the notebook runs on other machines.
output_path = "/Users/mmahmud4/Library/CloudStorage/OneDrive-UniversityOfHouston/UH_PhD/Research/Genomics_Research/MDACC_UH/CPRIT_2025/NanoLLM_CellAnnotation/NanoLLM_local_Code/Phase2/data_processingPhase2_hsnmf/marker_gene_percluster_hsnmf_output/marker_genes_withRAG_info.csv"
merged_df.to_csv(output_path, index=False)

print(f"Success! Annotated file saved to:\n{output_path}")

# Preview the results
print("\nPreview of the first 5 rows:")
print(merged_df[['cluster', marker_gene_col, desc_col]].head())

Merging data...
Success! Annotated file saved to:
/Users/mmahmud4/Library/CloudStorage/OneDrive-UniversityOfHouston/UH_PhD/Research/Genomics_Research/MDACC_UH/CPRIT_2025/NanoLLM_CellAnnotation/NanoLLM_local_Code/Phase2/data_processingPhase2_hsnmf/marker_gene_percluster_hsnmf_output/marker_genes_withRAG_info.csv

Preview of the first 5 rows:
   cluster   gene                     description
0        0   IL7R          interleukin 7 receptor
1        0  CCL19   C-C motif chemokine ligand 19
2        0    CD2                    CD2 molecule
3        0   CCR7  C-C motif chemokine receptor 7
4        0    ITK     IL2 inducible T cell kinase


In [9]:
# Read the saved CSV back from disk so the checks in the following cells run
# against the file that was actually written.
output_df = pd.read_csv(output_path)
output_df

Unnamed: 0,cluster,cluster_size,gene,p_val,p_adj,mean_in,mean_out,log2FC,pct_in,pct_out,description
0,0,29921,IL7R,0.000000e+00,0.000000e+00,2.064286,0.241346,3.096460,0.361619,0.046990,interleukin 7 receptor
1,0,29921,CCL19,0.000000e+00,0.000000e+00,0.520981,0.064259,3.019245,0.099863,0.013523,C-C motif chemokine ligand 19
2,0,29921,CD2,0.000000e+00,0.000000e+00,0.490244,0.063933,2.938839,0.095518,0.013015,CD2 molecule
3,0,29921,CCR7,0.000000e+00,0.000000e+00,0.407275,0.058994,2.787334,0.079008,0.013722,C-C motif chemokine receptor 7
4,0,29921,ITK,0.000000e+00,0.000000e+00,0.389937,0.058155,2.745241,0.076435,0.012587,IL2 inducible T cell kinase
...,...,...,...,...,...,...,...,...,...,...,...
455,22,1886,TIMP1,3.438307e-231,1.031492e-229,0.132968,2.026496,-3.929829,0.025451,0.414656,TIMP metallopeptidase inhibitor 1
456,22,1886,DCDC2,4.252779e-230,1.200785e-228,0.361007,2.446550,-2.760647,0.062036,0.481703,doublecortin domain containing 2
457,22,1886,LGALS3BP,5.067424e-224,1.351313e-222,0.465804,2.421851,-2.378312,0.083245,0.512685,BTB domain containing 17
458,22,1886,CDH1,4.775344e-219,1.206403e-217,0.260354,2.087029,-3.002900,0.047720,0.442293,cadherin 1


In [11]:
# Total number of rows in the reloaded table.
total_enteries = output_df.shape[0]
total_enteries

460

In [12]:
# Number of distinct values in the 'gene' column. dropna=False counts a
# missing gene as one distinct value, the same way len(Series.unique()) does.
total_gene = output_df['gene'].nunique(dropna=False)
total_gene

259

In [13]:
# --- Question 2: Did every gene get a description? Is there NaN? ---

# Flag the rows whose description is NaN once, then derive both counts from
# that single mask.
is_missing = output_df['description'].isna()
missing_count = is_missing.sum()
found_count = (~is_missing).sum()

print(f"\n--- Description Coverage ---")
print(f"Rows with description:    {found_count}")
print(f"Rows MISSING description: {missing_count}")

if missing_count == 0:
    print("\nGreat! No NaN values. Every gene has a description.")
else:
    # Show exactly WHICH genes are missing a description (each gene once,
    # even if it is missing in several rows).
    print(f"\nList of genes that returned NaN (No description found):")
    missing_genes = output_df.loc[is_missing, 'gene'].unique()
    print(missing_genes)


--- Description Coverage ---
Rows with description:    435
Rows MISSING description: 25

List of genes that returned NaN (No description found):
['CYP8B1' 'AFM' 'HAO2' 'HAO1' 'ADH1A' 'APOF' 'F13B' 'ITIH1' 'ANXA9' 'WNK2'
 'CFHR5' 'EPHB6' 'GCNT4' 'C4BPA' 'HKDC1']
