In [3]:
from transformers import pipeline

# Zero-shot classification pipeline (facebook/bart-large-mnli, PyTorch framework).
# Later cells call this object through the module-level name `classifier`.
classifier = pipeline(task="zero-shot-classification", model="facebook/bart-large-mnli", framework="pt")

# Smoke test: classify a single description against a small label set.
categories = [
    "Cancer",
    "Metabolism",
    "Immune Response",
    "Neurodegenerative",
    "Other",
]
description = "Tumor suppressor gene involved in preventing cancer."

result = classifier(description, candidate_labels=categories)
print(result["labels"][0])  # Top category


  from .autonotebook import tqdm as notebook_tqdm
Device set to use cpu


Cancer


In [18]:
import pandas as pd
from tqdm import tqdm

# Register tqdm's progress_apply on pandas objects (used by the chunked runs below).
tqdm.pandas()

# Small hand-written example table: gene symbols with free-text descriptions.
data = {
    "Gene Name": ["TP53", "INS", "APOE", "IL6"],
    "Gene Description": [
        "Tumor suppressor gene involved in preventing cancer.",
        "Hormone that regulates glucose metabolism.",
        "Gene associated with Alzheimer's and lipid metabolism.",
        "Cytokine that plays a role in immune response.",
    ],
}
df = pd.DataFrame(data)

# Candidate labels passed to the zero-shot classifier by classify_gene.
categories = [
    "Cancer",
    "Metabolism",
    "Immune",
    "Neurodegenerative",
    "Signal Transduction",
    "Cell Cycle Regulation",
    "Unknown function",
    "Other",
]

# Function to classify a single gene description
def classify_gene(gene, description, msigdb_genes, dgidb_genes, half_done_df):
    """Return the classification of one gene as a three-element Series.

    The Series holds, in order: predicted category, category scores, all categories.

    * A gene found in neither ``msigdb_genes`` nor ``dgidb_genes`` is not
      classified; the Series is ``["Not in DGIDB or MSigDB", None, None]``.
    * Otherwise, if ``half_done_df`` has at least one row whose ``Symbol``
      equals ``gene``, the ``Predicted_Category``, ``Category_Scores`` and
      ``All_Categories`` values of the first such row are returned.
    * Otherwise ``description`` is passed to the module-level ``classifier``
      with the module-level ``categories`` as candidate labels.
    """
    in_reference_sets = gene in msigdb_genes or gene in dgidb_genes
    if not in_reference_sets:
        return pd.Series(["Not in DGIDB or MSigDB", None, None])

    # Reuse a stored result when one exists; only the first match is used.
    cached_rows = half_done_df[half_done_df['Symbol'] == gene]
    if not cached_rows.empty:
        first_hit = cached_rows.iloc[0]
        return pd.Series([
            first_hit['Predicted_Category'],
            first_hit['Category_Scores'],
            first_hit['All_Categories'],
        ])

    # No stored result: run the zero-shot classifier on the description.
    result = classifier(description, candidate_labels=categories)
    return pd.Series([result["labels"][0], result["scores"], result["labels"]])


# # Apply classification to all rows
# df[["Predicted Category", "Category Scores", "All Categories"]] = df["Gene Description"].progress_apply(classify_gene)

# # Display Results
# print(df)

In [5]:

# Load the gzip-compressed, tab-separated gene_info file into a DataFrame.
NCBI_INFO = pd.read_csv(
    "./Homo_sapiens.gene_info.gz",
    sep="\t",
    compression="gzip",
)

In [6]:
# Preview the first rows to check which columns were parsed.
NCBI_INFO.head()    

Unnamed: 0,#tax_id,GeneID,Symbol,LocusTag,Synonyms,dbXrefs,chromosome,map_location,description,type_of_gene,Symbol_from_nomenclature_authority,Full_name_from_nomenclature_authority,Nomenclature_status,Other_designations,Modification_date,Feature_type
0,9606,1,A1BG,-,A1B|ABG|GAB|HYST2477,MIM:138670|HGNC:HGNC:5|Ensembl:ENSG00000121410...,19,19q13.43,alpha-1-B glycoprotein,protein-coding,A1BG,alpha-1-B glycoprotein,O,alpha-1B-glycoprotein|HEL-S-163pA|epididymis s...,20250326,-
1,9606,2,A2M,-,A2MD|CPAMD5|FWP007|S863-7,MIM:103950|HGNC:HGNC:7|Ensembl:ENSG00000175899...,12,12p13.31,alpha-2-macroglobulin,protein-coding,A2M,alpha-2-macroglobulin,O,alpha-2-macroglobulin|C3 and PZP-like alpha-2-...,20250326,-
2,9606,3,A2MP1,-,A2MP,HGNC:HGNC:8|Ensembl:ENSG00000291190|AllianceGe...,12,12p13.31,alpha-2-macroglobulin pseudogene 1,pseudo,A2MP1,alpha-2-macroglobulin pseudogene 1,O,pregnancy-zone protein pseudogene,20250304,-
3,9606,9,NAT1,-,AAC1|MNAT|NAT-1|NATI,MIM:108345|HGNC:HGNC:7645|Ensembl:ENSG00000171...,8,8p22,N-acetyltransferase 1,protein-coding,NAT1,N-acetyltransferase 1,O,arylamine N-acetyltransferase 1|N-acetyltransf...,20250326,-
4,9606,10,NAT2,-,AAC2|NAT-2|PNAT,MIM:612182|HGNC:HGNC:7646|Ensembl:ENSG00000156...,8,8p22,N-acetyltransferase 2,protein-coding,NAT2,N-acetyltransferase 2,O,arylamine N-acetyltransferase 2|N-acetyltransf...,20250326,-


In [7]:
# Keep the rows whose #tax_id is 9606 and only the three columns used downstream.
NCBI_9606_filtered = NCBI_INFO.loc[
    NCBI_INFO["#tax_id"] == 9606,
    ["GeneID", "Symbol", "description"],
]
print(NCBI_9606_filtered.head())  # Display the first few rows

   GeneID Symbol                         description
0       1   A1BG              alpha-1-B glycoprotein
1       2    A2M               alpha-2-macroglobulin
2       3  A2MP1  alpha-2-macroglobulin pseudogene 1
3       9   NAT1               N-acetyltransferase 1
4      10   NAT2               N-acetyltransferase 2


In [9]:
# Full-table classification, written to CSV chunk by chunk so an interrupted
# run can be resumed by changing start_row.
chunk_size = 5000
# Resume offset. NOTE(review): presumably rows before this offset were already
# written to output_file by an earlier run -- confirm against the CSV.
start_row = 40000
total_rows = len(NCBI_9606_filtered)
print(total_rows)
output_file = "NCBI_9606_classified.csv"


def classify_description(description):
    """Classify one description with the module-level zero-shot classifier.

    Returns a three-element Series: the first label of the result, the list
    of scores, and the list of labels.
    """
    result = classifier(description, candidate_labels=categories)
    return pd.Series([result["labels"][0], result["scores"], result["labels"]])


# Process in chunks and append results
for i in range(start_row, total_rows, chunk_size):
    chunk_num = i // chunk_size + 1  # 1-based chunk number, for the log lines
    print(f"Processing chunk {chunk_num}...")

    chunk = NCBI_9606_filtered.iloc[i : i + chunk_size].copy()  # Extract chunk

    # A Series-wise apply passes a single value, so classify_gene (which takes
    # five arguments) cannot be used here; the one-argument helper is used instead.
    chunk[["Predicted_Category", "Category_Scores", "All_Categories"]] = chunk["description"].progress_apply(classify_description)

    # Append to the file; the header is written only by the chunk starting at row 0.
    chunk.to_csv(output_file, mode='a', header=(i == 0), index=False)

    print(f"Chunk {chunk_num} saved.")

print("Processing complete! All results saved to", output_file)

Processing chunk 9...


100%|██████████| 5000/5000 [2:49:36<00:00,  2.04s/it]      


Chunk 9 saved.
Processing chunk 10...


100%|██████████| 5000/5000 [11:27:07<00:00,  8.25s/it]       


Chunk 10 saved.
Processing chunk 11...


100%|██████████| 5000/5000 [1:12:50<00:00,  1.14it/s]


Chunk 11 saved.
Processing chunk 12...


100%|██████████| 5000/5000 [51:29<00:00,  1.62it/s] 


Chunk 12 saved.
Processing chunk 13...


100%|██████████| 5000/5000 [49:57<00:00,  1.67it/s] 


Chunk 13 saved.
Processing chunk 14...


100%|██████████| 5000/5000 [47:19<00:00,  1.76it/s]


Chunk 14 saved.
Processing chunk 15...


100%|██████████| 5000/5000 [49:18<00:00,  1.69it/s]


Chunk 15 saved.
Processing chunk 16...


100%|██████████| 5000/5000 [1:09:02<00:00,  1.21it/s]


Chunk 16 saved.
Processing chunk 17...


100%|██████████| 5000/5000 [1:12:10<00:00,  1.15it/s]


Chunk 17 saved.
Processing chunk 18...


100%|██████████| 5000/5000 [1:13:04<00:00,  1.14it/s]


Chunk 18 saved.
Processing chunk 19...


100%|██████████| 5000/5000 [1:12:36<00:00,  1.15it/s]


Chunk 19 saved.
Processing chunk 20...


100%|██████████| 5000/5000 [1:13:21<00:00,  1.14it/s]


Chunk 20 saved.
Processing chunk 21...


100%|██████████| 5000/5000 [7:19:22<00:00,  5.27s/it]       


Chunk 21 saved.
Processing chunk 22...


100%|██████████| 5000/5000 [27:36:22<00:00, 19.88s/it]        


Chunk 22 saved.
Processing chunk 23...


100%|██████████| 5000/5000 [2:56:22<00:00,  2.12s/it]  


Chunk 23 saved.
Processing chunk 24...


 68%|██████▊   | 3406/5000 [11:22:10<5:19:15, 12.02s/it]     


KeyboardInterrupt: 

In [13]:
# New strategy:
# Only classify a gene if it appears in MSigDB or DGIdb.
# If the gene was already classified in the earlier run, reuse that result.
import json
# Distinct values of the DGIdb table's gene_name column, kept as a set for membership tests.
DGIDB = pd.read_csv("../Data/DGIDB/converted/human/dgidb_ncbi_v2.csv")
dgidb_genes = set(DGIDB["gene_name"])

# Cell output: number of distinct gene names.
len(dgidb_genes)

4774

In [12]:
# Load the JSON mapping and collect every gene listed under any of its keys.
with open("../Data/MSigDB/converted/v1_pathway.json", "r") as file:
    MSIGDB = json.load(file)

# Kept as a set rather than a list: classify_gene evaluates `gene in msigdb_genes`
# once per row, and on a list each of those checks is a linear scan.
msigdb_genes = {gene for genes in MSIGDB.values() for gene in genes}

# Cell output: number of distinct genes.
len(msigdb_genes)

22192

In [14]:
# Results written by the earlier full-table run (its output_file has this name);
# classify_gene looks genes up in this frame by the Symbol column.
half_done_df = pd.read_csv("./NCBI_9606_classified.csv")

In [19]:
# Classify only genes present in MSigDB or DGIdb (see classify_gene), reusing
# stored results from half_done_df where available. Output is written per chunk.
chunk_size = 5000
total_rows = len(NCBI_9606_filtered)
print(total_rows)
output_file = "NCBI_9606_DGIDB_MSIGDB_genes_ONLY.csv"

# Process in chunks, starting from the first row
for i in range(0, total_rows, chunk_size):
    chunk_num = i // chunk_size + 1  # 1-based chunk number, for the log lines
    print(f"Processing chunk {chunk_num}...")

    chunk = NCBI_9606_filtered.iloc[i : i + chunk_size].copy()  # Extract chunk

    # Row-wise apply: classify_gene needs both the symbol and the description.
    chunk[["Predicted_Category", "Category_Scores", "All_Categories"]] = chunk.progress_apply(
        lambda row: classify_gene(row["Symbol"], row["description"], msigdb_genes, dgidb_genes, half_done_df), axis=1)

    # The first chunk starts the file afresh (with header) so that re-running
    # this cell does not append a second copy of the data; later chunks append.
    is_first_chunk = i == 0
    chunk.to_csv(output_file, mode="w" if is_first_chunk else "a", header=is_first_chunk, index=False)

    print(f"Chunk {chunk_num} saved.")

print("Processing complete! All results saved to", output_file)

193438
Processing chunk 1...


100%|██████████| 5000/5000 [00:33<00:00, 148.11it/s]


Chunk 1 saved.
Processing chunk 2...


100%|██████████| 5000/5000 [00:40<00:00, 124.04it/s]


Chunk 2 saved.
Processing chunk 3...


100%|██████████| 5000/5000 [00:34<00:00, 143.74it/s]


Chunk 3 saved.
Processing chunk 4...


100%|██████████| 5000/5000 [00:31<00:00, 160.10it/s]


Chunk 4 saved.
Processing chunk 5...


100%|██████████| 5000/5000 [00:19<00:00, 257.54it/s]


Chunk 5 saved.
Processing chunk 6...


100%|██████████| 5000/5000 [00:08<00:00, 584.94it/s] 


Chunk 6 saved.
Processing chunk 7...


100%|██████████| 5000/5000 [00:03<00:00, 1311.05it/s]


Chunk 7 saved.
Processing chunk 8...


100%|██████████| 5000/5000 [00:06<00:00, 783.76it/s] 


Chunk 8 saved.
Processing chunk 9...


100%|██████████| 5000/5000 [00:03<00:00, 1613.77it/s]


Chunk 9 saved.
Processing chunk 10...


100%|██████████| 5000/5000 [00:02<00:00, 1788.60it/s]


Chunk 10 saved.
Processing chunk 11...


100%|██████████| 5000/5000 [00:03<00:00, 1412.77it/s]


Chunk 11 saved.
Processing chunk 12...


100%|██████████| 5000/5000 [00:02<00:00, 2336.97it/s]


Chunk 12 saved.
Processing chunk 13...


100%|██████████| 5000/5000 [00:01<00:00, 2741.70it/s]


Chunk 13 saved.
Processing chunk 14...


100%|██████████| 5000/5000 [00:01<00:00, 2539.54it/s]


Chunk 14 saved.
Processing chunk 15...


100%|██████████| 5000/5000 [00:02<00:00, 2380.17it/s]


Chunk 15 saved.
Processing chunk 16...


100%|██████████| 5000/5000 [00:01<00:00, 2738.53it/s]


Chunk 16 saved.
Processing chunk 17...


100%|██████████| 5000/5000 [00:02<00:00, 2490.89it/s]


Chunk 17 saved.
Processing chunk 18...


100%|██████████| 5000/5000 [00:01<00:00, 2957.43it/s]


Chunk 18 saved.
Processing chunk 19...


100%|██████████| 5000/5000 [00:01<00:00, 2818.28it/s]


Chunk 19 saved.
Processing chunk 20...


100%|██████████| 5000/5000 [00:02<00:00, 2487.35it/s]


Chunk 20 saved.
Processing chunk 21...


100%|██████████| 5000/5000 [00:01<00:00, 2743.09it/s]


Chunk 21 saved.
Processing chunk 22...


100%|██████████| 5000/5000 [00:01<00:00, 2780.85it/s]


Chunk 22 saved.
Processing chunk 23...


100%|██████████| 5000/5000 [00:02<00:00, 2347.61it/s]


Chunk 23 saved.
Processing chunk 24...


100%|██████████| 5000/5000 [00:01<00:00, 2611.48it/s]


Chunk 24 saved.
Processing chunk 25...


100%|██████████| 5000/5000 [00:01<00:00, 2955.09it/s]


Chunk 25 saved.
Processing chunk 26...


100%|██████████| 5000/5000 [00:01<00:00, 2525.06it/s]


Chunk 26 saved.
Processing chunk 27...


100%|██████████| 5000/5000 [00:01<00:00, 2757.29it/s]


Chunk 27 saved.
Processing chunk 28...


100%|██████████| 5000/5000 [00:01<00:00, 2725.64it/s]


Chunk 28 saved.
Processing chunk 29...


100%|██████████| 5000/5000 [00:11<00:00, 430.73it/s] 


Chunk 29 saved.
Processing chunk 30...


100%|██████████| 5000/5000 [00:04<00:00, 1207.48it/s]


Chunk 30 saved.
Processing chunk 31...


100%|██████████| 5000/5000 [00:01<00:00, 3180.71it/s]


Chunk 31 saved.
Processing chunk 32...


100%|██████████| 5000/5000 [00:01<00:00, 2595.06it/s]


Chunk 32 saved.
Processing chunk 33...


100%|██████████| 5000/5000 [00:01<00:00, 2956.21it/s]


Chunk 33 saved.
Processing chunk 34...


100%|██████████| 5000/5000 [00:01<00:00, 3018.90it/s]


Chunk 34 saved.
Processing chunk 35...


100%|██████████| 5000/5000 [00:02<00:00, 2318.42it/s]


Chunk 35 saved.
Processing chunk 36...


100%|██████████| 5000/5000 [00:01<00:00, 2746.96it/s]


Chunk 36 saved.
Processing chunk 37...


100%|██████████| 5000/5000 [00:01<00:00, 2975.76it/s]


Chunk 37 saved.
Processing chunk 38...


100%|██████████| 5000/5000 [00:02<00:00, 2358.60it/s]


Chunk 38 saved.
Processing chunk 39...


100%|██████████| 3438/3438 [00:03<00:00, 1032.26it/s]

Chunk 39 saved.
Processing complete! All results saved to NCBI_9606_DGIDB_MSIGDB_genes_ONLY.csv



