## Modify `fantasia` output 
- **Input**: `results.csv` (output file from fantasia)  
- **Action**: Add two columns (Gene_ID and GOterm_depth)

In [1]:
# Import necessary libraries
import os
import sys
import re
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from itertools import combinations
import plotly.express as px
import plotly.graph_objects as go
from upsetplot import from_indicators, UpSet
from Bio import SeqIO
# Opcional / útil para estilos de plot
sns.set(style="whitegrid", context="notebook")

# Para mostrar los plots dentro del notebook
%matplotlib inline

In [2]:
# Define paths and read input files
proteome_info_path = '/Users/ingridmipe/Documents/UM/Browne_Lab/data/GenProteomes/Mle_2023_oneseq_proteome.csv'
proteome_fasta_path = '/Users/ingridmipe/Documents/UM/Browne_Lab/data/GenProteomes/Mle_2023_oneseq_proteome.fasta'

proteome_info = pd.read_csv(proteome_info_path)
proteome_sequences = list(SeqIO.parse(proteome_fasta_path, "fasta"))

proteome_info

Unnamed: 0,Gene_ID,Num_Transcripts,Transcripts_IDs,Transcript_Lengths,Difference_Length,Longest_Transcript,Length
0,g_16414,1,['anno2.3794_t'],[169],0,anno2.3794_t,169
1,g_666,1,['anno1.g689.t1'],[208],0,anno1.g689.t1,208
2,g_667,1,['anno1.g690.t1'],[255],0,anno1.g690.t1,255
3,g_16419,1,['anno2.3801_t'],[193],0,anno2.3801_t,193
4,g_668,2,"['anno1.g691.t1', 'anno2.MSTRG.14.2']","[715, 654]",61,anno1.g691.t1,715
...,...,...,...,...,...,...,...
15540,g_11239,2,"['anno1.g11650.t1', 'anno1.g11650.t2']","[149, 134]",15,anno1.g11650.t1,149
15541,g_11240,3,"['anno1.g11651.t1', 'anno1.g11651.t2', 'anno2....","[2514, 2516, 2556]",42,anno2.MSTRG.20491.3,2556
15542,g_11241,1,['anno1.g11652.t1'],[402],0,anno1.g11652.t1,402
15543,g_16809,1,['anno2.6432_t'],[144],0,anno2.6432_t,144


In [19]:
# Define paths and read input files from corresponding fantasia experiment
fantasia_results_path = '/Users/ingridmipe/Documents/UM/Browne_Lab/results/fantasia/experiments/AllCells_20251030164736_MarkerGenes/results.csv'
fantasia_results = pd.read_csv(fantasia_results_path)

fantasia_results

Unnamed: 0,accession,go_id,category,evidence_code,go_description,distance,model_name,protein_id,organism,gene_name,...,collapsed_support,n_collapsed_terms,collapsed_terms,identity,similarity,alignment_score,gaps_percentage,alignment_length,length_query,length_reference
0,anno1.g10.t1,GO:0000122,P,IGI,negative regulation of transcription by RNA po...,0.0119,ESM,A4LBC3_CAEEL,Caenorhabditis elegans.,"met-1 {ECO:0000313|EMBL:CCD65167.1, ECO:000031...",...,0,0,,17.0657,28.4223,-198,37.3849,1629,1059,1590
1,anno1.g10.t1,GO:0000242,C,IDA,pericentriolar material,0.0410,ESM3c,Q9VI72_DROME,Drosophila melanogaster (Fruit fly).,"Sas-4 {ECO:0000313|EMBL:AAF54053.2, ECO:000031...",...,0,0,,17.9765,33.8753,29,22.9449,1107,1059,901
2,anno1.g10.t1,GO:0001725,C,IDA,stress fiber,0.0300,Prost-T5,MPRIP_RAT,Rattus norvegicus (Rat).,Mprip,...,0,0,,18.9787,34.3830,82,22.2979,1175,1059,1029
3,anno1.g10.t1,GO:0005768,C,IDA,endosome,0.0425,Ankh3-Large,GGYF2_HUMAN,Homo sapiens (Human).,"GIGYF2 {ECO:0000303|PubMed:12771153, ECO:00003...",...,0,0,,20.2834,36.6145,266,24.1611,1341,1059,1299
4,anno1.g10.t1,GO:0005783,C,IDA,endoplasmic reticulum,0.0425,Ankh3-Large,GGYF2_HUMAN,Homo sapiens (Human).,"GIGYF2 {ECO:0000303|PubMed:12771153, ECO:00003...",...,0,0,,20.2834,36.6145,266,24.1611,1341,1059,1299
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
167876,anno2.MSTRG.9998.2,GO:0060298,P,IMP,positive regulation of sarcomere organization,0.1139,Prot-T5,PARV_CAEEL,Caenorhabditis elegans.,pat-6 {ECO:0000312|WormBase:T21D12.4},...,0,0,,34.1014,50.6912,531,14.2857,434,431,375
167877,anno2.MSTRG.9998.2,GO:0070252,P,IMP,actin-mediated cell contraction,0.0683,Prost-T5,PARVA_MOUSE,Mus musculus (Mouse).,Parva,...,0,0,,37.4429,53.4247,645,16.6667,438,431,372
167878,anno2.MSTRG.9998.2,GO:0071670,P,IMP,smooth muscle cell chemotaxis,0.0683,Prost-T5,PARVA_MOUSE,Mus musculus (Mouse).,Parva,...,0,0,,37.4429,53.4247,645,16.6667,438,431,372
167879,anno2.MSTRG.9998.2,GO:0071963,P,IMP,establishment or maintenance of cell polarity ...,0.0085,ESM,PARVB_HUMAN,Homo sapiens (Human).,PARVB,...,0,0,,34.4749,51.1416,597,18.4932,438,431,364


In [20]:
# Add a column before first column of fantasia_results with the column Gene_ID from proteome_info matching on 'accession' in fantasia_results and 'Longest_Transcript' in proteome_info
fantasia_results.insert(0, 'Gene_ID', fantasia_results['accession'].map(proteome_info.set_index('Longest_Transcript')['Gene_ID']))
fantasia_results

Unnamed: 0,Gene_ID,accession,go_id,category,evidence_code,go_description,distance,model_name,protein_id,organism,...,collapsed_support,n_collapsed_terms,collapsed_terms,identity,similarity,alignment_score,gaps_percentage,alignment_length,length_query,length_reference
0,g_8,anno1.g10.t1,GO:0000122,P,IGI,negative regulation of transcription by RNA po...,0.0119,ESM,A4LBC3_CAEEL,Caenorhabditis elegans.,...,0,0,,17.0657,28.4223,-198,37.3849,1629,1059,1590
1,g_8,anno1.g10.t1,GO:0000242,C,IDA,pericentriolar material,0.0410,ESM3c,Q9VI72_DROME,Drosophila melanogaster (Fruit fly).,...,0,0,,17.9765,33.8753,29,22.9449,1107,1059,901
2,g_8,anno1.g10.t1,GO:0001725,C,IDA,stress fiber,0.0300,Prost-T5,MPRIP_RAT,Rattus norvegicus (Rat).,...,0,0,,18.9787,34.3830,82,22.2979,1175,1059,1029
3,g_8,anno1.g10.t1,GO:0005768,C,IDA,endosome,0.0425,Ankh3-Large,GGYF2_HUMAN,Homo sapiens (Human).,...,0,0,,20.2834,36.6145,266,24.1611,1341,1059,1299
4,g_8,anno1.g10.t1,GO:0005783,C,IDA,endoplasmic reticulum,0.0425,Ankh3-Large,GGYF2_HUMAN,Homo sapiens (Human).,...,0,0,,20.2834,36.6145,266,24.1611,1341,1059,1299
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
167876,g_4909,anno2.MSTRG.9998.2,GO:0060298,P,IMP,positive regulation of sarcomere organization,0.1139,Prot-T5,PARV_CAEEL,Caenorhabditis elegans.,...,0,0,,34.1014,50.6912,531,14.2857,434,431,375
167877,g_4909,anno2.MSTRG.9998.2,GO:0070252,P,IMP,actin-mediated cell contraction,0.0683,Prost-T5,PARVA_MOUSE,Mus musculus (Mouse).,...,0,0,,37.4429,53.4247,645,16.6667,438,431,372
167878,g_4909,anno2.MSTRG.9998.2,GO:0071670,P,IMP,smooth muscle cell chemotaxis,0.0683,Prost-T5,PARVA_MOUSE,Mus musculus (Mouse).,...,0,0,,37.4429,53.4247,645,16.6667,438,431,372
167879,g_4909,anno2.MSTRG.9998.2,GO:0071963,P,IMP,establishment or maintenance of cell polarity ...,0.0085,ESM,PARVB_HUMAN,Homo sapiens (Human).,...,0,0,,34.4749,51.1416,597,18.4932,438,431,364


In [21]:
from goatools.obo_parser import GODag

# Cargar el archivo de ontología GO
go_obo_path = "/Users/ingridmipe/Documents/UM/Browne_Lab/data/go-basic.obo"  # Asegúrate de descargar este archivo y proporcionar la ruta correcta
go_dag = GODag(go_obo_path)

# Función para obtener la profundidad de un término GO
def get_go_depth(go_id):
    go_term = go_dag.get(go_id)
    return go_term.depth if go_term else None

# Calcular la profundidad para cada término GO en la columna 'go_id'
fantasia_results['GOterm_depth'] = fantasia_results['go_id'].apply(get_go_depth)

# Colocar la nueva columna 'GOterm_depth' después de 'go_id'
go_id_index = fantasia_results.columns.get_loc('go_id')
cols = fantasia_results.columns.tolist()
cols.insert(go_id_index + 1, cols.pop(cols.index('GOterm_depth')))
fantasia_results = fantasia_results[cols]

fantasia_results

/Users/ingridmipe/Documents/UM/Browne_Lab/data/go-basic.obo: fmt(1.2) rel(2025-07-22) 43,230 Terms


Unnamed: 0,Gene_ID,accession,go_id,GOterm_depth,category,evidence_code,go_description,distance,model_name,protein_id,...,collapsed_support,n_collapsed_terms,collapsed_terms,identity,similarity,alignment_score,gaps_percentage,alignment_length,length_query,length_reference
0,g_8,anno1.g10.t1,GO:0000122,11.0,P,IGI,negative regulation of transcription by RNA po...,0.0119,ESM,A4LBC3_CAEEL,...,0,0,,17.0657,28.4223,-198,37.3849,1629,1059,1590
1,g_8,anno1.g10.t1,GO:0000242,2.0,C,IDA,pericentriolar material,0.0410,ESM3c,Q9VI72_DROME,...,0,0,,17.9765,33.8753,29,22.9449,1107,1059,901
2,g_8,anno1.g10.t1,GO:0001725,4.0,C,IDA,stress fiber,0.0300,Prost-T5,MPRIP_RAT,...,0,0,,18.9787,34.3830,82,22.2979,1175,1059,1029
3,g_8,anno1.g10.t1,GO:0005768,7.0,C,IDA,endosome,0.0425,Ankh3-Large,GGYF2_HUMAN,...,0,0,,20.2834,36.6145,266,24.1611,1341,1059,1299
4,g_8,anno1.g10.t1,GO:0005783,5.0,C,IDA,endoplasmic reticulum,0.0425,Ankh3-Large,GGYF2_HUMAN,...,0,0,,20.2834,36.6145,266,24.1611,1341,1059,1299
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
167876,g_4909,anno2.MSTRG.9998.2,GO:0060298,10.0,P,IMP,positive regulation of sarcomere organization,0.1139,Prot-T5,PARV_CAEEL,...,0,0,,34.1014,50.6912,531,14.2857,434,431,375
167877,g_4909,anno2.MSTRG.9998.2,GO:0070252,4.0,P,IMP,actin-mediated cell contraction,0.0683,Prost-T5,PARVA_MOUSE,...,0,0,,37.4429,53.4247,645,16.6667,438,431,372
167878,g_4909,anno2.MSTRG.9998.2,GO:0071670,6.0,P,IMP,smooth muscle cell chemotaxis,0.0683,Prost-T5,PARVA_MOUSE,...,0,0,,37.4429,53.4247,645,16.6667,438,431,372
167879,g_4909,anno2.MSTRG.9998.2,GO:0071963,7.0,P,IMP,establishment or maintenance of cell polarity ...,0.0085,ESM,PARVB_HUMAN,...,0,0,,34.4749,51.1416,597,18.4932,438,431,364


In [22]:
# Save the updated fantasia_results to a new CSV file
output_path = '/Users/ingridmipe/Documents/UM/Browne_Lab/results/fantasia/experiments/AllCells_20251030164736_MarkerGenes/results_AllCells-MarkerGenes_GeneID-GOdepth.csv'
fantasia_results.to_csv(output_path, index=False)
