## Gene Ontology Analysis

in terminal:

```pip install pandas goatools biopython```

installs gene set enrichment analysis (GSEApy)

```pip install gseapy```

In [3]:
import pandas as pd
import gseapy as gp
import matplotlib.pyplot as plt

#### starting with significant genes from warm vs. control for phase 1 (ignoring phase 2)
sig_p1_wc_genes.csv

In [28]:
# reading in csv file of sig genes
sig_p1wc = pd.read_csv("/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/CE_MethylRAD_analysis_2018/CE_methyl_analysis/significant_genes/sig_p1_wc_genes.csv")
sig_p1wc.head()

Unnamed: 0.1,Unnamed: 0,Row.names,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,BBB-WBO-B21,BBB-WBV-B70,...,WPV-BPR-G11,seqname,source,feature,start,end,score,strand,frame,attributes
0,1,LOC111099115,46.830689,1.788586,0.388555,4.60317,4e-06,0.0002,137,141,...,5,NC_035780.1,Gnomon,gene,1840607,1842557,.,-,.,"gene_id ""LOC111103177""; db_xref ""GeneID:111103..."
1,2,LOC111099216,29.511649,1.482861,0.440151,3.368985,0.000754,0.01255,61,59,...,1,NC_035780.1,Gnomon,gene,3517611,3518657,.,+,.,"gene_id ""LOC111132154""; db_xref ""GeneID:111132..."
2,3,LOC111099417,51.700107,2.748743,0.687936,3.995637,6.5e-05,0.001957,101,98,...,3,NC_035780.1,Gnomon,gene,3920144,3928549,.,+,.,"gene_id ""LOC111112068""; db_xref ""GeneID:111112..."
3,4,LOC111099424,16.12747,3.438445,0.826965,4.157907,3.2e-05,0.001075,57,50,...,2,NC_035780.1,Gnomon,gene,4012455,4013027,.,+,.,"gene_id ""LOC111130177""; db_xref ""GeneID:111130..."
4,5,LOC111099722,593.6223,0.899352,0.23246,3.868847,0.000109,0.00287,1591,1713,...,153,NC_035780.1,Gnomon,gene,8951007,8951628,.,-,.,"gene_id ""LOC111114945""; db_xref ""GeneID:111114..."


In [29]:
# cleaning data frame
sig_p1wc.columns
del sig_p1wc['Unnamed: 0']
sig_p1wc.head()

Unnamed: 0,Row.names,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,BBB-WBO-B21,BBB-WBV-B70,BBO-BBO-B16,...,WPV-BPR-G11,seqname,source,feature,start,end,score,strand,frame,attributes
0,LOC111099115,46.830689,1.788586,0.388555,4.60317,4e-06,0.0002,137,141,70,...,5,NC_035780.1,Gnomon,gene,1840607,1842557,.,-,.,"gene_id ""LOC111103177""; db_xref ""GeneID:111103..."
1,LOC111099216,29.511649,1.482861,0.440151,3.368985,0.000754,0.01255,61,59,55,...,1,NC_035780.1,Gnomon,gene,3517611,3518657,.,+,.,"gene_id ""LOC111132154""; db_xref ""GeneID:111132..."
2,LOC111099417,51.700107,2.748743,0.687936,3.995637,6.5e-05,0.001957,101,98,62,...,3,NC_035780.1,Gnomon,gene,3920144,3928549,.,+,.,"gene_id ""LOC111112068""; db_xref ""GeneID:111112..."
3,LOC111099424,16.12747,3.438445,0.826965,4.157907,3.2e-05,0.001075,57,50,10,...,2,NC_035780.1,Gnomon,gene,4012455,4013027,.,+,.,"gene_id ""LOC111130177""; db_xref ""GeneID:111130..."
4,LOC111099722,593.6223,0.899352,0.23246,3.868847,0.000109,0.00287,1591,1713,1323,...,153,NC_035780.1,Gnomon,gene,8951007,8951628,.,-,.,"gene_id ""LOC111114945""; db_xref ""GeneID:111114..."


In [30]:
sig_p1wc.columns

Index(['Row.names', 'baseMean', 'log2FoldChange', 'lfcSE', 'stat', 'pvalue',
       'padj', 'BBB-WBO-B21', 'BBB-WBV-B70', 'BBO-BBO-B16', 'BBO-BBY-B27',
       'BBO-WBO-B16', 'BBO-WBV-B64', 'BBR-BBB-B50', 'BBR-BBG-B38',
       'BBR-BBY-B26', 'BBY-WBG-B42', 'BPO-BPO-O16', 'BPR-BPG-O38',
       'BPR-BPR-O02', 'BPY-BPG-O42', 'BPY-BPY-O29', 'WBB-WBV-W69',
       'WBG-BBB-W56', 'WBG-WBG-W44', 'WBO-BBR-W03', 'WBO-WBV-W64',
       'WBR-BBY-W25', 'WBV-WBO-W23', 'WBV-WBR-W12', 'WBY-BBV-W65',
       'WBY-BBY-W30', 'WPB-BPG-G45', 'WPO-BPO-G16', 'WPO-BPY-G28',
       'WPR-BPY-G25', 'WPV-BPR-G11', 'seqname', 'source', 'feature', 'start',
       'end', 'score', 'strand', 'frame', 'attributes'],
      dtype='object')

In [31]:
samples = ['BBB-WBO-B21', 'BBB-WBV-B70', 'BBO-BBO-B16', 'BBO-BBY-B27',
       'BBO-WBO-B16', 'BBO-WBV-B64', 'BBR-BBB-B50', 'BBR-BBG-B38',
       'BBR-BBY-B26', 'BBY-WBG-B42', 'BPO-BPO-O16', 'BPR-BPG-O38',
       'BPR-BPR-O02', 'BPY-BPG-O42', 'BPY-BPY-O29', 'WBB-WBV-W69',
       'WBG-BBB-W56', 'WBG-WBG-W44', 'WBO-BBR-W03', 'WBO-WBV-W64',
       'WBR-BBY-W25', 'WBV-WBO-W23', 'WBV-WBR-W12', 'WBY-BBV-W65',
       'WBY-BBY-W30', 'WPB-BPG-G45', 'WPO-BPO-G16', 'WPO-BPY-G28',
       'WPR-BPY-G25', 'WPV-BPR-G11']

In [33]:
# Import necessary libraries
import pandas as pd
from goatools import obo_parser
from goatools.associations import read_ncbi_gene2go
from goatools.test_data.genes_NCBI_9606_ProteinCoding import GeneID2nt as GeneID2nt_hsa
from goatools.go_enrichment import GOEnrichmentStudy

# Load your accession numbers and counts matrix into a DataFrame
accession_numbers = sig_p1wc['Row.names']
counts_matrix = sig_p1wc[samples]

# Load gene annotation (GAF) file
obo_dag = obo_parser.GODag("go-basic.obo")  # Replace with the actual path to the go-basic.obo file
gene2go = read_ncbi_gene2go("gene2go", taxids=[6565])  # Replace with the appropriate taxid for your organism

# Map accession numbers to gene symbols using a suitable method
# For example, you can use the Biopython library to retrieve gene symbols
from Bio import Entrez

def get_gene_symbol(accession_number):
    handle = Entrez.efetch(db="gene", id=accession_number, retmode="xml")
    record = Entrez.read(handle)
    return record[0]['Entrezgene_gene']['Gene-ref']['Gene-ref_locus']

gene_symbols = [get_gene_symbol(accession) for accession in accession_numbers]

# Create a mapping between gene symbols and counts
gene_counts = pd.DataFrame({'GeneSymbol': gene_symbols})
gene_counts = gene_counts.merge(counts_matrix, left_on='GeneSymbol', right_index=True)

# Perform GO enrichment analysis
pop = GeneID2nt_hsa.keys()  # Use the appropriate population set for your organism
assoc = gene2go.values()
goeaobj = GOEnrichmentStudy(pop, assoc, obo_dag, propagate_counts=True, alpha=0.05, methods=['fdr_bh'])

# Run the analysis
geneids_study = gene_counts.index.tolist()
goea_results_all = goeaobj.run_study(geneids_study)

# Print or visualize the results
for item in goea_results_all:
    print(item)


ImportError: cannot import name 'GeneID2nt' from 'goatools.test_data.genes_NCBI_9606_ProteinCoding' (/home/julia_mcdonough_student_uml_edu/.local/lib/python3.11/site-packages/goatools/test_data/genes_NCBI_9606_ProteinCoding.py)

In [35]:
# Import necessary libraries
import pandas as pd
from goatools import obo_parser
from goatools.associations import read_ncbi_gene2go
from goatools.go_enrichment import GOEnrichmentStudy

# Load your accession numbers and counts matrix into a DataFrame
accession_numbers = sig_p1wc['Row.names']
counts_matrix = sig_p1wc[samples]

# Load gene annotation (GAF) file
obo_dag = obo_parser.GODag("/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/GO_analysis/go-basic.obo")  # Replace with the actual path to the go-basic.obo file
gene2go = read_ncbi_gene2go("gene2go", taxids=[6565])  # Replace with the appropriate taxid for your organism

# Map accession numbers to gene symbols using a suitable method
# For example, you can use the Biopython library to retrieve gene symbols
from Bio import Entrez

def get_gene_symbol(accession_number):
    handle = Entrez.efetch(db="gene", id=accession_number, retmode="xml")
    record = Entrez.read(handle)
    return record[0]['Entrezgene_gene']['Gene-ref']['Gene-ref_locus']

gene_symbols = [get_gene_symbol(accession) for accession in accession_numbers]

# Create a mapping between gene symbols and counts
gene_counts = pd.DataFrame({'GeneSymbol': gene_symbols})
gene_counts = gene_counts.merge(counts_matrix, left_on='GeneSymbol', right_index=True)

# Get the population set directly from gene2go
pop = set(gene2go.keys())

# Perform GO enrichment analysis
assoc = gene2go.values()
goeaobj = GOEnrichmentStudy(pop, assoc, obo_dag, propagate_counts=True, alpha=0.05, methods=['fdr_bh'])

# Run the analysis
geneids_study = gene_counts.index.tolist()
goea_results_all = goeaobj.run_study(geneids_study)

# Print or visualize the results
for item in goea_results_all:
    print(item)

/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/GO_analysis/go-basic.obo: fmt(1.2) rel(2023-11-15) 46,228 Terms
DEPRECATED read_ncbi_gene2go: USE Gene2GoReader FROM goatools.anno.genetogo_reader
DEPRECATED read_ncbi_gene2go CALLED FROM: /tmp/ipykernel_3938201/3184945840.py BY <module>
-1
 0) tax_id        
 1) DB_ID         
 2) GO_ID         
 3) Evidence_Code 
 4) Qualifier     
 5) GO_term       
 6) DB_Reference  
 7) NS            


Traceback (most recent call last):
  File "/home/julia_mcdonough_student_uml_edu/.local/lib/python3.11/site-packages/goatools/anno/init/reader_genetogo.py", line 63, in init_associations
    with open(fin_anno) as ifstrm:
         ^^^^^^^^^^^^^^
FileNotFoundError: [Errno 2] No such file or directory: 'gene2go'

  **FATAL: [Errno 2] No such file or directory: 'gene2go'

**FATAL: gene2go[-1]:
								

AssertionError: 

In [None]:
# Import necessary libraries
import pandas as pd
from goatools import obo_parser
from goatools.associations import read_ncbi_gene2go
from goatools.go_enrichment import GOEnrichmentStudy

# Load your accession numbers and counts matrix into a DataFrame
# Assuming sig_p1wc is a DataFrame with 'Row.names' and 'samples' columns
accession_numbers = sig_p1wc['Row.names']
counts_matrix = sig_p1wc[samples]

# Load gene annotation (GAF) file
obo_dag = obo_parser.GODag("/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/GO_analysis/go-basic.obo")  # Replace with the actual path to the go-basic.obo file

# Download gene2go file for Crassostrea virginica
# Replace "path/to/crassostrea_gene2go.txt" with the actual path where you save the file
# You can find the file on the Gene Ontology website or other relevant databases.
gene2go_file = "path/to/crassostrea_gene2go.txt"
gene2go = read_ncbi_gene2go(gene2go_file, taxids=[29159]) 

# Map accession numbers to gene symbols using a suitable method
# For example, you can use the Biopython library to retrieve gene symbols
from Bio import Entrez

def get_gene_symbol(accession_number):
    handle = Entrez.efetch(db="gene", id=accession_number, retmode="xml")
    record = Entrez.read(handle)
    return record[0]['Entrezgene_gene']['Gene-ref']['Gene-ref_locus']

gene_symbols = [get_gene_symbol(accession) for accession in accession_numbers]

# Create a mapping between gene symbols and counts
gene_counts = pd.DataFrame({'GeneSymbol': gene_symbols})
gene_counts = gene_counts.merge(counts_matrix, left_on='GeneSymbol', right_index=True)

# Get the population set directly from gene2go
pop = set(gene2go.keys())

# Perform GO enrichment analysis
assoc = gene2go.values()
goeaobj = GOEnrichmentStudy(pop, assoc, obo_dag, propagate_counts=True, alpha=0.05, methods=['fdr_bh'])

# Run the analysis
geneids_study = gene_counts.index.tolist()
goea_results_all = goeaobj.run_study(geneids_study)

# Print or visualize the results
for item in goea_results_all:
    print(item)
