# GO PCA

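I want to apply the GO-PCA algorithm [<sup>[1](https://github.com/flo-compbio/gopca)</sup>] to our single-cell clusters to see whether any gene sets stand out. This notebook builds a small test gene-set file (cell cycle, meiosis, mitosis, spermatid) from the fly GO annotations to use as input.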

In [196]:
import os
import sys
from pathlib import Path
from collections import defaultdict
import re
from tempfile import NamedTemporaryFile

from IPython.display import display, HTML, Markdown
import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

from goatools.obo_parser import GODag
from goatools.go_search import GoSearch

# Project level imports
sys.path.insert(0, '../lib')
from larval_gonad.notebook import Nb
from larval_gonad.plotting import make_figs
from larval_gonad.config import memory

# Setup notebook
nbconfig = Nb.setup_notebook()


last updated: 2018-03-15 
Git hash: b4607e6f1d22f5de926471c9887f399587814dd0


In [195]:
# Load the GO ontology and build the gene <-> GO term lookups used by the
# searches below.
obo = '../data/external/go-basic.obo'
oboDag = GODag(obo)

# Import fly associations: map each FlyBase gene ID to the set of GO terms it
# is annotated with. Columns are read by position: cols[1] is the gene ID and
# cols[4] is the GO term.
association = '../data/external/gene_association.fb'
fly = defaultdict(set)
with open(association) as fh:
    # Iterate the file lazily instead of loading every line with readlines().
    for row in fh:
        # Skip header/comment lines (start with '!') and blank lines.
        if row.startswith('!') or not row.strip():
            continue
        cols = row.split('\t')
        fbgn = cols[1]
        goterm = cols[4]
        # Ignore terms that are absent from the loaded ontology.
        if goterm not in oboDag:
            continue
        fly[fbgn].add(goterm)

# Convert from fly2go to go2fly, keeping only biological process terms.
go2fly = defaultdict(set)
for fbgn, goterms in fly.items():
    for goterm in goterms:
        if oboDag[goterm].namespace == 'biological_process':
            go2fly[goterm].add(fbgn)

# Init searcher
searcher = GoSearch(obo, go2fly)
# `searchRes` is bound to the same searcher object so that code written
# against either name runs on a fresh kernel.
searchRes = searcher

load obo file ../data/external/go-basic.obo
../data/external/go-basic.obo: fmt(1.2) rel(2018-02-02) 47,109 GO Terms
load obo file ../data/external/go-basic.obo
../data/external/go-basic.obo: fmt(1.2) rel(2018-02-02) 47,109 GO Terms


## Cell Cycle

In [197]:
# Collect the gene IDs annotated to "cell cycle" biological process GO terms.
cell_cycle_all = re.compile(r'cell cycle', flags=re.IGNORECASE)
cell_cycle_not = re.compile(r'cell cycle.independent', flags=re.IGNORECASE)

# The report written to `prt` is not needed here, so it goes to a temporary
# file that is removed when the `with` block exits.
with NamedTemporaryFile(mode='w') as log:

    # Find GOs matching 'cell cycle'
    gos_cc_all = searcher.get_matching_gos(cell_cycle_all, prt=log)

    # Among those, find GOs matching 'cell cycle-independent'
    gos_no_cc = searcher.get_matching_gos(cell_cycle_not, gos=gos_cc_all, prt=log)

    # Drop the 'cell cycle-independent' GOs from the cell cycle set
    gos = gos_cc_all.difference(gos_no_cc)

    # Add children GOs of the cell cycle GOs
    gos_all = searcher.add_children_gos(gos)

    # Only focus on biological process
    keepers = {go for go in gos_all if oboDag[go].namespace == 'biological_process'}

    cell_cycle_geneids = searcher.get_items(keepers)

In [215]:
# Sanity check: number of gene IDs collected for the cell cycle gene set.
len(cell_cycle_geneids)

992

In [222]:
# One tab-separated gene-set record: ID, source, collection, name,
# comma-separated gene IDs, description.
cell_cycle_str = '\t'.join([
    'GO:0007049',
    'GO',
    'BP',
    'cell cycle',
    # Sort so the record is identical from run to run; the gene IDs come from
    # an unordered collection.
    ','.join(sorted(cell_cycle_geneids)),
    'cell-division cycle'
])

## Meiosis

In [200]:
# Collect the gene IDs annotated to meiosis-related biological process GO terms.
meiosis_all = re.compile(r'meiosis', flags=re.IGNORECASE)
meiotic_all = re.compile(r'meiotic', flags=re.IGNORECASE)

# The report written to `prt` is not needed here, so it goes to a temporary
# file that is removed when the `with` block exits.
with NamedTemporaryFile(mode='w') as log:

    gos_meiosis_all = searcher.get_matching_gos(meiosis_all, prt=log)
    gos_meiotic_all = searcher.get_matching_gos(meiotic_all, prt=log)

    # Combine the GOs matching either 'meiosis' or 'meiotic'
    gos = gos_meiosis_all.union(gos_meiotic_all)

    # Add children GOs of the matched GOs
    gos_all = searcher.add_children_gos(gos)

    # Only focus on biological process
    keepers = {go for go in gos_all if oboDag[go].namespace == 'biological_process'}

    meiotic_geneids = searcher.get_items(keepers)

In [214]:
# Sanity check: number of gene IDs collected for the meiosis gene set.
len(meiotic_geneids)

1101

In [223]:
# One tab-separated gene-set record: ID, source, collection, name,
# comma-separated gene IDs, description.
meiotic_str = '\t'.join([
    'GO:0051321',
    'GO',
    'BP',
    # No trailing tab inside the name: a stray tab here would add an extra
    # empty column to this row and shift the gene list and description.
    'meiotic cell cycle',
    # Sort so the record is identical from run to run; the gene IDs come from
    # an unordered collection.
    ','.join(sorted(meiotic_geneids)),
    'meiotic cell-division cycle'
])

## Mitosis

In [202]:
# Collect the gene IDs annotated to mitosis-related biological process GO terms.
mitosis_all = re.compile(r'mitosis', flags=re.IGNORECASE)
mitotic_all = re.compile(r'mitotic', flags=re.IGNORECASE)

# The report written to `prt` is not needed here, so it goes to a temporary
# file that is removed when the `with` block exits.
with NamedTemporaryFile(mode='w') as log:

    gos_mitosis_all = searcher.get_matching_gos(mitosis_all, prt=log)
    gos_mitotic_all = searcher.get_matching_gos(mitotic_all, prt=log)

    # Combine the GOs matching either 'mitosis' or 'mitotic'
    gos = gos_mitosis_all.union(gos_mitotic_all)

    # Add children GOs of the matched GOs
    gos_all = searcher.add_children_gos(gos)

    # Only focus on biological process
    keepers = {go for go in gos_all if oboDag[go].namespace == 'biological_process'}

    mitotic_geneids = searcher.get_items(keepers)

In [224]:
# Sanity check: number of gene IDs collected for the mitosis gene set.
len(mitotic_geneids)

809

In [225]:
# One tab-separated gene-set record: ID, source, collection, name,
# comma-separated gene IDs, description.
mitotic_str = '\t'.join([
    'GO:0000278',
    'GO',
    'BP',
    'mitotic cell cycle',
    # Sort so the record is identical from run to run; the gene IDs come from
    # an unordered collection.
    ','.join(sorted(mitotic_geneids)),
    'mitotic cell-division cycle'
])

## Spermatid

In [212]:
# Collect the gene IDs annotated to spermatid-related biological process GO
# terms, excluding the terms matched by the spermatid cyst pattern.
spermatid_all = re.compile(r'spermatid', flags=re.IGNORECASE)
# NOTE(review): `spermatid_cyst_all` was used below without being defined
# anywhere in this notebook, so the cell failed on a fresh kernel. The pattern
# here is inferred from the variable name -- confirm it matches the intended
# exclusion.
spermatid_cyst_all = re.compile(r'spermatid cyst', flags=re.IGNORECASE)

# The report written to `prt` is not needed here, so it goes to a temporary
# file that is removed when the `with` block exits.
with NamedTemporaryFile(mode='w') as log:

    sperm_gos = searcher.get_matching_gos(spermatid_all, prt=log)
    cyst_gos = searcher.get_matching_gos(spermatid_cyst_all, prt=log)

    # Drop the cyst-matching GOs from the spermatid set
    gos = sperm_gos.difference(cyst_gos)

    # Add children GOs of the matched GOs
    gos_all = searcher.add_children_gos(gos)

    # Only focus on biological process
    keepers = {go for go in gos_all if oboDag[go].namespace == 'biological_process'}

    spermatid_geneids = searcher.get_items(keepers)

In [213]:
# Sanity check: number of gene IDs collected for the spermatid gene set.
len(spermatid_geneids)

154

In [226]:
# One tab-separated gene-set record: ID, source, collection, name,
# comma-separated gene IDs, description.
spermatid_str = '\t'.join([
    'GO:0007286',
    'GO',
    'BP',
    'spermatid development',
    # Sort so the record is identical from run to run; the gene IDs come from
    # an unordered collection.
    ','.join(sorted(spermatid_geneids)),
    'spermatid cell development'
])

## Output

In [227]:
# Write the four gene-set records to the test TSV, one record per line and
# with no newline after the last record.
gene_set_records = (cell_cycle_str, meiotic_str, mitotic_str, spermatid_str)
with open('../output/gopca_test_gene_set.tsv', 'w') as fh:
    print(*gene_set_records, sep='\n', end='', file=fh)