In [2]:
import time
import pandas as pd
from Bio import Entrez, SeqIO
# Contact address set on Biopython's Entrez module before any efetch call is made.
# NOTE(review): the address is hardcoded; consider reading it from an environment
# variable so the notebook can be shared without editing this line.
Entrez.email = "jeremyf@cmu.edu"

In [7]:
# Relative path to the tab-separated annotation table loaded below with pd.read_csv.
fp = "../data/raw/zebrafish_protein_ontology.tsv"

In [9]:
# Columns kept for the rest of the analysis, named as they appear after normalisation.
keep_columns = ["gene_product_id", "symbol", "qualifier", "go_name"]

# Read the tab-separated table and normalise headers to lower snake_case
# (spaces become underscores) so they can be used as attribute names.
df = pd.read_csv(fp, sep="\t").rename(
    columns=lambda header: header.lower().replace(" ", "_")
)
df = df[keep_columns]

In [10]:
# Fetch one protein sequence per unique gene product id from the Entrez
# "protein" database and attach it to `df` as a new "seq" column.
# Ids whose lookup fails are recorded as None, so the loop always runs to completion.
gene2seq = {}
genes = df["gene_product_id"].unique()
for i, gene in enumerate(genes):
    if i % 10 == 0:
        print(f"loading {i+1}/{len(genes)}")
    try:
        with Entrez.efetch(db="protein", id=gene, rettype="fasta", retmax=1) as query:
            # Only the first FASTA record is used; the None default covers an
            # empty response without relying on StopIteration being caught.
            record = next(SeqIO.parse(query, "fasta"), None)
        gene2seq[gene] = None if record is None else str(record.seq)
    except Exception as err:
        # Deliberately best-effort: any lookup failure (e.g. an HTTP error)
        # yields a missing sequence. Catching Exception rather than using a bare
        # `except:` lets KeyboardInterrupt / SystemExit stop this long loop.
        print(f"could not fetch sequence for {gene!r}: {err!r}")
        gene2seq[gene] = None
    finally:
        # Pause between requests, whether or not the lookup succeeded.
        time.sleep(1)
df["seq"] = df["gene_product_id"].map(gene2seq)

loading 1/214
loading 11/214
loading 21/214
loading 31/214
loading 41/214
loading 51/214
loading 61/214
loading 71/214
loading 81/214
loading 91/214
loading 101/214
loading 111/214
loading 121/214
loading 131/214
loading 141/214
loading 151/214
loading 161/214
loading 171/214
loading 181/214
loading 191/214
loading 201/214
loading 211/214


In [24]:
# Keep only complete rows whose qualifier is "enables" or "involved_in".
has_relevant_qualifier = df["qualifier"].isin(["enables", "involved_in"])
relevant_subset = df[has_relevant_qualifier].dropna()

# GO names that occur more than once in that subset, most frequent first
# (value_counts already orders by descending frequency).
go_name_counts = relevant_subset["go_name"].value_counts()
interesting_go_names = go_name_counts[go_name_counts > 1].index.tolist()
interesting_go_names

['ATP binding',
 'nucleotide binding',
 'catalytic activity',
 'ATPase-coupled transmembrane transporter activity',
 'biosynthetic process',
 'transmembrane transport',
 'asparagine biosynthetic process',
 'ATPase activity']

In [59]:
# Build a sequence x GO-name indicator table: one row per unique sequence,
# one column per interesting GO name, 1 where the pair occurs in `df`.
# NOTE(review): `_` is kept as the name of the (seq, go_name) pair table so
# other cells keep working, but it shadows IPython's last-output variable;
# consider renaming it.
_ = df[df.go_name.isin(interesting_go_names)].dropna()[["seq", "go_name"]]

# Start from an integer frame of zeros. Creating an empty (object-dtype, all-NaN)
# frame and calling .fillna(0) relies on implicit downcasting that pandas has
# deprecated, and would otherwise leave the table with object dtype.
one_hot = pd.DataFrame(0, index=_.seq.unique(), columns=interesting_go_names)

# `_` has exactly the two columns selected above, so each tuple is (seq, go_name).
for seq, go_name in _.itertuples(index=False, name=None):
    one_hot.loc[seq, go_name] = 1

In [60]:
# Display the sequence-by-GO-name indicator table.
one_hot

Unnamed: 0,ATP binding,nucleotide binding,catalytic activity,ATPase-coupled transmembrane transporter activity,biosynthetic process,transmembrane transport,asparagine biosynthetic process,ATPase activity
MDFFVRLARETGDRKREFLELGRKAGRFPAASTSNGEISIWCSNDYLGMGQHPDVLDAMKRSVDEYGGGSGGSRNTGGTNHFHVALEREPAEPHGKEDAVLFTSGYSANEGSLSVLAGAVDDCQVFSDSANHASIIDGLRHSGARKHVFRHKDGRHLEELLAAADRDKPKFIALESVHSMRGDIALLAEIAGLAKRYGAVTFLDEVHAVGMYGPGGAGIAARDGVHCEFTVVMGTLAKAFGMTGGYVAGPAVLMDAVRARARSFVFTTALPPAVAAGALAAVRHLRGSDEERRRPAENARLTHGLLRERDIPVLSDRSPIVPVLVGEDRMCKRMSALPLERHGAYVQAIDAPSVPAGEEILRIAPSAVHETEEIHRFVDALDGIWSELGAARRV,0,0,1,0,1,0,0,0
MLRGSARTYWTLTGLWVLLRAGTLVVGLLFQRLFDALGAGGGVWLIIALVAAIEAGRLFLQFGVMINRLEPRVQYGTTARLRHALLGSALRGSEVTARTSPGESLRTVGEDVDETGFFVAWAPTNLAHWLFVAASVTVMMRIDAVVTGALLALLVLLTLVTALAHSRFLRHRRATRAASGEVAGALREMVGAVGAVQAAAAEPQVAAHVAGLNGARAEAAVREELYAVVQRTVIGNPAPIGVGVVLLLVAGRMDEGTFSVGDLALFAFYLQILTEALGSIGMLSVRLQRVSVALGRITNNLGCRLRRSLERASPPIASDAPGGTGEGAAAPDAGPEPAPPLRELAVRGLTARHPGAGHGIEDVDLVVERHTVTVVTGRVGSGKSTLVRAVLGLLPHERGTVLWNGEPIADPASFLVAPRCGYTPQVPCLFSGTVRENVLLGRDGAAFDEAVRLAVAEPDLAAMQDGPDTVVGPRGLRLSGGQIQRVAIARMLVGDPELVVLDDVSSALDPETEHLLWERLLDGTRTVLAVSHRPALLRAADRVVVLEGGRVEASGTFEEVMAVSAEMGRIWTGAGPGGGDAGPAPQSPPAG,1,1,0,1,0,1,0,1
MRGERTAVALLALLVPAGMGLQLVAPYLLRGFIDGALSGDSRKTLLDLAAWSLAAAVGTLVVTAGTEALSSRVAWRSTNRLRADLVEHCLSRPPGFYRKHPPGELVERMDGDVTRLAAVMSTLLLELLAQALLIVGILVALFRLEWRLALVVAPFAAGTLLLLRTLVGRAMPFVTARQRVAADLQGFLEERLAAAEDLRVNGASRYTLRELGDRQDDLYRKARDAARASVRWPATVQGLSAVSVVLALAVSAWLHARGQLSTGTAFASLSYAMLLRRPLLAVTTRFRELEDAAASAQRLRDLLGHGTAAPRTGRGTLPAGLPGVRFDGVSFGYEPDEPVLRDVSFTLRPGERLGVVGRTGSGKSTVVRLLFGLHHPGAGSVSAGGLDLTEIDPRALRSRVALVTQEVHVFHASLRDNLTFFDRSVPDDRLRAALGEAGLGPWLRTLPDGLDTPLGAGARGMSAGEEQQLALARVFLRDPGLVLMDEPTARLDPYSERLLMPALERLLEGRTAVVVEHRPHLLRNVDRILVLEEGKVAEEGERRVLAADPGSRFHALLRTAGATR,1,1,0,1,0,1,0,1
MSSDTHGTDLADGDVLVTGAAGFIGSHLVTELRNSGRNVVAVDRRPLPDDLESTSPPFTGSLREIRGDLNSLNLVDCLKNISTVFHLAALPGVRPSWTQFPEYLRCNVLATQRLMEACVQAGVERVVVASSSSVYGGADGVMSEDDLPRPLSPYGVTKLAAERLALAFAARGDAELSVGALRFFTVYGPGQRPDMFISRLIRATLRGEPVEIYGDGTQLRDFTHVSDVVRALMLTASVRDRGSAVLNIGTGSAVSVNEVVSMTAELTGLRPCTAYGSARIGDVRSTTADVRQAQSVLGFTARTGLREGLATQIEWTRRSLSGAEQDTVPVGGSSVSVPRL,0,0,1,0,0,0,0,0
MCGFVGFSDAGAGQEDARVTAERMLAAVAHRGPDGSDWCHHRGVTLAHCALTFTDPDHGAQPFVSASGATAVVFNGELYNHAVLGDGALPCAPGGDTEVPGGTLRVAGHADARPAAGHVRLRAAGRPHRHHGAGRDRWGRAPLLTPACETDIAFASELTSLLRHPAAPRTPEVRALADYLVLQAFCAPASAVSGVCKVRPGSYVTHRHGALDETEFWRPRLTPDRGAGRGPGRREAARRFEELFRAAVARRMTSTDRRLGVLLSGGLDSSAVAAVAQQLLPGRPVPTFSAGFADPDFDESDHARAVARHLGTEHHVVRIGGADLAGVVESELAVADEPLADPSLLPTRLVCRAAREHVRGVLTGDGADELLLGYRYFQAERAIELLLRVLPAPRLEALVRLLVRRLPARSGNLPVTHALGLLAKGLRAAPEHRFYLSTAPFGPGELPRLLTPEAGAELTGHDPFTEVSRLLRGQPGLTGVQRSQLAVVTHFLRDVILTKTDRGGMRSSLELRSPFLDLDLVEYGNSLPTGLKLHRFTGKYLLRQVAAGWLPPSVVQRTKLGFRAPVAALLRGELRPLLLDTLSPSSLRRGGLFDTGAVRLLIDDHLGGRRDTSRKLWALLVYQLWFESLTAGPRALESPAYPALS,1,1,0,0,0,0,1,0
MGYIHTALKSAGFHHVIQVDTPALGLDSEGLRKLLADFEPDLVGVSTTTPGLPGAIEACEAAKSTGAKVILGGPHTEVYAHENLVHESIDYVGVGEGVTIMPELAEAMERGEEPEGIRGLVTRKHDGGAAPMVNLEEVGWPERAGLPMDRYYSIMAPRPFATMISSRGCPFKCSFCFKQAVDKKSMYRSPEDVVGEMTELKERWGVKEIMFYDDVFTLHRGRVREICGLIGETGLKVRWEAPTRVDLVPEPLLEAMAGAGCVRLRFGIEHGDSEILERMRKESDIQKIEKAVTSAHEAGIKGFGYFIVGWLGETREQFRRTVDLACRLPLDYASFYTATPLPGTPLHTESVAAGQIPPDYWDRFSCGASSTRGSGTWCRTRRSAPSGRTAPSSCAAPWSSRCCRTWR,0,0,1,0,0,0,0,0
MKVLSLHSAGHDTGVAYFEDGRLVFAVETERLTRVKHDHRSDVALRHVLEQECVDTDGIDLVAVSTPVRSGLLRIPDLDRAMERIGAGALHHRTVCEMLGRRVECVVVTHEVSHAALAAHYADWEEGTVVLVNEGRGQLTRSSLFRVTGGALEWVDKDPLPWYGNGFGWTAIGYLLGFGPSPSVAGKVMAMGGYGQPDPRIREQLLSVDPEVMNDRELAERVRADLAGRPEFAPGFETASQVVATFQEMFTEAVRAVLDRHVTRTDAGVGPIALGGGCALNIVANSALREEYGRDVAIPPACGDAGHLTGAGLYALAQVAGVKPEPFSVYRNGGGEARAAVLEAVEGAGLRAVPYDRSAVAGVLAGGGVVALTQGAAELGPRALGHRSLLGSPAVPGMRERMSEKLKRREWFRPLGAVMRDERFAGLYPGRAPSPYMLFEYRLPDGIAPEARHVNGTCRIQTLGPEEDRLYGLLAEFEELSGVPALINTSLNGPGKPIAHTARDVLDDFARTDVDLFVFDDLMVRGAAAR,0,0,1,0,1,0,0,0


In [None]:
_.