In [None]:
# default_exp readuniprot

# readuniprot

> Parse .dat files (UniProtKB/Swiss-Prot flat file format)

In [None]:
#hide
%load_ext autoreload
%autoreload 2
from nbdev import show_doc

In [None]:
#export

import collections
import csv
import itertools
import random
import re

from proteinscan import utils

In [None]:
# export

def iterDat(datFPath) :
    """
    Generator over the entries of a .dat file, yielding one list of lines
    per entry.

    An entry is a run of consecutive lines lying between '//' terminator
    lines (a line counts as a terminator when it equals '//' after stripping
    surrounding whitespace). Terminator lines are never yielded; all other
    lines are passed through unmodified. The file is opened with
    utils.openGzipOrText.
    """
    with utils.openGzipOrText(datFPath) as f :
        entryLns = []
        for ln in f :
            if ln.strip() != '//' :
                entryLns.append(ln)
            elif entryLns :
                # Terminator reached: hand out the entry collected so far.
                yield entryLns
                entryLns = []
        # Lines following the last terminator still form a final entry.
        if entryLns :
            yield entryLns

def datEntryLnsWithCode(datEntry,code) :
    """
    Returns a list of the lines in a .dat file entry with the given code.

    Each line is stripped of surrounding whitespace before matching; a line
    matches when it starts with the code followed by a space. The code and
    the whitespace after it are removed from each returned line.
    """
    prefix = code + ' '
    strippedLns = (rawLn.strip() for rawLn in datEntry)
    return [s[len(code):].lstrip() for s in strippedLns if s.startswith(prefix)]

def datEntryPrimaryAC(datEntry) :
    """
    Returns the primary accession number from a .dat file entry.

    The primary accession is taken to be the text before the first ';' on
    the entry's first AC line. Raises IndexError if the entry has no AC line.
    """
    firstAcLn = datEntryLnsWithCode(datEntry,'AC')[0]
    primaryAC, _, _ = firstAcLn.partition(';')
    return primaryAC

In [None]:
# Check iterDat and the entry-level helpers against the bundled sample file,
# using the third entry (002R_IIV3) as the reference.
l = list(iterDat('uniprotTest.dat.gz'))
assert len(l)==185
assert len(l[2])==61
# Raw lines are returned untouched, including their trailing newline.
assert l[2][0]=='ID   002R_IIV3               Reviewed;         458 AA.\n'
assert l[2][10]=='OX   NCBI_TaxID=345201;\n'
assert l[2][-1]=='     QSIDRYFCSL DSNYNSEDED FEYDSDSEDD DSDSEDDC\n'
assert datEntryLnsWithCode(l[2],'DT') == [
    '16-JUN-2009, integrated into UniProtKB/Swiss-Prot.',
    '11-JUL-2006, sequence version 1.',
    '02-JUN-2021, entry version 28.',
]
assert datEntryPrimaryAC(l[2])=='Q197F8'

In [None]:
# export

def scanDat(datFPath, fn, returnFull=False, **kwargs) :
    """
    Apply fn to every entry of a .dat file and collect the non-None results.

    Each entry produced by iterDat(datFPath) is passed to fn together with
    any extra keyword arguments given here. Entries for which fn returns
    None are skipped; every other return value is appended, in file order,
    to the results list.

    Returns the results list, or, when returnFull is true, a tuple
    (results, entries) where entries holds the full entry corresponding
    to each result.
    """
    results = []
    # Only keep the full entries around when the caller asked for them.
    keptEntries = [] if returnFull else None
    for entry in iterDat(datFPath) :
        scanned = fn(entry, **kwargs)
        if scanned is None :
            continue
        results.append(scanned)
        if keptEntries is not None :
            keptEntries.append(entry)
    if returnFull :
        return (results, keptEntries)
    return results
def allPrimaryACsInDat(datFPath) :
    """
    Returns a list with the primary accession number of every entry in a
    .dat file, obtained by scanning datEntryPrimaryAC over the file.
    """
    primaryACs = scanDat(datFPath, fn=datEntryPrimaryAC)
    return primaryACs


In [None]:
# Check allPrimaryACsInDat on the bundled sample file: one accession per
# entry, with the expected first and last five values.
acs = allPrimaryACsInDat('uniprotTest.dat.gz')
assert len(acs)==185
assert acs[:5]==['Q6GZX4', 'Q6GZX3', 'Q197F8', 'Q197F7', 'Q6GZX2']
assert acs[-5:]==['P0C9G3', 'P0C9G4', 'P0C9G1', 'P18559', 'P0C9G6']