In [1]:
import csv
from IPython.display import clear_output

In [2]:
# Maps column 0 of each annotation row to column 1 of that row. The files are
# read in the order listed, so when the same key occurs more than once the
# value from the last row read is the one that is kept.
idr_annot = {}

for annot_path in ('idr_annotations_pfam_proteins_merged.csv',
                   'idr_annotations_pfam_proteins_2_merged.csv'):
    # newline='' is what the csv module asks for, so that line breaks inside
    # quoted fields are handled by the reader rather than by open().
    with open(annot_path, newline='') as f:
        rdr = csv.reader(f)
        # The row counter (and therefore the progress message) restarts at 0
        # for each file.
        for idx, line in enumerate(rdr):
            idr_annot[line[0]] = line[1]
            if idx % 1000000 == 0:
                clear_output()
                print(f'Processing Row {idx}')

Processing Row 22000000


In [7]:
def parse_doms(dom_str):
    """Extract the integer coordinates from a '|'-separated domain string.

    Each '|'-separated entry is split on ';'; the second piece is split on
    ':' and every part is converted to int. The first piece of each entry is
    ignored here.

    Returns a list with one list of ints per entry, in input order.
    Raises IndexError if an entry has no ';' and ValueError if a coordinate
    is not an integer literal.
    """
    coords = []
    for entry in dom_str.split('|'):
        span_text = entry.split(';')[1]
        coords.append([int(bound) for bound in span_text.split(':')])
    return coords


def parse_idr(idr_str):
    """Parse a '|'-separated string of ':'-separated integers.

    Returns a list with one list of ints per '|'-separated entry, in input
    order. Raises ValueError if any part is not an integer literal (this
    includes an empty ``idr_str``).
    """
    regions = []
    for span_text in idr_str.split('|'):
        regions.append([int(bound) for bound in span_text.split(':')])
    return regions


def check_idr_contained(idr, dom):
    """Return True when ``idr`` lies entirely within ``dom``.

    Both arguments are two-element (start, end) sequences. Bounds are
    compared inclusively, so an interval identical to ``dom`` counts as
    contained.
    """
    idr_start, idr_end = idr
    dom_start, dom_end = dom
    if dom_start > idr_start:
        # The IDR begins before the domain does.
        return False
    return idr_end <= dom_end


def check_overlaps(idr, dom):
    """Return True when the two (start, end) intervals share a position.

    The test is whether the smaller of the two ends is at or past the larger
    of the two starts, so intervals that merely touch at one coordinate are
    reported as overlapping.
    """
    idr_start, idr_end = idr
    dom_start, dom_end = dom
    earliest_end = min(idr_end, dom_end)
    latest_start = max(idr_start, dom_start)
    return earliest_end - latest_start >= 0


def check_idr_contained_dom_list(idr, dom_list):
    """Return True if ``idr`` is contained in at least one entry of ``dom_list``.

    Containment is decided by ``check_idr_contained``; evaluation stops at
    the first match. An empty ``dom_list`` yields False.
    """
    return any(check_idr_contained(idr, dom) for dom in dom_list)

def check_overlaps_dom_list(idr, dom_list):
    """Return True if ``idr`` overlaps at least one entry of ``dom_list``.

    Overlap is decided by ``check_overlaps``; evaluation stops at the first
    match. An empty ``dom_list`` yields False.
    """
    return any(check_overlaps(idr, dom) for dom in dom_list)

def domain_list_to_str(domain_list):
    """Serialise a domain list to the 'name;start:end|name;start:end' form.

    Each element of ``domain_list`` is expected to look like
    ``[name, [start, end]]`` with ``name`` a string. Entries are emitted in
    input order and separated by '|'. An empty list gives the empty string.
    """
    # join() builds the result in one pass and needs no trailing-separator
    # trimming, unlike repeated '+=' followed by a slice.
    return '|'.join(
        dom[0] + ';' + str(dom[1][0]) + ':' + str(dom[1][1])
        for dom in domain_list
    )

In [7]:
# Copy every row of the domain file to 'complete_dataset.csv'. Rows whose
# first column has an entry in idr_annot get the retained IDRs appended to
# the second column as extra '|d;start:end' entries; all other rows are
# written through unchanged.
with open('pfam_domains_grouped_len_no_motif.csv') as f, \
        open('complete_dataset.csv', 'w') as fOut:
    for idx, line in enumerate(f):

        if idx % 1000000 == 0:
            clear_output()
            print(f'Processing Row {idx}')

        # Remove only the line terminator. Slicing off the last character
        # unconditionally would eat real data on a final line that has no
        # trailing newline.
        fields = line.rstrip('\n').split(',')
        if fields[0] not in idr_annot:
            fOut.write(line)
            continue

        dom_list = parse_doms(fields[1])
        idr_to_keep = []
        for idr in parse_idr(idr_annot[fields[0]]):
            if check_idr_contained_dom_list(idr, dom_list):
                # Fully inside one of the domains: drop this IDR only.
                continue
            if check_overlaps_dom_list(idr, dom_list):
                # Partial overlap with a domain: stop looking at the
                # remaining IDRs for this row. IDRs kept before this point
                # are still written.
                # NOTE(review): the result depends on IDR order; confirm
                # that `break` (rather than `continue`) is intended.
                break
            idr_to_keep.append(idr)

        # Only the first three comma-separated columns are written back.
        idr_suffix = ''.join(f'|d;{idr[0]}:{idr[1]}' for idr in idr_to_keep)
        fOut.write(f'{fields[0]},{fields[1]}{idr_suffix},{fields[2]}\n')

Processing Row 167000000


In [3]:
# First-column values of 'conserved_site.tsv', excluding its header row.
motifs = set()
# newline='' is what the csv module asks for when handing it a file object.
with open('conserved_site.tsv', newline='') as f:
    rdr = csv.reader(f, delimiter='\t')
    # Skip the header row; the default keeps an empty file from raising
    # StopIteration.
    next(rdr, None)
    # Blank lines come back from the reader as empty lists; skip them
    # instead of failing on row[0].
    motifs.update(row[0] for row in rdr if row)

In [8]:
def parse_domain(dom_str):
    """Parse a domain string into ``[name, [ints...]]`` entries sorted by start.

    ``dom_str`` is split on '|' into entries; each entry is split on ';'
    into a name (first piece) and a ':'-separated coordinate string (second
    piece) whose parts are converted to int.

    The result is ordered by the first coordinate. The sort is stable, so
    entries with equal first coordinates keep their input order.
    """
    parsed = []
    for entry in dom_str.split('|'):
        pieces = entry.split(';')
        parsed.append([pieces[0], [int(bound) for bound in pieces[1].split(':')]])

    return sorted(parsed, key=lambda item: item[1][0])

In [16]:
# Build a separate set: plain assignment would only alias `motifs`, so adding
# the disorder label 'd' would silently change `motifs` as well.
motifs_and_disorder = motifs | {'d'}

In [17]:
def check_for_consec_disorder_motif(dom_list):
    """Return True if two adjacent entries both have a name in ``motifs_and_disorder``.

    Each entry's name is its element at index 0. Lists with fewer than two
    entries have no adjacent pair and therefore give False.
    """
    # Pair every entry with its successor.
    neighbours = zip(dom_list, dom_list[1:])
    return any(
        left[0] in motifs_and_disorder and right[0] in motifs_and_disorder
        for left, right in neighbours
    )

In [24]:
# Route each row of 'complete_dataset.csv' to one of two files, depending on
# whether its second column contains two adjacent motif/disorder entries.
with open('complete_dataset.csv') as f, \
        open('complete_dataset_no_consec.csv', 'w') as fOut, \
        open('complete_dataset_with_consec.csv', 'w') as fOut2:
    for idx, line in enumerate(f):
        domain_field = line.split(',')[1]
        has_consec = check_for_consec_disorder_motif(parse_domain(domain_field))
        target = fOut2 if has_consec else fOut
        target.write(line)
        # Progress is reported after the row has been written.
        if idx % 1000000 == 0:
            clear_output()
            print(idx)

167000000
