In [1]:
import pickle
import pandas as pd

#### Identifier to sequence mapping

In [2]:
from Bio import SeqIO

# One-letter codes of the 20 standard amino acids.
aa1 = "ACDEFGHIKLMNPQRSTVWY"

# Map every FASTA record id to its sequence, keeping only sequences that
# consist exclusively of residues listed in aa1.
id2seq = {}
for record in SeqIO.parse("/home/db/pdb/pdb_seqres_clean.txt", "fasta"):
    sequence = str(record.seq)
    if all(residue in aa1 for residue in sequence):
        id2seq[record.id] = sequence

#### Pcoils output parser

In [3]:
def parse_pcoils(fn, threshold=0.5, header_lines=2):
    """Parse a Pcoils output file into a sequence and a binary assignment.

    The first ``header_lines`` lines are skipped. Every remaining non-blank
    line is split on whitespace: its second field is collected as the
    residue and its fifth field is parsed as a float score
    (presumably the coiled-coil probability -- confirm against the Pcoils
    output format).

    Parameters
    ----------
    fn : str
        Path to the Pcoils output file.
    threshold : float, optional
        A position is labelled '1' when its score is strictly greater than
        this value, otherwise '0'. Defaults to 0.5.
    header_lines : int, optional
        Number of leading lines to ignore. Defaults to 2.

    Returns
    -------
    tuple of (str, str)
        The concatenated residue fields and the concatenated '0'/'1'
        labels, with one entry per parsed line.

    Raises
    ------
    IndexError
        If a non-blank data line has fewer than five fields.
    ValueError
        If the fifth field of a data line cannot be parsed as a float.
    """
    seq = []
    assignment = []
    # The context manager closes the file even when parsing raises.
    with open(fn) as handle:
        for line_no, line in enumerate(handle):
            if line_no < header_lines:
                continue
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. an empty line at the end of the file).
                continue
            seq.append(fields[1])
            assignment.append('1' if float(fields[4]) > threshold else '0')
    return ''.join(seq), ''.join(assignment)

#### Get coiled coil assignments for 3 different window variants (14, 21, 28)

In [4]:
# NOTE(review): pickle.load can execute arbitrary code -- only load files from a trusted source.
# The context manager makes sure the file handle is closed once the data is loaded.
with open('./../../1_Data_Preparation/out/pickle/data_all_74.p', 'rb') as fh:
    df = pickle.load(fh)

In [5]:
# Unique index labels of the loaded data; these are the entries to process.
index_labels = df.index.tolist()
entries = set(index_labels)

In [6]:
variants = ['14', '21', '28']
pcoils_results = {}
# Iterate in sorted order: set iteration order is arbitrary, and the row
# order of everything built from pcoils_results should be reproducible.
for entry in sorted(entries):
    results = []
    for variant in variants:
        seq, assignment = parse_pcoils('results/%s.coils_n%s' % (entry, variant))
        # The parsed file must describe exactly the sequence stored for this entry.
        assert seq == id2seq[entry], (
            'sequence mismatch for %s (window %s)' % (entry, variant))
        results.append(assignment)
        # Per-window flag: 1 if at least one position was labelled '1', else 0.
        results.append(1 if '1' in assignment else 0)
    pcoils_results[entry] = results

#### Check for missing entries

In [7]:
# Number of requested entries next to the number of entries with parsed results.
(len(entries), len(pcoils_results))

(28938, 28938)

#### Create pandas DataFrame

In [8]:
# Two columns per window size: the per-position assignment string and the
# 0/1 flag, in the same order in which they were appended for each entry.
pcoils_columns = [
    'pcoils_14_assignment', 'pcoils_14_cc',
    'pcoils_21_assignment', 'pcoils_21_cc',
    'pcoils_28_assignment', 'pcoils_28_cc',
]
# One row per entry, indexed by the dictionary keys.
df = pd.DataFrame.from_dict(pcoils_results, orient='index')
df.columns = pcoils_columns

#### Save data

In [9]:
# Persist the results both as CSV and as a pickle.
df.to_csv('./../../1_Data_Preparation/out/csv/pcoils_all_74.csv')
# Pass the path instead of an open handle so that pandas opens, flushes and
# closes the file itself; the original handle was never closed.
df.to_pickle('./../../1_Data_Preparation/out/pickle/pcoils_all_74.p')