In [1]:
import pickle
import pandas as pd
import numpy as np

#### Identifier to sequence mapping

In [2]:
from Bio import SeqIO

# One-letter codes accepted in a sequence; any other character marks the
# record as containing a non-standard residue.
aa1 = "ACDEFGHIKLMNPQRSTVWY"

# Map record id -> sequence string, keeping only records whose sequence
# consists exclusively of characters from aa1.
id2seq = {}
for record in SeqIO.parse("/home/db/pdb/pdb_seqres_clean.txt", "fasta"):
    sequence = str(record.seq)
    if all(residue in aa1 for residue in sequence):
        id2seq[record.id] = sequence

#### Marcoil output parser

In [3]:
def parse_marcoil(fn):
    """Parse the per-residue probability table from a Marcoil output file.

    The table is located by the first line containing the text
    'cc-probability in percent and best heptad phase'. Every following line
    up to the first blank line (or the end of the file) is examined; lines
    that split into exactly four whitespace-separated fields contribute the
    second field to the sequence and the third field, converted to float, to
    the probability list. Lines with a different number of fields are
    skipped.

    Parameters
    ----------
    fn : str or path-like
        Path of the Marcoil output file to read.

    Returns
    -------
    tuple(str, list of float) or None
        ``(sequence, probabilities)`` for the first table found, or ``None``
        when the file contains no table header line.

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If the third field of a four-field table row is not a valid float.
    """
    # Read everything up front inside a context manager so the handle is
    # closed even if parsing fails further down.
    with open(fn) as handle:
        lines = [raw.rstrip() for raw in handle]

    for i, line in enumerate(lines):
        if 'cc-probability in percent and best heptad phase' in line:
            seq = []
            probs = []
            # Iterating over the slice stops cleanly at end of file, so a
            # table without a trailing blank line no longer raises
            # IndexError.
            for row in lines[i + 1:]:
                if row == '':
                    break
                fields = row.split()
                if len(fields) == 4:
                    seq.append(fields[1])
                    probs.append(float(fields[2]))
            return (''.join(seq), probs)

    # No table header found; kept as None for backward compatibility.
    return None

#### Get coiled-coil assignments for 3 different probability cutoffs (0.1, 0.5, 0.9, i.e. 10, 50 and 90 on Marcoil's percent scale)

In [4]:
# Load the pickled input object; its index supplies the entry ids used below.
# The context manager closes the file handle as soon as loading finishes.
# NOTE(review): pickle.load can execute arbitrary code - only load pickles
# produced by this project.
with open('./../../1_Data_Preparation/out/pickle/data_all_74.p', 'rb') as fh:
    df = pickle.load(fh)

In [5]:
# Index labels of the loaded frame as a plain list; each label is used below
# both as a key into id2seq and to build the 'results/<entry>.problist' path.
entries = df.index.tolist()

In [6]:
# entry -> [assignment_10, flag_10, assignment_50, flag_50, assignment_90, flag_90]
marcoil_data = {}
# Probability cutoffs, on the same scale as the values returned by parse_marcoil.
problist = [10, 50, 90]

for entry in entries:
    seq = id2seq[entry]
    parsed = parse_marcoil('results/%s.problist' % entry)
    # The parsed sequence must be exactly the reference sequence.
    assert parsed[0] == seq
    probs = np.array(parsed[1], dtype=float)

    per_cutoff = []
    for cutoff in problist:
        # Binarize a copy: values below the cutoff become 0, the rest 1.
        binarized = probs.copy()
        binarized[binarized < cutoff] = 0
        binarized[binarized >= cutoff] = 1
        assignment = ''.join(str(int(flag)) for flag in binarized)
        assert len(assignment) == len(seq)
        # Store the per-residue string followed by a 0/1 flag telling
        # whether any residue reached the cutoff.
        per_cutoff.extend([assignment, 1 if '1' in assignment else 0])

    marcoil_data[entry] = per_cutoff

#### Check for missing entries

In [7]:
# Displays (number of processed entries, number of requested entries).
# marcoil_data is keyed by entry, so the two counts differ only if `entries`
# contains duplicate labels.
len(marcoil_data), len(entries)

(28938, 28938)

#### Create pandas DataFrame

In [8]:
# Column labels in the order the values were appended for each entry:
# for every cutoff, the per-residue assignment string followed by the 0/1 flag.
marcoil_columns = [
    'marcoil_10_assignment', 'marcoil_10_cc',
    'marcoil_50_assignment', 'marcoil_50_cc',
    'marcoil_90_assignment', 'marcoil_90_cc',
]

# One row per entry (dict keys become the index), then label the columns.
df = pd.DataFrame.from_dict(marcoil_data, orient='index')
df.columns = marcoil_columns

#### Save data

In [9]:
# Write the table both as CSV and as a pickle.
df.to_csv('./../../1_Data_Preparation/out/csv/marcoil_all_74.csv')
# Pass the path rather than an open handle: pandas then opens, writes and
# closes the file itself, so the data is flushed to disk when the call
# returns instead of whenever the handle happens to be garbage-collected.
df.to_pickle('./../../1_Data_Preparation/out/pickle/marcoil_all_74.p')