### Get PDB protein x-ray entries

In [1]:
import os
# Collect the IDs (column 0) of all entries whose second column is 'prot'
# and whose third column is 'diffraction' in the tab-separated entry-type file.
entries = []
with open('/home/db/pdb/pdb_entry_type.txt', 'r') as entry_types:
    # Lazily strip trailing whitespace and split every line on tabs.
    rows = (raw.rstrip().split('\t') for raw in entry_types)
    entries.extend(
        row[0] for row in rows
        if row[1] == 'prot' and row[2] == 'diffraction'
    )

### Parse Socket outputs. Socket was run on the whole PDB database (biological assemblies).

#### Get entries with Socket output

In [2]:
# Keep only the entries for which a Socket short-output file exists on disk.
# The file lives in a sub-directory named after characters 2-3 of the
# lowercased entry ID.
entries_socket = set()
for entry in entries:
    pdb_id = entry.lower()
    socket_file = '/home/users/jludwiczak/socket_pdb/socket_db_bio/%s/%s_74.socket_short' % (pdb_id[1:3], pdb_id)
    if not os.path.isfile(socket_file):
        continue
    entries_socket.add(entry)

In [3]:
# Number of selected x-ray protein entries vs. how many of them have a Socket output file
len(entries), len(entries_socket)

(117426, 113445)

A small number of entries have no Socket output, e.g. due to non-standard residues or the size limitations of the PDB format.

### Now parse Socket outputs for all correct entries

#### First get only positive entries (containing CC)

In [4]:
# Select the entries whose Socket short output reports coiled coils.
# The 'COILED COILS PRESENT' marker is looked up in the second-to-last line
# of each output file.
entries_pos = set()
for entry in entries_socket:
    pdb_id = entry.lower()
    socket_file = '/home/users/jludwiczak/socket_pdb/socket_db_bio/%s/%s_74.socket_short' % (pdb_id[1:3], pdb_id)
    # The context manager closes the handle even if reading raises.
    with open(socket_file, 'r') as f:
        lines = f.readlines()
    # An output with fewer than two lines (e.g. empty or truncated) has no
    # second-to-last line; treat it as negative instead of raising IndexError.
    if len(lines) >= 2 and 'COILED COILS PRESENT' in lines[-2]:
        entries_pos.add(entry)

In [5]:
# Number of entries whose Socket output contains the 'COILED COILS PRESENT' marker
len(entries_pos)

11117

11117 entries contain coiled-coil (CC) domains.

In [6]:
from lbs.coiledcoils.socket import parse_socket_output
import pickle

In [7]:
# Parse the Socket short output of every coiled-coil-positive entry using the
# 'kih' method; results are stored in a dict keyed by entry ID.
data = {}
for entry in entries_pos:
    pdb_id = entry.lower()
    data[entry] = parse_socket_output(
        '/home/users/jludwiczak/socket_pdb/socket_db_bio/%s/%s_74.socket_short' % (pdb_id[1:3], pdb_id),
        entry,
        method='kih'
    )

#### Data structure

In [8]:
# Inspect the parsed result stored for a single example entry
data['2cce']

([{'heptads': ['---------------------------',
    '-----------------------------',
    '---------------------------',
    '-----------------------------'],
   'indices': [('0', '5', '29', 'A'),
    ('1', '5', '30', 'B'),
    ('2', '5', '29', 'C'),
    ('3', '5', '30', 'D')],
   'oligomerization': '4',
   'orientation': 'parallel',
   'sequences': ['IEDKLEEILSKLYHISNELARI',
    'IEDKLEEILSKLYHISNELARIK',
    'IEDKLEEILSKLYHISNELARI',
    'IEDKLEEILSKLYHISNELARIK']}],
 [['0', '1', 'parallel'],
  ['0', '3', 'parallel'],
  ['1', '2', 'parallel'],
  ['2', '1', 'parallel'],
  ['2', '3', 'parallel']])

### Dump data

In [9]:
# Serialize the parsed results to disk. Opening the file in a context manager
# guarantees the pickle is flushed and the handle closed, instead of relying
# on garbage collection of an anonymous file object.
with open('in/cc_biounit_74_kih.p', 'wb') as pickle_out:
    pickle.dump(data, pickle_out)