### Merge results from other methods benchmark and parsed Socket assignment

In [1]:
import pandas as pd
import pickle

In [2]:
def _load_pickle(path):
    """Unpickle the object stored at ``path`` and close the file afterwards."""
    # NOTE(security): pickle.load can execute arbitrary code; only load files
    # produced by this project's own data-preparation step.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# One pickled table per source; they are merged column-wise in the next cell.
df_socket = _load_pickle('./../1_Data_Preparation/out/pickle/data_all_74.p')
df_pcoils = _load_pickle('./../1_Data_Preparation/out/pickle/pcoils_all_74.p')
df_marcoil = _load_pickle('./../1_Data_Preparation/out/pickle/marcoil_all_74.p')
df_cchmmprof = _load_pickle('./../1_Data_Preparation/out/pickle/cchmmprof_all_74.p')

In [3]:
# Column-wise merge of the per-method tables, aligned on the row index.
# sort=True pins the ordering pandas applies by default in this version, so the
# result does not change when the library default flips (see the FutureWarning).
df = pd.concat([df_socket, df_cchmmprof, df_marcoil, df_pcoils], axis=1, sort=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


### Get sequences after filtering with cd-hit

We noticed some nearly identical sequences in the pdb50 dataset; therefore, the sequences obtained in the data preparation part were additionally filtered with cd-hit at a 50 % similarity threshold.

Command was: cd-hit -i all.fasta -o all.cdhit.fasta -c 0.5 -n 2 -T 0


In [5]:
from Bio import SeqIO

# Record ids found in the cd-hit filtered FASTA file.
cdhit_entries = {
    str(rec.id) for rec in SeqIO.parse("filter/seq_db_cdhit.fasta", "fasta")
}

In [6]:
# True for rows whose index label appears in the cd-hit output.
in_cdhit = df.index.isin(cdhit_entries)
df['cdhit'] = in_cdhit

### Now compute similarities of all sequences in the dataset. The goal is to find the sequences that are less than 30 % identical to every other sequence in the dataset. Only these are eligible for the test set.

We created blast database from all sequences in the dataset:

Command was: makeblastdb -in all.cdhit.fasta -dbtype prot

Afterwards each sequence was queried against the database to get the homologues.

Command was: psiblast -query all.cdhit.fasta -db all.cdhit.fasta -outfmt "6 qseqid sseqid pident qcovs evalue" -evalue 1e-2 -num_threads 20 -max_target_seqs 2 > blast_all_cdhit.csv

In [7]:
# Tab-separated BLAST hits; column names are supplied here rather than read
# from the file.
blast_columns = ['qid', 'sid', 'ident', 'cov', 'evalue']
df_blast = pd.read_csv('./filter/seq_db_all.csv', sep='\t', names=blast_columns)

In [8]:
rows = []    # per query: the non-self hit with the highest identity
nohits = []  # queries left without any non-self hit
for pdbid, hits in df_blast.groupby('qid'):
    not_self = hits['qid'] != hits['sid']
    hits = hits[(hits['cov'] >= 0) & not_self]
    if len(hits) == 0:
        nohits.append(pdbid)
        continue
    # Ascending sort on identity, so the last row is the closest homologue.
    rows.append(hits.sort_values(by=['ident']).iloc[-1])

In [9]:
# One row per query: its best non-self hit. The columns are named explicitly so
# that an empty `rows` list still yields a frame with an 'ident' column instead
# of a column-less frame that breaks the identity filter in the next cell.
df_blast = pd.DataFrame(rows, columns=['qid', 'sid', 'ident', 'cov', 'evalue'])

In [10]:
# Queries whose best non-self hit is below 30 % identity, together with the
# queries that had no non-self hit at all.
low_identity_queries = df_blast.loc[df_blast['ident'] < 30, 'qid'].tolist()
less30_entries = set(low_identity_queries).union(nohits)

In [11]:
# True for rows whose index label is in less30_entries.
is_less30 = df.index.isin(less30_entries)
df['less30'] = is_less30
df.shape

(28939, 19)

### Filter by resolution

In [12]:
ok_res = set()  # lower-cased ids of entries with a resolution of at most 4.0
# The context manager closes the index file as soon as it has been read.
with open('./../5_Analyze_Results/resolu.idx') as f:
    lines = f.readlines()
# The first six lines are skipped, as before (presumably the file header).
for line in lines[6:]:
    fields = line.rstrip().replace("\t", "").split(';')
    if len(fields) < 2:
        # No ';' separator (e.g. a blank line): there is no resolution to read.
        continue
    try:
        res = float(fields[1])
    except ValueError:
        # The resolution field is not a number; skip this entry.
        continue
    if res <= 4.0:
        ok_res.add(fields[0].lower())

In [13]:
# Flag rows whose PDB id (the index label up to the first '_') passed the
# resolution filter. A single vectorised assignment replaces the row-by-row
# loop with DataFrame.set_value, which is deprecated and removed in pandas 1.0.
pdb_ids = df.index.str.split('_').str[0]
df['ok_res'] = pdb_ids.isin(ok_res)

  after removing the cwd from sys.path.
  


In [14]:
# Keep only the rows that passed the resolution filter.
passed_resolution = df['ok_res'].eq(True)
df = df[passed_resolution]
df.shape

(28731, 20)

### Filter by length

In [15]:
# Keep sequences of 20 to 500 characters (both bounds inclusive).
seq_len = df['sequence'].str.len()
df = df[seq_len.between(20, 500)]
df.shape

(26193, 20)

In [16]:
def _li2016_key(record_id):
    """Build '<pdb>_<chain>' (pdb lower-cased) from the 'PDB:CHAIN' text found
    in the second '|'-separated field of ``record_id``."""
    pdb_code, chain_id = record_id.split('|')[1].split(':')
    return '%s_%s' % (pdb_code.lower(), chain_id)


# Keys of all records in the Li2016 FASTA file.
li2016_entries = {
    _li2016_key(rec.id) for rec in SeqIO.parse("li2016/li2016.fasta", "fasta")
}

In [17]:
# True for rows whose index label is one of the Li2016 entries.
in_li2016 = df.index.isin(li2016_entries)
df['li2016'] = in_li2016

In [18]:
# Li2016 entries that are also flagged less30 and present in the cd-hit output.
li2016_mask = df['li2016'].eq(True) & df['less30'].eq(1) & df['cdhit'].eq(True)
li2016_testset = df[li2016_mask]

In [19]:
# Size check: (rows, columns) of the Li2016 benchmark subset.
li2016_testset.shape

(518, 21)

In [20]:
# Mark the Li2016 benchmark rows so they can be removed from this dataset.
in_li2016_testset = df.index.isin(li2016_testset.index)
df['exclude'] = in_li2016_testset

In [21]:
# Size before the rows flagged 'exclude' are removed.
df.shape

(26193, 22)

In [22]:
# Remove the rows that belong to the Li2016 benchmark subset.
keep_rows = df['exclude'].eq(False)
df = df[keep_rows]

In [23]:
# Size after the rows flagged 'exclude' have been removed.
df.shape

(25675, 22)

### Get only entries after cd-hit

In [24]:
# Keep only the entries that are present in the cd-hit output.
cdhit_mask = df['cdhit'].eq(True)
df = df.loc[cdhit_mask]
df.shape

(21166, 22)

###  Filter out half of negative sequences

In [25]:
# Keep every positive entry (cc == 1) and a random half of the negative ones
# (cc == 0). The fixed seed makes the subsample identical on every run.
NEGATIVE_SAMPLE_SEED = 42
positives = df[df['cc'] == 1]
negatives = df[df['cc'] == 0]
df = pd.concat([positives, negatives.sample(frac=0.5, random_state=NEGATIVE_SAMPLE_SEED)])
df.shape

(11645, 22)

### Train test split

We want to ensure an equal distribution of CC residues and non-CC residues in each dataset.

In [26]:
# Number of positive (cc == 1) and negative (cc == 0) entries.
cc_count = len(df[df['cc'].eq(1)])
nocc_count = len(df[df['cc'].eq(0)])

In [27]:
# Class counts next to 10 % of each, for comparison with the sample-size
# ranges used in the train/test split below.
cc_count, 0.1*cc_count, nocc_count, 0.1*nocc_count

(2125, 212.5, 9520, 952.0)

In [28]:
import random
import numpy as np


def _cc_fraction(frame):
    """Return count('1') / (count('1') + count('0')) over the concatenated
    ``socket_assignment`` strings of ``frame``."""
    joined = ''.join(frame['socket_assignment'].tolist())
    ones = joined.count('1')
    zeros = joined.count('0')
    return ones / (ones + zeros)


# Fixed seeds so that re-running the notebook reproduces the same split.
SPLIT_SEED = 42
size_rng = random.Random(SPLIT_SEED)
row_rng = np.random.RandomState(SPLIT_SEED)

# Only rows flagged less30 may enter the test set. The candidate pools do not
# change between attempts, so they are built once outside the loop.
pos_pool = df[(df['cc'] == 1) & (df['less30'] == True)]
neg_pool = df[(df['cc'] == 0) & (df['less30'] == True)]

all_frac = _cc_fraction(df)
score = 1
best = 1
it = 0
# Redraw the test set until the train and test fractions differ by no more
# than the threshold (measured as the standard deviation of the two values).
while score > 0.00002:
    it += 1
    test_pos = pos_pool.sample(size_rng.randrange(200, 230), random_state=row_rng)
    test_neg = neg_pool.sample(size_rng.randrange(920, 990), random_state=row_rng)
    test = pd.concat((test_pos, test_neg))
    # drop() returns a new frame, so df is left untouched and no copy is needed.
    train = df.drop(test.index)
    train_frac = _cc_fraction(train)
    test_frac = _cc_fraction(test)
    score = np.std((train_frac, test_frac))
    if score < best:
        print(score)
        best = score
        print(all_frac, train_frac, test_frac, best, it)

0.00161479614645
0.03041347094432709 0.030113536244352784 0.03334312853724531 0.00161479614645 1
0.000258033827777
0.03041347094432709 0.03036506285151086 0.03088113050706567 0.000258033827777 2
0.000121864927236
0.03041347094432709 0.030436439676504976 0.030192709822033006 0.000121864927236 14
6.39369032481e-05
0.03041347094432709 0.03042535873781495 0.030297484931318838 6.39369032481e-05 34
6.06292943259e-05
0.03041347094432709 0.03042486060813734 0.030303602019485468 6.06292943259e-05 38
1.15024062088e-05
0.03041347094432709 0.030411415596733396 0.030434420409150953 1.15024062088e-05 48


In [29]:
# Sizes of the accepted train / test split.
train.shape, test.shape

((10485, 22), (1160, 22))

### Split train for 5-fold cross validation

Again, an equal distribution of CC and non-CC residues must be ensured in each validation run.

In [30]:
# Fixed seed so that re-running the notebook reproduces the same folds.
CV_SEED = 42
N_FOLDS = 5
fold_rng = np.random.RandomState(CV_SEED)

score = 1
best = 1
it = 0
# Reshuffle until the per-fold fractions of '1' characters have a standard
# deviation of at most the threshold.
while score > 0.0002:
    it += 1
    shuffled = train.sample(frac=1, random_state=fold_rng)
    # Split row positions rather than the frame itself; this yields the same
    # contiguous, near-equal chunks without asking NumPy to split a DataFrame.
    chunks = np.array_split(np.arange(len(shuffled)), N_FOLDS)
    splits = [shuffled.iloc[chunk] for chunk in chunks]
    fractions = []
    for split in splits:
        joined = ''.join(split['socket_assignment'].tolist())
        pos_cc = joined.count('1')
        neg_cc = joined.count('0')
        fractions.append(pos_cc / (pos_cc + neg_cc))
    score = np.std(fractions)
    if score < best:
        best = score
        print(score, fractions, it)
        # Label the folds 1..N_FOLDS in the 'val' column. assign() returns
        # labelled copies, so nothing is written into a slice of `shuffled`,
        # and the folds are concatenated once.
        train_val = pd.concat(
            [split.assign(val=fold) for fold, split in enumerate(splits, start=1)]
        )

0.00225916966386 [0.028672398147568685, 0.030239752513534415, 0.03241549946786611, 0.033399634206565935, 0.027326000194685097] 1
0.00170069282977 [0.030321885094984572, 0.02855723431894099, 0.03270583129194778, 0.031923456280171326, 0.02854991983105862] 2
0.00111610343167 [0.03016846172916265, 0.031085387266697014, 0.03218259342476657, 0.029461114465152382, 0.029117047947351926] 4
0.00107962152056 [0.02914814388567795, 0.031191939402601114, 0.03202419126798331, 0.029408579065656273, 0.030284475258345236] 9
0.000686113687914 [0.03029827498023899, 0.030419904996057685, 0.031699511604074755, 0.029804394717648702, 0.02985170381199745] 15
0.000448834292455 [0.030831185253635523, 0.03060875512995896, 0.030818870691762406, 0.030141652249134947, 0.029667544888882017] 43
0.000373402399414 [0.03015175329461824, 0.03002597627159941, 0.03052280354993689, 0.031081387396568124, 0.03027036996478447] 113
0.000225642912878 [0.030657628962088252, 0.030200961429332637, 0.030092412223971077, 0.03049602392

In [31]:
# (rows, columns) of each of the five validation folds, in fold order.
tuple(train_val[train_val['val'] == fold].shape for fold in range(1, 6))

((2097, 23), (2097, 23), (2097, 23), (2097, 23), (2097, 23))

### Save final data

In [None]:
# Write the fold-labelled training data and the test data, each as pickle and CSV.
outputs = (
    (train_val, 'out/train_data.p', 'out/train_data.csv'),
    (test, 'out/test_data.p', 'out/test_data.csv'),
)
for frame, pickle_path, csv_path in outputs:
    frame.to_pickle(pickle_path)
    frame.to_csv(csv_path)

In [None]:
# Save the Li2016 benchmark subset (the rows removed from df above) as a pickle.
li2016_testset.to_pickle('out/li2016.p')

In [None]:
# Same subset as CSV.
li2016_testset.to_csv('out/li2016.csv')