### Merge results from other methods benchmark and parsed Socket assignment

In [1]:
import pandas as pd
import pickle

In [2]:
# Load the SOCKET-derived table and the three predictor benchmark tables.
# Each file is opened in a `with` block so its handle is closed right after
# unpickling (the bare `pickle.load(open(...))` form leaves it open).
# NOTE(review): pickle can execute arbitrary code while loading -- only read
# files produced by this project's own data-preparation step.
with open('./../1_Data_Preparation/out/pickle/data_all_74.p', 'rb') as fh:
    df_socket = pickle.load(fh)
with open('./../1_Data_Preparation/out/pickle/pcoils_all_74.p', 'rb') as fh:
    df_pcoils = pickle.load(fh)
with open('./../1_Data_Preparation/out/pickle/marcoil_all_74.p', 'rb') as fh:
    df_marcoil = pickle.load(fh)
with open('./../1_Data_Preparation/out/pickle/cchmmprof_all_74.p', 'rb') as fh:
    df_cchmmprof = pickle.load(fh)

In [3]:
# Put the four loaded tables side by side (column-wise); pandas aligns the
# rows on the frames' index labels.
frames = [df_socket, df_cchmmprof, df_marcoil, df_pcoils]
df = pd.concat(frames, axis='columns')

### Get sequences after filtering with cd-hit

We noticed some nearly identical sequences in the pdb50 dataset; therefore, the sequences obtained in the data preparation part were additionally filtered with cd-hit to 50 % similarity.

Command was: cd-hit -i all.fasta -o all.cdhit.fasta -c 0.5 -n 2 -T 0


In [4]:
from Bio import SeqIO

# Identifiers of every record present in the cd-hit-filtered FASTA file.
cdhit_entries = {
    str(record.id)
    for record in SeqIO.parse("filter/seq_db_cdhit.fasta", "fasta")
}

In [5]:
# True for rows whose index label appears among the cd-hit record ids.
is_cdhit_entry = df.index.isin(cdhit_entries)
df['cdhit'] = is_cdhit_entry

### Now compute similarities of all sequences in the dataset. The goal is to find the sequences that are less than 30 % identical to every other sequence in the dataset. These will be the candidates for the test set.

We created a BLAST database from all sequences in the dataset:

Command was: makeblastdb -in all.cdhit.fasta -dbtype prot

Afterwards each sequence was queried against the database to get the homologues.

Command was: psiblast -query all.cdhit.fasta -db all.cdhit.fasta -outfmt "6 qseqid sseqid pident qcovs evalue" -evalue 1e-2 -num_threads 20 -max_target_seqs 2 > blast_all_cdhit.csv

In [6]:
# Tab-separated BLAST hit table without a header row; the column names are
# supplied here: query id, subject id, identity, coverage, e-value.
blast_columns = ['qid', 'sid', 'ident', 'cov', 'evalue']
df_blast = pd.read_csv('./filter/seq_db_all.csv', sep='\t', names=blast_columns)

In [7]:
# For every query keep its single highest-identity hit against a *different*
# sequence; queries left with no such hit are remembered in `nohits`.
rows = []
nohits = []
for pdbid, group in df_blast.groupby('qid'):
    not_self = group['qid'] != group['sid']
    # The coverage threshold is 0, so this part of the mask only removes rows
    # whose coverage is missing.
    hits = group[(group['cov'] >= 0) & not_self]
    if hits.empty:
        nohits.append(pdbid)
        continue
    # Ascending sort on identity -> the last row is the most similar hit.
    rows.append(hits.sort_values(by=['ident']).iloc[-1])

In [8]:
# Replace the raw hit table with one row per query: the best non-self hit
# collected in `rows`.
df_blast = pd.DataFrame(data=rows)

In [9]:
# Queries whose best non-self hit is below 30 % identity, plus the queries
# that had no non-self hit at all.
below_30 = df_blast.loc[df_blast['ident'] < 30, 'qid'].tolist()
less30_entries = set(below_30).union(nohits)

In [10]:
# True for rows whose index label is in `less30_entries`.
is_less30 = df.index.isin(less30_entries)
df['less30'] = is_less30

### Get only entries after cd-hit

In [11]:
# Keep only the rows flagged as present in the cd-hit output.
keep_cdhit = df['cdhit'].eq(True)
df = df.loc[keep_cdhit]
df.shape

(24166, 19)

### Filter by length

In [12]:
# Keep sequences whose length lies in the inclusive range 25..500.
seq_len = df['sequence'].str.len()
df = df[(seq_len >= 25) & (seq_len <= 500)]
df.shape

(21783, 19)

### Filter out half of the negative sequences

In [13]:
# Keep every cc == 1 row and a random half of the cc == 0 rows.
# `random_state` pins the subsample so that Restart & Run All yields the same
# dataset each time (an unseeded .sample() draws a different half per run).
positives = df[df['cc'] == 1]
negatives_half = df[df['cc'] == 0].sample(frac=0.5, random_state=42)
df = pd.concat([positives, negatives_half])
df.shape

(12095, 19)

### Train test split

We want to ensure an equal distribution of CC residues and non-CC residues in each dataset.

In [14]:
# Number of rows with cc == 1 and with cc == 0, respectively.
cc_count = len(df[df['cc'] == 1])
nocc_count = len(df[df['cc'] == 0])

In [15]:
# Show each class count next to 10 % of it.
(cc_count, cc_count * 0.1, nocc_count, nocc_count * 0.1)

(2407, 240.70000000000002, 9688, 968.8000000000001)

In [16]:
import random
import numpy as np

# Fixed seeds so that Restart & Run All reproduces the same train/test split.
SPLIT_SEED = 42
random.seed(SPLIT_SEED)
# One RandomState *object* is shared by every .sample() call: its state
# advances between draws, so each iteration proposes a different split.
# (Passing the bare integer instead would redraw the identical sample on every
# iteration and the loop could never make progress.)
split_rng = np.random.RandomState(SPLIT_SEED)


def _cc_residue_fraction(frame):
    """Return the share of '1' characters among all '0'/'1' characters.

    The ``socket_assignment`` strings of ``frame`` are concatenated once and
    the occurrences of '1' and '0' are counted in the joined string.
    """
    labels = ''.join(frame['socket_assignment'].tolist())
    n_pos = labels.count('1')
    n_neg = labels.count('0')
    return n_pos / (n_pos + n_neg)


all_frac = _cc_residue_fraction(df)

# Test candidates are restricted to rows flagged `less30`; the pools do not
# change between iterations, so they are built once outside the loop.
test_pool_pos = df[(df['cc'] == 1) & (df['less30'] == True)]
test_pool_neg = df[(df['cc'] == 0) & (df['less30'] == True)]

score = 1
best = 1
it = 0
# Random search: draw candidate test sets until the '1' fraction of train and
# test differ by a standard deviation of at most 0.00002.
while score > 0.00002:
    it += 1
    # Test-set sizes are drawn from fixed windows (220..259 positives and
    # 940..989 negatives) -- presumably centred on ~10 % of each class.
    test_pos = test_pool_pos.sample(random.randrange(220, 260), random_state=split_rng)
    test_neg = test_pool_neg.sample(random.randrange(940, 990), random_state=split_rng)
    test = pd.concat((test_pos, test_neg))
    # Everything that was not drawn for the test set becomes training data.
    train = df.drop(test.index)
    train_frac = _cc_residue_fraction(train)
    test_frac = _cc_residue_fraction(test)
    score = np.std((train_frac, test_frac))
    if score < best:
        print(score)
        best = score
        print(all_frac, train_frac, test_frac, best, it)

0.00344050100573
0.031063916913886134 0.030444019283737304 0.037325021295200475 0.00344050100573 1
0.00209635467227
0.031063916913886134 0.030688761202400983 0.03488147054693933 0.00209635467227 2
0.000511523700378
0.031063916913886134 0.030972109434537987 0.03199515683529456 0.000511523700378 3
0.000192140278416
0.031063916913886134 0.03102903821147502 0.0314133187683076 0.000192140278416 12
8.88008489354e-05
0.031063916913886134 0.031079866149868872 0.03090226445199808 8.88008489354e-05 39
2.91307157614e-05
0.031063916913886134 0.031058537815289517 0.031116799246812377 2.91307157614e-05 83
1.60604586502e-05
0.031063916913886134 0.031066847507276594 0.031034726589976143 1.60604586502e-05 185


In [17]:
# (rows, columns) of the final train and test frames.
(train.shape, test.shape)

((10895, 19), (1200, 19))

### Split train for 5-fold cross-validation

Again, an equal distribution of CC and non-CC residues must be ensured in each validation fold.

In [18]:
# Seeded generator so the fold assignment is reproducible. The same
# RandomState object is reused for every shuffle so that each iteration
# produces a different permutation.
cv_rng = np.random.RandomState(42)
N_FOLDS = 5

score = 1
best = 1
it = 0
# Random search: reshuffle `train` until the '1' fraction is nearly identical
# across the folds (standard deviation of at most 0.0002).
while score > 0.0002:
    it += 1
    shuffled = train.sample(frac=1, random_state=cv_rng)
    # Split the row *positions* instead of the frame itself: np.array_split on
    # a DataFrame emits a deprecation warning on recent pandas, while the
    # resulting fold sizes are the same.
    fold_positions = np.array_split(np.arange(len(shuffled)), N_FOLDS)
    splits = [shuffled.iloc[positions] for positions in fold_positions]
    fractions = []
    for split in splits:
        labels = ''.join(split['socket_assignment'].tolist())
        pos_cc = labels.count('1')
        neg_cc = labels.count('0')
        fractions.append(pos_cc / (pos_cc + neg_cc))
    score = np.std(fractions)
    if score < best:
        best = score
        print(score, fractions, it)
        # Label the folds 1..N_FOLDS in a new `val` column and stack them.
        # .assign() returns new frames, so nothing is written into a slice of
        # `shuffled` (which would raise SettingWithCopyWarning).
        train_val = pd.concat(
            [split.assign(val=fold) for fold, split in enumerate(splits, start=1)]
        )

0.0017934543971 [0.03448993286074697, 0.02959011345647435, 0.029632527141623552, 0.03078359613849606, 0.030850831510413776] 1
0.00110598604942 [0.03033043089975441, 0.033152431479504245, 0.030029100365302458, 0.03068553422832033, 0.031110182291419288] 2
0.00080175151099 [0.032423861742063326, 0.030102752782926998, 0.03055064630331745, 0.030828998265973056, 0.03141335576964797] 10
0.000646249162705 [0.02997316196614136, 0.03165086813702487, 0.03149816420000757, 0.030687486041837266, 0.031537000139011166] 13
0.000527364876373 [0.030499902669841126, 0.03160713180794594, 0.03119052732133204, 0.030406203661842383, 0.03163737140963656] 36
0.000399969123573 [0.030887770815285216, 0.030945999278020206, 0.03154269252871419, 0.03148382135838253, 0.030472786619535522] 46
0.00035714335487 [0.03118591950323446, 0.031269543464665414, 0.030579649948872495, 0.03155687569909207, 0.03074280420730484] 314
0.000312506758485 [0.03099356022272002, 0.030604236534564688, 0.03143753245865161, 0.030903993609030

In [19]:
# (rows, columns) of each validation fold, for val = 1..5.
tuple(train_val[train_val['val'] == fold].shape for fold in range(1, 6))

((2179, 20), (2179, 20), (2179, 20), (2179, 20), (2179, 20))

### Save final data

In [20]:
# Write the fold-annotated training frame and the test frame, each as a
# pickle and as a CSV file.
outputs = (
    (train_val, 'out/train_data.p', 'out/train_data.csv'),
    (test, 'out/test_data.p', 'out/test_data.csv'),
)
for frame, pickle_path, csv_path in outputs:
    frame.to_pickle(pickle_path)
    frame.to_csv(csv_path)