In [1]:
from scipy.spatial import Delaunay
from itertools import combinations
import numpy as np
def delune(pdb_path='3oxc.filtered.pdb', expected_count=198):
    """Build a neighbour/distance table from a Delaunay triangulation.

    Each line of ``pdb_path`` is split on whitespace and fields 6, 7 and 8
    (0-based) are parsed as the x, y, z coordinates of one point.  The points
    are triangulated with ``scipy.spatial.Delaunay`` and every edge of every
    simplex is recorded in both directions together with its Euclidean
    length.

    Parameters
    ----------
    pdb_path : str
        File to read; defaults to the file the notebook was written for.
    expected_count : int or None
        Number of rows the file must contain.  The default (198) keeps the
        original sanity check; pass ``None`` to skip it.

    Returns
    -------
    dict
        ``{i: {j: dist, ...}, ...}`` keyed by 0-based row index ``i`` of the
        input file; ``j`` ranges over the rows sharing a simplex with ``i``.
        Every row index is present, possibly with an empty dict.

    Raises
    ------
    AssertionError
        If ``expected_count`` is given and the row count differs.
    """
    # read data; the context manager closes the file even if parsing fails
    with open(pdb_path) as f:
        data = [line.split() for line in f]
    if expected_count is not None:
        assert len(data) == expected_count, (
            "expected %d rows in %s, found %d"
            % (expected_count, pdb_path, len(data)))

    # coordinates, one row per input line
    # (field 5 appears to hold the residue number, 1..99 / 101..199 -- it is
    # not used; rows are identified by their 0-based position in the file)
    coords = np.array([[float(x) for x in d[6:9]] for d in data])

    # triangulate
    tri = Delaunay(coords)

    # storage: one (initially empty) neighbour dict per input row
    place_neighbor_dist = {i: {} for i in range(len(coords))}

    # tri.simplices already holds row indices, so edges are enumerated
    # directly instead of mapping coordinates back to indices (which would
    # merge rows that happen to share identical coordinates).
    for simplex in tri.simplices:
        for a, b in combinations(simplex, 2):
            a, b = int(a), int(b)  # plain ints as dict keys
            dist = np.linalg.norm(coords[a] - coords[b])  # l2 norm
            place_neighbor_dist[a][b] = dist
            place_neighbor_dist[b][a] = dist

    return place_neighbor_dist

In [2]:
# Build the neighbour/distance table once; get_count_vector reads this
# module-level name as a global.
place_neighbor_dist = delune()

In [3]:
# One-letter residue codes, listed in ascending alphabetical order.
amino_acids = [
    'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
    'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y',
]
# residue letter -> its position in amino_acids
aa_ind = dict((letter, pos) for pos, letter in enumerate(amino_acids))
# (first, second) with `first` at or before `second` in amino_acids -> running
# index, enumerated row by row: (A,A), (A,C), ..., (A,Y), (C,C), ..., (Y,Y)
vec_ind = {
    pair: slot
    for slot, pair in enumerate(
        (first, second)
        for pos, first in enumerate(amino_acids)
        for second in amino_acids[pos:]
    )
}

In [4]:
def get_count_vector(line):
    """Count neighbour contacts per unordered residue pair.

    For every position ``i`` in ``line`` and every neighbour ``n`` listed in
    the module-level ``place_neighbor_dist[i]``, the slot of the pair
    ``(line[i], line[n])`` in the module-level ``vec_ind`` is incremented.
    The pair is ordered with ``<`` on the residue strings, which matches the
    ``vec_ind`` keys as long as ``amino_acids`` is in ascending order.

    Because the table produced by ``delune`` stores each edge in both
    directions, every contact is counted once from each end.

    Parameters
    ----------
    line : sequence of str
        One residue code per position; must be indexable by every position
        and neighbour index in ``place_neighbor_dist``.

    Returns
    -------
    numpy.ndarray
        Float vector of length ``len(vec_ind)`` holding the counts.

    Raises
    ------
    KeyError
        If a position is missing from ``place_neighbor_dist`` or a residue
        pair is missing from ``vec_ind``.
    IndexError
        If a neighbour index lies beyond the end of ``line``.
    """
    # Sized from the pair index itself rather than a hard-coded 210.
    vec_tot = np.zeros(len(vec_ind))
    for i, aa in enumerate(line):
        for n in place_neighbor_dist[i]:
            naa = line[n]
            pair = (aa, naa) if aa < naa else (naa, aa)
            vec_tot[vec_ind[pair]] += 1
    return vec_tot

In [5]:
from collections import defaultdict
import time 
filetowrite = 'PI_DataSet_6_19_PI_count_vec_210.txt'
# Both files live inside one `with` so they are closed -- and the output is
# flushed to disk -- even if a row fails part-way through.
with open("PI_DataSet_6_19_expanded.txt") as f, open(filetowrite, 'w') as g:
    # The first line is the header; columns from "P1" onward hold the
    # per-position residues, everything before it is copied through as-is.
    # NOTE: the header row itself is not written to the output file.
    for line in f:
        header = line.split("\t")
        vec_start = header.index("P1")
        break
    count = 0
    t0 = time.time()
    for line in f:
        line = line.strip().split("\t")
        l = line[:vec_start]
        v = line[vec_start:]
        # The residue columns are repeated twice before counting
        # (presumably one copy per chain of the dimer -- TODO confirm).
        vec = get_count_vector(2 * v)
        # A real list (not a lazy map) so it can be concatenated with `l`
        # on both Python 2 and Python 3.
        vec = [str(int(x)) for x in vec]
        token = "\t".join(l + vec) + "\n"
        g.write(token)
        count += 1
        if count % 10000 == 0:
            # Row count and seconds spent on the last 10000 rows.
            print("%s %s" % (count, time.time() - t0))
            t0 = time.time()
print(count)

10000 15.5363609791
20000 14.9741559029
30000 15.3169879913
40000 14.5395390987
50000 14.5337688923
60000 14.5764241219
70000 14.6211659908
80000 14.7469580173
90000 14.608946085
100000 14.4563779831
110000 14.4653749466
120000 14.6250038147
130000 14.4934790134
140000 14.452930212
150000 14.4259779453
160000 14.5132460594
170000 14.9891531467
180000 14.4625229836
190000 14.4660639763
200000 15.1300601959
210000 14.6891539097
220000 14.7227830887
230000 20.6731390953
240000 14.8462629318
250000 14.4580509663
260000 14.4421069622
270000 14.4446251392
280000 14.5744390488
290000 14.436617136
300000 14.5755329132
310000 14.7687618732
320000 14.4507679939
330000 14.7399189472
340000 15.3586318493
350000 14.6817450523
360000 14.5681519508
370000 14.6086750031
380000 14.3009150028
390000 14.4223351479
400000 15.020637989
410000 14.321144104
414009
