In [1]:
import numpy as np
import os
import random
from math import ceil, floor

Strategy 2: Unlike strategy 1, which yields a single 256×256 matrix,
we tried an ensemble-based strategy, which yields __multiple matrices rather than
a single one__. These matrices were fed into the DCNN, and the results were
averaged. Here, the number of __ensemble matrices was set to be proportional to
the length of the query protein__.

In [2]:
# Sampling: For a protein with length over 256, a 256×256 sub-matrix is randomly
# sampled from its contact likelihood matrix. Repeating this procedure multiple
# times yields an ensemble of sub-matrices.

def sampling(distance_matrix, new_shape=(64,64), sample_size=None):
    """Draw an ensemble of random rectangular crops from a matrix.

    Each ensemble member is a contiguous ``new_shape`` window of
    ``distance_matrix`` whose top-left corner is picked uniformly at random
    (with the ``random`` module) among all positions where the window fits.

    Parameters
    ----------
    distance_matrix : array-like
        Matrix to crop from, indexed as ``[row, column]``; it is converted
        with ``np.asarray``.
    new_shape : tuple of int, default (64, 64)
        ``(height, width)`` of every crop.
    sample_size : int, optional
        Number of crops to draw. When omitted (``None`` or 0) it is set to
        ``2 * floor(rows / height)``, so the number of ensemble matrices is
        proportional to the size of the input.

    Returns
    -------
    numpy.ndarray
        The crops stacked along a new first axis, i.e. shape
        ``(sample_size, height, width)``. When no crop is drawn the result is
        the empty array produced by ``np.array([])``.

    Raises
    ------
    ValueError
        If at least one crop is requested but the window is larger than the
        matrix along either axis.
    """
    distance_matrix = np.asarray(distance_matrix)
    n_rows, n_cols = distance_matrix.shape[0], distance_matrix.shape[1]
    height, width = new_shape[0], new_shape[1]

    if not sample_size:
        sample_size = int(floor(n_rows / height)) * 2

    # Largest valid top-left corner along each axis (inclusive).
    max_row, max_col = n_rows - height, n_cols - width
    if sample_size > 0 and (max_row < 0 or max_col < 0):
        raise ValueError(
            "cannot sample a %dx%d window from a %dx%d matrix"
            % (height, width, n_rows, n_cols)
        )

    ensemble = []
    for _ in range(sample_size):
        # The row offset is drawn before the column offset so that seeded
        # runs consume the random stream in a fixed, documented order.
        row = random.randint(0, max_row)
        col = random.randint(0, max_col)
        ensemble.append(distance_matrix[row:row + height, col:col + width])
    return np.array(ensemble)

# Demo: random 7x7 crops of an 8x8 matrix (ensemble size left at its default).
sampling(
    np.array([
        [1, 2, 3, 4, 5, 6, 7, 8],
        [1, 2, 3, 4, 5, 6, 7, 8],
        [1, 2, 8, 4, 5, 6, 7, 8],
        [1, 2, 3, 4, 5, 6, 7, 8],
        [1, 2, 9, 4, 5, 6, 7, 8],
        [1, 2, 3, 4, 5, 6, 7, 8],
        [1, 8, 3, 4, 5, 6, 7, 8],
        [1, 2, 3, 4, 5, 6, 7, 8],
    ]),
    new_shape=(7, 7),
)

array([[[1, 2, 3, 4, 5, 6, 7],
        [1, 2, 8, 4, 5, 6, 7],
        [1, 2, 3, 4, 5, 6, 7],
        [1, 2, 9, 4, 5, 6, 7],
        [1, 2, 3, 4, 5, 6, 7],
        [1, 8, 3, 4, 5, 6, 7],
        [1, 2, 3, 4, 5, 6, 7]],

       [[1, 2, 3, 4, 5, 6, 7],
        [1, 2, 8, 4, 5, 6, 7],
        [1, 2, 3, 4, 5, 6, 7],
        [1, 2, 9, 4, 5, 6, 7],
        [1, 2, 3, 4, 5, 6, 7],
        [1, 8, 3, 4, 5, 6, 7],
        [1, 2, 3, 4, 5, 6, 7]]])

In [25]:
# Padding: For a protein with length smaller than 256, we embedded its contact
# matrix into a 256 × 256 matrix with all elements being 0. The embedding
# positions are random; thus, we obtained an ensemble of 256 × 256 matrices
# after repeating this operation multiple times.

def padding(distance_matrix, new_shape=(64,64), sample_size=None):
    """Embed a small matrix at random positions inside zero-filled canvases.

    Every ensemble member is a ``new_shape`` array of zeros into which
    ``distance_matrix`` is copied at a top-left corner picked uniformly at
    random (with the ``random`` module) among all positions where it fits.

    Parameters
    ----------
    distance_matrix : array-like
        2-D matrix to embed; it is converted with ``np.asarray``.
    new_shape : tuple of int, default (64, 64)
        ``(height, width)`` of every canvas.
    sample_size : int, optional
        Number of canvases to build. When omitted (``None`` or 0) it is set
        to ``2 * ceil(rows / height)``.

    Returns
    -------
    numpy.ndarray
        The canvases stacked along a new first axis, i.e. shape
        ``(sample_size, height, width)``. When no canvas is built the result
        is the empty array produced by ``np.array([])``.

    Raises
    ------
    ValueError
        If at least one canvas is requested but the matrix is larger than
        the canvas along either axis.
    """
    distance_matrix = np.asarray(distance_matrix)
    n_rows, n_cols = distance_matrix.shape[0], distance_matrix.shape[1]
    height, width = new_shape[0], new_shape[1]

    if not sample_size:
        sample_size = int(ceil(n_rows / height)) * 2

    # Largest valid top-left corner along each axis (inclusive).
    max_row, max_col = height - n_rows, width - n_cols
    if sample_size > 0 and (max_row < 0 or max_col < 0):
        raise ValueError(
            "cannot embed a %dx%d matrix into a %dx%d canvas"
            % (n_rows, n_cols, height, width)
        )

    # Integer zeros combined with the input values: integer input stays
    # integer, floating-point input yields a floating-point canvas.
    canvas_dtype = np.promote_types(np.int_, distance_matrix.dtype)

    ensemble = []
    for _ in range(sample_size):
        canvas = np.zeros((height, width), dtype=canvas_dtype)
        # The row offset is drawn before the column offset so that seeded
        # runs consume the random stream in a fixed, documented order.
        row = random.randint(0, max_row)
        col = random.randint(0, max_col)
        canvas[row:row + n_rows, col:col + n_cols] = distance_matrix
        ensemble.append(canvas)
    return np.array(ensemble)

# Demo: embed a 3x3 block of threes at random positions in 8x8 zero canvases.
padding(np.full((3, 3), 3), new_shape=(8, 8))

array([[[0, 0, 0, 0, 0, 3, 3, 3],
        [0, 0, 0, 0, 0, 3, 3, 3],
        [0, 0, 0, 0, 0, 3, 3, 3],
        [0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0]],

       [[0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 3, 3, 3],
        [0, 0, 0, 0, 0, 3, 3, 3],
        [0, 0, 0, 0, 0, 3, 3, 3],
        [0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0]]])

In [2]:
# Strategy 3, Sampling
# stride length = new_shape[0]

def sampling_s3(distance_matrix, new_shape=(64,64)):
    """Cut a matrix into a grid of non-overlapping, zero-padded tiles.

    The matrix is traversed in row-major order with a stride equal to the
    tile size. Tiles that extend past the bottom or right edge keep the
    available values in their top-left corner and are filled with zeros
    elsewhere.

    Parameters
    ----------
    distance_matrix : array-like
        2-D matrix to tile; it is converted with ``np.asarray``.
    new_shape : tuple of int, default (64, 64)
        ``(height, width)`` of every tile.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n_tiles, height, width)`` with
        ``n_tiles = ceil(rows / height) * ceil(cols / width)``.
    """
    distance_matrix = np.asarray(distance_matrix)
    tile_rows, tile_cols = new_shape[0], new_shape[1]
    # Tile counts are computed per axis so non-square inputs are fully covered.
    n_row_tiles = ceil(distance_matrix.shape[0] / tile_rows)
    n_col_tiles = ceil(distance_matrix.shape[1] / tile_cols)

    ensemble = []
    for i in range(n_row_tiles):
        for j in range(n_col_tiles):
            # Slicing past the edge simply yields a smaller block.
            part = distance_matrix[i * tile_rows:(i + 1) * tile_rows,
                                   j * tile_cols:(j + 1) * tile_cols]
            tile = np.zeros((tile_rows, tile_cols))
            tile[:part.shape[0], :part.shape[1]] = part
            ensemble.append(tile)
    return np.array(ensemble)

# Demo: tile an 8x8 matrix with 6x6 windows; edge tiles are zero-padded.
sampling_s3(
    np.array([
        [1, 2, 3, 4, 5, 6, 7, 8],
        [1, 2, 3, 4, 5, 6, 7, 8],
        [1, 2, 8, 4, 5, 6, 7, 8],
        [1, 2, 3, 4, 5, 6, 7, 8],
        [1, 2, 9, 4, 5, 6, 7, 8],
        [1, 2, 3, 4, 5, 6, 7, 8],
        [1, 8, 3, 4, 5, 6, 7, 8],
        [1, 2, 3, 4, 5, 6, 7, 8],
    ]),
    new_shape=(6, 6),
)

array([[[1., 2., 3., 4., 5., 6.],
        [1., 2., 3., 4., 5., 6.],
        [1., 2., 8., 4., 5., 6.],
        [1., 2., 3., 4., 5., 6.],
        [1., 2., 9., 4., 5., 6.],
        [1., 2., 3., 4., 5., 6.]],

       [[7., 8., 0., 0., 0., 0.],
        [7., 8., 0., 0., 0., 0.],
        [7., 8., 0., 0., 0., 0.],
        [7., 8., 0., 0., 0., 0.],
        [7., 8., 0., 0., 0., 0.],
        [7., 8., 0., 0., 0., 0.]],

       [[1., 8., 3., 4., 5., 6.],
        [1., 2., 3., 4., 5., 6.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.]],

       [[7., 8., 0., 0., 0., 0.],
        [7., 8., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.]]])

In [34]:
# Strategy 3, Padding
# stride length = distance_matrix.shape[0]

def padding_s3(distance_matrix, new_shape=(64,64)):
    """Slide a small matrix over a zero canvas on a regular grid.

    One canvas is produced per grid position, in row-major order. The grid
    stride equals the size of ``distance_matrix`` along each axis. At
    positions where the matrix would stick out past the bottom or right edge
    of the canvas, only its top-left part that still fits is written.

    Parameters
    ----------
    distance_matrix : array-like
        2-D matrix to embed; it is converted with ``np.asarray``.
    new_shape : tuple of int, default (64, 64)
        ``(height, width)`` of every canvas.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n_positions, height, width)`` with
        ``n_positions = ceil(height / rows) * ceil(width / cols)``.

    Raises
    ------
    ValueError
        If the matrix is larger than the canvas along either axis.
    ZeroDivisionError
        If the matrix has no rows or no columns.
    """
    distance_matrix = np.asarray(distance_matrix)
    n_rows, n_cols = distance_matrix.shape[0], distance_matrix.shape[1]
    height, width = new_shape[0], new_shape[1]
    if n_rows > height or n_cols > width:
        raise ValueError(
            "cannot embed a %dx%d matrix into a %dx%d canvas"
            % (n_rows, n_cols, height, width)
        )

    row_steps = ceil(height / n_rows)
    col_steps = ceil(width / n_cols)

    ensemble = []
    for i in range(row_steps):
        for j in range(col_steps):
            top, left = i * n_rows, j * n_cols
            # Clip to the room left on the canvas. The clipped extent must
            # be measured from the current offset; otherwise an edge slot
            # receives a wrongly sized block that NumPy either broadcasts
            # silently (duplicating values) or rejects.
            fit_rows = min(n_rows, height - top)
            fit_cols = min(n_cols, width - left)
            sample = np.zeros((height, width))
            sample[top:top + fit_rows, left:left + fit_cols] = \
                distance_matrix[:fit_rows, :fit_cols]
            ensemble.append(sample)
    return np.array(ensemble)

# Demo: slide a 3x3 block of threes across an 8x8 zero canvas.
padding_s3(np.full((3, 3), 3), new_shape=(8, 8))

array([[[3., 3., 3., 0., 0., 0., 0., 0.],
        [3., 3., 3., 0., 0., 0., 0., 0.],
        [3., 3., 3., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.]],

       [[0., 0., 0., 3., 3., 3., 0., 0.],
        [0., 0., 0., 3., 3., 3., 0., 0.],
        [0., 0., 0., 3., 3., 3., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.]],

       [[0., 0., 0., 0., 0., 0., 3., 3.],
        [0., 0., 0., 0., 0., 0., 3., 3.],
        [0., 0., 0., 0., 0., 0., 3., 3.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0