In [1]:
import numpy as np
import pandas as pd
from Bio import pairwise2
from Bio.Align import substitution_matrices
from os import path
from tqdm import tqdm

In [2]:
import multiprocessing

In [3]:
MATRIX = substitution_matrices.load("BLOSUM80")

In [4]:
score = pairwise2.align.globalds("ACCGT", "ACG", MATRIX, -10, -0.5, score_only=True)
score

9.0

In [5]:
DATA_DIR = "../../data"

In [6]:
chen_data = pd.read_csv(path.join(DATA_DIR, "chen/chen_data.csv"))
chen_data

Unnamed: 0,Antibody_ID,heavy,light,Y
0,12e8,EVQLQQSGAEVVRSGASVKLSCTASGFNIKDYYIHWVKQRPEKGLE...,DIVMTQSQKFMSTSVGDRVSITCKASQNVGTAVAWYQQKPGQSPKL...,0
1,15c8,EVQLQQSGAELVKPGASVKLSCTASGFNIKDTYMHWVKQKPEQGLE...,DIVLTQSPAIMSASLGERVTMTCTASSSVSSSNLHWYQQKPGSSPK...,0
2,1a0q,EVQLQESDAELVKPGASVKISCKASGYTFTDHVIHWVKQKPEQGLE...,DIELTQSPSSLSASLGGKVTITCKASQDIKKYIGWYQHKPGKQPRL...,1
3,1a14,QVQLQQSGAELVKPGASVRMSCKASGYTFTNYNMYWVKQSPGQGLE...,DIELTQTTSSLSASLGDRVTISCRASQDISNYLNWYQQNPDGTVKL...,0
4,1a2y,QVQLQESGPGLVAPSQSLSITCTVSGFSLTGYGVNWVRQPPGKGLE...,DIVLTQSPASLSASVGETVTITCRASGNIHNYLAWYQQKQGKSPQL...,0
...,...,...,...,...
2404,6s5a,EVKLLESGGGLVQPGGSLKLSCAASGFDFSRYWMNWVRQAPGKGLE...,QAVVTQESALTTSPGETVTLTCRSSTGAVTTSNYANWVQEKPDHLF...,0
2405,6tyb,EVQLVQSGTEVKRPGESLTISCKTSGYSFSGTWISWVRQMPGKGLE...,DIQMTQSPSSLSASVGDRVTITCRASQGISTYLAWYQQKPGKAPKL...,0
2406,6u1t,EVQLVESGGGLVKPGGSLKLSCAASGFTFSSYDMSWVRQTPEKRLE...,DIQMTQSPASQSASLGESVTITCLASQTIGTWLAWYQQKPGKSPQL...,0
2407,7fab,AVQLEQSGPGLVRPSQTLSLTCTVSGTSFDDYYWTWVRQPPGRGLE...,ASVLTQPPSVSGAPGQRVTISCTGSSSNIGAGHNVKWYQQLPGTAP...,0


In [7]:
def score_seqs(seq1, seq2):
    score = pairwise2.align.globalds(seq1, seq2, MATRIX, -6, -0.5, score_only=True)
    return score

In [8]:
heavy = chen_data["heavy"]

In [9]:
heavy_small = heavy.iloc[:5]

In [10]:
class CompareSeq:
    def __init__(self, seq):
        self.seq = seq
        
    def score(self, seq2):
        return score_seqs(self.seq, seq2)

In [12]:
def compare_row(seq, index, all_seqs):
    remains = len(all_seqs) - index
    if remains < 1:
        return np.zeros(len(all_seqs))
    rest_of_row = all_seqs[index:]
    pool = multiprocessing.Pool(6)
    comparison = CompareSeq(seq)
    scores = pool.map(comparison.score, rest_of_row)
    scores_int = np.fromiter(scores, dtype=int)
    scores_filled = np.concatenate((np.zeros(index), scores_int))
    return  scores_filled

In [13]:
# heavy
distances = [compare_row(s, i, heavy_small) for i, s in enumerate(tqdm(heavy_small))]
#distances = list(map(lambda s: compare_row(s, heavy_small), tqdm(heavy_small)))
distance_matrix = np.stack(distances)
distance_matrix

100%|██████████| 5/5 [00:03<00:00,  1.26it/s]


array([[1048.,  770.,  657.,  638.,  444.],
       [   0., 1038.,  715.,  675.,  468.],
       [   0.,    0., 1020.,  738.,  470.],
       [   0.,    0.,    0., 1050.,  493.],
       [   0.,    0.,    0.,    0., 1005.]])

Process ForkPoolWorker-29:
Process ForkPoolWorker-5:
Process ForkPoolWorker-26:
Process ForkPoolWorker-11:
Process ForkPoolWorker-15:
Process ForkPoolWorker-7:
Process ForkPoolWorker-19:
Process ForkPoolWorker-14:
Process ForkPoolWorker-9:
Process ForkPoolWorker-16:
Process ForkPoolWorker-30:
Process ForkPoolWorker-4:
Process ForkPoolWorker-25:
Process ForkPoolWorker-17:
Process ForkPoolWorker-10:
Process ForkPoolWorker-1:
Process ForkPoolWorker-21:
Process ForkPoolWorker-23:
Process ForkPoolWorker-6:
Process ForkPoolWorker-22:
Process ForkPoolWorker-20:
Process ForkPoolWorker-18:
Process ForkPoolWorker-3:
Process ForkPoolWorker-27:
Process ForkPoolWorker-24:
Process ForkPoolWorker-28:
Process ForkPoolWorker-13:
Process ForkPoolWorker-12:
Process ForkPoolWorker-2:
Process ForkPoolWorker-8:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/brazdilv/.conda/envs/ml/lib/python3.7/multipr

In [14]:
# heavy
distances = [compare_row(s, i, heavy) for i, s in enumerate(tqdm(heavy))]
#distances = list(map(lambda s: compare_row(s, heavy), tqdm(heavy)))
distance_matrix = np.stack(distances)
distance_matrix

  1%|          | 13/2409 [1:07:23<197:57:11, 297.43s/it]Process ForkPoolWorker-31:
Process ForkPoolWorker-45:
Process ForkPoolWorker-41:
Process ForkPoolWorker-36:
Process ForkPoolWorker-94:
Process ForkPoolWorker-57:
Process ForkPoolWorker-106:
Process ForkPoolWorker-105:
Process ForkPoolWorker-88:
Process ForkPoolWorker-74:
Process ForkPoolWorker-66:
Process ForkPoolWorker-76:
Process ForkPoolWorker-52:
Process ForkPoolWorker-100:
Process ForkPoolWorker-91:
Process ForkPoolWorker-61:
Process ForkPoolWorker-48:
Process ForkPoolWorker-35:
Process ForkPoolWorker-101:
Process ForkPoolWorker-77:
Process ForkPoolWorker-40:
Process ForkPoolWorker-103:
Process ForkPoolWorker-104:
Process ForkPoolWorker-43:
Process ForkPoolWorker-50:
Process ForkPoolWorker-108:
Process ForkPoolWorker-64:
Process ForkPoolWorker-55:
Process ForkPoolWorker-86:
Process ForkPoolWorker-99:
Process ForkPoolWorker-63:
Process ForkPoolWorker-80:
Process ForkPoolWorker-68:
Process ForkPoolWorker-73:
Process ForkPoolWor

KeyboardInterrupt: 

Process ForkPoolWorker-140:
Process ForkPoolWorker-176:
Process ForkPoolWorker-165:
Process ForkPoolWorker-212:
Process ForkPoolWorker-210:
Process ForkPoolWorker-181:
Process ForkPoolWorker-195:
Process ForkPoolWorker-170:
Process ForkPoolWorker-130:
Process ForkPoolWorker-209:
Process ForkPoolWorker-201:
Process ForkPoolWorker-205:
Process ForkPoolWorker-214:
Process ForkPoolWorker-118:
Process ForkPoolWorker-190:
Process ForkPoolWorker-174:
Process ForkPoolWorker-151:
Process ForkPoolWorker-153:
Process ForkPoolWorker-199:
Process ForkPoolWorker-206:
Process ForkPoolWorker-161:
Process ForkPoolWorker-193:
Process ForkPoolWorker-219:
Process ForkPoolWorker-187:
Process ForkPoolWorker-226:
Process ForkPoolWorker-202:
Process ForkPoolWorker-198:
Process ForkPoolWorker-137:
Process ForkPoolWorker-227:
Process ForkPoolWorker-217:
Process ForkPoolWorker-150:
Process ForkPoolWorker-146:
Process ForkPoolWorker-208:
Process ForkPoolWorker-133:
Process ForkPoolWorker-175:
Process ForkPoolWork

In [None]:
np.savetxt(path.join(DATA_DIR, "chen/distances/pairwise_scores_heavy.csv"), distance_matrix, delimiter=';')

In [None]:
light = chen_data["light"]

In [None]:
# light
distances = list(map(lambda s: compare_row(s, light), tqdm(light)))
distance_matrix_l = np.stack(distances)
distance_matrix_l

In [None]:
np.savetxt(path.join(DATA_DIR, "chen/distances/pairwise_scores_light.csv"), distance_matrix_l, delimiter=';')

In [None]:
# convert similarity to distance
def sim_to_dist(sim_matrix):
    max_score = np.amax(sim_matrix)
    dist_matrix = max_score = sim_matrix

In [None]:
from sklearn.cluster import AgglomerativeClustering

In [None]:
# clustering
def cluster_sequences(dist_matrix, sequences):
    clustering = AgglomerativeClustering(affinity='precomputed', linkage='complete')
    clustering.fit(dist_matrix)
    # create df with sequences and cluster labels?
    return clustering