In [1]:
import os
import time
import yaml
import pandas as pd
# Suppress warnings about having no GPU. The variable is set before the
# TensorFlow import below so it is already in place when TensorFlow loads.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import tensorflow as tf
import logging
# Suppress warnings about tf retracing by only letting ERROR-level records
# through TensorFlow's Python logger.
# NOTE(review): both lines below appear to target the same 'tensorflow'
# logger - confirm whether both are needed.
tf.get_logger().setLevel('ERROR')
logging.getLogger('tensorflow').setLevel(logging.ERROR)

from Humatch.humanise import humanise
from Humatch.model import load_cnn
from Humatch.germline_likeness import get_normalised_germline_likeness_score, mutate_seq_to_match_germline_likeness
from Humatch.utils import get_edit_distance
from Humatch.plot import highlight_differnces_between_two_seqs

### Load pre-aligned non-human sequences

We skip the sequence alignment step here for simplicity. Code to do this can be found in the classification notebook.

In [2]:
# abspath("") resolves to the current working directory, so root_dir is its parent.
root_dir = os.path.dirname(os.path.abspath(""))
data = os.path.join(root_dir, "data", "example_prealigned.csv")

# Read the pre-aligned table and keep only the non-human rows, renumbered from 0.
df = (
    pd.read_csv(data)
    .loc[lambda table: table["is_human"] == 0]
    .reset_index(drop=True)
)
print(df.shape)
df.head(2)

(10, 3)


Unnamed: 0,is_human,heavy,light
0,0,EVK-LVESGG-GLVQPGGSLRLSCATSGFTF---------TDYYMS...,DVV-MTQTPLSLPVSLGDQASISCRSSQSLVHS------NGNTYLH...
1,0,EVQ-LVESGG-GLVQPGRSLKLSCAASGFTF---------SNYYMA...,DIV-MTQGALPNPVPSGESVSITCRSSKSLLYS------DGKTYLN...


### Load CNNs

In [3]:
weights_dir = os.path.join(root_dir, "Humatch", "trained_models")

# (model type, weights file) pairs, loaded in this order: heavy, light, paired.
_cnn_weight_files = (
    ("heavy", "heavy.weights.h5"),
    ("light", "light.weights.h5"),
    ("paired", "paired.weights.h5"),
)
cnn_heavy, cnn_light, cnn_paired = (
    load_cnn(os.path.join(weights_dir, weights_file), model_type)
    for model_type, weights_file in _cnn_weight_files
)

### Germline-likeness mutations

Here we demonstrate germline-likeness (GL) mutations in isolation to highlight their speed. Users may specify IMGT positions that should remain fixed. Although the example sequence is closest to hv3, we can instead target a different germline, e.g. hv1, by changing the target gene.

In [4]:
# Inputs for the germline-likeness demo.
example_seq = df["heavy"][0]
target_gene = "hv1"
target_score = 0.40
allow_CDR_mutations = False
# NOTE(review): entries look like an IMGT position number followed by an
# insertion-code character (a space when there is none) - confirm the format.
fixed_imgt_positions = ["1 ", "81 ", "81A", "120 "]

germline_mutated_seq = mutate_seq_to_match_germline_likeness(
    example_seq,
    target_gene,
    target_score,
    allow_CDR_mutations,
    fixed_imgt_positions,
)

# Report the score before and after mutation, then the number of edits made.
original_gl_score = get_normalised_germline_likeness_score(example_seq, target_gene)
print(f"GL-score ({target_gene}) of original sequence:\t{original_gl_score:.2f}")
mutated_gl_score = get_normalised_germline_likeness_score(germline_mutated_seq, target_gene)
print(f"GL-score ({target_gene}) of mutated sequence:\t{mutated_gl_score:.2f}")
n_edits = get_edit_distance(example_seq, germline_mutated_seq)
print(f"Edit distance between both sequences:\t{n_edits}\n")

# Original sequence, a line marking the differing positions, then the mutated sequence.
print(
    example_seq,
    highlight_differnces_between_two_seqs(example_seq, germline_mutated_seq),
    germline_mutated_seq,
    sep="\n",
)

GL-score (hv1) of original sequence:	0.28
GL-score (hv1) of mutated sequence:	0.40
Edit distance between both sequences:	27

EVK-LVESGG-GLVQPGGSLRLSCATSGFTF---------TDYYMS-WVRQ-P-P-G-K--A-LE-WLGFIRNKA---------NGYTTEY--S--A--SV--K---GRFTISR-D---N-S--QS----I-LYLQMNTLRA-EDSATYYCARDDG----------------------------YFAYWG-QGTLVTVSA
  Q   Q  A EVKK    VKV  K                           A        G     M                            Q  KF        V  T                 T A M L    S   T V                                                   S
EVQ-LVQSGA-EVKKPGGSVKVSCKTSGFTF---------TDYYMS-WVRQ-A-P-G-K--G-LE-WMGFIRNKA---------NGYTTEY--S--Q--KF--K---GRVTITR-D---N-S--QS----T-AYMQLNTLRS-EDTAVYYCARDDG----------------------------YFAYWG-QGTLVTVSS


### Humanise

In [5]:
# Heavy/light pair to humanise: row 0 of the sequence table.
example_heavy_seq, example_light_seq = df["heavy"][0], df["light"][0]

# Load the default configuration (Humatch/configs/default.yaml).
with open(os.path.join(root_dir, "Humatch", "configs", "default.yaml")) as f:
    config = yaml.safe_load(f)

# Time the run with a monotonic, high-resolution clock so the reported
# duration cannot be skewed by system clock adjustments.
start_time = time.perf_counter()
humatch_output = humanise(example_heavy_seq, example_light_seq, cnn_heavy, cnn_light, cnn_paired, config, verbose=True)
print(f"Time taken to humanise: {time.perf_counter()-start_time:.1f}s")

Matching germilne likeness for hv3 and kv2
Designing and scoring single point variants
	It. #1	CNN-H: 0.04,	CNN-L: 0.00,	CNN-P: 0.99,	Edit: 2
	It. #2	CNN-H: 0.30,	CNN-L: 0.00,	CNN-P: 0.98,	Edit: 3
	It. #3	CNN-H: 0.30,	CNN-L: 0.00,	CNN-P: 0.98,	Edit: 4
	It. #4	CNN-H: 0.30,	CNN-L: 0.00,	CNN-P: 0.99,	Edit: 5
	It. #5	CNN-H: 0.30,	CNN-L: 0.06,	CNN-P: 1.00,	Edit: 6
	It. #6	CNN-H: 0.30,	CNN-L: 0.31,	CNN-P: 1.00,	Edit: 7
	It. #7	CNN-H: 0.74,	CNN-L: 0.31,	CNN-P: 1.00,	Edit: 8
	It. #8	CNN-H: 0.74,	CNN-L: 0.69,	CNN-P: 1.00,	Edit: 9
	It. #9	CNN-H: 0.74,	CNN-L: 0.93,	CNN-P: 1.00,	Edit: 10
	It. #10	CNN-H: 0.95,	CNN-L: 0.93,	CNN-P: 1.00,	Edit: 11
Humanised sequences:
	EVKLVESGGGLVQPGGSLRLSCATSGFTFTDYYMSWVRQAPGKGLEWLGFIRNKANGYTTEYAASVKGRFTISRDNSQSILYLQMNTLRAEDSAVYYCARDDGYFAYWGQGTLVTVSS
	DIVMTQTPLSLPVTLGQPASISCRSSQSLVHSNGNTYLHWYLQKPGQSPKLLIYKVSNRFSGVPDRFSGSGSGTDFTLKISRVEAEDVGVYYCSQSTHVPLTFGQGTKLELK
Time taken to humanise: 32.3s


In [6]:
# Full output of humanise(): as the cell's last expression, it is displayed in full.
humatch_output

{'Humatch_H': 'EVK-LVESGG-GLVQPGGSLRLSCATSGFTF---------TDYYMS-WVRQ-A-P-G-K--G-LE-WLGFIRNKA---------NGYTTEY--A--A--SV--K---GRFTISR-D---N-S--QS----I-LYLQMNTLRA-EDSAVYYCARDDG----------------------------YFAYWG-QGTLVTVSS',
 'Humatch_L': 'DIV-MTQTPLSLPVTLGQPASISCRSSQSLVHS------NGNTYLH-WYLQ-K-P-G-Q--S-PK-LLIYKV----------------SNR--F--S--GV--P---DRFSGSG-------S--GT----D-FTLKISRVEA-EDVGVYYCSQSTH----------------------------VPLTFG-QGTKLELK-',
 'Edit': 12,
 'HV': 'hv3',
 'LV': 'kv2',
 'CNN_H': 0.95458233,
 'CNN_L': 0.9665863,
 'CNN_P': 0.9996456}

In [7]:
# Examine mutation locations: for each chain print the original sequence,
# a line marking the differing positions, and the humanised sequence.
humanised_heavy_seq = humatch_output["Humatch_H"]
humanised_light_seq = humatch_output["Humatch_L"]

heavy_diff = highlight_differnces_between_two_seqs(example_heavy_seq, humanised_heavy_seq)
print(f"\nHeavy\n{example_heavy_seq}\n{heavy_diff}\n{humanised_heavy_seq}")

light_diff = highlight_differnces_between_two_seqs(example_light_seq, humanised_light_seq)
print(f"\nLight\n{example_light_seq}\n{light_diff}\n{humanised_light_seq}")


Heavy
EVK-LVESGG-GLVQPGGSLRLSCATSGFTF---------TDYYMS-WVRQ-P-P-G-K--A-LE-WLGFIRNKA---------NGYTTEY--S--A--SV--K---GRFTISR-D---N-S--QS----I-LYLQMNTLRA-EDSATYYCARDDG----------------------------YFAYWG-QGTLVTVSA
                                                    A        G                               A                                                     V                                                   S
EVK-LVESGG-GLVQPGGSLRLSCATSGFTF---------TDYYMS-WVRQ-A-P-G-K--G-LE-WLGFIRNKA---------NGYTTEY--A--A--SV--K---GRFTISR-D---N-S--QS----I-LYLQMNTLRA-EDSAVYYCARDDG----------------------------YFAYWG-QGTLVTVSS

Light
DVV-MTQTPLSLPVSLGDQASISCRSSQSLVHS------NGNTYLH-WYLQ-K-P-G-Q--S-PK-LLIYKV----------------SNR--F--S--GV--P---DRFSGSG-------S--GT----D-FTLKISRVEA-EDLGVYFCSQSTH----------------------------VPLTFG-AGTKLELK-
 I            T  QP                                                                                                                              V   Y                                