# Generating antibody barcodes

This notebook contains code to generate AB barcodes for DNA-conjugated antibodies that can be used for simultaneous multiplexed protein profiling and Spatial Transcriptomics with the Visium Spatial Gene Expression solution from 10x Genomics.

In [1]:
import random
import numpy as np
from itertools import groupby

In [87]:
def visium_barcodes_(inputfile):
    """Read the spatial barcodes from a Visium barcode/coordinate file.

    Each line is split on tab characters and the first field is taken as
    the barcode; any further fields on the line are ignored.

    Parameters
    ----------
    inputfile : str or path-like
        Path to the text file to read.

    Returns
    -------
    list of str
        The first field of every non-blank line, in file order.
    """
    barcodes = []
    with open(inputfile) as handle:
        for line in handle:
            # Strip the line terminator before splitting: on a line without
            # any tab the newline would otherwise stay glued to the barcode.
            record = line.rstrip("\r\n")
            if not record:
                continue  # blank line (e.g. trailing empty line) -> no barcode
            barcodes.append(record.split("\t")[0])
    return barcodes

In [88]:
def gc_content(sequence):
    """Return the GC content of a nucleotide sequence as a percentage.

    Counting is case-insensitive, so lower-case and upper-case bases are
    treated alike.

    Parameters
    ----------
    sequence : str
        Nucleotide sequence.

    Returns
    -------
    float
        Percentage (0-100) of characters that are G or C; 0.0 for an
        empty sequence.
    """
    if not sequence:
        # Nothing to count; avoids a ZeroDivisionError on empty input.
        return 0.0
    upper = sequence.upper()
    gc_count = upper.count("G") + upper.count("C")
    return float(gc_count) / len(sequence) * 100

In [89]:
def hamming_distance(seq1, seq2):
    """Count the positions at which two sequences differ.

    Positions are paired with zip, so the comparison stops at the end of
    the shorter sequence.
    """
    return sum(1 for base1, base2 in zip(seq1, seq2) if base1 != base2)

In [90]:
def random_barcodes(n, l, seed=29):
    """Generate n pseudo-random DNA barcodes of length l.

    Parameters
    ----------
    n : int
        Number of barcodes to generate.
    l : int
        Length of each barcode.
    seed : int, optional
        Seed for the pseudo-random generator. The default (29) reproduces
        the barcodes this function has always produced.

    Returns
    -------
    list of str
        n strings of length l over the alphabet A, C, G, T. The same
        (n, l, seed) always yields the same list.
    """
    # A private generator yields the same sequence as seeding the module-level
    # one, but leaves the global random state untouched for any other code.
    rng = random.Random(seed)
    nucleotides = ["A", "C", "G", "T"]
    return ["".join(rng.choice(nucleotides) for _ in range(l)) for _ in range(n)]

In [91]:
visium_barcodes = visium_barcodes_("../data/visium-v1_coordinates.txt")
ab_barcodes = random_barcodes(100, 16)  # 100 candidate AB barcodes, 16 nt each

# Maps each AB barcode to a pair:
#   ([Hamming distance to every Visium barcode, in file order], GC percentage)
hamming_dist = {}

for ab_barcode in ab_barcodes:
    gc_percent = gc_content(ab_barcode)
    per_visium = [hamming_distance(ab_barcode, v_barcode) for v_barcode in visium_barcodes]
    hamming_dist[ab_barcode] = (per_visium, gc_percent)

# Quality check

Check if the hamming distance for the first visium barcode and random barcode is correct

In [13]:
# Spot check: distance for the first AB/Visium pair, followed by both sequences
first_ab = ab_barcodes[0]
first_visium = visium_barcodes[0]
print(hamming_distance(first_ab, first_visium))
print(first_visium)
print(first_ab)

10
AAACAACGAATAGTTC
AGGAGTTAAATCGATG


Check that, for every AB barcode, the number of stored values (Hamming distances to the Visium barcodes) equals the number of Visium barcodes

In [86]:
# Every AB barcode must have exactly one distance per Visium barcode.
# All entries are checked, so the cell does not depend on a loop variable
# left over from another cell.
print(all(len(dists) == len(visium_barcodes) for dists, _gc in hamming_dist.values()))

True


# Check %GC of AB barcodes and Visium barcodes

In [786]:
# GC% of AB barcodes: second element of each hamming_dist entry
gc_values = [gc_percent for _dists, gc_percent in hamming_dist.values()]
print(min(gc_values))
print(max(gc_values))

25.0
75.0


In [14]:
# Visium barcodes
gc_visium = []
for barcode in visium_barcodes:
    gc_visium.append(gc_content(barcode))
print(min(gc_visium))
print(max(gc_visium))

25.0
75.0


All the randomly generated barcodes here are in the same GC content range as the visium barcodes.

# Check Hamming distance between AB barcodes and Visium barcodes

To understand how different the AB barcodes are to the Visium barcodes the minimum and maximum Hamming distance for each AB barcode can be retrieved.

In [42]:
# For every AB barcode keep only the smallest and largest Hamming distance to
# any Visium barcode. min/max are evaluated once per barcode on the stored
# list, rather than once per appended element.
distances = {
    ab_barcode: (min(dists), max(dists))
    for ab_barcode, (dists, _gc) in hamming_dist.items()
    if dists  # a barcode with no recorded distances gets no entry
}
distances

{'AGGAGTTAAATCGATG': (5, 16),
 'TCTCCTTCTGGCTTCG': (5, 16),
 'GTTAGCGCGATCTTTG': (5, 16),
 'CGCGAATTCTCGAAAG': (6, 16),
 'AAAAACCTGCAACGTA': (5, 16),
 'CCACATCCCCGCAAGG': (5, 16),
 'CTAGTGCGTATATTTA': (5, 16),
 'GTCCCGTTAGCTATCC': (5, 16),
 'TCGCCATATGAAGCGC': (5, 16),
 'ACCCAGGGACGCCTCG': (5, 16),
 'GGGTTGCACAGAACCC': (6, 16),
 'AGGGAGAGTGAGGAGC': (6, 16),
 'CATCGCTCCTTTACCT': (6, 16),
 'GGGCGCCCCCCTGAAT': (6, 16),
 'CAGGTGACAAAGCCTG': (4, 16),
 'CTCAGCAATCTAATTC': (5, 16),
 'GCAGGAAGGAAGCTCG': (5, 16),
 'GCCGCGCCATCGGAGA': (6, 16),
 'CTTCAGCACGAGTATA': (6, 16),
 'CGCCAGTCAACGCCAA': (2, 16),
 'GGCAAGGCGAGCTCCC': (4, 16),
 'TCAGGGTTGGGGAGCA': (6, 16),
 'CCTACGCAATGACCCA': (4, 16),
 'TGTGACGGTTGTGTGT': (5, 16),
 'AAAGGTGAGAGCTCAT': (4, 16),
 'GGGTGCCAGAGAACCT': (6, 16),
 'CCACGCCAGATGAAGT': (4, 16),
 'AAGGTAACCCGTCTTG': (4, 16),
 'AATCGGCGCAACGAAC': (5, 16),
 'GGTTGAGAATTGGAGT': (5, 16),
 'GTCATGCTATGCGGAT': (6, 16),
 'CGGGAGTGATAACGAC': (6, 16),
 'TACGGTCTGAAGCGGC': (4, 16),
 'GTCGCAGG

# Comparing the Hamming distance between AB barcodes

To understand how different the 100 randomly generated AB barcodes are from each other, their pairwise Hamming distances can also be calculated and the minimum value for each barcode retrieved.

In [18]:
# Maps each AB barcode to the Hamming distance of its nearest other AB barcode
hamming_random_bar = {}

for query in ab_barcodes:
    ranked = sorted(hamming_distance(query, other) for other in ab_barcodes)
    # ranked[0] is the 0 from comparing the barcode with itself,
    # so ranked[1] is the closest remaining barcode.
    hamming_random_bar[query] = ranked[1]

In [19]:
# Display, per AB barcode, the distance to its nearest other AB barcode
hamming_random_bar.values()

dict_values([8, 9, 8, 8, 9, 8, 7, 8, 7, 7, 4, 8, 9, 8, 7, 8, 7, 7, 8, 8, 7, 8, 8, 7, 8, 4, 7, 7, 9, 7, 8, 7, 8, 7, 8, 8, 8, 7, 8, 7, 7, 7, 7, 8, 7, 8, 8, 7, 7, 8, 9, 6, 8, 8, 6, 7, 7, 6, 7, 8, 7, 8, 9, 7, 8, 8, 8, 7, 8, 7, 8, 8, 8, 6, 8, 7, 7, 7, 7, 7, 7, 8, 7, 7, 7, 7, 7, 8, 8, 6, 6, 8, 7, 6, 8, 8, 7, 7, 8, 7])

# Choosing AB barcodes

To decide which AB barcodes to choose, one can start by filtering out the AB barcodes that closely resemble any of the Visium spatial barcodes. Here a minimum Hamming distance of 6 between the AB barcode and any Visium barcode was set as the threshold.

In [20]:
# One flag per AB barcode (in `distances` order): is its closest Visium
# barcode at least 6 mismatches away?
bools = [closest >= 6 for closest, _farthest in distances.values()]
significant = [position for position, passed in enumerate(bools) if passed]  # positions of the True flags

Secondly, one can filter out the AB barcodes that closely resemble any of the other AB barcodes. Here a minimum Hamming distance of 7 between an AB barcode and any other candidate AB barcode was set as the threshold.

In [24]:
# One flag per AB barcode (in `hamming_random_bar` order): is its nearest
# other AB barcode at least 7 mismatches away?
l = [nearest >= 7 for nearest in hamming_random_bar.values()]
significant_barcodes = [position for position, passed in enumerate(l) if passed]

Now the AB barcodes that have a Hamming distance of at least 6 to every Visium spatial barcode and of at least 7 to every other AB barcode can be retrieved by taking the intersection of the two index lists generated above.

In [28]:
# Positions that passed both the Visium filter and the AB-vs-AB filter
overlap = set(significant) & set(significant_barcodes)

In [30]:
# Convert to a NumPy array so it can be indexed with a list of positions
ab_barcodes = np.array(ab_barcodes)

In [79]:
# Retrieve the most significant AB barcodes. The positions are sorted so the
# result follows the original barcode order instead of set iteration order,
# and np.asarray lets this cell work whether ab_barcodes is a list or an array.
most_sig = np.asarray(ab_barcodes)[sorted(overlap)]

Lastly, one can filter the barcodes based on runs of consecutive identical bases; here a maximum of 3 identical bases in a row is allowed.

In [53]:
# Maps each selected barcode to the lengths of its runs of identical
# consecutive bases, in order of appearance along the barcode
consequtive = {
    barcode: [len(list(run)) for _base, run in groupby(barcode)]
    for barcode in most_sig
}

In [54]:
# Display the run lengths of identical consecutive bases for each selected barcode
consequtive

{'CGCGAATTCTCGAAAG': [1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 3, 1],
 'AGGGAGAGTGAGGAGC': [1, 3, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1],
 'CATCGCTCCTTTACCT': [1, 1, 1, 1, 1, 1, 1, 2, 3, 1, 2, 1],
 'GGGCGCCCCCCTGAAT': [3, 1, 1, 6, 1, 1, 2, 1],
 'GCCGCGCCATCGGAGA': [1, 2, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1],
 'CTTCAGCACGAGTATA': [1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'TCAGGGTTGGGGAGCA': [1, 1, 1, 3, 2, 4, 1, 1, 1, 1],
 'GTCATGCTATGCGGAT': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1],
 'CGGGAGTGATAACGAC': [1, 3, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1],
 'GTCGCAGGGTCGTAAT': [1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 2, 1],
 'CCCTAGAACAAGCAAT': [3, 1, 1, 1, 2, 1, 2, 1, 1, 2, 1],
 'AACCTCCACTTAAGAA': [2, 2, 1, 2, 1, 1, 2, 2, 1, 2],
 'CGGGCGCGAGGAAATC': [1, 3, 1, 1, 1, 1, 1, 2, 3, 1, 1],
 'AGGTATATTCCGCAAT': [1, 2, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1],
 'TATGTTGCGACTATCT': [1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'GGCCAGTGTTAAGTCC': [2, 2, 1, 1, 1, 1, 2, 2, 1, 1, 2],
 'CCGCCCCGGGAGTTCT': [2, 1, 4, 3, 1, 1, 2, 1, 1],


In [92]:
# Final barcodes: no run of identical bases longer than 3 and a GC content
# between 25 % and 75 % (inclusive).
ultimate = []
for barcode, run_lengths in consequtive.items():
    # Boolean `and` with a chained comparison replaces bitwise `&`, and
    # gc_content is evaluated only once per barcode.
    if max(run_lengths) <= 3 and 25.0 <= gc_content(barcode) <= 75.0:
        ultimate.append(barcode)
ultimate

['CGCGAATTCTCGAAAG',
 'AGGGAGAGTGAGGAGC',
 'CATCGCTCCTTTACCT',
 'GCCGCGCCATCGGAGA',
 'CTTCAGCACGAGTATA',
 'GTCATGCTATGCGGAT',
 'CGGGAGTGATAACGAC',
 'GTCGCAGGGTCGTAAT',
 'CCCTAGAACAAGCAAT',
 'AACCTCCACTTAAGAA',
 'CGGGCGCGAGGAAATC',
 'AGGTATATTCCGCAAT',
 'TATGTTGCGACTATCT',
 'GGCCAGTGTTAAGTCC',
 'GCCTCACATCGGCCAT',
 'TTACGTATAGGTTGTC',
 'CTCTGTAGCAACACGA',
 'ATGAGCAGATATCTTG',
 'TTCACCATTAGTACGC']

These barcodes have no more than 3 identical consecutive bases in a row, have a Hamming distance of at least 6 to every Visium barcode and of at least 7 to every other AB barcode, and lie within the same %GC range as the Visium spatial barcodes. Finally, we write them to a text file.

In [93]:
# Write the final barcodes to disk, one per line
with open("Antibody_barcodes.txt", "w") as out_handle:
    out_handle.writelines(code + "\n" for code in ultimate)