# Creating_ab_barcodes

This notebook contains code to generate antibody (AB) barcodes for DNA-conjugated antibodies that can be used for simultaneous multiplexed protein profiling and spatial transcriptomics with the Visium Spatial Gene Expression solution from 10x Genomics.

In [1]:
import random
import numpy as np
from itertools import groupby

In [2]:
def visium_barcodes_(inputfile):
    """Read the spatial barcodes from a Visium barcode/coordinate file.

    The file is read as tab-separated text and the first field of every
    non-empty line is taken as the barcode.

    Parameters
    ----------
    inputfile : str or path-like
        Path to the tab-separated file.

    Returns
    -------
    list of str
        Barcodes in file order, without line terminators.
    """
    barcodes = []
    with open(inputfile) as f:
        for line in f:
            # Strip the line terminator before splitting: on a line without
            # a tab the first field would otherwise keep its newline and
            # skew length-sensitive calculations (GC%, Hamming distance).
            line = line.rstrip("\r\n")
            if not line:
                continue  # blank lines carry no barcode
            barcodes.append(line.split("\t")[0])
    return barcodes

In [3]:
def gc_content(sequence):
    """Return the GC content of a sequence as a percentage (0-100).

    G and C are counted case-insensitively, so soft-masked (lowercase)
    bases contribute as well.

    Parameters
    ----------
    sequence : str
        Nucleotide sequence.

    Returns
    -------
    float
        Percentage of positions that are G or C; 0.0 for an empty sequence.
    """
    if not sequence:
        # Avoid a ZeroDivisionError: an empty sequence has no GC bases.
        return 0.0
    upper = sequence.upper()
    gc_count = upper.count("G") + upper.count("C")
    return float(gc_count) / len(sequence) * 100

In [4]:
def hamming_distance(seq1, seq2):
    """Return the Hamming distance between two equal-length sequences.

    Parameters
    ----------
    seq1, seq2 : str
        Sequences to compare position by position.

    Returns
    -------
    int
        Number of positions at which the two sequences differ.

    Raises
    ------
    ValueError
        If the sequences differ in length. The Hamming distance is only
        defined for equal lengths, and ``zip`` would otherwise silently
        ignore the surplus positions of the longer sequence.
    """
    if len(seq1) != len(seq2):
        raise ValueError(
            "hamming_distance requires sequences of equal length "
            "(got %d and %d)" % (len(seq1), len(seq2))
        )
    return sum(nuc1 != nuc2 for nuc1, nuc2 in zip(seq1, seq2))

In [5]:
def random_barcodes(n, l, seed=29):
    """Generate n pseudo-random nucleotide barcodes of length l.

    Parameters
    ----------
    n : int
        Number of barcodes to generate.
    l : int
        Length of each barcode.
    seed : int, optional
        Seed for the generator (default 29, which reproduces the same
        barcodes as seeding the module-level generator with 29).

    Returns
    -------
    list of str
        n strings of length l over the alphabet A, C, G, T. Barcodes are
        not checked for uniqueness.
    """
    # A private generator keeps the result reproducible without reseeding
    # the module-level `random` state that other code may rely on.
    rng = random.Random(seed)
    nucleotides = ["A", "C", "G", "T"]
    return ["".join(rng.choice(nucleotides) for _ in range(l)) for _ in range(n)]

In [6]:
visium_barcodes = visium_barcodes_("../data/visium-v1_coordinates.txt")
ab_barcodes = random_barcodes(100, 16)  # 100 candidate AB barcodes, 16 nt each

# Map each AB barcode to a tuple of
#   (Hamming distances to every Visium spatial barcode, GC percentage).
hamming_dist = {
    ab_barcode: (
        [hamming_distance(ab_barcode, v_barcode) for v_barcode in visium_barcodes],
        gc_content(ab_barcode),
    )
    for ab_barcode in ab_barcodes
}

# Check GC-content range of Visium spatial barcodes

In [7]:
# GC percentage of every Visium spatial barcode; print the lowest and highest.
gc_visium = [gc_content(v_barcode) for v_barcode in visium_barcodes]
print(min(gc_visium), max(gc_visium), sep="\n")

25.0
75.0


# Check Hamming distance between AB barcodes and Visium barcodes

To understand how different the AB barcodes are from the Visium barcodes, the minimum and maximum Hamming distance between each AB barcode and the full set of Visium barcodes can be retrieved.

In [8]:
# For every AB barcode, keep the smallest and largest Hamming distance to any
# Visium barcode. min/max are evaluated once per barcode rather than after
# every appended value, which avoids quadratic work on the distance list.
distances = {
    ab_barcode: (min(dists), max(dists))
    for ab_barcode, (dists, _gc) in hamming_dist.items()
    if dists  # an AB barcode with no comparisons gets no entry
}

In [9]:
# Preview the (min, max) Hamming distances of the first three AB barcodes.
list(distances.items())[:3]

[('AGGAGTTAAATCGATG', (5, 16)),
 ('TCTCCTTCTGGCTTCG', (5, 16)),
 ('GTTAGCGCGATCTTTG', (5, 16))]

# Comparing the Hamming distance between AB barcodes

To understand how different the 100 randomly generated AB barcodes are from each other, their pairwise Hamming distances can also be calculated and, for each barcode, the minimum distance to any other barcode retrieved.

In [10]:
# Map each AB barcode to its smallest Hamming distance to another AB barcode.
hamming_random_bar = {}

for query in ab_barcodes:
    ranked = sorted(hamming_distance(query, other) for other in ab_barcodes)
    # Index 1, not 0: the smallest value is the barcode compared to itself (0).
    hamming_random_bar[query] = ranked[1]

In [11]:
# Preview the nearest-neighbour distance of the first three AB barcodes.
list(hamming_random_bar.items())[:3]

[('AGGAGTTAAATCGATG', 8), ('TCTCCTTCTGGCTTCG', 9), ('GTTAGCGCGATCTTTG', 8)]

# Choosing AB barcodes

To decide which AB barcodes to choose, one can start by filtering out the AB barcodes that too closely resemble any of the Visium spatial barcodes. Here, a minimum Hamming distance of 6 between the AB barcode and every Visium barcode was set as the threshold.



In [12]:
# One flag per AB barcode (in `distances` order): is its minimum distance to
# the Visium barcodes at least 6?
bools = [min_max[0] >= 6 for min_max in distances.values()]
# Positions of the barcodes that pass the threshold.
significant = [position for position, keep in enumerate(bools) if keep]

Secondly, one can filter out the AB barcodes that too closely resemble any of the other AB barcodes. Here, a minimum Hamming distance of 7 between an AB barcode and every other candidate AB barcode was set as the threshold.

In [13]:
# One flag per AB barcode (in `hamming_random_bar` order): is its nearest
# other AB barcode at least 7 mismatches away?
l = [nearest >= 7 for nearest in hamming_random_bar.values()]
# Positions of the barcodes that pass the threshold.
significant_barcodes = [position for position, keep in enumerate(l) if keep]

Now the AB barcodes that are at a Hamming distance of at least 6 from every Visium spatial barcode and at least 7 from every other AB barcode can be retrieved by taking the intersection of the two index lists generated above.

In [14]:
# Positions that pass both the Visium-distance and the AB-distance filter.
overlap = set(significant) & set(significant_barcodes)

In [15]:
# Convert to a NumPy array so the barcodes can be selected with a list of
# integer positions (a plain Python list does not support that indexing).
ab_barcodes = np.array(ab_barcodes)

In [16]:
# Retrieve the AB barcodes that passed both distance filters. The positions
# are sorted because a set has no guaranteed iteration order; sorting keeps
# the selection (and everything derived from it) in a reproducible order.
most_sig = ab_barcodes[sorted(overlap)]

Lastly, one can filter the barcodes based on runs of identical consecutive bases (homopolymer stretches); here, a maximum run length of 3 is allowed.

In [17]:
# Map each remaining barcode to the lengths of its runs of identical bases,
# in order of appearance along the barcode.
consecutive = {
    barcode: [sum(1 for _ in run) for _, run in groupby(barcode)]
    for barcode in most_sig
}

In [18]:
# Preview the run lengths of the first three barcodes.
list(consecutive.items())[:3]

[('CGCGAATTCTCGAAAG', [1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 3, 1]),
 ('AGGGAGAGTGAGGAGC', [1, 3, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1]),
 ('CATCGCTCCTTTACCT', [1, 1, 1, 1, 1, 1, 1, 2, 3, 1, 2, 1])]

In [19]:
# Final barcodes: no run of identical bases longer than 3 and a GC percentage
# between 25.0 and 75.0 (inclusive).
ultimate = [
    barcode
    for barcode, run_lengths in consecutive.items()
    if max(run_lengths) <= 3 and 25.0 <= gc_content(barcode) <= 75.0
]
ultimate

['CGCGAATTCTCGAAAG',
 'AGGGAGAGTGAGGAGC',
 'CATCGCTCCTTTACCT',
 'GCCGCGCCATCGGAGA',
 'CTTCAGCACGAGTATA',
 'GTCATGCTATGCGGAT',
 'CGGGAGTGATAACGAC',
 'GTCGCAGGGTCGTAAT',
 'CCCTAGAACAAGCAAT',
 'AACCTCCACTTAAGAA',
 'CGGGCGCGAGGAAATC',
 'AGGTATATTCCGCAAT',
 'TATGTTGCGACTATCT',
 'GGCCAGTGTTAAGTCC',
 'GCCTCACATCGGCCAT',
 'TTACGTATAGGTTGTC',
 'CTCTGTAGCAACACGA',
 'ATGAGCAGATATCTTG',
 'TTCACCATTAGTACGC']

These barcodes contain no run of more than 3 identical consecutive bases, are at a Hamming distance of at least 6 from every Visium barcode and at least 7 from every other randomly generated AB barcode, and fall within the same %GC range as the Visium spatial barcodes.

In [20]:
# Write the selected barcodes to a text file, one barcode per line.
with open("Antibody_barcodes.txt", "w") as out_file:
    out_file.writelines(barcode + "\n" for barcode in ultimate)