In [94]:
import os

import difflib

import pandas as pd
import numpy as np

from Bio import Seq
from Bio import SeqIO

In [3]:
# Load the barcode index CSV into a DataFrame. Later cells read its
# 'UPTAG', 'Gene Symbol' and 'SGD ID' columns.
# NOTE(review): absolute, user-specific Windows path -- presumably has to be
# edited before the notebook can run on another machine; confirm.
path  = r"C:\Users\johnp\Box\Finkelstein-Matouschek\yeast_engineering\yeast_deletion_collection\barcode_index.csv"
bcdex = pd.read_csv(path)

In [24]:
def read_ab1(path):
    """
    Parse the ABI trace (.ab1) file at <path> with Bio.SeqIO.

    Parameters
    ----------
    path : str or path-like
        Location of the .ab1 file. It is opened in binary mode.

    Returns
    -------
    The single parsed record when the file yields exactly one record;
    otherwise the list of all parsed records (possibly empty).
    """
    # The context manager closes the file even if parsing raises. The parser
    # is consumed into a list while the handle is still open, because it
    # reads from the handle as it is iterated.
    with open(path, 'rb') as handle:
        records = list(SeqIO.parse(handle, 'abi'))
    if len(records) == 1:
        return records[0]
    return records

In [81]:
# Directory holding the Sanger sequencing results and the extension of the
# trace files to load from it.
datadir = r"C:\Users\johnp\Box\Finkelstein-Matouschek\sanger_sequencing\results\Results_1117368"
filetype = '.ab1'

# Following are written in 5' -> 3' direction
# NOTE(review): presumably the constant sequences flanking the UPTAG barcode;
# only the 3' border is used by the matching loop below -- confirm.
barcode_border_5p = 'GATGTCCACGAGGTCTCT'
barcode_border_3p = 'CGTACGCTGCAGGTCGAC'
# Barcode length is taken from a representative UPTAG sequence.
uptag_example = 'GATGACAGTACAGTTGACAG'
barcode_len = len(uptag_example)

# endswith() keeps working when filetype is changed to an extension of a
# different length (the previous fn[-4:] comparison silently matched nothing
# in that case). sorted() fixes the processing order, since os.listdir()
# returns entries in arbitrary order.
filenames = sorted(fn for fn in os.listdir(datadir) if fn.endswith(filetype))
paths = [os.path.join(os.path.abspath(datadir), fn) for fn in filenames]
# One parsed trace per file, and the base-called sequence of each trace.
records = [read_ab1(path) for path in paths]
seqs = [rec.seq for rec in records]


In [131]:
# Settings for difflib.get_close_matches: report at most n candidates whose
# similarity ratio to the sequenced barcode is at least cutoff.
n = 10
cutoff = 0.3

# Loop-invariant lookups, built once instead of once per trace file.
# dropna() guards against missing UPTAG entries, which difflib cannot compare.
barcodes = bcdex['UPTAG'].dropna().tolist()
wanted_cols = [
    'Gene Symbol',
    'SGD ID'
]
bcdex_by_uptag = bcdex.set_index('UPTAG')

for filename, seq in zip(filenames, seqs):
    print(filename)
    # The 3' border is searched for in the reverse complement of the read;
    # the barcode is the barcode_len bases immediately preceding it.
    seq_revcomp = seq.reverse_complement()
    border_pos = seq_revcomp.find(barcode_border_3p)
    bc_start = border_pos - barcode_len
    if border_pos < 0 or bc_start < 0:
        # find() returns -1 on a miss, and a border too close to the start
        # leaves fewer than barcode_len bases before it. Either way the slice
        # below would use negative indices and silently report an unrelated
        # stretch of the read as the barcode.
        print(f'No complete barcode found upstream of {barcode_border_3p}; skipping\n')
        continue
    bc_end = border_pos
    # Plain str so difflib compares like with like against the index entries.
    barcode_seq = str(seq_revcomp[bc_start:bc_end])
    print(f'Found barcode\n{barcode_seq}')

    matches = difflib.get_close_matches(barcode_seq, barcodes, n=n, cutoff=cutoff)
    print(f'Showing top {n} closest matches in deletion collection UPTAG index:')
    print(bcdex_by_uptag.loc[matches, wanted_cols], '\n')

12C_001_JPC202-5_MXR3_A03_1117368.ab1
Found barcode
ATCATTCCAGCAGATGAAAG
Showing top 10 closest matches in deletion collection UPTAG index:
                     Gene Symbol      SGD ID
UPTAG                                       
ATCATTCCAGCAGTTGGGAG        HRD1  S000005373
ATCATTAGCAGAGCAGTGAG        SKI7  S000005602
TCTTCACAGCAAGCAAGGGA     YJL185C  S000003721
TCACCTACAGATGGGCGAAG        KAR3  S000006345
TATTTCCAGCCAGGGTGCAG         NaN  S000005946
TATCGGACCAGCAGGCTAAG        AQY1  S000006396
TACTATTCAGGAGAGTGACG        TNA1  S000003492
TACATACCACGACCTTGAAG        MRP2  S000006370
GCACTCTACAGATGATACAG     YAR023C  S000000074
CTTCTATTCAGAGATGGAGG        EFM2  S000000475 

12C_002_JPC209-1_MXR3_B03_1117368.ab1
Found barcode
GCGCTTCTGCAAGAAGAACA
Showing top 10 closest matches in deletion collection UPTAG index:
                     Gene Symbol      SGD ID
UPTAG                                       
GCGCCTCTGCAAGAAGAACA        SSA2  S000003947
GCGTCTGACATAAGAGCAAC        MPH1  S00000144