### Outline
This notebook generates recombineering primers for tagging the native copy of Mycobacterium smegmatis (Msm) genes with fkbp-eGFP.

### Import packages and define functions

In [2]:
import pandas as pd 
import numpy as np 
from Bio.Seq import Seq 
from Bio.SeqUtils import MeltingTemp as mt
from Bio import SeqIO
from Bio.SeqUtils import GC


def PCR_handle(gene_sequence, gibson_handle, direction, target_tm=58, min_length=19):
    """Build a primer: a Gibson handle followed by a gene-specific PCR handle.

    The PCR handle is the shortest prefix of the template whose
    nearest-neighbor melting temperature (``mt.Tm_NN`` with the
    ``mt.DNA_NN4`` table) reaches ``target_tm``, padded up to ``min_length``
    nucleotides when the template is long enough.

    Args:
        gene_sequence: Template sequence the handle anneals to. It is
            reverse-complemented first when ``direction == 'rvs'``.
        gibson_handle: Sequence prepended (5') to the PCR handle.
        direction: ``'rvs'`` designs the primer on the reverse complement of
            ``gene_sequence``; any other value uses the sequence as given.
        target_tm: Melting temperature the PCR handle must reach
            (default 58).
        min_length: Minimum PCR handle length in nucleotides (default 19).
            A template shorter than this yields a correspondingly shorter
            handle.

    Returns:
        The primer (``gibson_handle`` + PCR handle) as a ``str``.

    Raises:
        ValueError: If the template is empty, or if it is used up before the
            handle reaches ``target_tm``.
    """
    #Compute reverse complement for reverse primers
    if direction == 'rvs':
        gene_sequence = gene_sequence.reverse_complement()

    template_length = len(gene_sequence)
    if template_length == 0:
        raise ValueError("Cannot design a PCR handle from an empty sequence.")

    #Extend the PCR handle one nucleotide at a time until the target melting
    #temperature is reached; stop with a clear error instead of indexing past
    #the end of the template
    handle_length = 1
    while mt.Tm_NN(gene_sequence[:handle_length], nn_table=mt.DNA_NN4) < target_tm:
        if handle_length == template_length:
            raise ValueError(
                f"Template of {template_length} nt is too short to reach a "
                f"melting temperature of {target_tm}."
            )
        handle_length += 1

    #Enforce the minimum length of the PCR handle
    handle_length = max(handle_length, min_length)
    pcr_handle = gene_sequence[:handle_length]

    #Return the full primer sequence as a plain string
    primer = gibson_handle + pcr_handle
    return str(primer)

def determine_recombineering_primers(locus_list):
    """Design the four recombineering primers for every locus in ``locus_list``.

    For each locus the function looks up ``Start``, ``Stop`` and ``Strand`` in
    the module-level ``msm_genes`` table, slices an upstream and a downstream
    arm out of the module-level ``msm_genome`` sequence, and calls
    ``PCR_handle`` to build a forward and a reverse primer on each arm.

    The upstream arm ends at the 3' end of the gene; when the translated gene
    ends with a stop codon (``*``) the last three nucleotides are excluded.
    The downstream arm is the region immediately following the gene. Arms for
    '-' strand genes are reverse-complemented so they read in the gene's
    orientation.

    Args:
        locus_list: Iterable of locus names matching ``msm_genes['Locus']``.

    Returns:
        A ``pandas.DataFrame`` with one row per locus holding the locus name,
        both arm sequences, the four primers and the longest primer length.
        Returns ``None`` (after printing a message) as soon as a locus name is
        not found in ``msm_genes``; no partial results are returned.

    Raises:
        ValueError: If a locus has a ``Strand`` value other than '+' or '-'.
    """
    # Vector handles to insert into pHW152 digested with NdeI and NotI
    # FKBP handles for adding the fkbp-egfp-Zeo fragment between the upstream and downstream regions
    fwd_vector = Seq('tttaagaaggagatatacatatg').lower()
    rv_linker6 = Seq('ACCGCTGCCCGAGCCACC').lower()

    fwd_linker6 = Seq('ATctcgagtctagaagta').lower()
    rv_vector = Seq('ggtggtggtgctcgagtgcggccgc').lower()
    upstream = 500
    downstream = 500

    gene_sequences = []

    for locus_name in locus_list:

        # Select the annotation row(s) once instead of rebuilding the same
        # boolean mask for every column.
        locus_rows = msm_genes.loc[msm_genes['Locus'] == locus_name]
        if locus_rows.empty:
            print(f"Locus name '{locus_name}' not found in the DataFrame.")
            return None

        # Start is shifted by one for Python slicing -- presumably the table
        # uses 1-based inclusive coordinates; confirm against the annotation.
        start_position = locus_rows['Start'].iloc[0] - 1
        stop_position = locus_rows['Stop'].iloc[0]
        strand = locus_rows['Strand'].iloc[0]

        # NOTE(review): for a locus closer than `upstream`/`downstream` bp to
        # the start of the genome sequence the slice start below goes
        # negative and Python slicing will not return the intended region.
        # NOTE(review): with a stop codon the '+' arm is `upstream - 3` nt
        # long while the '-' arm is `upstream` nt long -- confirm intended.
        if strand == '+':
            gene_seq = msm_genome[start_position:stop_position]
            has_stop_codon = gene_seq.translate().endswith('*')
            arm_end = stop_position - 3 if has_stop_codon else stop_position
            upstream_seq = msm_genome[stop_position - upstream:arm_end]
            downstream_seq = msm_genome[stop_position:stop_position + downstream]

        elif strand == '-':
            gene_seq = msm_genome[start_position:stop_position].reverse_complement()
            has_stop_codon = gene_seq.translate().endswith('*')
            arm_start = start_position + 3 if has_stop_codon else start_position
            upstream_seq = msm_genome[arm_start:arm_start + upstream].reverse_complement()
            downstream_seq = msm_genome[start_position - downstream:start_position].reverse_complement()

        else:
            # Without this branch the arms computed for the previous locus
            # would be silently reused for this one.
            raise ValueError(
                f"Unexpected strand {strand!r} for locus '{locus_name}'; expected '+' or '-'."
            )

        #Determine the PCR handles based on these upstream and downstream sequences
        p1 = PCR_handle(upstream_seq, fwd_vector, 'fwd')
        p2 = PCR_handle(upstream_seq, rv_linker6, 'rvs')
        p3 = PCR_handle(downstream_seq, fwd_linker6, 'fwd')
        p4 = PCR_handle(downstream_seq, rv_vector, 'rvs')
        primers = [p1, p2, p3, p4]
        gene_sequences.append(
            [locus_name, str(upstream_seq), str(downstream_seq), *primers, max(map(len, primers))]
        )

    columns = ['gene',
               'Msm Upstream Recombineering Sequence',
               'Msm Downstream Recombineering Sequence',
               'Msm Fwd Upstream primer',
               'Msm Rvs Upstream primer',
               'Msm Fwd Downstream primer',
               'Msm Rvs Downstream primer',
               'Msm max recombineering primer length']
    return pd.DataFrame(gene_sequences, columns=columns)



### Load reference sequence information

In [12]:
# Msmeg gene annotation table (tab-separated text) as a DataFrame
msm_genes = pd.read_csv(
    "./Important_References/Mycobacterium_smegmatis_MC2-155_txt_v4.txt",
    sep="\t",
)

# Msmeg genome: sequence of the first record in the FASTA file
msm_genome = next(
    SeqIO.parse(
        './Important_References/Mycobacterium_smegmatis_MC2-155_genome_v4.fasta',
        "fasta",
    )
).seq

# Table of target genes for TPD
union_ortho_table_filtered = pd.read_csv(
    './Important_References/union_ortho_table_filtered.csv'
)

### Define genes and determine primers

In [14]:
#Define list of genes for recombineering
target_gene_list = union_ortho_table_filtered['gene']

#Run function to determine recombineering primers
#(returns None if any locus is missing from msm_genes, in which case the merge below fails)
TPD_recombineering_primers = determine_recombineering_primers(target_gene_list)

#Add recombineering primers to union_ortho_table_filtered
merged_df = pd.merge(union_ortho_table_filtered, TPD_recombineering_primers, on='gene')

### Export data as csv

In [15]:
# Write the merged table (including its index) into the reference folder
path = './Important_References/'
merged_df.to_csv(f'{path}TPD_recombineering_primers.csv')

In [16]:
# Preview the first rows of the merged table (notebook display only)
merged_df.head()

Unnamed: 0.1,Unnamed: 0,gene,rv,Msm_VI,Msm_Ess_TnSeq,Msm_Ess_CRISPR,Mtb_VI,Mtb_Ess_TnSeq,Mtb_Ess_CRISPR,Msm_seq,...,Mtb_Aliphatic_Index,PI_difference,Aliphatic_Index_difference,Msm Upstream Recombineering Sequence,Msm Downstream Recombineering Sequence,Msm Fwd Upstream primer,Msm Rvs Upstream primer,Msm Fwd Downstream primer,Msm Rvs Downstream primer,Msm max recombineering primer length
0,0,MSMEG_0250,Rv0206c,-7.702,Essential,Essential,-11.013,Essential,Essential,MFAWWGRTVYQFRYIVIGVMVALCLGGGVYGISLGNHVTQSGFYDE...,...,102.658898,3.8435,8.927408,GCAACGCCGTTCGCAATGCGGTCAACAGCGCGGTGCACGGCGGCGC...,GGCCCGGCTACTCCCGTTCGGGAAGGTCGTCGGACTCTGCGGCGAT...,tttaagaaggagatatacatatgGCAACGCCGTTCGCAATGC,accgctgcccgagccaccCAGCCTGCCTTCGCGGCGC,atctcgagtctagaagtaGGCCCGGCTACTCCCGTTC,ggtggtggtgctcgagtgcggccgcCGTGCGCGCGACGTTCCTG,44
1,1,MSMEG_0311,Rv0225,-0.06,Essential,Essential,-8.756,Essential,Essential,MSARPESAPHVRRVLLLCWRDTGHPQGGGSEAYVQRIGAYLAGRGV...,...,105.598958,0.352306,6.928882,AAGCCGTCGCGCAACTGCGCACCGAGATGCCTGATGTGCACCTGGA...,CGCCGCTCAGAACGCCTCGGCGGCTGCCTTCGCGGTGCGGGCCAGC...,tttaagaaggagatatacatatgAAGCCGTCGCGCAACTGCG,accgctgcccgagccaccGACCAGGCCGCTGAGGTAT,atctcgagtctagaagtaCGCCGCTCAGAACGCCTCG,ggtggtggtgctcgagtgcggccgcCCGCGGCGCTCGACGCCAT,44
2,2,MSMEG_0317,Rv0227c,-9.654,Essential,Essential,-13.167,Essential,Essential,MNRAVALRIAACGLLGLGAALLIAALLLTTYTKGKIAKIPLDIDTS...,...,76.294537,0.287468,9.785865,TCACCGCGCGCGCCGAGGTGTGGGGTGTGCCGGGCGAACCGGACGA...,GAACTGCGGTCCTGCTGCGGTCATGCTCGGTGCCGGCCTATGCGCT...,tttaagaaggagatatacatatgTCACCGCGCGCGCCGAGGT,accgctgcccgagccaccGATCGGTCGGTCCGGTGGC,atctcgagtctagaagtaGAACTGCGGTCCTGCTGCG,ggtggtggtgctcgagtgcggccgcCATCCAGAACACCAGCGGAGC,46
3,3,MSMEG_0359,Rv0236c,-0.614,Essential,Essential,-6.08,Essential,Essential,MVAAATLVLTFAQSPGQISPDTKLDLTANPLRFLARAFNLWNSDLP...,...,100.614286,3.159969,2.127723,ACATGCCGTACCGCGTCGGACTGATCGGCGGACTGGCACTCCTGCC...,CCAATCCTTGCCACCTCAACTGGTTGACAGCGAGCCGGTGCAGTAC...,tttaagaaggagatatacatatgACATGCCGTACCGCGTCGG,accgctgcccgagccaccACCGGCGTGAGCGGATGCT,atctcgagtctagaagtaCCAATCCTTGCCACCTCAACTG,ggtggtggtgctcgagtgcggccgcCGCGCTGTACGGCGACGAC,44
4,4,MSMEG_0384,Rv0334,-9.876,Essential,Essential,-2.498,Essential,Essential,MRGIILAGGSGTRLHPLTIGVSKQLLPVYDKPLVYYPLSTLIMAGI...,...,99.930556,0.132063,1.388889,GCTTCGAACACGTCAGTGGTGGGGCAATATTCGCGTACTGGGTGGC...,CGTGCCTAGCGCAAGGGCATACTGCTGTAACACCTGGCATGCCCAC...,tttaagaaggagatatacatatgGCTTCGAACACGTCAGTGGTGG,accgctgcccgagccaccCTCTCGATCCAGAAGTTGGAGCAGG,atctcgagtctagaagtaCGTGCCTAGCGCAAGGGCA,ggtggtggtgctcgagtgcggccgcAGCATCTGCTGGAACACCGG,45
