### Importing packages and functions

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt
import os
import peptides as pep
from Bio.Seq import Seq 
from Bio.SeqUtils import MeltingTemp as mt
from Bio import SeqIO
from Bio.SeqUtils import GC

#Helper functions: plain-Python FASTA and GFF parsers (Biopython is not used for parsing) and primer design

def read_fasta(fasta_file):
    """Parse a FASTA file into a dict mapping header lines to sequences.

    Parameters
    ----------
    fasta_file : str or path-like
        Path of the FASTA file to read.

    Returns
    -------
    dict
        Keys are the header lines with trailing whitespace stripped and the
        leading '>' kept (callers strip it themselves). Values are the
        sequence lines of that record joined into a single string. An empty
        file yields an empty dict. If two records share a header, the later
        one overwrites the earlier one.
    """
    content = {}
    header = None
    chunks = []
    # The context manager guarantees the handle is closed, even on errors.
    with open(fasta_file, 'r') as fas:
        for line in fas:
            line = line.rstrip()
            if line.startswith('>'):
                # A new header closes the record collected so far.
                if header is not None:
                    content[header] = ''.join(chunks)
                header = line
                chunks = []
            elif header is not None:
                # Lines seen before the first header belong to no record
                # and are ignored rather than glued onto the first sequence.
                chunks.append(line)
    # Flush the final record; nothing to flush when no header was found.
    if header is not None:
        content[header] = ''.join(chunks)
    return content

def read_gff(gff_file, retrieve_keys = ['Locus','Name']):
    """Load the feature lines of a GFF file into a DataFrame sorted by start.

    Parameters
    ----------
    gff_file : str or path-like
        Path of the tab-separated GFF file.
    retrieve_keys : list of str
        Attribute keys to pull out of the last (``key=value;...``) column.
        Each becomes its own column; a feature lacking a key gets ``None``
        in that column. The default list is never mutated.

    Returns
    -------
    pandas.DataFrame
        Columns 'Seqid', 'Source', 'type', 'Start', 'End', 'Score',
        'Strand', 'Phase' followed by ``retrieve_keys``. 'Start' and 'End'
        are integers; rows are ordered by 'Start' with a fresh 0..n-1 index.

    Notes
    -----
    Blank lines and lines starting with '#' are skipped, and parsing stops
    at a '##FASTA' directive, so embedded sequence data is not read as
    features.
    """
    retrieve_keys = list(retrieve_keys)
    columns=['Seqid','Source','type','Start',
             'End','Score','Strand','Phase'] + retrieve_keys
    df_raw = []
    with open(gff_file, 'r') as f:
        for raw_line in f:
            line = raw_line.rstrip()
            if line.startswith('##FASTA'):
                # Everything after this directive is sequence, not features.
                break
            if line == '' or line.startswith('#'):
                # Blank lines and comments/directives carry no feature.
                continue
            fields = line.split('\t')
            info = fields[:-1]
            info_dict = {}
            for x in fields[-1].split(';'):
                # Only unambiguous key=value pairs are kept.
                if x.count('=') == 1:
                    k,v = x.split('=')
                    info_dict[k] = v
            # .get() keeps features that lack one of the requested keys.
            info += [info_dict.get(k) for k in retrieve_keys]
            df_raw.append(info)
    df = pd.DataFrame(df_raw,columns=columns)
    df[['Start','End']] = df[['Start','End']].astype(int)
    return df.sort_values(by='Start').reset_index(drop=True)


def determine_primers(gene_sequence, gibson_handle, direction, min_tm=58, min_length=19):
    """Build one cloning primer: a Gibson handle followed by a gene-specific part.

    Parameters
    ----------
    gene_sequence : Bio.Seq.Seq
        Coding sequence of the gene, start codon first.
    gibson_handle : Bio.Seq.Seq or str
        Handle placed in front of the gene-specific part of the primer.
    direction : str
        'fwd': the first three nucleotides (the start codon) are dropped
        before the gene-specific part is taken from the 5' end.
        'rvs': a trailing stop codon is removed if the translation ends in
        '*', then the reverse complement is used.
        Any other value leaves the sequence unchanged.
    min_tm : float, optional
        The gene-specific part is extended one nucleotide at a time until
        ``mt.Tm_NN`` (with the ``mt.DNA_NN4`` table) reports at least this
        value. Defaults to 58.
    min_length : int, optional
        Minimum length of the gene-specific part. Defaults to 19.

    Returns
    -------
    str
        The primer sequence only (handle + gene-specific part); the melting
        temperature is not returned.

    Raises
    ------
    ValueError
        If nothing is left of the gene sequence after trimming.

    Notes
    -----
    When the template is too short to reach ``min_tm`` or ``min_length``,
    the whole remaining template is used instead of indexing past its end.
    """
    #Compute reverse complement for reverse primers and check for a stop codon
    if direction == 'rvs':
        protein = gene_sequence.translate()
        if protein.endswith('*'):
            gene_sequence = gene_sequence[:len(gene_sequence)-3]
        gene_sequence = gene_sequence.reverse_complement()

    #Remove the start codon if fwd seq
    if direction == 'fwd':
        gene_sequence = gene_sequence[3:]

    template_length = len(gene_sequence)
    if template_length == 0:
        raise ValueError('gene_sequence is empty after trimming; cannot design a primer')

    #Extend the PCR handle along the gene until the melting temperature
    #threshold is reached, never running past the end of the template
    handle_length = 1
    while (handle_length < template_length
           and mt.Tm_NN(gene_sequence[:handle_length], nn_table=mt.DNA_NN4) < min_tm):
        handle_length += 1

    #Enforce the minimum length of the PCR handle (slicing clamps to the
    #template length when the template itself is shorter)
    handle_length = max(handle_length, min_length)
    pcr_handle = gene_sequence[:handle_length]

    #Returns the primer sequence as a plain string
    primer = gibson_handle + pcr_handle
    return str(primer)

### Loading reference data

In [3]:
path = './Important_References/'
union_ortho_table_filtered = pd.read_csv(path + 'union_ortho_table_filtered.csv')

#Loading Msm gene sequences keyed by locus ID: read_fasta keeps the leading
#'>' in each header, so drop the first character and keep the text before the first '|'
msm_gene_seqs = {
    header[1:].split('|')[0]: sequence
    for header, sequence in read_fasta(path + 'Mycobacterium_smegmatis_MC2-155_genes_v4.fasta').items()
}

### Design primers for all substrates

In [5]:
#Gibson handles, taken from the sequence of the digested plasmid pHW76 and upper-cased
Fwd_gibson_handle = Seq('TAATTAATATCAGGAGGTATACATatg'.upper())
Rvs_gibson_handle = Seq('CagcGctGCCCGAGCCACCAGTACT'.upper())

#Work on a copy of the filtered table and add one primer column per direction
union_ortho_table_full_length = union_ortho_table_filtered.copy()
msm_gene_ids = union_ortho_table_full_length['gene'].values
union_ortho_table_full_length['Msm_Fwd_primer'] = [
    determine_primers(Seq(msm_gene_seqs[gene_id]), Fwd_gibson_handle, 'fwd')
    for gene_id in msm_gene_ids
]
union_ortho_table_full_length['Msm_Rvs_primer'] = [
    determine_primers(Seq(msm_gene_seqs[gene_id]), Rvs_gibson_handle, 'rvs')
    for gene_id in msm_gene_ids
]

In [6]:
#Save to csv. index=False keeps the positional index out of the file, so
#reloading it does not add another 'Unnamed: 0' column on every round trip.
union_ortho_table_full_length.to_csv(path+'union_ortho_table_full_length.csv', index=False)

In [8]:
#Preview the first rows to check the new Msm_Fwd_primer / Msm_Rvs_primer columns
union_ortho_table_full_length.head()

Unnamed: 0.1,Unnamed: 0,gene,rv,Msm_VI,Msm_Ess_TnSeq,Msm_Ess_CRISPR,Mtb_VI,Mtb_Ess_TnSeq,Mtb_Ess_CRISPR,Msm_seq,...,Msm_pro_length,Mtb_pro_length,Msm_PI,Mtb_PI,Msm_Aliphatic_Index,Mtb_Aliphatic_Index,PI_difference,Aliphatic_Index_difference,Msm_Fwd_primer,Msm_Rvs_primer
0,0,MSMEG_0250,Rv0206c,-7.702,Essential,Essential,-11.013,Essential,Essential,MFAWWGRTVYQFRYIVIGVMVALCLGGGVYGISLGNHVTQSGFYDE...,...,1013,944,6.063635,9.907134,93.731491,102.658898,3.8435,8.927408,TAATTAATATCAGGAGGTATACATATGttcgcctggtggggtcgga,CAGCGCTGCCCGAGCCACCAGTACTcagcctgccttcgcggcgc
1,1,MSMEG_0311,Rv0225,-0.06,Essential,Essential,-8.756,Essential,Essential,MSARPESAPHVRRVLLLCWRDTGHPQGGGSEAYVQRIGAYLAGRGV...,...,391,384,9.791186,10.143492,98.670077,105.598958,0.352306,6.928882,TAATTAATATCAGGAGGTATACATATGtctgcccggcccgagtctg,CAGCGCTGCCCGAGCCACCAGTACTgaccaggccgctgaggtat
2,2,MSMEG_0317,Rv0227c,-9.654,Essential,Essential,-13.167,Essential,Essential,MNRAVALRIAACGLLGLGAALLIAALLLTTYTKGKIAKIPLDIDTS...,...,398,421,4.292958,4.580426,86.080402,76.294537,0.287468,9.785865,TAATTAATATCAGGAGGTATACATATGaaccgcgctgtggcgctgc,CAGCGCTGCCCGAGCCACCAGTACTgatcggtcggtccggtggc
3,3,MSMEG_0359,Rv0236c,-0.614,Essential,Essential,-6.08,Essential,Essential,MVAAATLVLTFAQSPGQISPDTKLDLTANPLRFLARAFNLWNSDLP...,...,1414,1400,6.699455,9.859423,98.486563,100.614286,3.159969,2.127723,TAATTAATATCAGGAGGTATACATATGgtcgccgcggcgacactcg,CAGCGCTGCCCGAGCCACCAGTACTaccggcgtgagcggatgct
4,4,MSMEG_0384,Rv0334,-9.876,Essential,Essential,-2.498,Essential,Essential,MRGIILAGGSGTRLHPLTIGVSKQLLPVYDKPLVYYPLSTLIMAGI...,...,288,288,5.004825,5.136888,101.319444,99.930556,0.132063,1.388889,TAATTAATATCAGGAGGTATACATATGcgcggcatcatcctcgccg,CAGCGCTGCCCGAGCCACCAGTACTctctcgatccagaagttggag...
