In [1]:
import pandas as pd
from IPython.display import Image
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from pydna.dseqrecord import Dseqrecord

In [2]:
a_oryzae ={
    'F': {'TTT': 0.38, 'TTC': 0.62},
    'S': {'TCT': 0.18, 'TCC': 0.20, 'TCA': 0.14, 'TCG': 0.16, 'AGT': 0.13, 'AGC': 0.18},
    'Y': {'TAT': 0.47, 'TAC': 0.53},
    'C': {'TGT': 0.46, 'TGC': 0.54},
    '*': {'TAA': 0.33, 'TAG': 0.29, 'TGA': 0.38},
    'L': {'TTA': 0.07, 'TTG': 0.18, 'CTT': 0.19, 'CTC': 0.23, 'CTA': 0.11, 'CTG': 0.22},
    'W': {'TGG': 1.00},
    'P': {'CCT': 0.27, 'CCC': 0.26, 'CCA': 0.25, 'CCG': 0.22},
    'H': {'CAT': 0.53, 'CAC': 0.47},
    'Q': {'CAA': 0.43, 'CAG': 0.57},
    'R': {'CGT': 0.18, 'CGC': 0.23, 'CGA': 0.17, 'CGG': 0.18, 'AGA': 0.13, 'AGG': 0.12},
    'I': {'ATT': 0.36, 'ATC': 0.50, 'ATA': 0.14},
    'M': {'ATG': 1.00},
    'T': {'ACT': 0.24, 'ACC': 0.32, 'ACA': 0.23, 'ACG': 0.20},
    'N': {'AAT': 0.45, 'AAC': 0.55},
    'K': {'AAA': 0.36, 'AAG': 0.64},
    'V': {'GTT': 0.27, 'GTC': 0.33, 'GTA': 0.13, 'GTG': 0.27},
    'A': {'GCT': 0.27, 'GCC': 0.30, 'GCA': 0.23, 'GCG': 0.20},
    'D': {'GAT': 0.53, 'GAC': 0.47},
    'E': {'GAA': 0.44, 'GAG': 0.56},
    'G': {'GGT': 0.28, 'GGC': 0.31, 'GGA': 0.24, 'GGG': 0.17}
}

### SPs

In [3]:
sp_df = pd.read_csv('../data/15_SecretoGen/Best_signal_peptides_for_A_oryzae_RFP_his_tag_sorted.csv')
sp_df = sp_df.rename(columns={"SP_ID": "SecretoGen_ID"})

sp_df

Unnamed: 0,SecretoGen_ID,Sequence,Perplexity
0,SP_997,MRPTLLALGVISFALTLHS,7.515804
1,SP_944,MKFTLIILIIVILTIIFSPGALA,7.421489
2,SP_948,MKVTLSLLAVFLAALSASAIPANG,7.179219
3,SP_993,MKAIIILLLIILTLALTVQG,7.140379
4,SP_204,MRVFSATAILALSPLLIASASP,6.925975
...,...,...,...
995,SP_656,MVLGILTITAIPTVTGLANA,1.597785
996,SP_696,MVLGLLTIAAIPTVTGVAQA,1.563614
997,SP_757,MVLGLLTIAAIPTVTGVGNAVSA,1.554127
998,SP_253,MVLGILTITAIPTVTGVAQA,1.535361


In [4]:
sp_df

Unnamed: 0,SecretoGen_ID,Sequence,Perplexity
0,SP_997,MRPTLLALGVISFALTLHS,7.515804
1,SP_944,MKFTLIILIIVILTIIFSPGALA,7.421489
2,SP_948,MKVTLSLLAVFLAALSASAIPANG,7.179219
3,SP_993,MKAIIILLLIILTLALTVQG,7.140379
4,SP_204,MRVFSATAILALSPLLIASASP,6.925975
...,...,...,...
995,SP_656,MVLGILTITAIPTVTGLANA,1.597785
996,SP_696,MVLGLLTIAAIPTVTGVAQA,1.563614
997,SP_757,MVLGLLTIAAIPTVTGVGNAVSA,1.554127
998,SP_253,MVLGILTITAIPTVTGVAQA,1.535361


In [5]:
# Adding the length of sequences to the dataframe
sp_df['Length'] = sp_df['Sequence'].apply(len)

# Removing sequences with length over 26
filtered_df = sp_df[sp_df['Length'] <= 26]
filtered_df

Unnamed: 0,SecretoGen_ID,Sequence,Perplexity,Length
0,SP_997,MRPTLLALGVISFALTLHS,7.515804,19
1,SP_944,MKFTLIILIIVILTIIFSPGALA,7.421489,23
2,SP_948,MKVTLSLLAVFLAALSASAIPANG,7.179219,24
3,SP_993,MKAIIILLLIILTLALTVQG,7.140379,20
4,SP_204,MRVFSATAILALSPLLIASASP,6.925975,22
...,...,...,...,...
995,SP_656,MVLGILTITAIPTVTGLANA,1.597785,20
996,SP_696,MVLGLLTIAAIPTVTGVAQA,1.563614,20
997,SP_757,MVLGLLTIAAIPTVTGVGNAVSA,1.554127,23
998,SP_253,MVLGILTITAIPTVTGVAQA,1.535361,20


In [6]:

# Getting the top 30
top_30 = sp_df.head(30)

# Sampling the remaining dataframe for a good mix
remaining_sample_size = 84 - 30
remaining_sample = sp_df.iloc[30:].sample(n=remaining_sample_size, random_state=1)

# Combining the top 30 with the remaining sample
final_sample = pd.concat([top_30, remaining_sample]).reset_index(drop=True)
final_sample

Unnamed: 0,SecretoGen_ID,Sequence,Perplexity,Length
0,SP_997,MRPTLLALGVISFALTLHS,7.515804,19
1,SP_944,MKFTLIILIIVILTIIFSPGALA,7.421489,23
2,SP_948,MKVTLSLLAVFLAALSASAIPANG,7.179219,24
3,SP_993,MKAIIILLLIILTLALTVQG,7.140379,20
4,SP_204,MRVFSATAILALSPLLIASASP,6.925975,22
...,...,...,...,...
79,SP_952,MRVSTIPTALLGLSLSAVNA,3.271387,20
80,SP_659,MVRLNSITALASLLAVSLTAA,3.640410,21
81,SP_576,MILSYRSLVVTTVALLGLSTPATA,3.873739,24
82,SP_560,MVLLKPLLATALLASLTPAVA,3.045672,21


# Back-translate

In [7]:
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
from pydna.dseq import Dseq

# Simplified back-translation codon table
amino_acid_to_codon = {
    'A': 'GCT', 'R': 'CGT', 'N': 'AAT', 'D': 'GAT',
    'C': 'TGT', 'Q': 'CAA', 'E': 'GAA', 'G': 'GGT',
    'H': 'CAT', 'I': 'ATT', 'L': 'CTT', 'K': 'AAA',
    'M': 'ATG', 'F': 'TTT', 'P': 'CCT', 'S': 'TCT',
    'T': 'ACT', 'W': 'TGG', 'Y': 'TAT', 'V': 'GTT',
    'X': 'NNN'  # 'X' represents any amino acid
}

def back_translate(protein_sequence):
    return ''.join([amino_acid_to_codon[aa] for aa in protein_sequence])


In [9]:

seq_records = []
for index, row in final_sample.iterrows():
    dna_sequence = back_translate(row['Sequence'])
    dseq_obj = Dseq(dna_sequence)
    dseqrecord = Dseqrecord(dseq_obj, name=row['SecretoGen_ID'], description=f"Perplexity: {row['Perplexity']}, Length: {row['Length']}")
    seq_records.append(dseqrecord)

seq_records

[Dseqrecord(-57),
 Dseqrecord(-69),
 Dseqrecord(-72),
 Dseqrecord(-60),
 Dseqrecord(-66),
 Dseqrecord(-57),
 Dseqrecord(-69),
 Dseqrecord(-60),
 Dseqrecord(-60),
 Dseqrecord(-75),
 Dseqrecord(-66),
 Dseqrecord(-63),
 Dseqrecord(-54),
 Dseqrecord(-51),
 Dseqrecord(-66),
 Dseqrecord(-57),
 Dseqrecord(-63),
 Dseqrecord(-63),
 Dseqrecord(-57),
 Dseqrecord(-51),
 Dseqrecord(-57),
 Dseqrecord(-84),
 Dseqrecord(-66),
 Dseqrecord(-54),
 Dseqrecord(-60),
 Dseqrecord(-63),
 Dseqrecord(-57),
 Dseqrecord(-54),
 Dseqrecord(-60),
 Dseqrecord(-60),
 Dseqrecord(-66),
 Dseqrecord(-57),
 Dseqrecord(-57),
 Dseqrecord(-63),
 Dseqrecord(-72),
 Dseqrecord(-66),
 Dseqrecord(-51),
 Dseqrecord(-63),
 Dseqrecord(-63),
 Dseqrecord(-63),
 Dseqrecord(-54),
 Dseqrecord(-60),
 Dseqrecord(-66),
 Dseqrecord(-60),
 Dseqrecord(-72),
 Dseqrecord(-60),
 Dseqrecord(-63),
 Dseqrecord(-66),
 Dseqrecord(-57),
 Dseqrecord(-60),
 Dseqrecord(-60),
 Dseqrecord(-63),
 Dseqrecord(-60),
 Dseqrecord(-63),
 Dseqrecord(-72),
 Dseqrecor

### Codon optimize the sequences: 

A. oryzae tax id : 5062

https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=5062&lvl=3&lin=f&keep=1&srchmode=1&unlock

In [10]:
from dnachisel import DnaOptimizationProblem, CodonOptimize

# Codon optimization for Aspergillus oryzae
codon_optimized_records = []
codon_optimized_records_str = []
taxonomy_id = 5062 # a oryzae

for seq in seq_records:
    problem = DnaOptimizationProblem(
        sequence=str(seq.seq),
        constraints=[],
        #objectives=[CodonOptimize(codon_usage_table=a_oryzae, method='match_codon_usage')], 
        objectives=[CodonOptimize(species=taxonomy_id, method='match_codon_usage')], 
    )
    problem.optimize()
    optimized_dna_sequence = str(problem.sequence)
    
    dseq_obj = Dseq(optimized_dna_sequence)
    dseqrecord = Dseqrecord(dseq_obj, name=row['SecretoGen_ID'], description=f"Perplexity: {row['Perplexity']}, Length: {row['Length']}")
    codon_optimized_records.append(dseqrecord)
    codon_optimized_records_str.append(str(dseqrecord.seq))
    
final_sample['codon_optimized_sequences']= codon_optimized_records_str

                                                                                  

## Final sample with sp_name 

In [12]:
final_sample['sp_name'] = [f'SP_{i}' for i in range(len(final_sample['SecretoGen_ID'].to_list())) ]

In [13]:
final_sample    

Unnamed: 0,SecretoGen_ID,Sequence,Perplexity,Length,codon_optimized_sequences,sp_name
0,SP_997,MRPTLLALGVISFALTLHS,7.515804,19,ATGATGATGACTATGATGTGGACAACCTGGATTACCATCACGCACA...,SP_0
1,SP_944,MKFTLIILIIVILTIIFSPGALA,7.421489,23,ATGACCTTTACTATCATCATTACGAAGATTATCTTCAAGAAAATTT...,SP_1
2,SP_948,MKVTLSLLAVFLAALSASAIPANG,7.179219,24,ATGAAAGCAGCGATGTTCTTCGCCGCTTGGAAGGCGGCCGCTTTTA...,SP_2
3,SP_993,MKAIIILLLIILTLALTVQG,7.140379,20,ATGATCAAAATTATTATCCTAATCCTTATTCTGCTCATATTGATCA...,SP_3
4,SP_204,MRVFSATAILALSPLLIASASP,6.925975,22,ATGATGCTGTTTGCAGCTGCGATCATTCTAGCCCTTTTGGCCTTCT...,SP_4
...,...,...,...,...,...,...
79,SP_952,MRVSTIPTALLGLSLSAVNA,3.271387,20,ATGATGGTTTGGACAACGGCCACTGCTGCGATGATTGTGAACATGA...,SP_79
80,SP_659,MVRLNSITALASLLAVSLTAA,3.640410,21,ATGTGGATGCTCAATCTGATTTTGGCTCTTGCAGCCATGCTAGCGG...,SP_80
81,SP_576,MILSYRSLVVTTVALLGLSTPATA,3.873739,24,ATGACCACAGTGTATGTCTGGATGGTGGTTATGTACGTTATCATGG...,SP_81
82,SP_560,MVLLKPLLATALLASLTPAVA,3.045672,21,ATGTGGGCAAAGAAAATGGCCGCAGCTGCCGCCATGATGGCTTGGG...,SP_82


# GEnerating primers

In [14]:
RFP = Dseqrecord(SeqIO.read("../data/10_genetic_parts/parts/RFP_w_B_homolgy.fasta", "fasta"))
pTef = Dseqrecord(SeqIO.read("../data/10_genetic_parts/parts/pTEF_w_A_homology.fasta", "fasta"))
print(f'The lenght of RFP is {len(RFP)}')
print(f'The lenght of pTef is {len(pTef)}')

The lenght of RFP is 1977
The lenght of pTef is 1384


In [15]:
from pydna.amplify import pcr
from pydna.primer import Primer

In [16]:
generic_pTef_fwd = Primer('CGAGACAGCAGAATCACCG') # generic 
generic_pTef_rev = Primer('GGTGAAGGTTGTGTTATGTTTTGTGG') # martis

generic_rfp_fwd = Primer('GCCTCCTCCGAGGACG') # martis
generic_rfp_rev = Primer('GAGGAGAGTGGATGGATAGTCTGG') # generic



## generating the overhangs

In [17]:
reverse_ovh_list = []
forward_ovh_list = []

for index, row in final_sample.iterrows():
    # get the sequence
    seq = str(row['codon_optimized_sequences'])

    # make the overhangs like marti propose: 
    # Forward
    fwd_ovh = seq[-44:] + generic_rfp_fwd
    fwd_ovh.name= f"{str(row['sp_name'])}_fwd"
    fwd_ovh.id = f"{str(row['sp_name'])}_fwd"

    # reverse
    rev_ovh = Seq(seq[:-14]).reverse_complement()+ generic_pTef_rev 
    rev_ovh.name = f"{str(row['sp_name'])}_rev"
    rev_ovh.id = f"{str(row['sp_name'])}_rev"


    forward_ovh_list.append(fwd_ovh)
    reverse_ovh_list.append(rev_ovh)

Amplicons 

In [18]:
from pydna.amplify import pcr
from pydna.primer import Primer

pTef_amplicons = []
for i in range(len(reverse_ovh_list)): 
    amplicon = pcr(generic_pTef_fwd, reverse_ovh_list[i], pTef)
    pTef_amplicons.append(amplicon)

In [19]:

RFP_amplicons = []
for i in range(len(forward_ovh_list)): 
    amplicon = pcr(forward_ovh_list[i], generic_rfp_rev, RFP)
    RFP_amplicons.append(amplicon)

# Testing the overlap

In [20]:
id_names= final_sample['sp_name'].to_list()

In [21]:
# changing the names of the amplicons
for i in range(len(pTef_amplicons)):
    pTef_amplicons[i].name =  f'pTef_{pTef_amplicons[i].name }_{id_names[i]}'
    
for i in range(len(RFP_amplicons)):
    RFP_amplicons[i].name =  f'RFP_{RFP_amplicons[i].name }_{id_names[i]}'

In [22]:
from pydna.assembly import Assembly

In [23]:
for i in range(len(RFP_amplicons)): 
    assemblyobj = Assembly([pTef_amplicons[i],RFP_amplicons[i]], limit = 18)
    contig = assemblyobj.assemble_linear()[0]

    print(contig.figure(), '\n')


pTef_929bp_PCR_prod_SP_0|30
                         \/
                         /\
                         30|RFP_2021bp_PCR_prod_SP_0 

pTef_941bp_PCR_prod_SP_1|30
                         \/
                         /\
                         30|RFP_2021bp_PCR_prod_SP_1 

pTef_944bp_PCR_prod_SP_2|30
                         \/
                         /\
                         30|RFP_2021bp_PCR_prod_SP_2 

pTef_932bp_PCR_prod_SP_3|30
                         \/
                         /\
                         30|RFP_2021bp_PCR_prod_SP_3 

pTef_938bp_PCR_prod_SP_4|30
                         \/
                         /\
                         30|RFP_2021bp_PCR_prod_SP_4 

pTef_929bp_PCR_prod_SP_5|30
                         \/
                         /\
                         30|RFP_2021bp_PCR_prod_SP_5 

pTef_941bp_PCR_prod_SP_6|30
                         \/
                         /\
                         30|RFP_2021bp_PCR_prod_SP_6 

pTef_932bp_PCR_prod_SP_7|30

In [24]:
# changing the names of the amplicons
forward_primers = []
reverse_primers = []

for i in range(len(RFP_amplicons)):
    forward_primers.append(str(RFP_amplicons[i].forward_primer.seq))

for i in range(len(pTef_amplicons)):
    
    reverse_primers.append(str(pTef_amplicons[i].reverse_primer.seq))


In [25]:
forward_ovh_list

[SP_0_fwd 60-mer:5'-TGATGTGGACAACCT..ACG-3',
 SP_1_fwd 60-mer:5'-AGATTATCTTCAAGA..ACG-3',
 SP_2_fwd 60-mer:5'-GGAAGGCGGCCGCTT..ACG-3',
 SP_3_fwd 60-mer:5'-TCCTAATCCTTATTC..ACG-3',
 SP_4_fwd 60-mer:5'-TCATTCTAGCCCTTT..ACG-3',
 SP_5_fwd 60-mer:5'-GGATGGCGATCATTA..ACG-3',
 SP_6_fwd 60-mer:5'-TCACGACAACTACTA..ACG-3',
 SP_7_fwd 60-mer:5'-TTGTCTGGACCATCA..ACG-3',
 SP_8_fwd 60-mer:5'-TCTGGTTTTGGTGGA..ACG-3',
 SP_9_fwd 60-mer:5'-CGACCACTACAAAGA..ACG-3',
 SP_10_fwd 60-mer:5'-ATACGATCCTATGGC..ACG-3',
 SP_11_fwd 60-mer:5'-GGTTCATCAAGGCTA..ACG-3',
 SP_12_fwd 60-mer:5'-TCCATATGATTTGGG..ACG-3',
 SP_13_fwd 60-mer:5'-TGTTCGCTGCAGCGT..ACG-3',
 SP_14_fwd 60-mer:5'-CCTTTTTCCTCCTGG..ACG-3',
 SP_15_fwd 60-mer:5'-TGGCAGCTGCAATGT..ACG-3',
 SP_16_fwd 60-mer:5'-TCACTTGCACACAAG..ACG-3',
 SP_17_fwd 60-mer:5'-TGGCAGCTGCAGCCT..ACG-3',
 SP_18_fwd 60-mer:5'-TCATCGCAATTTTCA..ACG-3',
 SP_19_fwd 60-mer:5'-TCGGATTCTTGTTTG..ACG-3',
 SP_20_fwd 60-mer:5'-TTTTCTTCCTTCTGT..ACG-3',
 SP_21_fwd 60-mer:5'-CTACAAACTGGCTAA..ACG-3'

In [26]:
reverse_ovh_list

[SP_0_rev 69-mer:5'-GCGTGATGGTAATCC..TGG-3',
 SP_1_rev 81-mer:5'-TGAAAAAGAAAATTT..TGG-3',
 SP_2_rev 84-mer:5'-TGGCGTTAGCGATAA..TGG-3',
 SP_3_rev 72-mer:5'-TGATCAATATGAGCA..TGG-3',
 SP_4_rev 78-mer:5'-GTGCGAAGAAGGCCA..TGG-3',
 SP_5_rev 69-mer:5'-TGGCCGTCATTATCA..TGG-3',
 SP_6_rev 81-mer:5'-TAATGGTATTGGTAA..TGG-3',
 SP_7_rev 72-mer:5'-TAACGGTCGTTGTGA..TGG-3',
 SP_8_rev 72-mer:5'-CAGCAAAGAACATAA..TGG-3',
 SP_9_rev 87-mer:5'-AAGTCGCAAAGGTCT..TGG-3',
 SP_10_rev 78-mer:5'-ACAAAAAGAGAAGCT..TGG-3',
 SP_11_rev 75-mer:5'-CCTTCATAGCGGCAA..TGG-3',
 SP_12_rev 66-mer:5'-ACACAACCATAGCTG..TGG-3',
 SP_13_rev 63-mer:5'-CCCACATCCAGATGA..TGG-3',
 SP_14_rev 78-mer:5'-CCAAAAGGATGAAAG..TGG-3',
 SP_15_rev 69-mer:5'-AGGCAAAAGCCTGCC..TGG-3',
 SP_16_rev 75-mer:5'-TTATGATCTGCATAG..TGG-3',
 SP_17_rev 75-mer:5'-AGGCGAACGCCGCGA..TGG-3',
 SP_18_rev 69-mer:5'-TCGCGATTGCAGCAA..TGG-3',
 SP_19_rev 63-mer:5'-GAAGCATACCGAGGC..TGG-3',
 SP_20_rev 69-mer:5'-GCAATAGCTTCTTCC..TGG-3',
 SP_21_rev 96-mer:5'-TATGGAAACACGTGA..TGG-3'

In [27]:
final_sample['fwd_primers(5-3)']= forward_primers
final_sample['rev_primers(5-3)']= reverse_primers
final_sample

Unnamed: 0,SecretoGen_ID,Sequence,Perplexity,Length,codon_optimized_sequences,sp_name,fwd_primers(5-3),rev_primers(5-3)
0,SP_997,MRPTLLALGVISFALTLHS,7.515804,19,ATGATGATGACTATGATGTGGACAACCTGGATTACCATCACGCACA...,SP_0,TGATGTGGACAACCTGGATTACCATCACGCACATGATGCATTGGGC...,GCGTGATGGTAATCCAGGTTGTCCACATCATAGTCATCATCATGGT...
1,SP_944,MKFTLIILIIVILTIIFSPGALA,7.421489,23,ATGACCTTTACTATCATCATTACGAAGATTATCTTCAAGAAAATTT...,SP_1,AGATTATCTTCAAGAAAATTTTCTTTTTCATGTGGATCACAATAGC...,TGAAAAAGAAAATTTTCTTGAAGATAATCTTCGTAATGATGATAGT...
2,SP_948,MKVTLSLLAVFLAALSASAIPANG,7.179219,24,ATGAAAGCAGCGATGTTCTTCGCCGCTTGGAAGGCGGCCGCTTTTA...,SP_2,GGAAGGCGGCCGCTTTTATCGCTAACGCCATTGCAAAGAATTGGGC...,TGGCGTTAGCGATAAAAGCGGCCGCCTTCCAAGCGGCGAAGAACAT...
3,SP_993,MKAIIILLLIILTLALTVQG,7.140379,20,ATGATCAAAATTATTATCCTAATCCTTATTCTGCTCATATTGATCA...,SP_3,TCCTAATCCTTATTCTGCTCATATTGATCATTAAGCAGCAAATCGC...,TGATCAATATGAGCAGAATAAGGATTAGGATAATAATTTTGATCAT...
4,SP_204,MRVFSATAILALSPLLIASASP,6.925975,22,ATGATGCTGTTTGCAGCTGCGATCATTCTAGCCCTTTTGGCCTTCT...,SP_4,TCATTCTAGCCCTTTTGGCCTTCTTCGCACTCTGGGCTATCATAGC...,GTGCGAAGAAGGCCAAAAGGGCTAGAATGATCGCAGCTGCAAACAG...
...,...,...,...,...,...,...,...,...
79,SP_952,MRVSTIPTALLGLSLSAVNA,3.271387,20,ATGATGGTTTGGACAACGGCCACTGCTGCGATGATTGTGAACATGA...,SP_79,CGGCCACTGCTGCGATGATTGTGAACATGATCGTCACCAATGCAGC...,TCATGTTCACAATCATCGCAGCAGTGGCCGTTGTCCAAACCATCAT...
80,SP_659,MVRLNSITALASLLAVSLTAA,3.640410,21,ATGTGGATGCTCAATCTGATTTTGGCTCTTGCAGCCATGCTAGCGG...,SP_80,TTTTGGCTCTTGCAGCCATGCTAGCGGCCTGGAACATCGCTGCAGC...,AGGCCGCTAGCATGGCTGCAAGAGCCAAAATCAGATTGAGCATCCA...
81,SP_576,MILSYRSLVVTTVALLGLSTPATA,3.873739,24,ATGACCACAGTGTATGTCTGGATGGTGGTTATGTACGTTATCATGG...,SP_81,TTATGTACGTTATCATGGTATGGACCACGATTATGTGGACTGTCGC...,TCGTGGTCCATACCATGATAACGTACATAACCACCATCCAGACATA...
82,SP_560,MVLLKPLLATALLASLTPAVA,3.045672,21,ATGTGGGCAAAGAAAATGGCCGCAGCTGCCGCCATGATGGCTTGGG...,SP_82,CCGCAGCTGCCGCCATGATGGCTTGGGCGATGATGGCTGCGAAGGC...,TCGCCCAAGCCATCATGGCGGCAGCTGCGGCCATTTTCTTTGCCCA...


For IDT:

In [28]:
fwd_name = [seq.name for seq in forward_ovh_list]
fwd_seq = [str(seq.seq) for seq in forward_ovh_list]

rev_name = [seq.name for seq in reverse_ovh_list]
rev_seq = [str(seq.seq) for seq in reverse_ovh_list]


In [29]:
forward_primers_well_plate = pd.read_excel('../data/example_plate-file-upload (1).xls')[:84]
forward_primers_well_plate['Name'] = fwd_name
forward_primers_well_plate['Sequence'] = fwd_seq

forward_primers_well_plate


Unnamed: 0,Well Position,Name,Sequence
0,A1,SP_0_fwd,TGATGTGGACAACCTGGATTACCATCACGCACATGATGCATTGGGC...
1,A2,SP_1_fwd,AGATTATCTTCAAGAAAATTTTCTTTTTCATGTGGATCACAATAGC...
2,A3,SP_2_fwd,GGAAGGCGGCCGCTTTTATCGCTAACGCCATTGCAAAGAATTGGGC...
3,A4,SP_3_fwd,TCCTAATCCTTATTCTGCTCATATTGATCATTAAGCAGCAAATCGC...
4,A5,SP_4_fwd,TCATTCTAGCCCTTTTGGCCTTCTTCGCACTCTGGGCTATCATAGC...
...,...,...,...
79,G8,SP_79_fwd,CGGCCACTGCTGCGATGATTGTGAACATGATCGTCACCAATGCAGC...
80,G9,SP_80_fwd,TTTTGGCTCTTGCAGCCATGCTAGCGGCCTGGAACATCGCTGCAGC...
81,G10,SP_81_fwd,TTATGTACGTTATCATGGTATGGACCACGATTATGTGGACTGTCGC...
82,G11,SP_82_fwd,CCGCAGCTGCCGCCATGATGGCTTGGGCGATGATGGCTGCGAAGGC...


In [30]:
reverse_primers_well_plate = pd.read_excel('../data/example_plate-file-upload (1).xls')[:84]
reverse_primers_well_plate['Name'] = rev_name
reverse_primers_well_plate['Sequence'] = rev_seq

reverse_primers_well_plate

Unnamed: 0,Well Position,Name,Sequence
0,A1,SP_0_rev,GCGTGATGGTAATCCAGGTTGTCCACATCATAGTCATCATCATGGT...
1,A2,SP_1_rev,TGAAAAAGAAAATTTTCTTGAAGATAATCTTCGTAATGATGATAGT...
2,A3,SP_2_rev,TGGCGTTAGCGATAAAAGCGGCCGCCTTCCAAGCGGCGAAGAACAT...
3,A4,SP_3_rev,TGATCAATATGAGCAGAATAAGGATTAGGATAATAATTTTGATCAT...
4,A5,SP_4_rev,GTGCGAAGAAGGCCAAAAGGGCTAGAATGATCGCAGCTGCAAACAG...
...,...,...,...
79,G8,SP_79_rev,TCATGTTCACAATCATCGCAGCAGTGGCCGTTGTCCAAACCATCAT...
80,G9,SP_80_rev,AGGCCGCTAGCATGGCTGCAAGAGCCAAAATCAGATTGAGCATCCA...
81,G10,SP_81_rev,TCGTGGTCCATACCATGATAACGTACATAACCACCATCCAGACATA...
82,G11,SP_82_rev,TCGCCCAAGCCATCATGGCGGCAGCTGCGGCCATTTTCTTTGCCCA...


In [31]:
forward_primers_well_plate.to_excel('../data/09_primers/forward_primers_2024_08_02.xlsx')
reverse_primers_well_plate.to_excel('../data/09_primers/reverse_primers_2024_08_02.xlsx')
final_sample.to_excel('../data/09_primers/full_dataframe_of_sampled_SPs_2024_08_02.xlsx')
