In [1]:
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
import pandas as pd
from typing import List, Dict, Union

# Designing Primers for Kozak analysis

In this notebook we explore the combinatorial space of the most abundant Kozak sequences and make repair primers for the experiments.

In [2]:
# Unrestricted search space: all four nucleotides at each of five positions.
# A comprehension gives every position its own list object, whereas
# `[list("ATGC")] * 5` would repeat one shared list five times, so editing
# one position would silently edit all of them.
nucleotide_list = [list("ATGC") for _ in range(5)]
print(nucleotide_list)

[['A', 'T', 'G', 'C'], ['A', 'T', 'G', 'C'], ['A', 'T', 'G', 'C'], ['A', 'T', 'G', 'C'], ['A', 'T', 'G', 'C']]


We want to limit our search space, so we restrict the combinatorial space to the following nucleotides at each position, which were the most abundant in the PWM analysis.

In [3]:
# Candidate nucleotides allowed at each of the five variable positions.
# NOTE(review): position1 presumably ends up as the leftmost character of each
# generated sequence and position5 as the rightmost (consistent with the table
# displayed later in this notebook) - confirm against get_combinatorial_list.
position1 = list("CTA")
position2 = list("CA")
position3 = list("AG")
position4 = list("CTA")
position5 = list("CGA")

# Rebinds `nucleotide_list` from the unrestricted four-nucleotide space to the
# restricted per-position choices defined above.
nucleotide_list = [position1, position2, position3, position4, position5]

In [4]:
from teemi.design.combinatorial_design import get_combinatorial_list

In [5]:
# Build every combination that takes one nucleotide from each position list.
# NOTE(review): get_combinatorial_list is assumed to return the full Cartesian
# product (3 * 2 * 2 * 3 * 3 = 108 entries), each entry being a sequence of
# single-character strings - confirm against the teemi documentation.
kozak = get_combinatorial_list(nucleotide_list)

In [6]:
# Make them into strings

def make_to_string(list_of_list):
    """Concatenate each inner sequence of strings into a single string.

    Parameters
    ----------
    list_of_list : iterable of iterables of str
        Outer collection whose items are sequences of string fragments,
        e.g. ``[["C", "C", "A"], ["A", "T"]]``.

    Returns
    -------
    list of str
        One joined string per inner item, in the original order,
        e.g. ``["CCA", "AT"]``. An empty outer collection gives ``[]`` and
        an empty inner item gives ``""``.

    Raises
    ------
    TypeError
        If an inner item contains an element that is not a string.
    """
    # str.join builds each string in one pass, replacing the manual
    # accumulate-and-reset loop with repeated `+=`.
    return ["".join(fragments) for fragments in list_of_list]


# Collapse every combination into one plain string per Kozak variant
# (e.g. "CCACC"), ready to be concatenated with the primer flanks.
all_combinations_as_str = make_to_string(kozak)

In [7]:
# Constant flanks shared by every primer; the variable Kozak variant is placed
# between them (primer = five_prime + kozak + three_prime).
# Upstream flank, placed directly before the Kozak variant.
five_prime = 'TATATTCCACAAAACATAACACAACCT'
# Downstream flank, placed directly after the Kozak variant; it begins with ATG.
# NOTE(review): presumably the start codon of the targeted ORF - confirm.
three_prime = 'ATGGCCTCCTCCGAGGACGTCATCAAGG'

In [8]:
# Repeat the constant flanks once per Kozak variant so they line up
# row-for-row with `all_combinations_as_str`. Strings are immutable, so
# repeating the same object is safe; there is no need to split the flanks
# into lists and join them back together with make_to_string.
five_prime_list = [five_prime] * len(all_combinations_as_str)
three_prime_list = [three_prime] * len(all_combinations_as_str)


In [9]:
# Column name -> column values for the primer table. The two flank lists were
# built with one entry per Kozak variant, so all three columns have equal length.
muy_dict = {'five':five_prime_list, "kozak":all_combinations_as_str, "three":three_prime_list}

In [10]:
# Assemble the primer table: one row per Kozak variant, with the full primer
# formed as upstream flank + Kozak variant + downstream flank.
kozak_df = pd.DataFrame(muy_dict).assign(
    primer=lambda table: table['five'] + table['kozak'] + table['three']
)
kozak_df

Unnamed: 0,five,kozak,three,primer
0,TATATTCCACAAAACATAACACAACCT,CCACC,ATGGCCTCCTCCGAGGACGTCATCAAGG,TATATTCCACAAAACATAACACAACCTCCACCATGGCCTCCTCCGA...
1,TATATTCCACAAAACATAACACAACCT,CCACG,ATGGCCTCCTCCGAGGACGTCATCAAGG,TATATTCCACAAAACATAACACAACCTCCACGATGGCCTCCTCCGA...
2,TATATTCCACAAAACATAACACAACCT,CCACA,ATGGCCTCCTCCGAGGACGTCATCAAGG,TATATTCCACAAAACATAACACAACCTCCACAATGGCCTCCTCCGA...
3,TATATTCCACAAAACATAACACAACCT,CCATC,ATGGCCTCCTCCGAGGACGTCATCAAGG,TATATTCCACAAAACATAACACAACCTCCATCATGGCCTCCTCCGA...
4,TATATTCCACAAAACATAACACAACCT,CCATG,ATGGCCTCCTCCGAGGACGTCATCAAGG,TATATTCCACAAAACATAACACAACCTCCATGATGGCCTCCTCCGA...
...,...,...,...,...
103,TATATTCCACAAAACATAACACAACCT,AAGTG,ATGGCCTCCTCCGAGGACGTCATCAAGG,TATATTCCACAAAACATAACACAACCTAAGTGATGGCCTCCTCCGA...
104,TATATTCCACAAAACATAACACAACCT,AAGTA,ATGGCCTCCTCCGAGGACGTCATCAAGG,TATATTCCACAAAACATAACACAACCTAAGTAATGGCCTCCTCCGA...
105,TATATTCCACAAAACATAACACAACCT,AAGAC,ATGGCCTCCTCCGAGGACGTCATCAAGG,TATATTCCACAAAACATAACACAACCTAAGACATGGCCTCCTCCGA...
106,TATATTCCACAAAACATAACACAACCT,AAGAG,ATGGCCTCCTCCGAGGACGTCATCAAGG,TATATTCCACAAAACATAACACAACCTAAGAGATGGCCTCCTCCGA...
