In [54]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from Bio import SeqIO
from Bio.Data import CodonTable
from Bio.Restriction.Restriction_Dictionary import rest_dict
import tqdm

In [22]:
codontable = CodonTable.ambiguous_dna_by_id[1]
possible_nucls = codontable.nucleotide_alphabet + "-"
possible_nucls

'GATCRYWSMKHBVDN-'

In [20]:
PATH_TO_MULAL = "../data/interim/mulal.fasta"

### Generate table summarizing genomes site states

In [38]:
fasta = SeqIO.parse(PATH_TO_MULAL, "fasta")
ref_rec = next(fasta)
base_dct = {nucl: 0 for nucl in possible_nucls}
data = [base_dct.copy() for _ in range(len(ref_rec.seq))]
for rec in tqdm.tqdm(fasta, "Seqs", 56446):
    seq = str(rec.seq)
    for i, nucl in enumerate(seq):
        data[i][nucl] += 1


Seqs: 100%|█████████▉| 56445/56446 [02:11<00:00, 429.02it/s]


In [53]:
df = pd.DataFrame(data)
df.reset_index(inplace=True)
df.rename(columns={"index": "Pos"}, inplace=True)
df["Pos"] += 1
df.to_csv("../data/share/genomes_summary.tsv", sep="\t", index=None)
df.tail()

Unnamed: 0,Pos,G,A,T,C,R,Y,W,S,M,K,H,B,V,D,N,-
16564,16565,0,1,0,55312,0,0,0,0,0,0,0,0,0,0,53,1079
16565,16566,55266,16,0,0,0,0,0,0,0,0,0,0,0,0,58,1105
16566,16567,0,55264,0,1,0,0,0,0,0,0,0,0,0,0,54,1126
16567,16568,0,2,55206,7,0,0,0,0,0,0,0,0,0,0,58,1172
16568,16569,55098,10,1,1,0,0,0,0,0,0,0,0,0,0,61,1274


### Ungappy sites (currently unused)

In [73]:
ungappy = df.loc[np.where(df["-"].values == 0)]
print(ungappy.shape)
ungappy

(11133, 17)


Unnamed: 0,Pos,G,A,T,C,R,Y,W,S,M,K,H,B,V,D,N,-
579,580,0,0,56439,0,0,0,0,0,0,0,0,0,0,0,6,0
580,581,0,56438,0,0,0,0,0,0,0,0,0,0,0,0,7,0
581,582,0,0,56442,0,0,0,0,0,0,0,0,0,0,0,3,0
582,583,56441,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0
584,585,1,56442,0,0,0,0,0,0,0,0,0,0,0,0,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14741,14742,0,0,1,56441,0,0,0,0,0,0,0,0,0,0,3,0
14742,14743,9,56432,0,1,0,0,0,0,0,0,0,0,0,0,3,0
14743,14744,0,0,0,56442,0,0,0,0,0,0,0,0,0,0,3,0
14744,14745,0,0,8,56434,0,0,0,0,0,0,0,0,0,0,3,0


### Fully constant sites

In [83]:
constant_pos = df.drop("Pos", axis=1).apply(lambda x: np.sum(x.values > 0) == 1, axis=1).values
df_const = df.loc[constant_pos]
df_const.head()

Unnamed: 0,Pos,G,A,T,C,R,Y,W,S,M,K,H,B,V,D,N,-
623,624,0,0,0,56445,0,0,0,0,0,0,0,0,0,0,0,0
624,625,56445,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
625,626,56445,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
636,637,0,0,0,56445,0,0,0,0,0,0,0,0,0,0,0,0
637,638,0,0,0,56445,0,0,0,0,0,0,0,0,0,0,0,0


#### Extract fully constant sequences 

In [88]:
prev = -10
x = df_const.Pos.values
cur_pos_set = set()
colection_of_sequential = []
for i in range(x.shape[0] - 1):
    if x[i] + 1 == x[i + 1]:
        cur_pos_set.add(x[i])
        cur_pos_set.add(x[i + 1])
    else:
        if len(cur_pos_set) > 3:
            colection_of_sequential.append(sorted(cur_pos_set))
        cur_pos_set = set()


In [89]:
colection_of_sequential

[[637, 638, 639, 640, 641, 642],
 [690, 691, 692, 693, 694],
 [1621, 1622, 1623, 1624],
 [3299, 3300, 3301, 3302, 3303, 3304],
 [4144, 4145, 4146, 4147],
 [4521, 4522, 4523, 4524],
 [4980, 4981, 4982, 4983],
 [7086, 7087, 7088, 7089, 7090],
 [7101, 7102, 7103, 7104, 7105],
 [9169, 9170, 9171, 9172, 9173],
 [10758, 10759, 10760, 10761, 10762],
 [12327, 12328, 12329, 12330],
 [13232, 13233, 13234, 13235],
 [13549, 13550, 13551, 13552, 13553, 13554, 13555, 13556],
 [14009, 14010, 14011, 14012]]

In [111]:
nucls = list("ACGT")
potential_sites = []
for sequential in colection_of_sequential:
    da = df_const[df_const.Pos.isin(sequential)][nucls]
    _pos = np.where(da.values == 56445)[1]
    seq = "".join([nucls[i] for i in _pos])
    potential_sites.append(seq)

potential_sites

['CCCCAT',
 'TACAC',
 'AAGC',
 'TTAACA',
 'TTCC',
 'CTCA',
 'AACC',
 'CACTG',
 'TTCTC',
 'ACACT',
 'CAATG',
 'AATA',
 'AAAA',
 'AACGCCTG',
 'TAGA']

In [129]:
mpr = str.maketrans("ACGT", "TGCA")
"ACTTTCG".translate(mpr)  # check

'TGAAAGC'

In [135]:
extended_potential_sites = []
for ps in potential_sites:
    new_ps = ps.translate(mpr)
    extended_potential_sites.append(ps)
    extended_potential_sites.append(new_ps[::-1])
print(extended_potential_sites)

['CCCCAT', 'ATGGGG', 'TACAC', 'GTGTA', 'AAGC', 'GCTT', 'TTAACA', 'TGTTAA', 'TTCC', 'GGAA', 'CTCA', 'TGAG', 'AACC', 'GGTT', 'CACTG', 'CAGTG', 'TTCTC', 'GAGAA', 'ACACT', 'AGTGT', 'CAATG', 'CATTG', 'AATA', 'TATT', 'AAAA', 'TTTT', 'AACGCCTG', 'CAGGCGTT', 'TAGA', 'TCTA']


#### Search for restriction sites in the fully constant sequences

In [137]:
for enzyme, propperties in rest_dict.items():
    esite = propperties["site"]
    if esite in extended_potential_sites:
        print(enzyme, esite)


BtsIMutI CAGTG


### Restriction site found - **BtsIMutI**. All used human mtDNA contain this site at least on one strand