## Data

In [1]:
import numpy as np
from kerasy.utils import generateSeq

In [2]:
seed = 1  # passed to generateSeq as `seed=` when the sequence is generated
n = 60    # passed to generateSeq as `size=`: length of the generated sequence

In [3]:
# Generate `n` RNA bases with kerasy's generateSeq (seeded with `seed`),
# then concatenate the individual bases into a single string.
bases = generateSeq(size=n, nucleic_acid="RNA", seed=seed)
sequence = "".join(bases)

In [4]:
print(sequence)

AUGGUCGCUGCCCGAAGAGUAAUCUAUCUAUUAGAGGGCCGCCCGCUGCCUAUGAAAUUG


## Search

In [5]:
from kerasy.bio.string import StringSearch

In [6]:
model = StringSearch()

In [7]:
# Index `sequence` for searching. Per the log printed below, this builds the
# suffix array, the auxiliary data structure for the BWT, and the
# longest-common-prefix array.
model.build(sequence)

Building Suffix Array...
Building Auxiliary data structure for BWT...
Building Longest-Common-Prefix Array...


#### Suffix Array

In [8]:
model.SuffixArray()

 i SA  H Suffix
---------------
 0 60  0 $
 1 54  2 AAAUUG$
 2 14  2 AAGAGUAAUCUAUCUAUUAGAGGGCCGCCCGCUGCCUAUGAAAUUG$
 3 20  3 AAUCUAUCUAUUAGAGGGCCGCCCGCUGCCUAUGAAAUUG$
 4 55  1 AAUUG$
 5 32  4 AGAGGGCCGCCCGCUGCCUAUGAAAUUG$
 6 15  2 AGAGUAAUCUAUCUAUUAGAGGGCCGCCCGCUGCCUAUGAAAUUG$
 7 34  2 AGGGCCGCCCGCUGCCUAUGAAAUUG$
 8 17  1 AGUAAUCUAUCUAUUAGAGGGCCGCCCGCUGCCUAUGAAAUUG$
 9 21  6 AUCUAUCUAUUAGAGGGCCGCCCGCUGCCUAUGAAAUUG$
10 25  2 AUCUAUUAGAGGGCCGCCCGCUGCCUAUGAAAUUG$
11 51  3 AUGAAAUUG$
12  0  2 AUGGUCGCUGCCCGAAGAGUAAUCUAUCUAUUAGAGGGCCGCCCGCUGCCUAUGAAAUUG$
13 29  3 AUUAGAGGGCCGCCCGCUGCCUAUGAAAUUG$
14 56  0 AUUG$
15 10  4 CCCGAAGAGUAAUCUAUCUAUUAGAGGGCCGCCCGCUGCCUAUGAAAUUG$
16 41  2 CCCGCUGCCUAUGAAAUUG$
17 11  3 CCGAAGAGUAAUCUAUCUAUUAGAGGGCCGCCCGCUGCCUAUGAAAUUG$
18 38  4 CCGCCCGCUGCCUAUGAAAUUG$
19 42  2 CCGCUGCCUAUGAAAUUG$
20 48  1 CCUAUGAAAUUG$
21 12  2 CGAAGAGUAAUCUAUCUAUUAGAGGGCCGCCCGCUGCCUAUGAAAUUG$
22 39  3 CGCCCGCUGCCUAUGAAAUUG$
23  5  7 CGCUGCCCGAAGAGUAAUCUAUCUAUUAGAGGGCCGCCCGCUGCCUAUGA

#### Search

In [9]:
# "AUG" is the RNA start codon, which encodes methionine (Met).
# It is used as the query pattern in the search cells below.
Met = "AUG"

In [10]:
# Look up every occurrence of the pattern in the indexed sequence. The result
# shown below holds the 0-based start positions of the matches (here 51 and 0).
model.search(Met)

array([51,  0])

In [11]:
model.where(Met)

Model: Suffix Array
Number of matches: 2

S: [00] AUGGUCGCUGCCCGAAGAGUAAUCUAUCUAUUAGAGGGCCGCCCGCUGCCUAUGAAAUUG [59]
 : [00] ***                                                ***       [59]


#### Burrows-Wheeler Transform (BWT)

In [12]:
bwt = "".join(model.BWT)

In [13]:
print(bwt)

GGGUAUAGGAUU$UAGGCGCGCCUCUCUGGUUCAAUCGUCCGAUAGGUCCCGAAUACCAAA


#### Reverse BWT without Suffix Array

In [14]:
from kerasy.bio.string import reverseBWT

In [15]:
string = reverseBWT(bwt)

In [16]:
# Verify the round trip programmatically instead of relying on a visual
# comparison of the two printed lines: inverting the BWT must give back
# exactly the sequence the index was built from.
assert string == sequence, "reverseBWT(bwt) did not reproduce the original sequence"

# Print both strings aligned so the match can also be checked by eye.
print(f"original               : {sequence}")
print(f"original→SA→BWT→reverse: {string}")

original               : AUGGUCGCUGCCCGAAGAGUAAUCUAUCUAUUAGAGGGCCGCCCGCUGCCUAUGAAAUUG
original→SA→BWT→reverse: AUGGUCGCUGCCCGAAGAGUAAUCUAUCUAUUAGAGGGCCGCCCGCUGCCUAUGAAAUUG


**The original sequence is perfectly restored from its BWT!**

## Compression 

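The BWT tends to group identical characters into runs, which makes a text easier to compress; here it is applied to the text of a Wikipedia page. The text is fetched with the helper `getAlltext`; for its implementation, see the ["Python-utils" repository](https://github.com/iwasakishuto/Python-utils/blob/22e0a1e7041f61bf63882a9db535bf777ae655bb/Scraping/scrap_utils.py#L5).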

In [17]:
from Scraping.scrap_utils import getAlltext

In [18]:
# Get all the text of 'https://en.wikipedia.org/wiki/Keras' as one string.
# NOTE(review): getAlltext presumably fetches the live page over the network,
# so `text` (and the compression rate printed at the end) can change whenever
# the article is edited. TODO confirm, and consider caching the text locally.
# NOTE(review): `joint=""` presumably concatenates the extracted pieces with
# no separator; verify against the helper's source.
text = getAlltext("https://en.wikipedia.org/wiki/Keras", joint="")

In [19]:
from kerasy.bio.string import SAIS, mkBWT

In [20]:
# Build the suffix array of `text` (SAIS: presumably Suffix Array Induced
# Sorting; confirm in kerasy.bio.string), then derive the BWT of `text` from it.
SA = SAIS(text)
BWT = mkBWT(text, SA)

In [21]:
# Join the BWT symbols into a single string.
# NOTE: this rebinds `bwt`, which earlier held the BWT of the RNA sequence.
# Re-running the "Reverse BWT" cells after this one would operate on the
# Wikipedia text instead.
bwt = "".join(BWT)

In [22]:
from kerasy.bio.string import simple_compression
from kerasy.bio.string import simple_decompression

In [23]:
bwt_compressed = simple_compression(bwt)

In [24]:
# Size of the compressed result relative to the uncompressed BWT string,
# reported as a percentage (a smaller value means better compression).
compression_ratio = len(bwt_compressed) / len(bwt)
print(f"BWT Compression rate: {compression_ratio * 100:.3f}%")

BWT Compression rate: 76.962%
