In [1]:
%run dataset.ipynb

[INFO] DataManager: Dataset already exists, skipping fetch


In [2]:
data_folder = os.path.join("..", "res", "datasets")


In [3]:
dman = DataManager("mymail@example.com", data_folder)


In [4]:
dman

<__main__.DataManager at 0x18b517386a0>

In [5]:

dataset_name = "RNA Dataset"
queries = ["RNA", "mRNA", "tRNA"]

In [6]:
dman.create_dataset(queries, dataset_name, 5)

[INFO] DataManager: Dataset already exists, skipping fetch


In [7]:
abstracts = dman.load_full_dataset(dataset_name)

In [9]:
import numpy as np

In [10]:
from tensorflow.keras.preprocessing.text import Tokenizer

[DEBUG] tensorflow: Falling back to TensorFlow client; we recommended you install the Cloud TPU client directly with pip install cloud-tpu-client.


In [11]:
tokenizer = Tokenizer()

In [12]:
from tensorflow.keras.utils import to_categorical

In [13]:
from tensorflow.keras.models import Sequential

In [14]:
from tensorflow.keras.layers import Dense, LSTM, Embedding

In [15]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [24]:
abstracts[0]

'Long noncoding RNA nuclear paraspeckle assembly transcript 1 (lncRNA NEAT1) is abnormally expressed in numerous tumors and functions as an oncogene, but the role of NEAT1 in laryngocarcinoma is largely unknown. Our study validated that NEAT1 expression was markedly upregulated in laryngocarcinoma tissues and cells. Downregulation of NEAT1 dramatically suppressed cell proliferation and invasion through inhibiting miR-524-5p expression. Additionally, NEAT1 overexpression promoted cell growth and metastasis, while overexpression of miR-524-5p could reverse the effect. NEAT1 increased the expression of histone deacetylase 1 gene (HDAC1) via sponging miR-524-5p. Mechanistically, overexpression of HDAC1 recovered the cancer-inhibiting effects of miR-524-5p mimic or NEAT1 silence by deacetylation of tensin homolog deleted on chromosome ten (PTEN) and inhibiting AKT signal pathway. Moreover, in vivo experiments indicated that silence of NEAT1 signally suppressed tumor growth. Taken together, 

In [25]:
type(abstracts[0])

str

In [29]:
def clean_text(doc):
    tokens = doc.split()
    table =str.maketrans('','', string.punctuation)
    tokens = [w.translate(table) for w in tokens]
    tokens = [word for word in tokens if word.isalpha()]
    tokens = [word.lower() for word in tokens]
    return tokens

In [30]:
tokens = clean_text(abstracts[0])

In [32]:
len(tokens)

134

In [33]:
len(set(tokens))

94

In [35]:
length = 30+1
lines = []
for i in range(length, len(tokens)):
    seq = tokens[i-length:i]
    line = ' '.join(seq)
    lines.append(line)

In [49]:
lines = []

In [50]:
length = 50+1

In [51]:
for abstract in abstracts:
    tokens = clean_text(abstract)
    for i in range(length, len(tokens)):
        seq = tokens[i-length:i]
        line = ' '.join(seq)
        lines.append(line)

In [53]:
len(lines)

2262

In [39]:
###Build LSTM Model and Prepare x and y

In [56]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
sequences = tokenizer.texts_to_sequences(lines)

In [58]:
sequences = np.array(sequences)

In [59]:
X, y = sequences[:, :-1], sequences[:, -1]

In [60]:
X[0]

array([1039,  289,   30,  746, 1018, 1009,  998,  988,   17,  969,  350,
          4,  949,  850,    2,  919,   19,   34,  893,  186,    1,  116,
          3,    4,  314,   17,  341,  336,   64,   23,  199,   15,    7,
          6,  797,  197,    4,  314,   74,    2,   14,  747,    3,  733,
        174,   25,   39,    2,   89,   53])

In [61]:
y[0]

204

In [66]:
tokenizer.word_index

{'the': 1,
 'and': 2,
 'of': 3,
 'in': 4,
 'to': 5,
 'was': 6,
 'expression': 7,
 'were': 8,
 'with': 9,
 'a': 10,
 'by': 11,
 'for': 12,
 'protein': 13,
 'cells': 14,
 'that': 15,
 'this': 16,
 'is': 17,
 'p': 18,
 'as': 19,
 'we': 20,
 'from': 21,
 'levels': 22,
 'study': 23,
 'results': 24,
 'cell': 25,
 'nscs': 26,
 'using': 27,
 'showed': 28,
 'gene': 29,
 'rna': 30,
 'increased': 31,
 'genes': 32,
 'analysis': 33,
 'an': 34,
 'transcription': 35,
 'significantly': 36,
 'on': 37,
 'different': 38,
 'proliferation': 39,
 'or': 40,
 'most': 41,
 'which': 42,
 'sourdough': 43,
 'viability': 44,
 'b': 45,
 'identified': 46,
 'mrna': 47,
 'after': 48,
 'potential': 49,
 'compared': 50,
 'decidualization': 51,
 'dlnbp': 52,
 'through': 53,
 'related': 54,
 'evaluated': 55,
 'stable': 56,
 'ribosomal': 57,
 'treated': 58,
 'trfs': 59,
 'signal': 60,
 'crayfish': 61,
 'decreased': 62,
 'viruses': 63,
 'our': 64,
 'function': 65,
 'can': 66,
 'their': 67,
 'hypoxic': 68,
 'reverse': 69,
 '

In [63]:
vocab_size = len(tokenizer.word_index) + 1

In [64]:
vocab_size

1048

In [68]:
y = to_categorical(y, num_classes=vocab_size)

In [69]:
seq_length = X.shape[1]

In [70]:
###LSTM Model

In [71]:
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=seq_length))
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(100))
model.add(Dense(100, activation='relu'))
model.add(Dense(vocab_size, activation='softmax'))

In [72]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 50, 50)            52400     
                                                                 
 lstm (LSTM)                 (None, 50, 100)           60400     
                                                                 
 lstm_1 (LSTM)               (None, 100)               80400     
                                                                 
 dense (Dense)               (None, 100)               10100     
                                                                 
 dense_1 (Dense)             (None, 1048)              105848    
                                                                 
Total params: 309,148
Trainable params: 309,148
Non-trainable params: 0
_________________________________________________________________


In [73]:
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

In [74]:
model.fit(X, y, batch_size =256, epochs =30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x18b5f301e80>