In [1]:
import pandas as pd
import pickle
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import torch
from transformers import BertTokenizer, BertModel, AutoModel, AutoTokenizer
from transformers import AutoTokenizer
import unicodedata
import re
import numpy as np
import sklearn
import warnings
warnings.filterwarnings("ignore")

In [2]:
model_path = "Trained_Models/abstracts only/BIOBERTabs-nsl/best_BIOBERT-nsl_lrc.pickle"


In [3]:
loaded_model = pickle.load(open(model_path, 'rb'))

In [4]:
with open('examples/GAVISUNK.txt') as f:
    test_abstract = f.read()
    print(test_abstract)

GAVISUNK: Genome assembly validation via inter-SUNK distances in Oxford Nanopore reads
 Highly contiguous de novo genome assemblies are now feasible for large numbers of species and individuals. Methods are needed to validate assembly accuracy and detect misassemblies with orthologous sequencing data to allow for confident downstream analyses. We developed GAVISUNK, an open-source pipeline that detects misassemblies and produces a set of reliable regions genome-wide by assessing concordance of distances between unique k-mers in Pacific Biosciences high-fidelity (HiFi) assemblies and raw Oxford Nanopore Technologies reads.


In [6]:
# Lowercasing the text
test_abstract = test_abstract.lower()


In [7]:
# removing links
regex_link = r"\bhttp[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+\b"
test_abstract = test_abstract.replace(regex_link, "")

In [8]:
# removing numbers
regex_nums = r"\b[0-9][0-9]*\b"
test_abstract = test_abstract.replace(regex_nums, "")

In [9]:
# removing special characters
special_character = list("←=()[]/‘’|><\\∼+%$&×–−-·")
for spec_char in special_character:
    test_abstract = test_abstract.replace(spec_char, '')

In [10]:
# removing punctuation
punctuation_signs = list("?:!.,;")
for punct_sign in punctuation_signs:
    test_abstract = test_abstract.replace(punct_sign, '')

In [11]:
# removing strings with length 1-2
regex_short = r"\b\w{0,2}\b"
test_abstract = test_abstract.replace(regex_short, "")

In [12]:
# removing strings starting with numbers
regex_short = r"\b[0-9][0-9]*\w\b"
test_abstract = test_abstract.replace(regex_short, "")

In [13]:
print(test_abstract)

gavisunk genome assembly validation via intersunk distances in oxford nanopore reads
 highly contiguous de novo genome assemblies are now feasible for large numbers of species and individuals methods are needed to validate assembly accuracy and detect misassemblies with orthologous sequencing data to allow for confident downstream analyses we developed gavisunk an opensource pipeline that detects misassemblies and produces a set of reliable regions genomewide by assessing concordance of distances between unique kmers in pacific biosciences highfidelity hifi assemblies and raw oxford nanopore technologies reads


In [15]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [16]:
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /home/jovyan/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [17]:
wordnet_lemmatizer = WordNetLemmatizer()
# Iterating through every word to lemmatize
lemmatized_text_list = []
lemmatized_list = []
text_words = test_abstract.split(" ")
# Iterate through every word to lemmatize
for word in text_words:
    lemmatized_list.append(wordnet_lemmatizer.lemmatize(word, pos="v"))
        
# Join the list
lemmatized_text = " ".join(lemmatized_list)
    
# Append to the list containing the texts
lemmatized_text_list.append(lemmatized_text)
df=pd.DataFrame(lemmatized_text_list,columns=["text"])
df["text"] = df["text"].replace("'s", "")

# removing possessive pronoun terminations
#lemmatized_text_list = lemmatized_text_list.replace("'s", "")
# removing english stop words
# Downloading the stop words list
nltk.download('stopwords')
# Loading the stop words in english
stop_words = list(stopwords.words('english'))
# looping through all stop words
for stop_word in stop_words:
    regex_stopword = r"\b" + stop_word + r"\b"
    df["text"] = df["text"].replace(regex_stopword, '')

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [19]:
#preprocessing:entry to tokens, map each token to integers
checkpoint = "dmis-lab/biobert-base-cased-v1.2"

model = BertModel.from_pretrained(checkpoint, output_hidden_states = True,)
tokenizer = BertTokenizer.from_pretrained(checkpoint)

Some weights of the model checkpoint at dmis-lab/biobert-base-cased-v1.2 were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [20]:
def bert_text_preparation(text, tokenizer):
    """Preparing the input for BERT
    
    Takes a string argument and performs
    pre-processing like adding special tokens,
    tokenization, tokens to ids, and tokens to
    segment ids. All tokens are mapped to seg-
    ment id = 1.
    
    Args:
        text (str): Text to be converted
        tokenizer (obj): Tokenizer object
            to convert text into BERT-re-
            adable tokens and ids
        
    Returns:
        list: List of BERT-readable tokens
        obj: Torch tensor with token ids
        obj: Torch tensor segment ids
    
    
    """
    marked_text = "[CLS] " + text + " [SEP]"
    tokenized_text = tokenizer.tokenize(marked_text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    segments_ids = [1]*len(indexed_tokens)

    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])

    return tokenized_text, tokens_tensor, segments_tensors
    
def get_bert_embeddings(tokens_tensor, segments_tensors, model):
    """Get embeddings from an embedding model
    
    Args:
        tokens_tensor (obj): Torch tensor size [n_tokens]
            with token ids for each token in text
        segments_tensors (obj): Torch tensor size [n_tokens]
            with segment ids for each token in text
        model (obj): Embedding model to generate embeddings
            from token and segment ids
    
    Returns:
        list: List of list of floats of size
            [n_tokens, n_embedding_dimensions]
            containing embeddings for each token
    
    """
    
    # Gradient calculation id disabled
    # Model is in inference mode
    with torch.no_grad():
        outputs = model(tokens_tensor, segments_tensors)
        # Removing the first hidden state
        # The first state is the input state
        hidden_states = outputs[2][1:]

    # Getting embeddings from the final BERT layer
    token_embeddings = hidden_states[-1]
    # Collapsing the tensor into 1-dimension
    token_embeddings = torch.squeeze(token_embeddings, dim=0)
    # Converting torchtensors to lists
    list_token_embeddings = [token_embed.tolist() for token_embed in token_embeddings]

    return list_token_embeddings

In [21]:
re.compile('<title>(.*)</title>')
df['text'][0] = unicodedata.normalize('NFKD', df['text'][0]).encode('ascii', 'ignore').decode("utf-8")
df['text'][0] = re.sub(r'[^\w]', ' ', df['text'][0])
df['text'][0] = df['text'][0].encode("ascii", "ignore")
df['text'][0] = df['text'][0].decode()  

In [23]:
# Getting embeddings for the target word in all given contexts
embeddings = []
all_embeddings = []
sentence_embedding = np.empty(768, dtype=object)
c=0

tokenized_text, tokens_tensor, segments_tensors = bert_text_preparation(df['text'][0], tokenizer)
list_token_embeddings = get_bert_embeddings(tokens_tensor, segments_tensors, model)
embeddings.append(list_token_embeddings)


In [24]:
for l in range(len(embeddings)):
    for v in embeddings[l]:
        sentence_embedding=np.vstack((sentence_embedding, v))
    sentence_embedding = np.delete(sentence_embedding, obj=0, axis=0)
    sentence_embedding = (np.mean(sentence_embedding, axis=0)).tolist()
    all_embeddings.append(sentence_embedding)

all_embeddings = np.array(all_embeddings)
all_embeddings = pd.DataFrame(all_embeddings)

all_embeddings.insert(loc=0, column='text', value=df['text'])

In [25]:
all_embeddings.head()

Unnamed: 0,text,0,1,2,3,4,5,6,7,8,...,758,759,760,761,762,763,764,765,766,767
0,gavisunk genome assembly validation via inters...,0.421444,-0.090017,-0.219932,0.168668,0.164829,-0.303141,0.080285,0.183378,-0.027494,...,-0.077531,0.12741,-0.212417,-0.26115,0.108368,-0.106858,-0.093772,0.254505,0.060753,0.120043


In [27]:
res=loaded_model.predict(X)

In [28]:
category_codes = {
    'Sequence alignment': 0,
    'Taxonomic classification': 1,
    'Virus detection': 2,
    'Virus identification': 3,
    'Mapping': 4,
    'Sequence assembly': 5,
    'RNA-seq quantification for abundance estimation': 6,
    'Sequence trimming': 7,
    'Sequencing quality control': 8,
    'Sequence annotation' : 9,
    'SNP-Discovery' : 10,
    'Visualization' : 11,
    'Sequence assembly validation' : 12
}

In [29]:
value = {i for i in category_codes if category_codes[i]==res}
print("The main task of this tool is:",value)

The main task of this tool is: {'Sequence assembly validation'}
