In [39]:
from Bio.Entrez import efetch
from dicttoxml import dicttoxml
from IPython.display import clear_output
import json

global entry_data

def query_pubmed(pmid_file):
    with open(pmid_file, "r") as f:
        for pmid in f:
            pmid = pmid.rstrip()
            try:
                handle = efetch(db="pubmed", id=pmid, retmode="xml", rettype="abstract")
            except:
                yield pmid, None
            yield pmid, handle.read()
            
def parse_pubmed_query(raw_text):
    title, abstract, medline = "None", "None", "None"
    raw_text = "".join(raw_text[0].decode("utf-8"))
    title_start_idx, title_end_idx = raw_text.find("<ArticleTitle>") + len("<ArticleTitle>"), raw_text.find("</ArticleTitle>") 
    title = raw_text[title_start_idx:title_end_idx]
    abstract_start_idx, abstract_end_idx = raw_text.find("<AbstractText>") + \
        len("<AbstractText>"), raw_text.find("</AbstractText>") 
    if "<ChemicalList>" in raw_text:
        medline_start_idx, medline_end_idx = raw_text.find("<ChemicalList>") + len("<ChemicalList>"), \
            raw_text.find("</ChemicalList>")
        medline = raw_text[medline_start_idx: medline_end_idx]
    abstract = raw_text[abstract_start_idx:abstract_end_idx]
    
    print(title)
    
    return {"title": title, "abstract": abstract, "medline": medline}
            
def pubmed_query_to_train(pmid_file, train_size=10000):
    global entry_data
    entry_data = dict()
    current_idx = 0
    for pmid, *data in query_pubmed(pmid_file):
        print(current_idx, "/", train_size)
        try:
            data = parse_pubmed_query(data)
        except:
            data = {"title": None, "abstract": None, "medline": None}
        entry_data[pmid] = data
        clear_output(wait=True)
        current_idx += 1
    json_data      = json.dumps(entry_data)
        
    with open("proteomics_output.json", "w+", encoding='utf-8') as out:
        out.write(json_data)

In [33]:
def download_pubmed_data(file_path):
    pubmed_query_to_train('pmid-proteomics-set.txt')
download_pubmed_data("")

NameError: name 'pubmed_query_to_train' is not defined

In [35]:
import json
from unidecode import unidecode

json_data = None

with open("proteomics_output.json", "r", encoding="utf-8") as f:
    json_data = json.loads(f.read(), strict=False)

In [36]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

from tqdm import tqdm
import nltk
from nltk.corpus import stopwords

nltk.download("stopwords")

tokens = []
sentences = []

for pmid in tqdm(json_data.keys()):
    document_abstract = json_data[pmid]["abstract"]
    document_tokens   = word_tokenize(document_abstract)
    tokens.extend(document_tokens)
    
for pmid in tqdm(json_data.keys()):
    document_abstract = json_data[pmid]["abstract"]
    sentences.append(document_abstract)

porter_stemmer    = PorterStemmer()
pre_tf_idf_tokens = []

for word in tokens:
    pre_tf_idf_tokens.append(porter_stemmer.stem(word))

tf_idf_vec_smooth = TfidfVectorizer(use_idf=True,  
                        smooth_idf=True,  
                        ngram_range=(1,1), stop_words='english')

X_unigram = tf_idf_vec_smooth.fit_transform(pre_tf_idf_tokens)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\joshu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
100%|██████████| 10000/10000 [00:10<00:00, 945.05it/s]
100%|██████████| 10000/10000 [00:00<00:00, 1999668.18it/s]


In [37]:
tf_idf_vec_smooth_bigram = TfidfVectorizer(use_idf=True,
                                  smooth_idf=True,
                                  ngram_range=(1,2), stop_words='english')

X_unigram_bigram = tf_idf_vec_smooth_bigram.fit_transform(pre_tf_idf_tokens)

In [103]:
from numpy import count_nonzero

def calculate_sparsity(sparse_matrix):
    sparsity = 1.0 - ( count_nonzero(sparse_matrix) / float(sparse_matrix.size) )
    return sparsity

In [128]:
from nltk.corpus import wordnet
import string
import re
from nltk.stem import WordNetLemmatizer

words = set(nltk.corpus.words.words())

def is_english_token(token):
    if not wordnet.synsets(token):
        print(token)
        
        
translator=str.maketrans('','',string.punctuation)
lemmatizer = WordNetLemmatizer()

def preprocess_sentence(sentence, translator):
    sentence = sentence.lower()
    if 'doctype' in sentence: #flagging badly encoded JSON documents
        return ""
    out = sentence.translate(translator)
    out = re.sub("\d+\.?\d+?", "",  out)
    out = re.sub("[^\u0000-\u05C0\u2100-\u214F]+", "", out)
    
    #greek aware lemmatizer required for proper lemmatization
    out = " ".join([lemmatizer.lemmatize(token) for token in nltk.word_tokenize(out) if len(token) > 2])
    
    return out

In [129]:
preprocessed_sentences = [preprocess_sentence(sentence, translator) for sentence in sentences]
preprocessed_sentences

['proteomics involves the application technology for the identication and quantication overall protein present content cell tissue organism supplement the other omics technology such genomic and transcriptomics expound the identity protein organism and cognize the structure and function particular protein proteomicsbased technology are utilized various capacity for different research setting such detection various diagnostic marker candidate for vaccine production understanding pathogenicity mechanism alteration expression pattern response different signal and interpretation functional protein pathway different disease proteomics practically intricate because includes the analysis and categorization overall protein signature genome mass spectrometry with lcmsms and malditoftof being widely used equipment the central among current proteomics however utilization proteomics facility including the software for equipment database and the requirement skilled personnel substantially increase 

In [130]:
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer(ngram_range=(1,1),
                           stop_words='english')

count_data = count_vectorizer.fit_transform(preprocessed_sentences)
 
cv_dataframe=pd.DataFrame(count_data.toarray(), columns=count_vectorizer.get_feature_names())
cv_dataframe.shape

(10000, 45423)

### Probabilistic Threshing - Dictionary Size
    1. Chemical Compound Lists

In [131]:
cv_dataframe

Unnamed: 0,14βdglucosidic,1a1,1a1b,1a1blight,1acid,1alpha,1alpha1,1alpha2beta4beta7beta7hydroxidi2thienylacetyloxydimethyl3oxa9azoniatricyclononane,1amino,1antichymotrypsin,...,μlml,μm,μmdiameter,μmol,μmolg,μmsup2sup,μmthick,οphenanthroline,ρlt,σsupssup
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [132]:
calculate_sparsity(cv_dataframe.to_numpy())

0.9984647513374282

In [125]:
tf_idf_vec_smooth = TfidfVectorizer(use_idf=True,  
                        smooth_idf=True,  
                        ngram_range=(1,1), stop_words='english')

X_unigram = tf_idf_vec_smooth.fit_transform(preprocessed_sentences)
cv_dataframe_tfidf=pd.DataFrame(X_unigram.toarray(), columns=count_vectorizer.get_feature_names())
cv_dataframe_tfidf.shape

(10000, 45970)

In [136]:
chemical_list_data = []

count = 0
for pmid in tqdm(json_data.keys()):
    chemicals = json_data[pmid]["medline"]
    chemical_list_data.append(chemicals)
    if "Proteome" in chemicals:
        count += 1
chemical_list_data

100%|██████████| 10000/10000 [00:00<00:00, 1428237.14it/s]


['<Chemical><RegistryNumber>0</RegistryNumber><NameOfSubstance UI="D020543">Proteome</NameOfSubstance></Chemical>',
 '<Chemical><RegistryNumber>0</RegistryNumber><NameOfSubstance UI="D020543">Proteome</NameOfSubstance></Chemical>',
 '<Chemical><RegistryNumber>0</RegistryNumber><NameOfSubstance UI="D001426">Bacterial Proteins</NameOfSubstance></Chemical><Chemical><RegistryNumber>0</RegistryNumber><NameOfSubstance UI="D015415">Biomarkers</NameOfSubstance></Chemical><Chemical><RegistryNumber>0</RegistryNumber><NameOfSubstance UI="D020543">Proteome</NameOfSubstance></Chemical>',
 '<Chemical><RegistryNumber>0</RegistryNumber><NameOfSubstance UI="D020543">Proteome</NameOfSubstance></Chemical>',
 '<Chemical><RegistryNumber>0</RegistryNumber><NameOfSubstance UI="D020543">Proteome</NameOfSubstance></Chemical>',
 '<Chemical><RegistryNumber>0</RegistryNumber><NameOfSubstance UI="D015415">Biomarkers</NameOfSubstance></Chemical><Chemical><RegistryNumber>0</RegistryNumber><NameOfSubstance UI="D02054

In [127]:
cv_dataframe_tfidf.head()

Unnamed: 0,0h,11,14,14βdglucosidic,1a,1a1,1a1b,1a1blight,1acid,1alpha,...,μmol,μmolg,μmsup2sup,μmthick,οphenanthroline,ππ,ρ0,ρlt,σsupssup,χ²
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
