In [11]:
import time
import os
from tqdm import tqdm
import mmap
from math import sqrt
import random
import json
import spacy
import spacy.cli
import string
import nltk
import csv
nltk.download('stopwords')
import pandas as pd
from nltk.corpus import stopwords
from sklearn.metrics import auc
import gensim
from gensim.matutils import softcossim
from gensim.models import Word2Vec, KeyedVectors
from sklearn.metrics import classification_report
import numpy as np
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import csv
import multiprocessing
import logging
# Send INFO-level (and above) log records to the notebook output, timestamped.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# All inputs and outputs of this notebook live under "<current working dir>/data".
# NOTE: this is resolved once, when the cell runs; changing the working
# directory afterwards does not update root_path.
root_path = os.path.join(os.getcwd(), 'data')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Load the pre-trained word vectors stored in binary word2vec format under root_path.
model = KeyedVectors.load_word2vec_format(
    os.path.join(root_path, 'PubMed-and-PMC-w2v.bin'),
    binary=True,
)
# Precompute L2-normalised vectors; replace=True overwrites the raw vectors
# with the normalised ones (per the gensim 3.x API), so the originals are
# no longer available after this call.
model.init_sims(replace=True)

2022-04-15 17:40:31,295 : INFO : loading projection weights from /content/gdrive/My Drive/OA_proj/data/PubMed-and-PMC-w2v.bin
2022-04-15 17:41:57,382 : INFO : loaded (4087446, 200) matrix from /content/gdrive/My Drive/OA_proj/data/PubMed-and-PMC-w2v.bin
2022-04-15 17:41:57,385 : INFO : precomputing L2-norms of word weight vectors


In [9]:
# ########################## Function Timing ###############################
## Usage: put @timing on the line directly above a function definition (def) to print how long each call takes.

def timing(f):
    """Decorator that prints how long each call to ``f`` takes.

    The wrapped function is called with the same positional and keyword
    arguments and its return value is passed through unchanged. After a
    successful call, one line of the form
    ``"<name> function took <seconds> (s)"`` is printed. If ``f`` raises,
    the exception propagates and nothing is printed.

    Args:
        f: The callable to time.

    Returns:
        A wrapper around ``f`` that keeps ``f``'s ``__name__``, ``__doc__``
        and other metadata.
    """
    # Imported locally so this cell does not depend on edits to the import cell.
    from functools import wraps

    @wraps(f)  # keep the decorated function's name/docstring for help() and debugging
    def wrap(*args, **kwargs):
        # perf_counter is monotonic and high-resolution, so the measured
        # interval cannot be distorted by system clock adjustments.
        started = time.perf_counter()
        ret = f(*args, **kwargs)
        elapsed = time.perf_counter() - started
        print('{:s} function took {:.4f} (s)'.format(f.__name__, elapsed))
        return ret
    return wrap

############################################################################

# Shared text-cleaning resources used by single_text_clean().

# spaCy English pipeline with the tagger, parser and NER components switched off.
nlp = spacy.load(
    'en_core_web_sm',
    disable=['tagger', 'parser', 'ner'],
)

# NLTK English stop-word list.
stop_words = stopwords.words('english')

# NOTE: this is a single str of ASCII punctuation characters, not a list,
# so `x in punctuations` is a substring test.
punctuations = string.punctuation

def single_text_clean(doc):
    """Tokenise ``doc`` and return its cleaned lemmas.

    The text is run through the module-level spaCy pipeline ``nlp``. Each
    token's lemma is whitespace-stripped and kept unless it is:

    * the pronoun placeholder ``'-PRON-'``,
    * an entry of ``stop_words`` (exact, case-sensitive match), or
    * a substring of ``punctuations``. Because ``punctuations`` is a str,
      this also drops empty lemmas (``''`` is a substring of every string).

    Args:
        doc: The text to clean.

    Returns:
        A list of lemma strings in their original order.
    """
    lemmas = []
    for token in nlp(doc):
        # The placeholder check is made on the raw (unstripped) lemma.
        if token.lemma_ == '-PRON-':
            continue
        lemmas.append(token.lemma_.strip())

    return [
        lemma
        for lemma in lemmas
        if not (lemma in stop_words or lemma in punctuations)
    ]

def _onto_code(uri, delim, strip_hyphens=False):
    """Return the lower-cased part of ``uri`` after the first ``delim``, with '_' removed.

    Hyphens are removed as well only when ``strip_hyphens`` is true.
    Raises ValueError if ``delim`` does not occur in ``uri``.
    """
    code = uri[uri.index(delim) + 1:].replace('_', '')
    if strip_hyphens:
        code = code.replace('-', '')
    return code.lower()


def _onto_relation(uri, delim, reltype):
    """Build one entry of a concept's 'related' list from a target URI."""
    return {'code': _onto_code(uri, delim), 'reltype': reltype}


def _onto_label_embedding(label_text, w2v_model):
    """Return the mean word vector of the label's in-vocabulary tokens as a list.

    Falls back to an all-zero vector of length ``w2v_model.vector_size`` when
    cleaning leaves no tokens or none of them is in the model vocabulary.
    """
    vectors = [
        w2v_model[tok]
        for tok in single_text_clean(label_text)
        if tok in w2v_model.vocab
    ]
    if not vectors:
        return np.zeros(w2v_model.vector_size).tolist()
    return np.mean(vectors, axis=0).tolist()


def _onto_add_labels(entry, label_tags, w2v_model):
    """Append the normalised text and embedding of every label tag to ``entry``."""
    for tag in label_tags:
        text = (
            tag.get_text()
            .replace(',', '')
            .replace('_', ' ')
            .replace('-', ' ')
            .replace('/', ' or ')
            .lower()
        )
        embedding = _onto_label_embedding(text, w2v_model)
        entry['label'].append(text)
        entry['embed'].append(embedding)


@timing
def ontology2json(rfile, delim, filenames, w2v_model):
    """Convert ontology XML files into per-concept dictionaries and save them as JSON.

    For each input file, every ``Class`` element carrying an ``rdf:about``
    attribute and every ``Description`` element becomes one entry keyed by
    its concept code (the part of ``rdf:about`` after ``delim``). Each entry
    has the shape::

        {'label': [str, ...],          # normalised label texts
         'embed': [[float, ...], ...], # one mean word vector per label
         'alignable': 1 or 0,          # 1 for Class, 0 for Description
         'related': [{'code': str, 'reltype': str}, ...]}

    ``related`` is filled for classes only, in this order:
    ``hasRelatedSynonym``, ``subClassOf`` (``owl#Thing`` excluded),
    ``RestrictedsubClassOf`` (from ``someValuesFrom``) and ``hasDefinition``.

    NOTE(review): class keys have both '_' and '-' removed, whereas related
    codes and Description keys only have '_' removed. This is kept as in the
    original output; confirm whether identifiers can contain '-' and, if so,
    whether these should be normalised the same way.

    Args:
        rfile: Iterable of paths of the ontology files to read.
        delim: Separator between the namespace and the local id in a URI
            (for example ``'#'``).
        filenames: Output file names, one per input file, written below
            ``root_path``; indexed in parallel with ``rfile``.
        w2v_model: Word-vector model offering ``.vocab``, ``.vector_size``
            and item lookup by token.

    Returns:
        A list with one concept dictionary per input file, in input order.

    Raises:
        KeyError: If a ``hasRelatedSynonym``, ``hasDefinition`` or
            ``Description`` element lacks the attribute it is read from.
        ValueError: If a URI does not contain ``delim``.
        IndexError: If ``filenames`` is shorter than ``rfile``.
    """
    dicts = []
    for i, rf in enumerate(rfile):
        # Read-only mode is sufficient (the file is never written), and the
        # with-block releases the handle even if reading fails.
        with open(rf, 'rb') as infile:
            contents = infile.read()

        soup = BeautifulSoup(contents, 'lxml-xml')
        classes = soup.find_all('Class')
        desc = soup.find_all('Description')
        jsondict = {}

        for cl in tqdm(classes):
            # Classes without rdf:about have no usable id and are skipped.
            if not cl.has_attr('rdf:about'):
                continue

            ss = _onto_code(cl.attrs['rdf:about'], delim, strip_hyphens=True)  # concept code
            entry = {'label': [], 'embed': [], 'alignable': 1, 'related': []}
            jsondict[ss] = entry
            _onto_add_labels(entry, cl.find_all('label'), w2v_model)

            related = entry['related']
            for rs in cl.find_all('hasRelatedSynonym'):
                related.append(
                    _onto_relation(rs.attrs['rdf:resource'], delim, 'hasRelatedSynonym'))

            for sc in cl.find_all('subClassOf'):
                if not sc.has_attr('rdf:resource'):
                    continue
                target = sc.attrs['rdf:resource']
                # The universal root class carries no information.
                if target != 'http://www.w3.org/2002/07/owl#Thing':
                    related.append(_onto_relation(target, delim, 'subClassOf'))

            for scc in cl.find_all('someValuesFrom'):
                if scc.has_attr('rdf:resource'):
                    related.append(
                        _onto_relation(scc.attrs['rdf:resource'], delim, 'RestrictedsubClassOf'))

            for hd in cl.find_all('hasDefinition'):
                related.append(
                    _onto_relation(hd.attrs['rdf:resource'], delim, 'hasDefinition'))

        # Description elements are stored as non-alignable entries holding
        # labels only; an entry with the same key replaces an earlier one.
        for ds in desc:
            dsid = _onto_code(ds.attrs['rdf:about'], delim)
            entry = {'label': [], 'embed': [], 'alignable': 0, 'related': []}
            jsondict[dsid] = entry
            _onto_add_labels(entry, ds.find_all('label'), w2v_model)

        dicts.append(jsondict)
        with open(os.path.join(root_path, filenames[i]), 'w') as fp:
            json.dump(jsondict, fp, indent=4)
    return dicts

@timing
def ontology2json_ref(rfile, delim, filename):
    """Extract reference alignment pairs from XML files into one CSV file.

    Despite the name, the output is CSV, not JSON. For every ``Cell``
    element, the ``rdf:resource`` of its first ``entity1`` and first
    ``entity2`` child are reduced to concept codes (text after ``delim``,
    '_' and '-' removed, lower-cased) and written as one two-column row.
    Rows from all input files go into the same output file, without a header.

    Args:
        rfile: Iterable of paths of the alignment files to read.
        delim: Separator between the namespace and the local id in a URI
            (for example ``'#'``).
        filename: Name of the CSV file to create below ``root_path``; an
            existing file is overwritten.

    Returns:
        None.

    Raises:
        IndexError: If a ``Cell`` has no ``entity1`` or ``entity2`` element.
        KeyError: If such an element has no ``rdf:resource`` attribute.
        ValueError: If a resource URI does not contain ``delim``.
    """
    def concept_code(uri):
        # Same normalisation for both sides of a mapping.
        return uri[uri.index(delim) + 1:].replace('_', '').replace('-', '').lower()

    # with-blocks guarantee both the output and each input handle are closed
    # (and the CSV flushed) even if parsing a file raises part-way through.
    with open(os.path.join(root_path, filename), 'w', newline='') as outfile:
        out_writer = csv.writer(outfile, delimiter=',')

        for rf in rfile:
            # Inputs are only read, so open them read-only.
            with open(rf, 'rb') as infile:
                contents = infile.read()
            soup = BeautifulSoup(contents, 'lxml-xml')

            for mp in tqdm(soup.find_all('Cell')):
                en1 = mp.find_all('entity1')
                en2 = mp.find_all('entity2')
                out_writer.writerow([
                    concept_code(en1[0].attrs['rdf:resource']),
                    concept_code(en2[0].attrs['rdf:resource']),
                ])
    return


In [10]:
# Convert the two anatomy ontologies to JSON (one output file per ontology).
readfile = [
    os.path.join(root_path, 'anatomy', 'human.owl'),
    os.path.join(root_path, 'anatomy', 'mouse.owl'),
]
filenames = ['human.json', 'mouse.json']
jdict = ontology2json(readfile, '#', filenames, model)

# Convert the reference alignment to CSV.
# NOTE: `readfile` is deliberately rebound here, so after this cell it holds
# only the reference file path.
readfile = [os.path.join(root_path, 'anatomy', 'reference.rdf')]
ontology2json_ref(readfile, '#', 'ref.csv')

100%|██████████| 3304/3304 [00:01<00:00, 3185.40it/s]
100%|██████████| 2744/2744 [00:00<00:00, 4329.97it/s]


ontology2json function took 14.1137 (s)


100%|██████████| 1516/1516 [00:00<00:00, 35331.25it/s]

ontology2json_ref function took 1.0247 (s)



