In [1]:
import ipywidgets as widgets
from IPython.display import clear_output
from ipywidgets import interact, interactive, fixed, interact_manual
from ipywidgets import Layout, Button, Box, FloatProgress
from ipyfilechooser import FileChooser


In [2]:
from file_builder.research_aspect_classifier import ResearchAspectClassifier

import IPython
store = IPython.get_ipython().find_line_magic('store')

import re
import spacy
import scispacy
from scispacy.umls_linking import UmlsEntityLinker

import ktrain
import csv

import pickle

import os
import zipfile

import shutil


## GLOBAL VARIABLES

In [3]:
# cord_uid values of the small paper subset used by the "test" build.
test_uids = [
    'wtvfow2f', 'jja2wcfz', 'd22fs0fx', '36h7zzlo', 'klv9jdm8', 'ujnspv4j', 'n0uzbda0',
    's1hj89kj', '0tpmz9uy', '11minoss', 'bk601l5a', 'v6jyplcn', 'b5u5hp2r', '6bq0tps2',
    'hgpgeel6', 'puvdw0ci', 'uym826bh', '0qaoam29', '8qp8o2g2', 'xwzichc3', 't3cpzmwj',
    'jeglryus', 'z2lv280w', 'stqj3ue5', 'fc338qdt', '0084poiq', '00cef9ee', 'puc13jf1',
]

In [4]:
# Candidate macro-topic labels (only referenced by the commented-out zero-shot
# classification step in Builder.return_docs_dict).
MACRO_TOPICS = [
    'Coronavirus',
    'Public health and epidemics',
    'Molecular biology',
    'Influenza',
    'Immunology',
    'Rotavirus',
    'Antivirals',
    'Clinical trials',
    'Testing and diagnosing',
]

In [5]:
# Labels handed to ResearchAspectClassifier and offered in the aspect selector.
RESEARCH_ASPECTS = [
    "Background",
    "Purpose",
    "Method",
    "Finding_Contribution",
    "Other",
]

In [6]:
# The research aspects without "Other".
# NOTE(review): `av` is not referenced in the visible cells — confirm whether it is still needed.
av = [
    "Background",
    "Purpose",
    "Method",
    "Finding_Contribution",
]

In [None]:
# Notebook-level scispaCy pipeline with the UMLS entity linker appended as its
# last component. `linker` is also read directly by later cells.
linker = UmlsEntityLinker(resolve_abbreviations=True)
nlp_sentences = spacy.load('en_core_sci_lg')
nlp_sentences.add_pipe(linker)

In [None]:
# Smoke test for the linker: run one phrase through the pipeline and print the
# id and canonical name of the first UMLS candidate of its first entity.
doc = nlp_sentences('Bulbo-Spinal Atrophy')
entity = doc.ents[0]

# Each candidate is indexed with [0] to obtain the key used by cui_to_entity.
top_concept = linker.umls.cui_to_entity[entity._.umls_ents[0][0]]
print(top_concept.concept_id)
print(top_concept.canonical_name)

# To list every candidate instead of only the first one:
#for umls_ent in entity._.umls_ents:
#    print(linker.umls.cui_to_entity[umls_ent[0]].canonical_name)

## JSON builder and topic builder

In [None]:
class Builder:
    """Builds the per-paper documents dictionary and the topic models.

    The work is split in three steps:
    ``return_dict_uids`` reads the metadata CSV, ``return_docs_dict`` runs the
    NLP pipeline over every abstract, and ``return_topic_models`` trains one
    topic model per (paper group, research aspect).
    """

    def __init__(self, metadata, progress_bar):
        """Load the models used by the build.

        Args:
            metadata: iterable of CSV lines (e.g. an open text file) with at
                least the columns ``cord_uid``, ``title``, ``abstract`` and
                ``publish_time``.
            progress_bar: object with a numeric ``value`` attribute; it is
                advanced by 25 here and by a total of 75 in ``return_docs_dict``.
        """
        self.metadata = metadata
        self.progress_bar = progress_bar
        self.entries = {}
        self.nlp_sentences = spacy.load('en_core_sci_lg')
        # The freshly loaded pipeline has no entity linker, so without this the
        # per-entity UMLS candidate lists read in return_docs_dict stay empty.
        # The notebook-level `linker` is reused instead of building a second one.
        self.linker = linker
        self.nlp_sentences.add_pipe(self.linker)
        # NOTE(review): only used by the commented-out macro-topic step below.
        self.zsl = ktrain.text.ZeroShotClassifier()
        #self.linker = EntityLinker(resolve_abbreviations=True, name="umls")

        # NOTE(review): RESEARCH_ASPECTS also contains 'Whole' once the setup
        # cell has run — confirm the classifier is meant to receive it.
        self.rac = ResearchAspectClassifier(RESEARCH_ASPECTS)
        progress_bar.value += 25

    def return_dict_uids(self, uids=None):
        """Read the metadata CSV into ``{cord_uid: {title, abstract, publish_time}}``.

        Args:
            uids: optional collection of cord_uids to keep; ``None`` keeps every row.

        Returns:
            dict keyed by cord_uid. When a uid appears in several rows the last
            row wins.
        """
        wanted = None if uids is None else set(uids)
        papers = {}

        for row in csv.DictReader(self.metadata):
            uid = row['cord_uid']
            if wanted is not None and uid not in wanted:
                continue
            papers[uid] = {'title': row['title'], 'abstract': row['abstract'], 'publish_time': row['publish_time']}

        print(len(papers))
        return papers

    def return_docs_dict(self, papers):
        """Process every abstract and store the result in ``self.entries``.

        For each paper the abstract is split into sentences, the sentences are
        classified by research aspect, the abstract is flagged as COVID-related
        when it mentions "COVID" (case-insensitive) and the first UMLS candidate
        of every linked entity is recorded.

        A paper whose processing raises is reported and skipped so that one bad
        abstract does not abort the whole build.

        Args:
            papers: dict as returned by ``return_dict_uids``.

        Returns:
            ``self.entries`` (uid -> processed document).
        """
        # Guard against an empty selection (previously a ZeroDivisionError).
        progress_paper = 75.0 / float(len(papers)) if papers else 0.0

        for uid, paper in papers.items():
            try:
                abstract_nlp = self.nlp_sentences(paper['abstract'])
                sentences = [sent.text.strip() for sent in abstract_nlp.sents]

                classified_abstract = self.rac.classify_abstract(sentences)

                # 'COVID' also matches 'COVID-19', so one search is enough.
                is_covid = bool(re.search('COVID', paper['abstract'], re.IGNORECASE))

                umls_ents = []
                # Iterate over the entity spans themselves: the linker results
                # live on the span, not on its text.
                for ent in abstract_nlp.ents:
                    candidates = ent._.umls_ents
                    if not candidates:
                        # Entity without any UMLS candidate: nothing to record.
                        continue
                    concept = self.linker.umls.cui_to_entity[candidates[0][0]]
                    umls_ents.append({'id': concept.concept_id, 'name': concept.canonical_name})

                #if(len(sentences) > 0):
                #    macro_topic = self.zsl.predict(sentences, topic_strings=MACRO_TOPICS, include_labels=True)[0][0]
                #else:
                #    macro_topic = ""

                self.entries[uid] = {'title': paper['title'], 'abstract': classified_abstract, 'is_covid': is_covid, 'umls_ents': umls_ents, 'publish_time': paper['publish_time']}
            except Exception as e:
                # Best effort: report which paper failed and keep going.
                print('Skipping paper %s: %s' % (uid, e))

            # Use the bar given to this instance, not a notebook global.
            self.progress_bar.value += progress_paper

        print('== Done Building Docs Dictionary ==')
        return self.entries

    @staticmethod
    def _whole_text(classified_abstract):
        """Rebuild the full abstract text from its per-aspect sentences.

        Presumably every sentence entry is ``(position, text)``; entries are
        ordered by their first element and joined with single spaces, the same
        separator used for the per-aspect texts.
        """
        by_position = {}
        for label in RESEARCH_ASPECTS:
            # .get: RESEARCH_ASPECTS may hold labels (e.g. 'Whole') that the
            # classified abstract has no entry for.
            for sentence in classified_abstract.get(label, []):
                by_position[sentence[0]] = sentence[1]
        return ' '.join(by_position[key] for key in sorted(by_position))

    def return_topic_models(self, docs_dict, aspects_to_train):
        """Train one topic model per paper group and research aspect.

        Args:
            docs_dict: dict as returned by ``return_docs_dict``.
            aspects_to_train: iterable of aspect names; the special name
                ``'Whole'`` stands for the complete abstract.

        Returns:
            ``(topic_models, uids, build_docs)``, each keyed by
            ``'covid_papers'`` / ``'all_papers'``. ``topic_models`` and
            ``build_docs`` are further keyed by aspect; an aspect whose model
            could not be trained is missing from ``topic_models`` and keeps an
            empty list in ``build_docs``.
        """
        groups = ('covid_papers', 'all_papers')
        topic_models = {group: {} for group in groups}
        texts = {group: {aspect: [] for aspect in aspects_to_train} for group in groups}
        uids = {group: [] for group in groups}
        build_docs = {group: {aspect: [] for aspect in aspects_to_train} for group in groups}

        for uid, doc in docs_dict.items():
            # COVID papers belong to both groups, every other paper only to 'all_papers'.
            targets = groups if doc['is_covid'] else ('all_papers',)
            for group in targets:
                uids[group].append(uid)

            for aspect in aspects_to_train:
                if aspect == 'Whole':
                    text = self._whole_text(doc['abstract'])
                else:
                    text = ' '.join([sentence[1] for sentence in doc['abstract'][aspect]])

                for group in targets:
                    texts[group][aspect].append(text)

        for classification, aspects in texts.items():
            for aspect, docs in aspects.items():
                try:
                    topic_models[classification][aspect] = ktrain.text.get_topic_model(docs, min_df=0, lda_max_iter=1, verbose=False, n_features=30000)
                    build_docs[classification][aspect] = docs
                except Exception as e:
                    # Best effort: a group/aspect without usable text is skipped.
                    print(e)

        return topic_models, uids, build_docs
                        

In [None]:
def zipdir(path, ziph):
    """Write every file found below ``path`` into the open zip handle ``ziph``.

    Files are added under the path produced by joining the walked folder with
    the file name.
    """
    for folder, _subfolders, filenames in os.walk(path):
        for filename in filenames:
            ziph.write(os.path.join(folder, filename))

def create_dir(path):
    """Make sure the directory ``path`` exists and report what happened.

    Missing parent directories are created as well. An already existing
    directory is not an error (the build is routinely re-run in the same
    working directory). Failures are printed, not raised.
    """
    if os.path.isdir(path):
        print ("Directory %s already exists" % path)
        return

    try:
        os.makedirs(path, exist_ok=True)
    except OSError:
        print ("Creation of the directory %s failed" % path)
    else:
        print ("Successfully created the directory %s " % path)
    

def build_files(metadata, progress_bar, aspects, is_test):
    """Run the whole build and pack the result into ``built_files_cord19ktool.zip``.

    Written below ``built_files/``:
      * ``docs_dict.p`` — pickled documents dictionary;
      * ``<group>/uids.p`` — pickled uid list of each paper group;
      * ``<group>/<aspect>/<aspect>`` — saved topic model;
      * ``<group>/<aspect>/docs.p`` — pickled texts the model was trained on.

    Args:
        metadata: iterable of CSV lines passed on to ``Builder``.
        progress_bar: progress widget passed on to ``Builder``.
        aspects: aspect names to train topic models for.
        is_test: when true only the papers listed in ``test_uids`` are processed.

    NOTE(review): ``built_files`` is not emptied first, so files left by an
    earlier run with other aspects also end up in the archive — confirm whether
    that is wanted.
    """
    builder = Builder(metadata, progress_bar)

    create_dir('built_files')

    papers = builder.return_dict_uids(test_uids if is_test else None)
    docs_dict = builder.return_docs_dict(papers)

    # Development shortcut: reuse the docs dictionary of a previous build
    # instead of recomputing it.
    #docs_dict = {}
    #with zipfile.ZipFile('built_files_cord19ktool.zip') as zf:
    #    with zf.open('built_files/docs_dict.p') as docs:
    #        docs_dict = pickle.load( docs )

    topic_models, uids, build_docs = builder.return_topic_models(docs_dict, aspects)

    with open('built_files/docs_dict.p', 'wb') as fp:
        pickle.dump(docs_dict, fp, protocol=pickle.HIGHEST_PROTOCOL)

    # A distinct loop name keeps the `aspects` argument intact.
    for classification, models_by_aspect in topic_models.items():
        class_dir = 'built_files/' + classification
        create_dir(class_dir)

        with open(class_dir + '/uids.p', 'wb') as fp:
            pickle.dump(uids[classification], fp, protocol=pickle.HIGHEST_PROTOCOL)

        for aspect, model in models_by_aspect.items():
            print(aspect)
            aspect_dir = class_dir + '/' + aspect
            create_dir(aspect_dir)
            model.save(aspect_dir + '/' + aspect)

            with open(aspect_dir + '/docs.p', 'wb') as fp:
                pickle.dump(build_docs[classification][aspect], fp, protocol=pickle.HIGHEST_PROTOCOL)

    # The context manager closes the archive even when zipping fails.
    with zipfile.ZipFile('built_files_cord19ktool.zip', 'w', zipfile.ZIP_DEFLATED) as zipf:
        zipdir('built_files', zipf)
    

## BUILD THE FILES (DOCS DICTIONARY, TOPIC MODELS, ZIP ARCHIVE) FROM THE METADATA.CSV FILE

In [None]:

# Offer the complete abstract ('Whole') as an extra choice in the aspect
# selector. Guarded so that re-running this cell does not append it again.
if 'Whole' not in RESEARCH_ASPECTS:
    RESEARCH_ASPECTS.append('Whole')


In [None]:
def enable_btns_setup_server():
    """Make both build buttons clickable."""
    for button in (btn_setup_server, btn_setup_server_test):
        button.disabled = False
    
import time



    
# --- Widgets -----------------------------------------------------------------
title = widgets.Label(value='Upload the metadata.csv from CODA-19 to build files')

aspects = widgets.SelectMultiple(
    options=RESEARCH_ASPECTS,
    description='Research Aspects to train',
    disabled=False
)

# Both build buttons stay disabled until a file has been chosen.
btn_setup_server = widgets.Button(
    description='BUILD FILES',
    disabled=True,
    button_style='',
    icon='gear'
)

btn_setup_server_test = widgets.Button(
    description='BUILD FILES FOR TEST PAPERS',
    disabled=True,
    button_style='',
    icon='gear'
)

progress_bar = FloatProgress(min=0, max=100)


# --- Callbacks ---------------------------------------------------------------
def selected_file(fc):
    """File chooser callback: a file was picked, so the build can be started."""
    enable_btns_setup_server()


def _run_build(is_test):
    """Open the chosen metadata file and run the build on it.

    Args:
        is_test: forwarded to ``build_files``; true restricts the build to the
            test papers.
    """
    # Start from zero so that a second build does not push the bar past its maximum.
    progress_bar.value = 0
    display(progress_bar)

    metadata_path = os.path.join(fc.selected_path, fc.selected_filename)
    # newline='' is what the csv module asks for on files it reads; the
    # encoding is pinned so the result does not depend on the platform default.
    # NOTE(review): assumes the metadata file is UTF-8 — confirm.
    with open(metadata_path, newline='', encoding='utf-8') as cord19:
        build_files(cord19, progress_bar, aspects.value, is_test)


def pressed_build_files(btn):
    """Click handler of the 'BUILD FILES' button (all papers)."""
    _run_build(False)


def pressed_build_files_test(btn):
    """Click handler of the 'BUILD FILES FOR TEST PAPERS' button."""
    _run_build(True)


# --- Wiring and display --------------------------------------------------------
fc = FileChooser('./')
display(fc)
fc.register_callback(selected_file)


btn_setup_server.on_click(pressed_build_files)
btn_setup_server_test.on_click(pressed_build_files_test)

#display(title)
display(aspects)
display(btn_setup_server)
display(btn_setup_server_test)

In [None]:
# Sanity check: number of papers in the test subset.
len(test_uids)