In [1]:
import pandas as pd
import numpy as np

## Step 1 - Extract Case study

In [2]:
import urllib.request
import nltk

user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'


URL = "https://www.cell.com/ajhg/fulltext/S0002-9297(17)30459-7"
#cookies = {'required_cookie': required_value}
headers={'User-Agent':user_agent,} 
cookieProcessor = urllib.request.HTTPCookieProcessor()
opener = urllib.request.build_opener(cookieProcessor)

request=urllib.request.Request(URL,None, headers) #The assembled request
response = opener.open(request,timeout=100)
data = response.read() # The data u need

## Step 2 - Extract Table

In [255]:
from bs4 import BeautifulSoup

soup = BeautifulSoup(data, "lxml")#.text

table = soup.find('table') # what if there are more tables?


In [203]:
def tableDataText(table):    
    """Parses a html segment started with tag <table> followed 
    by multiple <tr> (table rows) and inner <td> (table data) tags. 
    It returns a list of rows with inner columns. 
    Accepts only one <th> (table header/data) in the first row.
    """
    def rowgetDataText(tr, coltag='td'): # td (data) or th (header)       
        return [td.get_text(strip=True) for td in tr.find_all(coltag)]  
    rows = []
    trs = table.find_all('tr')
    headerow = rowgetDataText(trs[0], 'th')
    if headerow: # if there is a header row include first
        rows.append(headerow)
        trs = trs[1:]
    for tr in trs: # for every table row
        rows.append(rowgetDataText(tr, 'td') ) # data row       
    return rows

def parseTable(table, remove_inc=True):
    """
    Convert Table to pandas Dataframe
    
    Input:
        table = html table from article
        remove_inc = remove rows with incosistent length
            (be careful: these can be helpful to categorize table)
    """
    list_table = tableDataText(table)
    dftable = pd.DataFrame(list_table[1:], columns=list_table[0])
    if remove_inc:
        med = np.median(dftable.isnull().sum(axis=1).values)
        dftable = dftable.dropna(thresh=len(dftable.columns)-med)
    return dftable

raw_table = parseTable(table)

In [204]:
raw_table.head()

Unnamed: 0,Case ID,Inheritance,Gender,Age (Years),Prenatal and Neonatal History,PN Growth Retardation,Microcephaly,Motor Delay,Speech Delay,DD/ ID,"Behavioral, Psychiatric, and Neurological Features",Malformations and Physical Anomalies,Additional Comments
1,I,DN,M,4,SGA and feeding difficulties,Y,N,Y,Y,mod,possible absence and focal seizures,"VSD with tortuous aortic arch, horseshoe kidne...","early-onset hypothyroidism, limitation of join..."
2,II,DN,M,7,SGA and feeding difficulties,Y,N,Y,Y,mild,"sociable, empathetic, hand flapping tendency a...","Rt pelvic kidney, Rt inguinal hernia and scoli...","GOR, asthma and allergies."
3,III,DN,M,7,"SGA, polycythaemia, jaundice and hypoglycaemia...",Y,Y,N,Y,Mild,"attention deficit, echolalia and tantrums.","inguinal hernia, cryptorchidism, proximally pl...","perineal and scalp abscesses, recurrent chest ..."
4,IVa,Mat,F,32,SGA,Y,N,Y,Y,mod,empathetic personality.,scoliosis,"glaucoma, asthma, and eczema"
5,IVb,U,F,68,U,U,U,U,U,mild,U,horseshoe kidney with multiple cysts,hiatus hernia


### First Product: Save extracted file

In [279]:
import re

for div in soup.find_all('div', {'class': 'dropBlock__body', 'class': 'inline-table'}):  # , {'class':'Google-Scholar'}
    div.replaceWith('')

for element in ['ul', 'i', 'span', 'li', 'a', "script", "style", "meta", "link", "sup", "select", "option"]: # a 
    for div in soup.find_all(element):  # , {'class':'Google-Scholar'}
        #print(div)
        div.decompose()

for s in soup.select('div'):
    s.get_attribute_list = ''       

title= soup.title.string
title = title[:50].replace(' ', '_')

content = 'DOI: ' + URL + '<br><br>' + str(soup.prettify())
with open("results/0_raw/%s.html" % (title), "w", encoding="utf-8") as file:
    file.write(content)

In [290]:
import re

new_soup = re.sub(r"\<em\>|\</em\>", "", str(soup)) # remove em tags ? (overbodig?)
new_soup = re.sub(r"\<[^\>]+\>", "", new_soup) #remove html tags
new_soup = re.sub(r"\s{3,}", r'<br>', new_soup) # change excessive spaces into a single newline
new_soup = re.sub(r"\.([A-Z])", r'. \1', new_soup) # add whitespace where a new sentence is started
new_soup = re.sub(r"\n", r"<br>", new_soup) # format newlines to <br>

#new_soup

In [291]:
with open("results/0_raw/%s.html" % (title), "w", encoding="utf-8") as file:
    file.write(new_soup)
#m.groups()

In [292]:
# Highlight parts with phenotypes 
# automatically save tables

Main text spanning from: None to -1


### Second Product: Extract text where phenotypes are mentioned (not right/ left things like that)

Perform low resolution Clinphen to check locations of phenotypes. (Without typo correction)
- extensive = ?? sec (>5 min)
- quick ( without typo) 1.7 sec

In [137]:
#soup.get_text()

In [136]:
import importlib as imp

import time
start = time.time()
print("Time elapsed on working...")

items, mr_map = clinphen('results/0_raw/%s.html' % (title),'data', extensive=False)

end = time.time()
print("Time consumed in working: ",end - start)

df_hpo = pd.DataFrame([n.split('\t') for n in items.split('\n')])
df_hpo.columns = df_hpo.iloc[0]
df_hpo = df_hpo.reindex(df_hpo.index.drop(0))
df_hpo.to_csv('results/2_phenotypes/%s.csv' % (title))
df_hpo.head()



Time elapsed on working...
False
Time consumed in working:  1.659013032913208


Unnamed: 0,HPO ID,Phenotype name,No. occurrences,Earliness (lower = earlier),Example sentence
1,HP:0012825,Mild,16,235,congenital cmv infection td td y td td y td td...
2,HP:0011968,Feeding difficulties,13,226,div div div div div figcaption figure div div...
3,HP:0001290,Generalized hypotonia,10,285,td td frequent otisis media gh deficiency and...
4,HP:0001319,Neonatal hypotonia,10,285,td td frequent otisis media gh deficiency and...
5,HP:0100790,Hernia,8,230,td td thoracic kyphosis and bl inguinal herni...


In [162]:
df_hpo.sort_values(by='Earliness (lower = earlier)')

Unnamed: 0,HPO ID,Phenotype name,No. occurrences,Earliness (lower = earlier),Example sentence
6,HP:0001249,Intellectual disability,7,11,div class dropblock reference citations sup s...
71,HP:0000589,Coloboma,1,13,in humans postulated gain of function missense...
36,HP:0000407,Sensorineural hearing impairment,2,14,snhl sensorineural hearing loss
21,HP:0000365,Hearing impairment,3,14,hl hearing loss
37,HP:0008527,Congenital sensorineural hearing impairment,2,14,snhl sensorineural hearing loss
...,...,...,...,...,...
70,HP:0000316,Hypertelorism,2,778,div class dropblock reference citations sup s...
154,HP:0000356,Abnormality of the outer ear,1,842,of note the pli score for em actg1 em is 0 22 ...
155,HP:0000598,Abnormality of the ear,1,842,of note the pli score for em actg1 em is 0 22 ...
72,HP:0012553,Hypoplastic thumbnail,1,90,div class floatdisplay figure class id fig1 t...


In [142]:
#items, mr_map = clinphen('results/0_raw/%s.html' % (title),'data', extensive=False)
mr_map 

defaultdict(set,
            {'': {0,
              1,
              2,
              3,
              4,
              5,
              6,
              7,
              8,
              9,
              10,
              11,
              12,
              13,
              14,
              16,
              17,
              18,
              21,
              22,
              24,
              25,
              27,
              28,
              29,
              30,
              32,
              33,
              34,
              35,
              36,
              37,
              39,
              40,
              41,
              43,
              44,
              45,
              46,
              47,
              49,
              51,
              52,
              53,
              54,
              55,
              56,
              57,
              59,
              60,
              62,
              63,
              64,
              65,
              66,

In [23]:
from clinphen_src import load_all_hpo_synonyms


syns = load_all_hpo_synonyms(hpo_syn_file)

defaultdict(set,
            {'': {0,
              2,
              3,
              5,
              6,
              8,
              9,
              10,
              11,
              12,
              13,
              14,
              15,
              16,
              17,
              18,
              19,
              20,
              21,
              23,
              24,
              25,
              26,
              27,
              28,
              30,
              31,
              33,
              34,
              35,
              36,
              38,
              39,
              42,
              43,
              44,
              45,
              46,
              48,
              49,
              50,
              52,
              53,
              54,
              56,
              57,
              59,
              60,
              61,
              64,
              65,
              66,
              67,
              68,
              

### Import Acronym list
The acronyms were expanded with respect to the following dictionary: https://www.tabers.com/tabersonline/view/Tabers-Dictionary/767492/all/Medical_Abbreviations

In [6]:
import pickle
a_file = open("preprocessing/acronym_list.pkl", "rb")
d_acronyms = pickle.load(a_file)
a_file.close()

In [None]:
medical_record = get_phenotypes.load_medical_record_subsentences(record)

In [161]:
len(content.split('\n'))

for ix, line in enumerate(content.split('<br>')):
    print(ix, line)
    if ix ==17 :
        break
    

0 DOI: https://www.cell.com/ajhg/fulltext/S0002-9297(17)30459-7
1 
2 <!DOCTYPE html>
<html class="pb-page" data-request-id="824da15d-91fa-4c51-825c-33bf2975c22c" lang="en">
 <head data-pb-dropzone="head">
  <!-- if there is any customization for Responsive Project widget  -->
  <title>
   ACTB Loss-of-Function Mutations Result in a Pleiotropic Developmental Disorder: The American Journal of Human Genetics
  </title>
  <!--

    Pagebuild admin UI
-->
  <noscript>
  </noscript>
  <input name="journalCode-ux3-lastin-head" type="hidden" value="ajhg"/>
  <!-- -->
 </head>
 <body class="pb-ui">
  <div class="skip">
  </div>
  <div data-ng-non-bindable="" id="pb-page-content">
   <div data-pb-dropzone="main" data-pb-dropzone-name="Main">
    <div class="ux3" data-widget-def="ux3-layout-widget" data-widget-id="832ad3f3-97ac-45ab-9af1-90c6f7bff54a">
     <div class="ajhg">
      <div data-widget-id="832ad3f3-97ac-45ab-9af1-90c6f7bff54a" data-widget-name="ux3-layout-widget">
       <div>
      

### Expand Acronyms

In [158]:
#from clinphen_src import load_mr_map
#from clinphen_src import load_all_hpo_synonyms

#mr_map =# load_mr_map(medical_record_words)
from collections import defaultdict
import os

safe_ID_to_lines = defaultdict(set)

hpo_syn_file = os.path.join("data/hpo_synonyms.txt")

syns = get_phenotypes.load_all_hpo_synonyms(hpo_syn_file)

l_extract = []
for hpoID in syns.keys():
    for syn in syns[hpoID]:
        syn = re.sub('[^0-9a-zA-Z]+', ' ', syn.lower())
        synTokens = get_phenotypes.alphanum_only(set([syn]))
        if len(synTokens) < 1: continue
        firstToken = list(synTokens)[0]
        lines = set(mr_map[firstToken])
        for token in synTokens:
            lines &= set(mr_map[token])
            if len(lines) < 1: break
        if len(lines) < 1: continue
        l_extract.extend(list(lines))
        print(lines, firstToken) # firstToken
        #for i in lines:
        #    line = " ".join(medical_record_words[i])
        #    flagged = False
        #    for flag in medical_record_flags[i]:
        #        if flag not in synTokens:
        #            flagged = True
        #            break 
        #    if flagged: continue
        #    safe_ID_to_lines[hpoID].add(i)

{1011, 798} large
{194, 780, 785, 185, 30} wide
{194, 780, 785, 185, 30} wide
{1011, 798} large
{356, 413} kyphosis
{229, 252, 749, 300} cryptorchidism
{357, 230, 745, 239, 305, 251} inguinal
{1} abnormality
{1} abnormality
{1} head
{1} head
{720, 309} hydrocephalus
{247} hypoglycaemia
{406} hypoglycemia
{247} hypoglycaemia
{751} palatal
{751} palatal
{281, 362, 271} clinodactyly
{733} dilatation
{353} dysplastic
{353} dysplastic
{65, 755, 174, 23} abnormality
{65, 755, 174, 23} abnormality
{419} finger
{419} tapered
{419} finger
{419} tapered
{735} kidneys
{735} kidneys
{228, 265, 309, 373, 735} kidney
{228, 265, 309, 373, 735} kidney
{788, 13} coloboma
{480, 729} malformations
{480, 729} malformations
{425} contractures
{425} contractures
{425} flexion
{424, 425} contracture
{424} joint
{425} contractures
{425} contractures
{424} joint
{424, 425} contracture
{233, 284} limitation
{233, 284} limitation
{25, 180, 772, 221} growth
{221} growth
{221} growth
{25, 180, 772, 221} growth
{47

In [156]:
l_extract = set(l_extract)
l_extract

{1,
 11,
 13,
 14,
 21,
 22,
 23,
 25,
 29,
 30,
 32,
 54,
 65,
 66,
 90,
 172,
 173,
 174,
 180,
 182,
 184,
 185,
 187,
 190,
 193,
 194,
 196,
 197,
 198,
 200,
 220,
 221,
 224,
 226,
 227,
 228,
 229,
 230,
 231,
 232,
 233,
 234,
 235,
 238,
 239,
 240,
 242,
 243,
 246,
 247,
 248,
 249,
 251,
 252,
 254,
 259,
 260,
 262,
 263,
 265,
 266,
 267,
 271,
 273,
 275,
 276,
 277,
 279,
 281,
 284,
 285,
 286,
 287,
 289,
 290,
 291,
 292,
 295,
 296,
 299,
 300,
 301,
 302,
 303,
 304,
 305,
 306,
 308,
 309,
 310,
 311,
 312,
 313,
 314,
 315,
 316,
 317,
 318,
 320,
 322,
 323,
 324,
 325,
 326,
 327,
 328,
 329,
 331,
 332,
 335,
 336,
 337,
 338,
 339,
 340,
 344,
 345,
 347,
 348,
 351,
 352,
 353,
 354,
 355,
 356,
 357,
 358,
 359,
 360,
 361,
 362,
 363,
 366,
 367,
 370,
 373,
 375,
 376,
 377,
 380,
 381,
 383,
 384,
 388,
 390,
 391,
 393,
 394,
 396,
 397,
 398,
 399,
 400,
 402,
 403,
 405,
 406,
 407,
 408,
 411,
 412,
 413,
 414,
 415,
 416,
 417,
 418,
 419,
 420,
 4

## Step 3 ClinPhen 

In [21]:
from clinphen_src import get_phenotypes
from clinphen_src import src_dir

def load_common_phenotypes(commonFile):
    returnSet = set()
    for line in open(commonFile): returnSet.add(line.strip())
    return returnSet

def clinphen(inputFile, srcDir, extensive=True, custom_thesaurus="", rare=False):
    """
    Employ ClinPhen to infer HPO-codes based on format-free text 
    
    Extensive: perform an extensive search
    """
    #srcDir
    print(extensive)
    hpo_main_names = srcDir + "/hpo_term_names.txt"

    def getNames():
        returnMap = {}
        for line in open(hpo_main_names):
            lineData = line.strip().split("\t")
            returnMap[lineData[0]] = lineData[1]
        return returnMap
    hpo_to_name = getNames()

    inputStr = ""
    for line in open(inputFile): inputStr += line
    if extensive:
        if not custom_thesaurus: returnString = get_phenotypes.extract_phenotypes(inputStr, hpo_to_name, extensive)
        else: returnString = get_phenotypes.extract_phenotypes_custom_thesaurus(inputStr, custom_thesaurus, hpo_to_name)
        if not rare: return returnString
    else:
        if not custom_thesaurus: returnString, mr_map = get_phenotypes.extract_phenotypes(inputStr, hpo_to_name, extensive)
        else: returnString, mr_map = get_phenotypes.extract_phenotypes_custom_thesaurus(inputStr, custom_thesaurus, hpo_to_name)
        if not rare: return returnString, mr_map
    #print('qq')
    items = returnString.split("\n")
    returnList = []
    common = load_common_phenotypes(srcDir + "/common_phenotypes.txt")
    for item in items:
        HPO = item.split("\t")[0]
        if HPO in common: continue
        returnList.append(item)
    if extensive == True:
        return "\n".join(returnList)
    elif extensive == False :
        return "\n".join(returnList), mr_map

items = clinphen('example_data.txt','data', extensive=True)

df_hpo = pd.DataFrame([n.split('\t') for n in items.split('\n')])
df_hpo.columns = df_hpo.iloc[0]
df_hpo = df_hpo.reindex(df_hpo.index.drop(0))
df_hpo.head()

True


Unnamed: 0,HPO ID,Phenotype name,No. occurrences,Earliness (lower = earlier),Example sentence
1,HP:0000006,Autosomal dominant inheritance,1,0,townes brocks syndrome tbs is an autosomal dom...
2,HP:0000356,Abnormality of the outer ear,1,2,major findings include external ear anomalies ...
3,HP:0000598,Abnormality of the ear,1,2,major findings include external ear anomalies ...
4,HP:0000365,Hearing impairment,1,3,major findings include external ear anomalies ...
5,HP:0010442,Polydactyly,1,4,major findings include external ear anomalies ...


## Check header table

In [10]:
l_yes = ['y', 'true', 't', 'yes', '1', 'present', 'p', 'pres'] 
l_no = ['n', 'false', 'f', 'no', '0', 'absent', 'a', 'abs']
l_unknown = ['u', 'unknown', 'na', 'nan'] # , ''


#if all values within at least one of the lists above

In [11]:
l_not = []
for col in raw_table.columns:
    l_val = raw_table[col].values
    for i in l_val:
        val = i.lower()
        if (val not in l_yes and val not in l_no and val not in l_unknown):
            l_not.append(col)
            break
#print(l_not)
l_qualify = list(set(raw_table.columns) - set(l_not))
l_qualify
#np.all([i if (i in l_yes or i in l_no) for i in ])

['Speech Delay', 'Motor Delay', 'Microcephaly', 'PN Growth Retardation']

In [13]:
d_col = {}

for col in l_qualify:
    l_hpo = [] 
    with open('screen_text.txt', 'w') as f:
        f.write(col)
    items = clinphen('screen_text.txt','data')
    df_hpo = pd.DataFrame([n.split('\t') for n in items.split('\n')])
    df_hpo.columns = df_hpo.iloc[0]
    df_hpo = df_hpo.reindex(df_hpo.index.drop(0))
    d_col[col] = list(df_hpo['HPO ID'])

[{'speech', 'delas', 'delaing', 'delay'}]
[{'motor', 'delas', 'delaing', 'delay'}]
[{'microcephaling', 'microcepha', 'microcephaly', 'microcephals'}]
[{'growth', 'pn', 'retardation'}]


In [37]:
d_col

{'Speech Delay': ['HP:0000750'],
 'Motor Delay': ['HP:0001270'],
 'Microcephaly': ['HP:0000252', 'HP:0011451'],
 'PN Growth Retardation': ['HP:0001510']}

In [38]:
def col_hpo(row, d_col):
    #print(row)
    l_hpo = []
    for ix, i in enumerate(row):
        #print(ix)
        col_name = row.keys()[ix]
        val = i.lower()
        if val in l_yes:
            l_hpo.extend(d_col[col_name])
    return l_hpo
raw_table['col_hpo'] = raw_table.apply(lambda x : col_hpo(x, d_col), axis=1)

In [39]:
raw_table

Unnamed: 0,Case ID,Inheritance,Gender,Age (Years),Prenatal and Neonatal History,PN Growth Retardation,Microcephaly,Motor Delay,Speech Delay,DD/ ID,"Behavioral, Psychiatric, and Neurological Features",Malformations and Physical Anomalies,Additional Comments,col_hpo
1,I,DN,M,4,SGA and feeding difficulties,Y,N,Y,Y,mod,possible absence and focal seizures,"VSD with tortuous aortic arch, horseshoe kidne...","early-onset hypothyroidism, limitation of join...","[HP:0001510, HP:0001270, HP:0000750]"
2,II,DN,M,7,SGA and feeding difficulties,Y,N,Y,Y,mild,"sociable, empathetic, hand flapping tendency a...","Rt pelvic kidney, Rt inguinal hernia and scoli...","GOR, asthma and allergies.","[HP:0001510, HP:0001270, HP:0000750]"
3,III,DN,M,7,"SGA, polycythaemia, jaundice and hypoglycaemia...",Y,Y,N,Y,Mild,"attention deficit, echolalia and tantrums.","inguinal hernia, cryptorchidism, proximally pl...","perineal and scalp abscesses, recurrent chest ...","[HP:0001510, HP:0000252, HP:0011451, HP:0000750]"
4,IVa,Mat,F,32,SGA,Y,N,Y,Y,mod,empathetic personality.,scoliosis,"glaucoma, asthma, and eczema","[HP:0001510, HP:0001270, HP:0000750]"
5,IVb,U,F,68,U,U,U,U,U,mild,U,horseshoe kidney with multiple cysts,hiatus hernia,[]
7,V,DN,F,13,SGA and feeding difficulties,Y,N,Y,Y,mod,"sociable personality, mild ventriculomegaly, a...","VSD, PDA, BL 5th finger clinodactyly, BL 2-3 &...",N,"[HP:0001510, HP:0001270, HP:0000750]"
8,VI,U,M,20,N,Y,N,N,Y,mild,stress intolerance.,"short and broad uvula, broad halluces, short d...","frequent otisis media, GH deficiency and limit...","[HP:0001510, HP:0000750]"
9,VII,DN,F,12,"SGA, hypotonia and feeding difficulties",Y,Y,Y,Y,mod,emotional problems and hypotonia,"tricuspid valve dysplasia, 2-3-4 fingers and 2...",BL severe SNHL and dorsal hypertrichosis,"[HP:0001510, HP:0000252, HP:0011451, HP:000127..."
10,VIII,DN,M,7,"SGA, hypotonia and feeding difficulties",Y,N,Y,Y,sev,"Thin CC, septum pellucidum cyst, megacisterna ...","BL CLAP, VSD, Lt extra nipple, hypospadias, UL...",cutis marmorata; additionalde novo1.65 Mb loss...,"[HP:0001510, HP:0001270, HP:0000750]"
11,IX,DN,M,6,hypotonia and feeding difficulties,Y,Y,Y,Y,mod,cortical and subcortical atrophy,atrial septal defect and BL inguinal hernia,GOR,"[HP:0001510, HP:0000252, HP:0011451, HP:000127..."


In [40]:
raw_table.iloc[9]['col_hpo']

['HP:0001510', 'HP:0000252', 'HP:0011451', 'HP:0001270', 'HP:0000750']