In [1]:
import json
import re
import sys
import time
from datetime import datetime

import requests

def post_json_request(url, obj):
    """POST ``obj`` as a JSON body to ``url`` and return the decoded JSON reply.

    Nothing is caught here: any exception raised while sending the request or
    while decoding the reply propagates to the caller.
    """
    response = requests.post(url, json=obj)
    return response.json()

In [2]:
# Send the search request to the query service on port 5000 and keep only the
# "exit" field of the first element of its reply; the elapsed wall-clock time
# is printed afterwards.
database = "test10"
project = "adc_cali"
start_time = time.time()
pubmedResult = post_json_request(
    url="http://localhost:5000/query",
    obj=dict(
        database=database,
        project=project,
        maxdocs=50,
        patternid=1,
        query="breast[Title/Abstract] AND cancer[Title/Abstract]",
    ),
)[0]["exit"]
print(f"--- {time.time() - start_time} seconds ---")

--- 1109.2847278118134 seconds ---


In [3]:
# Show the "exit" value taken from the first element of the query reply.
print(pubmedResult)

0


In [4]:
# Request the document list of the project's "<project>_metadata_global"
# collection from the service on port 5001, then show the field names of the
# first document.
metadata_documents = post_json_request(
    url="http://localhost:5001/mongo-doc-list",
    obj=dict(db_name=database, coll_name=f"{project}_metadata_global"),
)
metadata_documents[0].keys()

dict_keys(['_id', 'title', 'abstract', 'authors', 'dbid', 'doi', 'lang', 'org', 'pat_id', 'url', 'year'])

In [5]:
# Request the document list of the project's
# "<project>_author_vs_doc_id_global" collection from the service on port
# 5001, then show the field names of the first document.
authors_documents = post_json_request(
    url="http://localhost:5001/mongo-doc-list",
    obj=dict(db_name=database, coll_name=f"{project}_author_vs_doc_id_global"),
)
authors_documents[0].keys()

dict_keys(['_id', 'author', 'doc_id', 'doi'])

In [6]:
# One E-utilities "elink" URL per metadata document, built from its `dbid`
# field (a document without that key yields "id=None").
list(
    map(
        lambda document: f"http://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=pubmed&id={document.get('dbid')}&retmode=ref&cmd=prlinks",
        metadata_documents,
    )
)

['http://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=pubmed&id=36933187&retmode=ref&cmd=prlinks',
 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=pubmed&id=36933062&retmode=ref&cmd=prlinks',
 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=pubmed&id=36933050&retmode=ref&cmd=prlinks',
 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=pubmed&id=36932957&retmode=ref&cmd=prlinks',
 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=pubmed&id=36932886&retmode=ref&cmd=prlinks',
 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=pubmed&id=36932743&retmode=ref&cmd=prlinks',
 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=pubmed&id=36932670&retmode=ref&cmd=prlinks',
 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=pubmed&id=36932388&retmode=ref&cmd=prlinks',
 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=pubmed&id=36932252&retmode=ref&cmd=prlinks',
 'http://e

In [7]:
# Print the `dbid` of every metadata document as one comma-separated line.
print(
    ", ".join(
        document.get('dbid')
        for document in metadata_documents
    )
)

36933187, 36933062, 36933050, 36932957, 36932886, 36932743, 36932670, 36932388, 36932252, 36932246, 36932232, 36932156, 36932003, 36931740, 36931265, 36931171, 36931164, 36931128, 36931039, 36931031, 36930999, 36930943, 36930854, 36930833, 36930678, 36930677, 36930555, 36930460, 36930419, 36930347, 36930346, 36930345, 36930344, 36930147, 36930083, 36929983, 36929975, 36929946, 36929942, 36929848, 36929759, 36929748, 36929572, 36929295, 36929288, 36929229, 36929087, 36928988, 36928951, 36928924, 36940464, 36940420, 36940301, 36940196, 36940038, 36939982, 36939976, 36939902, 36939876, 36939778, 36939607, 36939483, 36939453, 36939445, 36939381, 36939370, 36939293, 36939143, 36939123, 36938944, 36938928, 36938826, 36938720, 36938678, 36938303, 36938225, 36938187, 36937987, 36937848, 36937820, 36937603, 36937585, 36937550, , 36937444, 36937408, 36937402, 36937398, 36937388, 36937381, 36936994, 36936536, 36936429, 36936420, 36936412, 36936362, 36936274, 36936256


In [8]:
# Reduce every metadata document to a dict that holds only its "url" field.
urlList = [{"url": document["url"]} for document in metadata_documents]
urlList

[{'url': None},
 {'url': 'https://link.springer.com/content/pdf/10.1007/s00018-023-04734-7.pdf'},
 {'url': None},
 {'url': None},
 {'url': None},
 {'url': None},
 {'url': None},
 {'url': None},
 {'url': None},
 {'url': None},
 {'url': None},
 {'url': None},
 {'url': None},
 {'url': None},
 {'url': None},
 {'url': None},
 {'url': None},
 {'url': None},
 {'url': None},
 {'url': None},
 {'url': None},
 {'url': None},
 {'url': None},
 {'url': None},
 {'url': 'http://www.plosone.org/article/fetchObject.action?uri=info:doi/10.1371/journal.pcbi.1010939&representation=PDF'},
 {'url': None},
 {'url': None},
 {'url': None},
 {'url': None},
 {'url': None},
 {'url': None},
 {'url': 'https://link.springer.com/content/pdf/10.1007/s10549-022-06815-w.pdf'},
 {'url': None},
 {'url': None},
 {'url': None},
 {'url': 'http://www.thelancet.com/pdfs/journals/lancet/PIIS0140-6736(22)02249-8.pdf'},
 {'url': 'http://www.thelancet.com/pdfs/journals/lancet/PIIS0140-6736(22)02241-3.pdf'},
 {'url': None},
 {'url':

In [9]:
# Percentage of entries in `urlList` whose "url" is set. An empty `urlList`
# (for example when the query returned no documents) gives 0.0 instead of
# raising ZeroDivisionError.
availablePDFs = (
    sum(1 for url_dict in urlList if url_dict["url"]) / len(urlList) * 100
    if urlList
    else 0.0
)
print(f"avaliable pdfs: {availablePDFs}%")

avaliable pdfs: 35.714285714285715%


In [10]:
def url2head(url):
    """Ask the local ``url2htext`` service for the header text of the document at ``url``.

    Parameters
    ----------
    url : str
        Address of the document; the calls in this notebook pass PDF links.

    Returns
    -------
    dict
        The decoded JSON reply of the service; this notebook reads its
        ``"htext"`` entry. If the request raises (service down, reply that is
        not JSON, ...), ``{"htext": ""}`` is returned instead so that a batch
        over many URLs keeps going.
    """
    try:
        return post_json_request(
            "http://localhost:5002/url2htext",
            { "url": url }
        )
    except Exception:
        # Deliberately not a bare `except:`: KeyboardInterrupt and SystemExit
        # must still propagate so a long-running batch can be stopped.
        return {"htext": ""}
# Try the helper on a single PDF link; the reply is shown as the cell output.
url2head("http://europepmc.org/backend/ptpmcrender.fcgi?accid=PMC9983066&blobtype=pdf")

{'htext': 'MEDICINE INTERNATIONAL 3: 11, 2023\n\nThe combination of positive anti‑WDR1 antibodies with\nnegative anti‑CFL1 antibodies in serum is a poor prognostic\nfactor for patients with esophageal carcinoma\nMASAAKI ITO1, SATOSHI YAJIMA2, TAKASHI SUZUKI2, YOKO OSHIMA2, TATSUKI NANAMI2,\nMAKOTO SUMAZAKI2, FUMIAKI SHIRATORI2, HAO WANG3,4, LIUBING HU4, HIROTAKA TAKIZAWA5,\nSHU‑YANG LI6, YASUO IWADATE6, TAKAKI HIWASA1,6 and HIDEAKI SHIMADA1,2\n1\n\nDepartment of Clinical Oncology, Toho University Graduate School of Medicine; 2Department of Gastroenterological Surgery,\nToho University School of Medicine, Tokyo 143‑8541, Japan; 3Stroke Center, The First Affiliated Hospital, Jinan University;\n4\nDepartment of Anesthesiology, Stroke Center, The First Affiliated Hospital and Health Science Center, Jinan University,\nGuangzhou, Guangdong 510630, P.R. China; 5Port Square Kashiwado Clinic, Kashiwado Memorial Foundation,\nChiba 260‑0025; 6Department of Neurological Surgery, Graduate School of

In [11]:
# Drop the entries whose "url" is None and keep the rest in their original order.
urls = [entry for entry in urlList if entry["url"] != None]
urls

[{'url': 'https://link.springer.com/content/pdf/10.1007/s00018-023-04734-7.pdf'},
 {'url': 'http://www.plosone.org/article/fetchObject.action?uri=info:doi/10.1371/journal.pcbi.1010939&representation=PDF'},
 {'url': 'https://link.springer.com/content/pdf/10.1007/s10549-022-06815-w.pdf'},
 {'url': 'http://www.thelancet.com/pdfs/journals/lancet/PIIS0140-6736(22)02249-8.pdf'},
 {'url': 'http://www.thelancet.com/pdfs/journals/lancet/PIIS0140-6736(22)02241-3.pdf'},
 {'url': 'http://europepmc.org/backend/ptpmcrender.fcgi?accid=PMC10020071&blobtype=pdf'},
 {'url': 'http://europepmc.org/backend/ptpmcrender.fcgi?accid=PMC10020396&blobtype=pdf'},
 {'url': 'https://link.springer.com/content/pdf/10.1007/s00428-023-03527-4.pdf'},
 {'url': 'https://link.springer.com/content/pdf/10.1007/s00421-023-05177-5.pdf'},
 {'url': 'http://europepmc.org/backend/ptpmcrender.fcgi?accid=PMC10017099&blobtype=pdf'},
 {'url': 'http://europepmc.org/backend/ptpmcrender.fcgi?accid=PMC10017101&blobtype=pdf'},
 {'url': 'ht

In [12]:
# Fetch the header text for every entry of `urls`, one service call after the
# other, and print how long the whole pass took.
start_time = time.time()
headers = [url2head(entry["url"])["htext"] for entry in urls]
print(f"--- {time.time() - start_time} seconds ---")
headers

--- 1131.2687497138977 seconds ---


['Cellular and Molecular Life Sciences\n(2023) 80:100\nhttps://doi.org/10.1007/s00018-023-04734-7\n\nCellular and Molecular Life Sciences\n\nORIGINAL ARTICLE\n\nLoss of Kmt2c in vivo leads to EMT, mitochondrial dysfunction\nand improved response to lapatinib in breast cancer\nNikiana Simigdala1 · Anna Chalari1 · Aimilia D. Sklirou2 · Evangelia Chavdoula1,3,4 · George Papafotiou1 ·\nPelagia Melissa1 · Aimilia Kafalidou1 · Nikolaos Paschalidis1 · Ioannis S. Pateras5 · Emmanouil Athanasiadis1 ·\nDimitris Konstantopoulos6 · Ioannis P. Trougakos2 · Apostolos Klinakis1\nReceived: 4 May 2022 / Revised: 22 January 2023 / Accepted: 22 February 2023\n© The Author(s) 2023\n\nAbstract\nDeep sequencing of human tumours has uncovered a previously unappreciated role for epigenetic regulators in tumorigenesis.\nH3K4 methyltransferase KMT2C/MLL3 is mutated in several solid malignancies, including more than 10% of breast tumours.\nTo study the tumour suppressor role of KMT2C in breast cancer, we generat

In [13]:
def text2locations(text):
    """Ask the local ``text2locations`` service for the locations found in ``text``.

    Parameters
    ----------
    text : str
        Free text to analyse; this notebook passes article header text.

    Returns
    -------
    dict
        The decoded JSON reply of the service; this notebook reads its
        ``"locations"`` entry. If the request raises, ``{"locations": []}``
        is returned instead so that a batch over many texts keeps going.
    """
    try:
        return post_json_request(
            "http://localhost:5002/text2locations",
            { "text": text }
        )
    except Exception:
        # Deliberately not a bare `except:`: KeyboardInterrupt and SystemExit
        # must still propagate so a long-running batch can be stopped.
        return {"locations": []}
# Try the helper on the header text of one sample article; the reply is shown
# as the cell output.
text2locations("Exploration of Targeted Anti-tumor Therapy\nOpen Access Review\n\nAn overview of the anti-cancer actions of Tanshinones, derived\nfrom Salvia miltiorrhiza (Danshen)\nIrum Naz1, Myriam Merarchi2, Shanaya Ramchandani3, Muhammad Rashid Khan4*, Muhammad Nouman\nMalik1, Sumaira Sarwar1, Acharan S Narula5, Kwang Seok Ahn6*\nDepartment of Biochemistry, Faculty of Biological Sciences, Quaid-i-Azam University, Islamabad 45320, Pakistan\n\n1\n\nFaculty of Pharmacy, University of Paris Descartes, 75006 Paris, France\n\n2\n\nDepartment of Pharmacology-Biomedicine, The University of Melbourne, Parkville, VIC 3010, Australia\n\n3\n\nHigher Education Commission of Pakistan, Islamabad 44000, Pakistan\n\n4\n\nNarula Research, Chapel Hill, NC 27516, USA\n\n5\n\nDepartment of Science in Korean Medicine, College of Korean Medicine, Kyung Hee University, 24 Kyungheedae-ro,\nDongdaemun-gu, Seoul 02447, South Korea\n6\n\n*Correspondence: Muhammad Rashid Khan, Department of Biochemistry, Faculty of Biological Sciences, Quaid-i-Azam\nUniversity, Islamabad 45320, Pakistan. mrkhanqau@yahoo.com; Kwang Seok Ahn, Department of Science in Korean Medicine,\nCollege of Korean Medicine, Kyung Hee University, 24 Kyungheedae-ro, Dongdaemun-gu, Seoul 02447, South Korea. ksahn@\nkhu.ac.kr\nAcademic Editor: Gautam Sethi, National University of Singapore, Singapore\nReceived: April 16, 2020 Accepted: May 17, 2020 Published: June 29, 2020\n\nCite this article: Naz I, Merarchi M, Ramchandani S, Khan MR, Malik MN, Sarwar S, et al. An overview of the anti-cancer\nactions of Tanshinones, derived from Salvia miltiorrhiza (Danshen). Explor Target Antitumor Ther. 2020;1:153-70. https://doi.\norg/10.37349/etat.2020.00010\n\n")

{'locations': ['Tanshinones',
  'Salvia',
  'Biochemistry',
  'Pakistan',
  'Pharmacy',
  'Paris Descartes',
  'Paris',
  'France',
  'Melbourne',
  'Parkville',
  'Australia',
  'Pakistan Pakistan',
  'Science',
  'Korean Medicine',
  'Korean Medicine Seoul',
  'South',
  'Biochemistry Pakistan',
  'Science Korean Medicine',
  'Korean Medicine Seoul South Korea',
  'Singapore',
  'Tanshinones Salvia']}

In [15]:
# Run the location lookup over every header text and print how long the whole
# pass took.
start_time = time.time()
locations = [text2locations(header_text)["locations"] for header_text in headers]
print(f"--- {time.time() - start_time} seconds ---")
locations

--- 5.018128871917725 seconds ---


[['Cellular',
  'Author',
  'Athens',
  'Athens Greece',
  'Biophysics',
  'Biology',
  'Athens Athens',
  'Greece',
  'Cancer',
  'Genetics',
  'Columbus',
  'Columbus Pathology',
  'Athens Athens Greece'],
 ['Bayesian',
  'Mathematics',
  'Hong Kong',
  'Hong Kong China',
  'Mathematics Shenzhen',
  'China',
  'Life',
  'Northeastern University',
  'China China',
  'Chengdu',
  'China Ching',
  'Bayesian UNITED STATES',
  'China Bayesian'],
 ['Author',
  'New Zealand',
  'Epidemiology',
  'Biostatistics',
  'Auckland',
  'Auckland New',
  'Oncology',
  'Oncology Hamilton',
  'New',
  'NIDEA',
  'Demographic',
  'Waikato',
  'Hamilton'],
 ['Embase',
  'Nursing',
  'English',
  'Canada',
  'Australia',
  'Lebanon',
  'England',
  'Dublin',
  'Ireland',
  'Dublin Ireland',
  'Ireland Irish'],
 ['England',
  'Scotland',
  'Mean',
  'London',
  'Nutrition',
  'Health',
  'Medicine',
  'Brazil',
  'Cancer',
  'France'],
 ['Cancer',
  'U.S.',
  'US',
  'Guam',
  'Guam Hawai',
  'Guam Filipi

In [27]:
def text2places(text):
    """Ask the local ``text2places`` service for the places found in ``text``.

    Parameters
    ----------
    text : str
        Free text to analyse; this notebook passes article header text.

    Returns
    -------
    dict
        The decoded JSON reply of the service; this notebook reads its
        ``"places"`` entry. If the request raises, ``{"places": []}`` is
        returned instead so that a batch over many texts keeps going.
    """
    try:
        return post_json_request(
            "http://localhost:5002/text2places",
            { "text": text }
        )
    except Exception:
        # Deliberately not a bare `except:`: KeyboardInterrupt and SystemExit
        # must still propagate so a long-running batch can be stopped.
        return {"places": []}
# Try the helper on the header text of one sample article; the reply is shown
# as the cell output.
text2places("Exploration of Targeted Anti-tumor Therapy\nOpen Access Review\n\nAn overview of the anti-cancer actions of Tanshinones, derived\nfrom Salvia miltiorrhiza (Danshen)\nIrum Naz1, Myriam Merarchi2, Shanaya Ramchandani3, Muhammad Rashid Khan4*, Muhammad Nouman\nMalik1, Sumaira Sarwar1, Acharan S Narula5, Kwang Seok Ahn6*\nDepartment of Biochemistry, Faculty of Biological Sciences, Quaid-i-Azam University, Islamabad 45320, Pakistan\n\n1\n\nFaculty of Pharmacy, University of Paris Descartes, 75006 Paris, France\n\n2\n\nDepartment of Pharmacology-Biomedicine, The University of Melbourne, Parkville, VIC 3010, Australia\n\n3\n\nHigher Education Commission of Pakistan, Islamabad 44000, Pakistan\n\n4\n\nNarula Research, Chapel Hill, NC 27516, USA\n\n5\n\nDepartment of Science in Korean Medicine, College of Korean Medicine, Kyung Hee University, 24 Kyungheedae-ro,\nDongdaemun-gu, Seoul 02447, South Korea\n6\n\n*Correspondence: Muhammad Rashid Khan, Department of Biochemistry, Faculty of Biological Sciences, Quaid-i-Azam\nUniversity, Islamabad 45320, Pakistan. mrkhanqau@yahoo.com; Kwang Seok Ahn, Department of Science in Korean Medicine,\nCollege of Korean Medicine, Kyung Hee University, 24 Kyungheedae-ro, Dongdaemun-gu, Seoul 02447, South Korea. ksahn@\nkhu.ac.kr\nAcademic Editor: Gautam Sethi, National University of Singapore, Singapore\nReceived: April 16, 2020 Accepted: May 17, 2020 Published: June 29, 2020\n\nCite this article: Naz I, Merarchi M, Ramchandani S, Khan MR, Malik MN, Sarwar S, et al. An overview of the anti-cancer\nactions of Tanshinones, derived from Salvia miltiorrhiza (Danshen). Explor Target Antitumor Ther. 2020;1:153-70. https://doi.\norg/10.37349/etat.2020.00010\n\n")

{'places': [['Pakistan', 4],
  ['Singapore', 2],
  ['France', 1],
  ['Australia', 1],
  ['United States of America', 1],
  ['South Korea', 1],
  ['Canada', 1],
  ['India', 1],
  ['Sweden', 1],
  ['New Zealand', 1],
  ['Denmark', 1],
  ['Indonesia', 1]]}

In [28]:
# Run the place lookup over every header text and print how long the whole
# pass took.
start_time = time.time()
places = [text2places(header_text)["places"] for header_text in headers]
print(f"--- {time.time() - start_time} seconds ---")
places

--- 163.43850755691528 seconds ---


[[['Greece', 4], ['United States of America', 2], ['Canada', 1], ['Spain', 1]],
 [["People's Republic of China", 6],
  ['United States of America', 1],
  ['Japan', 1],
  ['India', 1],
  ['Mexico', 1]],
 [['New Zealand', 2],
  ['Denmark', 1],
  ['Canada', 1],
  ['United States of America', 1],
  ['United Kingdom', 1],
  ['Australia', 1],
  ['Spain', 1],
  ['Kazakhstan', 1],
  ['France', 1],
  ['India', 1]],
 [['Ireland', 3],
  ['United States of America', 1],
  ['Canada', 1],
  ['Australia', 1],
  ['Lebanon', 1],
  ['Vietnam', 1],
  ['Belarus', 1],
  ['Netherlands', 1],
  ['Ukraine', 1]],
 [['United Kingdom', 2],
  ['Brazil', 1],
  ['France', 1],
  ['Cameroon', 1],
  ['Canada', 1],
  ['United States of America', 1],
  ['Serbia', 1],
  ['Spain', 1],
  ['Hungary', 1],
  ['Portugal', 1],
  ['Indonesia', 1],
  ['Ukraine', 1]],
 [['United States of America', 4], ['India', 1], ['Spain', 1], ['Japan', 1]],
 [['Argentina', 10]],
 [['Egypt', 4],
  ['United States of America', 1],
  ['Australia',

In [23]:
import geograpy
# Try geograpy directly on a web page and print, one per line, the countries
# it reports followed by its country, region and city mention lists.
# Note: `places` is bound to the geograpy result object here.
url = 'http://www.gutenberg.ca/ebooks/doyleac-casebookofsherlockholmes/doyleac-casebookofsherlockholmes-00-h.html'
places = geograpy.get_place_context(url=url)
print(
    places.countries,
    places.country_mentions,
    places.region_mentions,
    places.city_mentions,
    sep="\n",
)

['South Africa', 'Canada', 'United States of America', 'Turkey', 'United Kingdom', 'Czech Republic', 'Japan', 'Australia', 'Spain', 'Tanzania', 'Norway', 'France', 'Italy', 'East Timor', 'Philippines', 'New Zealand', 'Sweden', 'Indonesia', 'Serbia', 'Mexico', 'Guyana', 'Brazil', 'Libya', 'Argentina', 'Netherlands', 'Mali', 'Pakistan', 'Ukraine', 'Bosnia and Herzegovina']
[('Canada', 3), ('South Africa', 2), ('United States of America', 1), ('Turkey', 1), ('United Kingdom', 1), ('Czech Republic', 1), ('Japan', 1), ('Australia', 1), ('Spain', 1), ('Tanzania', 1), ('Norway', 1), ('France', 1), ('Italy', 1), ('East Timor', 1), ('Philippines', 1), ('New Zealand', 1), ('Sweden', 1), ('Indonesia', 1), ('Serbia', 1), ('Mexico', 1), ('Guyana', 1), ('Brazil', 1), ('Libya', 1), ('Argentina', 1), ('Netherlands', 1), ('Mali', 1), ('Pakistan', 1), ('Ukraine', 1), ('Bosnia and Herzegovina', 1)]
[('Baker', 2), ('Liverpool', 1), ('Eastern', 1), ('Briton', 1), ('British', 1), ('Yuan', 1), ('Adelbert', 1

In [26]:
import geograpy
# Try geograpy on the fifth header text. The third printed line is the first
# item of the first `country_mentions` entry.
# Note: `places` is bound to the geograpy result object here.
text = headers[4]
places = geograpy.get_place_context(text=text)
print(places.countries, places.country_mentions, sep="\n")
print(places.country_mentions[0][0])
print(places.region_mentions, places.city_mentions, sep="\n")

['Brazil', 'United Kingdom', 'France', 'Cameroon', 'Canada', 'United States of America', 'Serbia', 'Spain', 'Hungary', 'Portugal', 'Indonesia', 'Ukraine']
[('United Kingdom', 2), ('Brazil', 1), ('France', 1), ('Cameroon', 1), ('Canada', 1), ('United States of America', 1), ('Serbia', 1), ('Spain', 1), ('Hungary', 1), ('Portugal', 1), ('Indonesia', 1), ('Ukraine', 1)]
United Kingdom
[('Center', 3), ('São Paulo', 1), ('Wales', 1), ('England', 1), ('Scotland', 1)]
[('São Paulo', 2), ('England', 1), ('Scotland', 1), ('Wales', 1), ('London', 1), ('Center', 1), ('Health', 1), ('University', 1), ('Brazil', 1), ('Cancer', 1), ('Lyon', 1)]


In [29]:
def text2ner(text):
    """Ask the local ``text2ner`` service to tag ``text``.

    Parameters
    ----------
    text : str
        Free text to analyse; this notebook passes article header text.

    Returns
    -------
    dict
        The decoded JSON reply of the service. The reply recorded in this
        notebook stores its result under the key ``"entities"``, so the
        fallback returned when the request raises uses that same key
        (``{"entities": []}``) and callers can index both cases alike.
    """
    try:
        return post_json_request(
            "http://localhost:5002/text2ner",
            { "text": text }
        )
    except Exception:
        # Deliberately not a bare `except:`: KeyboardInterrupt and SystemExit
        # must still propagate so a long-running batch can be stopped.
        return {"entities": []}
# Try the helper on the header text of one sample article; the reply is shown
# as the cell output.
text2ner("Exploration of Targeted Anti-tumor Therapy\nOpen Access Review\n\nAn overview of the anti-cancer actions of Tanshinones, derived\nfrom Salvia miltiorrhiza (Danshen)\nIrum Naz1, Myriam Merarchi2, Shanaya Ramchandani3, Muhammad Rashid Khan4*, Muhammad Nouman\nMalik1, Sumaira Sarwar1, Acharan S Narula5, Kwang Seok Ahn6*\nDepartment of Biochemistry, Faculty of Biological Sciences, Quaid-i-Azam University, Islamabad 45320, Pakistan\n\n1\n\nFaculty of Pharmacy, University of Paris Descartes, 75006 Paris, France\n\n2\n\nDepartment of Pharmacology-Biomedicine, The University of Melbourne, Parkville, VIC 3010, Australia\n\n3\n\nHigher Education Commission of Pakistan, Islamabad 44000, Pakistan\n\n4\n\nNarula Research, Chapel Hill, NC 27516, USA\n\n5\n\nDepartment of Science in Korean Medicine, College of Korean Medicine, Kyung Hee University, 24 Kyungheedae-ro,\nDongdaemun-gu, Seoul 02447, South Korea\n6\n\n*Correspondence: Muhammad Rashid Khan, Department of Biochemistry, Faculty of Biological Sciences, Quaid-i-Azam\nUniversity, Islamabad 45320, Pakistan. mrkhanqau@yahoo.com; Kwang Seok Ahn, Department of Science in Korean Medicine,\nCollege of Korean Medicine, Kyung Hee University, 24 Kyungheedae-ro, Dongdaemun-gu, Seoul 02447, South Korea. ksahn@\nkhu.ac.kr\nAcademic Editor: Gautam Sethi, National University of Singapore, Singapore\nReceived: April 16, 2020 Accepted: May 17, 2020 Published: June 29, 2020\n\nCite this article: Naz I, Merarchi M, Ramchandani S, Khan MR, Malik MN, Sarwar S, et al. An overview of the anti-cancer\nactions of Tanshinones, derived from Salvia miltiorrhiza (Danshen). Explor Target Antitumor Ther. 2020;1:153-70. https://doi.\norg/10.37349/etat.2020.00010\n\n")

{'entities': [['Exploration', 'NN'],
  ['of', 'IN'],
  [['Targeted', 'NNP']],
  ['Anti-tumor', 'NNP'],
  ['Therapy', 'NNP'],
  ['Open', 'NNP'],
  ['Access', 'NNP'],
  ['Review', 'NNP'],
  ['An', 'DT'],
  ['overview', 'NN'],
  ['of', 'IN'],
  ['the', 'DT'],
  ['anti-cancer', 'JJ'],
  ['actions', 'NNS'],
  ['of', 'IN'],
  [['Tanshinones', 'NNP']],
  [',', ','],
  ['derived', 'VBD'],
  ['from', 'IN'],
  [['Salvia', 'NNP']],
  ['miltiorrhiza', 'NN'],
  ['(', '('],
  [['Danshen', 'NNP']],
  [')', ')'],
  [['Irum', 'NNP'], ['Naz1', 'NNP']],
  [',', ','],
  [['Myriam', 'NNP'], ['Merarchi2', 'NNP']],
  [',', ','],
  [['Shanaya', 'NNP'], ['Ramchandani3', 'NNP']],
  [',', ','],
  [['Muhammad', 'NNP'], ['Rashid', 'NNP'], ['Khan4', 'NNP']],
  ['*', 'NNP'],
  [',', ','],
  [['Muhammad', 'NNP'], ['Nouman', 'NNP'], ['Malik1', 'NNP']],
  [',', ','],
  [['Sumaira', 'NNP'], ['Sarwar1', 'NNP']],
  [',', ','],
  [['Acharan', 'NNP'], ['S', 'NNP'], ['Narula5', 'NNP']],
  [',', ','],
  [['Kwang', 'NNP'], ['S

In [18]:
import time
def text2emails(text):
    """Ask the local ``text2emails`` service for the e-mail addresses found in ``text``.

    Parameters
    ----------
    text : str
        Free text to analyse; this notebook passes article header text.

    Returns
    -------
    dict
        The decoded JSON reply of the service; this notebook reads its
        ``"emails"`` entry. If the request raises, ``{"emails": []}`` is
        returned instead so that a batch over many texts keeps going.
    """
    try:
        return post_json_request(
            "http://localhost:5002/text2emails",
            { "text": text }
        )
    except Exception:
        # Deliberately not a bare `except:`: KeyboardInterrupt and SystemExit
        # must still propagate so a long-running batch can be stopped.
        return {"emails": []}
# Try the helper on the header text of one sample article; the reply is shown
# as the cell output.
text2emails("Exploration of Targeted Anti-tumor Therapy\nOpen Access Review\n\nAn overview of the anti-cancer actions of Tanshinones, derived\nfrom Salvia miltiorrhiza (Danshen)\nIrum Naz1, Myriam Merarchi2, Shanaya Ramchandani3, Muhammad Rashid Khan4*, Muhammad Nouman\nMalik1, Sumaira Sarwar1, Acharan S Narula5, Kwang Seok Ahn6*\nDepartment of Biochemistry, Faculty of Biological Sciences, Quaid-i-Azam University, Islamabad 45320, Pakistan\n\n1\n\nFaculty of Pharmacy, University of Paris Descartes, 75006 Paris, France\n\n2\n\nDepartment of Pharmacology-Biomedicine, The University of Melbourne, Parkville, VIC 3010, Australia\n\n3\n\nHigher Education Commission of Pakistan, Islamabad 44000, Pakistan\n\n4\n\nNarula Research, Chapel Hill, NC 27516, USA\n\n5\n\nDepartment of Science in Korean Medicine, College of Korean Medicine, Kyung Hee University, 24 Kyungheedae-ro,\nDongdaemun-gu, Seoul 02447, South Korea\n6\n\n*Correspondence: Muhammad Rashid Khan, Department of Biochemistry, Faculty of Biological Sciences, Quaid-i-Azam\nUniversity, Islamabad 45320, Pakistan. mrkhanqau@yahoo.com; Kwang Seok Ahn, Department of Science in Korean Medicine,\nCollege of Korean Medicine, Kyung Hee University, 24 Kyungheedae-ro, Dongdaemun-gu, Seoul 02447, South Korea. ksahn@\nkhu.ac.kr\nAcademic Editor: Gautam Sethi, National University of Singapore, Singapore\nReceived: April 16, 2020 Accepted: May 17, 2020 Published: June 29, 2020\n\nCite this article: Naz I, Merarchi M, Ramchandani S, Khan MR, Malik MN, Sarwar S, et al. An overview of the anti-cancer\nactions of Tanshinones, derived from Salvia miltiorrhiza (Danshen). Explor Target Antitumor Ther. 2020;1:153-70. https://doi.\norg/10.37349/etat.2020.00010\n\n")

{'emails': ['mrkhanqau@yahoo.com']}

In [40]:
# Run the e-mail lookup over every header text and print how long the whole
# pass took.
start_time = time.time()
emails = [text2emails(header_text)["emails"] for header_text in headers]
print(f"--- {time.time() - start_time} seconds ---")
emails

--- 0.1497042179107666 seconds ---


[['nsimigdala@bioacademy.gr',
  'konstantopoulos@fleming.gr',
  'aklinakis@bioacademy.gr',
  'itrougakos@biol.uoa.gr',
  'achalari@bioacademy.gr',
  'asklirou@biol.uoa.gr',
  'echavdoula@gmail.com',
  'geo.papafotiou@gmail.com',
  'pmelissa@bioacademy.gr',
  'akafalidou@bioacademy.gr',
  'npaschal@bioacademy.gr',
  'ipateras@med.uoa.gr',
  'mathan@bioacademy.gr'],
 ['yushanqiu2526374@163.com'],
 ['o.scott@auckland.ac.nz'],
 ['kathleen.frazer@ucd.ie'],
 ['chu-mei.chang@imperial.ac.uk'],
 ['monicake@hawaii.edu',
  'kristimh@hawaii.edu',
  'taflague@triton.uog.edu',
  'bernice7@hawaii.edu',
  'kaholoku@hawaii.edu'],
 ['zhanglei6@hrbmu.edu.cn'],
 ['dina.elgendy3@med.tanta.edu.eg'],
 ['t.hureau@unistra.fr'],
 ['hr.zheng@siat.ac.cn', 'meilin7@mail.sysu.edu.cn'],
 ['xinzhang@hmfl.ac.cn'],
 ['alinebandeirageriatra@gmail.com'],
 ['drmdrockybhasan@gmail.com'],
 ['sirinihao@163.com'],
 [],
 ['2017202040046@whu.edu.cn', 'xiaochong@stu.cdutcm.edu.cn'],
 [],
 [],
 ['aaron.wheeler@utoronto.ca'],
 ['n

In [53]:
# Distinct e-mail domains: the part between the first and second "@" of every
# address, collected across all per-document groups.
domains = {
    email.split("@")[1]
    for group_emails in emails
    for email in group_emails
}
domains

{'126.com',
 '163.com',
 'auckland.ac.nz',
 'bioacademy.gr',
 'biol.uoa.gr',
 'csu.edu.cn',
 'cu.edu.eg',
 'ejust.edu.eg',
 'fjmu.edu.cn',
 'fleming.gr',
 'gmail.com',
 'gzucm.edu.cn',
 'hawaii.edu',
 'hmfl.ac.cn',
 'hotmail.com',
 'hrbmu.edu.cn',
 'imperial.ac.uk',
 'mail.sysu.edu.cn',
 'med.lu.se',
 'med.tanta.edu.eg',
 'med.uoa.gr',
 'purdue.edu',
 'sci.cu.edu.eg',
 'siat.ac.cn',
 'sina.com',
 'stu.cdutcm.edu.cn',
 'szu.edu.cn',
 'triton.uog.edu',
 'ucd.ie',
 'uci.edu',
 'uni-muenster.de',
 'unistra.fr',
 'utoronto.ca',
 'vumc.org',
 'whu.edu.cn'}

In [30]:
# pip3 install python-whois
import whois

def is_registered(domain_name):
    """Tell whether a WHOIS lookup for ``domain_name`` yields a domain name.

    Returns ``False`` when the lookup raises any ``Exception``; otherwise the
    truthiness of the ``domain_name`` attribute of the lookup result.
    """
    try:
        record = whois.whois(domain_name)
    except Exception:
        return False
    return bool(record.domain_name)

# The host name is spelled with a plain ASCII hyphen. The previous literal
# carried a typographic (non-breaking) hyphen of the kind found in text copied
# from a PDF, so the lookup was made for a different, non-existent name.
domain_name = "med.toho-u.ac.jp"
# Query once and reuse the answer, instead of repeating the network lookup for
# the condition and again for the final print.
registered = is_registered(domain_name)
if registered:
    whois_info = whois.whois(domain_name)
    print(whois_info)

print(registered)

False


In [55]:
# First step is to import the package
import whois21

def domain2whois(domain: str) -> str:
    """Return the raw WHOIS reply for ``domain`` as text.

    The raw bytes of the ``whois21`` lookup are decoded as UTF-8. Bytes that
    are not valid UTF-8 are replaced with U+FFFD instead of raising
    ``UnicodeDecodeError``, so one oddly encoded reply cannot abort a batch
    over many domains.
    """
    return whois21.WHOIS(domain).raw.decode('utf-8', errors='replace')

import log21  # used below to pretty-print the parsed WHOIS data

query = 'nasa.gov'

# Run the lookup. The result is bound to `whois_result` rather than `whois` so
# that it does not shadow the `whois` module that `is_registered` relies on.
whois_result = whois21.WHOIS(query)

if not whois_result.success:
    # Report the failure and skip the rest of the cell; `exit()` is avoided
    # because it would end the interpreter session, not just this cell.
    print(whois_result.error)
else:
    # Parsed data, first as a pretty-printed dictionary, then as a tree.
    log21.pprint(whois_result.whois_data)
    log21.tree_print(whois_result.whois_data)

    # The unparsed reply as text.
    print(whois_result.raw.decode('utf-8'))

    # Individual fields of the result.
    print(f'Creation date   : {whois_result.creation_date}')
    print(f'Expiration date : {whois_result.expires_date}')
    print(f'Updated date    : {whois_result.updated_date}')
    print(f'Registrar name  : {whois_result.registrar_name}')

[94m{[32m'>>> LAST UPDATE OF WHOIS DATABASE'[91m: [32m'2023-03-21T11:35:55Z <<<'[91m,
[32m 'DOMAIN NAME'[91m: [32m'NASA.GOV'[91m,
[32m 'SECURITY CONTACT EMAIL'[91m: [32m'soc@nasa.gov'[91m,
[32m 'STATUS'[91m: [32m'ACTIVE'[94m}[32m[0m
[32m─┬ [95mroot
[32m [32m├───┬ [95mDOMAIN NAME
[32m [32m│   [32m└──── [95mNASA.GOV
[32m [32m├───┬ [95mSTATUS
[32m [32m│   [32m└──── [95mACTIVE
[32m [32m├───┬ [95mSECURITY CONTACT EMAIL
[32m [32m│   [32m└──── [95msoc@nasa.gov
[32m [32m└───┬ [95m>>> LAST UPDATE OF WHOIS DATABASE
[32m [32m    [32m└──── [95m2023-03-21T11:35:55Z <<<
[0m


% DOTGOV WHOIS Server ready
   Domain Name: NASA.GOV
   Status: ACTIVE
   Security Contact Email: soc@nasa.gov

>>> Last update of whois database: 2023-03-21T11:35:55Z <<<

Please be advised that this whois server only contains information pertaining
to the .GOV domain. For information for other domains please use the whois
server at RS.INTERNIC.NET. 

Creation date   : 
Expiration date : 
Updated date    : 
Updated date    : 


In [56]:
# Raw WHOIS text for every collected e-mail domain (one lookup per domain).
whois_list = list(map(domain2whois, domains))

In [57]:
# Display the raw WHOIS replies collected for the e-mail domains.
whois_list

['Invalid parameter:mail.sysu.edu.cn\n',
 '   Domain Name: HOTMAIL.COM\r\n   Registry Domain ID: 5244890_DOMAIN_COM-VRSN\r\n   Registrar WHOIS Server: whois.markmonitor.com\r\n   Registrar URL: http://www.markmonitor.com\r\n   Updated Date: 2021-02-02T17:04:58Z\r\n   Creation Date: 1996-03-27T05:00:00Z\r\n   Registry Expiry Date: 2024-03-28T04:00:00Z\r\n   Registrar: MarkMonitor Inc.\r\n   Registrar IANA ID: 292\r\n   Registrar Abuse Contact Email: abusecomplaints@markmonitor.com\r\n   Registrar Abuse Contact Phone: +1.2086851750\r\n   Domain Status: clientDeleteProhibited https://icann.org/epp#clientDeleteProhibited\r\n   Domain Status: clientTransferProhibited https://icann.org/epp#clientTransferProhibited\r\n   Domain Status: clientUpdateProhibited https://icann.org/epp#clientUpdateProhibited\r\n   Domain Status: serverDeleteProhibited https://icann.org/epp#serverDeleteProhibited\r\n   Domain Status: serverTransferProhibited https://icann.org/epp#serverTransferProhibited\r\n   Domai

In [32]:
import whois11

# Same domain looked up through the whois11 package; prints whatever it returns.
print(whois11.whois('nasa.gov'))

% DOTGOV WHOIS Server ready
   Domain Name: NASA.GOV
   Status: ACTIVE
   Security Contact Email: soc@nasa.gov

>>> Last update of whois database: 2023-03-21T11:19:02Z <<<

Please be advised that this whois server only contains information pertaining
to the .GOV domain. For information for other domains please use the whois
server at RS.INTERNIC.NET. 



In [33]:
import pdftotext
import io
from urllib.request import (
    Request,
    urlopen,
)
def file_download(url):
    """Download the document at ``url`` and open it with ``pdftotext``.

    Parameters
    ----------
    url : str
        Address of the PDF. The request is sent with a ``Mozilla/5.0``
        ``User-Agent`` header.

    Returns
    -------
    pdftotext.PDF
        The object built from the downloaded bytes; this notebook takes its
        length and indexes it to read individual entries.
    """
    req = Request(url, headers={"User-Agent": "Mozilla/5.0"})
    # Read the body inside a `with` block so the HTTP response is closed as
    # soon as the bytes are in memory, even if reading fails.
    with urlopen(req) as response:
        pdf_bytes = response.read()
    pdf_pages = pdftotext.PDF(io.BytesIO(pdf_bytes))
    return pdf_pages
    
def text_from_pdf_bytes(pdf_bytes):
    """Return the text of a PDF given as raw bytes.

    Parameters
    ----------
    pdf_bytes : bytes
        Content of the PDF file.

    Returns
    -------
    str
        The items yielded by ``pdftotext.PDF`` joined with a blank line
        between them.
    """
    # Parse the bytes that were passed in; the earlier version ignored its
    # argument and read from an undefined `req`, which raised NameError.
    pdf_text = pdftotext.PDF(io.BytesIO(pdf_bytes))
    return "\n\n".join(pdf_text)

def head_extract(pdf_bytes):
    """Return the part of a PDF's text that precedes its abstract.

    Parameters
    ----------
    pdf_bytes : bytes
        Content of the PDF file.

    Returns
    -------
    str
        Everything before the first occurrence of "abstract" or "Abstract"
        in the extracted text; when neither occurs, the first 10000
        characters.
    """
    # Extract the text from the bytes that were passed in; the earlier version
    # called an undefined `text_from_pdf_file(path)` and raised NameError.
    text = text_from_pdf_bytes(pdf_bytes)
    abstract_pos = re.search(r"[aA]bstract", text)
    return text[: abstract_pos.start(0) if abstract_pos else 10000]

In [34]:
# Download and parse one sample PDF (the same link used in the url2head example).
pdf_pages = file_download("http://europepmc.org/backend/ptpmcrender.fcgi?accid=PMC9983066&blobtype=pdf")

In [35]:
# Length of the parsed PDF object (presumably its page count; confirm against pdftotext).
len(pdf_pages)

8

In [36]:
# First element of the parsed PDF object (presumably the text of page 1).
pdf_pages[0]

'                                                 MEDICINE INTERNATIONAL 3: 11, 2023\n           The combination of positive anti‑WDR1 antibodies with\n       negative anti‑CFL1 antibodies in serum is a poor prognostic\n                      factor for patients with esophageal carcinoma\n          MASAAKI ITO1, SATOSHI YAJIMA2, TAKASHI SUZUKI2, YOKO OSHIMA2, TATSUKI NANAMI2,\n      MAKOTO SUMAZAKI2, FUMIAKI SHIRATORI2, HAO WANG3,4, LIUBING HU4, HIROTAKA TAKIZAWA5,\n                  SHU‑YANG LI6, YASUO IWADATE6, TAKAKI HIWASA1,6 and HIDEAKI SHIMADA1,2\n1\n  Department of Clinical Oncology, Toho University Graduate School of Medicine; 2Department of Gastroenterological Surgery,\n  Toho University School of Medicine, Tokyo 143‑8541, Japan; 3Stroke Center, The First Affiliated Hospital, Jinan University;\n    4\n      Department of Anesthesiology, Stroke Center, The First Affiliated Hospital and Health Science Center, Jinan University,\n        Guangzhou, Guangdong 510630, P.R. China; 5Po

In [1]:
from googlesearch import search 

# Run every query through the web search helper (asking for 10 results each)
# and gather all hits in one flat list, in query order.
list_of_queries = ["wikipedia med.toho‑u.ac.jp"]
results = []

for query in list_of_queries:
    results += list(search(query, num_results=10))

print(results)

['https://en.wikipedia.org/wiki/Toho_University', 'https://www.wikidata.org/wiki/Q107016761', 'https://www.toho-u.ac.jp/english/', 'https://m.facebook.com/profile.php?id=112389765445207', 'https://www.timeshighereducation.com/world-university-rankings/toho-university', 'https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0280475', 'http://www.tohoku.ac.jp/en/', 'https://en-academic.com/dic.nsf/enwiki/4346092', 'https://www.timeshighereducation.com/world-university-rankings/toho-university', 'https://www.tripadvisor.com.sg/ShowTopic-g298184-i861-k7892653-Plz_help_with_suggestions_regards_hospitals-Tokyo_Tokyo_Prefecture_Kanto.html', 'http://www.tohoku.ac.jp/']
