In [1]:
import DataLoader
from glob import glob
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer

## Matches in Research Fish

In [11]:
# Compare the publications recorded in Researchfish with those returned by the
# EPMC query, in both directions, matching on PMID (compared as floats).

# EPMC query results; rows whose PMID is missing are dropped before indexing by PMID.
df = pd.read_pickle('data/EPMC/ffs_or_predecessors.pkl')
df = df[df['pmid'].astype(float).notna()]
df = df.set_index('pmid')

# Pick the Researchfish publications export. Sorting makes the choice
# deterministic when several files match, and an explicit error replaces the
# bare IndexError raised when none does.
rf_publication_files = sorted(glob('data/ResearchFish/*publications*.xlsx'))
if not rf_publication_files:
    raise FileNotFoundError("No file matching 'data/ResearchFish/*publications*.xlsx' was found.")
rf_publications_path = rf_publication_files[0]

researchfish = DataLoader.read_excel(rf_publications_path,index_col='PMID')
rf_pmids = researchfish.index.dropna().unique()
rf_pmids = pd.Series(rf_pmids).astype(float)
rf_pub_in_epmc = rf_pmids.isin(df.index.astype(float))

# Columns shown for each side of the comparison.
rf_display_columns = ['Award Reference','Author*','Publication*',
                      'Month','Year*','Journal*','Volume','Issue',
                      'Pages']
epmc_display_columns = ['title','firstPublicationDate','doi','journalInfo','grantsList']

print(int((~rf_pub_in_epmc).sum()),'papers in research fish not returned in EPMC query:')
# max_colwidth=None means "do not truncate"; the old -1 spelling is deprecated
# and rejected by recent pandas releases.
with pd.option_context('display.max_colwidth', None):
    rf_only_pmids = rf_pmids[~rf_pub_in_epmc].values
    display(researchfish[researchfish.index.isin(rf_only_pmids)][rf_display_columns])

print('===========================================================================')

epmc_pub_in_rf = df.index.astype(float).isin(rf_pmids)

print(int((~epmc_pub_in_rf).sum()),'papers in EPMC query not in research fish.')
with pd.option_context('display.max_colwidth', None):
    display(df[~epmc_pub_in_rf][epmc_display_columns])

1 papers in research fish not returned in EPMC query:


Unnamed: 0_level_0,Award Reference,Author*,Publication*,Month,Year*,Journal*,Volume,Issue,Pages
PMID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
23034306.0,1310/11,Andrew Bastawrous,Author reply.,10.0,2012,Ophthalmology,119,10,2201


896 papers in EPMC query not in research fish.


Unnamed: 0_level_0,title,firstPublicationDate,doi,journalInfo,grantsList
pmid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
29853847,Progress in Gene Therapy to Prevent Retinal Ganglion Cell Loss in Glaucoma and Leber's Hereditary Optic Neuropathy.,2018-05-02,10.1155/2018/7108948,"{'volume': '2018', 'journalIssueId': 2661340, 'dateOfPublication': '2018 ', 'monthOfPublication': 0, 'yearOfPublication': 2018, 'printPublicationDate': '2018-01-01', 'journal': {'title': 'Neural Plasticity', 'medlineAbbreviation': 'Neural Plast', 'isoabbreviation': 'Neural Plast.', 'nlmid': '100883417', 'essn': '1687-5443', 'issn': '2090-5904'}}",
29996768,Sustained high glucose exposure sensitizes macrophage responses to cytokine stimuli but reduces their phagocytic activity.,2018-07-11,10.1186/s12865-018-0261-0,"{'issue': '1', 'volume': '19', 'journalIssueId': 2705325, 'dateOfPublication': '2018 Jul', 'monthOfPublication': 7, 'yearOfPublication': 2018, 'printPublicationDate': '2018-07-01', 'journal': {'title': 'BMC immunology', 'medlineAbbreviation': 'BMC Immunol', 'isoabbreviation': 'BMC Immunol.', 'nlmid': '100966980', 'essn': '1471-2172', 'issn': '1471-2172'}}","[{'grantId': '16/0005537', 'agency': 'Diabetes UK', 'orderIn': 0}, {'grantId': '1574/75', 'agency': 'Fight for Sight UK', 'orderIn': 0}]"
30213993,Microwave treatment of the cornea leads to localised disruption of the extracellular matrix.,2018-09-13,10.1038/s41598-018-32110-0,"{'issue': '1', 'volume': '8', 'journalIssueId': 2722578, 'dateOfPublication': '2018 Sep', 'monthOfPublication': 9, 'yearOfPublication': 2018, 'printPublicationDate': '2018-09-01', 'journal': {'title': 'Scientific reports', 'medlineAbbreviation': 'Sci Rep', 'isoabbreviation': 'Sci Rep', 'nlmid': '101563288', 'essn': '2045-2322', 'issn': '2045-2322'}}",
29188084,Lens internal curvature effects on age-related eye model and lens paradox.,2017-10-03,10.1364/BOE.8.004827,"{'issue': '11', 'volume': '8', 'journalIssueId': 2622271, 'dateOfPublication': '2017 Nov', 'monthOfPublication': 11, 'yearOfPublication': 2017, 'printPublicationDate': '2017-11-01', 'journal': {'title': 'Biomedical optics express', 'medlineAbbreviation': 'Biomed Opt Express', 'isoabbreviation': 'Biomed Opt Express', 'nlmid': '101540630', 'essn': '2156-7085', 'issn': '2156-7085'}}",
29445135,Alteration in nerves and neurotransmitter stimulation of lacrimal gland secretion in the TSP-1-/- mouse model of aqueous deficiency dry eye.,2018-02-14,10.1038/s41385-018-0002-y,"{'issue': '4', 'volume': '11', 'journalIssueId': 2701731, 'dateOfPublication': '2018 Jul', 'monthOfPublication': 7, 'yearOfPublication': 2018, 'printPublicationDate': '2018-07-01', 'journal': {'title': 'Mucosal immunology', 'medlineAbbreviation': 'Mucosal Immunol', 'isoabbreviation': 'Mucosal Immunol', 'nlmid': '101299742', 'essn': '1935-3456', 'issn': '1933-0219'}}","[{'grantId': 'P30 EY003790', 'agency': 'NEI NIH HHS', 'acronym': 'EY', 'orderIn': 0}, {'grantId': 'R01 EY026202', 'agency': 'NEI NIH HHS', 'acronym': 'EY', 'orderIn': 0}, {'grantId': 'R01 EY006177', 'agency': 'NEI NIH HHS', 'acronym': 'EY', 'orderIn': 0}]"
29760979,Non-invasive in vivo quantification of the developing optical properties and graded index of the embryonic eye lens using SPIM.,2018-04-10,10.1364/BOE.9.002176,"{'issue': '5', 'volume': '9', 'journalIssueId': 2684453, 'dateOfPublication': '2018 May', 'monthOfPublication': 5, 'yearOfPublication': 2018, 'printPublicationDate': '2018-05-01', 'journal': {'title': 'Biomedical optics express', 'medlineAbbreviation': 'Biomed Opt Express', 'isoabbreviation': 'Biomed Opt Express', 'nlmid': '101540630', 'essn': '2156-7085', 'issn': '2156-7085'}}","[{'grantId': 'EP/M010767/1', 'agency': 'Engineering and Physical Sciences Research Council', 'orderIn': 0}, {'grantId': 'EP/I010173/1', 'agency': 'Engineering and Physical Sciences Research Council', 'orderIn': 0}]"
29973504,Overview of Antibody Drug Delivery.,2018-07-04,10.3390/pharmaceutics10030083,"{'issue': '3', 'volume': '10', 'journalIssueId': 2702704, 'dateOfPublication': '2018 Jul', 'monthOfPublication': 7, 'yearOfPublication': 2018, 'printPublicationDate': '2018-07-01', 'journal': {'title': 'Pharmaceutics', 'medlineAbbreviation': 'Pharmaceutics', 'isoabbreviation': 'Pharmaceutics', 'nlmid': '101534003', 'essn': '1999-4923', 'issn': '1999-4923'}}",
29259520,Gene Therapy for Color Blindness.,2017-12-19,,"{'issue': '4', 'volume': '90', 'journalIssueId': 2631415, 'dateOfPublication': '2017 Dec', 'monthOfPublication': 12, 'yearOfPublication': 2017, 'printPublicationDate': '2017-12-01', 'journal': {'title': 'The Yale Journal of Biology and Medicine', 'medlineAbbreviation': 'Yale J Biol Med', 'isoabbreviation': 'Yale J Biol Med', 'nlmid': '0417414', 'essn': '1551-4056', 'issn': '0044-0086'}}",
28608271,Repeated subretinal surgery and removal of subretinal decalin is well tolerated - evidence from a porcine model.,2017-06-12,10.1007/s00417-017-3704-z,"{'issue': '9', 'volume': '255', 'journalIssueId': 2663110, 'dateOfPublication': '2017 Sep', 'monthOfPublication': 9, 'yearOfPublication': 2017, 'printPublicationDate': '2017-09-01', 'journal': {'title': 'Graefe's archive for clinical and experimental ophthalmology = Albrecht von Graefes Archiv fur klinische und experimentelle Ophthalmologie', 'medlineAbbreviation': 'Graefes Arch Clin Exp Ophthalmol', 'isoabbreviation': 'Graefes Arch. Clin. Exp. Ophthalmol.', 'nlmid': '8205248', 'essn': '1435-702X', 'issn': '0721-832X'}}","[{'grantId': '2013-89', 'agency': 'Candys Foundation', 'orderIn': 0}]"
29675270,Caspases in retinal ganglion cell death and axon regeneration.,2017-07-03,10.1038/cddiscovery.2017.32,"{'volume': '3', 'journalIssueId': 2515741, 'dateOfPublication': '2017 ', 'monthOfPublication': 0, 'yearOfPublication': 2017, 'printPublicationDate': '2017-01-01', 'journal': {'title': 'Cell death discovery', 'medlineAbbreviation': 'Cell Death Discov', 'isoabbreviation': 'Cell Death Discov', 'nlmid': '101665035', 'essn': '2058-7716', 'issn': '2058-7716'}}",


## Which FFS papers does the keyword search catch?

In [13]:
# Collect the PMIDs returned by the eye-keyword EPMC query as integers.
# Alternative input (currently disabled): 'data/EPMC/eye_keywords.pkl'
df = pd.read_pickle('data/EPMC/eye_in_new_keywords.pkl')

# Cast to float first so that missing PMIDs become NaN and can be dropped
# before the integer conversion.
pmid_as_float = df['pmid'].astype(float)
eyekw_pmids = pmid_as_float.dropna().astype(int)

# Only the PMIDs are needed from here on; release the full frame.
del df, pmid_as_float

In [16]:
# Count how many FFS papers, and how many papers citing them, are also
# returned by the eye-keyword search (matched on integer PMID), then show
# some of the FFS papers that the keyword search misses.

ffs = pd.read_pickle('data/EPMC/ffs_or_predecessors.pkl')
ffs = ffs[ffs['pmid'].notna()]

# Missing PMIDs were filtered out above, so the integer cast needs no dropna().
ffs_pmids = ffs['pmid'].astype(int)
ffs_in_eyekw = ffs_pmids.isin(eyekw_pmids)
print(int((~ffs_in_eyekw).sum()),'FFS papers missing from eye key word search.',
      int(ffs_in_eyekw.sum()),'FFS papers found.')

cited = pd.read_pickle('data/EPMC/ffs_or_predecessors_citations.pkl')
cited = cited[cited['pmid'].notna()]

cited_pmids = cited['pmid'].astype(int)
cited_in_eyekw = cited_pmids.isin(eyekw_pmids)
print(int((~cited_in_eyekw).sum()),'citing papers missing from eye key word search.',
      int(cited_in_eyekw.sum()),'citing papers found.')

print('===========================================================================')

print('Some examples:')
# max_colwidth=None means "do not truncate"; the old -1 spelling is deprecated
# and rejected by recent pandas releases.
with pd.option_context('display.max_colwidth', None):
    display(ffs[~ffs_in_eyekw][['pmid','title','firstPublicationDate',
                                'doi']].head(50))

291 FFS papers missing from eye key word search. 1272 FFS papers found.
15571 citing papers missing from eye key word search. 18974 citing papers found.
Some examples:


Unnamed: 0,pmid,title,firstPublicationDate,doi
0,29423837,Analysis of hedgehog signaling in periocular sebaceous carcinoma.,2018-02-08,10.1007/s00417-018-3900-5
9,29996768,Sustained high glucose exposure sensitizes macrophage responses to cytokine stimuli but reduces their phagocytic activity.,2018-07-11,10.1186/s12865-018-0261-0
18,29188084,Lens internal curvature effects on age-related eye model and lens paradox.,2017-10-03,10.1364/BOE.8.004827
21,28296182,Endothelial Progenitors: A Consensus Statement on Nomenclature.,2017-03-10,10.1002/sctm.16-0360
25,29352236,Control of neural crest induction by MarvelD3-mediated attenuation of JNK signalling.,2018-01-19,10.1038/s41598-018-19579-5
30,29973504,Overview of Antibody Drug Delivery.,2018-07-04,10.3390/pharmaceutics10030083
31,29484768,The Vasoreparative Function of Myeloid Angiogenic Cells Is Impaired in Diabetes Through the Induction of IL1β.,2018-03-09,10.1002/stem.2810
54,26236824,A dimensionless ordered pull-through model of the mammalian lens epithelium evidences scaling across species and explains the age-dependent changes in cell density in the human lens.,2015-07-01,10.1098/rsif.2015.0391
59,28246395,RNA-Sequencing data supports the existence of novel VEGFA splicing events but not of VEGFAxxxb isoforms.,2017-03-03,10.1038/s41598-017-00100-3
72,29892256,"Modeling the Triggering of Saccades, Microsaccades, and Saccadic Intrusions.",2018-05-28,10.3389/fneur.2018.00346


## Words in FFS Papers that are not in Eye Keyword Papers

In [10]:
# Disabled cell: everything below sits inside a triple-quoted string, so none
# of it is executed. The string holds three alternative ways of building a
# longer `stop_words` list (data/atire_puurula.txt, the NLTK `words` corpus,
# data/words.txt). Each one assigns `stop_words`, so if the code were enabled
# as written only the last assignment would take effect.
# NOTE(review): no visible cell reads `stop_words` (the vectorizers pass
# stop_words='english'), and the number shown under this cell is presumably
# left over from an earlier run with the code enabled -- confirm before reuse.
'''
# longer stop word lists
with open('data/atire_puurula.txt','r') as f:
    stop_words = f.read().split('\n')

#import nltk
#nltk.download('words')

from nltk.corpus import words
stop_words = words.words()

with open('data/words.txt','r') as f:
    stop_words = f.read().split('\n')

print(len(stop_words))
'''

466545


In [None]:
def _abstract_vocabulary(abstracts):
    """Return the CountVectorizer vocabulary fitted on a Series of abstracts.

    Runs of digits are removed from every abstract and missing abstracts are
    dropped before fitting. The vectorizer strips accents, lower-cases the
    text and uses scikit-learn's built-in English stop-word list.

    Parameters
    ----------
    abstracts : pandas.Series
        Abstract texts; missing values are allowed.

    Returns
    -------
    dict
        The fitted vectorizer's ``vocabulary_`` attribute.
    """
    vectorizer = CountVectorizer(strip_accents='unicode', lowercase=True, stop_words='english', max_df=1.0, min_df=1)
    # Raw string + regex=True: without regex=True, pandas >= 2.0 treats the
    # pattern as literal text and the digits would silently be kept.
    vectorizer.fit(abstracts.str.replace(r'\d+', '', regex=True).dropna())
    return vectorizer.vocabulary_


# Vocabulary of the FFS papers that the eye-keyword search did not return.
ffs_missing = ffs[~ffs_pmids.isin(eyekw_pmids.astype(int))]
ffs_vocab = _abstract_vocabulary(ffs_missing['abstractText'])
ffs_words = list(ffs_vocab.keys())
print('FFS vocabulary:',len(ffs_vocab),'words.')

# Vocabulary of the papers returned by the eye-keyword search.
# NOTE(review): this reads 'eye_keywords.pkl', while the cell that builds
# eyekw_pmids reads 'eye_in_new_keywords.pkl' -- confirm this is intended.
eye = pd.read_pickle('data/EPMC/eye_keywords.pkl')
eye_vocab = _abstract_vocabulary(eye['abstractText'])
eye_words = list(eye_vocab.keys())
print('Eye keyword vocabulary:',len(eye_vocab),'words.')

# Release the large frames; only the vocabularies are used afterwards.
del ffs, ffs_missing, eye

FFS vocabulary: 6424 words.


In [12]:
# Words that occur in the FFS vocabulary but not in the eye-keyword vocabulary.
# Membership is tested against a set so each lookup is constant-time instead of
# a scan over the whole eye-keyword word list; the result keeps the order of
# ffs_words.
eye_word_set = set(eye_words)
ffs_unique_words = [word for word in ffs_words if word not in eye_word_set]
print(len(ffs_unique_words))
print(ffs_unique_words)

55
['nbap', 'batdetective', 'probandwise', 'papillovascular', 'positivem', 'cbap', 'strat', 'compromization', 'mechanosignalling', 'underaccommodate', 'wally', 'vsnr', 'gppr', 'pglps', 'peritumorally', 'ebisc', 'etoc', 'vegfaxxxb', 'eigenmode', 'kprm', 'ebar', 'standardises', 'giacta', 'venusberg', 'thiomodification', 'trmu', 'liedtke', 'eicosenoyl', 'palmitoleoyl', 'eicosadienoyl', 'interfant', 'mcbpy', 'gsnestt', 'pums', 'isoindicial', 'sgef', 'cytotherapies', 'xxpress', 'turc', 'lants', '_dupcag', 'glndup', 'kiac', 'obsl', 'phenodb', 'gfog', 'vasorepair', 'hfmsc', 'haeckels', 'stamzelle', 'naturliche', 'schopfungsgeschichte', 'reimer', 'sjeh', 'quango']


In [13]:
# Words that occur in both the FFS vocabulary and the eye-keyword vocabulary.
# Membership is tested against a set so each lookup is constant-time instead of
# a scan over the whole eye-keyword word list; the result keeps the order of
# ffs_words.
eye_word_set = set(eye_words)
ffs_shared_words = [word for word in ffs_words if word in eye_word_set]
print(len(ffs_shared_words))

999
['aav', 'abca', 'absolute', 'ace', 'aceturate', 'acr', 'acrysof', 'act', 'adas', 'add', 'adeno', 'adherens', 'afc', 'aflibercept', 'afm', 'aid', 'aim', 'aims', 'air', 'akt', 'alp', 'alpha', 'als', 'also', 'altman', 'am', 'amd', 'american', 'amphipathic', 'amsterdam', 'anandamide', 'anemia', 'ang', 'angiocrine', 'angiogensis', 'angiopoietin', 'angiopoietins', 'angle', 'angles', 'anglia', 'another', 'anti', 'antiangiogenic', 'anxa', 'any', 'aod', 'ap', 'apicobasal', 'apoptosis', 'apoptotic', 'app', 'april', 'aqp', 'aquaporin', 'aquaporins', 'ara', 'arachidonoyl', 'arc', 'are', 'arm', 'as', 'asian', 'ask', 'asn', 'aspasn', 'assembly', 'astrocytes', 'at', 'atlas', 'atpase', 'au', 'august', 'autocrine', 'axis', 'aβ', 'bag', 'balance', 'bank', 'banking', 'bap', 'bar', 'base', 'basic', 'basolateral', 'bat', 'bb', 'be', 'begin', 'berlin', 'best', 'beta', 'bevacizumab', 'bias', 'bind', 'bioactive', 'biodiversity', 'biofluids', 'biomarkers', 'bjs', 'bkca', 'bl', 'black', 'bland', 'block', 'b