Use the trained NER model on all ~1,300 publications to extract specimen citations.

In [1]:
import pandas as pd
import numpy as np
import re
import glob
import random
from random import sample
import json
import jsonlines
import pysbd
seg = pysbd.Segmenter(language="en", clean=False)
import spacy
from spacy import displacy

In [2]:
# Load the trained spaCy pipeline from disk; the later cells call it on each
# sentence and read the entities it assigns to doc.ents.
custom_ner_model = spacy.load("./models/02/model-best/")



In [3]:
# Collect every parsed-paper JSON file.
# glob returns matches in arbitrary, filesystem-dependent order, so sort them:
# later cells look up rows by absolute position, which is only reproducible
# when the papers are always processed in the same order.
combined_papers = sorted(glob.glob('./papers_all/*.json'))
print(f"Looking for SPECIMEN entities in {len(combined_papers)} combined papers")

Looking for SPECIMEN entities in 1306 combined papers


In [4]:
# Inspect the loaded pipeline: for each component, shows which attributes it
# assigns/requires and which scores it reports, plus any detected problems.
custom_ner_model.analyze_pipes()

{'summary': {'ner': {'assigns': ['doc.ents',
    'token.ent_iob',
    'token.ent_type'],
   'requires': [],
   'scores': ['ents_f', 'ents_p', 'ents_r', 'ents_per_type'],
   'retokenizes': False}},
 'problems': {'ner': []},
 'attrs': {'doc.ents': {'assigns': ['ner'], 'requires': []},
  'token.ent_iob': {'assigns': ['ner'], 'requires': []},
  'token.ent_type': {'assigns': ['ner'], 'requires': []}}}

In [4]:
## Review filenames
# Keep only the last path component, splitting on either separator. glob yields
# "./papers_all\<id>.json" on Windows but "./papers_all/<id>.json" elsewhere, and
# the earlier split("/")[1] returned the directory name for the second form.
for test_file in combined_papers:
    ref_id = re.split(r"[\\/]", test_file)[-1]
    print(ref_id)

0021b632-3246-b7d9-bb29-66398e4a295d.json
002b2f92-8dc5-7bd9-6689-ef79f8c3c461.json
002f9cc4-096b-faff-f5b7-751f497e28aa.json
004cdaf0-0ed9-1a32-4f0f-a9db4b6a3fea.json
00513063-020a-ca12-055d-fed45fa7cb00.json
00c99f27-6e4d-d42f-3306-f416b09b8163.json
00d3d1af-353e-0c86-24ae-ba79e4b82987.json
00f24603-8295-1fe0-52c5-53b5d2f74429.json
01364fc5-1a6e-d97f-03cd-7bab667b94c9.json
01847803-d840-fccb-2446-eceecaa0a679.json
01e50dde-46ca-2ca5-90da-4776dbf9c505.json
02352cfc-43d3-c1cd-4443-6c48cfbb5634.json
0262a5aa-edec-31ef-5762-762620379089.json
027752fe-61f6-8c6e-689c-0559bf4fa4a3.json
0361194e-41a4-4908-6a86-40178be6aeb4.json
0361524c-2cc0-f859-aa65-24ab713b7e05.json
03927935-8d5f-52c0-83f0-ba7756b8002a.json
03a89879-36a9-8d74-00de-59f550f0fc2b.json
03c54c71-fca0-5536-2169-df82b9bdee2d.json
03f7ba05-cf4b-b315-2325-4d812bdfb727.json
03f9c3a3-5fb1-a0dd-34fe-f75045709961.json
047cd56e-fd6c-c364-8191-cbeb175bf448.json
052daad3-26c0-0984-c734-bb05788c31f6.json
0584d26c-f724-3a0c-0947-bf82b8ed3a

In [5]:
## Create a dataframe with one row per figure/table entry of every paper, i.e. the
## "FIGREF*" / "TABREF*" keys of pdf_parse.ref_entries. Each row holds the entry's
## fields (the caption is in 'text') plus 'paper_id' (the JSON file name).
## No paper_title column is added here.
ref_frames = []
for test_file in combined_papers:
    # File name only; split on both separators so Windows and POSIX paths both work.
    paper_id = re.split(r"[\\/]", test_file)[-1]
    with open(test_file, 'r') as f:
        data = json.load(f)
    for ref_key, ref_entry in data["pdf_parse"]["ref_entries"].items():
        if not ref_key.startswith(("FIGREF", "TABREF")):
            continue
        file_data = pd.json_normalize(ref_entry)
        file_data["paper_id"] = paper_id
        # 'content' is discarded when present; a missing column is not an error.
        file_data = file_data.drop(columns=["content"], errors="ignore")
        # The sentence segmenter needs a string in 'text'. The previous check
        # (`file_data["text"] is None`) compared the whole Series to None and was
        # never true, so missing captions were never replaced.
        if "text" in file_data:
            file_data["text"] = file_data["text"].fillna("No Text")
        else:
            file_data["text"] = "No Text"
        ref_frames.append(file_data)

# Concatenate once: calling pd.concat inside the loop re-copies every
# accumulated row on each iteration.
df_publications_refs = pd.concat(ref_frames) if ref_frames else pd.DataFrame()


print(df_publications_refs)

df_publications_refs.to_csv("./df_publications_refs.csv", index=False, header=True)

   type_str  uris                                               text   num  \
0    figure  None  Percentage of available sites (n = 280; solid ...  None   
0    figure  None                                                     None   
0     table   NaN  Results from nonparametric contingency table t...  None   
0     table   NaN  Downloaded from https://afspubs.onlinelibrary....  None   
0    figure  None  Cytosine methylation in DNA. (A) Addition of a...  None   
..      ...   ...                                                ...   ...   
0    figure  None  Suspensorium and preopercle of Astroblepus sp....  None   
0    figure  None  Suspensorium and Meckelian cartilage of young ...  None   
0     table   NaN                                              Figs.  None   
0     table   NaN  ). C Bullockia rnaldonadoi (57.1 mm; KU 19371)...  None   
0     table   NaN  Cranial bones articulating with the hyomandibu...  None   

                                     paper_id  
0   0021b632-32

In [6]:
# NOTE: DataFrame.size is rows x columns (total number of cells), not the row count.
print(df_publications_refs.size)

53930


In [7]:
## Create a dataframe with one row per body_text entry of every paper: the entry's
## fields (the passage is in 'text') plus 'paper_id' (the JSON file name) and
## 'paper_title' (the "title" field of the JSON file).

body_frames = []
for test_file in combined_papers:
    with open(test_file, 'r') as f:
        data = json.load(f)
    file_data = pd.json_normalize(data["pdf_parse"]["body_text"])
    # File name only; split on both separators so Windows and POSIX paths both work.
    file_data["paper_id"] = re.split(r"[\\/]", test_file)[-1]
    file_data["paper_title"] = data["title"]
    body_frames.append(file_data)

# Concatenate once: calling pd.concat inside the loop re-copies every
# accumulated row on each iteration.
df_publications_body = pd.concat(body_frames) if body_frames else pd.DataFrame()

print(df_publications_body)

## Saves the dataframe as a .csv file
df_publications_body.to_csv("./df_publications_body.csv", index=False, header=True)

                                                 text  \
0   Many rare fishes are declining, threatened, en...   
1   Spawning is easily observed in many common fis...   
2   Further quantifying the conditions under which...   
3   The Moapa Dace, a cyprinid that is endemic to ...   
4   The Muddy River is a unique ecosystem but cont...   
..                                                ...   
95  , 1990, 2, Downloaded from https://onlinelibra...   
96  , 1990, 2, Downloaded from https://onlinelibra...   
97  , 1990, 2, Downloaded from https://onlinelibra...   
98  , 1990, 2, Downloaded from https://onlinelibra...   
99  , 1990, 2, Downloaded from https://onlinelibra...   

                                           cite_spans  \
0   [{'start': 86, 'end': 112, 'text': '(Minckley ...   
1   [{'start': 412, 'end': 430, 'text': '(Ebner et...   
2                                                  []   
3   [{'start': 101, 'end': 124, 'text': '(Hubbs an...   
4   [{'start': 263, 'end': 288

In [8]:
# NOTE: DataFrame.size is rows x columns (total number of cells), not the row count.
print(df_publications_body.size)

469400


In [9]:
## Creates a new list "combined_sentences" holding one
## (paper_id, section, paper_title, sentence) tuple per sentence in df_publications_body

combined_sentences = []

# Fields are read by column name instead of tuple position, so the result does
# not depend on the column order produced when the dataframe was assembled.
for row in df_publications_body.itertuples(index=False):
    for sent in seg.segment(row.text):   # split the passage into sentences
        combined_sentences.append((row.paper_id, row.section, row.paper_title, sent))

In [10]:
# Number of sentence tuples collected so far.
print(len(combined_sentences))

306630


In [11]:
%%time
## Adds the figure/table sentences from df_publications_refs to combined_sentences.
## NOTE: this appends in place, so running this cell again without first re-creating
## combined_sentences adds the same sentences a second time.

# Fields are read by column name instead of tuple position. The tuple layout is
# (paper_id, section, paper_title, sentence); for these rows the entry type
# (type_str) is stored in the section slot and paper_id is repeated in the
# paper_title slot.
for row in df_publications_refs.itertuples(index=False):
    for sent in seg.segment(row.text):   # split the entry text into sentences
        combined_sentences.append((row.paper_id, row.type_str, row.paper_id, sent))

CPU times: total: 32 s
Wall time: 34.7 s


In [12]:
# Number of sentence tuples after the figure/table sentences were appended.
print(len(combined_sentences))

371362


In [13]:
# Spot-check one tuple near the end of the list (hard-coded position).
print(combined_sentences[371360])

('ffc573d5-fd0b-a70e-385a-f4635e4af862.json', 'table', 'ffc573d5-fd0b-a70e-385a-f4635e4af862.json', 'Same scale applies to A and B. apa, autopalatine; hy, hyomandibula; iop, interopercle; meo, membranous outgrowths; mtg, metaptmygoid; op, opercle; pop, preopercle; qc, complexquadrate.')


In [14]:
# Spot-check one tuple at a hard-coded position in the list.
print(combined_sentences[4825])

('052daad3-26c0-0984-c734-bb05788c31f6.json', 'MAXILLO-PALATO-PTERYGOID ARCH', 'Not so rare snakes: a revision of the Geophis sieboldi group (Colubridae: Dipsadinae) in lower Central America and Colombia', 'Two species differ from other members of the group in having a tooth at the anterior tip of the maxilla or the first tooth preceded by a short toothless area and in one of these the maxilla is not dorsoventrally depressed.')


In [15]:
## Turn combined_sentences into a dataframe (one row per sentence tuple),
## with the 'sent' column cast to str

df_sent = pd.DataFrame(
    combined_sentences,
    columns=['Id', 'section_title', 'paper_title', 'sent'],
).astype({'sent': str})
df_sent

Unnamed: 0,Id,section_title,paper_title,sent
0,0021b632-3246-b7d9-bb29-66398e4a295d.json,,Use of Underwater Videography to Quantify Cond...,"Many rare fishes are declining, threatened, en..."
1,0021b632-3246-b7d9-bb29-66398e4a295d.json,,Use of Underwater Videography to Quantify Cond...,Understanding the factors affecting their beha...
2,0021b632-3246-b7d9-bb29-66398e4a295d.json,,Use of Underwater Videography to Quantify Cond...,Knowledge of fish spawning behavior and requir...
3,0021b632-3246-b7d9-bb29-66398e4a295d.json,,Use of Underwater Videography to Quantify Cond...,Spawning is easily observed in many common fis...
4,0021b632-3246-b7d9-bb29-66398e4a295d.json,,Use of Underwater Videography to Quantify Cond...,A method that has proven useful for investigat...
...,...,...,...,...
371357,ffc573d5-fd0b-a70e-385a-f4635e4af862.json,table,ffc573d5-fd0b-a70e-385a-f4635e4af862.json,Figs.
371358,ffc573d5-fd0b-a70e-385a-f4635e4af862.json,table,ffc573d5-fd0b-a70e-385a-f4635e4af862.json,).
371359,ffc573d5-fd0b-a70e-385a-f4635e4af862.json,table,ffc573d5-fd0b-a70e-385a-f4635e4af862.json,C Bullockia rnaldonadoi (57.1 mm; KU 19371).
371360,ffc573d5-fd0b-a70e-385a-f4635e4af862.json,table,ffc573d5-fd0b-a70e-385a-f4635e4af862.json,"Same scale applies to A and B. apa, autopalati..."


In [16]:
def predict_ents(df):
    """Add a 'specimen_prediction' column to ``df`` in place.

    Applies extract_ents to every value of ``df['sent']`` and stores the result
    in ``df['specimen_prediction']``.

    Previously the ``df`` argument was ignored and the global ``df_sent`` was
    always modified; the function now operates on the dataframe it is given.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'sent' column.

    Returns
    -------
    None
    """
    df['specimen_prediction'] = df['sent'].apply(extract_ents)
    
def extract_ents(text):
    """Run the NER model on ``text`` and return its entities as a single string.

    Each entity is passed through clean_text2 and the cleaned values are joined
    with ", ". Returns None when the model finds no entities.
    """
    entities = custom_ner_model(text).ents
    if len(entities) == 0:
        return None
    return ", ".join(clean_text2(entity) for entity in entities)
    
def clean_text2(txt):
    """Return ``str(txt)`` with each run of non-alphanumeric characters replaced by one space.

    Only ASCII letters and digits are kept; leading/trailing runs also become a
    single space (the result is not stripped).
    """
    as_text = str(txt)
    non_alnum_run = re.compile('[^A-Za-z0-9]+')
    return non_alnum_run.sub(' ', as_text)

In [17]:
# Using the functions defined above, predict entities for every sentence and save as a .csv.

# Calls the NER model on each value of df_sent['sent'] and stores the joined
# entity text in a new 'specimen_prediction' column (None when nothing is found).
predict_ents(df_sent)
# Write all columns, including the new prediction column, without the index.
df_sent.to_csv("./specimen_results_ner.csv", index=False, header=True)

In [18]:
### Count how many sentences produced each distinct prediction string,
### ordered from most to least frequent (rows with no prediction are not counted)

df_sent['specimen_prediction'].value_counts().sort_values(ascending=False)

UMMZ                                     55
MCP 15219                                16
MCZ 8500                                 12
GMNH 27372                               12
ROM 48915                                11
                                         ..
AMNH 94132                                1
UMMZ 239625                               1
UMMZ 238961, UMMZ 238959, UMMZ 239625     1
UMMZ 239624                               1
KU 21802                                  1
Name: specimen_prediction, Length: 2961, dtype: int64