<a href="https://colab.research.google.com/github/julianflowers/herbivores_ghg/blob/master/unified_script.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%load_ext rpy2.ipython

In [None]:
%%R
install.packages("remotes")
remotes::install_github("kth-library/semanticscholar", dependencies = TRUE)

In [3]:
%%R
library(semanticscholar)

In [None]:
%%R
search <- "herbivory climate change"
s2_search <- semanticscholar::S2_search_papers(keyword = search, offset = 1, limit = 100)
s2_search

In [None]:
%%R
library(tidyverse)

In [17]:
%%R

ids <- pluck(s2_search$data, "paperId")

papers <- map(ids, S2_paper)

papid <- map(papers, "paperId") |>
  enframe()

abstract <- map(papers, "abstract") |>
  enframe()

topics <- map(papers, "topics") |>
  enframe()

doi <- map(papers, "doi") |>
  enframe()

title <- map(papers, "title") |>
  enframe()

corpusId <- map(papers, "corpusId") |>
  enframe()

fos <- map(papers, "fieldsOfStudy") |>
  enframe()

combined <- papid |>
  left_join(title, by = "name") |>
  left_join(abstract, by = "name") |>
  left_join(fos, by = "name") |>
  left_join(topics, by = "name") |>
  left_join(corpusId, by = "name") |>
  left_join(doi, by = "name")

combined <- combined |> 
  rename(id = value.x, 
         title = value.y, 
         abstract = value.x.x, 
         fos = value.y.y, 
         topics = value.x.x.x, 
         corpusId = value.y.y.y, 
         doi = value)


In [None]:
%%R
comb <- combined |>
  unnest(cols = "topics") |>
  unnest(cols = "fos") |>
  group_by(name) |>
  select(-c(topicId, url)) |>
  summarise(topic = paste(topic, collapse = "; ")) 

  comb

In [12]:
%%R
comb |>
  write_delim("test.txt")
  

In [30]:
%%R
abstracts <- combined |>
    select(id, abstract)
abstracts |>
    unnest(col = c("abstract")) |>
    write_delim("abstract.txt")

In [None]:
%pip install taxonerd
%pip install https://github.com/nleguillarme/taxonerd/releases/download/v1.3.0/en_ner_eco_biobert-1.0.0.tar.gz

In [10]:
from taxonerd import TaxoNERD

In [11]:
ner = TaxoNERD(model="en_ner_eco_biobert", prefer_gpu=True, with_linking="taxref", with_abbrev=True) # Add with_linking="gbif_backbone" or with_linking="taxref" to activate entity linking


In [14]:
doc = ner.find_in_file("/content/test.txt")


In [None]:
doc

In [31]:
doc1 = ner.find_in_file("/content/abstract.txt")
doc1

Unnamed: 0,offsets,text,entity
T0,LIVB 1632 1647,Geometrid moths,"[(TAXREF:185259, geometer moths, 0.80506289005..."
T1,LIVB 5392 5400,Rangifer,"[(TAXREF:197043, Rangifer, 1.0)]"
T2,LIVB 6123 6131,Rangifer,"[(TAXREF:197043, Rangifer, 1.0)]"
T3,LIVB 8163 8169,Insect,"[(TAXREF:51868, Water Stick Insect, 0.70065522..."
T4,LIVB 8735 8746,dwarf birch,"[(TAXREF:762263, Dwarf birch, 1.0)]"
...,...,...,...
T238,LIVB 138623 138632,sea hares,"[(TAXREF:188096, sea hares, 1.0)]"
T239,LIVB 144611 144622,butterflies,"[(TAXREF:185214, butterflies and moths, 0.7505..."
T240,LIVB 144627 144632,moths,"[(TAXREF:185244, case moths, 0.8185053467750549)]"
T241,LIVB 145740 145763,Ambrosia artemisiifolia,"[(TAXREF:82080, Ambrosia artemisiifolia, 1.0)]"


In [None]:
!pip install flair
import flair
from flair.data import Sentence
from flair.models import SequenceTagger

In [33]:
tagger = SequenceTagger.load("flair/ner-english-ontonotes-large")


Downloading:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

2022-08-12 08:50:22,529 loading file /root/.flair/models/ner-english-ontonotes-large/2da6c2cdd76e59113033adf670340bfd820f0301ae2e39204d67ba2dc276cc28.ec1bdb304b6c66111532c3b1fc6e522460ae73f1901848a4d0362cdf9760edb1


Downloading:   0%|          | 0.00/616 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

2022-08-12 08:50:42,885 SequenceTagger predicts: Dictionary with 76 tags: <unk>, O, B-CARDINAL, E-CARDINAL, S-PERSON, S-CARDINAL, S-PRODUCT, B-PRODUCT, I-PRODUCT, E-PRODUCT, B-WORK_OF_ART, I-WORK_OF_ART, E-WORK_OF_ART, B-PERSON, E-PERSON, S-GPE, B-DATE, I-DATE, E-DATE, S-ORDINAL, S-LANGUAGE, I-PERSON, S-EVENT, S-DATE, B-QUANTITY, E-QUANTITY, S-TIME, B-TIME, I-TIME, E-TIME, B-GPE, E-GPE, S-ORG, I-GPE, S-NORP, B-FAC, I-FAC, E-FAC, B-NORP, E-NORP, S-PERCENT, B-ORG, E-ORG, B-LANGUAGE, E-LANGUAGE, I-CARDINAL, I-ORG, S-WORK_OF_ART, I-QUANTITY, B-MONEY
