In [43]:
import torch.nn.functional as F
import requests
import json
from dataclasses import dataclass
from progressbar import ProgressBar
from typing import List
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
@dataclass
class Paper:
    BASE_URL = "https://api.semanticscholar.org/v1/paper/"
    CITATION_LIMIT = 1000
    paper_id: str
    
    def __post_init__(self):
        self.url = f'{self.BASE_URL}{self.paper_id}'
        self.paper = requests.get(self.url).json()
    
    @property
    def id(self) -> str:
        return self.paper_id
    
    @property
    def title(self) -> str:
        return self.paper['title']
        
    @property
    def abstract(self) -> str:
        return self.paper['abstract']
    
    @property
    def citations(self) -> List[str]:
        citations = []
        for c in self.paper['citations'][:self.CITATION_LIMIT]:
            citations.append(c["paperId"])

        return citations
    
    @property
    def references(self) -> List[str]:
        references = []
        for c in self.paper['references'][:self.CITATION_LIMIT]:
            references.append(c["paperId"])

        return references

In [3]:
p = Paper("d4b651d6a904f69f8fa1dcad4ebe972296af3a9a")

In [4]:
p.id
p.title
p.abstract
p.citations

'd4b651d6a904f69f8fa1dcad4ebe972296af3a9a'

'Identifying Relations for Open Information Extraction'

"Open Information Extraction (IE) is the task of extracting assertions from massive corpora without requiring a pre-specified vocabulary. This paper shows that the output of state-of-the-art Open IE systems is rife with uninformative and incoherent extractions. To overcome these problems, we introduce two simple syntactic and lexical constraints on binary relations expressed by verbs. We implemented the constraints in the ReVerb Open IE system, which more than doubles the area under the precision-recall curve relative to previous extractors such as TextRunner and woepos. More than 30% of ReVerb's extractions are at precision 0.8 or higher---compared to virtually none for earlier systems. The paper concludes with a detailed analysis of ReVerb's errors, suggesting directions for future work."

['a1d49c5ea00831d540f027c0c009cacd2c21f3b5',
 '9ea5874d261359e287eabb735de38a8edba1e091',
 '032244fb8ff881f4f12345e9afc7ea5627952f4a',
 '279cc657655eeb4e96a2eaf3d77f708edbf6a313',
 '47a541269d4ef70f37f0d3a57483312c4c6c2ad5',
 'd582909be7ad3ca80fcfca3e1d9ced2e60966db2',
 '28fdb929d1c4f87bbb9cc0b5bb880567e3c50429',
 '1f872354e0cfde91e86e68b35d89a6d447f48936',
 'bdb32ea23986f6dfe436c5dba0d13e95dea07c92',
 'fda21913e8d889a84677f96231a145ecf2c206a2',
 '9939871f8e0c6119e44734573758196de292561d',
 '89c8ef3a2616765f9008ebd7ec4c45e7144b1e9c',
 '130670cd2ffe6994ab4265f534de4aad74385c60',
 'fea760bdeef14b049c2e515838839116b91fb82a',
 'a80cd69bec615984b4eab7f55026fabffd315d3b',
 'e5f5a058c26b87dc94b8b0c35e9395bac578969c',
 'e82dad0188e97342fb88f2770db77d480f11c96e',
 'ce4b70a3da3887453a028c85f59451131972070b',
 '52eb3247b460fe8211e9304732dd7d8fc11b1b68',
 '3dde11c02e24ee3e9ba4903939e0a61e2880b89c',
 '1933a0ef47f8d2ba4a8277d702d522a06319302c',
 '33285e02758788b681754d283df20971fef6e31f',
 '6926acbf

In [6]:
papers = {}

In [7]:
import time

queue = []

bar = ProgressBar(max_value=1000)
while True:
    time.sleep(1)
    for s in p.citations:
        if len(papers) == 1000:
            break
        if s not in papers:
            x = Paper(s)
            a = x.abstract
            if a:
                papers[s] = x
                queue.append(x)
                bar.update(len(papers))
            
    if queue:
        p = queue.pop()
    else:
        break
    if len(papers) == 1000:
        break
    
    bar.update(len(papers))

  8% (88 of 1000) |#                     | Elapsed Time: 0:01:32 ETA:   0:08:31

KeyboardInterrupt: 

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(lowercase=False)
corpus = [x.abstract for x in papers.values()]

In [None]:
X = vectorizer.fit_transform(corpus)
features = vectorizer.get_feature_names_out()

In [190]:
X.sum(axis=0)[0, 1319]

1

In [9]:
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer

In [68]:
model_name = "allenai/scibert_scivocab_uncased"
model = AutoModelForTokenClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

def extract_scientific_terms(text):
    input_ids = tokenizer.encode(text, return_tensors="pt")
    outputs = model(input_ids)
    predictions = outputs.logits.argmax(dim=1)[0]
    term = tokenizer.convert_ids_to_tokens([input_ids[0][predictions[0]]])
    return term

text = "Scientists have discovered a new species of bacteria that can survive in extreme environments."
term = extract_scientific_terms(text)
print(term)


Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initi

['new']


In [100]:
!python -m spacy download displacy

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)

[38;5;1m✘ No compatible package found for 'displacy' (spaCy v3.5.0)[0m



In [137]:
import spacy

# Load English tokenizer, tagger, parser and NER
nlp = spacy.load("en_core_web_lg")

In [138]:
from spacy.pipeline.spancat import DEFAULT_SPANCAT_MODEL
config = {
    "threshold": 0.5,
    "spans_key": "labeled_spans",
    "max_positive": None,
    "model": DEFAULT_SPANCAT_MODEL,
    "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
}
nlp.add_pipe("spancat", config=config)
nlp.begin_training()

<spacy.pipeline.spancat.SpanCategorizer at 0x7fef7c8d05e0>

ValueError: [E955] Can't find table(s) lexeme_norm for language 'en' in spacy-lookups-data. Make sure you have the package installed or provide your own lookup tables if no default lookups are available for your language.

In [99]:
from spacy import displacy

spacy.displacy.serve(x)

AttributeError: module 'spacy' has no attribute 'displacy'

In [127]:
x

Open Information Extraction (OpenIE) aims to extract structured relational tuples (subject, relation, object) from sentences and plays critical roles for many downstream NLP applications. Existing solutions perform extraction at sentence level, without referring to any additional contextual information. In reality, however, a sentence typically exists as part of a document rather than standalone; we often need to access relevant contextual information around the sentence before we can accurately interpret it. As there is no documentlevel context-aware OpenIE dataset available, we manually annotate 800 sentences from 80 documents in two domains (Healthcare and Transportation) to form a DocOIE dataset for evaluation. In addition, we propose DocIE, a novel document-level context-aware OpenIE model. Our experimental results based on DocIE demonstrate that incorporating documentlevel context is helpful in improving OpenIE performance. Both DocOIE dataset and DocIE model are released for pub

In [125]:
x = nlp(text)
x.noun_chunks

ValueError: [E109] Component 'spancat' could not be run. Did you forget to call `initialize()`?

In [121]:
{y: getattr(x.ents[0], y) for y in dir(x.ents[0])}

{'_': <spacy.tokens.underscore.Underscore at 0x7ff03bf38f70>,
 '__class__': spacy.tokens.span.Span,
 '__delattr__': <method-wrapper '__delattr__' of spacy.tokens.span.Span object at 0x7ff03bf08eb0>,
 '__dir__': <function Span.__dir__()>,
 '__doc__': 'A slice from a Doc object.\n\n    DOCS: https://spacy.io/api/span\n    ',
 '__eq__': <method-wrapper '__eq__' of spacy.tokens.span.Span object at 0x7ff03bf08dd0>,
 '__format__': <function Span.__format__(format_spec, /)>,
 '__ge__': <method-wrapper '__ge__' of spacy.tokens.span.Span object at 0x7fefee51c040>,
 '__getattribute__': <method-wrapper '__getattribute__' of spacy.tokens.span.Span object at 0x7fefee51c0b0>,
 '__getitem__': <method-wrapper '__getitem__' of spacy.tokens.span.Span object at 0x7fefee51c120>,
 '__gt__': <method-wrapper '__gt__' of spacy.tokens.span.Span object at 0x7fefee51c190>,
 '__hash__': <method-wrapper '__hash__' of spacy.tokens.span.Span object at 0x7fefee51c200>,
 '__init__': <method-wrapper '__init__' of spacy

In [122]:
[(e.label_, e.text) for e in x.ents]

[('ORG', 'NLP'),
 ('CARDINAL', '800'),
 ('CARDINAL', '80'),
 ('CARDINAL', 'two'),
 ('ORG', 'Healthcare and Transportation'),
 ('ORG', 'DocOIE'),
 ('ORG', 'DocOIE'),
 ('CARDINAL', '1')]

In [109]:
text

'Open Information Extraction (OpenIE) aims to extract structured relational tuples (subject, relation, object) from sentences and plays critical roles for many downstream NLP applications. Existing solutions perform extraction at sentence level, without referring to any additional contextual information. In reality, however, a sentence typically exists as part of a document rather than standalone; we often need to access relevant contextual information around the sentence before we can accurately interpret it. As there is no documentlevel context-aware OpenIE dataset available, we manually annotate 800 sentences from 80 documents in two domains (Healthcare and Transportation) to form a DocOIE dataset for evaluation. In addition, we propose DocIE, a novel document-level context-aware OpenIE model. Our experimental results based on DocIE demonstrate that incorporating documentlevel context is helpful in improving OpenIE performance. Both DocOIE dataset and DocIE model are released for pu

In [104]:
for y in papers.values():
    text = y.abstract
    x = nlp(text)
    spans = x.spans
    if spans:
        input(spans)

In [31]:
input_ids

tensor([[  102, 10639,   360,  8847,   106,   758,  1578,   131,  4738,   198,
           300, 14413,   121,  7424,  5097,   205,   103]])

In [33]:
outputs.logits

tensor([[[ 0.0448,  0.0619],
         [-0.2056,  0.4803],
         [-0.0202,  0.6416],
         [-0.4708,  0.1224],
         [-0.2484,  0.4870],
         [-0.3344,  0.2875],
         [ 0.2125,  0.0833],
         [-0.0063,  0.1687],
         [ 0.1368, -0.1288],
         [-0.6052,  0.5749],
         [-0.3328,  0.5135],
         [-0.1600, -0.4025],
         [ 0.0111,  0.1905],
         [-0.0403, -0.2249],
         [-0.0539, -0.4953],
         [-0.3412, -0.5488],
         [-0.0946,  0.1148]]], grad_fn=<ViewBackward0>)

In [42]:
input_ids.shape

torch.Size([1, 17])

In [40]:
outputs.logits.shape

torch.Size([1, 17, 2])

In [38]:
outputs.logits.max(1)

torch.return_types.max(
values=tensor([[0.2125, 0.6416]], grad_fn=<MaxBackward0>),
indices=tensor([[6, 2]]))

In [47]:
outputs.logits

tensor([[[ 0.0448,  0.0619],
         [-0.2056,  0.4803],
         [-0.0202,  0.6416],
         [-0.4708,  0.1224],
         [-0.2484,  0.4870],
         [-0.3344,  0.2875],
         [ 0.2125,  0.0833],
         [-0.0063,  0.1687],
         [ 0.1368, -0.1288],
         [-0.6052,  0.5749],
         [-0.3328,  0.5135],
         [-0.1600, -0.4025],
         [ 0.0111,  0.1905],
         [-0.0403, -0.2249],
         [-0.0539, -0.4953],
         [-0.3412, -0.5488],
         [-0.0946,  0.1148]]], grad_fn=<ViewBackward0>)

In [52]:
preidictions = F.softmax(outputs.logits, dim=-1).argmax(dim=1)

In [60]:
input_ids[0][6]

tensor(1578)

In [62]:
predictions[0]

tensor(6)

['species']

['have']

In [29]:
input_ids = tokenizer.encode(text, return_tensors="pt")
outputs = model(input_ids)
predictions = outputs.logits.argmax(dim=1)[0]
tokens = tokenizer.convert_ids_to_tokens(input_ids[predictions])
tokens

IndexError: index 6 is out of bounds for dimension 0 with size 1

In [13]:
input_ids = tokenizer.encode(text, return_tensors="pt")

In [14]:
input_ids

tensor([[  102, 10639,   360,  8847,   106,   758,  1578,   131,  4738,   198,
           300, 14413,   121,  7424,  5097,   205,   103]])

In [19]:
model(input_ids).logits.argmax(dim=1)

tensor([[10, 15]])

In [175]:
for i in np.asarray(X.sum(axis=0).argsort())[0]:
    input(features[i])

generalizing


''

inconsistencies


''

incomplete


''

incidents


''

inadequate


''

inability


''

improvements


''

implies


''

implicitly


''

implementing


''

impactful


''

incorporated


''

immunizes


''

identifying


''

identifiers


''

identical


''

ideas


''

ideal


''

idea


''

hypothesize


''

hypothesis


''

hyperrelational


''

hyperlinks


''

iitd


''

hypergraphs


''

incorporates


''

incorporation


''

inspection


''

insights


''

insight


''

inside


''

initially


''

initialized


''

inicial


''

infuse


''

informational


''

infomax


''

incorporating


''

influenced


''

inefficiency


''

industries


''

individuals


''

indicated


''

indian


''

independent


''

independence


''

indegree


''

incrementally


''

increasing


''

inferior


''

hybrid


''

however


''

horizons


''

governance


''

goods


''

good


''

globe


''

glance


''

gives


''

gether


''

genomes


''

generic


''

generally


''

governments


''

ﬁne


''

generalization


''

gene


''

gathering


''

gather


''

gap


''

gained


''

fusion


''

fuse


''

furnish


''

fundamental


''

generalized


''

gradually


''

gramatical


''

grammar


''

horizon


''

homozygous


''

historically


''

hierarchy


''

hierarchically


''

hierarchical


''

heterozygous


''

heterogeneous


''

here


''

helps


''

helpful


''

hectic


''

headlines


''

head


''

hbe


''

having


''

harms


''

harm


''

hand


''

guide


''

guidance


''

group


''

ground


''

inspired


''

function


''

instrumental


''

integrating


''

million


''

microorganisms


''

methodology


''

merits


''

mentioned


''

mention


''

memory


''

member


''

meet


''

mediation


''

millions


''

mechanical


''

meaning


''

maximization


''

mathematically


''

matched


''

massive


''

masked


''

markup


''

margins


''

manufacture


''

managing


''

meanings


''

makes


''

mind


''

minimalist


''

negatively


''

ned


''

necessitates


''

near


''

narrows


''

myristate


''

must


''

multiword


''

KeyboardInterrupt: Interrupted by user

In [41]:
corpus[0]

'Recently, many knowledge graph embedding models for knowledge graph completion have been proposed, ranging from the initial translation-based models such as TransE to recent convolutional neural network (CNN) models such as ConvE. However, these models only focus on semantic information of knowledge graph and neglect the natural graph structure information. Although graph convolutional network (GCN)-based models for knowledge graph embedding have been introduced to address this issue, they still suffer from fact incompleteness, resulting in the unconnectedness of knowledge graph. To solve this problem, we propose a novel model called deep relational graph infomax (DRGI) with mutual information (MI) maximization which takes the benefit of complete structure information and semantic information together. Specifically, the proposed DRGI consists of two encoders which are two identical adaptive relational graph attention networks (ARGATs), corresponding to catching semantic information an

In [36]:
next(iter(papers.values()))

Paper(paper_id='a1d49c5ea00831d540f027c0c009cacd2c21f3b5')

In [8]:
# Set up an API query to the Semantic Scholar API
# to get the paper with the given ID
# and print the title of the paper
# along with its abstract and the titles of the
# first 10 papers it cites


def get_paper_info(paper_id, option):
    #possible options include: id, title, abstract text, citations
    
    # Set the base URL for the Semantic Scholar API
    base_url = "https://api.semanticscholar.org/v1/paper/"

    # Set the full URL for the API query
    url = base_url + paper_id

    # Query the API
    response = requests.get(url)

    # Convert the response to JSON
    data = json.loads(response.text)

    # Print the title of the paper
    title = str(data["title"])

    # Print the abstract of the paper
    abstract = str(data["abstract"])

    # Put the first 10 paper IDs in an array
    # and if there's an error, append `None
    citations = []
    for i in range(10):
        try:
            citations.append(str(data["citations"][i]["paperid"]))
        except indexerror:
            citations.append(none)

    # return the options
    if option == "id":
        return paper_id
    elif option == "title":
        return title
    elif option == "abstract":
        return abstract
    elif option == "citations":
        return citations
    else:
        return "Invalid option"

# Testing Function
cit = get_paper_info("d4b651d6a904f69f8fa1dcad4ebe972296af3a9a", "citations")[0]

get_paper_info("d4b651d6a904f69f8fa1dcad4ebe972296af3a9a", "title")
get_paper_info(str(cit), "title")
get_paper_info(str(cit), "citations")

NameError: name 'indexerror' is not defined

In [None]:
def list_challenge_words(abstractString: str):
    # Given an abstract,
    # Returns array of strings of challenging words in the form:
    # ['Active Galactic Nuclei', 'BPT classification', 'Quenching scenarios', ...]

    

In [4]:
import nltk

In [16]:
import nltk
from nltk.corpus import wordnet

def hardest_words_synonym(text):
    # This function extracts the hardest words
    # Using the number of synonyms as a proxy
    # for the level of difficulty of the word
    
    # Tokenize the text
    tokens = nltk.word_tokenize(text)

    # Remove stop words
    stop_words = stopwords.words("english")
    filtered_words = [word for word in tokens if word.lower() not in stop_words]

    # Identify the hardest words
    hardest_words = []
    for word in filtered_words:
        synonyms = wordnet.synsets(word)
        if len(synonyms) == 1:
            hardest_words.append(word)

    return hardest_words

# Input text
text = "Nebular He ii𝜆4686Å line emission is useful to unveil active galactic nuclei (AGN) residing in actively star-forming (SF) galaxies, typically missed by the standard BPT classification. Here we adopt the He ii diagnostic to identify hidden AGN in the Local Universe using for the first time spatially-resolved data from the Data Release 15 of the Mapping Nearby Galaxies at APO survey (MaNGA DR15). By combining results from He ii and BPT diagnostics, we overall select 459 AGN host candidates (∼10% in MaNGA DR15), out of which 27 are identified as AGN by the He ii diagram only. The He ii-only AGN population is hosted by massive (M∗ & 1010 M) SF Main Sequence galaxies, and on average less luminous than the BPT-selected AGN. Given the He ii line faintness, we revisit our census accounting for incompleteness effects due to the He ii sensitivity limit of MaNGA. We thus obtain an overall increased fraction (11%) of AGN in MaNGA compared to the BPT-only census (9%), which further increases to 14% for galaxies more massive than 1010 M; interestingly, on the SF Main Sequence the increase is by about a factor of 2. A substantial number of AGN in SF galaxies points to significant, coeval star formation and black hole accretion, consistently with results from hydrodynamical simulations and with important implications on quenching scenarios. In view of exploring unprecedented high redshifts with JWST and new ground-based facilities, revisiting the standard BPT classification through novel emission-line diagnostics is fundamental to discover AGN in highly SF environments."

# Find the hardest words
hardest = hardest_words(text)

# Print the hardest words
print("Hardest words: ", hardest)

Hardest words:  ['actively', 'typically', 'diagnostics', 'hosted', 'luminous', 'revisit', 'incompleteness', 'interestingly', 'consistently', 'unprecedented', 'redshifts', 'revisiting', 'diagnostics']


In [2]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [6]:
import nltk
from nltk.corpus import stopwords
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [17]:
tokens

NameError: name 'tokens' is not defined