In [2]:
import requests
import json
from dataclasses import dataclass
from typing import List

In [31]:
@dataclass
class Paper:
    BASE_URL = "https://api.semanticscholar.org/v1/paper/"
    CITATION_LIMIT = 10
    paper_id: str
    
    def __post_init__(self):
        self.url = f'{self.BASE_URL}{self.paper_id}'
        self.paper = requests.get(self.url).json()
    
    @property
    def id(self) -> str:
        return self.paper_id
    
    @property
    def title(self) -> str:
        return self.paper['title']
        
    @property
    def abstract(self) -> str:
        return self.paper['abstract']
    
    @property
    def citations(self) -> List[str]:
        citations = []
        for c in self.paper['citations'][:self.CITATION_LIMIT]:
            citations.append(c["paperId"])

        return citations

In [32]:
p = Paper("d4b651d6a904f69f8fa1dcad4ebe972296af3a9a")

In [33]:
p.id
p.title
p.abstract
p.citations

'd4b651d6a904f69f8fa1dcad4ebe972296af3a9a'

'Identifying Relations for Open Information Extraction'

"Open Information Extraction (IE) is the task of extracting assertions from massive corpora without requiring a pre-specified vocabulary. This paper shows that the output of state-of-the-art Open IE systems is rife with uninformative and incoherent extractions. To overcome these problems, we introduce two simple syntactic and lexical constraints on binary relations expressed by verbs. We implemented the constraints in the ReVerb Open IE system, which more than doubles the area under the precision-recall curve relative to previous extractors such as TextRunner and woepos. More than 30% of ReVerb's extractions are at precision 0.8 or higher---compared to virtually none for earlier systems. The paper concludes with a detailed analysis of ReVerb's errors, suggesting directions for future work."

['a1d49c5ea00831d540f027c0c009cacd2c21f3b5',
 '9ea5874d261359e287eabb735de38a8edba1e091',
 '032244fb8ff881f4f12345e9afc7ea5627952f4a',
 '279cc657655eeb4e96a2eaf3d77f708edbf6a313',
 '47a541269d4ef70f37f0d3a57483312c4c6c2ad5',
 'd582909be7ad3ca80fcfca3e1d9ced2e60966db2',
 '28fdb929d1c4f87bbb9cc0b5bb880567e3c50429',
 '1f872354e0cfde91e86e68b35d89a6d447f48936',
 'bdb32ea23986f6dfe436c5dba0d13e95dea07c92',
 'fda21913e8d889a84677f96231a145ecf2c206a2']

In [34]:
papers = {}

queue = []
while True:
    for s in p.citations:
        if s not in papers:
            papers[s] = Paper(s)
            queue.append(papers[s])
            
    if queue:
        p = queue.pop()
    else:
        break
    if len(papers) == 1000:
        break
    

In [36]:
next(iter(papers.values()))

Paper(paper_id='a1d49c5ea00831d540f027c0c009cacd2c21f3b5')

In [8]:
# Set up an API query to the Semantic Scholar API
# to get the paper with the given ID
# and print the title of the paper
# along with its abstract and the titles of the
# first 10 papers it cites


def get_paper_info(paper_id, option):
    #possible options include: id, title, abstract text, citations
    
    # Set the base URL for the Semantic Scholar API
    base_url = "https://api.semanticscholar.org/v1/paper/"

    # Set the full URL for the API query
    url = base_url + paper_id

    # Query the API
    response = requests.get(url)

    # Convert the response to JSON
    data = json.loads(response.text)

    # Print the title of the paper
    title = str(data["title"])

    # Print the abstract of the paper
    abstract = str(data["abstract"])

    # Put the first 10 paper IDs in an array
    # and if there's an error, append `None
    citations = []
    for i in range(10):
        try:
            citations.append(str(data["citations"][i]["paperid"]))
        except indexerror:
            citations.append(none)

    # return the options
    if option == "id":
        return paper_id
    elif option == "title":
        return title
    elif option == "abstract":
        return abstract
    elif option == "citations":
        return citations
    else:
        return "Invalid option"

# Testing Function
cit = get_paper_info("d4b651d6a904f69f8fa1dcad4ebe972296af3a9a", "citations")[0]

get_paper_info("d4b651d6a904f69f8fa1dcad4ebe972296af3a9a", "title")
get_paper_info(str(cit), "title")
get_paper_info(str(cit), "citations")

NameError: name 'indexerror' is not defined

In [None]:
def list_challenge_words(abstractString: str):
    # Given an abstract,
    # Returns array of strings of challenging words in the form:
    # ['Active Galactic Nuclei', 'BPT classification', 'Quenching scenarios', ...]

    

In [4]:
import nltk

In [16]:
import nltk
from nltk.corpus import wordnet

def hardest_words_synonym(text):
    # This function extracts the hardest words
    # Using the number of synonyms as a proxy
    # for the level of difficulty of the word
    
    # Tokenize the text
    tokens = nltk.word_tokenize(text)

    # Remove stop words
    stop_words = stopwords.words("english")
    filtered_words = [word for word in tokens if word.lower() not in stop_words]

    # Identify the hardest words
    hardest_words = []
    for word in filtered_words:
        synonyms = wordnet.synsets(word)
        if len(synonyms) == 1:
            hardest_words.append(word)

    return hardest_words

# Input text
text = "Nebular He ii𝜆4686Å line emission is useful to unveil active galactic nuclei (AGN) residing in actively star-forming (SF) galaxies, typically missed by the standard BPT classification. Here we adopt the He ii diagnostic to identify hidden AGN in the Local Universe using for the first time spatially-resolved data from the Data Release 15 of the Mapping Nearby Galaxies at APO survey (MaNGA DR15). By combining results from He ii and BPT diagnostics, we overall select 459 AGN host candidates (∼10% in MaNGA DR15), out of which 27 are identified as AGN by the He ii diagram only. The He ii-only AGN population is hosted by massive (M∗ & 1010 M) SF Main Sequence galaxies, and on average less luminous than the BPT-selected AGN. Given the He ii line faintness, we revisit our census accounting for incompleteness effects due to the He ii sensitivity limit of MaNGA. We thus obtain an overall increased fraction (11%) of AGN in MaNGA compared to the BPT-only census (9%), which further increases to 14% for galaxies more massive than 1010 M; interestingly, on the SF Main Sequence the increase is by about a factor of 2. A substantial number of AGN in SF galaxies points to significant, coeval star formation and black hole accretion, consistently with results from hydrodynamical simulations and with important implications on quenching scenarios. In view of exploring unprecedented high redshifts with JWST and new ground-based facilities, revisiting the standard BPT classification through novel emission-line diagnostics is fundamental to discover AGN in highly SF environments."

# Find the hardest words
hardest = hardest_words(text)

# Print the hardest words
print("Hardest words: ", hardest)

Hardest words:  ['actively', 'typically', 'diagnostics', 'hosted', 'luminous', 'revisit', 'incompleteness', 'interestingly', 'consistently', 'unprecedented', 'redshifts', 'revisiting', 'diagnostics']


In [2]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [6]:
import nltk
from nltk.corpus import stopwords
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [17]:
tokens

NameError: name 'tokens' is not defined