In [12]:
import pickle
import os
import time, random
import math
import requests
from bs4 import BeautifulSoup
from semanticscholar import SemanticScholar

In [4]:
S2_API_KEY = os.environ['S2_API_KEY']
S2_API_KEY is not None

True

In [30]:
sch = SemanticScholar(api_key=S2_API_KEY)
example = 'Stochastic Neural Networks for Hierarchical Reinforcement Learning'
results = sch.search_paper(example)
print(results[0])
print(results[0].title)
print(results[0].url)
print(results[0].citationCount)
print(results[0].keys())

{'paperId': '3deecaee4ec1a37de3cb10420eaabff067669e17', 'externalIds': {'ArXiv': '1704.03012', 'DBLP': 'journals/corr/FlorensaDA17', 'MAG': '2963286043', 'CorpusId': 7774489}, 'corpusId': 7774489, 'publicationVenue': {'id': '939c6e1d-0d17-4d6e-8a82-66d960df0e40', 'name': 'International Conference on Learning Representations', 'type': 'conference', 'alternate_names': ['Int Conf Learn Represent', 'ICLR'], 'url': 'https://iclr.cc/'}, 'url': 'https://www.semanticscholar.org/paper/3deecaee4ec1a37de3cb10420eaabff067669e17', 'title': 'Stochastic Neural Networks for Hierarchical Reinforcement Learning', 'abstract': 'Deep reinforcement learning has achieved many impressive results in recent years. However, tasks with sparse rewards or long horizons continue to pose significant challenges. To tackle these important problems, we propose a general framework that first learns useful skills in a pre-training environment, and then leverages the acquired skills for learning faster in downstream tasks.

In [18]:
# how to generate urls for citing paper
paper_url = results[0].url
first_page = paper_url + '#citing-papers'
print("1st page: ", first_page)
page_num = 2
next_page = paper_url + '?sort=relevance&page={}'.format(page_num)
print("2nd page: ", next_page)

# get last page
last_page = math.ceil(results[0].citationCount/10)
print("last_page: ", last_page)

1st page:  https://www.semanticscholar.org/paper/3deecaee4ec1a37de3cb10420eaabff067669e17#citing-papers
2nd page:  https://www.semanticscholar.org/paper/3deecaee4ec1a37de3cb10420eaabff067669e17?sort=relevance&page=2
last_page:  32


### By request without api key

In [5]:
def html_parsing_from_url(url):
    response = requests.get(url)
    print(response)
    if response.status_code == 200:
        # Parse HTML with BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup
    else:
        return None

In [9]:
url = 'https://www.semanticscholar.org/paper/Stochastic-Neural-Networks-for-Hierarchical-Florensa-Duan/3deecaee4ec1a37de3cb10420eaabff067669e17'
soup = html_parsing_from_url(url)
print(type(soup))
print(type(soup.select('link')[0]))

<Response [200]>
<class 'bs4.BeautifulSoup'>
<class 'bs4.element.Tag'>


In [8]:
#print(soup)

In [11]:
soup.select_one('link').get('href')

'https://www.semanticscholar.org/paper/Stochastic-Neural-Networks-for-Hierarchical-Florensa-Duan/3deecaee4ec1a37de3cb10420eaabff067669e17'

In [19]:
# test crawling
soup = html_parsing_from_url(first_page)
print(soup)

<Response [200]>
<!DOCTYPE html>

<html lang="en">
<!--


NOTE: We have a public API for this page with more data.
Check out https://www.semanticscholar.org/product/api for more info.


-->
<head>
<title>[PDF] Stochastic Neural Networks for Hierarchical Reinforcement Learning | Semantic Scholar</title>
<meta content="noarchive" name="robots"/>
<link href="https://www.semanticscholar.org/paper/Stochastic-Neural-Networks-for-Hierarchical-Florensa-Duan/3deecaee4ec1a37de3cb10420eaabff067669e17" rel="canonical"/>
<meta content="width=device-width,initial-scale=1" name="viewport"/>
<meta charset="utf-8"/>
<meta content="0807b5fb14218f579162fac615474c7a0386a6c8" name="s2-ui-version"/>
<meta content="This work proposes a general framework that first learns useful skills in a pre-training environment, and then leverages the acquired skills for learning faster in downstream tasks, and uses Stochastic Neural Networks combined with an information-theoretic regularizer to efficiently pre-train a la

In [20]:
# get paper titles
soup.select('h3.cl-paper-title')

[<h3 aria-describedby="" class="cl-paper-title" id="paper-2fed116dea9c36914b52b55e0f9688ccf641ee07" tabindex="">Sub-policy Adaptation for Hierarchical Reinforcement Learning</h3>,
 <h3 aria-describedby="" class="cl-paper-title" id="paper-ba0bf2bae46a97a7615af0a74356d293db1bc23b" tabindex="">Hierarchical Reinforcement Learning with Advantage-Based Auxiliary Rewards</h3>,
 <h3 aria-describedby="" class="cl-paper-title" id="paper-81a5864c21bf8e4018ac9004d618ccb99e261965" tabindex="">Efficient hindsight reinforcement learning using demonstrations for robotic tasks with sparse rewards</h3>,
 <h3 aria-describedby="" class="cl-paper-title" id="paper-72e87d27e8b3493981daca533b3956fae8b4f316" tabindex="">Learning Robot Skill Embeddings</h3>,
 <h3 aria-describedby="" class="cl-paper-title" id="paper-98b41528c58e6f5b7b28be5b54029e52ca90c4ab" tabindex="">Learning to Learn: Hierarchical Meta-Critic Networks</h3>,
 <h3 aria-describedby="" class="cl-paper-title" id="paper-46130875c8c2d89ea23dfb29c378

In [25]:
# filter only citing papers (first 10 papers) 
soup.select('h3.cl-paper-title')[:10]

[<h3 aria-describedby="" class="cl-paper-title" id="paper-2fed116dea9c36914b52b55e0f9688ccf641ee07" tabindex="">Sub-policy Adaptation for Hierarchical Reinforcement Learning</h3>,
 <h3 aria-describedby="" class="cl-paper-title" id="paper-ba0bf2bae46a97a7615af0a74356d293db1bc23b" tabindex="">Hierarchical Reinforcement Learning with Advantage-Based Auxiliary Rewards</h3>,
 <h3 aria-describedby="" class="cl-paper-title" id="paper-81a5864c21bf8e4018ac9004d618ccb99e261965" tabindex="">Efficient hindsight reinforcement learning using demonstrations for robotic tasks with sparse rewards</h3>,
 <h3 aria-describedby="" class="cl-paper-title" id="paper-72e87d27e8b3493981daca533b3956fae8b4f316" tabindex="">Learning Robot Skill Embeddings</h3>,
 <h3 aria-describedby="" class="cl-paper-title" id="paper-98b41528c58e6f5b7b28be5b54029e52ca90c4ab" tabindex="">Learning to Learn: Hierarchical Meta-Critic Networks</h3>,
 <h3 aria-describedby="" class="cl-paper-title" id="paper-46130875c8c2d89ea23dfb29c378

In [26]:
# get paper title
soup.select('h3.cl-paper-title')[:10][0].text

'Sub-policy Adaptation for Hierarchical Reinforcement Learning'

In [27]:
citation_page = {}

In [31]:
for page in range(1, last_page+1):
    #time.sleep(5)
    #print(page)
    if page == 1:
        url = first_page
    else:
        next_page = paper_url + '?sort=relevance&page={}'.format(page)
        url = next_page
    print(url)
    if page in citation_page.keys():
        print('already exists')
        continue
    else:
        try:
            soup = html_parsing_from_url(url)
            item = soup.select('h3.cl-paper-title')[:10]
            print(item)
            citation_page[page] = item
        except AttributeError as e:
            print(e)
            break


https://www.semanticscholar.org/paper/3deecaee4ec1a37de3cb10420eaabff067669e17#citing-papers
already exists
https://www.semanticscholar.org/paper/3deecaee4ec1a37de3cb10420eaabff067669e17?sort=relevance&page=2
already exists
https://www.semanticscholar.org/paper/3deecaee4ec1a37de3cb10420eaabff067669e17?sort=relevance&page=3
already exists
https://www.semanticscholar.org/paper/3deecaee4ec1a37de3cb10420eaabff067669e17?sort=relevance&page=4
already exists
https://www.semanticscholar.org/paper/3deecaee4ec1a37de3cb10420eaabff067669e17?sort=relevance&page=5
<Response [202]>
'NoneType' object has no attribute 'select'


## By request with api key
https://api.semanticscholar.org/api-docs/graph#tag/Paper-Data/operation/get_graph_get_paper_citations

https://dgkim5360.tistory.com/entry/python-requests

In [95]:
import json
headers = {'x-api-key': S2_API_KEY}
r = requests.post(
    'https://api.semanticscholar.org/graph/v1/paper/batch',
    headers=headers,
    params={'fields': 'referenceCount,citationCount,title,year,citations,references'},
    json={"ids": ["649def34f8be52c8b66281af98ae884c09aef38b"]}
)
print(json.dumps(r.json(), indent=2))

[
  {
    "paperId": "649def34f8be52c8b66281af98ae884c09aef38b",
    "title": "Construction of the Literature Graph in Semantic Scholar",
    "year": 2018,
    "referenceCount": 27,
    "citationCount": 310,
    "citations": [
      {
        "paperId": "382301f0a9a85c298c6ec51ba4434ba040db960c",
        "title": "Accelerating science with human-aware artificial intelligence"
      },
      {
        "paperId": "116145af7e956b23d0d9a81fdad8cb4f9a166531",
        "title": "SEARCHFORMER: Semantic patent embeddings by siamese transformers for prior art search"
      },
      {
        "paperId": "ee2db2936524122cdeb4755dc4a8f12933f422af",
        "title": "Pre-training Multi-task Contrastive Learning Models for Scientific Literature Understanding"
      },
      {
        "paperId": "9f137af4f4cfc6bd6ad5512595db13f4d7a7aa73",
        "title": "Beyond Good Intentions: Reporting the Research Landscape of NLP for Social Good"
      },
      {
        "paperId": "7bbd44156ec3fc2549c43cf2352a7

In [96]:
len(r.json()[0]['citations']), len(r.json()[0]['references'])

(310, 27)