# IEEE Xplore Scraper
Here it is about the scraping commands in general, integration will follow seperately.

In [350]:
# Example urls
paper1 = 'https://ieeexplore.ieee.org/document/7887648'
paper2 = 'https://ieeexplore.ieee.org/document/6324427'
conference1 = 'https://ieeexplore.ieee.org/document/9378426'
conference2 = 'https://ieeexplore.ieee.org/document/5694044'
ieee_links = [paper1, paper2, conference1, conference2]

In [351]:
import re
import time
import json

import cloudscraper
import requests
from bs4 import BeautifulSoup
from helium import *
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

## Accessing Soup and JSON Data
Our primary goal is to extract the meta data json file from the HTML of the IEEE Xplore page. We will use the `requests` library for this. For more complex scraping we can load  the page with Helium, which is a wrapper for Selenium (like already used in the ScienceDirect file).


In [352]:
def get_bs(url):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.5 Safari/605.1.15'}
        r = requests.get(url, headers=headers)
        print(r.status_code)
        bs = BeautifulSoup(r.text, 'html.parser')
    except:
        print('Error: ', url)
        return None
    return bs

Create soups of test publications

In [353]:
soup_paper1 = get_bs(paper1)
soup_paper2 = get_bs(paper2)
soup_conference1 = get_bs(conference1)
soup_conference2 = get_bs(conference2)
ieee_soups = [soup_paper1, soup_paper2, soup_conference1, soup_conference2]
soup_paper1

200
200
200
200


<!DOCTYPE html>

<script type="text/javascript">

var home = {	
			metadata:{
				searchCount: '5,749,604',
				logoRelPath: '/customer_logos',
				thirdParthAuth: false,
				currentPage:  'document',
				xploreVirtual:'https://ieeexplore.ieee.org',
				isWebAccount: false,
				isProvisioned: false,
				globalNotification:{},
				cart: {
						count: 0
				}
			}						
		};
		





		
</script>
<html lang="en-US">
<head>
<meta content="Astrophysics and cosmology are rich with data. The advent of wide-area digital cameras on large aperture telescopes has led to ever more ambitious surveys of th" id="meta-description" name="Description"/>
<link href="https://ieeexplore.ieee.org/document/7887648" rel="canonical"/>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type">
<!-- Disable "click" touch event 300ms delay for Chrome/Firefox on Android -->
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<title>Big Universe, Big Data: Machine Learning and Image An

The json object can be found: xplGlobal.document.metadata={...} in the HTML. We can use a regular expression to extract it.

In [354]:
def get_json_data(url):
    """
    Extracts the json object with meta data from the page and parses it to a dict.
    :param url: URL of th publication
    :return: Dict with content of the JSON object
    """
    # extract the line of the text containing the json object
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.5 Safari/605.1.15'}
    try:
        r = requests.get(url, headers=headers)
        print(r.status_code)
        if r.status_code != 200:
            raise Exception('Connection not successful - Status code not 200')
    except Exception as e:
        print('Error: ', e)
        print('Error: ', url)
        return None
    html_text = r.text
    json_data_regex_pattern = re.compile(r'xplGlobal.document.metadata=.*};')
    json_regex_matches = re.findall(json_data_regex_pattern, html_text)

    assert len(json_regex_matches) == 1

    json_string = json_regex_matches[0]
    json_string = json_string.removeprefix('xplGlobal.document.metadata=')  # remove JS prefix
    # delete backslashes from json_result stjson_string
    #json_result = json_result.replace('\\', '')
    # remove last character of json_result
    json_string = json_string.removesuffix(';')  # remove semicolon at the end
    json_parsed = json.loads(json_string)
    assert (isinstance(json_parsed, dict))
    return json_parsed

Test the function

In [355]:
json_paper1 = get_json_data(paper1)
json_paper2 = get_json_data(paper2)
json_conference1 = get_json_data(conference1)
json_conference2 = get_json_data(conference2)

ieee_jsons = [json_paper1, json_paper2, json_conference1, json_conference2]

200
200
200
200


In [356]:
# Quick check
for json_data in ieee_jsons:
    print(json_data.get('title'))

Big Universe, Big Data: Machine Learning and Image Analysis for Astronomy
Bridging the Gap Between Manufacturing and Service Through IT-Based Boundary Objects
Approximate Nearest-Neighbour Fields via Massively-Parallel Propagation-Assisted K-D Trees
Resilient K-d Trees: K-Means in Space Revisited


In [357]:
# testing
def test_function_on_publication_links(function):
    for x in ieee_links:
        print(function(x))


def test_function_on_publication_soups(function):
    for x in ieee_soups:
        function(x)


def test_function_on_publication_jsons(function):
    print(f'Function: {function.__name__} \n')
    for idx, x in enumerate(ieee_jsons):
        print(f'Publication No. {idx + 1}:')
        print(function(x), '\n')


## Main fields
### Authors


In [359]:
def get_authors(json_data):
    """
    Extracts the authors from the json object.
    :param json_data: JSON object with meta data
    :return: List of dicts with author name and their ieee id
    """
    authors_raw = json_data.get('authors')
    authors = []
    for el in authors_raw:
        author_name = el.get('name')
        author_id = el.get('id')
        authors.append({
            'name': author_name,
            'id_ieee': author_id
        })
    return authors

In [360]:
test_function_on_publication_jsons(get_authors)

Function: get_authors 

Publication No. 1:
[{'name': 'Jan Kremer', 'id_ieee': '37086104276'}, {'name': 'Kristoffer Stensbo-Smidt', 'id_ieee': '37086102090'}, {'name': 'Fabian Gieseke', 'id_ieee': '37857080600'}, {'name': 'Kim Steenstrup Pedersen', 'id_ieee': '37837129200'}, {'name': 'Christian Igel', 'id_ieee': '37281518600'}] 

Publication No. 2:
[{'name': 'Jörg Becker', 'id_ieee': '37361144400'}, {'name': 'Daniel Beverungen', 'id_ieee': '37659396600'}, {'name': 'Ralf Knackstedt', 'id_ieee': '37659396300'}, {'name': 'Martin Matzner', 'id_ieee': '37659394200'}, {'name': 'Oliver Müller', 'id_ieee': '37425634600'}, {'name': 'Jens Pöppelbuß', 'id_ieee': '38230416800'}] 

Publication No. 3:
[{'name': 'Cosmin Eugen Oancea', 'id_ieee': '37706606200'}, {'name': 'Ties Robroek', 'id_ieee': '37088809064'}, {'name': 'Fabian Gieseke', 'id_ieee': '37857080600'}] 

Publication No. 4:
[{'name': 'Fabian Gieseke', 'id_ieee': '37857080600'}, {'name': 'Gabriel Moruz', 'id_ieee': '37992782800'}, {'name': 

### Title

In [361]:
def get_title(json_data):
    """
    Extracts the titleof the publication.
    :param json_data: JSON object with meta data
    :return: String
    """
    return json_data.get('title')

In [362]:
test_function_on_publication_jsons(get_title)

Function: get_title 

Publication No. 1:
Big Universe, Big Data: Machine Learning and Image Analysis for Astronomy 

Publication No. 2:
Bridging the Gap Between Manufacturing and Service Through IT-Based Boundary Objects 

Publication No. 3:
Approximate Nearest-Neighbour Fields via Massively-Parallel Propagation-Assisted K-D Trees 

Publication No. 4:
Resilient K-d Trees: K-Means in Space Revisited 



### Keywords

In [363]:
def get_keywords(json_data):
    """
    Extracts the keywords from the json object. \n
    There are multiple types of keywords merged. Can de split when demanded.
    :param json_data: JSON object with meta data
    :return: List of strings
    """
    keywords_raw = json_data.get('keywords')
    keywords = []
    for el in keywords_raw:
        keywords.extend(el.get('kwd'))
    return keywords

In [364]:
test_function_on_publication_jsons(get_keywords)

Function: get_keywords 

Publication No. 1:
['Extraterrestrial measurements', 'Telescopes', 'Image analysis', 'Extrasolar planets', 'Big data', 'Astronomy', 'Computer vision', 'Machine learning', 'astronomical image processing', 'Big Data', 'learning (artificial intelligence)', 'Big Data', 'machine learning', 'image analysis', 'astronomy', 'digital cameras', 'aperture telescopes', 'image analysis algorithms', 'computer science research', 'data analysis', 'cosmology', 'astrophysics', 'big data', 'astronomy', 'machine learning', 'computer vision', 'intelligent systems'] 

Publication No. 2:
['Communities', 'Recycling', 'Manufacturing', 'Companies', 'Information management', 'Analytical models', 'manufacturing industries', 'service industries', 'manufacturing companies', 'service companies', 'IT-based boundary objects', 'integrated solution', 'complementary resources', 'supply chain management', 'business process management', 'systematic identification', 'service blueprinting', 'boundary-

### Abstract

In [365]:
def get_abstract(json_data):
    """
    Extracts the abstract from the publicatoin.
    :param json_data: JSON object with meta data
    :return: String
    """
    return json_data.get('abstract')

In [366]:
test_function_on_publication_jsons(get_abstract)

Function: get_abstract 

Publication No. 1:
Astrophysics and cosmology are rich with data. The advent of wide-area digital cameras on large aperture telescopes has led to ever more ambitious surveys of the sky. Data volumes of entire surveys a decade ago can now be acquired in a single night, and real-time analysis is often desired. Thus, modern astronomy requires big data know-how, in particular, highly efficient machine learning and image analysis algorithms. But scalability isn't the only challenge: astronomy applications touch several current machine learning research questions, such as learning from biased data and dealing with label and measurement noise. The authors argue that this makes astronomy a great domain for computer science research, as it pushes the boundaries of data analysis. They focus here on exemplary results, discuss main challenges, and highlight some recent methodological advancements in machine learning and image analysis triggered by astronomical applications

### Publisher

In [367]:
def get_publisher(json_data):
    """
    Extracts the publisher from the publication json data.
    :param json_data: JSON object with meta data
    :return: String
    """
    return json_data.get('publisher')

In [368]:
test_function_on_publication_jsons(get_publisher)

Function: get_publisher 

Publication No. 1:
IEEE 

Publication No. 2:
IEEE 

Publication No. 3:
IEEE 

Publication No. 4:
IEEE 



### Publication year

In [369]:
def get_year(json_data):
    """
    Extracts the publication year from the publication json data.
    :param json_data: JSON object with meta data
    :return: String
    """
    return json_data.get('publicationYear')

In [370]:
test_function_on_publication_jsons(get_year)

Function: get_year 

Publication No. 1:
2017 

Publication No. 2:
2013 

Publication No. 3:
2020 

Publication No. 4:
2010 



### Page ranges

In [371]:
def get_start_page(json_data):
    """
    Extracts the start page from the publication json data.
    :param json_data: JSON object with meta data
    :return: String
    """
    return json_data.get('startPage')

In [372]:
test_function_on_publication_jsons(get_start_page)

Function: get_start_page 

Publication No. 1:
16 

Publication No. 2:
468 

Publication No. 3:
5172 

Publication No. 4:
815 



In [373]:
def get_end_page(json_data):
    """
    Extracts the end page from the publication json data.
    :param json_data: JSON object with meta data
    :return: String
    """
    return json_data.get('endPage')

In [374]:
test_function_on_publication_jsons(get_end_page)

Function: get_end_page 

Publication No. 1:
22 

Publication No. 2:
482 

Publication No. 3:
5181 

Publication No. 4:
820 



### Publication type

In [375]:
def get_publication_type(json_data):
    """
    Extracts the publication type from the publication json data.
    :param json_data: JSON object with meta data
    :return: String
    """
    return json_data.get('xploreDocumentType')

In [376]:
test_function_on_publication_jsons(get_publication_type)

Function: get_publication_type 

Publication No. 1:
Journals & Magazine 

Publication No. 2:
Journals & Magazine 

Publication No. 3:
Conference Publication 

Publication No. 4:
Conference Publication 



### Full text
### Omitted

### References

In [377]:
def fetch_reference_data_json(json_data):
    """
    Fetches the reference data from IEEE (Rest API) and returns the the dict of the json response.
    :param json_data: JSON meta data object
    :return: Dict
    """
    id = get_publication_id(json_data)
    url = f'https://ieeexplore.ieee.org/rest/document/{id}/references'
    payload = ''
    headers = {
        'Referer': f'https://ieeexplore.ieee.org/document/{id}'
    }
    response = requests.request("GET", url, data=payload, headers=headers)
    reference_data = json.loads(response.text).get('references')
    return reference_data


In [378]:
def get_references(json_data):
    """
    Extracts the references of a publication
    :param json_data: JSON meta data object
    :return: Dict
    """
    try:
        ref_raw = fetch_reference_data_json(json_data)
    except:
        return None
    references = []
    for ref in ref_raw:
        reference_text = None
        doi_link = None
        google_scholar_link = None
        reference_text = ref.get('text')
        if ref.get('links') is not None:
            doi_link = ref.get('links').get('crossRefLink')
            google_scholar_link = ref.get('links').get('googleScholarLink')
        references.append({
            'text': reference_text,
            'doi_link': doi_link,
            'google_scholar_link': google_scholar_link
        })
    return references

In [379]:
for data in ieee_jsons:
    print('###########################################################################')
    print(f'Publikation: {get_title(data)}')
    print('Referenzen: \n')
    if get_references(data) is not None:
        for idx, ref in enumerate(get_references(data)):
            print(f'Quelle {idx} : {ref.get("text")}')
            print(f' > DOI link: {ref.get("doi_link")}')
            print(f' > google_scholar_link: {ref.get("google_scholar_link")}')
    print('\n \n \n')


###########################################################################
Publikation: Big Universe, Big Data: Machine Learning and Image Analysis for Astronomy
Referenzen: 

Quelle 0 : D.J. Mortlock et al., "A Luminous Quasar at a Redshift of z = 7.085", <em>Nature</em>, vol. 474, no. 7353, pp. 616-619, 2011.
 > DOI link: https://doi.org/10.1038/nature10159
 > google_scholar_link: None
Quelle 1 : A.A. Collister and O. Lahav, "ANNz: Estimating Photometric Redshifts Using Artificial Neural Networks", <em>Publications of the Astronomical Society of the Pacific</em>, vol. 116, no. 818, pp. 345, 2004.
 > DOI link: https://doi.org/10.1086/383254
 > google_scholar_link: None
Quelle 2 : C.J. Lintott et al., "Galaxy Zoo: Morphologies Derived from Visual Inspection of Galaxies from the Sloan Digital Sky Survey", <em>Monthly Notices of the Royal Astronomical Soc.</em>, vol. 389, pp. 1179-1189, 2008.
 > DOI link: https://doi.org/10.1111/j.1365-2966.2008.13689.x
 > google_scholar_link: None
Quel

## Journal fields
### Journal name

In [267]:
def get_journal_conference_name(json_data):
    """
    Extracts the journal/conference publication name from the publication json data.
    :param json_data: JSON object with meta data
    :return: String
    """
    return json_data.get('publicationTitle')

In [269]:
test_function_on_publication_jsons(get_journal_conference_name)

Function: get_journal_conference_name 

Publication No. 1:
IEEE Intelligent Systems 

Publication No. 2:
IEEE Transactions on Engineering Management 

Publication No. 3:
2020 IEEE International Conference on Big Data (Big Data) 

Publication No. 4:
2010 IEEE International Conference on Data Mining 



In [270]:
def get_journal_volume(json_data):
    """
    Extracts the journal volume from the publication json data.
    :param json_data: JSON object with meta data
    :return: String
    """
    return json_data.get('volume')

In [271]:
test_function_on_publication_jsons(get_journal_volume)

Function: get_journal_volume 

Publication No. 1:
32 

Publication No. 2:
60 

Publication No. 3:
None 

Publication No. 4:
None 



### Journal issue

In [272]:
def get_journal_issue(json_data):
    """
    Extracts the journal issue from the publication json data.
    :param json_data: JSON object with meta data
    :return: String
    """
    return json_data.get('issue')

In [273]:
test_function_on_publication_jsons(get_journal_issue)

Function: get_journal_issue 

Publication No. 1:
2 

Publication No. 2:
3 

Publication No. 3:
None 

Publication No. 4:
None 



## Conference fields
### Conference name
c.f. above
[ ... ]
### Venue / Location

In [275]:
def get_conference_location(json_data):
    """
    Extracts the conference location from the publication json data.
    :param json_data: JSON object with meta data
    :return: String
    """
    return json_data.get('confLoc')

In [276]:
test_function_on_publication_jsons(get_conference_location)

Function: get_conference_location 

Publication No. 1:
None 

Publication No. 2:
None 

Publication No. 3:
Atlanta, GA, USA 

Publication No. 4:
Sydney, NSW, Australia 



## Special IEEE fields
### Publication id

In [278]:
def get_publication_id(json_data):
    """
    Extracts the publication id from the publication json data.
    :param json_data: JSON object with meta data
    :return: String
    """
    return json_data.get('articleNumber')

In [280]:
test_function_on_publication_jsons(get_publication_id)

Function: get_publication_id 

Publication No. 1:
7887648 

Publication No. 2:
6324427 

Publication No. 3:
9378426 

Publication No. 4:
5694044 



### # of citations

In [386]:
def get_amount_citations(json_data):
    """
    Extracts the amount of citations from the publication json data.
    :param json_data: JSON object with meta data
    :return: String
    """
    metrics = json_data.get('metrics')
    if metrics is not None:
        return metrics.get('citationCountPaper')

In [387]:
test_function_on_publication_jsons(get_amount_citations)

Function: get_amount_citations 

Publication No. 1:
48 

Publication No. 2:
26 

Publication No. 3:
0 

Publication No. 4:
2 

