In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
import chromedriver_binary
from bs4 import BeautifulSoup
from credential import *
import json
import time
import pandas as pd
import re
import requests
import os
from IPython.display import display, HTML, Image
from tqdm.auto import tqdm


# HTTP headers for the plain `requests` downloads made outside the browser session.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.60",
}

# Headless Chrome session shared by every cell below.
options = webdriver.ChromeOptions()
options.add_argument("--headless=new")

driver = webdriver.Chrome(options=options)
# Fixed window size so pages lay out the same way on every run.
driver.set_window_size(width=1920, height=1080)


def maintext_formatter(article):
    """Return the article's text with separators inserted after selected tags.

    A suffix is appended to every matching element before the text is read:
    a space after ``div.kicker`` and ``p`` elements, and ``": "`` after ``h2``
    and ``h3`` headings. Leading spaces are then removed from every line and
    the whole result is stripped.

    Args:
        article: An element exposing ``find_all`` and ``text`` (a
            BeautifulSoup tag in this notebook). It is modified in place.

    Returns:
        The formatted text as a string.
    """
    # (arguments for find_all, suffix appended to each match), applied in order.
    suffix_rules = (
        (("div", {"class": "kicker"}), " "),
        (("h2",), ": "),
        (("h3",), ": "),
        (("p",), " "),
    )
    for selector, suffix in suffix_rules:
        for node in article.find_all(*selector):
            node.append(suffix)

    left_trimmed = re.sub(r"^ +", "", article.text, flags=re.MULTILINE)
    return left_trimmed.strip()

In [2]:
# Published Google Sheets CSV export that holds one metadata row per document.
metadata_url = "https://docs.google.com/spreadsheets/d/e/2PACX-1vQ6zCX7UhgM9Q7O9gp8IaAFwOfotCT7_jSbJc8ab0IY-Pmnf1fNOpVwnAG-Jn_WVdP1XKRHntDxHIBU/pub?gid=101839528&single=true&output=csv"

df_meta = pd.read_csv(filepath_or_buffer=metadata_url)

In [3]:
# Select the DOIs of rows flagged isPaper and hasHtml, whose platform is "IEEE",
# and which are not flagged needReview.
eligible_mask = (
    df_meta["isPaper"]
    & df_meta["hasHtml"]
    & df_meta["platform"].eq("IEEE")
    & ~df_meta["needReview"]
)
ieee_list = df_meta.loc[eligible_mask, "doi"].tolist()
# ieee_list

In [5]:
# (message, screenshot PNG bytes) for every paper that could not be scraped.
errors = []
# DOIs scraped successfully during this run of the cell (reset on every run).
processed = []

# Root folder for the per-paper output directories.
OUTPUT_ROOT = "ieee-10.1109"


def _save_figure_assets(block, paper_dir, strip_selectors):
    """Save the image and caption of one figure/table block, then prune it.

    Args:
        block: The figure or table ``div`` element from the parsed article.
        paper_dir: Directory in which the files for this paper are written.
        strip_selectors: ``(tag name, attrs)`` pairs; the first match of each
            inside ``block`` is removed so it is absent from the text that is
            extracted from the article afterwards.
    """
    img_url = "https://ieeexplore.ieee.org" + block.find("a")["href"]
    img_name = img_url.split("/")[-1]
    imgb = requests.get(img_url, headers=headers).content
    with open(f'{paper_dir}/{block["id"]}-{img_name}', "wb") as f:
        f.write(imgb)

    fig_caption = block.find("div", {"class": "figcaption"}).text
    # Explicit UTF-8: captions may contain characters that the platform's
    # default encoding cannot represent.
    with open(f'{paper_dir}/{block["id"]}-caption.txt', "w", encoding="utf-8") as f:
        f.write(fig_caption)

    for tag_name, attrs in strip_selectors:
        node = block.find(tag_name, attrs)
        # A missing optional node must not abort the whole scrape.
        if node is not None:
            node.extract()


for doi in tqdm(ieee_list):
    # Skip DOIs that occur more than once in the list.
    if doi in processed:
        continue

    paper_id = doi.split("/")[-1]
    paper_dir = f"{OUTPUT_ROOT}/{paper_id}"
    os.makedirs(paper_dir, exist_ok=True)
    url = f"https://doi-org.proxy.lib.ohio-state.edu/{doi}"

    driver.get(url)
    try:
        # Wait until either the proxy login page or an IEEE page has loaded.
        WebDriverWait(driver, timeout=10).until(
            lambda d: "Login Required" in d.page_source or "IEEE" in d.page_source
        )
    except Exception:
        # `Exception` rather than a bare except, so Ctrl-C still stops the run.
        errors.append((f"{url}: Invalid URL", driver.get_screenshot_as_png()))
        continue

    if "Login Required" in driver.page_source:
        print("OSU proxy login required.")
        # osu_username / osu_passwd are not defined in this cell; presumably
        # they come from `from credential import *` -- confirm.
        username_element = driver.find_element(By.ID, "username")
        password_element = driver.find_element(By.ID, "password")
        username_element.send_keys(osu_username)
        password_element.send_keys(osu_passwd)
        login_button = driver.find_element(By.ID, "submit")
        login_button.click()
        # Printed right after the click; the wait below is what actually
        # confirms that the article page was reached.
        print("OSU proxy login success.")

    try:
        WebDriverWait(driver, timeout=10).until(
            lambda d: d.find_element(By.ID, "article").is_displayed()
        )
    except Exception:
        errors.append((f"{url}: Article not found", driver.get_screenshot_as_png()))
        continue

    soup = BeautifulSoup(driver.page_source, "html.parser")

    # The page embeds its metadata as a JSON object assigned in a script.
    meta_match = re.search(r"xplGlobal\.document\.metadata ?= ?({.*?});", str(soup))
    if not meta_match:
        # Fixed: the message and screenshot are now passed as ONE tuple. The
        # original passed two arguments to list.append(), raising TypeError.
        errors.append((f"{url}: No metadata found", driver.get_screenshot_as_png()))
        continue

    meta = json.loads(meta_match.group(1))
    with open(f"{paper_dir}/meta.json", "w", encoding="utf-8") as f:
        json.dump(meta, f, indent=4)
    abstract = meta["abstract"]
    with open(f"{paper_dir}/abstract.txt", "w", encoding="utf-8") as f:
        f.write(abstract)

    article = soup.find("div", {"id": "article"})

    # Figures: save image + caption, then drop the image wrapper and links
    # paragraph so they are not part of the main text written below.
    for fig in article.find_all("div", {"class": "figure figure-full"}):
        _save_figure_assets(
            fig,
            paper_dir,
            strip_selectors=(
                ("div", {"class": "img-wrap"}),
                ("p", {"class": "links"}),
            ),
        )

    # Tables are handled like figures, but only the image wrapper is dropped.
    for tab in article.find_all("div", {"class": "figure figure-full table"}):
        _save_figure_assets(
            tab,
            paper_dir,
            strip_selectors=(("div", {"class": "img-wrap"}),),
        )

    with open(f"{paper_dir}/maintext.txt", "w", encoding="utf-8") as f:
        f.write(maintext_formatter(article))

    processed.append(doi)

# Summary of the run, followed by each failure message with its screenshot.
print(f"Scraped {len(processed)} papers")
print(f"Errors: {len(errors)}")
for message, screenshot_png in errors:
    print(message)
    display(Image(screenshot_png))

  0%|          | 0/109 [00:00<?, ?it/s]

Scraped 105 papers
Errors: 0


In [17]:
# Ad-hoc check: screenshot the element with id "table1" on whatever page
# `driver` currently has loaded (this cell does not navigate anywhere itself).
element = driver.find_element(By.ID, "table1")
# Scroll the element into view before capturing it.
driver.execute_script("arguments[0].scrollIntoView(true);", element)
# Writes element.png (relative path); the call's return value is the cell output.
element.screenshot("element.png")

True