# RSNA Diagnosis Quiz Crawler
##### Dependencies
- tqdm
- selenium
- undetected_chromedriver

### Import

In [1]:
import re
import csv
import time
from tqdm import tqdm
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
import undetected_chromedriver as uc

### Initiate undetected_chromedriver

In [2]:
# Create the Chrome WebDriver instance from undetected_chromedriver (`uc`).
# The crawling cells below all drive the browser through this `driver` object.
driver = uc.Chrome()

### Get DOI links of the RSNA "Diagnosis Please" quiz cases
- The links are taken from each issue's table of contents, starting from Vol. 207, Issue 1.

In [124]:
# Walk every issue's table of contents, starting at Vol. 207 / Issue 1, and
# collect the link of each article whose title contains "Case <number>".
#
# Result: `doi_datas`, a list of {"id", "type", "doi"} dicts, where "type" is
# "answer" when the title contains a colon and "problem" otherwise.
# The loop ends at the first table-of-contents page whose content contains
# "404 not found".
ISSUES_PER_VOLUME = 3  # the issue counter rolls over to the next volume after this many issues
PAGE_LOAD_TIMEOUT = 10  # seconds to wait for the page content container

doi_datas = []
volume = 207  # first volume to crawl
issue = 1

while True:
    # page open
    driver.get(f"https://pubs.rsna.org/toc/radiology/{volume}/{issue}")

    wait = WebDriverWait(driver, PAGE_LOAD_TIMEOUT)
    element = wait.until(ec.element_to_be_clickable((By.ID, "pb-page-content")))
    time.sleep(1)

    # 404 not found -> we ran past the last published issue
    if "404 not found" in element.text.lower():
        break

    # get titles
    issue_candidates = driver.find_elements(By.CLASS_NAME, "issue-item__title")

    # keep only the candidates whose title looks like "Case 00"
    for candidate in issue_candidates:
        # Read the element text once: every `.text` access is a WebDriver round trip.
        title = candidate.text.strip()
        obj = re.search(r"case ([0-9]+)", title.lower())
        if obj is None:
            continue

        case_id = obj.group(1)
        # Named `link_type` (not `type`) so the builtin `type()` is not
        # shadowed for every later cell of the notebook.
        link_type = "answer" if ":" in title else "problem"
        doi = candidate.find_element(By.TAG_NAME, "a").get_attribute("href")
        doi_datas.append({
            "id": case_id,
            "type": link_type,
            "doi": doi
        })

    # next issue
    issue = issue + 1
    if issue > ISSUES_PER_VOLUME:
        issue = 1
        volume = volume + 1

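### TODO: add code that exports `doi_datas` to a CSV file, then edit the data (the crawl step below reads `id`, `problem`, and `answer` columns from `../rsna_case_links.csv`)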

### Get links from csv

In [3]:
# Load the case-link table: one plain dict per CSV row, keyed by the header names.
# `skipinitialspace=True` drops the whitespace that directly follows a delimiter.
with open("../rsna_case_links.csv", "r") as links_file:
    reader = csv.DictReader(links_file, skipinitialspace=True)
    links = [dict(row) for row in reader]

### Crawl from DOI links
- The site has a crawling cap: after every 100 pages you will be banned for 2 hours.

In [14]:
# Crawl the problem page and the answer page of every case in `links`
# (from `start_index` onwards) and collect one record per case in `rsna_data`.
#
# Each record holds: "id", "link" [problem, answer], "date" [problem, answer],
# "history", "findings", "diagnosis", and "images" [problem figures, answer figures].
# A side (problem/answer) whose link is blank is skipped and keeps empty defaults.


def _open_article(url):
    """Load `url` in the shared `driver`, wait for the page content, and
    return the `content` attribute of the page's `dc.Date` element."""
    driver.get(url)

    # wait for page to load
    WebDriverWait(driver, 10).until(ec.element_to_be_clickable((By.ID, "pb-page-content")))
    time.sleep(.5)

    return driver.find_element(By.NAME, "dc.Date").get_attribute("content")


def _section_text(keyword, exclude=None):
    """Return the text of the first following-sibling <p> of the section title
    that contains `keyword` (case-insensitive).

    Titles that also contain `exclude` are ignored. When several titles match,
    the last one wins; when none match, "" is returned.
    """
    text = ""
    for heading in driver.find_elements(By.CLASS_NAME, "article-section__title"):
        # Read the title once: every `.text` access is a WebDriver round trip.
        label = heading.text.strip().lower()
        if keyword in label and (exclude is None or exclude not in label):
            text = heading.find_element(By.XPATH, "following-sibling::p").text
    return text


def _inline_figures():
    """Return [{"img": url, "caption": text}, ...] for every inline figure on the current page."""
    collected = []
    for fig in driver.find_elements(By.CLASS_NAME, "article__inlineFigure"):
        # NOTE(review): get_attribute returns None for a missing attribute, which
        # would yield "...orgNone" here - confirm every figure carries data-lg-src.
        src = f"https://pubs.rsna.org{fig.find_element(By.TAG_NAME, 'img').get_attribute('data-lg-src')}"
        caption = fig.find_element(By.CLASS_NAME, "figure__caption").text
        collected.append({"img": src, "caption": caption})
    return collected


def _diagnosis_from_title(title):
    """Return everything after the FIRST colon of `title`, with runs of spaces collapsed.

    `partition` keeps diagnoses that themselves contain a colon intact and
    yields "" (instead of raising IndexError) when the title has no colon.
    """
    _, _, tail = (title or "").partition(":")
    return re.sub(' +', ' ', tail.strip())


start_index = 305  # offset into `links`; rows before it are skipped
pbar = tqdm(links[start_index:])
rsna_data = []
for row in pbar:
    # defaults, used for whichever side (problem / answer) has no link
    problem_date = ""
    answer_date = ""
    history = ""
    findings = ""
    diagnosis = ""
    problem_images = []
    answer_images = []

    # get row data
    case_id = row["id"]
    problem_link = row["problem"]#.replace("pubs.rsna.org", "pubs-rsna-org-ssl.access.hanyang.ac.kr:8443")
    answer_link = row["answer"]#.replace("pubs.rsna.org", "pubs-rsna-org-ssl.access.hanyang.ac.kr:8443")

    # problem side
    pbar.set_description(f"Crawling {case_id}\t[Problem page]")
    if problem_link.strip() != "":
        problem_date = _open_article(problem_link)
        # "history" section, but not a title that also mentions "article"
        history = _section_text("history", exclude="article")
        problem_images = _inline_figures()

    # answer side
    pbar.set_description(f"Crawling {case_id}\t[Answer page]")
    if answer_link.strip() != "":
        answer_date = _open_article(answer_link)
        findings = _section_text("findings")
        # the diagnosis is the part of the page title after the colon
        diagnosis = _diagnosis_from_title(
            driver.find_element(By.NAME, "dc.Title").get_attribute("content")
        )
        answer_images = _inline_figures()

    rsna_data.append({
        "id" : case_id,
        "link" : [problem_link, answer_link],
        "date" : [problem_date, answer_date],
        "history" : history,
        "findings" : findings,
        "diagnosis" : diagnosis,
        "images" : [problem_images, answer_images]
    })

Crawling 319	[Answer page]: 100%|██████████| 14/14 [01:28<00:00,  6.33s/it] 


### Export to csv file

In [1]:
# Write the crawled records to CSV. The column order follows the key order of
# the first record in `rsna_data`, which must therefore be non-empty.
keys = rsna_data[0].keys()

# encoding='utf-8-sig' writes a UTF-8 byte-order mark at the start of the file;
# newline="" leaves line-ending handling to the csv module.
with open("../crawled_data/rsna_306-319.csv", "w", encoding='utf-8-sig', newline="") as csv_file:
    csv_writer = csv.DictWriter(csv_file, fieldnames=keys)
    csv_writer.writeheader()
    for record in rsna_data:
        csv_writer.writerow(record)

NameError: name 'rsna_data' is not defined