# KSTR Weekly Chest Cases Crawler
##### Dependencies
- bs4
- tqdm

### Import

In [193]:
from urllib.request import urlopen
from urllib import parse
from bs4 import BeautifulSoup
from bs4 import Comment
from tqdm import tqdm
import json
import csv
import time

### Base URLs

In [111]:
# Pieces of the archive URL; they are concatenated without any separator.
base_url = "https://kstr.radiology.or.kr"
archive_url = "/weekly/archive/"
case_list_url = "list.php?menu_num=2&sub_num="

# One case-list URL per `sub_num` value from 1997 through 2023 (inclusive).
yearly_urls = [
    f"{base_url}{archive_url}{case_list_url}{sub_num}"
    for sub_num in range(1997, 2024)
]

# Accumulator for the crawled case records (one dict per case).
datas = []

### Crawl

In [197]:
def _fetch_soup(url):
    """Download *url* and return it parsed with BeautifulSoup's html.parser.

    The HTTP response is closed as soon as the document has been parsed,
    so connections are not left open while the crawl continues.
    """
    with urlopen(url) as response:
        return BeautifulSoup(response, 'html.parser')


def _section_text(toggle_dl, title):
    """Return the text of the <dd> that follows the node whose string is *title*."""
    return toggle_dl.find(string=title).parent.find_next_sibling("dd").text


# Walk every yearly list -> every result page -> every case on that page,
# appending one record (dict) per case to `datas`.
for year in yearly_urls:
    soup = _fetch_soup(year)

    # The page count is read from the <li> immediately before the "next"
    # pager item, after removing the surrounding square brackets.
    # NOTE(review): raises AttributeError if a list page has no li.next — confirm
    # every yearly list renders a pager.
    pages = soup.find("li", {"class" : "next"})
    page_li = pages.find_previous_sibling("li")
    page_num = int(page_li.text.replace("[", "").replace("]", ""))

    for page in range(1, page_num+1):
        soup = _fetch_soup(f"{year}&page={page}")
        cases = soup.find_all("span", {"class" : "tit"})

        pbar = tqdm(cases)
        for case in pbar:
            # Case number: last whitespace-separated token of the title, minus "]".
            case_id = int(case.text.split()[-1].replace("]",""))

            # Resume support: skip cases whose id is not greater than the number
            # of records already collected.
            # NOTE(review): this assumes case ids are consecutive from 1 and are
            # visited in ascending order — confirm against the site.
            if case_id <= len(datas):
                continue

            pbar.set_description(f"Crawling case {case_id}")

            # Keep only the first query parameter of the link to the case page.
            href = case.parent["href"].split("&")[0]
            link = (base_url + archive_url + href)

            case_soup = _fetch_soup(link)

            # case date: last whitespace-separated token of span.date
            div_case = case_soup.find("div", {"class" : "case"})
            case_date = div_case.find("span", {"class" : "date"}).text.split()[-1]

            # case age / sex / complaint
            # Reset for every case so that a page lacking one of these items
            # gets an empty value instead of the previous case's value.
            age = sex = complaint = ""
            for li in div_case.find_all("li"):
                li_text = li.text
                if "Age" in li_text:
                    # Text after "Sex" has the form "<age> / <sex>"; drop spaces first.
                    age_sex = li_text.split("Sex")[1].strip().replace(" ", "")
                    age, sex = age_sex.split("/")
                    sex = sex.strip()
                if "Complaint" in li_text:
                    complaint = li_text.replace("Chief Complaint", "")

            # case diagnosis / findings / brief review
            div_diag_dl = case_soup.find("dl", {"class" : "toggleCon"})
            diagnosis = _section_text(div_diag_dl, "Diagnosis")
            findings = _section_text(div_diag_dl, "Radiologic Findings")
            brief_review = _section_text(div_diag_dl, "Brief Review")

            # case images
            # The number of images equals the number of thumbnails. The path of
            # the first image is the text between the first pair of double
            # quotes inside the first HTML comment of p.img; the remaining
            # paths are derived by replacing "-1." with "-<i>.".
            img_num = len(case_soup.find("ul", {"class" : "thumbList"}).find_all("li"))
            p_img = case_soup.find("div", {"class" : "bigPhoto"}).find("p", {"class" : "img"})
            img_src = p_img.find(string=lambda text: isinstance(text, Comment)).split('"')[1]
            img_links = [base_url + img_src.replace("-1.", f"-{i}.") for i in range(1, img_num+1)]

            # correct answer rate
            div_answer = case_soup.find("div", {"class" : "answer"})
            applicants = int(div_answer.find("h3").find("span", {"class" : "fcRed"}).text)
            toggles = div_answer.find_all("a", {"class" : "_toggle"})

            ans_rates = []
            for t in toggles:
                toggle_text = t.text
                if "Correct Answer" not in toggle_text:
                    continue
                # The text after ":" is "<fraction>, <percentage>"; only the
                # fraction is stored.
                fraction, _ = toggle_text.split(":")[1].strip().replace(" ", "").split(",")
                if "Semi" in toggle_text:
                    ans_rate = f"semi:{fraction}"
                elif "Diff" in toggle_text:
                    ans_rate = f"diff:{fraction}"
                else:
                    ans_rate = f"correct:{fraction}"
                ans_rates.append(ans_rate)

            data = {
                "id" : case_id,
                "link" : link,
                "date" : case_date,
                "age" : age,
                "sex" : sex,
                "complaint" : complaint,
                "diagnosis" : diagnosis,
                "findings" : findings,
                "brief_review" : brief_review,
                "img_links" : img_links,
                "applicants" : applicants,
                "answer_rates" : ans_rates
            }

            datas.append(data)


100%|██████████| 9/9 [00:00<?, ?it/s]
100%|██████████| 10/10 [00:00<?, ?it/s]
100%|██████████| 10/10 [00:00<?, ?it/s]
100%|██████████| 10/10 [00:00<?, ?it/s]
100%|██████████| 10/10 [00:00<?, ?it/s]
100%|██████████| 10/10 [00:00<?, ?it/s]
100%|██████████| 2/2 [00:00<?, ?it/s]
100%|██████████| 10/10 [00:00<?, ?it/s]
100%|██████████| 10/10 [00:00<?, ?it/s]
100%|██████████| 10/10 [00:00<?, ?it/s]
100%|██████████| 10/10 [00:00<00:00, 9984.06it/s]
100%|██████████| 10/10 [00:00<?, ?it/s]
100%|██████████| 2/2 [00:00<?, ?it/s]
100%|██████████| 10/10 [00:00<?, ?it/s]
100%|██████████| 10/10 [00:00<?, ?it/s]
100%|██████████| 10/10 [00:00<?, ?it/s]
100%|██████████| 10/10 [00:00<?, ?it/s]
100%|██████████| 10/10 [00:00<?, ?it/s]
100%|██████████| 3/3 [00:00<?, ?it/s]
100%|██████████| 10/10 [00:00<?, ?it/s]
100%|██████████| 10/10 [00:00<?, ?it/s]
100%|██████████| 10/10 [00:00<?, ?it/s]
100%|██████████| 10/10 [00:00<?, ?it/s]
100%|██████████| 10/10 [00:00<?, ?it/s]
100%|██████████| 2/2 [00:00<?, ?it/s]


### Export to csv file

In [201]:
# The CSV header row uses the keys of the first crawled record.
keys = datas[0].keys()

# "utf-8-sig" writes a BOM at the start of the file; newline="" leaves
# line-ending handling to the csv module.
with open(
    "../crawled_data/kstr_data.csv",
    mode="w",
    encoding="utf-8-sig",
    newline="",
) as csv_file:
    # DictWriter maps each record dict onto the columns named in `keys`.
    csv_writer = csv.DictWriter(csv_file, fieldnames=keys)
    csv_writer.writeheader()
    csv_writer.writerows(datas)