# 필수 모듈 임포트

In [1]:
# 필수 모듈 임포트
import requests
import re
from tqdm import tqdm
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import json
import os

# 함수 정의

## json 저장 및 로드 함수

In [2]:
def save_to_json(data, filename):
    """Write *data* to *filename* as indented, UTF-8 encoded JSON.

    The parent directory of *filename* is created when it does not exist yet.
    A bare file name (no directory component) is written relative to the
    current directory. Non-ASCII characters are stored as-is rather than as
    ``\\uXXXX`` escapes.

    Args:
        data: Any JSON-serializable object.
        filename: Destination path of the JSON file.
    """
    parent_dir = os.path.dirname(filename)
    if not parent_dir:
        # No directory component in the path: fall back to the current directory.
        parent_dir = '.'
    os.makedirs(parent_dir, exist_ok=True)

    with open(filename, 'w', encoding='utf-8') as out_file:
        json.dump(data, out_file, ensure_ascii=False, indent=4)

        
def read_from_json(filename):
    """Load and return the JSON document stored in *filename*.

    The file is decoded as UTF-8, matching how ``save_to_json`` writes it
    (UTF-8 with non-ASCII characters kept verbatim). Relying on the platform
    default encoding would break on systems whose locale is not UTF-8.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The deserialized Python object.
    """
    with open(filename, 'r', encoding='utf-8') as file:
        return json.load(file)

## - 의사 정보 크롤링

In [3]:
def _parse_doctor_item(item, index):
    """Build one profile dict from a ``.pro_list li`` element.

    Returns None when the element has no ``<h5><a>`` profile link or the
    link's href carries no ``u=`` parameter, so the caller can skip it.
    """
    heading = item.find('h5')
    link = heading.find('a') if heading else None
    if not link:
        return None

    href = link.get('href', '')
    if 'u=' not in href:
        return None

    specialty_tag = item.find('h6')
    affiliation_tag = item.find('th', string='소속기관')
    answer_count_tag = item.find('th', string='총 답변')

    if answer_count_tag:
        # The count is rendered with thousands separators, e.g. "14,344".
        answer_count = int(answer_count_tag.find_next('td').text.strip().replace(',', ''))
    else:
        answer_count = 0

    return {
        'index': index,
        'doctor_id': href.split('u=')[1],
        'doctor_name': link.text.strip(),
        'specialty': specialty_tag.text.strip() if specialty_tag else '정보 없음',
        'total_answers': answer_count,
        'affiliation': affiliation_tag.find_next('td').text.strip() if affiliation_tag else '정보 없음',
    }


def scrape_doctor_profiles(max_pages, start_page=0):
    """Scrape doctor profiles from the Naver KnowledgeiN expert listing.

    Listing pages ``start_page`` through ``max_pages`` (inclusive) are
    requested one by one. A page that cannot be fetched (network error,
    timeout, or HTTP error status) is reported with ``tqdm.write`` and
    skipped so that one bad page does not abort the whole run.

    Args:
        max_pages: Number of the last listing page to fetch (inclusive).
        start_page: Number of the first listing page to fetch.

    Returns:
        A list of dicts with the keys ``index``, ``doctor_id``,
        ``doctor_name``, ``specialty``, ``total_answers`` and
        ``affiliation``. ``index`` is the 1-based position of the entry
        within its own listing page, so it restarts at 1 on every page.
        NOTE(review): confirm whether a running index across pages was
        intended.
    """
    base_url = 'https://kin.naver.com/people/expert/index.naver?type=DOCTOR&page={}'
    doctor_info = []

    for page in tqdm(range(start_page, max_pages + 1)):
        url = base_url.format(page)
        try:
            # A timeout keeps a stalled connection from blocking the loop forever.
            response = requests.get(url, timeout=10)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            tqdm.write(f"Failed to fetch doctor list page {page}: {e}")
            continue

        soup = BeautifulSoup(response.text, 'html.parser')

        # Collect the profile link and details of every doctor on this page.
        for index, item in enumerate(soup.select('.pro_list li'), start=1):
            profile = _parse_doctor_item(item, index)
            if profile is not None:
                doctor_info.append(profile)

    return doctor_info

In [4]:
# Example result: with the default start_page of 0, max_pages=1 fetches
# listing pages 0 and 1. The bare name on the last line displays the list.
doctor_profiles = scrape_doctor_profiles(1)
doctor_profiles

100%|██████████| 2/2 [00:00<00:00,  2.25it/s]


[{'index': 1,
  'doctor_id': 'ivN%2B5Asi7CP717cEauHD78CX79cbV0HYeF3J%2FMPe7s8%3D',
  'doctor_name': '김영조',
  'specialty': '오렌지성형외과  원장',
  'total_answers': 14344,
  'affiliation': '대한의사협회'},
 {'index': 2,
  'doctor_id': 'HqUhEWj6HRrbfqwtC7sns6U6i4QH%2Fb6LcOOIQ%2BVEysQ%3D',
  'doctor_name': '이현재',
  'specialty': '이현재피부과의원 대표원장',
  'total_answers': 500,
  'affiliation': '닥톡 의사'},
 {'index': 3,
  'doctor_id': 'ZAUkqq%2Fooy0xj6pdD7zcG5%2FENCUCJVnNU%2Bes5zWAP74%3D',
  'doctor_name': '주봉현',
  'specialty': '나비한의원 대표원장',
  'total_answers': 764,
  'affiliation': '닥톡 한의사'},
 {'index': 4,
  'doctor_id': 'PJVSJSNiRY6ke0PhLzPvQr3kaRXqkeJvbRr%2B3RuMBas%3D',
  'doctor_name': '홍지헌',
  'specialty': '연세이비인후과 원장',
  'total_answers': 7973,
  'affiliation': '대한의사협회'},
 {'index': 5,
  'doctor_id': 'U8Xzb5u3oRknC5L1LVLBVvXFAKfL5iS73r8Ckhtw8YQ%3D',
  'doctor_name': '채기헌',
  'specialty': '생기나라한의원 원장',
  'total_answers': 2147,
  'affiliation': '닥톡 한의사'},
 {'index': 6,
  'doctor_id': 'qUqCSqj5n5%2Bnb4YmxVp4pzF%2

## - 질문 id 크롤링(의사별)

In [5]:
def scrape_info(doctor_id, total_answers, index=None):
    """Collect the question ids and dates of one doctor's answers.

    The doctor's answer-list pages are fetched one by one; from each page the
    ``docId`` of every question link and the date cells are extracted. The
    combined result is saved as a single JSON file under ``doc_id_data/``
    and also returned.

    A page that cannot be fetched (network error, timeout, or HTTP error
    status) is reported with ``tqdm.write`` and skipped.

    Args:
        doctor_id: The doctor's user id as it appears in the ``u=`` URL
            parameter.
        total_answers: Total number of answers of the doctor; used to derive
            the page count, assuming 20 answers per page.
        index: Optional prefix for the output file name. When given, the file
            is named ``{index}_{doctor_id}.json``; when omitted it is named
            ``{doctor_id}.json``.

    Returns:
        A list of ``{'doc_id': str, 'date': str}`` dicts in page order.
    """
    answers_per_page = 20
    # Ceiling division: a partially filled last page still counts as a page.
    max_pages = (total_answers + answers_per_page - 1) // answers_per_page
    base_url = 'https://kin.naver.com/userinfo/expert/answerList.naver?u={user_id}&page={page}'
    # Compiled once; captures the docId of each question link in the raw HTML.
    doc_id_pattern = re.compile(r'<a href="/qna/detail\.naver\?d1id=\d+&dirId=\d+&docId=(\d+)"')
    all_info = []

    for page in tqdm(range(1, max_pages + 1)):
        url = base_url.format(user_id=doctor_id, page=page)
        try:
            # A timeout keeps a stalled connection from blocking the loop forever.
            response = requests.get(url, timeout=10)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            tqdm.write(f"Failed to fetch answer list page {page} for {doctor_id}: {e}")
            continue

        soup = BeautifulSoup(response.text, 'html.parser')
        matches = doc_id_pattern.findall(response.text)

        # Date cells are paired with the doc ids by position.
        dates = [date.text.strip() for date in soup.select('.t_num.tc')]

        for i, doc_id in enumerate(matches):
            date = dates[i] if i < len(dates) else '날짜 없음'
            all_info.append({'doc_id': doc_id, 'date': date})

    # Save the information of all pages into one JSON file.
    # save_to_json creates the folder when it does not exist.
    folder_name = 'doc_id_data'
    base_name = f'{index}_{doctor_id}.json' if index is not None else f'{doctor_id}.json'
    filename = os.path.join(folder_name, base_name)

    save_to_json(all_info, filename)

    return all_info

In [6]:
# Collect the answer ids of the second profile in the list. scrape_info takes
# the profile's index as its third argument (used in the output file name).
doc_ids = scrape_info(
    doctor_profiles[1]['doctor_id'],
    doctor_profiles[1]['total_answers'],
    doctor_profiles[1]['index'],
)
doc_ids

100%|██████████| 25/25 [00:11<00:00,  2.14it/s]


[{'doc_id': '464304738', 'date': '2024.02.14.'},
 {'doc_id': '464307124', 'date': '2024.02.14.'},
 {'doc_id': '464291264', 'date': '2024.02.14.'},
 {'doc_id': '464294713', 'date': '2024.02.14.'},
 {'doc_id': '464299692', 'date': '2024.02.14.'},
 {'doc_id': '464303982', 'date': '2024.02.14.'},
 {'doc_id': '464306094', 'date': '2024.02.14.'},
 {'doc_id': '464284056', 'date': '2024.02.14.'},
 {'doc_id': '464274175', 'date': '2024.02.14.'},
 {'doc_id': '464276484', 'date': '2024.02.14.'},
 {'doc_id': '464267662', 'date': '2024.02.14.'},
 {'doc_id': '464265878', 'date': '2024.02.14.'},
 {'doc_id': '464257808', 'date': '2024.02.14.'},
 {'doc_id': '464262851', 'date': '2024.02.14.'},
 {'doc_id': '464261092', 'date': '2024.02.14.'},
 {'doc_id': '464263799', 'date': '2024.02.14.'},
 {'doc_id': '464231294', 'date': '2024.02.13.'},
 {'doc_id': '464204026', 'date': '2024.02.13.'},
 {'doc_id': '464220928', 'date': '2024.02.13.'},
 {'doc_id': '464221282', 'date': '2024.02.13.'},
 {'doc_id': '4642190

## 질문 답변 크롤링

In [None]:
# Scraping function definition
def _select_text(soup, selector):
    """Return the stripped text of the first element matching *selector*, or None."""
    element = soup.select_one(selector)
    return element.text.strip() if element else None


def scrape_details(doc_ids, request_delay=1):
    """Scrape title, question body and answer text for each question.

    Every entry of *doc_ids* is fetched from the question detail page. An
    HTTP 429 response is retried after 5, 10 and then 20 seconds; any other
    HTTP error, a network error or a timeout is reported with ``tqdm.write``
    and the document is skipped. Skipped documents produce no row, so the
    result can be shorter than *doc_ids*.

    Args:
        doc_ids: Iterable of dicts that each contain a ``'doc_id'`` key.
        request_delay: Seconds to wait after each document before the next
            one is requested.

    Returns:
        A ``pandas.DataFrame`` with the columns ``title``, ``question`` and
        ``answer``; a value is None when the matching element was not found
        on the page.
    """
    base_url = 'https://kin.naver.com/qna/detail.naver?d1id=7&dirId=70201&docId={}'
    head = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7,ja;q=0.6',
    'Cache-Control': 'max-age=0',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
    'Upgrade-Insecure-Requests': '1',}
    # Back-off (in seconds) before the 1st, 2nd and 3rd retry after HTTP 429.
    retry_delays = [5, 10, 20]

    # Column buffers for the resulting DataFrame.
    title_list = []
    question_list = []
    answer_list = []

    # tqdm visualizes the progress over the documents.
    for doc_id in tqdm(doc_ids):
        doc_key = doc_id['doc_id']
        url = base_url.format(doc_key)
        attempt = 0

        while True:
            try:
                # A timeout keeps a stalled connection from blocking the loop forever.
                r = requests.get(url, headers=head, timeout=10)
                r.raise_for_status()
            except requests.exceptions.HTTPError as e:
                rate_limited = e.response is not None and e.response.status_code == 429
                if not rate_limited:
                    # Other HTTP errors are not retried.
                    tqdm.write(f"Failed to scrape doc_id {doc_key}: {e}")
                    break
                if attempt == len(retry_delays):
                    # All retries are used up: give up on this document.
                    tqdm.write(f"Failed to scrape doc_id {doc_key} after multiple attempts.")
                    break
                delay = retry_delays[attempt]
                tqdm.write(f"Rate limit reached. Retrying in {delay} seconds...")
                time.sleep(delay)
                attempt += 1
                continue
            except requests.exceptions.RequestException as e:
                # Network problems, timeouts and similar failures.
                tqdm.write(f"Failed to scrape doc_id {doc_key}: {e}")
                break

            bs = BeautifulSoup(r.text, 'html.parser')
            title_list.append(_select_text(bs, '.title'))  # question title
            question_list.append(_select_text(bs, '.c-heading__content'))  # question body
            answer_list.append(_select_text(bs, '.se-main-container'))  # answer
            break

        # Base delay before the next document, applied after success and failure alike.
        time.sleep(request_delay)

    data = {'title': title_list, 'question': question_list, 'answer': answer_list}
    return pd.DataFrame(data)

# ⭐️모든 doc_id 추출⭐️

In [8]:
# Scrape all doctor profiles. With the default start_page of 0 this requests
# listing pages 0 through max_pages inclusive.
max_pages = 526
doctor_profiles = scrape_doctor_profiles(max_pages)

# Save the profiles as JSON
filename = "doctor_profiles.json"
save_to_json(doctor_profiles, filename)
print(f"Data saved to {filename}")

100%|██████████| 527/527 [02:44<00:00,  3.20it/s]

Data saved to doctor_profiles.json





In [11]:
# Load the doctor profiles from doctor_profiles.json
doctor_profiles = read_from_json("doctor_profiles.json")

# 1-based position of the first profile to process; the slice below starts at
# list offset start_index - 1.
start_index = 2
# Call scrape_info for each selected profile to collect and save its answer ids.
# NOTE(review): the upper slice bound 131 is hard-coded — presumably the end of
# this batch; confirm.
for profile in doctor_profiles[start_index-1:131]:
    doctor_id = profile['doctor_id']
    total_answers = profile['total_answers']
    index = profile['index']  # used by scrape_info as the output file name prefix
    scrape_info(doctor_id, total_answers, index) 
    
print("모든 의사의 답변 정보가 성공적으로 저장되었습니다.")

  0%|          | 0/718 [00:00<?, ?it/s]

100%|██████████| 718/718 [05:12<00:00,  2.30it/s]
100%|██████████| 25/25 [02:02<00:00,  4.91s/it]
100%|██████████| 399/399 [07:14<00:00,  1.09s/it] 
100%|██████████| 108/108 [00:53<00:00,  2.00it/s]
100%|██████████| 2284/2284 [19:58<00:00,  1.91it/s]
100%|██████████| 92/92 [02:07<00:00,  1.38s/it]
 28%|██▊       | 384/1383 [11:53:07<30:55:14, 111.43s/it]


KeyboardInterrupt: 

In [10]:
# NOTE(review): scratch calculation — presumably splitting the 527 fetched
# listing pages into four batches; confirm before relying on it.
527/4

131.75

## 의사 1명에 대한 샘플 추출

In [None]:
# Fetch listing pages 0 and 1 (start_page defaults to 0), then display the
# first profile together with the full list.
doctor_profiles = scrape_doctor_profiles(1)
doctor_profiles[0], doctor_profiles

In [None]:
# Collect the answer ids of the first profile. scrape_info takes the profile's
# index as its third argument and returns a list of {'doc_id', 'date'} dicts,
# so the result is displayed directly instead of being indexed by doctor id.
doc_ids_info = scrape_info(
    doctor_profiles[0]['doctor_id'],
    doctor_profiles[0]['total_answers'],
    doctor_profiles[0]['index'],
)
doc_ids_info

In [None]:
# scrape_info returns a plain list of {'doc_id', 'date'} dicts, which is
# exactly what scrape_details iterates over; it must not be indexed by doctor id.
df = scrape_details(doc_ids_info)

In [None]:
# Export the question/answer data of the first doctor that has any answers.
for doctor in doctor_profiles:
    doctor_id = doctor['doctor_id']
    total_answers = doctor['total_answers']
    doc_ids_info = scrape_info(doctor_id, total_answers, doctor['index'])
    # scrape_info returns a list of {'doc_id', 'date'} dicts; an empty list
    # means no answers were found for this doctor.
    if doc_ids_info:
        df = scrape_details(doc_ids_info)
        # Include the doctor id in the file name so each doctor gets a unique JSON file.
        filename = f'doctor_data_{doctor_id}.json'
        df.to_json(filename, orient='records', force_ascii=False, lines=True)
        break  # a single doctor is enough for the sample