# 필수 모듈 임포트

In [1]:
# 필수 모듈 임포트
import requests
import re
from tqdm import tqdm
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import json
import os
from requests.exceptions import Timeout

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Module-level list that collects the doc_ids whose request timed out
# inside scrape_details.
timeout_doc_ids = []

# 함수 정의

## json 저장 및 로드 함수

In [3]:
def save_to_json(data, filename):
    """Write ``data`` to ``filename`` as UTF-8 JSON indented by 4 spaces.

    The directory part of ``filename`` is created when it does not exist; a
    bare file name (no directory part) targets the current directory.
    Non-ASCII characters are written as-is instead of being escaped.
    """
    parent_dir = os.path.dirname(filename)
    if not parent_dir:
        parent_dir = '.'
    os.makedirs(parent_dir, exist_ok=True)

    with open(filename, mode='w', encoding='utf-8') as out_file:
        json.dump(data, out_file, indent=4, ensure_ascii=False)

        
def read_from_json(filename):
    """Parse the UTF-8 JSON file at ``filename`` and return the decoded object."""
    with open(filename, mode='r', encoding='utf-8') as source:
        parsed = json.load(source)
    return parsed

## - 의사 정보 크롤링

In [4]:
def scrape_doctor_profiles(max_pages, start_page=0, timeout=10):
    """Scrape the Naver KiN doctor-expert listing pages into profile dicts.

    Args:
        max_pages: Number of the last listing page to fetch (inclusive).
        start_page: Number of the first listing page to fetch (inclusive).
        timeout: Seconds to wait for each HTTP response. Without a timeout
            ``requests`` may block indefinitely on a stalled connection.

    Returns:
        A list with one dict per doctor entry, holding the keys ``index``
        (1-based position of the entry within its own listing page),
        ``doctor_id``, ``doctor_name``, ``specialty``, ``total_answers`` and
        ``affiliation``. Missing specialty/affiliation become '정보 없음' and
        a missing answer count becomes 0.

    Raises:
        requests.exceptions.RequestException: If a page request fails or
            exceeds ``timeout``.
    """
    base_url = 'https://kin.naver.com/people/expert/index.naver?type=DOCTOR&page={}'
    doctor_info = []

    for page in tqdm(range(start_page, max_pages + 1)):
        url = base_url.format(page)
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.text, 'html.parser')

        # Collect the profile link and the details of every doctor on the page.
        for index, item in enumerate(soup.select('.pro_list li'), start=1):
            # An <li> without an <h5> (or without a link inside it) is not a
            # doctor entry; skip it instead of failing on None.
            heading = item.find('h5')
            doctor_link_tag = heading.find('a') if heading is not None else None
            if not doctor_link_tag:
                continue

            # The user id is the text that follows 'u=' in the profile link.
            href_parts = doctor_link_tag.get('href', '').split('u=')
            if len(href_parts) < 2:
                continue
            doctor_id = href_parts[1]
            doctor_name = doctor_link_tag.text.strip()

            specialty_tag = item.find('h6')
            specialty = specialty_tag.text.strip() if specialty_tag else '정보 없음'

            affiliation_tag = item.find('th', string='소속기관')
            affiliation = (
                affiliation_tag.find_next('td').text.strip()
                if affiliation_tag else '정보 없음'
            )

            answer_count_tag = item.find('th', string='총 답변')
            answer_count = (
                # Drop thousands separators before converting to int.
                int(answer_count_tag.find_next('td').text.strip().replace(',', ''))
                if answer_count_tag else 0
            )

            doctor_info.append({
                'index': index,
                'doctor_id': doctor_id,
                'doctor_name': doctor_name,
                'specialty': specialty,
                'total_answers': answer_count,
                'affiliation': affiliation,
            })

    return doctor_info

In [5]:
# # 결과값 예시
# doctor_profiles = scrape_doctor_profiles(1)
# doctor_profiles

## - 질문 id 크롤링(의사별)

In [6]:
def scrape_info(doctor_id, total_answers, index, timeout=10):
    """Collect the doc_ids (and dates) of every answer written by one doctor.

    The doctor's answer-list pages are fetched one by one, 20 answers per
    page, and the combined result is saved to
    ``doc_id_data/{index}_{doctor_id}.json``.

    Args:
        doctor_id: Naver user id of the doctor (the ``u`` query parameter).
        total_answers: Total number of answers of the doctor; determines how
            many pages are requested.
        index: Only used as a prefix of the output file name.
        timeout: Seconds to wait for each HTTP response. Without a timeout
            ``requests`` may block indefinitely on a stalled connection.

    Returns:
        A list of ``{'doc_id': str, 'date': str}`` dicts in page order.

    Raises:
        requests.exceptions.RequestException: If a page request fails or
            exceeds ``timeout``; nothing is saved in that case.
    """
    max_pages = total_answers // 20 + (1 if total_answers % 20 else 0)
    base_url = 'https://kin.naver.com/userinfo/expert/answerList.naver?u={user_id}&page={page}'
    # Compiled once because the same pattern is applied to every page.
    doc_id_pattern = re.compile(
        r'<a href="/qna/detail\.naver\?d1id=\d+&dirId=\d+&docId=(\d+)"'
    )
    all_info = []

    for page in tqdm(range(1, max_pages + 1)):
        url = base_url.format(user_id=doctor_id, page=page)
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.text, 'html.parser')

        # doc_ids come from the raw HTML, dates from the parsed table cells.
        matches = doc_id_pattern.findall(response.text)
        dates = [date.text.strip() for date in soup.select('.t_num.tc')]

        # doc_ids and dates are paired by position; a doc_id without a
        # matching date cell gets the '날짜 없음' placeholder.
        for i, doc_id in enumerate(matches):
            date = dates[i] if i < len(dates) else '날짜 없음'
            all_info.append({'doc_id': doc_id, 'date': date})

    # One JSON file with the information of all pages; save_to_json creates
    # the target folder when it does not exist yet.
    filename = os.path.join('doc_id_data', f'{index}_{doctor_id}.json')
    save_to_json(all_info, filename)

    return all_info

In [7]:
# doc_ids = scrape_info(doctor_profiles[1]['doctor_id'], doctor_profiles[1]['total_answers'], doctor_profiles[1]['index'])
# doc_ids

## 질문 답변 크롤링

In [8]:
def append_to_csv(data, file_path):
    """Append the single record ``data`` (a dict) as one row of a CSV file.

    When ``file_path`` does not exist yet the file is created and the header
    row is written first; otherwise only the data row is appended.
    """
    already_exists = os.path.isfile(file_path)
    row_frame = pd.DataFrame([data])
    row_frame.to_csv(
        file_path,
        mode='a' if already_exists else 'w',
        index=False,
        encoding='utf-8-sig',
        header=not already_exists,
    )

def _first_text(soup, selectors, default='N/A'):
    """Return the stripped text of the first selector that matches, else ``default``."""
    for selector in selectors:
        element = soup.select_one(selector)
        if element is not None:
            return element.text.strip()
    return default


def scrape_details(doc_id, i, timeout=5):
    """Fetch one Q&A detail page and append its contents to a CSV file.

    The row (``doc_id``, ``title``, ``question``, ``answer``) is appended to
    ``naver_QNA_details_9_{i // 10000}.csv`` via ``append_to_csv``.

    Args:
        doc_id: Document id of the question page to fetch.
        i: Running position of the document; every block of 10000 positions
            goes to its own CSV file.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        None. On a timeout the doc_id is appended to the module-level
        ``timeout_doc_ids`` list and the function sleeps 5 seconds; any other
        error is printed and swallowed so a surrounding loop keeps running.
    """
    base_url = 'https://kin.naver.com/qna/detail.naver?d1id=7&dirId=70201&docId={}'
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        # 'br' is deliberately not advertised: requests decodes Brotli only
        # when an optional brotli package is installed (NOTE(review): confirm
        # for the deployed environment before re-adding it).
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7,ja;q=0.6',
        'Cache-Control': 'max-age=0',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
        'Upgrade-Insecure-Requests': '1',
    }
    file_index = i // 10000
    try:
        url = base_url.format(doc_id)
        # The browser-like headers were built but never sent before.
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Each field tries its selectors in order and falls back to 'N/A'.
        data = {
            'doc_id': doc_id,
            'title': _first_text(soup, ('.title',)),
            'question': _first_text(
                soup, ('.c-heading__content', '.c-heading__title')
            ),
            'answer': _first_text(
                soup,
                ('.se-main-container',
                 '._endContentsText.c-heading-answer__content-user'),
            ),
        }

        csv_file_path = f'naver_QNA_details_9_{file_index}.csv'
        append_to_csv(data, csv_file_path)

    except Timeout:
        print(f"경고: {doc_id}에 대한 처리가 {timeout}초 동안 시간 초과되었습니다. 다음 반복으로 넘어갑니다.")
        # Remember the doc_id so it can be retried later, then back off.
        timeout_doc_ids.append(doc_id)
        time.sleep(5)
    except Exception as e:
        # Best effort: report the failure and let the caller continue.
        print(f"Error scraping doc_id {doc_id}: {e}")

In [9]:
# Show the doc_ids recorded so far by scrape_details after a request timeout.
timeout_doc_ids

[]

# ⭐️모든 doc_id 추출⭐️

In [10]:
# Gather every doc_id from the JSON files in the 'doc_id_data' folder.
# os.listdir returns names in arbitrary order, so the file names are sorted
# to make doc_ids_list, and any index-based split of it, reproducible
# across runs and machines.
doc_ids_list = []
for filename in sorted(os.listdir('doc_id_data')):
    if filename.endswith('.json'):
        file_path = os.path.join('doc_id_data', filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
        doc_ids_list.extend(item['doc_id'] for item in data)

In [11]:
# Each teammate sets PART_INDEX to one of their assigned partitions and runs
# this cell once (it overwrites doc_ids_list with the selected slice, so
# running it twice slices the already-sliced list).
#   0, 1, 2   -> 민석
#   3, 4, 5   -> 민수
#   6, 7, 8   -> 승표
#   9, 10, 11 -> 성철
NUM_PARTS = 12
PART_INDEX = 9

if not 0 <= PART_INDEX < NUM_PARTS:
    raise ValueError(f"PART_INDEX must be in 0..{NUM_PARTS - 1}, got {PART_INDEX}")

part_size = len(doc_ids_list) // NUM_PARTS
start = part_size * PART_INDEX
# The last partition also takes the remainder left by the integer division.
end = len(doc_ids_list) if PART_INDEX == NUM_PARTS - 1 else part_size * (PART_INDEX + 1)
doc_ids_list = doc_ids_list[start:end]





In [15]:
# Expression statement: the length is computed but its value is discarded here.
len(doc_ids_list)
# Value reported by the status messages of the scraping loop; starts at -1.
last_saved = -1
# Position in doc_ids_list at which the scraping loop starts.
start = 0

In [16]:
# Scrape every doc_id from position `start` onward. `k` is the 1-based
# position handed to scrape_details, which uses it to pick the output CSV.
k = start
try:
    # total is the full list length so that the bar stays consistent with
    # `initial=start` when a run is resumed from the middle.
    for doc_id in tqdm(doc_ids_list[start:], initial=start, total=len(doc_ids_list)):
        k += 1
        scrape_details(doc_id, k)
        # 0-based index of the doc_id just processed; resume a stopped run
        # with start = last_saved + 1. scrape_details swallows its own
        # errors, so "processed" does not guarantee a row was written.
        last_saved = k - 1
except KeyboardInterrupt:
    # Report the resume point before propagating a manual interruption.
    print(f"{last_saved}번까지 저장되었습니다. 오류: KeyboardInterrupt")
    raise
except Exception as e:
    print(f"{last_saved}번까지 저장되었습니다. 오류: {e}")
else:
    print(f"{last_saved}번까지 저장되었습니다. 상세 데이터가 성공적으로 저장되었습니다.")

  0%|          | 47/276650 [00:37<61:44:07,  1.24it/s]


KeyboardInterrupt: 

### 중복 제거 / 미작업 doc_id 찾기

In [None]:
# # CSV 파일 불러오기
# df = pd.read_csv('naver_QNA_details.csv')

# # 'doc_id' 기준으로 중복 로우 제거, 첫 번째 출현을 유지
# df_unique = df.drop_duplicates(subset='doc_id', keep='first')

# # 중복 제거된 데이터프레임을 다시 CSV 파일로 저장
# df_unique.to_csv('naver_QNA_details_unique.csv', index=False)

In [None]:
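# # CSV 파일에서 doc_id 열을 불러옴
# # NOTE: doc_ids_list holds doc_ids as strings, so the column is read as str;
# # otherwise read_csv infers integers and the set difference removes nothing.
# df = pd.read_csv('naver_QNA_details_unique.csv', dtype={'doc_id': str})
# completed_doc_ids = df['doc_id'].tolist()

# # 작업하지 않은 doc_id만 찾기
# remaining_doc_ids = list(set(doc_ids_list) - set(completed_doc_ids))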