In [25]:
import requests
from bs4 import BeautifulSoup as bs

In [26]:
# Index convention: 0 = accession year, 1 = 1st regnal year, ...
def getMonthKeysFromYear(year):
    """Fetch the month labels and month keys listed for one regnal year.

    Downloads the month-list page for id "kda", selects the ``year``-th
    ``ul.clear2`` list inside ``ul.king_year2`` and reads every link in it.

    Args:
        year: Position of the year's list on the page (see the index
            convention in the comment above this function).

    Returns:
        ``[labels, keys]``: two parallel lists holding each link's text and
        the key taken from that link's ``href``.
    """
    page = requests.get("http://sillok.history.go.kr/search/inspectionMonthList.do?id=kda")
    document = bs(page.content, "html.parser")
    year_lists = document.find("ul", "king_year2").findAll("ul", "clear2")
    links = year_lists[year].findAll("a")

    labels = [link.text for link in links]
    # The key is whatever sits between the first "(" and the first "," of the
    # href, with single quotes stripped (presumably a JS call such as
    # fn('kda_101010', ...) -- verify against the live page).
    keys = [
        link["href"].split("(")[1].split(",")[0].replace("'", "")
        for link in links
    ]
    return [labels, keys]

In [27]:
def isSiteValid(res):
    """Return True unless the response body is the site's "page not found" page.

    Args:
        res: Response-like object exposing the decoded body as ``res.text``.

    Returns:
        False when the body contains the site's not-found banner, True otherwise.
    """
    not_found_banner = "조선왕조실록 : 요청하신 페이지를 찾을 수 없습니다."
    page_missing = not_found_banner in res.text
    return not page_missing

def getDayUrlFromMonthKey(monthKey, start=1, end=33):
    """Discover the article URLs that exist for each day of a month.

    For every day in ``range(start, end)`` the article numbers 001..099 are
    requested in order; probing for that day stops at the first number the
    site reports as missing (see ``isSiteValid``).

    Args:
        monthKey: Month key used as the URL prefix, e.g. "kda_101010".
        start: First day number to probe (inclusive).
        end: Day number to stop at (exclusive).

    Returns:
        A list of ``[day, urls]`` pairs, one per probed day; ``urls`` is
        empty when the day's first article does not exist.
    """
    def probe_day(day):
        # Collect consecutive article URLs until the first missing one.
        found = []
        for seq in range(1, 100):
            candidate = f'http://sillok.history.go.kr/id/{monthKey}{day:02d}_{seq:03d}'
            if isSiteValid(requests.get(candidate)):
                found.append(candidate)
            else:
                break
        return found

    return [[day, probe_day(day)] for day in range(start, end)]

In [28]:

# Output: volume, date, hangul, hanza
def getFromUrl(url):
    """Download one article page and extract its heading and both text columns.

    Args:
        url: Address of a single article page.

    Returns:
        ``[volume, date, paragraph_hangul, paragraph_hanza]`` where the first
        two come from the comma-separated ``span.tit_loc`` heading, the third
        is the left-column paragraphs joined by a newline, and the fourth is
        the right-column paragraphs joined by a blank line.

    Raises:
        ValueError: If the site answers with its "page not found" page.
    """
    res = requests.get(url)
    if not isSiteValid(res):
        # Raising a bare string is itself a TypeError in Python 3, so the
        # intended message never surfaced; raise a real exception instead.
        raise ValueError(f"Invalid url on getFromUrl function: {url}")
    soup = bs(res.content, "html.parser")

    # Volume and date: drop the nested <span> first so that only the
    # "volume, date" text is left in the heading.
    parent = soup.find("span", "tit_loc")
    child = parent.find("span")
    child.extract()
    volume, date = [part.strip() for part in parent.text.strip().split(",")]

    # Left column (variable named "hangul" in this notebook).
    hangul = soup.find("div", "ins_left_in").find("div", "ins_view_pd")
    # Strip footnote markers, source lists and footnote lists before reading
    # the paragraphs so that their text does not leak into the result.
    removable_selectors = (
        ("a", "idx_annotation04_foot"),
        ("a", "footnote_super"),
        ("ul", "ins_source"),
        ("ul", "ins_footnote"),
    )
    for tag_name, css_class in removable_selectors:
        for node in hangul.findAll(tag_name, css_class):
            node.extract()
    paragraph_hangul = "\n".join(
        p.text.strip() for p in hangul.findAll("p", "paragraph")
    )

    # Right column (variable named "hanza" in this notebook); only the source
    # list is removed here.
    hanza = soup.find("div", "ins_right_in").find("div", "ins_view_pd")
    for node in hanza.findAll("ul", "ins_source"):
        node.extract()
    paragraph_hanza = "\n\n".join(
        p.text.strip() for p in hanza.findAll("p", "paragraph")
    )

    return [volume, date, paragraph_hangul, paragraph_hanza]

In [29]:
import csv

# Rows of ganz.csv, each kept as a list of strings.
ganz = []

with open('ganz.csv', newline='', encoding="UTF8") as table_file:
    ganz.extend(list(record) for record in csv.reader(table_file, delimiter=','))

def getGanzFromHangul(hangul):
    """Look ``hangul`` up in the module-level ``ganz`` table.

    Args:
        hangul: Value compared against column 2 of each row.

    Returns:
        Column 1 of the first row whose column 2 equals ``hangul``, or the
        placeholder "---없습니다.---" when no row matches.
    """
    # Lazy generator: rows are examined in order and the scan stops at the
    # first match.
    matches = (row[1] for row in ganz if hangul == row[2])
    return next(matches, "---없습니다.---")
    
def ganji(year):
    """Return the two-syllable sexagenary-cycle name of a year.

    Args:
        year: Integer year number.

    Returns:
        The heavenly stem selected by ``year % 10`` followed by the earthly
        branch selected by ``year % 12``, e.g. ``ganji(1418) == "무술"``.
    """
    # Both tables are ordered so that remainder 0 maps to index 0.
    stems = ("경", "신", "임", "계", "갑", "을", "병", "정", "무", "기")
    branches = ("신", "유", "술", "해", "자", "축", "인", "묘", "진", "사", "오", "미")

    stem = stems[year % 10]
    branch = branches[year % 12]
    return stem + branch


In [30]:
# Output: [volume, date, hangul, hanza], or None when the day has no articles
def getFromDay(monthKey, day):
    """Fetch every article of one day and merge them into a single entry.

    Args:
        monthKey: Month key used as the URL prefix, e.g. "kda_101010".
        day: Day number within the month.

    Returns:
        ``(volume, date, hangul, hanza)`` where ``volume`` and ``date`` come
        from the last article fetched, ``hangul`` is a day header followed by
        the articles joined with "\\n\\n○ ", and ``hanza`` is every article's
        right-column text, each prefixed with a newline. Returns ``None``
        when no article exists for the day.
    """
    days = getDayUrlFromMonthKey(monthKey, day, day + 1)

    # Each entry is [day, urls] and therefore never empty itself; it is the
    # URL list that must be checked. Without this guard the code below would
    # reference `date` before any article had assigned it.
    if not days or not days[0][1]:
        return None

    hanguls = []
    hanza_answer = ""

    for url in days[0][1]:
        print(".", end="")  # one progress dot per article
        volume, date, hangul, hanza = getFromUrl(url)

        hanguls.append(hangul.strip())
        hanza_answer = hanza_answer + "\n" + hanza

    # The second-to-last space-separated token of the date string is used as
    # the day's name and is looked up in the ganz table.
    day_h = date.split(" ")[-2]
    header = f"{day_h}일({getGanzFromHangul(day_h)}日-{day}일)에 "
    return volume, date, header + "\n\n○ ".join(hanguls), hanza_answer


In [None]:
def getFromMonthKey(year, month, monthKey):
    """Scrape one month and write it to two UTF-8 text files.

    Writes ``"{month} 국역.txt"`` (volume, title, merged left-column text)
    and ``"{month} 원문.txt"`` (title, merged right-column text) into the
    current working directory. Progress is printed as ``day(....) `` with one
    dot per article.

    Args:
        year: Regnal year index; 0 means the accession year.
        month: Month label used in the titles and file names, e.g. "1월".
        monthKey: Month key used as the URL prefix, e.g. "kda_101010".

    Returns:
        None. When the month has no articles a notice is printed and no file
        is written.
    """
    months = getDayUrlFromMonthKey(monthKey)
    hanguls = []
    hanzas = []
    volume = None
    for day, urls in months:
        print(f"{day}(", end="")
        if urls:
            # NOTE(review): getFromDay probes this day's URLs again, so every
            # article is requested more than once; kept so that the day
            # formatting stays in one place.
            result = getFromDay(monthKey, day)
            if result is not None:
                volume, _date, hangul, hanza = result
                hanguls.append(hangul)
                hanzas.append(hanza)
        # Always close the parenthesis, including for days without articles
        # (previously those produced output such as "3(4(" and "31(32(").
        print(") ", end="")
    print("")

    if not hanguls:
        # Nothing was collected, so there is no volume or text to write.
        print(f"{month}에는 기사가 없어 파일을 만들지 않습니다.")
        return

    # ---
    print(f"{month}의 정보를 파일로 만드는 중입니다.")
    hangul = "\n\n".join(hanguls)
    hanza = "\n\n".join(hanzas)

    year_hangul = "세종 즉위년" if year == 0 else f"세종{year}년"

    # Sejong's accession year (index 0) is 1418 and his 1st regnal year is
    # 1419, so the Western year is the index plus 1418.
    western_year = year + 1418
    gan = ganji(western_year)
    # e.g. "세종 즉위년 (1418년) 무술년 (<ganz lookup>年) 8월"
    title_hangul = f"{year_hangul} ({western_year}년) {gan}년 ({getGanzFromHangul(gan)}年) {month}"
    # e.g. "원문 (세종실록 1권, 세종 즉위년 8월)"
    title_hanza = f"원문 ({volume}, {year_hangul} {month})"

    with open(f'{month} 국역.txt', 'w', encoding="UTF8") as f:
        f.write(volume + "\n" + title_hangul + "\n\n" + hangul)
    with open(f'{month} 원문.txt', 'w', encoding="UTF8") as f:
        f.write(title_hanza + "\n" + hanza)

In [32]:
# List the month labels and month keys for year index 1 (index 0 is the accession year).
getMonthKeysFromYear(1)

[['1월', '2월', '3월', '4월', '5월', '6월', '7월', '8월', '9월', '10월', '11월', '12월'],
 ['kda_101010',
  'kda_101020',
  'kda_101030',
  'kda_101040',
  'kda_101050',
  'kda_101060',
  'kda_101070',
  'kda_101080',
  'kda_101090',
  'kda_101100',
  'kda_101110',
  'kda_101120']]

In [35]:
# Scrape month key kda_101010 and write "1월 국역.txt" / "1월 원문.txt".
getFromMonthKey(1, "1월", "kda_101010")

1(..) 2(...) 3(...) 4(.) 5(....) 6(.........) 7(....) 8(.....) 9(.......) 10(......) 11(.....) 12(.....) 13(....) 14(....) 15(.....) 16(.......) 17(........) 18(....) 19(.....) 20(.....) 21(....) 22(.....) 23(.....) 24(......) 25(.......) 26(...) 27(.......) 28(......) 29(......) 30(......) 31(32(
1월의 정보를 파일로 만드는 중입니다.


In [36]:
# Scrape month key kda_101030 and write "3월 국역.txt" / "3월 원문.txt".
getFromMonthKey(1, "3월", "kda_101030")

1(.......) 2(.......) 3(.) 4(..........) 5(.....) 6(.........) 7(........) 8(......) 9(......) 10(.) 11(..) 12(..) 13(..) 14(...) 15(..) 16(.) 17(..) 18(.) 19(.) 20(.) 21(.....) 22(.) 23(....) 24(..) 25(..) 26(.......) 27(..........) 28(....) 29(...) 30(....) 31(32(
3월의 정보를 파일로 만드는 중입니다.


In [37]:
# Scrape month key kda_101040 and write "4월 국역.txt" / "4월 원문.txt".
getFromMonthKey(1, "4월", "kda_101040")

1(...) 2(....) 3(4(.......) 5(..) 6(...) 7(..) 8(...) 9(...) 10(.) 11(......) 12(....) 13(......) 14(.......) 15(...........) 16(........) 17(.........) 18(...) 19(.......) 20(.....) 21(..) 22(...) 23(...) 24(...) 25(..) 26(.........) 27(......) 28(..) 29(...) 30(......) 31(32(
4월의 정보를 파일로 만드는 중입니다.


In [38]:
# Scrape month key kda_101050 and write "5월 국역.txt" / "5월 원문.txt".
# NOTE(review): the recorded output stops at day 12 without the completion
# message, so this run appears not to have finished -- re-run to confirm.
getFromMonthKey(1, "5월", "kda_101050")

1(.....) 2(....) 3(....) 4(......) 5(.......) 6(..) 7(....) 8(....) 9(...) 10(........) 11(.....) 12(

In [None]:
# Scrape month key kda_101060 and write "6월 국역.txt" / "6월 원문.txt" (not yet executed).
getFromMonthKey(1, "6월", "kda_101060")

In [None]:
# Scrape month key kda_101070 and write "7월 국역.txt" / "7월 원문.txt" (not yet executed).
getFromMonthKey(1, "7월", "kda_101070")

In [None]:
# Scrape month key kda_101080 and write "8월 국역.txt" / "8월 원문.txt" (not yet executed).
getFromMonthKey(1, "8월", "kda_101080")

In [None]:
# Scrape month key kda_101090 and write "9월 국역.txt" / "9월 원문.txt" (not yet executed).
getFromMonthKey(1, "9월", "kda_101090")

In [None]:
# Scrape month key kda_101100 and write "10월 국역.txt" / "10월 원문.txt" (not yet executed).
getFromMonthKey(1, "10월", "kda_101100")

In [None]:
# Scrape month key kda_101110 and write "11월 국역.txt" / "11월 원문.txt" (not yet executed).
getFromMonthKey(1, "11월", "kda_101110")

In [None]:
# Scrape month key kda_101120 and write "12월 국역.txt" / "12월 원문.txt" (not yet executed).
getFromMonthKey(1, "12월", "kda_101120")