In [7]:
import requests
from bs4 import BeautifulSoup
import csv
import json
from elasticsearch import Elasticsearch, helpers
import configparser
import pandas as pd
from datetime import datetime, timedelta

# Today's date as YYYY-MM-DD, taken from the UTC clock shifted forward by
# nine hours (UTC+9). It is used as the suffix of the index and CSV names.
present_date = (datetime.utcnow() + timedelta(hours=9)).date().isoformat()

# Prefix of the per-day index name; the date string is appended at each use.
index = "sm_best_100_"

# Connection settings come from the [ELASTIC] section of example.ini.
config = configparser.ConfigParser()
config.read('example.ini')
elastic_conf = config['ELASTIC']

# Client for the cluster identified by the configured cloud id.
es = Elasticsearch(
    cloud_id=elastic_conf['cloud_id'],
    http_auth=(elastic_conf['user'], elastic_conf['password']),
)

# Create today's index on first use.
#
# The property names below must match the keys of the documents that
# extract_books() indexes ('subject', 'publisher', ...). A property declared
# under a different name is never populated, and the field that is actually
# sent falls back to dynamic mapping instead of the intended type.
if not es.indices.exists(index=index + present_date):
    es.indices.create(
        index=index + present_date,
        body={
            "mappings": {
                "properties": {
                    "subject": {"type": "text"},
                    "genre": {"type": "keyword"},
                    "writer": {"type": "text"},
                    "publisher": {"type": "keyword"},
                    "date": {"type": "date"},
                    # Kept as a keyword: the scraped value is a string such
                    # as "20/20", not a single number.
                    "count": {"type": "keyword"},
                    "reservation": {"type": "integer"},
                    "checkout": {"type": "integer"},
                    "like": {"type": "integer"},
                    "present_date": {"type": "date"},
                }
            }
        },
    )

# Best-seller listing URL of the Sangmyung University e-library
URL = "https://libebook.smu.ac.kr/FxLibrary/product/list/?category_type=book&cateopt=best"

# Determine the total number of listing pages
def get_page_count(timeout=10):
    """Return the number of pages in the best-seller listing at ``URL``.

    The landing page is downloaded and the ``div.paging span a`` links are
    counted; the result is that count plus one.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the script forever.

    Returns:
        The page count as an ``int`` (always at least 1).

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    response = requests.get(URL, timeout=timeout)
    # Fail loudly on an error response instead of parsing an error page,
    # which would otherwise be reported as a one-page listing.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")

    # NOTE(review): the +1 presumably accounts for the current page not being
    # rendered as a link in the pager -- confirm against the page markup.
    return len(soup.select("div.paging span a")) + 1

# Extract the book information from every listing page

# URL of one listing page; ``{page}`` is the 1-based page number.
_BOOK_LIST_URL = (
    "https://libebook.smu.ac.kr/FxLibrary/product/list/"
    "?itemdv=1&sort=3&page={page}&itemCount=20&pageCount=10"
    "&category=&middlecategory=&cateopt=best&group_num=recommand"
    "&catenavi=main&category_type=book&searchoption=&keyoption="
    "&keyoption2=&keyword=&listfilter=all_list&selectview=list_on"
    "&searchType=&name=&publisher=&author=&terminal="
)


def _parse_book(item):
    """Read the fields of one book from its ``li.item`` element.

    Args:
        item: A BeautifulSoup tag for a single entry of the book list.

    Returns:
        A dict with the keys ``subject``, ``genre``, ``writer``,
        ``publisher``, ``date``, ``count``, ``reservation``, ``checkout``
        and ``like`` (in that order); every value is a string.

    Raises:
        AttributeError: If an expected element is missing from the entry.
        IndexError: If an expected list item or text token is missing.
    """
    subject = item.select_one("div.subject a").text
    category = item.select_one("div.category span").text

    # Writer and publisher are the first two links of the first info row.
    info_links = item.select("div.info ul.i1 a")
    writer = info_links[0].text
    publisher = info_links[1].text

    date = item.select("div.info ul.i1 li")[2].text
    # Placeholder dates are replaced by a fixed valid date so the value can
    # still be stored in a date field.
    if date in ('0000-00-00', 'NaN'):
        date = '1970-01-01'

    # Each statistics entry is "<label> <value>"; keep the second token.
    stats = item.select("div.info ul.i2 li")
    count = stats[0].text.split(' ')[1]
    reservation = stats[1].text.split(' ')[1]
    checkout = stats[2].text.split(' ')[1]
    like = stats[3].text.split(' ')[1]

    return {
        'subject': subject,
        'genre': category,
        'writer': writer,
        'publisher': publisher,
        'date': date,
        'count': count,
        'reservation': reservation,
        'checkout': checkout,
        'like': like,
    }


def extract_books(page_count, timeout=10):
    """Scrape every listing page and index each book into Elasticsearch.

    Uses the module-level ``es`` client, ``index`` prefix and
    ``present_date`` string. An entry that cannot be parsed is reported and
    skipped. An entry that parses but fails to index is reported and still
    included in the returned list.

    Args:
        page_count: Number of listing pages to fetch (pages 1..page_count).
        timeout: Seconds to wait for each page before giving up.

    Returns:
        A list of rows, one per book:
        ``[subject, genre, writer, publisher, date, count, reservation,
        checkout, like, present_date]``.

    Raises:
        requests.RequestException: If a page request fails, times out, or
            returns an HTTP error status.
    """
    book_list = []

    for page in range(1, page_count + 1):
        print(f"Scrapping page {page}")

        response = requests.get(_BOOK_LIST_URL.format(page=page), timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, "html.parser")

        for item in soup.select("div.book_list ul.list.typelist li.item"):
            try:
                book = _parse_book(item)
            except (AttributeError, IndexError) as e:
                # Malformed entry: report it and move on to the next one.
                print('예외가 발생했습니다.', e)
                continue

            book_list.append(list(book.values()) + [present_date])

            try:
                es.index(
                    index=index + present_date,
                    document={**book, 'present_date': present_date},
                )
            except Exception as e:
                # Best effort: a failed index call must not abort the scrape;
                # the row stays in book_list.
                print('예외가 발생했습니다.', e)

    return book_list

# Scrape all pages, then save the rows as CSV
page_count = get_page_count()
books = extract_books(page_count)

# Column headers, in the same order as the fields of each scraped row.
csv_columns = [
    "제목", "genre", "writer", "pulisher", "date",
    "count", "reservation", "checkout", "like", "present_date",
]
df = pd.DataFrame(books, columns=csv_columns)

# The file is named after today's index; the row index is not written.
csv_path = f"{index + present_date}.csv"
df.to_csv(csv_path, index=False, encoding='utf-8-sig')

  es = Elasticsearch(
  es.indices.create(index = index + present_date, body = {


Scrapping page 1
Scrapping page 2
Scrapping page 3
Scrapping page 4
Scrapping page 5


In [8]:
# Notebook cell: evaluate the DataFrame so the notebook displays it.
df

Unnamed: 0,제목,genre,writer,pulisher,date,count,reservation,checkout,like,present_date
0,트렌드 코리아 2023,[경제/비즈니스],"김난도, 전미영, 최지혜, 이수진, 권정윤, 이준영, 이향은, 한다혜, 이혜원, 추예린",미래의창,2022-10-06,20/20,4,163,1,2023-02-23
1,물고기는 존재하지 않는다,[자연/과학],룰루 밀러,곰출판,2022-01-19,20/20,3,191,2,2023-02-23
2,달러구트 꿈 백화점,[문학],이미예,북닻,2020-07-28,10/10,0,204,4,2023-02-23
3,지구 끝의 온실,[문학],김초엽,자이언트북스,2022-04-21,10/10,1,137,0,2023-02-23
4,불편한 편의점,[문학],김호연,나무옆의자,2022-02-22,7/10,0,152,0,2023-02-23
...,...,...,...,...,...,...,...,...,...,...
95,3년 후 부의 흐름이 보이는 경제지표 정독법,[경제/비즈니스],김영익,한즈미디어,2022-08-29,1/2,0,2,0,2023-02-23
96,4차 산업혁명 문제는 과학이야,[자연/과학],"박재용 , 서검교, 윤신영, 임창환",MID,2019-06-14,1/1,0,6,0,2023-02-23
97,4차산업혁명과 스마트 비즈니스,[강의지원도서],배재권,박영사,2020-11-12,0/6,0,3,0,2023-02-23
98,Do it! BERT와 GPT로 배우는 자연어 처리,[컴퓨터/인터넷],이기창,이지스퍼블리싱 (주),2021-12-01,1/2,0,8,0,2023-02-23
