In [12]:
import pandas as pd
import requests
from openpyxl import load_workbook


# Read the Excel workbook.
# NOTE(review): this is an absolute, machine-specific path. A later cell calls
# pd.read_excel(file_path, ...) again, so `file_path` must be defined before it runs.
file_path = r"C:\Users\laptop_hyuntaklee\Downloads\붙임1.Gachon Mibrary 도서 구입 신청서(양식)-글로벌_이현탁.xlsx"
# skiprows=4 makes pandas skip the first four rows of the sheet
# (presumably form/header rows above the actual table — TODO confirm).
df = pd.read_excel(file_path,skiprows=4)

# Only the last expression of a cell is rendered, so the head() result is
# discarded and only tail() shows up in the cell output.
df.head()
df.tail()

Unnamed: 0,순번,도서명,저자명,출판사,출판년,ISBN(13자리),정가,비고
117,118,Applied Time Series Analysis: A Practical Guid...,Terence C. Miils,Academic Press,2019.0,9780128131176.0,,계량경제학
118,119,,,,,,,
119,120,,,,,,,
120,121,,,,,,,
121,122,,,,,,,


In [6]:
# Search function definitions
def search_amazon(isbn, timeout=10):
    """Report whether Amazon's search-results page for ``isbn`` mentions the ISBN.

    Args:
        isbn: ISBN to look up; it is interpolated into the search URL as-is.
        timeout: Seconds to wait for the HTTP response. Without a timeout,
            ``requests`` waits indefinitely and one stalled connection would
            block the whole row-by-row run.

    Returns:
        ``('match', isbn)`` when the response body contains ``str(isbn)``,
        otherwise ``('no match', None)``. Any request failure (connection
        error, timeout, non-2xx status) is also reported as
        ``('no match', None)``.
    """
    url = f"https://www.amazon.com/s?k={isbn}"
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException:
        # Best-effort lookup: a failed request counts as "no match" so that a
        # single bad row does not abort the DataFrame-wide apply.
        return 'no match', None

    # NOTE(review): a search page may echo the query string back in its HTML,
    # in which case this substring test succeeds even when no product matches
    # — confirm against a known-bad ISBN.
    if str(isbn) in response.text:
        return 'match', isbn
    return 'no match', None

def search_kyobo(isbn, timeout=10):
    """Report whether Kyobo's search-results page for ``isbn`` mentions the ISBN.

    Args:
        isbn: ISBN to look up; it is interpolated into the search URL as-is.
        timeout: Seconds to wait for the HTTP response. Without a timeout,
            ``requests`` waits indefinitely and one stalled connection would
            block the whole row-by-row run.

    Returns:
        ``('match', isbn)`` when the response body contains ``str(isbn)``,
        otherwise ``('no match', None)``. Any request failure (connection
        error, timeout, non-2xx status) is also reported as
        ``('no match', None)``.
    """
    url = f"https://search.kyobobook.co.kr/web/search?vPstrKeyWord={isbn}"
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException:
        # Best-effort lookup: a failed request counts as "no match" so that a
        # single bad row does not abort the DataFrame-wide apply.
        return 'no match', None

    # NOTE(review): a search page may echo the query string back in its HTML,
    # in which case this substring test succeeds even when no product matches
    # — confirm against a known-bad ISBN.
    if str(isbn) in response.text:
        return 'match', isbn
    return 'no match', None

# Check whether the row's ISBN can be matched online
def check_isbn(row):
    """Validate a row's ISBN and look it up on Amazon, then on Kyobo.

    Args:
        row: Mapping-like row (e.g. a DataFrame row) exposing the
            ``'ISBN(13자리)'`` key.

    Returns:
        ``('invalid isbn', None)`` when the normalized ISBN is not exactly
        13 ASCII digits; otherwise the ``(status, isbn)`` pair returned by
        ``search_amazon`` if it matched, else the pair from ``search_kyobo``.
    """
    raw = row['ISBN(13자리)']
    # Numeric Excel cells arrive as floats; str() would then give e.g.
    # "9780128131176.0" (15 characters) and a valid ISBN would be rejected.
    if isinstance(raw, float) and raw.is_integer():
        raw = int(raw)
    isbn = str(raw).strip()
    if isbn.endswith('.0'):
        isbn = isbn[:-2]
    # Accept the hyphenated / spaced printed form of an ISBN as well.
    isbn = isbn.replace('-', '').replace(' ', '')

    # Length alone is not enough: any 13-character text would pass it.
    if len(isbn) != 13 or not (isbn.isascii() and isbn.isdigit()):
        return 'invalid isbn', None

    # Amazon search (foreign titles)
    match, correct_isbn = search_amazon(isbn)
    if match == 'match':
        return match, correct_isbn

    # Kyobo search (domestic titles)
    match, correct_isbn = search_kyobo(isbn)
    return match, correct_isbn

# Update the DataFrame: run the check on every row, then unzip the
# per-row (status, isbn) pairs into two new columns.
results = df.apply(check_isbn, axis=1)
df['매칭여부'], df['정확한ISBN'] = zip(*results)

In [15]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import logging
from difflib import SequenceMatcher
import time

# Logging setup: emit INFO and above, each record formatted as
# "<timestamp> - <level name> - <message>".
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# Similarity helper
def calculate_similarity(a, b):
    """Return the ``SequenceMatcher`` ratio of two strings, ignoring outer whitespace.

    Args:
        a: First string.
        b: Second string.

    Returns:
        A float between 0.0 and 1.0; 1.0 means the stripped strings are identical.
    """
    left = a.strip()
    right = b.strip()
    matcher = SequenceMatcher(None, left, right)
    return matcher.ratio()

# Kyobo search
def search_kyobo(isbn, timeout=10):
    """Search Kyobo for ``isbn`` and return the first title found on the page.

    Args:
        isbn: ISBN to look up; it is interpolated into the search URL as-is.
        timeout: Seconds to wait for the HTTP response. Without a timeout,
            ``requests`` waits indefinitely and one stalled connection would
            block the whole row-by-row run.

    Returns:
        ``('match', isbn, title)`` when a ``<span>`` whose id starts with
        ``cmdtName_`` is present, otherwise ``('no match', None, None)``.
        Any exception (request failure, timeout, parsing error) is logged
        and also reported as ``('no match', None, None)``.
    """
    logging.info(f"Searching on Kyobo for ISBN: {isbn}")
    url = f"https://search.kyobobook.co.kr/web/search?vPstrKeyWord={isbn}"
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Title parsing: take the first <span> whose id starts with "cmdtName_".
        # NOTE(review): this is simply the first such element on the page; it
        # is not checked against the queried ISBN — confirm that the first
        # hit is always the requested book.
        title_element = soup.find('span', id=lambda x: x and x.startswith('cmdtName_'))
        if title_element:
            title = title_element.text.strip()
            logging.info(f"Title found on Kyobo: {title}")
            return 'match', isbn, title
        else:
            logging.info(f"No title found on Kyobo for ISBN: {isbn}")
            return 'no match', None, None
    except Exception as e:
        # Deliberate best-effort: log and continue so one failing row does
        # not abort the DataFrame-wide apply.
        logging.error(f"Error searching on Kyobo for ISBN {isbn}: {e}")
        return 'no match', None, None

# Amazon search
def search_amazon(isbn, timeout=10):
    """Search Amazon for ``isbn`` and return the first result title found.

    Args:
        isbn: ISBN to look up; it is interpolated into the search URL as-is.
        timeout: Seconds to wait for the HTTP response. Without a timeout,
            ``requests`` waits indefinitely and one stalled connection would
            block the whole row-by-row run.

    Returns:
        ``('match', isbn, title)`` when a matching ``<h2>`` title element is
        present, otherwise ``('no match', None, None)``. Any exception
        (request failure such as a 503, timeout, parsing error) is logged and
        also reported as ``('no match', None, None)``.
    """
    logging.info(f"Searching on Amazon for ISBN: {isbn}")
    url = f"https://www.amazon.com/s?k={isbn}"
    # Browser-like User-Agent sent with the request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Title parsing: first <h2> carrying this class string.
        # NOTE(review): a multi-class string passed to class_ is matched
        # against the attribute value as written, so a different class order
        # or an extra class on the element would not match — confirm against
        # the current Amazon markup.
        title_element = soup.find('h2', class_='a-size-medium a-spacing-none a-color-base a-text-normal')
        if title_element:
            title = title_element.text.strip()
            logging.info(f"Title found on Amazon: {title}")
            return 'match', isbn, title
        else:
            logging.info(f"No title found on Amazon for ISBN: {isbn}")
            return 'no match', None, None
    except Exception as e:
        # Deliberate best-effort: log and continue so one failing row does
        # not abort the DataFrame-wide apply.
        logging.error(f"Error searching on Amazon for ISBN {isbn}: {e}")
        return 'no match', None, None

# ISBN match check
def _normalize_isbn(value):
    """Return an ISBN cell as bare text: no float ``.0`` suffix, hyphens or spaces."""
    # Numeric Excel cells arrive as floats; str() would then give e.g.
    # "9780128131176.0" (15 characters) and a valid ISBN would be rejected.
    if isinstance(value, float) and value.is_integer():
        value = int(value)
    text = str(value).strip()
    if text.endswith('.0'):
        text = text[:-2]
    return text.replace('-', '').replace(' ', '')


def check_isbn(row):
    """Validate a row's ISBN, look it up online and score the title similarity.

    Args:
        row: Mapping-like row (e.g. a DataFrame row) exposing the
            ``'ISBN(13자리)'`` and ``'도서명'`` keys.

    Returns:
        A 4-tuple ``(status, isbn, web_title, similarity)``:
        ``('invalid isbn', None, None, None)`` when the normalized ISBN is
        not exactly 13 ASCII digits; ``('match', isbn, title, ratio)`` for
        the first of Amazon / Kyobo that returns a title; otherwise
        ``('no match', None, None, None)``.
    """
    isbn = _normalize_isbn(row['ISBN(13자리)'])

    # A missing title is compared as an empty string.
    if pd.isna(row['도서명']):
        local_title = ""
    else:
        local_title = str(row['도서명']).strip()

    logging.info(f"Processing ISBN: {isbn}")

    # Length alone is not enough: any 13-character text would pass it.
    if len(isbn) != 13 or not (isbn.isascii() and isbn.isdigit()):
        logging.warning(f"ISBN {isbn} is invalid (expected exactly 13 digits).")
        return 'invalid isbn', None, None, None

    # Amazon search
    match, correct_isbn, web_title = search_amazon(isbn)
    if match == 'match':
        similarity = calculate_similarity(local_title, web_title)
        return match, correct_isbn, web_title, similarity

    # Kyobo search
    match, correct_isbn, web_title = search_kyobo(isbn)
    if match == 'match':
        similarity = calculate_similarity(local_title, web_title)
        return match, correct_isbn, web_title, similarity

    return 'no match', None, None, None

# Read the Excel file (reuses the `file_path` variable already defined in the kernel)
# file_path = "your_file.xlsx"  # path to the user's Excel file
output_file_path = "updated_file_with_similarity.xlsx"
df = pd.read_excel(io=file_path, skiprows=4)  # skiprows lets reading start from the 5th row when needed

# Update the DataFrame: unzip the per-row 4-tuples into four new columns
results = df.apply(check_isbn, axis=1)
(
    df['매칭여부'],
    df['정확한ISBN'],
    df['검색된도서명'],
    df['유사도'],
) = zip(*results)

# # Save the results
# df.to_excel(output_file_path, index=False)
# logging.info(f"Updated Excel file saved to {output_file_path}")


2025-01-10 16:35:55,649 - INFO - Processing ISBN: 9780691121376
2025-01-10 16:35:55,651 - INFO - Searching on Amazon for ISBN: 9780691121376
2025-01-10 16:35:55,909 - ERROR - Error searching on Amazon for ISBN 9780691121376: 503 Server Error: Service Unavailable for url: https://www.amazon.com/s?k=9780691121376
2025-01-10 16:35:55,910 - INFO - Searching on Kyobo for ISBN: 9780691121376
2025-01-10 16:35:56,797 - INFO - Title found on Kyobo: Asset Pricing
2025-01-10 16:35:56,798 - INFO - Processing ISBN: 9780691043012
2025-01-10 16:35:56,798 - INFO - Searching on Amazon for ISBN: 9780691043012
2025-01-10 16:35:57,048 - ERROR - Error searching on Amazon for ISBN 9780691043012: 503 Server Error: Service Unavailable for url: https://www.amazon.com/s?k=9780691043012
2025-01-10 16:35:57,050 - INFO - Searching on Kyobo for ISBN: 9780691043012
2025-01-10 16:35:58,045 - INFO - Title found on Kyobo: The Econometrics of Financial Markets
2025-01-10 16:35:58,048 - INFO - Processing ISBN: 9781260013

In [16]:
# Rows whose status is anything other than 'match'
df.loc[df['매칭여부'].ne('match')]

Unnamed: 0,순번,도서명,저자명,출판사,출판년,ISBN(13자리),정가,비고,매칭여부,정확한ISBN,검색된도서명,유사도
2,3,Principles of Corporate Finance,"Richard A. Brealey, Stewart C. Myers, Franklin...",McGraw-Hill Education,2019.0,9781260013900.0,,기업재무,no match,,,
3,4,Corporate Finance,"Jonathan Berk, Peter DeMarzo",Pearson,2020.0,9780135183809.0,,기업재무,no match,,,
14,15,Machine Learning in Business: An Introduction ...,John C. Hull,World Scientific Publishing Company,2021.0,9789811223328.0,,금융 머신러닝,no match,,,
28,29,Quantitative Trading Strategies using Python,Peng Liu,O'REILLY,2023.0,9781484296752.0,,데이터분석,no match,,,
43,44,Machine Learning for Financial Risk Management...,Abdullah Karasan,O'Reilly,2021.0,9781492085249.0,,데이터분석,no match,,,
46,47,Practical Statistics for Data Scientists,"Peter Bruce, Andrew Bruce",O'REILLY,2020.0,9788194435006.0,,데이터분석,no match,,,
75,76,Machine Learning Approaches in Financial Analy...,"Leandros A. Maglaras, Sonali Das, Naliniprava ...",Springer,2023.0,9783031832659.0,,머신러닝(finance),no match,,,
79,80,Text Data Mining,Chengqing Zong,Springer,2021.0,9789811601002.0,,텍스트마이닝,no match,,,
80,81,Text Mining for Information Professionals: An ...,"Manika Lamba, Margam Madhusudhan",Springer,2022.0,9783030850852.0,,텍스트마이닝,no match,,,
91,92,Customer Insights - Second Edition,"Aila Khan, Mohammad Munir Hossain, Sabreena Zo...",Western Sydney University,2023.0,9780731420181.0,,소비자 데이터분석,no match,,,


In [17]:
# Rows whose title similarity is below 0.5
df.loc[df['유사도'].lt(0.5)]

Unnamed: 0,순번,도서명,저자명,출판사,출판년,ISBN(13자리),정가,비고,매칭여부,정확한ISBN,검색된도서명,유사도
9,10,자산운용을 위한 금융 머신러닝\n(원서: Machine Learning for As...,Marcos López de Prado,Cambridge University Press,2021.0,9791161754918,,금융 머신러닝,match,9791161754918,자산운용을 위한 금융 머신러닝,0.329897
10,11,실전 금융 머신러닝 완벽 분석\n(원서: Advances in Financial M...,Marcos López de Prado,Wiley,2019.0,9791161752334,,금융 머신러닝,match,9791161752334,실전 금융 머신러닝 완벽 분석,0.323232
15,16,금융 머신러닝(번역본)\n(원서: Machine Learning in Finance...,Matthew F. Dixon,Springer,2022.0,9791161755939,,금융 머신러닝,match,9791161755939,금융 머신러닝,0.197183
20,21,실용 SQL: PostgreSQL로 시작하는 데이터 스토리텔링 가이드북\n(원서: ...,Anthony DeBarros\n(번역: 임소정),영진닷컴,2023.0,9788931465952,,데이터베이스 시스템,match,9788931465952,Bible Origins (Portions of the New Testament +...,0.12766
21,22,AWS 쿡북\n(원서: AWS Cookbook),"John Culkin, Mike Zazon\n(",에어콘출판,2022.0,9791161757087,,클라우드 컴퓨팅,match,9791161757087,AWS 쿡북,0.387097
22,23,양자 컴퓨팅 개론\n(원서: An Introduction to Quantum Com...,"Phillip Ronald Kaye, Raymond Laflamme\n(번역: 김주현)",에어콘출판,2022.0,9791161757131,,컴퓨터,match,9791161757131,양자 컴퓨팅 개론,0.24
23,24,리월월드 암호학\n(원서: Real-World Cryptography/David W...,David Wong\n(번역: 임지순),제이펍,2023.0,9791192469539,,네트워크,match,9791192469539,리얼월드 암호학,0.245614
25,26,파이썬 라이브러리를 활용한 텍스트 분석\n(원서: Blueprints for Tex...,Jens Albrecht 외 2인\n(번역: 심상진),한빛미디어,2022.0,9791169210331,,비정형분석,match,9791169210331,파이썬 라이브러리를 활용한 텍스트 분석,0.461538
26,27,파이썬 기반 금융 인공지능\n(원서: Artificial Intelligence i...,Yves Hilpisch\n(번역: 김도형),한빛미디어,2022.0,9791169210300,,데이터분석,match,9791169210300,파이썬 기반 금융 인공지능,0.337349
27,28,파이썬을 활용한 금융 분석\n(원서: Python for Finance/Yves H...,Yves Hilpisch\n(번역: 김도형),한빛미디어,2022.0,9791162245170,,데이터분석,match,9791162245170,"파이썬을 활용한 금융 분석: 파이썬의 기초부터 금융공학, 머신러닝, 퀀트 분석, 매...",0.348624
