# KCI Open API를 활용한 논문 서지정보 수집

In [None]:
# Mount Google Drive in the Colab runtime at /content/drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import requests
import pickle
import time
import sys
# sys.setrecursionlimit(10000000)
# from mpire import WorkerPool
import pandas as pd
from tqdm.notebook import tqdm
from random import uniform
import lxml
import lxml.etree as et
from bs4 import BeautifulSoup
import glob
import urllib3
urllib3.disable_warnings()
from natsort import natsorted
import re

In [None]:
# from google.colab import userdata
# userdata.get('key')

In [None]:
# KCI Open API key (blank placeholder here; fill in your own key before running).
# NOTE(review): prefer loading the key from a secret store rather than hard-coding it in the notebook.
key = '            '

In [None]:
# Browser-like HTTP headers passed to request_until_success() for each KCI API call.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "Accept-Language": "en-US,en;q=0.9",
    # NOTE(review): "br" is advertised; presumably a brotli decoder is available to requests in this runtime — confirm.
    "Accept-Encoding": "gzip, deflate, br",
    "Connection": "keep-alive",
    "DNT": "1",
}

### 논문 기본 정보
https://www.kci.go.kr/kciportal/po/openapi/openDataView.kci?datasetBean.dtstSeqNo=1

In [None]:
def request_until_success(url, headers, timeout=3, delay=1, max_attempts=None):
    """
    Send a GET request to ``url``, retrying until a successful response arrives.

    An attempt fails when it times out, raises any other
    ``requests.RequestException``, or returns an HTTP error status
    (detected through ``raise_for_status``). Each failure is printed, then the
    function sleeps for ``delay`` seconds and tries again.

    Args:
    - url (str): The URL to request.
    - headers (dict): HTTP headers sent with every attempt.
    - timeout (int): The timeout for the request in seconds.
    - delay (int): The delay between retries in seconds.
    - max_attempts (int | None): Maximum number of attempts. ``None``
      (the default) keeps retrying forever, which is the original behavior.

    Returns:
    - response: The successful response from the server.

    Raises:
    - requests.RequestException: The last failure, re-raised once
      ``max_attempts`` attempts have all failed.
    """
    attempt = 1
    while True:
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()  # Raises HTTPError for bad responses
            return response
        except requests.Timeout as e:
            print(f"Request timed out: {e}")
            # The name bound by "except ... as" is cleared when the handler ends.
            last_error = e
        except requests.RequestException as e:
            print(f"Request failed: {e}")
            last_error = e

        # Give up instead of looping forever on a permanent failure
        # (only when the caller asked for a cap).
        if max_attempts is not None and attempt >= max_attempts:
            raise last_error

        time.sleep(delay)
        attempt += 1

In [None]:
# Request the first result page (10 records) of the article search for the title keyword
url = (
    'http://open.kci.go.kr/po/openapi/openApiSearch.kci'
    f'?apiCode=articleSearch&key={key}&displayCount=10&title=박완서&page=1'
)
basic = request_until_success(url, headers).text

Request failed: HTTPConnectionPool(host='open.kci.go.kr', port=80): Read timed out.


In [None]:
# Display the raw XML text of the response
basic

'<?xml version="1.0" encoding="UTF-8"?>\r\n<MetaData>\r\n  <inputData>\r\n    <key>98999546</key>\r\n    <apiCode>articleSearch</apiCode>\r\n    <title><![CDATA[박완서]]></title>\r\n    <page>1</page>\r\n    <displayCount>10</displayCount>\r\n  </inputData>\r\n  <outputData>\r\n    <result>\r\n      <total>278</total>\r\n    </result>\r\n    <record>\r\n      <journalInfo>\r\n        <journal-name>한국문학논총</journal-name>\r\n        <publisher-name>한국문학회</publisher-name>\r\n        <pub-year>2010</pub-year>\r\n        <pub-mon>04</pub-mon>\r\n        <volume />\r\n        <issue>54</issue>\r\n      </journalInfo>\r\n      <articleInfo article-id="ART001438956">\r\n        <article-categories>한국어와문학</article-categories>\r\n        <article-regularity>Y</article-regularity>\r\n        <title-group>\r\n          <article-title lang="original"><![CDATA[박완서 노년소설의 젠더시학]]></article-title>\r\n          <article-title lang="foreign"><![CDATA[Gender Poetics found in Park Wan Seo\'s Old age novel - Foc

In [None]:
# Parse an XML response into a list of dicts, one per <record> (developed by the professor)
def xml_to_dict_list(xml_data):
    """
    Convert a KCI article-search XML response into a list of flat dicts.

    Each ``<record>`` element becomes one dict. The ``article-id`` attribute of
    ``<articleInfo>`` is stored first; every other value-carrying element is
    stored under its tag name. ``article-title`` and ``abstract`` elements are
    stored per language as ``<tag>_<lang>`` (``unknown`` when the ``lang``
    attribute is missing).

    Args:
    - xml_data (str): The XML document as text.

    Returns:
    - list[dict]: One dict of string values per record; empty when the
      document contains no ``<record>``.

    Notes:
    - A record without ``<articleInfo>`` gets an empty ``article-id`` instead
      of raising ``AttributeError``.
    - When a tag occurs more than once in a record (for example several
      ``<author>`` elements), the values are joined with ``"; "`` so that no
      value is silently overwritten.
    """
    # The response text carries an XML encoding declaration, so parse it as UTF-8 bytes.
    root = et.fromstring(bytes(xml_data, encoding='utf8'))

    # Container elements hold no value of their own; article-id is stored up front.
    container_tags = {
        'record', 'journalInfo', 'articleInfo', 'title-group',
        'author-group', 'abstract-group', 'article-id',
    }
    # Elements that may appear once per language.
    per_language_tags = {'article-title', 'abstract'}

    records = []

    for record in root.findall('.//record'):
        article_info = record.find('.//articleInfo')
        article_id = '' if article_info is None else article_info.attrib.get('article-id', '')
        record_dict = {'article-id': article_id}

        for element in record.iter():
            tag = element.tag
            if tag in per_language_tags:
                field = f"{tag}_{element.attrib.get('lang', 'unknown')}"
                record_dict[field] = element.text.strip() if element.text else ''
            elif tag not in container_tags and element.text:
                value = element.text.strip()
                if record_dict.get(tag) and value:
                    # Repeated tag: keep every value rather than only the last one.
                    record_dict[tag] += f"; {value}"
                elif not record_dict.get(tag):
                    record_dict[tag] = value

        records.append(record_dict)

    return records

In [None]:
# Display the raw XML text of the response
basic

'<?xml version="1.0" encoding="UTF-8"?>\r\n<MetaData>\r\n  <inputData>\r\n    <key>98999546</key>\r\n    <apiCode>articleSearch</apiCode>\r\n    <title><![CDATA[박완서]]></title>\r\n    <page>1</page>\r\n    <displayCount>10</displayCount>\r\n  </inputData>\r\n  <outputData>\r\n    <result>\r\n      <total>273</total>\r\n    </result>\r\n    <record>\r\n      <journalInfo>\r\n        <journal-name>한국문학논총</journal-name>\r\n        <publisher-name>한국문학회</publisher-name>\r\n        <pub-year>2010</pub-year>\r\n        <pub-mon>04</pub-mon>\r\n        <volume />\r\n        <issue>54</issue>\r\n      </journalInfo>\r\n      <articleInfo article-id="ART001438956">\r\n        <article-categories>한국어와문학</article-categories>\r\n        <article-regularity>Y</article-regularity>\r\n        <title-group>\r\n          <article-title lang="original"><![CDATA[박완서 노년소설의 젠더시학]]></article-title>\r\n          <article-title lang="foreign"><![CDATA[Gender Poetics found in Park Wan Seo\'s Old age novel - Foc

In [None]:
# Convert the XML response into a list of dicts (one dict per record)
xml_to_dict_list(basic)

[{'article-id': 'ART001438956',
  'journal-name': '한국문학논총',
  'publisher-name': '한국문학회',
  'pub-year': '2010',
  'pub-mon': '04',
  'issue': '54',
  'article-categories': '한국어와문학',
  'article-regularity': 'Y',
  'article-title_original': '박완서 노년소설의 젠더시학',
  'article-title_foreign': "Gender Poetics found in Park Wan Seo's Old age novel - Focusing on a collection of her short stories -",
  'article-title_english': "Gender Poetics found in Park Wan Seo's Old age novel - Focusing on a collection of her short stories -",
  'author': '유제분(부산대학교)',
  'abstract_original': '본고는 박완서의 전 시기에 걸친 단편을 대상으로 노년소설에 나타난 노인의 젠더변주를 살펴보고자 한다. 박완서는 노인의 젠더에 관한 지속적인 관심을 30여 년에 걸쳐 단편에 담아 전개하며 그녀만의 독특한 노인성 문학을 성취해 내고 있다. 젠더는 문화연구의 정치적 이슈로 부상했지만 노인의 젠더는 진지한 성적 관심의 대상에선 벗어난 것이었다. 노인의 젠더 탐색은 우리 안에서 자행되어온 배제와 편견의 논리인 폭력성을 고발하고 증언해 온 박완서 서사의 한 축이다.\n박완서 노년소설의 젠더 시학은 초기- 중기- 후기의 단계로 나눠지며 현실을 진단하고 전망한다. 초기에는 주변화된 노인의 ‘젠더’가 기괴한(uncanny) 몸을 통해 환기된다. ‘기괴함’은 성적 존재로서의 노인과 노인을 읽는 우리들의 부정적 시선을 함의하는 것이다. 중기에는 노인의 젠더가 ‘부인된 애착’(

In [None]:
# Build a DataFrame (table) from the parsed records
pd.DataFrame(xml_to_dict_list(basic))

Unnamed: 0,article-id,journal-name,publisher-name,pub-year,pub-mon,issue,article-categories,article-regularity,article-title_original,article-title_foreign,...,author,abstract_original,abstract_english,fpage,lpage,doi,citation-count,url,verified,volume
0,ART001438956,한국문학논총,한국문학회,2010,4,54.0,한국어와문학,Y,박완서 노년소설의 젠더시학,Gender Poetics found in Park Wan Seo's Old age...,...,유제분(부산대학교),본고는 박완서의 전 시기에 걸친 단편을 대상으로 노년소설에 나타난 노인의 젠더변주를...,The purpose of this study is to examine how Pa...,273,300,http://dx.doi.org/10.16873/tkl.2010..54.273,24,https://www.kci.go.kr/kciportal/ci/sereArticle...,Y,
1,ART000995637,한국언어문학,한국언어문학회,2006,2,56.0,한국어와문학,Y,박완서 자전소설의 서술 전략,,...,박성천(광주대),,This paper attempts to inquire into Park Wan－s...,197,220,,0,https://www.kci.go.kr/kciportal/ci/sereArticle...,N,
2,ART001534045,우리문학연구,우리문학회,2011,2,32.0,한국어와문학,Y,박완서 소설의 오빠 표상 연구,A Study on the representation of the brother i...,...,이경재(숭실대학교),이 글은 박완서 소설 전체를 대상으로 하여 오빠가 표상되는 방식의 변화양상과 그것이...,This thesis examined a change aspect and meani...,363,392,,16,https://www.kci.go.kr/kciportal/ci/sereArticle...,Y,
3,ART001947341,한국언어문학,한국언어문학회,2014,12,91.0,한국어와문학,Y,박완서 소설의 치유 공간 연구,A Study on Healing Space in Park Wan-seo's Novels,...,김미영(전주대학교),This study set out to examine the issues of sp...,This study set out to examine the issues of sp...,175,202,,2,https://www.kci.go.kr/kciportal/ci/sereArticle...,Y,
4,ART000873367,한국문학이론과 비평,한국문학이론과비평학회,2003,9,3.0,한국어와문학,Y,박완서의 두 겹의 글쓰기,Park Wan Seo's Double Writing,...,한혜선(경문대),,,346,370,,1,https://www.kci.go.kr/kciportal/ci/sereArticle...,N,7.0
5,ART002026282,한어문교육,한국언어문학교육학회,2015,8,,국어교육,Y,박완서 소설 속에 나타난 근대를 향한 시선과 변주양상 - 박완서의 단편소설을 중심으로,Viewpoint & Variations on Modern Times in Nove...,...,이지혜(이화여자대학교),"In novels by Park Wan Seo, since dailiness is ...","In novels by Park Wan Seo, since dailiness is ...",265,289,,0,https://www.kci.go.kr/kciportal/ci/sereArticle...,Y,33.0
6,ART002211949,한국문학이론과 비평,한국문학이론과비평학회,2017,3,1.0,한국어와문학,Y,박완서 동화의 비판의식과 풍자,Park Wanseo's Criticism and Sarcasm of Fairy T...,...,박산향(동아대학교),박완서는 소설가로서 문학성을 인정받으며 꾸준히 연구되고 있는 작가다. 소설뿐만 아니...,Park Wanseo is a writer who has been recognize...,161,181,http://dx.doi.org/10.20461/KLTC.2017.3.74.161,1,https://www.kci.go.kr/kciportal/ci/sereArticle...,Y,21.0
7,ART001551678,한국문예창작,한국문예창작학회,2011,4,1.0,문학,Y,박완서 대중소설의 서사성 연구,"A study on narrative in Park, Wan-Seo's popula...",...,박성천(전남대학교),The prominent characteristic of Park s novel l...,The prominent characteristic of Park s novel l...,61,89,,2,https://www.kci.go.kr/kciportal/ci/sereArticle...,Y,10.0
8,ART002255265,동악어문학,동악어문학회,2017,8,72.0,한국어와문학,Y,정치적 텍스트로서의 박완서 소설,Park Wan-seo's Novel as a Political Text,...,신샛별(동국대학교),공적인 것과 사적인 것을 구분하는 오랜 관행 속에서 정치는 곧 남성의 영역으로 여겨...,Politics has long been regarded as the domain ...,205,232,http://dx.doi.org/10.25150/dongak.2017..72.007,3,https://www.kci.go.kr/kciportal/ci/sereArticle...,Y,
9,ART002649180,한국어문교육,한국어문교육연구소,2020,11,33.0,국어교육,Y,박완서의 단편소설과 죽음교육,Short Stories by Park Wan-seo and Death Education,...,박수현(공주대학교),"이 논문은 박완서의 단편소설 「여덟 개의 모자로 남은 당신」, 「대범한 밥상」, 「...",This paper presents the content of death educa...,209,235,http://dx.doi.org/10.24008/klle.2020..33.009,5,https://www.kci.go.kr/kciportal/ci/sereArticle...,Y,


In [None]:
# Total number of search results, read from the <total> element of the response
total = int(
    et.fromstring(basic.encode('utf8'))
    .find('.//total')
    .text
)
total

278

In [None]:
# Number of result pages to request (10 records per page, matching displayCount=10).
# Ceiling division: a total that is an exact multiple of 10 must not add an
# empty extra page, and a total of 0 needs no page at all.
pages = (total + 9) // 10
pages

28

In [None]:
title = '박완서'
basic_list = []
# Fetch every result page in turn and accumulate the parsed records
for page in tqdm(range(1, pages + 1)):
    url = (
        'http://open.kci.go.kr/po/openapi/openApiSearch.kci'
        f'?apiCode=articleSearch&key={key}&displayCount=10&title={title}&page={page}'
    )
    basic = request_until_success(url, headers).text
    basic_list += xml_to_dict_list(basic)

  0%|          | 0/28 [00:00<?, ?it/s]

Request failed: HTTPConnectionPool(host='open.kci.go.kr', port=80): Read timed out.
Request timed out: HTTPSConnectionPool(host='open.kci.go.kr', port=443): Read timed out. (read timeout=3)
Request timed out: HTTPSConnectionPool(host='open.kci.go.kr', port=443): Read timed out. (read timeout=3)
Request timed out: HTTPSConnectionPool(host='open.kci.go.kr', port=443): Read timed out. (read timeout=3)
Request failed: HTTPConnectionPool(host='open.kci.go.kr', port=80): Read timed out.
Request timed out: HTTPSConnectionPool(host='open.kci.go.kr', port=443): Read timed out. (read timeout=3)
Request timed out: HTTPSConnectionPool(host='open.kci.go.kr', port=443): Read timed out. (read timeout=3)
Request timed out: HTTPSConnectionPool(host='open.kci.go.kr', port=443): Read timed out. (read timeout=3)
Request failed: HTTPConnectionPool(host='open.kci.go.kr', port=80): Read timed out.
Request timed out: HTTPSConnectionPool(host='open.kci.go.kr', port=443): Read timed out. (read timeout=3)
Reques

In [None]:
# Display the collected list of record dicts
basic_list

In [None]:
# Build a DataFrame (table) from the collected records and save it to Google Drive as Excel
df = pd.DataFrame(data=basic_list)
df.to_excel(
    '/content/drive/MyDrive/201_[Lecture] Tutoring/허수빈2/data/KCI/박완서_논문기본정보.xlsx',
    index=False,
)

In [None]:
# Save to an Excel file in the current working directory, without the index column.
# index is a boolean option, so pass False explicitly instead of relying on None being falsy.
df.to_excel('./논문기본정보.xlsx', index=False)