# 1. 필요 라이브러리 선언

In [1]:
from datetime import datetime, timedelta
import pandas as pd
import requests
from lxml import html
from bs4 import BeautifulSoup 
from urllib.request import Request, urlopen
from urllib.parse import urlencode, quote_plus, unquote
import os
import time
# from concurrent.futures import ThreadPoolExecutor
# import concurrent.futures

# 2. 함수 선언

In [2]:
### 함수정의: 시작일시(datetime 타입)를 입력 시 한달 후의 시작 및 종료일시를 반환하는 함수
### 파마리터정의:
###   - inDf: 시작일시 (예: datetime(2020,1,1))
def getAfterMonthDate(inDt):
    startYear = inDt.year
    startMonth = inDt.month
    endYear = inDt.year
    endMonth = inDt.month + 1
    if startMonth == 12:
        startYear += 1
        startMonth = 1
        endYear = startYear
        endMonth = startMonth + 1
    elif startMonth == 11:
        startMonth += 1
        endYear = startYear + 1
        endMonth = 1
    else:
        startMonth += 1
        endMonth = startMonth + 1
    outBgnDt = datetime(startYear, startMonth, inDt.day)
    outEndDt = datetime(endYear, endMonth, inDt.day) - timedelta(seconds=1)
    return outBgnDt, outEndDt

In [3]:
### 함수정의: 사이트 메타정보를 받아 데이터를 수집 후 수집결과를 반환하는 함수
### 파마리터정의: 
###   - inurl: 메타정보의 "URL"컬럼값 (예: https://www.calspia.go.kr/io/openapi/cm/selectIoCmConstructionList.do )
###   - inSiteName: 메타정보의 "자료대상" (예: 건설사업정보시스템)
###   - inDataName: 메타정보의 "자료명" (예: 공사정보 목록)
###   - inServiceName: 메타정보의 "서비스명" (예: getCntrctInfoListCnstwk)
###   - inParam: 메타정보의 "기본키 정보" (예: {'ServiceKey': 't1howSPLxqyKOseR6gxDm7IGYVVLGc+w3wF7N4e9ufwr2g9sttHbYCQTR4dBbiVc16v1tnmTEkn/baD6et/L6g==','type':'json',...}")
###   - inPageYn: 메타정보의 "페이지 정보" (예: 페이지 파라미터 존재 시 1 값")
### 함수정의: 사이트 메타정보를 받아 데이터를 수집 후 수집결과를 반환하는 함수
def scrapy(inUrl, inSiteName, inDataName, inServiceName, inParam, inPageYn):
    emptyPd = pd.DataFrame()
    i=1
    while True:
        print("{} page scraping start".format(i))

        if(inPageYn==1):
            inParam["pageNo"] = i
        queryParams = '?' + urlencode(inParam)

        response = requests.get(inUrl+queryParams)
        
        time.sleep(0.1)
        
        print(inUrl+queryParams)
        response.encoding=STDENCODING

        try:
            jsondata = response.json()["response"]["body"]["items"]
        except Exception as e:
            if e.args[0] == 'Expecting value: line 1 column 1 (char 0)':
                xmlobj = BeautifulSoup(response.text,"lxml-xml")
                errorCode = xmlobj.find("returnReasonCode").text
                raise Exception(errorCode)

        if( jsondata == []):
            print("{} page is empty".format(i))
            break

        rowData = pd.DataFrame(jsondata)
#        print(rowData)
        emptyPd = emptyPd.append(rowData)

        if(inPageYn == 0):
            print("{} no pageNo".format(inPageYn))
            break
        i = i+1

    print("dataframe {} completed".format(inDataName))
    return emptyPd 

In [4]:
### 함수정의: 데이터프레임을 Append 모드로 OUTPUT PATH에 저장하는 함수
### 파마리터정의:  (★★TBD 추후 HDFS경로 및 메타정보로 컬럼 추가 필요!!★★)
###   - inDf: 저장할 대상 데이터프레임
###   - inSiteName: 메타정보의 "자료대상" (예: 건설사업정보시스템)
###   - inDataName: 메타정보의 "자료명" (예: 공사정보 목록)
###   - inServiceName: 메타정보의 "서비스명" (예: getCntrctInfoListCnstwk)
def savedata(inDf, inSiteName, inDataName, inServiceName):
    # DATA SAVE TO THE OUTPUT PATH FOLDER
    outDir = os.path.join(OUTPUTPATH,inSiteName,inDataName)
    outFile = os.path.join( outDir, inServiceName) + ".csv"
    createFolder(outDir)
    if not os.path.exists(outFile):
        inDf.to_csv(outFile, index=False, encoding="ms949", mode="w")
    else:
        inDf.to_csv(outFile, index=False, encoding="ms949", mode="a", header=False)
    print("{} save compled".format(inDataName) )

In [5]:
### 함수정의: 사이트 메타정보를 받아 데이터를 수집 후 수집결과를 반환하는 함수 (★★TBD 추후 HDFS경로 및 메타정보로 컬럼 추가 필요!!★★)
### 파마리터정의: 
###   - directory: outputpath 
def createFolder(directory):
    try:
        if not os.path.exists(directory):
            os.makedirs(directory)
    except OSError:
        print ('Error: Creating directory. ' +  directory)

In [6]:
# def thread_crawl(paramList, inUrl, inSiteName, inDataName, inServiceName, inPageYn, fuction):
#     per_thread_num = 8
#     thread_list = []
#     result_list = []
#     with ThreadPoolExecutor(max_workers=per_thread_num) as exe:
#         for param in paramList:
#             fs = exe.submit(fuction,URL,SITENAME,DATANAME,SERVICENAME,param,PAGEYN)
#             thread_list.append(fs)
#         for execution in concurrent.futures.as_completed(thread_list):
#             df = execution.result()
#             result_list.append(df)
#     return result_list

# 3. 필요 변수 선언

### 가) 메타데이터 업로드

In [7]:
metadata = pd.read_excel("../../input/datalake_meta22.xlsx", sheet_name="2. 조달청-조달정보개방포털", dtype={"순서":str, "비고":str})

### 나) output 폴더 생성 변수

In [8]:
SITENAME = "조달청"

In [9]:
DATANAME= "나라장터 검색조건에 의한 낙찰된 목록 현황 용역조회"

In [10]:
targetData = metadata.loc[metadata.자료명==DATANAME]

In [11]:
SERVICENAME = targetData["서비스키"].values[0]

### 다) 기본키 설정 (서비스별 변경 요)

#### 1) API KEY

In [12]:
initServiceKey = "t1howSPLxqyKOseR6gxDm7IGYVVLGc+w3wF7N4e9ufwr2g9sttHbYCQTR4dBbiVc16v1tnmTEkn/baD6et/L6g=="

In [13]:
ServiceKeyLst = ["hMw9eN6ZYKvMWUUvEykxVelOH07uAKhQYzFMUV5r4QpvdYgJehQANKE5bay+8Hgy660JdC9mVQCFhRKupVGwaw==",\
                 "Yc4hy9aka4aSXaZzlPFTem6eM79Fm4r2PPJXGxxYxlZVUhXWeYkFbodwmpsXrkEhqUzKYoq7hFyfx3x+MPLarA==",\
                "h2pHFUSHMnsx/wJfhd+fyC4L0X+g16L0FhJvDcos8Px4Fqttih1HONUkKXQ5ITmYlq1vjIYT8/G+twwMfA9m8Q==",\
                "gDMXFxO2+zC7ZuEFiUXl+aWOrzfRGc7F52fIwOg5cdI92qRAnlOAAcL5qdsWx46zn5jMTFiMzG/0rQ/g/k2elw==",\
                "6wr0NwD9FgzjDWjNbT4iC0RBvs7vESA5VpmfNHZWE0BHguUn6YSLbh4FJ0FpMzLsEMtCiz3WxDYI5tcxCuPcPA=="]

#### 2) 조회 구분

In [14]:
inqryDiv = 1

#### 3) 등록일시 기준 조회 시작일자 및 종료일자

In [15]:
startYear, startMonth, startDay = 2012, 1, 1
bgnDt = datetime(startYear, startMonth, startDay)
endDt = datetime(startYear, startMonth + 1, startDay) - timedelta(seconds=1)

#### 4) 조회 rows 수

In [16]:
numOfRows = 999

#### 5) 기본키 딕셔너리 생성

In [17]:
BASEPARAM_KEY = targetData.기본키.values[0].split(",")

In [18]:
BASEPARAM_Lst = []

In [19]:
nowDt = datetime.now().strftime("%Y%m%d%H%M")

In [20]:
while True:
    inqryBgnDt = bgnDt.strftime("%Y%m%d%H%M")
    inqryEndDt = endDt.strftime("%Y%m%d%H%M")
    if inqryBgnDt > nowDt:
        break
    BASEPARAM = {}
    BASEPARAM_VAL = [numOfRows, initServiceKey, inqryDiv, inqryBgnDt, inqryEndDt]
    for i in range(len(BASEPARAM_VAL)):
        BASEPARAM[BASEPARAM_KEY[i]] = BASEPARAM_VAL[i]
        BASEPARAM["type"] = "json"
    BASEPARAM_Lst.append(BASEPARAM)    
    bgnDt, endDt = getAfterMonthDate(bgnDt)

### 5) 함수 파라미터 설정

In [21]:
URL = targetData["URL"].values[0]

In [22]:
PAGEYN=1

In [23]:
STDENCODING='utf-8'

### 6) OUTPUT 기본경로 설정

In [24]:
OUTPUTPATH="../../output"

# 4. 데이터 수집

In [25]:
filePath = os.path.join(OUTPUTPATH,SITENAME,DATANAME,SERVICENAME) + '.csv'

In [26]:
filePath = os.path.join(OUTPUTPATH,SITENAME,DATANAME,SERVICENAME) + '.csv'

if os.path.isfile(filePath):
    os.remove(filePath)

In [27]:
breakPoint = 0

## 가) 초기 인증키를 통한 수집

In [28]:
flag = False

In [29]:
starttime = time.time()
print("수집시작 : ", time.strftime('%c', time.localtime(time.time())))
for i in range(breakPoint, len(BASEPARAM_Lst)):
    try:
        resultDf = scrapy(URL,SITENAME,DATANAME,SERVICENAME,BASEPARAM_Lst[i],PAGEYN)
        print(resultDf)
        savedata(resultDf,SITENAME,DATANAME,SERVICENAME)
    except Exception as e:
        print(e)
        breakPoint = i
        if e.args[0] == "22":
            print("LIMITED_NUMBER_OF_SERVICE_REQUESTS_EXCEEDS_ERROR")
        break
    if i == len(BASEPARAM_Lst) - 1:
        flag = True

수집시작 :  Tue Apr  5 13:38:02 2022
1 page scraping start
http://apis.data.go.kr/1230000/ScsbidInfoService/getScsbidListSttusServcPPSSrch?numOfRows=999&type=json&ServiceKey=t1howSPLxqyKOseR6gxDm7IGYVVLGc%2Bw3wF7N4e9ufwr2g9sttHbYCQTR4dBbiVc16v1tnmTEkn%2FbaD6et%2FL6g%3D%3D&inqryDiv=1&inqryBgnDt=201201010000&inqryEndDt=201201312359&pageNo=1
2 page scraping start
http://apis.data.go.kr/1230000/ScsbidInfoService/getScsbidListSttusServcPPSSrch?numOfRows=999&type=json&ServiceKey=t1howSPLxqyKOseR6gxDm7IGYVVLGc%2Bw3wF7N4e9ufwr2g9sttHbYCQTR4dBbiVc16v1tnmTEkn%2FbaD6et%2FL6g%3D%3D&inqryDiv=1&inqryBgnDt=201201010000&inqryEndDt=201201312359&pageNo=2
3 page scraping start
http://apis.data.go.kr/1230000/ScsbidInfoService/getScsbidListSttusServcPPSSrch?numOfRows=999&type=json&ServiceKey=t1howSPLxqyKOseR6gxDm7IGYVVLGc%2Bw3wF7N4e9ufwr2g9sttHbYCQTR4dBbiVc16v1tnmTEkn%2FbaD6et%2FL6g%3D%3D&inqryDiv=1&inqryBgnDt=201201010000&inqryEndDt=201201312359&pageNo=3
4 page scraping start
http://apis.data.go.kr/1230000/Sc

## 나) 인증키 관련 에러 시 타인증키로 수집

In [30]:
for each in ServiceKeyLst:
    if flag:
        print("수집완료")
        break
    for i in range(breakPoint, len(BASEPARAM_Lst)):
        if i == len(BASEPARAM_Lst) - 1:
            flag = True
        BASEPARAM_Lst[i]["ServiceKey"] = each
        try:
            resultDf = scrapy(URL,SITENAME,DATANAME,SERVICENAME,BASEPARAM_Lst[i],PAGEYN)
            savedata(resultDf,SITENAME,DATANAME,SERVICENAME)
        except Exception as e:
            print(e)
            breakPoint = i
            if e.args[0] == "22":
                print("LIMITED_NUMBER_OF_SERVICE_REQUESTS_EXCEEDS_ERROR")
            else: print(e.args[0])
            break

수집완료


In [31]:
print("종료 시점 파라미터키 : ",BASEPARAM_Lst[breakPoint])
print("수집 종료 : ", time.time()-starttime)

종료 시점 파라미터키 :  {'numOfRows': 999, 'type': 'json', 'ServiceKey': 't1howSPLxqyKOseR6gxDm7IGYVVLGc+w3wF7N4e9ufwr2g9sttHbYCQTR4dBbiVc16v1tnmTEkn/baD6et/L6g==', 'inqryDiv': 1, 'inqryBgnDt': '201201010000', 'inqryEndDt': '201201312359', 'pageNo': 6}
수집 종료 :  1538.4918022155762


In [32]:
# #다중스레드 수집
# try:
#     starttime = time.time()
#     resultdfList = thread_crawl(BASEPARAM_Lst,URL,SITENAME,DATANAME,SERVICENAME,PAGEYN,scrapy)
#     print("time : ", time.time()-starttime)
# #오류 대비 원스레드 수집
# except Exception as e:
#     print(e)
#     print("multithread error, try onethread")
#     starttime = time.time()
#     resultdfList = []
#     for i in urlList:
#         try :
#             eachdata = getdata(i)
#         except :
#             continue
#         if eachdata is not None:
#             resultdfList.append(eachdata)
#     print("time : ", time.time()-starttime)