# Scrape Data from KCDC

## Definitions

In [1]:
# Folder in which the results are saved (adjust to your own setup)
PATH_FOLDER_SAVE = '../../data'

# data.csv located inside the results folder
PATH_CSV_DATA = ''.join((PATH_FOLDER_SAVE, '/data.csv'))

## Helper functions

In [2]:
# save before scraping

import shutil
# importing os module
import os
# import datetime module
import datetime


def clean_file(path_file_name):
    '''
    Archive an already-processed file by renaming it with a timestamp.

    A ``_YYYYMMDD_HH_MM_SS`` suffix is inserted between the base name and
    the extension (``data.json`` -> ``data_20200401_10_30_00.json``). A path
    without an extension simply gets the suffix appended.

    Parameters
    ----------
    path_file_name : str
        Path of the file to rename.

    Returns
    -------
    None
        The outcome is printed; a missing file or a failed move is
        reported on standard output instead of being raised.
    '''
    str_date = '_' + datetime.datetime.now().strftime("%Y%m%d_%H_%M_%S")
    # os.path.splitext also handles paths without an extension
    # (the extension is then an empty string).
    root, extension = os.path.splitext(path_file_name)
    path_file_name_saved = root + str_date + extension

    try:
        shutil.move(path_file_name, path_file_name_saved)
    except FileNotFoundError:
        print('File {} does not exist!'.format(path_file_name))
    except OSError as error:
        # Stay best-effort, but do not mislabel other failures
        # (permissions, busy file, ...) as a missing file.
        print('File {} could not be moved: {}'.format(path_file_name, error))
    else:
        print('File {} moved!'.format(path_file_name_saved))



In [3]:
import scrapy
import scrapy.crawler as crawler
from multiprocessing import Process, Queue
from twisted.internet import reactor

# Wrapper that allows the scraping to be launched several times
def run_spider(spider):
    '''
    Launch a crawl of ``spider`` in a child process and wait for it.

    The child reports ``None`` through a queue when the crawl call returns
    normally, or the exception it caught otherwise; that exception is then
    re-raised in the calling process.
    '''
    def _crawl_in_child(outbox):
        # Runs in the child process: start the crawl, stop the reactor once
        # the crawl deferred fires, then report the outcome.
        try:
            crawl_runner = crawler.CrawlerRunner()
            crawl_done = crawl_runner.crawl(spider)
            crawl_done.addBoth(lambda _: reactor.stop())
            reactor.run()
            outbox.put(None)
        except Exception as error:
            outbox.put(error)

    outbox = Queue()
    worker = Process(target=_crawl_in_child, args=(outbox,))
    worker.start()
    # The outcome is read from the queue before joining the child.
    outcome = outbox.get()
    worker.join()

    if outcome is None:
        return
    raise outcome


In [4]:
import re

class KCDCPageSpider(scrapy.Spider):
    '''
    Spider collecting the links of the KCDC board posts whose title
    contains ``pattern``.

    Configure :
    - KCDCPageSpider.custom_settings : save location
    - num_max_pages : bound compared with the page number of the
      "next page" link; the link is followed only while that number is
      strictly below this value
    - url_first_page : web page to start with
    - pattern : text a post title must contain for its link to be kept
    '''
    name = "kcdc_updates_page"

    custom_settings = {
      'FEED_FORMAT': 'json',
      'FEED_URI': 'pages_kcdc_updates.json'
    }

    num_max_pages = 1

    url_first_page = \
        'https://www.cdc.go.kr/board.es?mid=a30402000000&bid=0030&nPage=1'

    pattern = 'updates'

    def start_requests(self):
        '''Start the crawl from ``url_first_page``.'''
        yield scrapy.Request(url=self.url_first_page, callback=self.parse)

    def parse(self, response):
        '''
        Yield one ``{'link': href}`` item per matching post, then follow the
        "next page" link while its page number is below ``num_max_pages``.
        '''
        # The title filter is passed as an XPath variable so that the
        # ``pattern`` attribute actually drives the selection.
        posts = response.xpath(
            '//div[@class="dbody"]/ul/li/a[contains(@title,$pattern)]',
            pattern=self.pattern)
        for post in posts:
            yield {
                'link': post.xpath('@href').extract_first()
            }

        # e.g. https://www.cdc.go.kr/board.es?mid=a30402000000&bid=0030&nPage=2
        next_page = response.xpath('//a[@class="pageNext"]/@href').get()
        if next_page is None:
            return

        # Page number carried by the link; stop when it cannot be read.
        res_re = re.search(r'[?&]nPage=(\d+)', next_page)
        if res_re is None:
            return

        if int(res_re.group(1)) < self.num_max_pages:
            # The href may be relative: resolve it against the current page,
            # because scrapy.Request rejects URLs without a scheme.
            yield scrapy.Request(response.urljoin(next_page),
                                 callback=self.parse)

In [5]:
#int(re.search("(?<=\&nPage\=)\d+$", 
#                         "https://www.cdc.go.kr/board.es?mid=a30402000000&bid=0030&nPage=1").group(0))

#response.xpath('//div[@class="dbody"]/ul/li/a[contains(@title,"updates")]')

In [6]:
# First page of the KCDC board to scrape
URL_PAGES_KCDC_UPDATES = (
    "https://www.cdc.go.kr/board.es?mid=a30402000000&bid=0030&nPage=1"
)

# pages_kcdc_updates.json located inside the results folder
PATH_PAGES_KCDC_UPDATES = ''.join(
    (PATH_FOLDER_SAVE, '/pages_kcdc_updates.json'))

In [7]:
# Archive a previous output file, if any: clean_file renames it with a timestamp
clean_file(PATH_PAGES_KCDC_UPDATES)

File ../../data/pages_kcdc_updates.json does not exist!


In [None]:
# configure scraping
KCDCPageSpider.url_first_page = URL_PAGES_KCDC_UPDATES
# The feed has to be written to the local output file (the one archived by
# clean_file below), not to the URL of the page being scraped.
KCDCPageSpider.custom_settings = {
      'FEED_FORMAT': 'json',
      'FEED_URI': PATH_PAGES_KCDC_UPDATES
    }
KCDCPageSpider.num_max_pages = 1

# clean last output
clean_file(PATH_PAGES_KCDC_UPDATES)
# scraping page urls 
run_spider(KCDCPageSpider)

File ../../data/pages_kcdc_updates.json does not exist!
