# Selenium 국회의원 데이터 스크래핑

## Selenium 및 웹 드라이버 설치

In [None]:
!pip install Selenium
!apt-get update
!apt install chromium-chromedriver
!cp /usr/lib/chromium-brower/chromedriver /usr/bin

import sys
sys.path.insert(0, '/usr/lib/chrome-brower/chromedriver')

from selenium import webdriver

Collecting Selenium
[?25l  Downloading https://files.pythonhosted.org/packages/80/d6/4294f0b4bce4de0abf13e17190289f9d0613b0a44e5dd6a7f5ca98459853/selenium-3.141.0-py2.py3-none-any.whl (904kB)
[K     |████████████████████████████████| 911kB 4.3MB/s 
Installing collected packages: Selenium
Successfully installed Selenium-3.141.0
Get:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/ InRelease [3,626 B]
Ign:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Ign:3 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Get:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release [697 B]
Hit:5 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:6 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Get:7 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release.gpg [836 B]
Hit:8 htt

## 라이브러리 import

In [None]:
import os
import shutil
import time
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from urllib.request import urlretrieve

# Chrome command-line switches shared by every driver created from these
# options; scraping() passes this object to webdriver.Chrome.
chrome_options = webdriver.ChromeOptions()
for _flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
  chrome_options.add_argument(_flag)

## 국회의원 기본정보 스크래핑

In [None]:
def get_politician_info(dir_name, wd):
  """Read the profile table of the page loaded in ``wd`` into a dict.

  The table is located by the class name ``depAnalProfileTbl``. Its header
  cells (``th``) supply the keys and its data cells (``td``) the values; the
  two lists are paired by position, so surplus cells on either side are
  ignored.

  Args:
    dir_name: Unused by this function.
    wd: Driver object exposing ``find_element_by_class_name``.

  Returns:
    A dict mapping each header text to the text of the data cell at the
    same position.
  """
  table = wd.find_element_by_class_name('depAnalProfileTbl')

  headers = [th.text for th in table.find_elements_by_tag_name('th')]
  values = [td.text for td in table.find_elements_by_tag_name('td')]

  return dict(zip(headers, values))

## 국회의원 뉴스 인용문

In [None]:
def get_news_quote(dir_name, name, wd, news_max=5):
  """Write news quotes from the page loaded in ``wd`` to a text file.

  Walks the paginated quote list, writing the text of every ``<b>`` element
  inside ``newsInQuotList`` to ``<dir_name>/<name>뉴스 인용문.txt`` (one quote
  per line) until ``news_max`` quotes have been written, the last pagination
  item is marked ``disabled``, or any page interaction fails.

  Args:
    dir_name: Directory that receives the output file.
    name: Prefix of the output file name.
    wd: Driver object positioned on the news-quote tab.
    news_max: Stop after this many quotes have been written.

  Returns:
    None. The result is the file written to disk plus progress output.
  """
  page_no = 0
  news_count = 0
  # The counter text is trimmed by one leading and two trailing characters;
  # the result is only used in the progress output below.
  news_total = wd.find_element_by_id('newsInQoutTotalCount').text[1:-2]
  news_path = dir_name + '/' + name + '뉴스 인용문.txt'

  # The context manager closes the file on every exit path; the explicit
  # encoding keeps the Korean text independent of the platform default.
  with open(news_path, 'w', encoding='utf-8') as news_file:
    while True:
      try:
        page_no += 1
        paging = wd.find_element_by_id('newsInQuotListPaging')
        pagination = paging.find_element_by_class_name('pagination')
        pagination.find_element_by_link_text(str(page_no)).click()
        time.sleep(1)
        print(f"뉴스 인용문 {page_no} 페이지")

        box_list = wd.find_element_by_id('newsInQuotList')
        for quote in box_list.find_elements_by_tag_name('b'):
          news_file.write(quote.text + '\n')
          news_count += 1
          print(f"  {news_count}/{news_total}", [quote.text])

          if news_count >= news_max:
            break

        if news_count >= news_max:
          break

        # The last <li> of the pagination bar is the "next" control.
        next_button = pagination.find_elements_by_tag_name('li')[-1]
        if next_button.get_attribute('class') == 'disabled':
          break

        # Every fifth page the "next" control is clicked before the following
        # page number is looked up.
        if page_no % 5 == 0:
          next_button.find_element_by_tag_name('a').click()
          time.sleep(1)

      except Exception:
        # Best effort: any failed lookup or click ends the scrape.
        # KeyboardInterrupt and SystemExit are deliberately not caught.
        break

## 국회의원 회의록

In [None]:
def get_record_quote(dir_name, name, wd, record_max=5):
  """Write meeting-record quotes from the page loaded in ``wd`` to a file.

  For each page of the record list, every link in the record table is
  clicked and the text of all ``<td>`` cells that then appear is appended to
  ``<dir_name>/<name>회의록 인용문.txt``. Scraping stops after ``record_max``
  records, when the last pagination item is marked ``disabled``, or when any
  page interaction fails.

  Args:
    dir_name: Directory that receives the output file.
    name: Prefix of the output file name.
    wd: Driver object positioned on the politician detail page.
    record_max: Stop after this many records have been opened.

  Returns:
    None. The result is the file written to disk plus progress output.
  """
  page_no = 0
  record_count = 0
  record_total = wd.find_element_by_id('recordTotalCount').text
  record_path = dir_name + '/' + name + '회의록 인용문.txt'

  # The context manager closes the file on every exit path; the explicit
  # encoding keeps the Korean text independent of the platform default.
  with open(record_path, 'w', encoding='utf-8') as record_file:
    while True:
      try:
        page_no += 1
        paging = wd.find_element_by_id('recordTableListPaging')
        # Same lookup as get_news_quote; without it `pagination` is undefined.
        pagination = paging.find_element_by_class_name('pagination')
        pagination.find_element_by_link_text(str(page_no)).click()
        time.sleep(1)
        print(f"회의록 인용문 {page_no}페이지")

        # NOTE(review): the record table and the quote panel are both looked
        # up with the id 'birefTableList'. That may be a misspelling and/or
        # two different elements -- confirm against the live page.
        record_table = wd.find_element_by_id('birefTableList')
        record_list = record_table.find_elements_by_tag_name('a')
        for record in record_list:
          record.click()
          time.sleep(1)

          quote_list = wd.find_element_by_id('birefTableList')
          quote_texts = [cell.text
                         for cell in quote_list.find_elements_by_tag_name('td')]
          for quote_text in quote_texts:
            record_file.write(quote_text + '\n')

          record_count += 1
          # Show the last cell as a list (empty when the record had no
          # cells), matching the progress output of get_news_quote.
          print(f" {record_count}/{record_total}", quote_texts[-1:])

          if record_count >= record_max:
            break

        if record_count >= record_max:
          break

        # The last <li> of the pagination bar is the "next" control.
        next_button = pagination.find_elements_by_tag_name('li')[-1]
        if next_button.get_attribute('class') == 'disabled':
          break

        # Every fifth page the "next" control is clicked before the following
        # page number is looked up.
        if page_no % 5 == 0:
          next_button.find_element_by_tag_name('a').click()
          time.sleep(1)

      except Exception:
        # Best effort: any failed lookup or click ends the scrape.
        # KeyboardInterrupt and SystemExit are deliberately not caught.
        break

## `scraping()`: 스크래핑

In [None]:
def scraping(dir_name, politician_max=3):
  """Scrape profile, news quotes and meeting records for politicians.

  Opens the assembly listing page by page in the first browser tab and each
  politician's detail page in a second tab. For every politician the profile
  image is downloaded to ``dir_name``, the profile table is collected, and
  the news-quote and meeting-record scrapers are run.

  Args:
    dir_name: Existing directory that receives images and quote files.
    politician_max: Stop after this many politicians have been processed.

  Returns:
    A pandas DataFrame with one row per processed politician, built from the
    dicts returned by ``get_politician_info``.
  """
  page_no = 0
  politician_count = 0
  # Rows are collected in a list and turned into a DataFrame once at the
  # end; DataFrame.append no longer exists in pandas 2.x.
  politician_rows = []

  wd = webdriver.Chrome('chromedriver', options=chrome_options)
  try:
    # Second tab: detail pages load there so the listing in the first tab
    # (and the elements found in it) stays intact while iterating.
    wd.execute_script('window.open("about:blank", "_blank");')
    tabs = wd.window_handles

    while True:

      try:
        page_no += 1
        wd.switch_to.window(tabs[0])

        url = f"https://www.bigkinds.or.kr/v2/depthAnalysis/assembly.do?page={page_no}"
        wd.get(url)
        politician_total = wd.find_element_by_xpath('//*[@id="contents"]/div/div[3]/div/button[1]/span').text[1:-1]
        politician_items = wd.find_elements_by_class_name('assembly-item')

        # A listing page without items means the last page has been passed.
        if not politician_items:
          break

        for item in politician_items:
          wd.switch_to.window(tabs[0])

          politician_count += 1
          print("---------------------------------------")
          print(f"국회의원 {politician_count}/{politician_total} 명")
          name = item.find_element_by_class_name('assembly-item__profile__name').text
          # Only the first line of the name element is used.
          name = name.split('\n')[0]
          print(name)

          detail_link = item.find_element_by_tag_name('a').get_attribute('href')

          wd.switch_to.window(tabs[1])
          wd.get(detail_link)

          print("[프로필 이미지 다운로드]")
          profile_image = wd.find_element_by_class_name('depAnalProfileImg')
          image_src = profile_image.find_element_by_tag_name('img').get_attribute('src')
          file_name = dir_name + '/' + name + '.jpg'
          print(file_name)
          urlretrieve(image_src, file_name)

          print("[기본 정보 스크래핑]")
          politician_dic = get_politician_info(dir_name, wd)
          print(" ", politician_dic)
          politician_rows.append(politician_dic)

          print("[뉴스 인용문 스크래핑]")
          wd.find_element_by_id('depthAnalTab2_1').find_element_by_tag_name('a').click()
          get_news_quote(dir_name, name, wd)

          print("[회의록 인용문 스크래핑]")
          get_record_quote(dir_name, name, wd)

          if politician_count >= politician_max:
            break

        if politician_count >= politician_max:
          break

      except (AttributeError, NoSuchElementException) as e:
        print(e)
        break

  finally:
    # quit() ends the whole driver session (both tabs); close() would only
    # close the current window. Running it in `finally` also releases the
    # browser when an unexpected error propagates.
    wd.quit()

  return pd.DataFrame(politician_rows)

## 스크래핑 시작

In [None]:
# Output directory for images and quote files; it is recreated empty on
# every run so that results from earlier runs do not linger.
dir_name = './politician'

already_exists = os.path.isdir(dir_name)
if already_exists:
  shutil.rmtree(dir_name)
os.makedirs(dir_name)
print(f"{dir_name} 디렉토리 생성")

# Run the scraper and keep the resulting profile table for the cells below.
politician_df = scraping(dir_name)

./politician 디렉토리 생성
---------------------------------------
국회의원 1/300 명
강기윤 (姜起潤)
[프로필 이미지 다운로드]
./politician/강기윤 (姜起潤).jpg
[기본 정보 스크래핑]
  {'이름': '강기윤(姜起潤)', '이름(영문)': 'KANG GIYUN', '출생일': '1960-06-04', '정당': '미래통합당', '지역구': '경남 창원시성산구', '소속위원회': '보건복지위원회', '당선기록': '재선(19대, 21대)', '사무실전화': '02-784-1751', '홈페이지': 'http://blog.naver.com/ggotop', '이메일': 'ggotop@naver.com', '보좌관': '강종길 , 김홍광', '경력': '[학력] 마산공고(26회) 창원대학교 행정학과 중앙대학교 행정대학원 지방의회과 석사 창원대학교 대학원 행정학 박사 [경력] 보건복지위원회 위원 미래통합당 소상공인살리기 특별위원회 부위원장 미래통합당 경남도당 민생특위 위원장 제19대 국회의원 (새누리당/경남 창원시 성산구) 새누리당 원내부대표'}
[뉴스 인용문 스크래핑]
[회의록 인용문 스크래핑]
---------------------------------------
국회의원 2/300 명
강대식 (姜大植)
[프로필 이미지 다운로드]
./politician/강대식 (姜大植).jpg
[기본 정보 스크래핑]
  {'이름': '강대식(姜大植)', '이름(영문)': 'KANG DAESIK', '출생일': '1959-11-02', '정당': '미래통합당', '지역구': '대구 동구을', '소속위원회': '국방위원회', '당선기록': '초선(21대)', '사무실전화': '', '홈페이지': '', '이메일': '', '보좌관': '박홍규 , 정운태', '경력': ''}
[뉴스 인용문 스크래핑]
[회의록 인용문 스크래핑]
---------------------------------------
국회의원 3/300 명
강

In [None]:
# Display the profile table returned by scraping().
politician_df

Unnamed: 0,경력,당선기록,보좌관,사무실전화,소속위원회,이름,이름(영문),이메일,정당,지역구,출생일,홈페이지
0,[학력] 마산공고(26회) 창원대학교 행정학과 중앙대학교 행정대학원 지방의회과 석사...,"재선(19대, 21대)","강종길 , 김홍광",02-784-1751,보건복지위원회,강기윤(姜起潤),KANG GIYUN,ggotop@naver.com,미래통합당,경남 창원시성산구,1960-06-04,http://blog.naver.com/ggotop
1,,초선(21대),"박홍규 , 정운태",,국방위원회,강대식(姜大植),KANG DAESIK,,미래통합당,대구 동구을,1959-11-02,
2,2018. 10. ~ 2019. 5. 민주연구원 자치발전연구센터 본부장 2016. ...,초선(21대),유진우,02-784-2747~9,교육위원회,강득구(姜得求),KANG DEUKGU,mainsail440@daum.net,더불어민주당,경기 안양시만안구,1963-05-27,https://blog.naver.com/dulipapa


## 스크래핑 확인

In [None]:
# List the files written into the output directory.
!ls politician/

'강대식 (姜大植).jpg'		    '강득구 (姜得求)회의록 인용문.txt'
'강대식 (姜大植)뉴스 인용문.txt'    '강기윤 (姜起潤).jpg'
'강대식 (姜大植)회의록 인용문.txt'  '강기윤 (姜起潤)뉴스 인용문.txt'
'강득구 (姜得求).jpg'		    '강기윤 (姜起潤)회의록 인용문.txt'
'강득구 (姜得求)뉴스 인용문.txt'


In [None]:
# Send one of the scraped profile images from the Colab runtime to the
# browser as a download.
from google.colab import files
files.download('./politician/강대식 (姜大植).jpg')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Print the news-quote file written for one politician.
! cat './politician/강대식 (姜大植)뉴스 인용문.txt'

In [None]:
# Print the meeting-record quote file written for one politician.
! cat './politician/강대식 (姜大植)회의록 인용문.txt'