# Selenium CGV 영화 리뷰 스크래핑

## Selenium 및 웹 드라이버 설치

In [None]:
# Environment setup for a hosted notebook: install Selenium and a Chromium
# driver, then build the Chrome options reused by every scraper below.

# Install the Selenium Python package.
!pip install Selenium
# Refresh the apt package index, then install the Chromium driver package.
!apt-get update
!apt install chromium-chromedriver
# Copy the driver binary into /usr/bin (presumably so that the bare name
# 'chromedriver' resolves via PATH -- confirm on the target image).
!cp /usr/lib/chromium-browser/chromedriver /usr/bin/

import sys
# Prepends the driver's file path to Python's module search path.
# NOTE(review): sys.path only affects Python imports, not executable lookup,
# so this line likely has no effect on finding chromedriver -- confirm.
sys.path.insert(0, '/usr/lib/chromium-browser/chromedriver')

from selenium import webdriver
# Shared Chrome options; later cells read this module-level name.
chrome_options = webdriver.ChromeOptions()
# Run Chrome without a visible window.
chrome_options.add_argument('--headless')
# Disable the Chrome sandbox (commonly needed when running as root in a
# container -- assumption about the host environment).
chrome_options.add_argument('--no-sandbox')
# Avoid relying on /dev/shm for shared memory (commonly needed on small
# container images -- assumption about the host environment).
chrome_options.add_argument('--disable-dev-shm-usage')

Collecting Selenium
[?25l  Downloading https://files.pythonhosted.org/packages/80/d6/4294f0b4bce4de0abf13e17190289f9d0613b0a44e5dd6a7f5ca98459853/selenium-3.141.0-py2.py3-none-any.whl (904kB)
[K     |████████████████████████████████| 911kB 2.7MB/s 
Installing collected packages: Selenium
Successfully installed Selenium-3.141.0
Get:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/ InRelease [3,626 B]
Ign:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Get:3 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/ Packages [95.3 kB]
Ign:4 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Get:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release [697 B]
Hit:6 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:7 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release.gpg [836 B]


## CGV 영화 리뷰 긁어오기

* 아이언맨: http://www.cgv.co.kr/movies/detail-view/?midx=38262#1
* 다크나이트: http://www.cgv.co.kr/movies/detail-view/?midx=76417#1
* 리뷰의 각 페이지는 URL만으로는 직접 접근할 수 없음
* 따라서 Selenium으로 페이지 번호 링크를 클릭하여 각 리뷰 페이지에 접근

In [None]:
import time
import pandas as pd
from selenium.common.exceptions import NoSuchElementException #예외처리

def get_movie_reviews(url, page_num=10):
  """Scrape audience reviews from a CGV movie detail page.

  Opens `url` in a headless Chrome session, clicks through the review pager
  one page number at a time and collects the writer name, review text and
  date shown on each page.

  Args:
    url: Address of the movie detail page.
    page_num: Maximum number of review pages to visit, starting at page 1.
      Scraping stops early when a page link (or the "next" control) cannot
      be found.

  Returns:
    A pandas DataFrame with the columns "Writer", "Review" and "Date",
    one row per collected review.

  Raises:
    ValueError: Propagated from pandas if the three collected lists end up
      with different lengths (e.g. a class name matched extra elements).
  """
  # Local import keeps the function self-contained; `By` exists in both
  # Selenium 3 and Selenium 4.
  from selenium.webdriver.common.by import By

  # The driver path is left to its default ('chromedriver' in Selenium 3,
  # automatic lookup in Selenium 4); passing it positionally is rejected by
  # recent Selenium 4 releases.
  wd = webdriver.Chrome(options=chrome_options)

  writer_list = []
  review_list = []
  date_list = []

  try:
    wd.get(url)

    for page_no in range(1, page_num + 1):
      try:
        # Re-locate the pager on every iteration, then click the link whose
        # visible text is the page number.
        page_ul = wd.find_element(By.ID, 'paging_point')
        page_a = page_ul.find_element(By.LINK_TEXT, str(page_no))
        page_a.click()
        time.sleep(1)  # give the review list time to reload

        writers = wd.find_elements(By.CLASS_NAME, 'writer-name')
        writer_list += [writer.text for writer in writers]
        reviews = wd.find_elements(By.CLASS_NAME, 'box-comment')
        review_list += [review.text for review in reviews]
        dates = wd.find_elements(By.CLASS_NAME, 'day')
        date_list += [date.text for date in dates]

        # After every 10th page, advance the pager with its "next" control
        # (presumably the pager lists ten page links at a time -- confirm).
        if page_no % 10 == 0:
          next_button = page_ul.find_element(By.CSS_SELECTOR,
                                             '.btn-paging.next')
          next_button.click()
          time.sleep(1)

      except NoSuchElementException:
        # No such page link / next button: there are no further pages.
        break
  finally:
    # Always shut the browser down; otherwise each call leaves a headless
    # Chrome process running, even when scraping fails midway.
    wd.quit()

  movie_review_df = pd.DataFrame({"Writer": writer_list,
                                  "Review": review_list,
                                  "Date": date_list})
  return movie_review_df

In [None]:
# Scrape up to 12 review pages for a single movie detail page.
url = "http://www.cgv.co.kr/movies/detail-view/?midx=83327"
movie_review_df = get_movie_reviews(url, 12)
# Bare expression on the last line so the notebook displays the DataFrame.
movie_review_df

Unnamed: 0,Writer,Review,Date
0,판도라의박스,굿굿굿우우우웃웃웃뜨,2020.08.28
1,우례리,"스토리는 많이 다뤄진 주제이지만, 하드코어 액션으로 볼만합니다.",2020.08.28
2,gu**ks826,되게 재밌게 봤어요~다른 사람이랑 같이 한번 더 봐도 재밌을거 같아여~,2020.08.28
3,림율파파,재밌네요. 간만에 영화봤는데 지루하지않고 긴장감도 좋고,2020.08.28
4,정클룬,이야기는 별로지만 액션은 좋네요.,2020.08.28
...,...,...,...
67,영화꿀렁,역시 믿고보는 연기...????,2020.08.28
68,katiepark,재밌어요! 잔인하긴해여 ㅋㅋ,2020.08.28
69,윤거지,구우굿구우굿구우굿굿,2020.08.28
70,al**ma4,무난하게 보기 좋았어요,2020.08.28


## CGV 상영작 스크래핑

* http://www.cgv.co.kr/movies/

In [None]:
# Scrape the CGV movie chart: for every listed movie print its title,
# reservation rate, release info and detail link, followed by the first two
# pages of its reviews.
from selenium.webdriver.common.by import By  # available in Selenium 3 and 4

url = "http://www.cgv.co.kr/movies/"

# The driver path is left to its default; passing it positionally is
# rejected by recent Selenium 4 releases.
wd = webdriver.Chrome(options=chrome_options)
try:
  wd.get(url)

  movie_chart = wd.find_element(By.CLASS_NAME, 'sect-movie-chart')
  contents = movie_chart.find_elements(By.CLASS_NAME, 'box-contents')
  for content in contents:
    # The first anchor inside each box is used as the movie's detail link.
    link = content.find_element(By.TAG_NAME, 'a').get_attribute('href')
    title = content.find_element(By.CLASS_NAME, 'title').text
    percent = content.find_element(By.CLASS_NAME, 'percent').text
    info = content.find_element(By.CLASS_NAME, 'txt-info').text
    print(title, percent, info, link)
    # get_movie_reviews opens its own browser session for the detail page.
    print(get_movie_reviews(link, 2))
    # Three blank lines to visually separate movies in the output.
    print()
    print()
    print()
finally:
  # Close the chart browser even if scraping fails, so no headless Chrome
  # process is left running.
  wd.quit()

테넷 예매율84.5% 2020.08.26 개봉 http://www.cgv.co.kr/movies/detail-view/?midx=83381
         Writer                                             Review        Date
0     whitemini                                   크리스퍼 놀란 그 자체의 영화  2020.08.28
1          하늘편지                    시간과 시간...그사이,또다른 시간 모두가 그곳에 있었다  2020.08.28
2        kissel                                  알듯말듯ㅋㄷㅋㄷ 호불호가 갈릴듯  2020.08.28
3        판도라의박스                                         굿굿굿입니다아아아아  2020.08.28
4    sa**ykim62                    woaldlTrp qhdkkTdma whgdkTdjdy.  2020.08.28
5          신올라프  영화뿐만 아니라 감독까지도 극찬하게 되는 영화, 이해하기는 조금 어려웠지만 그만큼 ...  2020.08.28
6           우서기                                       재밋었어효~~~????  2020.08.28
7       sp**133                                      영화가 어렵네요 ㅠㅜㄷㄷ  2020.08.28
8            엉금                                     오류가 없을 수 없는 타임  2020.08.28
9   bi**woo0307                                       재미있게 잘 봤습니다.  2020.08.28
10          서희츄                               진짜 명작..