In [1]:
from src.crawler.base_crawler import BaseCrawler
from bs4 import BeautifulSoup
import pandas as pd
from pprint import pprint
import re
import json
import os

# requests

URLから情報を取得するには、主に「requests」というライブラリーを使います。

ライブラリーの詳細の情報は[公式ホームページ](https://requests.readthedocs.io/en/master/)で確認してください。

## レスポンス確認

URLから取得したレスポンスは下記のとおりです。

In [2]:
class TestCrawler(BaseCrawler):
    """Demo crawler that shows what kind of object a page request returns."""

    def run(self):
        """Request the IMDb moviemeter chart page and print the response's type."""
        chart_url = 'https://www.imdb.com/chart/moviemeter/?ref_=nv_mv_mpm'
        response = self.get_response(chart_url)
        response_type = type(response)
        print(response_type)

# Build the crawler defined in this cell and execute its request.
test = TestCrawler()
test.run()

<class 'requests.models.Response'>


## Encoding確認
たまにEncodingの問題で文字化けする場合がありますので、下記のようにEncodingの情報を確認することが出来ます。

In [3]:
class TestCrawler(BaseCrawler):
    """Demo crawler that reports the encoding attached to a response."""

    def run(self):
        """Request the IMDb moviemeter chart page and print `response.encoding`."""
        chart_url = 'https://www.imdb.com/chart/moviemeter/?ref_=nv_mv_mpm'
        response = self.get_response(chart_url)
        # Checking the encoding is the first step when text comes back garbled.
        detected_encoding = response.encoding
        print(detected_encoding)

# Build the crawler defined in this cell and execute its request.
test = TestCrawler()
test.run()

UTF-8


## コンテンツの確認1
コンテンツの確認は下記のように可能です。

In [4]:
class TestCrawler(BaseCrawler):
    """Demo crawler that dumps the `content` attribute of a response."""

    def run(self):
        """Request the IMDb moviemeter chart page and pretty-print `response.content`."""
        chart_url = 'https://www.imdb.com/chart/moviemeter/?ref_=nv_mv_mpm'
        response = self.get_response(chart_url)
        body = response.content
        pprint(body)

# Build the crawler defined in this cell and execute its request.
test = TestCrawler()
test.run()

## コンテンツの確認2

コンテンツをテキストとしても確認出来ます。

In [5]:
class TestCrawler(BaseCrawler):
    """Demo crawler that dumps the `text` attribute of a response."""

    def run(self):
        """Request the IMDb moviemeter chart page and pretty-print `response.text`."""
        chart_url = 'https://www.imdb.com/chart/moviemeter/?ref_=nv_mv_mpm'
        response = self.get_response(chart_url)
        body_text = response.text
        pprint(body_text)

# Build the crawler defined in this cell and execute its request.
test = TestCrawler()
test.run()

# BeautifulSoup

取得したレスポンスをパーシング・分析する際によく使われているのが「BeautifulSoup」というライブラリーです。

詳細の情報は[公式ホームページ](https://www.crummy.com/software/BeautifulSoup/bs4/doc/)で確認ください。

In [6]:
class TestCrawler(BaseCrawler):
    """Demo crawler that parses a fetched page with BeautifulSoup."""

    def run(self):
        """Request the IMDb moviemeter chart page, parse it, and pretty-print the tree."""
        chart_url = 'https://www.imdb.com/chart/moviemeter/?ref_=nv_mv_mpm'
        response = self.get_response(chart_url)
        # Parse the response body with Python's built-in HTML parser backend.
        document = BeautifulSoup(markup=response.content, features='html.parser')
        pprint(document)

# Build the crawler defined in this cell and execute its request.
test = TestCrawler()
test.run()

## HTMLパーシング

BeautifulSoupは下記のようにHTML Elementを基準としてパーシングすることが出来ます。

In [7]:
class TestCrawler(BaseCrawler):
    """Demo crawler that selects HTML elements from a parsed page."""

    def run(self):
        """Request the IMDb moviemeter chart page and pretty-print the chart table's rows."""
        chart_url = 'https://www.imdb.com/chart/moviemeter/?ref_=nv_mv_mpm'
        response = self.get_response(chart_url)
        document = BeautifulSoup(markup=response.content, features='html.parser')
        # First narrow down to the <table class="chart"> element, then collect its <tr> rows.
        chart_table = document.find('table', {'class' : 'chart'})
        table_rows = chart_table.find_all('tr')
        pprint(table_rows)

# Build the crawler defined in this cell and execute its request.
test = TestCrawler()
test.run()

# 応用編1

この二つのライブラリーを応用して、下記のようにデータを取り出すことが出来ます。

レスポンスの構造の把握には、各ブラウザーのインスペクターを利用することをお勧めします。

In [8]:
class TestCrawler(BaseCrawler):
    """Scrape the IMDb moviemeter chart into `tmp.csv`.

    Each chart row becomes one record with the keys `title`, `year`, `rank`,
    `rank_change` and `rating` (in that column order).
    """

    # Leading rank number, optionally followed by the literal "(no change)".
    # `\s` already covers newlines, so no separate `\n` alternative is needed.
    _VELOCITY_PATTERN = re.compile(r'(\d+)\s*(\(no\schange\))?')
    # A run of digits that may contain thousands separators, e.g. "1,602,708".
    _NUMBER_PATTERN = re.compile(r'([\d\,]+)')

    def run(self):
        """Fetch the chart page, parse every movie row and write the result to CSV.

        Raises:
            ValueError: if the page contains no <table> element at all (for
                example because the layout changed or the request was blocked).
        """
        test_url = 'https://www.imdb.com/chart/moviemeter/?ref_=nv_mv_mpm'
        res = self.get_response(test_url)
        soup = BeautifulSoup(markup=res.content, features='html.parser')
        table = soup.find('table')
        if table is None:
            # find() returns None when nothing matches; fail with a readable
            # message instead of an AttributeError on None.
            raise ValueError(f'No <table> element found in {test_url}')
        data = []
        for tr in table.find_all('tr'):
            info = self._parse_row(tr)
            if info is not None:
                data.append(info)
        df = pd.DataFrame(data)
        df.to_csv('tmp.csv', index=False)

    @classmethod
    def _first_int(cls, tag):
        """Return the first (comma-grouped) integer in `tag`'s text, or None."""
        if tag is None:
            return None
        found = cls._NUMBER_PATTERN.search(tag.getText())
        if found is None:
            return None
        digits = found.group(1).replace(',', '')
        return int(digits) if digits else None

    @classmethod
    def _parse_row(cls, tr):
        """Turn one <tr> of the chart into a record dict.

        Returns None for rows without a `titleColumn` cell (e.g. the header
        row). Fields whose markup is missing are left as None rather than
        aborting the whole crawl; `rank_change` defaults to 0.
        """
        title_column = tr.find('td', {'class': 'titleColumn'})
        if title_column is None:
            return None

        # Insertion order of the keys defines the CSV column order.
        info = {}
        title_link = title_column.find('a')
        info['title'] = title_link.getText() if title_link is not None else None
        info['year'] = cls._first_int(title_column.find('span'))
        info['rank'] = None
        info['rank_change'] = 0

        velocity_div = title_column.find('div', {'class': 'velocity'})
        if velocity_div is not None:
            # strip() so that leading whitespace cannot defeat the anchored match().
            velocity = cls._VELOCITY_PATTERN.match(velocity_div.getText().strip())
            if velocity is not None:
                info['rank'] = int(velocity.group(1))
                if velocity.group(2) is None:
                    # No "(no change)" marker, so the rank moved: read the
                    # magnitude and derive the sign from the "up" arrow span.
                    snd_info = velocity_div.find('span', {'class': 'secondaryInfo'})
                    change = cls._first_int(snd_info)
                    if change is not None:
                        moved_up = snd_info.find('span', {'class': 'up'}) is not None
                        info['rank_change'] = change if moved_up else -change

        info['rating'] = None
        rating = tr.find('strong')
        if rating is not None:
            info['rating'] = float(rating.getText())
        return info

# Build the crawler defined in this cell and execute it; this writes tmp.csv.
test = TestCrawler()
test.run()

# Load tmp.csv back into a DataFrame; the bare `df` on the last line
# makes the notebook render it as the cell output.
df = pd.read_csv('tmp.csv')
df

Unnamed: 0,title,year,rank,rank_change,rating
0,Enola Holmes,2020,1,0,6.7
1,The Devil All the Time,2020,2,0,7.1
2,Tenet,2020,3,0,7.8
3,Borat Subsequent Moviefilm: Delivery of Prodig...,2020,4,1602708,
4,The Witches,2020,5,1169,
...,...,...,...,...,...
95,Mighty Thor: Battle Royale,2017,96,22,7.9
96,Motherless Brooklyn,2019,97,365,6.8
97,It,2017,98,35,7.3
98,Bombshell,2019,99,-30,6.8


# 応用編2

レスポンスからの画像の保存も可能です。

In [9]:
class TestCrawler(BaseCrawler):
    """Download the images listed in an IMDb title's media index page into `test/`."""

    @staticmethod
    def _safe_caption(caption):
        """Make a scraped caption usable as part of a file name.

        Path separators, characters that are invalid in file names on common
        file systems, and control characters are replaced with `_`. A missing
        or empty caption falls back to the literal `image`.
        """
        if not caption:
            return 'image'
        return re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', str(caption))

    def run(self):
        """Fetch the media index page and save every image from its JSON-LD block.

        Each image is written to `test/<caption>_<index>.jpg`. A failure while
        downloading or saving one image is reported and does not stop the
        remaining images from being processed.

        Raises:
            ValueError: if the page has no `application/ld+json` script block.
        """
        test_url = 'https://www.imdb.com/title/tt7846844/mediaindex?ref_=tt_mv_sm'
        res = self.get_response(test_url)
        soup = BeautifulSoup(markup=res.content, features='html.parser')
        img = soup.find('script', {'type': 'application/ld+json'})
        if img is None or not img.contents:
            # find() returns None when nothing matches; fail with a readable
            # message instead of an AttributeError/IndexError.
            raise ValueError(f'No application/ld+json metadata found in {test_url}')
        parsed_json = json.loads(img.contents[0])

        images = parsed_json.get('image') or []
        if isinstance(images, dict):
            # JSON-LD permits a single object where a list is expected.
            images = [images]

        os.makedirs('test', exist_ok=True)
        for idx, image in enumerate(images):
            # Build the name before entering `try` so the failure message can
            # always refer to it, even when the caption is missing.
            caption = self._safe_caption(image.get('caption'))
            file_name = f'test/{caption}_{idx}.jpg'
            try:
                res_img = self.get_response(image['url'])
                with open(file_name, 'wb') as img_file:
                    img_file.write(res_img.content)
                print(f'Saving image success with {file_name}')
            except Exception as e:
                # Best-effort download: report the cause and move on to the next image.
                print(f'Saving image failure with {file_name}: {e!r}')

# Build the crawler defined in this cell and execute it; images are saved under test/.
test = TestCrawler()
test.run()

Saving image success with test/Millie Bobby Brown in Enola Holmes (2020)_0.jpg
Saving image success with test/Henry Cavill, Sam Claflin, John Pallotta, and Millie Bobby Brown in Enola Holmes (2020)_1.jpg
Saving image success with test/Millie Bobby Brown in Enola Holmes (2020)_2.jpg
Saving image success with test/Henry Cavill and Millie Bobby Brown in Enola Holmes (2020)_3.jpg
Saving image success with test/Henry Cavill, Sam Claflin, and Millie Bobby Brown in Enola Holmes (2020)_4.jpg
Saving image success with test/Louis Partridge in Enola Holmes (2020)_5.jpg
Saving image success with test/Fiona Shaw in Enola Holmes (2020)_6.jpg
Saving image success with test/Susan Wokoma in Enola Holmes (2020)_7.jpg
Saving image success with test/Adeel Akhtar in Enola Holmes (2020)_8.jpg
Saving image success with test/Henry Cavill in Enola Holmes (2020)_9.jpg
Saving image success with test/Helena Bonham Carter in Enola Holmes (2020)_10.jpg
Saving image success with test/Millie Bobby Brown in Enola Holm