In [1]:
import itertools
import json
from datetime import datetime, timedelta
from pathlib import Path

import requests
import yarl
from bs4 import BeautifulSoup
from tqdm.auto import tqdm

from my_utils import get_datetimes

In [2]:
# Name of the local output directory. The same value is also interpolated
# into the news-list URL path where the endpoint URLs are defined.
DATA_DIR = 'lol'
# Sent as the `type` query parameter of the news-list request and embedded in
# the name of the listing cache file.
DATA_TYPE = 'popular'

# Root folder for every file this notebook writes; created if it is missing.
data_dir = Path(DATA_DIR)
data_dir.mkdir(exist_ok=True)

In [3]:
# Crawl window handed to the project helper `get_datetimes` (my_utils).
# NOTE(review): later cells use each element as a path component and as a JSON
# object key, so the helper presumably returns date *strings* - confirm the
# exact format and whether `stop` is inclusive against my_utils.
crawl_start = datetime(2015, 8, 1)
crawl_stop = datetime(2020, 5, 1)
dates = get_datetimes(start=crawl_start, stop=crawl_stop)
len(dates)

1735

In [6]:
# Paged JSON listing of articles for one section; the section name in the path
# comes from DATA_DIR. Queried by get_news_infos.
news_list_url = 'https://sports.news.naver.com/%s/news/list.nhn' % DATA_DIR
# Article page, requested with `oid`/`aid` query parameters.
news_read_url = 'https://sports.news.naver.com/news.nhn'
# Reaction data for an article, requested with a `q` query parameter; saved as JSON.
news_reaction_url = 'https://sports.like.naver.com/v1/search/contents'
# Comment data for an article; the response body is JSON wrapped in a
# `_callback(...)` call, which is stripped before saving.
news_comment_url = 'https://apis.naver.com/commentBox/cbox/web_naver_list_jsonp.json'

In [7]:
# Browser-like headers sent with every request made through the shared session.
browser_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/81.0.4044.122 Safari/537.36 Edg/81.0.416.64',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'ko,en;q=0.9,en-US;q=0.8'
}

# One session shared by all helpers below so connections can be reused.
s = requests.Session()
# Plain assignment replaces (rather than extends) the session's default header set.
s.headers = browser_headers


class NotOKError(Exception):
    """Signals an HTTP response whose `ok` flag is false.

    Raised with the response object as its single argument, so handlers can
    unpack it from `args` to inspect the status code and URL.
    """


def get_resp(url, **kwargs):
    """GET `url` through the shared session and return the response.

    Args:
        url: Address to request.
        **kwargs: Passed straight to `Session.get` (e.g. `params`, `headers`).
            A `timeout` given here overrides the default applied below.

    Returns:
        The response object, only when its `ok` flag is true.

    Raises:
        NotOKError: The response's `ok` flag is false; the response is the
            exception's single argument.
        requests.exceptions.RequestException: Connection problems, including a
            timeout, raised by the underlying `Session.get` call.
    """
    # requests applies no timeout unless asked, so one stalled connection would
    # hang a multi-hour crawl forever. 30 seconds is a default, not a contract.
    kwargs.setdefault('timeout', 30)
    r = s.get(url, **kwargs)
    if not r.ok:
        raise NotOKError(r)
    return r

In [8]:
def get_news_infos(date):
    """Collect every article entry the news-list endpoint reports for `date`.

    Pages are requested one after another, starting at page 1, and the `list`
    array of each JSON response is appended to the result.

    Args:
        date: Value sent as the `date` query parameter.

    Returns:
        A list with the concatenated `list` entries of all pages.

    Raises:
        NotOKError: A page request returned a non-OK response (from get_resp).
    """
    news_infos = []
    for page in itertools.count(start=1):
        news_list_params = {'page': page, 'date': date, 'isphoto': 'N', 'type': DATA_TYPE}
        page_info = get_resp(news_list_url, params=news_list_params).json()
        news_infos += page_info['list']
        # `>=` instead of `==`: should the endpoint report 0 pages, or a total
        # that drops below the current page, an equality test would never
        # match and the loop would request pages forever.
        if page >= page_info['totalPages']:
            break
    return news_infos

In [9]:
# Article listings per date, cached on disk so that a Restart & Run All does not
# repeat the listing crawl.
news_info_path = data_dir / ('info-%s.json' % DATA_TYPE)
if news_info_path.exists():
    # The file is written with ensure_ascii=False, so it is read and written as
    # UTF-8 explicitly instead of relying on the platform's locale encoding.
    # NOTE(review): a cache produced earlier under a non-UTF-8 locale has to be
    # re-encoded or deleted before this cell can load it.
    with open(news_info_path, encoding='utf-8') as f:
        news_infos_by_date = json.load(f)
else:
    # Always fetch when no cache file exists. Reusing whatever
    # `news_infos_by_date` happens to live in the kernel could store data
    # fetched under a different DATA_TYPE or date range in this cache file.
    news_infos_by_date = {date: get_news_infos(date) for date in tqdm(dates)}
    with open(news_info_path, 'w', encoding='utf-8') as f:
        json.dump(news_infos_by_date, f, ensure_ascii=False, indent=2)

# Total number of article entries across all dates (shown as the cell output).
sum(len(news_infos) for news_infos in news_infos_by_date.values())

51816

In [10]:
def download_content(url, file_path, **kwargs):
    """Fetch `url` and store the raw response body at `file_path`.

    Args:
        url: Address to request.
        file_path: Destination file; overwritten if it exists.
        **kwargs: Forwarded to get_resp.

    Returns:
        Number of bytes written.

    Raises:
        NotOKError: The response was not OK (raised by get_resp); nothing is
            written in that case.
    """
    r = get_resp(url, **kwargs)
    # The context manager closes the handle promptly; this runs once per
    # article, so leaked handles would pile up during the crawl.
    with open(file_path, 'wb') as f:
        return f.write(r.content)


def download_html(url, file_path, **kwargs):
    """Fetch `url`, pretty-print the HTML and store it at `file_path`.

    The response text is parsed with BeautifulSoup's `html5lib` parser and the
    `prettify()` output is what gets written.

    Args:
        url: Address to request.
        file_path: Destination file; overwritten if it exists.
        **kwargs: Forwarded to get_resp.

    Returns:
        Number of characters written.

    Raises:
        NotOKError: The response was not OK (raised by get_resp).
    """
    r = get_resp(url, **kwargs)
    text = BeautifulSoup(r.text, 'html5lib').prettify()
    # Explicit UTF-8 keeps the output independent of the platform locale, and
    # the context manager closes the handle instead of leaking it.
    with open(file_path, 'w', encoding='utf-8') as f:
        return f.write(text)


def download_json(url, file_path, **kwargs):
    """Fetch a JSON response from `url` and store it, re-indented, at `file_path`.

    Args:
        url: Address to request.
        file_path: Destination file; overwritten if it exists.
        **kwargs: Forwarded to get_resp.

    Returns:
        Number of characters written.

    Raises:
        NotOKError: The response was not OK (raised by get_resp).
        ValueError: The response body could not be decoded as JSON.
    """
    r = get_resp(url, **kwargs)
    text = json.dumps(r.json(), ensure_ascii=False, indent=2)
    # ensure_ascii=False leaves non-ASCII characters unescaped, so the file is
    # written as UTF-8 explicitly; `with` closes the handle after writing.
    with open(file_path, 'w', encoding='utf-8') as f:
        return f.write(text)


def download_json_containing_callback(url, file_path, **kwargs):
    """Fetch a `_callback(...);`-wrapped JSON response and store the bare JSON.

    The first `_callback(` and the last `);` are removed from the response
    text; the remainder is parsed as JSON and written re-indented.

    Args:
        url: Address to request.
        file_path: Destination file; overwritten if it exists.
        **kwargs: Forwarded to get_resp.

    Returns:
        Number of characters written.

    Raises:
        NotOKError: The response was not OK (raised by get_resp).
        json.JSONDecodeError: The text left after removing the wrapper is not
            valid JSON.
    """
    def rreplace(text, old, new, count=-1):
        # Like str.replace, but counting occurrences from the right-hand end.
        # The parameter is named `text` so it does not shadow the global session `s`.
        return new.join(text.rsplit(old, count))

    r = get_resp(url, **kwargs)
    raw = rreplace(r.text.replace('_callback(', '', 1), ');', '', 1)
    text = json.dumps(json.loads(raw), ensure_ascii=False, indent=2)
    # Explicit UTF-8 because ensure_ascii=False keeps non-ASCII characters
    # unescaped; `with` closes the handle instead of leaking it.
    with open(file_path, 'w', encoding='utf-8') as f:
        return f.write(text)


def replace_thumbnail_url(url):
    """Rewrite a listing thumbnail URL into the image URL that gets downloaded.

    The first occurrence of the `thumb154` prefix is swapped for the `origin`
    prefix on the pstatic host, and `?type=nf210_122` is always appended,
    whether or not the prefix was found.

    Args:
        url: Thumbnail URL string.

    Returns:
        The rewritten URL string.
    """
    thumb_prefix = 'http://imgnews.naver.net/image/thumb154/'
    origin_prefix = 'https://imgnews.pstatic.net/image/origin/'
    type_suffix = '?type=nf210_122'

    before, matched, after = url.partition(thumb_prefix)
    if matched:
        url = before + origin_prefix + after
    return url + type_suffix

In [11]:
# Download, for every date and every listed article: the thumbnail (when the
# listing has one), the article page, the reaction JSON and the comment JSON.
# Files go to <data_dir>/<date>/ and are named by rank within the date plus
# the article id.
for date in tqdm(dates):
    date_dir = data_dir / date
    date_dir.mkdir(exist_ok=True)

    news_infos = news_infos_by_date[date]
    for rank, news_info in enumerate(news_infos, 1):
        oid, aid = news_info['oid'], news_info['aid']
        news_id = 'ne_%s_%s' % (oid, aid)
        object_id = 'news%s,%s' % (oid, aid)
        news_path_prefix = '%04d-%s' % (rank, news_id)

        # thumbnail.jpg
        if news_info.get('thumbnail'):
            news_thumbnail_url = replace_thumbnail_url(news_info['thumbnail'])
            news_thumbnail_path = date_dir / ('%s-thumbnail.jpg' % news_path_prefix)
            try:
                download_content(news_thumbnail_url, news_thumbnail_path)
            except NotOKError as e:
                # A missing thumbnail is only logged and skipped. Failures of
                # the three downloads below are not caught and stop the run.
                (failed_resp,) = e.args
                print('[!] rank=%d, status_code=%d: %s' % (rank, failed_resp.status_code, failed_resp.url))

        # read.html
        news_read_path = date_dir / ('%s-read.html' % news_path_prefix)
        news_read_params = {'oid': oid, 'aid': aid}
        download_html(news_read_url, news_read_path, params=news_read_params)

        # reaction.json
        news_reaction_path = date_dir / ('%s-reaction.json' % news_path_prefix)
        news_reaction_params = {'q': 'SPORTS[%s]|SPORTS_MAIN[%s]' % (news_id, news_id)}
        download_json(news_reaction_url, news_reaction_path, params=news_reaction_params)

        # comment.json
        news_comment_path = date_dir / ('%s-comment.json' % news_path_prefix)
        news_comment_params = {'ticket': 'sports', 'pool': 'cbox2', 'lang': 'ko', 'country': 'KR', 'objectId': object_id}
        # The Referer is the article page itself: news_read_url combined with
        # the oid/aid parameters that belong to it (previously the reaction
        # API URL was combined with those parameters by mistake).
        news_comment_headers = {'Referer': yarl.URL(news_read_url).with_query(news_read_params).human_repr()}
        download_json_containing_callback(news_comment_url, news_comment_path, params=news_comment_params, headers=news_comment_headers)

 32%|████████████████████████████▋                                                             | 552/1735 [53:05<55:01,  2.79s/it]

[!] rank=3, status_code=404: https://imgnews.pstatic.net/image/origin/241/2017/02/03/2636969.jpg?type=nf210_122


 32%|████████████████████████████▎                                                           | 557/1735 [53:38<2:19:46,  7.12s/it]

[!] rank=48, status_code=404: https://imgnews.pstatic.net/image/origin/241/2017/02/08/2638485.jpg?type=nf210_122


 33%|█████████████████████████████▍                                                          | 581/1735 [55:45<1:53:28,  5.90s/it]

[!] rank=21, status_code=404: https://imgnews.pstatic.net/image/origin/241/2017/03/03/2646933.jpg?type=nf210_122


 35%|██████████████████████████████▌                                                         | 603/1735 [57:54<1:50:39,  5.87s/it]

[!] rank=12, status_code=404: https://imgnews.pstatic.net/image/origin/241/2017/03/26/2655194.jpg?type=nf210_122


 36%|████████████████████████████████▍                                                         | 626/1735 [59:38<52:35,  2.85s/it]

[!] rank=11, status_code=404: https://imgnews.pstatic.net/image/origin/241/2017/04/18/2664097.jpg?type=nf210_122


 36%|███████████████████████████████▉                                                        | 630/1735 [59:51<1:08:09,  3.70s/it]

[!] rank=64, status_code=404: https://imgnews.pstatic.net/image/origin/241/2017/04/22/2665397.jpg?type=nf210_122


 36%|███████████████████████████████▎                                                      | 632/1735 [1:00:08<1:41:47,  5.54s/it]

[!] rank=7, status_code=404: https://imgnews.pstatic.net/image/origin/241/2017/04/23/2665684.jpg?type=nf210_122


 37%|███████████████████████████████▌                                                      | 637/1735 [1:00:27<1:13:45,  4.03s/it]

[!] rank=9, status_code=404: https://imgnews.pstatic.net/image/origin/241/2017/04/29/2667900.jpg?type=nf210_122


 37%|████████████████████████████████▊                                                       | 646/1735 [1:00:48<38:42,  2.13s/it]

[!] rank=5, status_code=404: https://imgnews.pstatic.net/image/origin/241/2017/05/08/2670444.jpg?type=nf210_122


 37%|████████████████████████████████▊                                                       | 648/1735 [1:00:51<30:23,  1.68s/it]

[!] rank=23, status_code=404: https://imgnews.pstatic.net/image/origin/241/2017/05/10/2670954.jpg?type=nf210_122


 38%|█████████████████████████████████▎                                                      | 656/1735 [1:01:25<52:32,  2.92s/it]

[!] rank=2, status_code=404: https://imgnews.pstatic.net/image/origin/241/2017/05/18/2673388.jpg?type=nf210_122


 38%|█████████████████████████████████▎                                                      | 658/1735 [1:01:32<56:28,  3.15s/it]

[!] rank=24, status_code=404: https://imgnews.pstatic.net/image/origin/241/2017/05/20/2674201.jpg?type=nf210_122


 38%|████████████████████████████████▋                                                     | 660/1735 [1:01:45<1:24:19,  4.71s/it]

[!] rank=25, status_code=404: https://imgnews.pstatic.net/image/origin/241/2017/05/22/2675044.jpg?type=nf210_122


100%|███████████████████████████████████████████████████████████████████████████████████████| 1735/1735 [2:17:05<00:00,  4.74s/it]


In [12]:
# All downloads are finished: close the shared HTTP session.
s.close()