In [2]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
from IPython.display import Image

import os, sys, re, datetime, time
from pathlib import Path

# Resolve the project layout relative to the notebook's working directory:
# the project root is the directory one level above the current one.
pj_dir = Path.cwd().parents[0]
data_dir, img_dir, src_dir = (pj_dir / sub_dir for sub_dir in ('data', 'images', 'src'))
# Make modules under src/ importable from this notebook.
sys.path.append(str(src_dir))

from matplotlib import pyplot as plt
import japanize_matplotlib
import seaborn as sns
plt.style.use("bmh")
import numpy as np
import pandas as pd
import dask.dataframe as dd

from requests_html import HTMLSession
from urllib.parse import urlparse, urljoin

from tqdm import tqdm_notebook
from dotenv import load_dotenv

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# requests_htmlセットアップ

In [3]:
# Headers sent with each page fetch: request Japanese content and present a
# desktop Chrome user agent.
headers = {
    'accept-language': 'ja',
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/71.0.3578.98 Safari/537.36'
    ),
}
# requests_html session reused by the scraping cells below.
session = HTMLSession()

# 準備

In [4]:
# Fetch the Qiita top page once and build the site root (scheme + host) from
# the URL of the response; relative links are resolved against it later.
endpoint = 'https://qiita.com/'
resp = session.get(endpoint, headers=headers)
parsed_url = urlparse(resp.url)
url_base = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_url)
print('url_base is:', url_base)

url_base is: https://qiita.com/


# Advent Calendar TOP

In [5]:
# URL template of the per-year Advent Calendar index page.
endpoint_top = 'https://qiita.com/advent-calendar/{}'

years = [2015, 2016, 2017, 2018]

# One record per category card found on each year's index page.
ls_category = []
for year in tqdm_notebook(years):
    endpoint = endpoint_top.format(year)
    resp = session.get(endpoint, headers=headers)
    time.sleep(1)  # pause between consecutive requests

    for card in resp.html.find('.adventCalendarCard'):
        # The card's direct child anchor supplies both the category name
        # (its text) and the link to the category's full listing (its href).
        card_link = card.find(':root > a', first=True)

        ls_category.append({
            'year': year,
            'name': card_link.text,
            'url': urljoin(url_base, card_link.attrs.get('href')),
            'n_calendar': card.find('.adventCalendarCard_block_count', first=True).text,
            # Strip the '参加者数' label so only the remaining text is kept.
            'n_participant': card.find(
                '.adventCalendarCard_block_stocks', first=True
            ).text.replace('参加者数', ''),
        })

# Columns: year, name, url, n_calendar, n_participant (all scraped values are text).
df_cat = pd.DataFrame(ls_category)

# カレンダー

In [None]:
# Visit every category listing page and collect one record per calendar.
ls_calendar = []

# Iterate `df_cat`, the category table built by the "Advent Calendar TOP" cell
# (it provides the 'url', 'name' and 'year' columns read below). The previous
# name `df_top` is not defined anywhere in the notebook and raised NameError
# on a fresh kernel.
# `total` is passed explicitly because iterrows() is a generator without a length,
# so the progress bar could not show completion otherwise.
for i, row in tqdm_notebook(df_cat.iterrows(), total=len(df_cat)):
    url = row['url']
    category_name = row['name']
    year = row['year']

    resp = session.get(url, headers=headers)
    time.sleep(1)  # pause between consecutive requests

    calendars = resp.html.find('.adventCalendarList_calendarTitle')
    progresses = resp.html.find('.adventCalendarList_progress')

    # Titles and progress cells are paired by position.
    # NOTE(review): zip() silently drops trailing items if the two lists differ in
    # length -- confirm the page always renders one progress cell per title.
    for calendar, progress in zip(calendars, progresses):
        calendar_title = calendar.text
        calendar_url = calendar.find('a', first=True).attrs.get('href')
        # Resolve relative hrefs against the site root.
        calendar_url = urljoin(url_base, calendar_url)

        progress_text = progress.text

        dic_calendar = {
            'year': year, 'category': category_name, 'name': calendar_title,
            'url': calendar_url, 'progress': progress_text
        }
        ls_calendar.append(dic_calendar)

# Columns: year, category, name, url, progress.
df_calendar = pd.DataFrame(ls_calendar)

# カレンダー情報

In [None]:
# Visit every calendar page and record its headline statistics text.
ls_calendar_info = []

# Iterate `df_calendar` (built by the calendar-list cell): it is the frame that
# has the 'url', 'category' and 'year' columns read below. The previous name
# `df_category` is not defined anywhere in the notebook and raised NameError.
# Exactly one record is appended per row, because the merge cell concatenates
# this result column-wise with `df_calendar` by row position.
for i, row in tqdm_notebook(df_calendar.iterrows(), total=len(df_calendar)):
    url = row['url']
    category_name = row['category']
    year = row['year']

    resp = session.get(url, headers=headers)
    time.sleep(1)  # pause between consecutive requests

    # Query the stats element once; a page without it yields None instead of
    # aborting the whole crawl with AttributeError.
    stats = resp.html.find('.adventCalendarJumbotron_stats', first=True)
    stats_text = stats.text if stats is not None else None

    # NOTE(review): as before, all three fields receive the text of the same
    # (first) stats element. Splitting it into separate participant / like /
    # reader values needs the real page markup -- TODO confirm and parse.
    n_participant = n_like = n_reader = stats_text

    dic_calendar_info = {
        'n_participant': n_participant, 'n_like': n_like, 'n_reader': n_reader
    }
    ls_calendar_info.append(dic_calendar_info)

df_calendar_info = pd.DataFrame(ls_calendar_info)

# マージ

In [None]:
# Attach the per-calendar statistics to the calendar table column-wise
# (rows are matched on the index of the two frames).
# Columns already merged by an earlier run of this cell are dropped first, so
# re-running the cell does not produce duplicated statistics columns.
df_calendar = pd.concat(
    [
        df_calendar.drop(columns=df_calendar_info.columns, errors='ignore'),
        df_calendar_info,
    ],
    axis=1,
)

# 記事

In [None]:
def _first_text(html, selector):
    """Return the text of the first element matching `selector`, or None if nothing matches."""
    element = html.find(selector, first=True)
    return element.text if element is not None else None


# Visit every calendar page, follow each entry's article link and collect one
# record per fetched article.
ls_article = []

# `total` is passed explicitly because iterrows() is a generator without a length.
for i, row in tqdm_notebook(df_calendar.iterrows(), 'calendar', total=len(df_calendar)):
    calendar_url = row['url']
    category_name = row['category']
    year = row['year']
    calendar_name = row['name']

    resp = session.get(calendar_url, headers=headers)
    time.sleep(1)  # pause between consecutive requests

    articles = resp.html.find('.adventCalendarCalendar_comment')
    for article in articles:
        link = article.find('a', first=True)
        href = link.attrs.get('href') if link is not None else None
        if not href:
            # Entry without a linked article: there is nothing to fetch.
            continue

        # Resolve relative hrefs against the site root, as the other cells do;
        # absolute URLs pass through urljoin unchanged.
        url = urljoin(url_base, href)
        article_html = session.get(url, headers=headers).html
        time.sleep(1)  # pause between consecutive requests

        # Each field falls back to None when its selector does not match
        # (e.g. the link points to a page with different markup), so one
        # unexpected page does not abort the whole crawl.
        title = _first_text(article_html, '.it-Header_title')
        author = _first_text(article_html, '.it-Header_authorName')
        n_like = _first_text(article_html, '.it-Actions_likeCount')
        n_comment = _first_text(article_html, '.it-Actions_commentCount')
        content = _first_text(article_html, '.it-MdContent')

        dic_article = {
            'url': url, 'title': title, 'author': author, 'n_like': n_like, 'n_comment': n_comment, 'content': content,
            'year': year, 'category': category_name, 'calendar': calendar_name
        }
        ls_article.append(dic_article)

df_article = pd.DataFrame(ls_article)