# Web Scraping

## Script

Version: 2

url: https://news.yahoo.co.jp/ranking/comment/

output: outputs/news_yyyymmdd_hhmmss.csv 

logs: logs/logfile_ws02_%Y%m%d.log


In [1]:
import os
import sys
import requests
import bs4
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime, timedelta, timezone
import logging


def create_logging():
  """ Create logging """
  logs_folder = 'logs/'
  if not os.path.exists(logs_folder):
    os.makedirs(logs_folder)

  log_dir = logs_folder + str(datetime.now().strftime('logfile_ws02_%Y%m%d.log'))
  logging.basicConfig(filename=log_dir,
                  level=logging.INFO,
                  format='%(asctime)s | %(name)s | %(levelname)s | %(message)s')
  console = logging.StreamHandler()
  console.setLevel(logging.INFO)
  formatter = logging.Formatter('%(asctime)s | %(name)s | %(levelname)s | %(message)s')
  console.setFormatter(formatter)
  logging.getLogger('').addHandler(console)
  
  logging.info('Started')
  logging.info('Python environment : ' + str(sys.version))
  logging.info('Requests version: ' + str(requests.__version__))
  logging.info('BeautifulSoup version: ' + str(bs4.__version__))
  logging.info('Pandas version: ' + str(pd.__version__))
  logging.info('Log file : ' + str(log_dir))


def yahoo_news_scraping():
  """ Scrap data and return result dataframe """
  #  Set parameters
  home_url = "https://news.yahoo.co.jp/ranking/comment/"
  topics = ['domestic', 'world', 'business', 'entertainment', 'sports', 'it-science', 'life']

  result = pd.DataFrame(columns=['titles', 'datetimes', 'topics'])

  #  Get data
  for t in topics:
    df = pd.DataFrame(columns=['titles', 'datetimes', 'topics'])

    url = home_url + t

    res = requests.get(url)
    logging.info('GET: ' + url)
    logging.info('STATUS: ' + str(res.status_code))

    soup = BeautifulSoup(res.text, 'lxml')

    titles = soup.find_all('div', {'class': 'newsFeed_item_title'})
    datetimes = soup.find_all('time', {'class': 'newsFeed_item_date'})

    # Create dataframe
    df = pd.DataFrame(list(zip(titles, datetimes)), columns =['titles', 'datetimes']) 
    
    spec_chars = ['<div class="newsFeed_item_title">', '</div>']
    for char in spec_chars:
      df['titles'] = df['titles'].astype(str).str.replace(char, ' ')
      
    spec_chars = ['<time class="newsFeed_item_date">', '</time>']
    for char in spec_chars:
      df['datetimes'] = df['datetimes'].astype(str).str.replace(char, ' ')

    df['topics'] = t

    result = pd.concat([result, df])
    logging.info('Finish Scraping')

  return result


def save_csv(result):
  """ Save dataframe to csv """

  jst = timezone(timedelta(hours=9), 'JST')
  now = datetime.now(jst).strftime("%Y%m%d_%H%M%S")
  output_folder = 'outputs/'
  if not os.path.exists(output_folder):
      os.makedirs(output_folder)

  output = output_folder + 'news_' + now + '.csv'

  result.to_csv(output, index=False, encoding='utf_8_sig')
  logging.info('No. of news'  + str(len(result)))
  logging.info('Saved file: ' + output)


if __name__ == '__main__':
    create_logging()
    result = yahoo_news_scraping()
    save_csv(result)


2020-09-18 03:48:33,098 | root | INFO | Started
2020-09-18 03:48:33,100 | root | INFO | Python environment : 3.6.9 (default, Jul 17 2020, 12:50:27) 
[GCC 8.4.0]
2020-09-18 03:48:33,105 | root | INFO | Requests version: 2.23.0
2020-09-18 03:48:33,106 | root | INFO | BeautifulSoup version: 4.6.3
2020-09-18 03:48:33,109 | root | INFO | Pandas version: 1.0.5
2020-09-18 03:48:33,111 | root | INFO | Log file : logs/logfile_ws02_20200918.log
2020-09-18 03:48:33,475 | root | INFO | GET: https://news.yahoo.co.jp/ranking/comment/domestic
2020-09-18 03:48:33,476 | root | INFO | STATUS: 200
2020-09-18 03:48:33,528 | root | INFO | Finish Scraping
2020-09-18 03:48:33,807 | root | INFO | GET: https://news.yahoo.co.jp/ranking/comment/world
2020-09-18 03:48:33,808 | root | INFO | STATUS: 200
2020-09-18 03:48:33,857 | root | INFO | Finish Scraping
2020-09-18 03:48:34,153 | root | INFO | GET: https://news.yahoo.co.jp/ranking/comment/business
2020-09-18 03:48:34,154 | root | INFO | STATUS: 200
2020-09-18 

## Script

Version: 1

url: https://news.yahoo.co.jp/topics/

output: outputs/news_yyyymmdd_hhmmss.csv

logs: logs/logfile_ws01_%Y%m%d.log


In [1]:
import os
import sys
import requests
import bs4
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime, timedelta, timezone
import logging


def create_logging():
  """ Create logging """
  logs_folder = 'logs/'
  if not os.path.exists(logs_folder):
    os.makedirs(logs_folder)

  logging.basicConfig(filename=datetime.now().strftime('logs/logfile_ws01_%Y%m%d.log'),
                  level=logging.INFO,
                  format='%(asctime)s | %(name)s | %(levelname)s | %(message)s')
  console = logging.StreamHandler()
  console.setLevel(logging.INFO)
  formatter = logging.Formatter('%(asctime)s | %(name)s | %(levelname)s | %(message)s')
  console.setFormatter(formatter)
  logging.getLogger('').addHandler(console)
  
  logging.info('Started')
  logging.info('Python environment : ' + str(sys.version))
  logging.info('Requests version: ' + str(requests.__version__))
  logging.info('BeautifulSoup version: ' + str(bs4.__version__))
  logging.info('Pandas version: ' + str(pd.__version__))


def yahoo_news_scraping():
  """ Scrap data and return result dataframe """
  #  Set parameters
  home_url = "https://news.yahoo.co.jp/topics/"
  topics = ['domestic', 'world', 'business', 'entertainment', 'sports', 'it', 'science']
  page_no = ['/?page=1', '?page=2', '?page=3']

  result = pd.DataFrame(columns=['titles', 'datetimes', 'topics'])

  #  Get data
  for t in topics:
    for p in page_no:
      df = pd.DataFrame(columns=['titles', 'datetimes', 'topics'])

      url = home_url + "/" + t + "/" + p

      res = requests.get(url)
      logging.info('GET: ' + url)
      logging.info('STATUS: ' + str(res.status_code))

      soup = BeautifulSoup(res.text, 'lxml')

      titles = soup.find_all('div', {'class': 'newsFeed_item_title'})
      datetimes = soup.find_all('time', {'class': 'newsFeed_item_date'})

      # Create dataframe
      df = pd.DataFrame(list(zip(titles, datetimes)), columns =['titles', 'datetimes']) 
      
      spec_chars = ['<div class="newsFeed_item_title">', '</div>']
      for char in spec_chars:
        df['titles'] = df['titles'].astype(str).str.replace(char, ' ')
        
      spec_chars = ['<time class="newsFeed_item_date">', '</time>']
      for char in spec_chars:
        df['datetimes'] = df['datetimes'].astype(str).str.replace(char, ' ')

      df['topics'] = t

      result = pd.concat([result, df])
      logging.info('Finish Scraping')

  return result


def save_csv(result):
  """ Save dataframe to csv """

  jst = timezone(timedelta(hours=9), 'JST')
  now = datetime.now(jst).strftime("%Y%m%d_%H%M%S")
  output_folder = 'outputs/'
  if not os.path.exists(output_folder):
      os.makedirs(output_folder)

  output = output_folder + 'news_' + now + '.csv'

  result.to_csv(output, index=False, encoding='utf_8_sig')
  logging.info('No. of news'  + str(len(result)))
  logging.info('Saved file: ' + output)
  logging.info('Logs file: ' + datetime.now().strftime('logs/logfile_ws01_%Y%m%d.log'))


if __name__ == '__main__':
    create_logging()
    result = yahoo_news_scraping()
    save_csv(result)


2020-09-18 03:00:15,429 | root | INFO | Started
2020-09-18 03:00:15,430 | root | INFO | Python environment : 3.6.9 (default, Jul 17 2020, 12:50:27) 
[GCC 8.4.0]
2020-09-18 03:00:15,436 | root | INFO | Requests version: 2.23.0
2020-09-18 03:00:15,437 | root | INFO | BeautifulSoup version: 4.6.3
2020-09-18 03:00:15,441 | root | INFO | Pandas version: 1.0.5
2020-09-18 03:00:16,695 | root | INFO | GET: https://news.yahoo.co.jp/topics//domestic//?page=1
2020-09-18 03:00:16,696 | root | INFO | STATUS: 404
2020-09-18 03:00:16,720 | root | INFO | Finish Scraping
2020-09-18 03:00:17,893 | root | INFO | GET: https://news.yahoo.co.jp/topics//domestic/?page=2
2020-09-18 03:00:17,894 | root | INFO | STATUS: 200
2020-09-18 03:00:17,943 | root | INFO | Finish Scraping
2020-09-18 03:00:19,163 | root | INFO | GET: https://news.yahoo.co.jp/topics//domestic/?page=3
2020-09-18 03:00:19,164 | root | INFO | STATUS: 200
2020-09-18 03:00:19,207 | root | INFO | Finish Scraping
2020-09-18 03:00:20,380 | root | 

## Code Detail

In [4]:
import os
import requests
import bs4
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin
from datetime import datetime, timedelta, timezone

In [116]:
print('Requests version: ' + str(requests.__version__))
print('BeautifulSoup version: ' + str(bs4.__version__))
print('Pandas version: ' + str(pd.__version__))

Requests version: 2.23.0
BeautifulSoup version: 4.6.3
Pandas version: 1.0.5


In [6]:
home_url = "https://news.yahoo.co.jp/topics/"
topics = ['domestic', 'world', 'business', 'entertainment', 'sports', 'it', 'science', 'life']
url = home_url + topics[0]
print(url)

result = pd.DataFrame(columns=['titles', 'datetimes', 'topics'])

df = pd.DataFrame(columns=['titles', 'datetimes', 'topics'])

https://news.yahoo.co.jp/topics/domestic


In [7]:
res = requests.get(url)  # GET Method

In [8]:
res.status_code # 200 is OK, 404 is Not Found

200

In [9]:
soup = BeautifulSoup(res.text, 'lxml')

In [12]:
soup

<!DOCTYPE html>
<html><head><style data-styled="hhHMib caYBIl fBoovI hrsGqt kTquGp cJCziL gbmduU iGMjVf fMXKmz hwtFYB bwGPsp hfRFuC bTYDKD kvwTrF kFPTQD bBCYMr hADcb diHdpL hTnmGq idEeWB bFwKkL cVzJtw hMTNII dqxkxN bCcYTo cLNZxe kKixPU gbhVmH dmEfuR hMfiGT heWWRw opOqM kXtmHq hysxHl FNDWI jLYrqT bAVKwr edZkLj inFCXz bsqfXi fXIPpx cDRFeV guTjdM eUGvfL drgvOR iGoaTi dxTGnO eBZCfC jdcIyw gWHgoh NsOhg kgGXsR OGGDU dIQmNx bdhnUx fDXrsq fntzuT eoArno kumZQI dtRHnw ldMbOW kqlTUJ gGajIV cfvFYw kOlGXS fLGZkA szsjR lnTtye gmHBwa eJMNfm jdrmnJ bRitLj eWSvDV gfJHwq hIqrOe kkoNzJ dyRSXy jECwIC iZqefY kNTYQW cYMW jfAc Qqjob ckRCrN fNSuaT kHMGun fuCSrA jAttih itcads fCcCxm jLrfAI lgBTZu gkfLJm iYZoRH jYWwiL bNjTRx hkTKeE dLBfFP kbTkLI kZGhxn gEeJGj clzMMM hqOEtE iKDLxT dmgHYf looPpP bfjcMb dvjCKo cLCgKw eCrouM jqaoCL cIOpBZ hLwNsk cFuWMT jogblL kwPRlN cEiLGJ lniAdT efeSg kCCOMK ghqeys iRWeuI hFJZos cdsyFG htGsYY dSuDFJ djSCat gGLQLz cCuJGn gRtRsy wnmnh eGTRSy VFRgJ gPjQJW gAqjLT hFCzaq gahaYI ecLMZv 

In [15]:
titles = soup.find_all('div', {'class': 'newsFeed_item_title'})
datetimes = soup.find_all('time', {'class': 'newsFeed_item_date'})

In [16]:
print(len(titles))
titles[0:3]

25


[<div class="newsFeed_item_title">北陸や北海道 強い雨に注意</div>,
 <div class="newsFeed_item_title">二階氏温情 石破派複雑な表情</div>,
 <div class="newsFeed_item_title">「官邸主導」強固に 官僚恐々</div>]

In [39]:
df = pd.DataFrame(list(zip(titles, datetimes)), columns =['titles', 'datetimes']) 

spec_chars = ['<div class="newsFeed_item_title">', '</div>']
for char in spec_chars:
  df['titles'] = df['titles'].astype(str).str.replace(char, ' ')

spec_chars = ['<time class="newsFeed_item_date">', '</time>']
for char in spec_chars:
  df['datetimes'] = df['datetimes'].astype(str).str.replace(char, ' ')

df['topics'] = topics[3]

In [38]:
result = pd.concat([result, df])

In [42]:
len(result)
result.tail(5)

Unnamed: 0,titles,datetimes,topics
20,ゆうちょ銀不正被害 1811万円,9/16(水) 17:18,business
21,ドコモ口座 過去みずほ被害か,9/16(水) 17:04,business
22,JR東4180億円赤字 民営化後初,9/16(水) 16:57,business
23,SBI証券 9864万円が不正流出,9/16(水) 16:46,business
24,タピオカバブル崩壊 専門店減,9/16(水) 16:15,business


interation 2

In [47]:
df = pd.DataFrame(columns=['titles', 'datetimes', 'topics'])

urlpath = topics[3] + "/" + page_no[1]
url = urljoin(home_url, urlpath)
url

res = requests.get(url)  # GET Method

soup = BeautifulSoup(res.text, 'lxml')

titles = soup.find_all('div', {'class': 'newsFeed_item_title'})
datetimes = soup.find_all('time', {'class': 'newsFeed_item_date'})

df = pd.DataFrame(list(zip(titles, datetimes)), columns =['titles', 'datetimes']) 

spec_chars = ['<div class="newsFeed_item_title">', '</div>']
for char in spec_chars:
  df['titles'] = df['titles'].astype(str).str.replace(char, ' ')

spec_chars = ['<time class="newsFeed_item_date">', '</time>']
for char in spec_chars:
  df['datetimes'] = df['datetimes'].astype(str).str.replace(char, ' ')

df['topics'] = topics[3]

result = pd.concat([result, df])

In [48]:
print(len(result))
result.tail(5)

50


Unnamed: 0,titles,datetimes,topics
20,米航空の日本人CA 失職の恐れ,9/14(月) 12:52,business
21,観光政策変わる?注視する京都,9/14(月) 10:43,business
22,倒産「予備軍多い」年末急増?,9/14(月) 8:51,business
23,SBG アーム全株NVIDIAに売却,9/14(月) 8:43,business
24,SBG 株式の非公開化を検討,9/14(月) 8:20,business


save file

In [83]:
jst = timezone(timedelta(hours=9), 'JST')
now = datetime.now(jst).strftime("%Y%m%d_%H%M%S")
output_folder = 'outputs/'
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

output = output_folder + 'news_' + now + '.csv'

result.to_csv(output, index=False, encoding='utf_8_sig')
print('Save file: ' + output)

Save file: outputs/news_20200917_235428.csv


logging

In [2]:
import os
import logging
from datetime import datetime

def create_logging():
  """ Create logging """
  logs_folder = 'logs/'
  if not os.path.exists(logs_folder):
    os.makedirs(logs_folder)

  log_dir = logs_folder + str(datetime.now().strftime('logfile_ws02_%Y%m%d.log'))
  logging.basicConfig(filename=log_dir,
                  level=logging.INFO,
                  format='%(asctime)s | %(name)s | %(levelname)s | %(message)s')
  console = logging.StreamHandler()
  console.setLevel(logging.INFO)
  formatter = logging.Formatter('%(asctime)s | %(name)s | %(levelname)s | %(message)s')
  console.setFormatter(formatter)
  logging.getLogger('').addHandler(console)
  logging.info('Started')
  logging.info('Log file : ' + str(log_dir))
  logging.info('Python environment : ' + str(sys.version))
  logging.info('Requests version: ' + str(requests.__version__))
  logging.info('BeautifulSoup version: ' + str(bs4.__version__))
  logging.info('Pandas version: ' + str(pd.__version__))

create_logging()

2020-09-18 04:08:34,780 | root | INFO | Started
2020-09-18 04:08:34,781 | root | INFO | Log file : logs/logfile_ws02_20200918.log
2020-09-18 04:08:34,784 | root | INFO | informational message.
2020-09-18 04:08:34,789 | root | CRITICAL | critial!
