## Code for Scraping Aozora Access Rankings

In [1]:
#load libraries

import io #for making http requests look like files
import re #for regexs
from collections import OrderedDict #for keeping the ranking in sorted order
from urllib.parse import urljoin #for resolving page links against the base url

import bs4 #called `beautifulsoup4`, an html parser
import pandas #gives us DataFrames
import requests #for http requests

In [128]:
#Scrape the access-ranking index, visit each linked ranking page, and sum the
#access counts per title into `rank_dict`; `l` is the same data sorted by
#descending total count.

#grab content from page
base_url = 'http://www.aozora.gr.jp/access_ranking/'
REQUEST_TIMEOUT = 30    #seconds; without a timeout requests.get can block forever on a stalled connection
N_RANKING_LINKS = 223   #how many leading links of the index page are followed

ContentRequest = requests.get(base_url, timeout=REQUEST_TIMEOUT)
ContentRequest.raise_for_status()   #fail loudly rather than parse an error page as if it were the index
ContentRequest.encoding = "utf-8"   #need to switch encoding from original, which is ISO-8859-1
soup = bs4.BeautifulSoup(ContentRequest.text, 'html.parser')

#grab links to all the rankings
links = list(soup.find_all('a'))
links = links[0:N_RANKING_LINKS]   #last two links are not legit
#NOTE(review): the cutoff is a fixed count, so any ranking pages added to the
#index after this was written are ignored; confirm whether `links[:-2]` is the
#real intent.

#title -> {"author": ..., "count": accesses summed over every page visited}
#NOTE(review): keyed by title alone, so two works that share a title but have
#different authors are merged under the first author seen; confirm this is acceptable.
rank_dict = {}

for link in links:
    href = link.get("href")
    if not href:
        continue   #anchor without a target: nothing to fetch

    #urljoin gives the same result as concatenation for relative file names
    #and also copes with absolute hrefs
    rankings = requests.get(urljoin(base_url, href), timeout=REQUEST_TIMEOUT)
    rankings.raise_for_status()   #a silently skipped page would undercount every title on it
    rankings.encoding = "utf-8"
    soup = bs4.BeautifulSoup(rankings.text, 'html.parser')
    ranks = list(soup.find_all('tr'))
    ranks = ranks[1:] #eliminate the first row, which is just column headers

    for entry in ranks:
        entry_data = list(entry.find_all('td'))
        if len(entry_data) < 4:
            continue   #row lacks the title/author/count cells read below

        title = entry_data[1].text.replace('\n', '')
        #strip both full-width (U+3000) and ordinary spaces from the author name
        author = entry_data[2].text.replace(u'\u3000', '').replace(' ', '')
        count = entry_data[3].text

        #store in the dictionary
        if title in rank_dict:
            rank_dict[title]['count'] += int(count)
        else:
            rank_dict[title] = {"author": author, "count": int(count)}

#sort the results, most accessed title first
l = OrderedDict(sorted(rank_dict.items(), key=lambda t: t[1]['count'], reverse=True))

In [132]:
#build a DataFrame indexed by title from the sorted ranking, order it by
#descending access count, and display the first 100 rows
df = (
    pandas.DataFrame
    .from_dict(l, orient='index')
    .sort_values(by='count', ascending=False)
)
df.head(100)

Unnamed: 0,author,count
こころ,夏目漱石,4033985
〔雨ニモマケズ〕,宮沢賢治,3114769
銀河鉄道の夜,宮沢賢治,2637941
吾輩は猫である,夏目漱石,2628145
人間失格,太宰治,2532982
走れメロス,太宰治,2475493
ドグラ・マグラ,夢野久作,2234748
羅生門,芥川竜之介,2212075
坊っちゃん,夏目漱石,2194551
山月記,中島敦,1953982


In [135]:
#print results to excel file
import xlsxwriter #not referenced by name; importing it here fails early if the engine requested below is missing
import openpyxl #NOTE(review): not referenced in this cell; kept in case the environment relies on it

#NOTE(review): absolute, machine-specific location; change it before running on another machine
OUTPUT_PATH = r'c:\Users\Hoyt\Dropbox\JAPAN_CORPUS\TopTitlesByAccessRanking.xlsx'

#Leaving the with-block writes and closes the workbook. ExcelWriter.save() was
#removed in pandas 2.0, so an explicit save() call no longer works there.
with pandas.ExcelWriter(OUTPUT_PATH, engine='xlsxwriter') as writer:
    df.to_excel(writer, sheet_name='Sheet1')

In [58]:
#spot check: pull the cells of one row (index 340) out of `ranks` and print
#the cleaned title, author and raw count text
entries = [cell for cell in ranks[340].findAll('td')]
title = entries[1].text.replace('\n', '')
#drop full-width (U+3000) and ordinary spaces from the author name
author = entries[2].text.replace(u'\u3000', '').replace(' ', '')
count = entries[3].text
print(title, author, count)

あらくれ 徳田秋声 423


In [75]:
#spot check of the record layout: store the current title/author/count as the
#only entry of a fresh dictionary and display it
#NOTE: this rebinds rank_dict, replacing whatever it held before
rank_dict = dict()
rank_dict.update({title: {"author": author, "count": int(count)}})
rank_dict

{'あらくれ': {'author': '徳田秋声', 'count': 423}}