In [46]:
from janome.tokenizer import Tokenizer
import urllib.request
import zipfile
import tempfile
import os
from abc import ABCMeta, abstractmethod

class Task:
    """An ordered pipeline of processing steps.

    Each step is any object exposing ``apply(value)``; ``run`` feeds the
    output of one step into the next.
    """

    def __init__(self):
        # Keep the step list per instance. A class-level list would be shared
        # by every Task, so steps registered on one pipeline would leak into
        # all the others.
        self.processes = []

    def process(self, process):
        """Append a step to the pipeline.

        Args:
            process: Object with an ``apply(value)`` method.

        Returns:
            This Task, so that calls can be chained.
        """
        self.processes.append(process)
        return self

    def run(self, init):
        """Run every registered step in registration order.

        Args:
            init: Value handed to the first step.

        Returns:
            The value returned by the last step, or ``init`` unchanged when
            no step has been registered.
        """
        for p in self.processes:
            init = p.apply(init)
        return init


class Process(metaclass=ABCMeta):
    """Abstract base for a single pipeline step.

    Concrete subclasses must override ``apply``; the class itself cannot be
    instantiated.
    """

    @abstractmethod
    def apply(self, init):
        """Transform ``init`` and return the result.

        The base implementation does nothing and returns ``None``.
        """

class DownloadAozora(Process):
    """Pipeline step that fetches a zip archive and returns its text content.

    The archive is stored in the current working directory under the last
    path component of the URL and is only downloaded when that file does not
    already exist.
    """

    def __init__(self, url):
        """Remember the URL of the zip archive to fetch."""
        self.url = url

    def apply(self, init):
        """Return the decoded text of the first ``.txt`` entry in the archive.

        Args:
            init: Ignored; accepted only to satisfy the ``Process`` interface.

        Returns:
            The content of the first matching entry, decoded as Shift_JIS.

        Raises:
            Exception: If no entry name contains ``.txt``.
        """
        localfile = self.url.split('/')[-1]
        if not os.path.exists(localfile):
            print('downloading')
            # Must be self.url: a bare `url` is not defined in this scope and
            # would raise NameError (or silently pick up an unrelated global).
            urllib.request.urlretrieve(self.url, localfile)
        with zipfile.ZipFile(localfile, 'r') as zip_fp:
            for entry in zip_fp.infolist():
                # `> 0` accepts '.txt' anywhere in the name except at the very
                # start, i.e. a name that is only an extension is skipped.
                if entry.filename.find('.txt') > 0:
                    with zip_fp.open(entry.filename, 'r') as fp:
                        return fp.read().decode('shift_jis')
        raise Exception('No text file found in {0}'.format(self.url))


class TokenCounter(Process):
    """Pipeline step that counts token surfaces accepted by a predicate."""

    def __init__(self, tokenizer, filter):
        """Store the tokenizer and the predicate used to select tokens.

        Args:
            tokenizer: Object whose ``tokenize(text)`` yields tokens that
                expose a ``surface`` attribute.
            filter: Callable taking a token and returning a truthy value for
                tokens that should be counted.
        """
        self.tokenizer, self.filter = tokenizer, filter

    def apply(self, lines):
        """Count accepted tokens across all CRLF-separated lines.

        Args:
            lines: Text whose lines are separated by ``'\\r\\n'``.

        Returns:
            A dict mapping each accepted token surface to its occurrence count.
        """
        tally = {}
        for row in lines.split('\r\n'):
            for tok in self.tokenizer.tokenize(row):
                if not self.filter(tok):
                    continue
                surface = tok.surface
                if surface in tally:
                    tally[surface] += 1
                else:
                    tally[surface] = 1
        return tally

def noun_filter(token):
    """Tell whether a token is a noun that is not written purely in あ–ん kana.

    Args:
        token: Object with ``part_of_speech`` and ``surface`` string attributes.

    Returns:
        True when ``part_of_speech`` contains '名詞' and at least one character
        of ``surface`` lies outside the range あ–ん; False otherwise.
    """

    def only_kana(text):
        # all() is True for an empty string, so an empty surface is treated
        # as kana-only and therefore rejected by the caller.
        return all("あ" <= ch <= "ん" for ch in text)

    if '名詞' not in token.part_of_speech:
        return False
    return not only_kana(token.surface)

class SortByFreq(Process):
    """Pipeline step that ranks ``(word, count)`` pairs by count."""

    def __init__(self, limit, desc=True):
        """Configure how many entries to keep and the sort direction.

        Args:
            limit: Upper slice bound applied to the sorted pairs.
            desc: Sort from highest to lowest count when true.
        """
        self.limit, self.desc = limit, desc

    def apply(self, worddic):
        """Return the first ``limit`` ``(word, count)`` pairs ordered by count.

        Args:
            worddic: Mapping from word to count.

        Returns:
            A list of ``(word, count)`` tuples; ties keep the mapping's
            iteration order because the sort is stable.
        """
        ranked = list(worddic.items())
        ranked.sort(key=lambda pair: pair[1], reverse=self.desc)
        top = ranked[:self.limit]
        return [(word, cnt) for word, cnt in top]

    
# URL of the zip archive to analyse.
TEXT = 'http://www.aozora.gr.jp/cards/001699/files/57858_ruby_59671.zip'

# Pipeline: fetch and decode the text, count the tokens accepted by
# noun_filter, then keep the 50 most frequent ones.
t = Task()
t.process(DownloadAozora(TEXT))
t.process(TokenCounter(Tokenizer(), noun_filter))
t.process(SortByFreq(50))
tokens = t.run(None)

# Unpack each pair directly instead of reusing `t` as the loop variable, which
# would overwrite the Task instance with the last (word, count) tuple.
for word, cnt in tokens:
    print("{0}({1})".format(word, cnt))

我々(401)
的(324)
.(251)
一(202)
私(161)
中(154)
二(141)
/(137)
五(127)
-(126)
_(124)
十(119)
彫刻(118)
レイク(113)
者(112)
上(112)
何(106)
都市(96)
三(93)
時(92)
太字(92)
後(86)
〇(84)
恐怖(84)
(*(83)
メートル(81)
他(79)
年(77)
氷(75)
調査(74)
＃「(73)
性(72)
部(71)
彼ら(71)
前(70)
形(70)
一つ(70)
間(69)
山脈(68)
キャンプ(66)
部分(66)
ダンフォース(66)
＃(66)
彼(66)
大(64)
目(63)
奇妙(62)
四(61)
物(61)
機(60)


TypeError: update expected at most 1 arguments, got 2