Skip to content

Commit

Permalink
update experimental streaming interface from koshort
Browse files Browse the repository at this point in the history
  • Loading branch information
Jung committed Sep 9, 2018
1 parent 86b62e2 commit 82368b6
Show file tree
Hide file tree
Showing 22 changed files with 1,323 additions and 2 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Expand Up @@ -9,4 +9,5 @@
.cache
build/
dist/
docs/_build/
docs/_build/
data/
1 change: 1 addition & 0 deletions konlpy/__init__.py
Expand Up @@ -11,6 +11,7 @@
from konlpy import (
corpus,
data,
stream,
internals,
tag
)
2 changes: 1 addition & 1 deletion konlpy/about.py
Expand Up @@ -15,7 +15,7 @@
]

__title__ = 'KoNLPy'
__version__ = '0.5.1'
__version__ = '0.5.2'

__author__ = 'Team KoNLPy'
__email__ = 'konlpy@googlegroups.com'
Expand Down
12 changes: 12 additions & 0 deletions konlpy/constants.py
@@ -0,0 +1,12 @@
"""Constants used in konlpy library. """
import os

# Default data directory, given as a relative path with a trailing slash so
# that file names can be appended to it directly.
DATA_DIR = "data/"

# Fourteen single-syllable Korean strings.
# NOTE(review): how this list is consumed is not visible here -- confirm usage.
ALPHABET = [
    "가", "나", "다", "라", "마", "바", "사",
    "아", "자", "차", "카", "타", "파", "하",
]


def make_dir(directory=DATA_DIR):
    """Create the konlpy data directory used to store streaming data.

    Missing parent directories are created as well. Calling this when the
    path already exists is a no-op.

    Args:
        directory (str, optional): Path of the directory to create.
            Defaults to ``DATA_DIR``.

    Raises:
        OSError: If the directory does not exist and could not be created.
    """

    # Try first and inspect the failure afterwards: an "exists?" check
    # followed by a create can race with another process creating the same
    # directory in between.
    try:
        os.makedirs(directory)
    except OSError:
        if not os.path.exists(directory):
            raise
71 changes: 71 additions & 0 deletions konlpy/data.py
Expand Up @@ -3,6 +3,7 @@

import os
import sys
import glob
import textwrap

if sys.version_info[0] >= 3:
Expand All @@ -11,6 +12,7 @@
import cPickle as pickle

from konlpy import utils
from konlpy.constants import DATA_DIR, make_dir


#: A dictionary describing the formats that are supported by
Expand Down Expand Up @@ -133,6 +135,75 @@ def file_size(self):
return os.stat(self.path).st_size


def clear():
    """Remove every file in the konlpy output data directory.

    Only regular files directly inside ``DATA_DIR`` are deleted.
    Sub-directories are left untouched, and names starting with a dot are
    not matched by the ``*`` pattern. If the data directory does not exist,
    nothing happens.

    .. code-block:: python

        >>> from konlpy import data
        >>> data.clear()

    Raises:
        OSError: If a matched file cannot be removed.
    """

    # ``os.listdir`` does not expand wildcards, so the pattern has to go
    # through ``glob``; the returned paths already include the directory
    # prefix, which ``os.remove`` needs.
    for path in glob.glob(DATA_DIR + "*"):
        if os.path.isfile(path):
            os.remove(path)


def listdir():
    """Print the names of the entries in the konlpy default data directory.

    The entries are printed as a single list; nothing is returned.

    .. code-block:: python

        >>> import konlpy
        >>> konlpy.listdir()
    """

    entries = os.listdir(DATA_DIR)
    print(entries)


class CorpusReader(object):
    """Read corpus files from the konlpy data directory.

    Attributes are plain and may be reassigned by the caller:

    * ``items``: list of file paths that :meth:`read` will load.
    * ``corpus``: dict mapping each file's base name to its text content.

    .. code-block:: python

        >>> from konlpy.data import CorpusReader
        >>> reader = CorpusReader()
        >>> reader.read()
        >>> reader.corpus
        {...}
        >>> reader.items = ["data/specific_corpus.txt"]
        >>> reader.read()
        >>> reader.corpus['specific_corpus.txt']
        content of corpus
    """

    def __init__(self, extension='.txt'):
        """Collect the files in the data directory that end with *extension*.

        Args:
            extension (str, optional): Defaults to '.txt'. Extension of the
                corpus files to load.
        """

        self.items = glob.glob(DATA_DIR + "*" + extension)
        self.corpus = {}

    def read(self):
        """Read every file listed in ``items`` into the ``corpus`` dictionary.

        Each file is decoded as UTF-8 and stored under its base name, so two
        items sharing a base name overwrite each other. Entries already in
        ``corpus`` are kept unless replaced.

        Raises:
            IOError/OSError: If a listed file cannot be opened.
            UnicodeDecodeError: If a file is not valid UTF-8.
        """

        for filename in self.items:
            # Read-only mode is enough (no write permission required), and
            # the ``with`` block closes the handle even if decoding fails.
            with open(filename, mode='r', encoding='utf-8') as reader:
                self.corpus[os.path.basename(filename)] = reader.read()


class StringWriter(object):
    """Append lines of text to a file inside the konlpy data directory.

    The file is opened in append mode with UTF-8 encoding and stays open
    until :meth:`close` is called. The writer can also be used as a context
    manager, which closes the file on exit:

    .. code-block:: python

        >>> with StringWriter("trends.txt") as writer:
        ...     writer.write("keyword")
    """

    def __init__(self, filename):
        """Open ``DATA_DIR + filename`` for appending.

        The data directory is created first if needed.

        Args:
            filename (str): Name of the file, relative to the data directory.
        """

        make_dir()
        self.writer = open(DATA_DIR + filename, mode='a', encoding='utf-8')

    def write(self, string):
        """Write *string* followed by a newline.

        Args:
            string (str): Text to append as one line.
        """

        self.writer.write(string)
        self.writer.write('\n')
        # Long-running streamers keep the file open indefinitely; flushing
        # per line keeps collected data from sitting in the buffer if the
        # process is interrupted.
        self.writer.flush()

    def close(self):
        """Close the underlying file. Safe to call more than once."""

        self.writer.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()


# Names exported by ``from konlpy.data import *``.
__all__ = [
    'find',
    'load',
    'path',
    'FileSystemPathPointer',
    'PathPointer',
]
12 changes: 12 additions & 0 deletions konlpy/stream/__init__.py
@@ -0,0 +1,12 @@
"""konlpy.stream provides high-level streaming interfaces for various websites and services.

Brought from the original project koshort by nyanye (iam@nyanye.com).
"""

from __future__ import absolute_import

from konlpy.stream.base import BaseStreamer, KonlpyStreamerError
from konlpy.stream.twitter import TwitterStreamer
from konlpy.stream.naver import NaverStreamer
from konlpy.stream.dcinside import DCInsideStreamer
from konlpy.stream.misc import NavtterStreamer
from konlpy.stream.daum import DaumStreamer
from konlpy.stream.google_trend import GoogleTrendStreamer
71 changes: 71 additions & 0 deletions konlpy/stream/base.py
@@ -0,0 +1,71 @@
# -*- coding: utf-8 -*-
from __future__ import print_function
from __future__ import absolute_import
from __future__ import division

from argparse import ArgumentParser
from konlpy.utils import PropagatingThread
import urllib3


__all__ = ['KonlpyStreamerError', 'BaseStreamer']


class KonlpyStreamerError(Exception):
    """Error carrying a message together with the streamer it concerns.

    Attributes are set from the constructor arguments:

    * ``message``: description of what went wrong.
    * ``streamer``: the streamer (or its name) the error refers to.
    """

    def __init__(self, message, streamer):
        self.message, self.streamer = message, streamer

    def __str__(self):
        # Streamer first, then the message on its own line.
        parts = (self.streamer, self.message)
        return "%s has crashed. \n%s" % parts


class BaseStreamer(object):
    """Base class for streamers.

    Subclasses are expected to provide a ``job()`` method (the work that
    ``stream`` runs) and an ``options`` attribute (read by ``show_options``);
    neither is defined here.

    Methods:
        get_parser: returns initial argument parser
        show_options: show options that can be used or parsed
        stream: run ``job``, in a worker thread when ``is_async`` is true
    """

    def __init__(self, is_async=True):
        """
        Args:
            is_async (bool, optional): Defaults to True. Run ``job`` in a
                ``PropagatingThread`` instead of the calling thread.
        """

        self.is_async = is_async

    def get_parser(self):
        """customized argument parser to set various parameters

        Returns:
            object: argument parser with the ``-v/--verbose`` flag registered.
        """

        parser = ArgumentParser()
        parser.add_argument(
            '-v', '--verbose',
            help="increase verbosity",
            action="store_true"
        )
        return parser

    def show_options(self):
        """Print out options available and predefined values."""

        for attr, value in sorted(vars(self.options).items()):
            print("{} = {}".format(attr, value))

    def stream(self):
        """Run ``job`` and restart it whenever the connection drops.

        When ``is_async`` is true the job runs in a ``PropagatingThread``
        that is started and joined immediately, so this call still blocks
        until the job ends.
        NOTE(review): presumably ``PropagatingThread.join`` re-raises
        exceptions from the worker -- confirm in ``konlpy.utils``.

        Returns:
            None when the job finishes normally, False when it was stopped
            by a ``KeyboardInterrupt`` or a ``RecursionError``.
        """

        # A loop rather than a recursive self-call: retries no longer grow
        # the call stack, and the earlier recursive call passed an
        # ``is_async`` keyword that ``stream`` does not accept.
        while True:
            try:
                if self.is_async:
                    self._thread = PropagatingThread(target=self.job)
                    self._thread.start()
                    self._thread.join()
                else:
                    self.job()
            except urllib3.exceptions.ProtocolError:
                print("ProtocolError has raised but continue to stream.")
                continue
            except RecursionError:
                # Kept so a job that exhausts the stack still ends quietly.
                return False
            except KeyboardInterrupt:
                print("User has interrupted.")
                return False
            return None
127 changes: 127 additions & 0 deletions konlpy/stream/daum.py
@@ -0,0 +1,127 @@
# -*- coding: utf-8 -*-
from __future__ import print_function
from __future__ import absolute_import
from __future__ import division

from urllib.request import urlopen
from bs4 import BeautifulSoup
from argparse import ArgumentParser
from time import sleep

from konlpy.data import StringWriter
from konlpy.stream import BaseStreamer
from konlpy.utils import PropagatingThread


def get_current_trend():
    """Get current top trending words from the Daum front page.

    Fetches ``https://www.daum.net/`` and extracts, for every ``roll_txt``
    entry inside the ``realtime_part`` section, the text of its ``ir_wa``
    and ``txt_issue`` spans.

    Returns:
        counts: list of ``ir_wa`` span texts, one per entry
            (presumably the rank label -- confirm against the page markup)
        keywords: list of ``txt_issue`` span texts (the trending keywords),
            in the same order as ``counts``

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
        AttributeError: If the expected elements are missing from the page
            (for example after a layout change).
    """

    url = 'https://www.daum.net/'
    # Parse while the response is open; the ``with`` block then releases
    # the connection, which matters because this is polled repeatedly.
    with urlopen(url) as html:
        soup = BeautifulSoup(html, 'html.parser')

    counts = []
    keywords = []

    realtime = soup.find("div", {"class": "realtime_part"})
    item_list = [
        item.find("div", {"class": "rank_cont"})
        for item in realtime.findAll("div", {"class": "roll_txt"})
    ]

    for item in item_list:
        counts.append(item.find("span", {"class": "ir_wa"}).getText())
        keywords.append(item.find("span", {"class": "txt_issue"}).getText())

    return counts, keywords


class DaumStreamer(BaseStreamer):
    """DaumStreamer helps to stream daum trending keywords asynchronously.

    Options are read from the command line (unknown arguments are ignored)
    and stored in ``options``; collected keywords are appended to
    ``options.filename`` through a ``StringWriter``.

    .. code-block:: python

        >>> from konlpy.stream import daum
        >>> streamer = daum.DaumStreamer()
        >>> streamer.stream()
        김민승
        이유애린
        훈남정음
        소유진
        ...
    """

    def __init__(self, is_async=True):
        """
        Args:
            is_async (bool, optional): Defaults to True. Passed on to
                ``BaseStreamer``.
        """

        super(DaumStreamer, self).__init__(is_async=is_async)

        parser = self.get_parser()
        parser.add_argument(
            '-d', '--display_rank',
            help="display rank in results and commandline.",
            action="store_true"
        )
        parser.add_argument(
            '-i', '--interval',
            help="streaming interval(secs)",
            default=60,
            type=int
        )
        parser.add_argument(
            '-n', '--n_limits',
            help="stop when this amount of trends are collected. 0 for forever",
            default=10,
            type=int
        )
        parser.add_argument(
            '--filename',
            help="filename to be saved.",
            default="trends.txt"
        )

        # parse_known_args so arguments meant for the host program do not
        # make construction fail.
        self.options, _ = parser.parse_known_args()
        self.writer = StringWriter(self.options.filename)

    def save_and_print(self):
        """collect current trending words and save or print

        Every keyword is written to the output file; it is also printed
        when ``options.verbose`` is set. With ``options.display_rank`` each
        line has the form ``<count>.<keyword>``.
        """

        counts, keywords = get_current_trend()
        if self.options.display_rank:
            lines = [
                "{}.{}".format(count, keyword)
                for count, keyword in zip(counts, keywords)
            ]
        else:
            lines = keywords

        for line in lines:
            self.writer.write(line)
            if self.options.verbose:
                print(line)

    def _has_remaining(self, n_try):
        """Return True while another collection round should run."""

        limit = self.options.n_limits
        return limit == 0 or limit > n_try

    def job(self):
        """Streaming job with intervals.

        Collects trends ``options.n_limits`` times (forever when it is 0),
        waiting ``options.interval`` seconds between rounds.
        """

        n_try = 0
        while self._has_remaining(n_try):
            n_try += 1
            self.save_and_print()
            # Wait only if another round follows; sleeping after the final
            # round would just delay the return.
            if self._has_remaining(n_try):
                sleep(self.options.interval)


def main():
    """Run a verbose, synchronous DaumStreamer from the command line."""

    streamer = DaumStreamer(is_async=False)
    # Force console output regardless of whether -v was passed.
    streamer.options.verbose = True
    streamer.show_options()
    streamer.stream()


if __name__ == '__main__':
    main()

0 comments on commit 82368b6

Please sign in to comment.