Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
update experimental streaming interface from koshort
- Loading branch information
Jung
committed
Sep 9, 2018
1 parent
86b62e2
commit 82368b6
Showing
22 changed files
with
1,323 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,4 +9,5 @@ | |
.cache | ||
build/ | ||
dist/ | ||
docs/_build/ | ||
docs/_build/ | ||
data/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -11,6 +11,7 @@ | |
from konlpy import ( | ||
corpus, | ||
data, | ||
stream, | ||
internals, | ||
tag | ||
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
"""Constants used in konlpy library. """ | ||
import os | ||
|
||
# Default directory for streamed data; a relative path, so it resolves
# against the current working directory of the process.
DATA_DIR = "data/"
# Fourteen Hangul syllables (ga, na, da, ...).
# NOTE(review): the consumer of this list is not visible here -- confirm usage.
ALPHABET = ["가", "나", "다", "라", "마", "바", "사", "아", "자", "차", "카", "타", "파", "하"]


def make_dir(directory=DATA_DIR):
    """Create the konlpy data directory used to store streaming data.

    Missing parent directories are created as well. Calling this for a path
    that already exists is a no-op.

    Args:
        directory (str): Path of the directory to create. Defaults to
            ``DATA_DIR``.

    Raises:
        OSError: If the directory could not be created and the path still
            does not exist afterwards (e.g. permission denied).
    """

    # Attempt the creation first instead of checking beforehand: a separate
    # exists() check can race with another process creating the directory.
    try:
        os.makedirs(directory)
    except OSError:
        # An already-existing path is fine (matches the previous behaviour);
        # anything else is a real failure and must surface.
        if not os.path.exists(directory):
            raise
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
"""konlpy.stream is a high-level streaming interfaces | ||
for various websites and services brought from the original project koshort by nyanye(iam@nyanye.com)""" | ||
|
||
from __future__ import absolute_import | ||
|
||
from konlpy.stream.base import BaseStreamer, KonlpyStreamerError | ||
from konlpy.stream.twitter import TwitterStreamer | ||
from konlpy.stream.naver import NaverStreamer | ||
from konlpy.stream.dcinside import DCInsideStreamer | ||
from konlpy.stream.misc import NavtterStreamer | ||
from konlpy.stream.daum import DaumStreamer | ||
from konlpy.stream.google_trend import GoogleTrendStreamer |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
# -*- coding: utf-8 -*- | ||
from __future__ import print_function | ||
from __future__ import absolute_import | ||
from __future__ import division | ||
|
||
from argparse import ArgumentParser | ||
from konlpy.utils import PropagatingThread | ||
import urllib3 | ||
|
||
|
||
__all__ = ['KonlpyStreamerError', 'BaseStreamer'] | ||
|
||
|
||
class KonlpyStreamerError(Exception):
    """Error describing a crashed streamer.

    Args:
        message: Description of what went wrong.
        streamer: The streamer (or its name) that crashed; it is interpolated
            into the string form of the error.
    """

    def __init__(self, message, streamer):
        # Forward the arguments to Exception so that ``args`` is populated;
        # without this, copying or unpickling the error fails because the
        # class cannot be re-instantiated from an empty ``args`` tuple.
        super(KonlpyStreamerError, self).__init__(message, streamer)
        self.message = message
        self.streamer = streamer

    def __str__(self):
        return "%s has crashed. \n%s" % (self.streamer, self.message)
|
||
|
||
class BaseStreamer(object):
    """Base class shared by the konlpy streamers.

    Subclasses are expected to provide a ``job`` method (the streaming work)
    and an ``options`` attribute (the parsed arguments); neither is defined
    in this class.

    Methods:
        get_parser: returns initial argument parser
        show_options: show options that can be used or parsed
        stream: run ``job``, in a worker thread when ``is_async`` is set
    """

    def __init__(self, is_async=True):
        self.is_async = is_async

    def get_parser(self):
        """Build the argument parser with the options common to all streamers.

        Returns:
            object: argument parser exposing ``-v`` / ``--verbose``.
        """

        parser = ArgumentParser()
        parser.add_argument(
            '-v', '--verbose',
            help="increase verbosity",
            action="store_true"
        )
        return parser

    def show_options(self):
        """Print every option in ``self.options`` as ``name = value``, sorted by name."""

        for attr, value in sorted(vars(self.options).items()):
            print("{} = {}".format(attr, value))

    def stream(self):
        """Run ``self.job``, restarting it whenever urllib3 reports a ProtocolError.

        With ``is_async`` set, the job runs in a ``PropagatingThread`` that is
        joined immediately, so this call blocks until the job ends either way.
        NOTE(review): this relies on ``PropagatingThread.join`` re-raising the
        job's exception in the calling thread -- confirm in konlpy.utils.

        Returns:
            None when the job finishes normally, False when it was stopped by
            a KeyboardInterrupt or a RecursionError.
        """

        # Retry with a loop rather than by calling stream() recursively: the
        # recursive call passed an ``is_async`` argument that stream() does
        # not accept, and each retry would also deepen the call stack.
        # There is no retry limit or delay, matching the "continue to stream"
        # intent of the message printed below.
        while True:
            try:
                if self.is_async:
                    self._thread = PropagatingThread(target=self.job)
                    self._thread.start()
                    self._thread.join()
                else:
                    self.job()
            except KeyboardInterrupt:
                print("User has interrupted.")
                return False
            except RecursionError:
                return False
            except urllib3.exceptions.ProtocolError:
                print("ProtocolError has raised but continue to stream.")
                continue
            else:
                return None
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,127 @@ | ||
# -*- coding: utf-8 -*- | ||
from __future__ import print_function | ||
from __future__ import absolute_import | ||
from __future__ import division | ||
|
||
from urllib.request import urlopen | ||
from bs4 import BeautifulSoup | ||
from argparse import ArgumentParser | ||
from time import sleep | ||
|
||
from konlpy.data import StringWriter | ||
from konlpy.stream import BaseStreamer | ||
from konlpy.utils import PropagatingThread | ||
|
||
|
||
def get_current_trend():
    """Get current top trending words from the Daum front page.

    Returns:
        counts: list with the text of each entry's ``ir_wa`` span
        keywords: list with the text of each entry's ``txt_issue`` span

        Both lists are empty when the expected ``realtime_part`` container is
        not present in the page. Entries missing either span are skipped.
    """

    url = 'https://www.daum.net/'
    # Parse inside the ``with`` block so the HTTP response is always closed,
    # even if reading or parsing fails.
    with urlopen(url) as response:
        soup = BeautifulSoup(response, 'html.parser')

    counts = []
    keywords = []

    container = soup.find("div", {"class": "realtime_part"})
    if container is None:
        # Page layout differs from what this scraper expects; report no trends
        # instead of failing on an attribute lookup against None.
        return counts, keywords

    for entry in container.find_all("div", {"class": "roll_txt"}):
        item = entry.find("div", {"class": "rank_cont"})
        if item is None:
            continue
        count = item.find("span", {"class": "ir_wa"})
        keyword = item.find("span", {"class": "txt_issue"})
        if count is None or keyword is None:
            continue
        counts.append(count.getText())
        keywords.append(keyword.getText())

    return counts, keywords
|
||
|
||
class DaumStreamer(BaseStreamer):
    """DaumStreamer helps to stream daum trending keywords asynchronously.

    Keywords are written through a ``StringWriter`` and are additionally
    printed when the ``verbose`` option is set.

    .. code-block:: python

        >>> from konlpy.stream import daum
        >>> streamer = daum.DaumStreamer()
        >>> streamer.stream()
        김민승
        이유애린
        훈남정음
        소유진
        ...
    """

    def __init__(self, is_async=True):
        """Parse the streamer options and open the output writer.

        Options are read from the process command line; unknown arguments are
        ignored.

        Args:
            is_async (bool): Stored by the base class and consulted by
                ``stream`` to decide whether the job runs in a thread.
        """
        # Let the base class store ``is_async`` so both initialisers stay in sync.
        super(DaumStreamer, self).__init__(is_async=is_async)

        parser = self.get_parser()
        parser.add_argument(
            '-d', '--display_rank',
            help="display rank in results and commandline.",
            action="store_true"
        )
        parser.add_argument(
            '-i', '--interval',
            help="streaming interval(secs)",
            default=60,
            type=int
        )
        parser.add_argument(
            '-n', '--n_limits',
            help="stop when this amount of trends are collected. 0 for forever",
            default=10,
            type=int
        )
        parser.add_argument(
            '--filename',
            help="filename to be saved.",
            default="trends.txt"
        )

        self.options, _ = parser.parse_known_args()
        self.writer = StringWriter(self.options.filename)

    def save_and_print(self):
        """Collect the current trending words, write them out and optionally print them.

        With ``display_rank`` set, each line is ``"<count>.<keyword>"``;
        otherwise it is the bare keyword.
        """

        counts, keywords = get_current_trend()
        if self.options.display_rank:
            lines = ["{}.{}".format(count, keyword)
                     for count, keyword in zip(counts, keywords)]
        else:
            lines = keywords

        for line in lines:
            self.writer.write(line)
            if self.options.verbose:
                print(line)

    def _has_remaining(self, n_try):
        """Return True while another collection round is allowed (0 means no limit)."""
        return self.options.n_limits == 0 or self.options.n_limits > n_try

    def job(self):
        """Collect trends repeatedly, waiting ``options.interval`` seconds between rounds.

        Runs ``options.n_limits`` rounds, or forever when ``n_limits`` is 0.
        """

        n_try = 0
        while self._has_remaining(n_try):
            n_try += 1
            self.save_and_print()
            # Only wait when another round follows; sleeping after the final
            # round would just delay the return by a full interval.
            if self._has_remaining(n_try):
                sleep(self.options.interval)
|
||
|
||
def main():
    """Script entry point: run a synchronous, verbose DaumStreamer.

    The parsed options are printed before streaming starts.
    """
    streamer = DaumStreamer(is_async=False)
    # Force verbose output regardless of the command line.
    streamer.options.verbose = True
    streamer.show_options()
    streamer.stream()


if __name__ == '__main__':
    main()
Oops, something went wrong.