Add attributes to token #31

Merged 2 commits on Aug 30, 2019
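
For context, here is a minimal usage sketch of what this PR enables (adapted from the tests added below, not part of the diff itself): constructing a Token with the new morphological attributes and reading the comma-joined feature string.

from tiny_tokenizer.tiny_tokenizer_token import Token

# Build a token with the attributes introduced by this PR.
token = Token(
    surface="大崎",
    postag="名詞",
    postag2="固有名詞,人名,姓",
    conj_type="*",
    conj_form="*",
    origin_form="大崎",
    yomi="オオサキ",
    pron="オーサキ",
)

# `feature` joins every attribute that is not None with commas.
print(token.feature)  # -> 名詞,固有名詞,人名,姓,*,*,大崎,オオサキ,オーサキ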
33 changes: 33 additions & 0 deletions tests/test_token.py
@@ -0,0 +1,33 @@
"""Test for word tokenizers"""
import unittest

from tiny_tokenizer.tiny_tokenizer_token import Token


class TokenTest(unittest.TestCase):
    """Test ordinal word tokenizer."""

    def test_token_without_feature(self):
        token = Token(surface="大崎")
        self.assertEqual("大崎", token.surface)
        self.assertEqual("", token.feature)

    def test_token_with_postag(self):
        token = Token(surface="大崎", postag="名詞")
        self.assertEqual("大崎", token.surface)
        self.assertEqual("名詞", token.feature)

    def test_token_with_postag2(self):
        token = Token(
            surface="大崎",
            postag="名詞",
            postag2="固有名詞,人名,姓",
            conj_type="*",
            conj_form="*",
            origin_form="大崎",
            yomi="オオサキ",
            pron="オーサキ")

        self.assertEqual(
            "名詞,固有名詞,人名,姓,*,*,大崎,オオサキ,オーサキ",
            token.feature)
60 changes: 51 additions & 9 deletions tiny_tokenizer/tiny_tokenizer_token.py
@@ -1,14 +1,20 @@
"""Token class."""
from typing import Optional


class Token:
    """Token class for tiny_tokenizer."""

    def __init__(
        self,
        surface: str,
        postag: Optional[str] = None,
        postag2: Optional[str] = None,
        conj_type: Optional[str] = None,
        conj_form: Optional[str] = None,
        origin_form: Optional[str] = None,
        yomi: Optional[str] = None,
        pron: Optional[str] = None,
    ):
"""
Initializer for Token.
Expand All @@ -17,14 +23,29 @@ def __init__(
---
surface (str)
surface (original form) of a word
postag (Optional[str], default None)
postag (str, default: None)
part-of-speech tag of a word (optional)
postag2 (Optional[str], default None)
postag2 (str, default: None)
detailed part-of-speech tag of a word (optional)
conjugate type (str, default: None)
conjugate type of word (optional)
conjugate form (str, default: None)
conjugate type of word (optional)
origin_form (str, default: None)
original form of a word
yomi (str, default: None)
yomi of a word (optional)
pron (str, default: None)
pronounciation of a word (optional)
"""
        self.surface = surface
        self.postag = postag
        self.postag2 = postag2
        self.conj_type = conj_type
        self.conj_form = conj_form
        self.origin_form = origin_form
        self.pron = pron
        self.yomi = yomi

    def __repr__(self):
        representation = self.surface
@@ -33,6 +54,27 @@ def __repr__(self):
        return representation

    def __eq__(self, right):
        return (
            self.surface == right.surface
            and self.postag == right.postag
            and self.postag2 == right.postag2
            and self.yomi == right.yomi)

    @property
    def feature(self):
        feature = []
        if self.postag is not None:
            feature.append(self.postag)
        if self.postag2 is not None:
            feature.append(self.postag2)
        if self.conj_type is not None:
            feature.append(self.conj_type)
        if self.conj_form is not None:
            feature.append(self.conj_form)
        if self.origin_form is not None:
            feature.append(self.origin_form)
        if self.yomi is not None:
            feature.append(self.yomi)
        if self.pron is not None:
            feature.append(self.pron)
        return ','.join(feature)