Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add initial Chinese support for hero.lang.zh.preprocessing #128

Draft
wants to merge 14 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ jobs:
env: PATH=/c/Python38:/c/Python38/Scripts:$PATH
install:
- pip3 install --upgrade pip # all three OSes agree about 'pip3'
- pip3 install black
- pip3 install black==19.10b0
- pip3 install ".[dev]" .
# 'python' points to Python 2.7 on macOS but points to Python 3.8 on Linux and Windows
# 'python3' is a 'command not found' error on Windows but 'py' works on Windows only
Expand Down
3 changes: 2 additions & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,12 @@ install_requires =
# TODO pick the correct version.
[options.extras_require]
dev =
black>=19.10b0
black==19.10b0
pytest>=4.0.0
Sphinx>=3.0.3
sphinx-markdown-builder>=0.5.4
recommonmark>=0.6.0
nbsphinx
parameterized>=0.7.4
coverage
jieba
Empty file added tests/lang/__init__.py
Empty file.
Empty file added tests/lang/zh/__init__.py
Empty file.
125 changes: 125 additions & 0 deletions tests/lang/zh/test_indexes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
import pandas as pd
from texthero.lang.zh import preprocessing

from ... import PandasTestCase
import unittest
import string
from parameterized import parameterized


# Define valid inputs for different functions.
# NOTE: a non-default index ([5], [5, 6]) is used deliberately — if a
# function resets or rebuilds the index, the tests below will detect it.
s_text = pd.Series(["Test"], index=[5])
# s_numeric / s_numeric_lists are currently only referenced by the
# commented-out representation test cases further down.
s_numeric = pd.Series([5.0], index=[5])
s_numeric_lists = pd.Series([[5.0, 5.0], [6.0, 6.0]], index=[5, 6])

# Define all test cases. Every test case is a list
# of [name of test case, function to test, tuple of valid input for the function].
# First argument of valid input has to be the Pandas Series where we
# want to keep the index. If this is different for a function, a separate
# test case has to implemented in the class below.
# The tests will be run by AbstractIndexTest below through the @parameterized
# decorator.
# The names will be expanded automatically, so e.g. "named_entities"
# creates test cases test_correct_index_named_entities and test_incorrect_index_named_entities.


test_cases_preprocessing = [
    ["fillna", preprocessing.fillna, (s_text,)],
    ["remove_whitespace", preprocessing.remove_whitespace, (s_text,)],
    ["clean", preprocessing.clean, (s_text,)],
    ["remove_html_tags", preprocessing.remove_html_tags, (s_text,)],
    ["tokenize", preprocessing.tokenize, (s_text,)],
    ["replace_urls", preprocessing.replace_urls, (s_text, "")],
    ["remove_urls", preprocessing.remove_urls, (s_text,)],
    ["replace_tags", preprocessing.replace_tags, (s_text, "")],
    ["remove_tags", preprocessing.remove_tags, (s_text,)],
    ["replace_hashtags", preprocessing.replace_hashtags, (s_text, "")],
    ["remove_hashtags", preprocessing.remove_hashtags, (s_text,)],
]

# Only preprocessing is implemented for Chinese so far; nlp /
# representation / visualization cases below are kept commented out.
test_cases = test_cases_preprocessing

# test_cases_nlp = [
# ["named_entities", nlp.named_entities, (s_text,)],
# ["noun_chunks", nlp.noun_chunks, (s_text,)],
# ]
#
# test_cases_preprocessing = [
# ["fillna", preprocessing.fillna, (s_text,)],
# ["lowercase", preprocessing.lowercase, (s_text,)],
# ["replace_digits", preprocessing.replace_digits, (s_text, "")],
# ["remove_digits", preprocessing.remove_digits, (s_text,)],
# ["replace_punctuation", preprocessing.replace_punctuation, (s_text, "")],
# ["remove_punctuation", preprocessing.remove_punctuation, (s_text,)],
# ["remove_diacritics", preprocessing.remove_diacritics, (s_text,)],
# ["remove_whitespace", preprocessing.remove_whitespace, (s_text,)],
# ["replace_stopwords", preprocessing.replace_stopwords, (s_text, "")],
# ["remove_stopwords", preprocessing.remove_stopwords, (s_text,)],
# ["stem", preprocessing.stem, (s_text,)],
# ["clean", preprocessing.clean, (s_text,)],
# ["remove_round_brackets", preprocessing.remove_round_brackets, (s_text,)],
# ["remove_curly_brackets", preprocessing.remove_curly_brackets, (s_text,)],
# ["remove_square_brackets", preprocessing.remove_square_brackets, (s_text,)],
# ["remove_angle_brackets", preprocessing.remove_angle_brackets, (s_text,)],
# ["remove_brackets", preprocessing.remove_brackets, (s_text,)],
# ["remove_html_tags", preprocessing.remove_html_tags, (s_text,)],
# ["tokenize", preprocessing.tokenize, (s_text,)],
# ["tokenize_with_phrases", preprocessing.tokenize_with_phrases, (s_text,)],
# ["replace_urls", preprocessing.replace_urls, (s_text, "")],
# ["remove_urls", preprocessing.remove_urls, (s_text,)],
# ["replace_tags", preprocessing.replace_tags, (s_text, "")],
# ["remove_tags", preprocessing.remove_tags, (s_text,)],
# ]
#
# test_cases_representation = [
# ["count", representation.count, (preprocessing.tokenize(s_text),),],
# [
# "term_frequency",
# representation.term_frequency,
# (preprocessing.tokenize(s_text),),
# ],
# ["tfidf", representation.tfidf, (preprocessing.tokenize(s_text),)],
# ["pca", representation.pca, (s_numeric_lists, 0)],
# ["nmf", representation.nmf, (s_numeric_lists,)],
# ["tsne", representation.tsne, (s_numeric_lists,)],
# ["kmeans", representation.kmeans, (s_numeric_lists, 1)],
# ["dbscan", representation.dbscan, (s_numeric_lists,)],
# ["meanshift", representation.meanshift, (s_numeric_lists,)],
# ]
#
# test_cases_visualization = []
#
# test_cases = (
# test_cases_nlp
# + test_cases_preprocessing
# + test_cases_representation
# + test_cases_visualization
# )


class AbstractIndexTest(PandasTestCase):
    """
    Parameterized index-preservation tests.

    For every entry in ``test_cases`` this class verifies that the tested
    function keeps the index of its first (Series) argument intact.
    Functions that take different inputs need a hand-written test instead.
    """

    @parameterized.expand(test_cases)
    def test_correct_index(self, name, test_function, valid_input):
        original = valid_input[0]
        transformed = test_function(*valid_input)
        # Rebuild a series carrying exactly the input's index; the output
        # must match it element for element.
        reference = pd.Series(original.values, index=original.index)
        self.assertTrue(transformed.index.equals(reference.index))

    @parameterized.expand(test_cases)
    def test_incorrect_index(self, name, test_function, valid_input):
        original = valid_input[0]
        transformed = test_function(*valid_input)
        # index=None produces a fresh default RangeIndex, which must NOT
        # match the non-default index the input series was built with.
        mismatched = pd.Series(original.values, index=None)
        self.assertFalse(transformed.index.equals(mismatched.index))
126 changes: 126 additions & 0 deletions tests/lang/zh/test_preprocessing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
import string

import pandas as pd
import numpy as np
import doctest

from texthero.lang.zh import preprocessing, stopwords
from ... import PandasTestCase


"""
Test doctest
"""


def load_tests(loader, tests, ignore):
tests.addTests(doctest.DocTestSuite(preprocessing))
return tests


class TestPreprocessing(PandasTestCase):
    """Unit tests for ``texthero.lang.zh.preprocessing``.

    Each test builds a small Chinese-text Series, applies one
    preprocessing function, and compares the result with the expected
    Series via ``PandasTestCase.assertEqual``.

    Note: the original file used bare class-level strings as section
    headers; the first one ("Remove whitespace.") silently became the
    class docstring. They are replaced with per-method docstrings here.
    """

    def test_remove_whitespace(self):
        """Runs of whitespace (newline, tab) collapse to a single space."""
        s = pd.Series("早上好啊,\n\t我的朋友。今天我要去吃 KFC。")
        s_true = pd.Series("早上好啊, 我的朋友。今天我要去吃 KFC。")
        self.assertEqual(preprocessing.remove_whitespace(s), s_true)

    def test_pipeline_stopwords(self):
        """A custom pipeline applies its steps in order via clean()."""
        s = pd.Series("语言是人类区别其他动物的本质特性。\t@中国NLP第一大师\n#如何定义NLP 为什么呢?")
        s_true = pd.Series("语言是人类区别其他动物的本质特性。 为什么呢?")
        pipeline = [
            preprocessing.remove_whitespace,
            preprocessing.remove_hashtags,
            preprocessing.remove_tags,
        ]
        self.assertEqual(preprocessing.clean(s, pipeline=pipeline), s_true)

    def test_remove_html_tags(self):
        """HTML tags and entities (&nbsp;) are stripped from the text."""
        s = pd.Series("<html> 中国新闻网 <br>体育</br> 标记<html> &nbsp;")
        s_true = pd.Series(" 中国新闻网 体育 标记 ")
        self.assertEqual(preprocessing.remove_html_tags(s), s_true)

    def test_tokenize(self):
        """A single sentence is segmented into a list of Chinese tokens."""
        s = pd.Series("我昨天吃烤鸭去了。")
        s_true = pd.Series([["我", "昨天", "吃", "烤鸭", "去", "了", "。"]])
        self.assertEqual(preprocessing.tokenize(s), s_true)

    def test_tokenize_multirows(self):
        """Tokenization is applied row-wise to a multi-row Series."""
        s = pd.Series(["今天天气真好", "明天会怎样呢"])
        s_true = pd.Series([["今天天气", "真", "好"], ["明天", "会", "怎样", "呢"]])
        self.assertEqual(preprocessing.tokenize(s), s_true)

    def test_has_content(self):
        """NaN, None, empty and whitespace-only cells count as no content."""
        s = pd.Series(["哈哈", np.nan, "\t\n", " ", "", "这有点东西", None])
        s_true = pd.Series([True, False, False, False, False, True, False])
        self.assertEqual(preprocessing.has_content(s), s_true)

    def test_remove_urls(self):
        """http:// URLs are removed (replaced by a space)."""
        s = pd.Series("http://tests.com http://www.tests.com")
        s_true = pd.Series(" ")
        self.assertEqual(preprocessing.remove_urls(s), s_true)

    def test_remove_urls_https(self):
        """https:// URLs are removed as well."""
        s = pd.Series("https://tests.com https://www.tests.com")
        s_true = pd.Series(" ")
        self.assertEqual(preprocessing.remove_urls(s), s_true)

    def test_remove_urls_multiline(self):
        """URL removal works across newlines and preserves them."""
        s = pd.Series("https://tests.com \n https://tests.com")
        s_true = pd.Series(" \n ")
        self.assertEqual(preprocessing.remove_urls(s), s_true)

    def test_replace_tags(self):
        """An @-mention (Chinese + alphanumerics) becomes the given symbol."""
        s = pd.Series("你好@马丁123abc佩奇,我要把你取关了。")
        s_true = pd.Series("你好TAG,我要把你取关了。")

        self.assertEqual(preprocessing.replace_tags(s, symbol="TAG"), s_true)

    def test_remove_tags(self):
        """An @-mention is removed (replaced by a space)."""
        s = pd.Series("你好@马丁123abc佩奇,我要把你取关了。")
        s_true = pd.Series("你好 ,我要把你取关了。")

        self.assertEqual(preprocessing.remove_tags(s), s_true)

    def test_replace_hashtags(self):
        """A #hashtag (Chinese + alphanumerics) becomes the given symbol."""
        s = pd.Series("语言是人类区别其他动物的本质特性。#NLP百科大全")
        s_true = pd.Series("语言是人类区别其他动物的本质特性。HASHTAG")

        self.assertEqual(preprocessing.replace_hashtags(s, symbol="HASHTAG"), s_true)

    def test_remove_hashtags(self):
        """A #hashtag is removed (replaced by a space)."""
        s = pd.Series("语言是人类区别其他动物的本质特性。#NLP百科大全")
        s_true = pd.Series("语言是人类区别其他动物的本质特性。 ")

        self.assertEqual(preprocessing.remove_hashtags(s), s_true)
3 changes: 3 additions & 0 deletions texthero/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,6 @@
from .nlp import *

from . import stopwords

from . import lang
from .lang import *
1 change: 1 addition & 0 deletions texthero/lang/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from . import zh
18 changes: 18 additions & 0 deletions texthero/lang/zh/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
"""Texthero: python toolkit for text preprocessing, representation and visualization.



"""
from . import preprocessing
from .preprocessing import *

# from . import representation
# from .representation import *
#
# from . import visualization
# from .visualization import *
#
# from . import nlp
# from .nlp import *

from . import stopwords
Loading