jbesomi · henrifroese · Aug 18, 2020 · Aug 19, 2020 · Aug 19, 2020 · Aug 21, 2020
diff --git a/tests/test_types.py b/tests/test_types.py
@@ -42,6 +42,22 @@ def f(s):
 
         self.assertRaises(TypeError, f, pd.Series([["token", "ized"]]))
 
+    def test_inputseries_function_executes_correctly_CategorySeries(self):  # NOTE(review): name says CategorySeries, type under test is ClusterSeries
+        @_types.InputSeries(_types.ClusterSeries)
+        def f(s, t):
+            return t
+
+        s = pd.Series([0, 1], dtype="category")  # categorical dtype, so the input check should pass
+        t = "test"
+        self.assertEqual(f(s, t), t)  # the decorated function must still run and return its result
+
+    def test_inputseries_wrong_type_CategorySeries(self):  # NOTE(review): name says CategorySeries, type under test is ClusterSeries
+        @_types.InputSeries(_types.ClusterSeries)
+        def f(s):
+            pass
+
+        self.assertRaises(TypeError, f, pd.Series([0, 1]))  # Series built without dtype="category" must be rejected
+
     def test_inputseries_correct_type_textseries(self):
         @_types.InputSeries(_types.TextSeries)
         def f(s):

diff --git a/texthero/_types.py b/texthero/_types.py
@@ -42,6 +42,7 @@ def tfidf(s: TokenSeries) -> DataFrame:
 - TextSeries: cells are text (i.e. strings), e.g. "Test"
 - TokenSeries: cells are lists of tokens (i.e. lists of strings), e.g. ["word1", "word2"]
 - VectorSeries: cells are vector representations of text, e.g. [0.25, 0.75]
+- ClusterSeries: Series has dtype "category", and every entry is a cluster-ID (e.g. 5 or "topic 1")
 
 The implemented types are lightweight subclasses of pd.Series and serve 2 purposes:
 1. Good documentation for users through docstring.
@@ -88,6 +89,10 @@ class HeroTypes(pd.Series, pd.DataFrame):
     3. VectorSeries: Every cell is a vector representing text, i.e.
     a list of floats. For example, `pd.Series([[1.0, 2.0], [3.0]])` is a valid VectorSeries.
 
+    4. ClusterSeries: Series has dtype "category" and every entry is a
+    cluster-ID (e.g. 5 or "topic 1"). For example, `pd.Series([0, 3, 0, 1], dtype="category")`
+    is a valid ClusterSeries.
+
     Additionally, some Texthero functions (most that accept
     VectorSeries input) accept a Pandas DataFrame as input that is
     representing a matrix.
@@ -171,6 +176,30 @@ def is_list_of_strings(cell):
         return True, ""
 
 
+class ClusterSeries(HeroTypes):
+    """
+    A ClusterSeries has dtype "category" and every entry is a
+    cluster-ID (e.g. 5 or "topic 1"). For example, `pd.Series([0, 3, 0, 1], dtype="category")`
+    is a valid ClusterSeries.
+    """
+
+    @staticmethod
+    def check_type(s: pd.Series) -> Tuple[bool, str]:
+        """
+        Check if a given Pandas Series has the properties of a ClusterSeries.
+
+        Returns (True, "") for dtype "category", otherwise (False, <error message>).
+        """
+        # is_categorical_dtype is deprecated (pandas >= 2.1); objects without .dtype fail the check.
+        if isinstance(getattr(s, "dtype", None), pd.CategoricalDtype):
+            return True, ""
+
+        return False, (
+            "should be ClusterSeries: the given Series does not have dtype 'category'."
+            " See help(hero.HeroTypes) for more information."
+        )
+
+
 class VectorSeries(HeroTypes):
     """
     In a VectorSeries, every cell is a vector representing text, i.e.

diff --git a/texthero/representation.py b/texthero/representation.py
@@ -78,7 +78,7 @@ def count(
 
     min_df : float in range [0.0, 1.0] or int, optional, default=1
         When building the vocabulary ignore terms that have a document
-        frequency (number of documents they appear in) strictly 
+        frequency (number of documents they appear in) strictly
         lower than the given threshold.
         If float, the parameter represents a proportion of documents,
         integer absolute counts.
@@ -154,7 +154,7 @@ def term_frequency(
 
     min_df : float in range [0.0, 1.0] or int, optional, default=1
         When building the vocabulary ignore terms that have a document
-        frequency (number of documents they appear in) strictly 
+        frequency (number of documents they appear in) strictly
         lower than the given threshold.
         If float, the parameter represents a proportion of documents,
         integer absolute counts.
@@ -245,7 +245,7 @@ def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.DataFram
 
     min_df : float in range [0.0, 1.0] or int, optional, default=1
         When building the vocabulary ignore terms that have a document
-        frequency (number of documents they appear in) strictly 
+        frequency (number of documents they appear in) strictly
         lower than the given threshold.
         If float, the parameter represents a proportion of documents, 
         integer absolute counts.
@@ -390,7 +390,7 @@ def nmf(
     natural language processing to find clusters of similar
     texts (e.g. some texts in a corpus might be about sports
     and some about music, so they will differ in the usage
-    of technical terms; see the example below). 
+    of technical terms; see the example below).
 
     Given a document-term matrix (so in
     texthero usually a Series after applying
@@ -436,7 +436,7 @@ def nmf(
     >>> # As we can see, the third document, which
     >>> # is a mix of sports and music, is placed
     >>> # between the two axes (the topics) while
-    >>> # the other documents are placed right on 
+    >>> # the other documents are placed right on
     >>> # one topic axis each.
 
     See also
@@ -587,11 +587,11 @@ def kmeans(
     Performs K-means clustering algorithm on the given input.
 
     K-means clustering is used in natural language processing
-    to separate texts into k clusters (groups) 
+    to separate texts into k clusters (groups)
     (e.g. some texts in a corpus might be about sports
     and some about music, so they will differ in the usage
     of technical terms; the K-means algorithm uses this
-    to separate them into two clusters). 
+    to separate them into two clusters).
 
     Given a document-term matrix (so in
     texthero usually a Series after applying