Topic Modelling and Visualization #163

Open
wants to merge 50 commits into base: master

50 commits
fa342a9
added MultiIndex DF support
mk2510 Aug 18, 2020
59a9f8c
beginning with tests
henrifroese Aug 19, 2020
19c52de
implemented correct sparse support
mk2510 Aug 19, 2020
66e566c
Merge branch 'master_upstream' into change_representation_to_multicolumn
mk2510 Aug 21, 2020
41f55a8
added back list() and rm .tolist()
mk2510 Aug 21, 2020
217611a
rm .tolist() and added list()
mk2510 Aug 21, 2020
6a3b56d
Adopted the test to the new dataframes
mk2510 Aug 21, 2020
b8ff561
wrong format
mk2510 Aug 21, 2020
e3af2f9
Address most review comments.
henrifroese Aug 21, 2020
77ad80e
Add more unittests for representation
henrifroese Aug 21, 2020
bee2157
Initial commit to add topic modelling
henrifroese Aug 24, 2020
dece7b5
add pyLDAvis to dependencies
henrifroese Aug 24, 2020
6387ce9
add return_figure option
henrifroese Aug 24, 2020
01c0818
allow display in Console and Jupyter Notebooks
henrifroese Aug 24, 2020
9cd113c
tsvd
mk2510 Aug 24, 2020
187d8f5
Change display at end of function
henrifroese Aug 24, 2020
d9d032c
Merge branch 'topic_modelling' of https://github.com/SummerOfCode-NoH…
henrifroese Aug 24, 2020
85089b1
change display
henrifroese Aug 24, 2020
77d815a
change display for notebook again
henrifroese Aug 24, 2020
242383a
added lda
mk2510 Aug 24, 2020
7edac3a
Merge remote-tracking branch 'origin/topic_modelling' into topic_mode…
mk2510 Aug 24, 2020
9e09e7a
Add tests
henrifroese Aug 24, 2020
fc54ff2
Merge branch 'topic_modelling' of https://github.com/SummerOfCode-NoH…
henrifroese Aug 24, 2020
46289f2
Format; change name; remove new type Signature
henrifroese Aug 24, 2020
bcfa78d
updatewd test
mk2510 Aug 24, 2020
eb2d31b
add docstring
henrifroese Aug 24, 2020
1fd16eb
Merge branch 'topic_modelling' of https://github.com/SummerOfCode-NoH…
henrifroese Aug 24, 2020
3a39346
Implement matrix multiplication changes; fix metadata error
henrifroese Aug 24, 2020
4ad7ee8
added test for lda and tSVD
mk2510 Aug 25, 2020
65504bb
Implement top_words_per_document, top_words_per_topic, topics_from_to…
henrifroese Aug 25, 2020
76e7689
added tests for topic functions
mk2510 Aug 25, 2020
3dfd528
Add docstrings and function comments to new topic modelling functions…
henrifroese Aug 25, 2020
6222448
Merge remote-tracking branch 'origin/topic_modelling' into topic_mode…
henrifroese Aug 25, 2020
79fc37e
fixed index and test
mk2510 Aug 25, 2020
2db883d
- Fix display options
henrifroese Aug 25, 2020
d1c582b
Fix plot visualization
henrifroese Aug 25, 2020
72a7736
remove return_figure parameter
henrifroese Aug 25, 2020
ebc4171
Fix errors and bugs.
henrifroese Aug 25, 2020
9d85c14
remove test-docstring at the end
henrifroese Aug 25, 2020
64edbaf
Start implementing discussed changes
henrifroese Aug 29, 2020
69af26b
Finish implementing the suggested changes.
henrifroese Aug 30, 2020
b9eaf1f
incorporate suggested changes from review
henrifroese Sep 12, 2020
6c30a5e
Fix pyLDAvis PCoA issue.
henrifroese Sep 18, 2020
d12ba7e
Add comment to docstring.
henrifroese Sep 18, 2020
a571925
import _helper in __init__ to overwrite pyLDAvis change
henrifroese Sep 18, 2020
a75aebe
enable auto-display for jupyter notebooks
henrifroese Sep 18, 2020
f8a09c4
Merge branch 'master_upstream' into topic_modelling
mk2510 Sep 22, 2020
cfc78d9
fixed vector series, as pca returns an array
mk2510 Sep 22, 2020
4c5aa0b
fixed the last merged issues
mk2510 Sep 22, 2020
dc42ed1
fix formatting
mk2510 Sep 22, 2020
1 change: 1 addition & 0 deletions setup.cfg
@@ -38,6 +38,7 @@ install_requires =
unidecode>=1.1.1
gensim>=3.6.0
matplotlib>=3.1.0
pyLDAvis>=2.1.2
# TODO pick the correct version.
[options.extras_require]
dev =
33 changes: 31 additions & 2 deletions tests/test_indexes.py
@@ -12,6 +12,13 @@
s_tokenized_lists = pd.Series([["Test", "Test2"], ["Test3"]], index=[5, 6])
s_numeric = pd.Series([5.0], index=[5])
s_numeric_lists = pd.Series([[5.0, 5.0], [6.0, 6.0]], index=[5, 6])
df_document_term = pd.DataFrame(
[[0.125, 0.0, 0.0, 0.125, 0.250], [0.0, 0.25, 0.125, 0.0, 0.125]],
index=[5, 6],
columns=pd.MultiIndex.from_product([["test"], ["!", ".", "?", "TEST", "Test"]]),
dtype="Sparse",
)
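As an aside, the `MultiIndex.from_product` call in this fixture expands one top-level label across all five terms; a quick sketch of what the resulting columns look like:

```python
import pandas as pd

# Same construction as the df_document_term fixture above:
# one top level ("test") crossed with five term labels.
cols = pd.MultiIndex.from_product([["test"], ["!", ".", "?", "TEST", "Test"]])
print(list(cols))
# [('test', '!'), ('test', '.'), ('test', '?'), ('test', 'TEST'), ('test', 'Test')]
```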


# Define all test cases. Every test case is a list
# of [name of test case, function to test, tuple of valid input for the function].
@@ -62,9 +69,21 @@
["pca", representation.pca, (s_numeric_lists, 0)],
["nmf", representation.nmf, (s_numeric_lists,)],
["tsne", representation.tsne, (s_numeric_lists,)],
["truncatedSVD", representation.tsne, (s_numeric_lists, 1)],
["lda", representation.tsne, (s_numeric_lists, 1)],
["kmeans", representation.kmeans, (s_numeric_lists, 1)],
["dbscan", representation.dbscan, (s_numeric_lists,)],
["meanshift", representation.meanshift, (s_numeric_lists,)],
[
"topics_from_topic_model",
representation.topics_from_topic_model,
(s_numeric_lists,),
],
[
"top_words_per_document",
representation.relevant_words_per_document,
(df_document_term,),
],
]

test_cases_visualization = []
@@ -94,12 +113,22 @@ class AbstractIndexTest(PandasTestCase):
def test_correct_index(self, name, test_function, valid_input):
s = valid_input[0]
result_s = test_function(*valid_input)
if isinstance(s, pd.Series):
t_same_index = pd.Series(s.values, s.index)
else:
t_same_index = pd.DataFrame(s.values, s.index)

self.assertTrue(result_s.index.equals(t_same_index.index))

@parameterized.expand(test_cases)
def test_incorrect_index(self, name, test_function, valid_input):
s = valid_input[0]
result_s = test_function(*valid_input)
if isinstance(s, pd.Series):
t_different_index = pd.Series(s.values, index=None)
else:
t_different_index = pd.DataFrame(s.values, index=None)

self.assertFalse(result_s.index.equals(t_different_index.index))
157 changes: 157 additions & 0 deletions tests/test_representation.py
@@ -268,3 +268,160 @@ def test_normalize_DataFrame_also_as_output(self):
pd.testing.assert_frame_equal(
result, correct_output, check_dtype=False, rtol=0.1, atol=0.1,
)

"""
Test Topic Modelling (not all are suitable for parameterization).
`topics_from_topic_model, lda, truncatedSVD` already tested above.

Here, we test
`relevant_words_per_document, relevant_words_per_topic, topic_matrices`
"""

def test_relevant_words_per_document(self):
s = pd.Series(
[
"Football, Sports, Soccer",
"music, violin, orchestra",
"football, fun, sports",
"music, band, guitar",
]
)

s_tfidf = (
s.pipe(preprocessing.clean)
.pipe(preprocessing.tokenize)
.pipe(representation.tfidf)
)
s_result = representation.relevant_words_per_document(s_tfidf, n_words=2)

s_true = pd.Series(
[
["soccer", "sports"],
["violin", "orchestra"],
["fun", "sports"],
["guitar", "band"],
],
)
pd.testing.assert_series_equal(s_result, s_true)

def test_relevant_words_per_topic(self):
s = pd.Series(
[
"Football, Sports, Soccer",
"music, violin, orchestra",
"football, fun, sports",
"music, band, guitar",
]
)
s_tfidf = (
s.pipe(preprocessing.clean)
.pipe(preprocessing.tokenize)
.pipe(representation.tfidf)
)
s_cluster = (
s_tfidf.pipe(representation.normalize)
.pipe(representation.pca, n_components=2, random_state=42)
.pipe(representation.kmeans, n_clusters=2, random_state=42)
)

s_document_topic, s_topic_term = representation.topic_matrices(
s_tfidf, s_cluster
)
s_document_topic_distribution = representation.normalize(
s_document_topic, norm="l1"
)
s_topic_term_distribution = representation.normalize(s_topic_term, norm="l1")

s_result = representation.relevant_words_per_topic(
s_tfidf, s_document_topic_distribution, s_topic_term_distribution, n_words=3
)
s_true = pd.Series(
[["music", "violin", "orchestra"], ["sports", "football", "soccer"]],
)
pd.testing.assert_series_equal(s_result, s_true, check_names=False)

def test_topic_matrices_clustering_for_second_input(self):

s = pd.Series(["Football", "Music", "Football", "Music",])

s_tfidf = (
s.pipe(preprocessing.clean)
.pipe(preprocessing.tokenize)
.pipe(representation.tfidf)
)
s_cluster = (
s_tfidf.pipe(representation.normalize)
.pipe(representation.pca, n_components=2, random_state=42)
.pipe(representation.kmeans, n_clusters=2, random_state=42)
)

s_document_topic_result, s_topic_term_result = representation.topic_matrices(
s_tfidf, s_cluster
)

s_document_topic_true = pd.DataFrame(
[[0, 1], [1, 0], [0, 1], [1, 0]], columns=[0, 1]
)

s_topic_term_true = pd.DataFrame(
[[0.0, 3.021651], [3.021651, 0.0]], columns=["football", "music"]
)

pd.testing.assert_frame_equal(
s_document_topic_result,
s_document_topic_true,
check_less_precise=True,
check_dtype=False,
)

pd.testing.assert_frame_equal(
s_topic_term_result,
s_topic_term_true,
check_less_precise=True,
check_dtype=False,
)

def test_visualize_topics_topic_modelling_for_second_input(self):

s = pd.Series(["Football", "Music", "Football", "Music",])

s_tfidf = (
s.pipe(preprocessing.clean)
.pipe(preprocessing.tokenize)
.pipe(representation.tfidf)
)
s_lda = s_tfidf.pipe(representation.normalize).pipe(
representation.lda, n_components=2, random_state=42
)

s_document_topic_result, s_topic_term_result = representation.topic_matrices(
s_tfidf, s_lda
)

s_document_topic_true = pd.DataFrame(
[
[0.744417, 0.255583],
[0.255583, 0.744417],
[0.744417, 0.255583],
[0.255583, 0.744417],
],
columns=[0, 1],
)

s_topic_term_true = pd.DataFrame(
[[2.249368, 0.772283], [0.772283, 2.249369]], columns=["football", "music"],
)

pd.testing.assert_frame_equal(
s_document_topic_result,
s_document_topic_true,
check_less_precise=True,
check_dtype=False,
)

pd.testing.assert_frame_equal(
s_topic_term_result,
s_topic_term_true,
check_less_precise=True,
check_dtype=False,
)
60 changes: 59 additions & 1 deletion tests/test_visualization.py
@@ -2,8 +2,9 @@

import pandas as pd
import doctest
import warnings

from texthero import visualization
from texthero import visualization, preprocessing, representation
from . import PandasTestCase


@@ -79,3 +80,60 @@ def test_top_words_digits_punctuation(self):
def test_wordcloud(self):
s = pd.Series("one two three")
self.assertEqual(visualization.wordcloud(s), None)

"""
Test visualize_topics
"""

def test_visualize_topics_clustering_for_second_input(self):

s = pd.Series(
[
"Football, Sports, Soccer",
"music, violin, orchestra",
"football, fun, sports",
"music, band, guitar",
]
)

s_tfidf = (
s.pipe(preprocessing.clean)
.pipe(preprocessing.tokenize)
.pipe(representation.tfidf)
)
s_cluster = (
s_tfidf.pipe(representation.normalize)
.pipe(representation.pca, n_components=2)
.pipe(representation.kmeans, n_clusters=2)
)
with warnings.catch_warnings():
warnings.simplefilter("ignore")
self.assertIsNotNone(
visualization.visualize_topics(s_tfidf, s_cluster, return_figure=True)
)

def test_visualize_topics_topic_modelling_for_second_input(self):

s = pd.Series(
[
"Football, Sports, Soccer",
"music, violin, orchestra",
"football, fun, sports",
"music, band, guitar",
]
)

s_tfidf = (
s.pipe(preprocessing.clean)
.pipe(preprocessing.tokenize)
.pipe(representation.tfidf)
)
s_lda = s_tfidf.pipe(representation.normalize).pipe(
representation.lda, n_components=2
)

with warnings.catch_warnings():
warnings.simplefilter("ignore")
self.assertIsNotNone(
visualization.visualize_topics(s_tfidf, s_lda, return_figure=True)
)
2 changes: 2 additions & 0 deletions texthero/__init__.py
@@ -16,3 +16,5 @@
from .nlp import *

from . import stopwords

from . import _helper
56 changes: 56 additions & 0 deletions texthero/_helper.py
@@ -2,7 +2,9 @@
Useful helper functions for the texthero library.
"""

import pyLDAvis
import pandas as pd
import numpy as np
import functools
import warnings

@@ -71,3 +73,57 @@ def wrapper(*args, **kwargs):
return wrapper

return decorator


"""
For representation.relevant_words_per_topic:

Redefinition of PCoA from pyLDAvis to support
big datasets. The only thing we change is the line
`eigvals, eigvecs = np.linalg.eigh(B)`, which was before
`eigvals, eigvecs = np.linalg.eig(B)`. Apart from that,
every line is the same as in pyLDAvis! Without this change,
we get complex eigenvalues with all complex components = 0
due to floating point errors, see e.g.
https://stackoverflow.com/questions/8765310/scipy-linalg-eig-return-complex-eigenvalues-for-covariance-matrix

The change is safe and makes sense as the input matrix `pair_dists`
(pairwise distances) is always a symmetric matrix.

"""


def _hero_pcoa(pair_dists, n_components=2):
"""Principal Coordinate Analysis,
aka Classical Multidimensional Scaling
"""
# code referenced from skbio.stats.ordination.pcoa
# https://github.com/biocore/scikit-bio/blob/0.5.0/skbio/stats/ordination/_principal_coordinate_analysis.py

# pairwise distance matrix is assumed symmetric
pair_dists = np.asarray(pair_dists, np.float64)

# perform SVD on double centred distance matrix
n = pair_dists.shape[0]
H = np.eye(n) - np.ones((n, n)) / n
B = -H.dot(pair_dists ** 2).dot(H) / 2
eigvals, eigvecs = np.linalg.eigh(B) # CHANGED BY US

# Take first n_components of eigenvalues and eigenvectors
# sorted in decreasing order
ix = eigvals.argsort()[::-1][:n_components]
eigvals = eigvals[ix]
eigvecs = eigvecs[:, ix]

# replace any remaining negative eigenvalues and associated eigenvectors with zeroes
# at least 1 eigenvalue must be zero
eigvals[np.isclose(eigvals, 0)] = 0
if np.any(eigvals < 0):
ix_neg = eigvals < 0
eigvals[ix_neg] = np.zeros(eigvals[ix_neg].shape)
eigvecs[:, ix_neg] = np.zeros(eigvecs[:, ix_neg].shape)

return np.sqrt(eigvals) * eigvecs


pyLDAvis._prepare._pcoa = _hero_pcoa
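A minimal sketch of why `eigh` is the right call here: `np.linalg.eig` treats its input as a general matrix, so on large symmetric inputs rounding errors can surface as complex eigenvalues with near-zero imaginary parts, while `np.linalg.eigh` exploits the symmetry and always returns real eigenvalues (in ascending order). The small matrix below stands in for the double-centred distance matrix `B` computed in `_hero_pcoa`:

```python
import numpy as np

# A small symmetric matrix standing in for the double-centred
# distance matrix B computed in _hero_pcoa above.
B = np.array([[2.0, -1.0], [-1.0, 2.0]])

# eigh exploits symmetry: eigenvalues come back real and ascending.
eigvals, eigvecs = np.linalg.eigh(B)
print(eigvals)  # [1. 3.] -- real dtype, no complex components
```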
5 changes: 4 additions & 1 deletion texthero/_types.py
@@ -59,6 +59,7 @@ def tfidf(s: TokenSeries) -> DataFrame:

import functools
import pandas as pd
import numpy as np

from typing import Tuple

@@ -198,7 +199,9 @@ def is_numeric(x):
return True

def is_list_of_numbers(cell):
return isinstance(cell, (list, tuple)) and all(is_numeric(x) for x in cell)
return isinstance(cell, (list, tuple, np.ndarray)) and all(
is_numeric(x) for x in cell
)

try:
first_non_nan_value = s.loc[s.first_valid_index()]
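The widened `is_list_of_numbers` check can be sketched standalone. Note the `is_numeric` stand-in below is hypothetical (the real helper in `_types.py` is defined elsewhere in the file); the point is only that `np.ndarray` cells, e.g. the rows `pca` now returns, pass the type check:

```python
import numpy as np

def is_numeric(x):
    # Hypothetical stand-in for the _types.py helper: True for numbers.
    try:
        float(x)
        return True
    except (TypeError, ValueError):
        return False

def is_list_of_numbers(cell):
    # np.ndarray is now accepted alongside list and tuple.
    return isinstance(cell, (list, tuple, np.ndarray)) and all(
        is_numeric(x) for x in cell
    )

print(is_list_of_numbers([1.0, 2.0]))            # True
print(is_list_of_numbers(np.array([1.0, 2.0])))  # True
print(is_list_of_numbers("not a vector"))        # False
```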