In [1]:
import logging
import math
import numpy as np
import scipy.sparse
from scipy.stats import entropy
import scipy.linalg
from scipy.linalg.lapack import get_lapack_funcs
from scipy.linalg.special_matrices import triu
from scipy.special import psi  # gamma function utils

class Sparse2Corpus:
    """Convert a matrix in scipy.sparse format into a streaming Gensim corpus.
    See Also
    --------
    :func:`~gensim.matutils.corpus2csc`
        Convert gensim corpus format to `scipy.sparse.csc` matrix
    :class:`~gensim.matutils.Dense2Corpus`
        Convert dense matrix to gensim corpus.
    """
    def __init__(self, sparse, documents_columns=True):
        """
        Parameters
        ----------
        sparse : `scipy.sparse`
            Corpus scipy sparse format
        documents_columns : bool, optional
            Documents will be column?
        """
        if documents_columns:
            self.sparse = sparse.tocsc()
        else:
            self.sparse = sparse.tocsr().T  # make sure shape[1]=number of docs (needed in len())

    def __iter__(self):
        """
        Yields
        ------
        list of (int, float)
            Document in BoW format.
        """
        for indprev, indnow in zip(self.sparse.indptr, self.sparse.indptr[1:]):
            yield list(zip(self.sparse.indices[indprev:indnow], self.sparse.data[indprev:indnow]))

    def __len__(self):
        return self.sparse.shape[1]

    def __getitem__(self, key):
        """
        Retrieve a document vector or subset from the corpus by key.
        Parameters
        ----------
        key: int, ellipsis, slice, iterable object
            Index of the document retrieve.
            Less commonly, the key can also be a slice, ellipsis, or an iterable
            to retrieve multiple documents.
        Returns
        -------
        list of (int, number), Sparse2Corpus
            Document in BoW format when `key` is an integer. Otherwise :class:`~gensim.matutils.Sparse2Corpus`.
        """
        sparse = self.sparse
        if isinstance(key, int):
            iprev = self.sparse.indptr[key]
            inow = self.sparse.indptr[key + 1]
            return list(zip(sparse.indices[iprev:inow], sparse.data[iprev:inow]))

        sparse = self.sparse.__getitem__((slice(None, None, None), key))
        return Sparse2Corpus(sparse)


In [2]:
from src.prepare_dataset import TextDataLoader
from src.evaluierung import topicCoherence2, topicDiversity

from gensim.parsing.preprocessing import preprocess_string, strip_punctuation, strip_numeric
from pathlib import Path
from tqdm import tqdm
import gensim

In [3]:
stopwords_filter = True

In [4]:
def get_data(min_df):
    textsloader = TextDataLoader(source="20newsgroups", 
                             train_size=None, test_size=None)
    textsloader.load_tokenize_texts("20newsgroups")
    textsloader.preprocess_texts(length_one_remove=True, 
                                 punctuation_lower = True, 
                                 stopwords_filter = stopwords_filter)
    textsloader.split_and_create_voca_from_trainset(max_df=0.7, min_df=min_df, 
                                                    stopwords_remove_from_voca=stopwords_filter)

    for_lda_model = True
    # bow must be first and can get the get_docs_in_words_for_each_set()
    word2id, id2word, train_set, test_set, val_set = textsloader.create_bow_and_savebow_for_each_set(for_lda_model=for_lda_model, 
                                                                                                     normalize = True)
    textsloader.write_info_vocab_to_text()
    del textsloader
    return train_set, id2word

def get_lda_topics(train_set, id2word):
    num_topics = 5
    myid2word = id2word
    from gensim.models import LdaModel
    ldamodel = LdaModel(train_set, num_topics=5, id2word = id2word, random_state = 42)
    lda_topics = ldamodel.show_topics(num_topics=50, num_words=25)
    topics = []
    filters = [lambda x: x.lower(), strip_punctuation, strip_numeric]
    for topic in lda_topics:
        topics.append(preprocess_string(topic[1], filters))
    del ldamodel
    del lda_topics
    return topics

In [5]:
from IPython.display import clear_output

In [18]:
for min_df in [10,30,100]:
    clear_output(wait=True)
    train_set, id2word = get_data(min_df)
    print(f'len of vocab: {len(id2word.keys())}')
    print(train_set.sparse.shape)
    print(get_lda_topics(train_set, id2word))

loading texts: ...
train-size after loading: 11314
test-size after loading: 7532
finished load!
start: preprocessing: ...
finised: preprocessing!
vocab-size in df: 8496
validation-size ist: 0.01
start creating vocabulary ...
length of the vocabulary: 8496
sample ten words of the vocabulary: ['favour', 'rm', 'meeting', 'persuade', 'chan', 'basic', 'tracking', 'macs', 'insists', 'oak']
length word2id list: 8496
length id2word list: 8496
finished: creating vocabulary
train-size-after-all: 11214
test-size-after-all: 7532
validation-size-after-all: 100
test-size-after-all: 11214
test-indices-length: 11214
test-size-after-all: 100
test-indices-length: 100
test-size-after-all: 7532
test-indices-length: 7532
length train-documents-indices : 1150368
length of the vocabulary: 8496


start: creating bow representation...
finised creating bow input!

start: creating bow representation...
finised creating bow input!

start: creating bow representation...
finised creating bow input!

start: creating

IndexError: index 9011 is out of bounds for axis 1 with size 8496

In [None]:
"""
classgensim.models.ldamodel.LdaModel(
    corpus=None, num_topics=100, 
    id2word=None, distributed=False, 
    chunksize=2000, passes=1, 
    update_every=1, alpha='symmetric', 
    eta=None, decay=0.5, offset=1.0, eval_every=10, 
    iterations=50, gamma_threshold=0.001, minimum_probability=0.01, 
    random_state=None, ns_conf=None, minimum_phi_value=0.01, 
    per_word_topics=False, callbacks=None, dtype=<class 'numpy.float32'>)
"""

In [20]:
import pandas as pd
pd.DataFrame(train_set.sparse.toarray())#.shape

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,8486,8487,8488,8489,8490,8491,8492,8493,8494,8495
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11209,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11210,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11211,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11212,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
