In [None]:
import numpy as np
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics import classification_report
from gensim.utils import simple_preprocess
from gensim.models.phrases import Phrases, Phraser

# Load every split of the 20 Newsgroups dataset with headers, footers and
# quoted replies removed from each post.
newsgroups_data = fetch_20newsgroups(
    subset='all',
    remove=('headers', 'footers', 'quotes'),
)
# Raw post texts and their integer class labels.
documents, y = newsgroups_data.data, newsgroups_data.target

# Preprocessing documents
def preprocess(docs):
    """Tokenize raw documents and merge detected bigrams into single tokens.

    Args:
        docs: Iterable of raw document strings.

    Returns:
        A list with one entry per input document: the document's tokens
        after the learned bigram model has been applied to them.
    """
    # Tokenize each document with gensim's simple_preprocess
    token_lists = [simple_preprocess(text) for text in docs]
    # Learn bigrams over the whole tokenized collection (pairs must occur at
    # least 20 times), then wrap the result in a Phraser to apply it
    phraser = Phraser(Phrases(token_lists, min_count=20))
    merged = []
    for tokens in token_lists:
        merged.append(phraser[tokens])
    return merged

# Tokenize the raw posts and merge detected bigrams
processed_docs = preprocess(documents)

# Build the vocabulary from the processed documents, then prune it with
# filter_extremes (no_below=15, no_above=0.5, at most 100000 entries kept)
# to limit the number of features
dictionary = Dictionary(processed_docs)
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

# Encode every processed document as a bag-of-words vector
corpus = list(map(dictionary.doc2bow, processed_docs))

# Fit a 10-topic LDA model on the bag-of-words corpus
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=10,
    random_state=42,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
)

# Use LDA model to get the topic distribution for each document
def get_document_topics(corpus, model):
    """Build a dense document-by-topic matrix from a topic model.

    Args:
        corpus: Iterable of documents in the form the model accepts
            (bag-of-words vectors when used with a gensim ``LdaModel``).
        model: Object exposing ``num_topics`` and
            ``get_document_topics(doc, minimum_probability=...)`` that yields
            ``(topic_id, probability)`` pairs.

    Returns:
        ``numpy.ndarray`` of shape ``(n_documents, model.num_topics)`` where
        row ``i`` holds the topic probabilities of the ``i``-th document.
        An empty corpus yields an array of shape ``(0, model.num_topics)``.
    """
    num_topics = model.num_topics
    rows = []
    for bow in corpus:
        row = np.zeros(num_topics)
        # Request every topic explicitly: with the default threshold,
        # low-probability topics are omitted from the result and would be
        # left as zeros, so the feature rows would be truncated and would
        # no longer sum to 1.
        for topic_id, prob in model.get_document_topics(bow, minimum_probability=0.0):
            row[topic_id] = prob
        rows.append(row)
    if not rows:
        # Keep the result 2-D so downstream estimators see a consistent shape
        return np.zeros((0, num_topics))
    return np.array(rows)

# Represent every document by its LDA topic distribution (one row per document)
X = get_document_topics(corpus, lda_model)

# Splitting the dataset for training and testing (25% held out for evaluation)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Train a simple logistic regression model on the topic features
classifier = LogisticRegression(random_state=42, max_iter=1000)
classifier.fit(X_train, y_train)

# Predict and evaluate the model.
# zero_division=0 reports 0.0 for classes that are never predicted instead of
# emitting an UndefinedMetricWarning for each ill-defined metric.
y_pred = classifier.predict(X_test)
print(classification_report(y_test, y_pred, zero_division=0))

              precision    recall  f1-score   support

           0       0.24      0.21      0.23       198
           1       0.26      0.22      0.24       245
           2       0.12      0.05      0.07       242
           3       0.26      0.39      0.31       238
           4       0.10      0.04      0.06       250
           5       0.29      0.28      0.29       260
           6       0.28      0.33      0.30       241
           7       0.22      0.17      0.19       244
           8       0.23      0.45      0.30       219
           9       0.41      0.39      0.40       261
          10       0.59      0.65      0.62       245
          11       0.33      0.34      0.34       251
          12       0.19      0.14      0.16       249
          13       0.31      0.40      0.35       249
          14       0.25      0.42      0.31       240
          15       0.40      0.68      0.51       245
          16       0.24      0.23      0.23       230
          17       0.42    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
