In [1]:
import spacy
import gensim
from gensim import corpora
from pprint import pprint

In [2]:
# Load spaCy's small English pipeline, downloading it only if it is missing.
# spacy.load raises OSError when the package is not installed, so the
# (network-bound) download + pip install no longer runs on every
# Restart & Run All once the model is present.
try:
    nlp = spacy.load('en_core_web_sm')
except OSError:
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load('en_core_web_sm')

✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [3]:
# Toy corpus: four one-sentence documents, all about machine learning.
documents = [
    "Machine learning involves algorithms that improve automatically through experience.",
    "Deep learning is a subset of machine learning based on artificial neural networks.",
    "Reinforcement learning is a type of machine learning technique.",
    "Supervised and unsupervised learning are two main types of machine learning."
]

In [4]:

def preprocess(text):
    """Convert raw text into a list of lemmas for topic modelling.

    The text is parsed with the notebook-level spaCy pipeline ``nlp``.
    Tokens flagged as stop words (``is_stop``) or as not alphabetic
    (``is_alpha`` false) are discarded; the lemma of every remaining
    token is returned, in document order.
    """
    lemmas = []
    for tok in nlp(text):
        # Skip stop words and anything that is not purely alphabetic.
        if tok.is_stop or not tok.is_alpha:
            continue
        lemmas.append(tok.lemma_)
    return lemmas

# Apply preprocessing to all documents
processed_documents = [preprocess(doc) for doc in documents]

# Map each unique lemma to an integer id, then encode every document as a
# bag-of-words representation built from that dictionary.
dictionary = corpora.Dictionary(processed_documents)
corpus = [dictionary.doc2bow(doc) for doc in processed_documents]

# Build LDA model. Training is stochastic, so random_state is pinned to make
# the learned topics (and anything derived from them) reproducible across
# kernel restarts.
lda_model = gensim.models.LdaModel(
    corpus,
    num_topics=4,
    id2word=dictionary,
    passes=20,
    random_state=42,
)



In [5]:
# Show the weighted terms reported for each learned topic, pretty-printed.
topic_summaries = lda_model.print_topics()
pprint(topic_summaries)

[(0,
  '0.212*"learning" + 0.131*"machine" + 0.091*"type" + 0.051*"involve" + '
  '0.051*"automatically" + 0.051*"unsupervised" + 0.051*"main" + '
  '0.051*"algorithm" + 0.051*"supervise" + 0.051*"improve"'),
 (1,
  '0.053*"machine" + 0.053*"learning" + 0.053*"technique" + '
  '0.053*"reinforcement" + 0.053*"improve" + 0.053*"experience" + '
  '0.053*"involve" + 0.053*"automatically" + 0.053*"algorithm" + '
  '0.053*"supervise"'),
 (2,
  '0.053*"machine" + 0.053*"learning" + 0.053*"technique" + '
  '0.053*"reinforcement" + 0.053*"experience" + 0.053*"algorithm" + '
  '0.053*"improve" + 0.053*"automatically" + 0.053*"involve" + '
  '0.053*"supervise"'),
 (3,
  '0.164*"learning" + 0.091*"machine" + 0.091*"base" + 0.091*"artificial" + '
  '0.091*"deep" + 0.091*"network" + 0.091*"neural" + 0.091*"subset" + '
  '0.018*"reinforcement" + 0.018*"technique"')]


In [6]:
# Assign topics to documents. Iterate the bag-of-words corpus directly,
# numbering documents from 1 for display, rather than looping over a
# parallel list only to index back into corpus.
for doc_number, bow in enumerate(corpus, start=1):
    print(f"Document {doc_number} - Topic: {lda_model.get_document_topics(bow)}")

Document 1 - Topic: [(0, 0.9055359), (1, 0.03140766), (2, 0.031407688), (3, 0.0316488)]
Document 2 - Topic: [(0, 0.025963092), (1, 0.025076207), (2, 0.025076203), (3, 0.9238846)]
Document 3 - Topic: [(0, 0.89182305), (1, 0.035832524), (2, 0.035832524), (3, 0.03651195)]
Document 4 - Topic: [(0, 0.90542287), (1, 0.03136699), (2, 0.03136699), (3, 0.031843163)]
