In [1]:
%pip install spacy
%pip install gensim

In [None]:
# Import necessary libraries
import spacy
import gensim
from gensim import corpora
from pprint import pprint

# Load the small English pipeline; only download it when it is not installed
# yet (spacy.load raises OSError for a missing model), so re-running the
# notebook does not hit the network every time.
try:
    nlp = spacy.load('en_core_web_sm')
except OSError:
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load('en_core_web_sm')


# Documents

In [None]:
# Toy corpus: four short English sentences used as the input for preprocessing
# and LDA topic modelling in the cells below.
documents = [
    "Artificial intelligence is reshaping industries through innovative automation.",
    "Robotics and AI applications revolutionize manufacturing processes.",
    "Ethical considerations in AI development are crucial for responsible innovation.",
    "AI-driven advancements pose challenges in workforce adaptation and job displacement."
]

# Preprocess the documents

In [None]:

def preprocess(text):
    """Convert a raw string into a list of lemmas.

    The text is run through the module-level spaCy pipeline ``nlp``. Stop
    words and tokens that are not purely alphabetic are discarded; the lemma
    of every remaining token is returned in document order.
    """
    lemmas = []
    for tok in nlp(text):
        # Skip stop words and anything that is not made of letters only.
        if tok.is_stop or not tok.is_alpha:
            continue
        lemmas.append(tok.lemma_)
    return lemmas

# Tokenize and lemmatize every raw document.
processed_documents = [preprocess(doc) for doc in documents]

# Map each unique token to an integer id, then encode every document as a
# bag-of-words list of (token_id, count) pairs for LdaModel.
dictionary = corpora.Dictionary(processed_documents)
corpus = [dictionary.doc2bow(doc) for doc in processed_documents]

# LDA training is stochastic: without a fixed seed the topics (and therefore
# every output below) change on each Restart & Run All.
LDA_RANDOM_STATE = 42

# Two models over the same corpus: a coarse one (2 topics, 10 passes) and a
# finer one (4 topics, 20 passes).
lda_model1 = gensim.models.LdaModel(
    corpus, num_topics=2, id2word=dictionary, passes=10,
    random_state=LDA_RANDOM_STATE,
)
lda_model2 = gensim.models.LdaModel(
    corpus, num_topics=4, id2word=dictionary, passes=20,
    random_state=LDA_RANDOM_STATE,
)


# Print topics and their keywords

In [None]:
# Show the weighted keywords of each topic learned by the first model.
model1_topics = lda_model1.print_topics()
pprint(model1_topics)

[(0,
  '0.234*"learning" + 0.121*"machine" + 0.084*"type" + 0.047*"network" + '
  '0.047*"artificial" + 0.047*"base" + 0.047*"subset" + 0.047*"deep" + '
  '0.047*"neural" + 0.047*"supervise"'),
 (1,
  '0.053*"machine" + 0.053*"learning" + 0.053*"reinforcement" + '
  '0.053*"technique" + 0.053*"main" + 0.053*"unsupervised" + 0.053*"supervise" '
  '+ 0.053*"type" + 0.053*"neural" + 0.053*"subset"'),
 (2,
  '0.053*"machine" + 0.053*"learning" + 0.053*"reinforcement" + '
  '0.053*"technique" + 0.053*"supervise" + 0.053*"main" + 0.053*"unsupervised" '
  '+ 0.053*"type" + 0.053*"neural" + 0.053*"deep"'),
 (3,
  '0.106*"machine" + 0.106*"experience" + 0.106*"automatically" + '
  '0.106*"improve" + 0.106*"involve" + 0.106*"algorithm" + 0.106*"learning" + '
  '0.021*"reinforcement" + 0.021*"technique" + 0.021*"unsupervised"')]


In [None]:
# Show the weighted keywords of each topic learned by the second model.
model2_topics = lda_model2.print_topics()
pprint(model2_topics)

# Assign topics to documents

In [None]:
# Report the topic distribution the first model infers for each document.
for doc_number, bow in enumerate(corpus, start=1):
    topic_mix = lda_model1.get_document_topics(bow)
    print(f"Document {doc_number} - Topic: {topic_mix}")

Document 1 - Topic: [(0, 0.032817908), (1, 0.03134291), (2, 0.031342912), (3, 0.90449625)]
Document 2 - Topic: [(0, 0.924469), (1, 0.025126709), (2, 0.02512671), (3, 0.025277598)]
Document 3 - Topic: [(0, 0.89203846), (1, 0.035840042), (2, 0.035840042), (3, 0.036281466)]
Document 4 - Topic: [(0, 0.9055772), (1, 0.03137502), (2, 0.03137502), (3, 0.031672765)]


In [None]:
# Report the topic distribution the second model infers for each document.
for doc_number, bow in enumerate(corpus, start=1):
    topic_mix = lda_model2.get_document_topics(bow)
    print(f"Document {doc_number} - Topic: {topic_mix}")

#                   