In [None]:
# Install the notebook's third-party dependencies (spaCy and gensim) via the %pip magic.
# NOTE(review): neither package is version-pinned, so a fresh run may pull
# different releases than the ones that produced the saved outputs — consider pinning.
%pip install spacy
%pip install gensim

In [2]:
# Import necessary libraries
import spacy
import gensim
from gensim import corpora
from pprint import pprint

# Load the small English spaCy pipeline, downloading it only when it is not
# already installed. This avoids a network round-trip and a reinstall on every
# Restart & Run All.
SPACY_MODEL = "en_core_web_sm"
try:
    nlp = spacy.load(SPACY_MODEL)
except OSError:
    # spacy.load raises OSError when the model package cannot be found.
    spacy.cli.download(SPACY_MODEL)
    nlp = spacy.load(SPACY_MODEL)


✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


# Documents

In [3]:
# Toy corpus: four one-sentence documents, each treated as a separate
# document by the topic models below.
documents = [
    "Artificial intelligence is reshaping industries through innovative automation.",
    "Robotics and AI applications revolutionize manufacturing processes.",
    "Ethical considerations in AI development are crucial for responsible innovation.",
    "AI-driven advancements pose challenges in workforce adaptation and job displacement.",
]

# Preprocess the documents and train the LDA models

In [4]:

def preprocess(text):
    """Return the lemmas of the alphabetic, non-stop-word tokens in ``text``.

    The text is run through the module-level spaCy pipeline ``nlp``. Tokens
    flagged as stop words, and tokens that are not purely alphabetic, are
    dropped; the remaining tokens contribute their ``lemma_`` in document order.

    Args:
        text: Raw document string to analyse.

    Returns:
        A list of lemma strings (possibly empty).
    """
    lemmas = []
    for tok in nlp(text):
        # Skip stop words first, then anything that is not purely alphabetic.
        if tok.is_stop or not tok.is_alpha:
            continue
        lemmas.append(tok.lemma_)
    return lemmas

# Fixed seed for LDA training: without it each run can produce different
# topics, so the notebook would not reproduce under Restart & Run All.
RANDOM_SEED = 42

# Apply preprocessing to all documents (one list of lemmas per document)
processed_documents = [preprocess(text) for text in documents]

# Create a dictionary (token <-> integer id) and a bag-of-words corpus for LDA
dictionary = corpora.Dictionary(processed_documents)
corpus = [dictionary.doc2bow(tokens) for tokens in processed_documents]

# Build two LDA models on the same corpus: 2 topics / 10 passes and
# 4 topics / 20 passes, both seeded for reproducibility.
lda_model1 = gensim.models.LdaModel(
    corpus, num_topics=2, id2word=dictionary, passes=10, random_state=RANDOM_SEED
)
lda_model2 = gensim.models.LdaModel(
    corpus, num_topics=4, id2word=dictionary, passes=20, random_state=RANDOM_SEED
)


# Print topics and their keywords

In [5]:
# Pretty-print the topics of the 2-topic model; each entry is a
# (topic_id, weighted keyword string) pair.
pprint(lda_model1.print_topics())

[(0,
  '0.078*"AI" + 0.047*"process" + 0.047*"manufacturing" + 0.047*"responsible" '
  '+ 0.047*"robotic" + 0.047*"crucial" + 0.047*"innovation" + '
  '0.047*"development" + 0.047*"application" + 0.047*"consideration"'),
 (1,
  '0.069*"AI" + 0.068*"job" + 0.068*"challenge" + 0.068*"displacement" + '
  '0.068*"adaptation" + 0.068*"pose" + 0.068*"advancement" + 0.068*"drive" + '
  '0.068*"workforce" + 0.023*"innovative"')]


In [6]:
# Pretty-print the topics of the 4-topic model; each entry is a
# (topic_id, weighted keyword string) pair.
pprint(lda_model2.print_topics())

[(0,
  '0.081*"job" + 0.081*"drive" + 0.081*"adaptation" + 0.081*"advancement" + '
  '0.081*"displacement" + 0.081*"workforce" + 0.081*"pose" + 0.081*"challenge" '
  '+ 0.081*"AI" + 0.016*"robotic"'),
 (1,
  '0.100*"AI" + 0.100*"manufacturing" + 0.100*"application" + 0.100*"process" '
  '+ 0.100*"revolutionize" + 0.100*"robotic" + 0.020*"challenge" + '
  '0.020*"pose" + 0.020*"workforce" + 0.020*"displacement"'),
 (2,
  '0.100*"reshape" + 0.100*"innovative" + 0.100*"artificial" + '
  '0.100*"automation" + 0.100*"intelligence" + 0.100*"industry" + 0.020*"AI" + '
  '0.020*"robotic" + 0.020*"revolutionize" + 0.020*"process"'),
 (3,
  '0.093*"AI" + 0.093*"responsible" + 0.093*"consideration" + '
  '0.093*"development" + 0.093*"innovation" + 0.093*"crucial" + '
  '0.093*"ethical" + 0.019*"robotic" + 0.019*"revolutionize" + '
  '0.019*"process"')]


# Assign topics to documents

In [7]:
# Assign topics to documents: print, for each document, the 2-topic model's
# list of (topic_id, probability) pairs.
# Iterate the bag-of-words corpus directly, since that is what the model consumes.
for doc_number, bow in enumerate(corpus, start=1):
    print(f"Document {doc_number} - Topic: {lda_model1.get_document_topics(bow)}")

Document 1 - Topic: [(0, 0.92373675), (1, 0.076263264)]
Document 2 - Topic: [(0, 0.9209897), (1, 0.0790103)]
Document 3 - Topic: [(0, 0.9312502), (1, 0.06874975)]
Document 4 - Topic: [(0, 0.053791914), (1, 0.94620806)]


In [8]:
# Print, for each document, the 4-topic model's list of
# (topic_id, probability) pairs.
# Iterate the bag-of-words corpus directly, since that is what the model consumes.
for doc_number, bow in enumerate(corpus, start=1):
    print(f"Document {doc_number} - Topic: {lda_model2.get_document_topics(bow)}")

Document 1 - Topic: [(0, 0.035746496), (1, 0.03575483), (2, 0.89274716), (3, 0.035751496)]
Document 2 - Topic: [(0, 0.036042366), (1, 0.89210784), (2, 0.03575438), (3, 0.036095448)]
Document 3 - Topic: [(0, 0.031518765), (1, 0.0315899), (2, 0.03128779), (3, 0.9056035)]
Document 4 - Topic: [(0, 0.92448705), (1, 0.02524915), (2, 0.025034629), (3, 0.025229214)]
