In [3]:
import os
import docx
import nltk
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import matplotlib.pyplot as plt

# 1) Download the NLTK resources (a no-op when they are already up to date)
nltk.download('punkt')
nltk.download('stopwords')

# 2) Load all .docx files from folder
folder_path = "/Users/user/Desktop/GAITAR-hashed"
docs = []

# os.listdir() returns entries in arbitrary order; sorting keeps the document
# order (and therefore the row order of the document-term matrix built from
# `docs`) the same from run to run.
for fname in sorted(os.listdir(folder_path)):
    # Word leaves "~$<name>.docx" lock files next to documents that are open;
    # they carry the .docx extension but are not readable .docx packages.
    if fname.startswith('~$'):
        continue
    # Compare the extension case-insensitively so "REPORT.DOCX" is loaded too.
    if not fname.lower().endswith('.docx'):
        continue
    doc_path = os.path.join(folder_path, fname)
    doc = docx.Document(doc_path)
    # Only body paragraphs are read; one paragraph per line of `text`.
    text = '\n'.join(para.text for para in doc.paragraphs)
    docs.append(text)

print(f"Loaded {len(docs)} documents from {folder_path}")

# 3) Basic text cleaning
def clean_text(text):
    """Normalise raw document text before vectorization.

    The text is lower-cased, every run of digits is deleted, and every run
    of punctuation/whitespace is collapsed into a single space.

    Args:
        text: Raw document text.

    Returns:
        The cleaned text. A leading or trailing separator run becomes a
        single leading or trailing space; it is not stripped.
    """
    text = text.lower()
    text = re.sub(r'\d+', '', text)  # remove numbers
    # `\W` alone does not match "_" (it counts as a word character), so the
    # underscore is listed explicitly; otherwise "foo_bar" stays one token.
    text = re.sub(r'[\W_]+', ' ', text)  # remove punctuation
    return text

# Run every loaded document through clean_text, keeping the original order.
docs_clean = list(map(clean_text, docs))

# 4) Vectorization
# Count unigrams and bigrams, dropping NLTK's English stop words and
# filtering terms by document frequency (min_df / max_df).
stop_words = nltk.corpus.stopwords.words('english')
vectorizer = CountVectorizer(
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.8,
    stop_words=stop_words,
)
dtm = vectorizer.fit_transform(docs_clean)

# 5) Topic modeling with LDA
n_topics = 4  # you can change this
# fit() returns the fitted estimator itself, so `lda` is the trained model;
# the fixed random_state keeps repeated runs comparable.
lda = LatentDirichletAllocation(
    random_state=42,
    n_components=n_topics,
).fit(dtm)

# 6) Show topics
def print_topics(model, vectorizer, top_n=10):
    """Print the highest-weighted vocabulary terms of each topic.

    For each row of ``model.components_`` a ``Topic #k:`` header is printed
    (numbered from 1 and preceded by a blank line), followed by one line
    with the ``top_n`` terms that have the largest weights in that row,
    in descending weight order and separated by ", ".

    Args:
        model: Object exposing ``components_``, iterated row by row.
        vectorizer: Object whose ``get_feature_names_out()`` maps column
            indices to terms.
        top_n: Number of terms to show per topic.
    """
    vocabulary = vectorizer.get_feature_names_out()
    for number, weights in enumerate(model.components_, start=1):
        print(f"\nTopic #{number}:")
        # argsort() is ascending: reverse it, then keep the leading entries.
        ranked = weights.argsort()[::-1][:top_n]
        print(", ".join(vocabulary[i] for i in ranked))

# Report the top terms (default top_n) of every topic in the fitted model.
print_topics(model=lda, vectorizer=vectorizer)

Loaded 18 documents from /Users/user/Desktop/GAITAR-hashed

Topic #1:
homes, nile, alexandria, river, temples, civilization, used, greek, water, materials

Topic #2:
rich, accurate, understanding, history, political, civilization, like cleopatra, cleopatra ancient, answers, tell

Topic #3:
would, image, see, often, copilot, people, time, also, women, made

Topic #4:
antony, caesar, political, julius, mark, mark antony, roman, ruler, julius caesar, relationship


[nltk_data] Downloading package punkt to /Users/user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
