In [1]:
# The script creates five five-word topics based on the concordances of the word "kodu" ("home").
# The script was also used as a basis for finding topics by decade, in which case the file name was modified to match the concordances of the decade being studied.

In [2]:
# Importing the necessary libraries.
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pyLDAvis
import pyLDAvis.lda_model

In [3]:
# Loading the multilingual model, since the spacy library does not have a model for Estonian.
nlp = spacy.load("xx_ent_wiki_sm")

# Reading the file consisting of concordances for "kodu" with 100 characters of context on the left and right.
# Each line is one document; surrounding whitespace (including the newline) is stripped.
# "utf-8-sig" decodes plain UTF-8 identically but also drops a leading byte-order mark,
# which would otherwise stay attached to the first token of the file.
with open("kodu_concs.txt", "r", encoding="utf-8-sig") as file:
    documents = [line.strip() for line in file]

# Reading the custom Estonian stopwords list (one entry per line) into a set.
# Entries are lower-cased because preprocess() compares them against lower-cased tokens,
# so an entry containing an upper-case letter could never match. Blank lines are skipped.
with open("estonian-stopwords-lemmas.txt", "r", encoding="utf-8-sig") as file:
    custom_stopwords = {entry.lower() for entry in map(str.strip, file) if entry}

# Preprocessing the documents.
def preprocess(text):
    """Return ``text`` with stop words and punctuation tokens removed.

    The text is split into tokens by the tokenizer of the module-level ``nlp``
    object. A token is kept when its lower-cased form is not in
    ``custom_stopwords`` and it is not flagged as punctuation. Kept tokens
    retain their original casing and are joined with single spaces.

    Args:
        text: One document (here: one concordance line) as a string.

    Returns:
        A single space-separated string of the remaining tokens.
    """
    # Only tokenisation is needed: `text` and `is_punct` are available straight
    # from the tokenizer, so make_doc() avoids running the loaded model's
    # pipeline components, whose annotations are never read here.
    # NOTE(review): the stop-word file name suggests a list of lemmas, while the
    # comparison below uses surface forms — presumably the concordances are
    # already lemmatised; confirm.
    doc = nlp.make_doc(text)
    tokens = [
        token.text
        for token in doc
        if not token.is_punct and token.text.lower() not in custom_stopwords
    ]
    return " ".join(tokens)

# Cleaning every concordance line with preprocess().
processed_docs = list(map(preprocess, documents))

# Building the document-term matrix of token counts.
vectorizer = CountVectorizer()
dtm = vectorizer.fit_transform(processed_docs)

# Fitting the LDA model; fit() returns the estimator itself, so the fitted model is bound directly.
num_topics = 5
lda = LatentDirichletAllocation(random_state=42, n_components=num_topics).fit(dtm)

# Displaying topics.
def display_topics(model, feature_names, no_top_words):
    """Print each topic's highest-weighted words, one topic per pair of lines.

    Args:
        model: Object exposing ``components_``, iterated row by row; each row
            must support ``argsort()`` (one weight per feature).
        feature_names: Sequence mapping a feature index to its word.
        no_top_words: Number of words to print per topic.

    Returns:
        None. For every topic a header line ``Topic <index>:`` is printed,
        followed by the top words in descending weight order, space-separated.
    """
    for number, weights in enumerate(model.components_):
        print(f"Topic {number}:")
        # argsort() is ascending, so walk backwards from the end to take the
        # `no_top_words` largest weights first.
        ascending = weights.argsort()
        best_first = ascending[:-no_top_words - 1:-1]
        words = [feature_names[position] for position in best_first]
        print(" ".join(words))

# Printing the five highest-weighted words of every topic.
feature_names = vectorizer.get_feature_names_out()
display_topics(model=lda, feature_names=feature_names, no_top_words=5)

# Visualising topics with pyLDAvis; the prepared panel is the cell's output.
pyLDAvis.enable_notebook()
panel = pyLDAvis.lda_model.prepare(lda, dtm, vectorizer, mds="tsne")
pyLDAvis.display(panel)

Topic 0:
kodu kodumaa aasta mees uus
Topic 1:
kodumaa kodu keel eesti aeg
Topic 2:
eesti metsakodu laps suvikodu suvekodu
Topic 3:
eesti kodumaa kodu rootsi kell
Topic 4:
kodumaa eesti kodu rahvas eestlane
