In [1]:
import os
import nltk
import re
import string
import gensim
import numpy as np
import pandas as pd
from gensim.models import Phrases
from gensim.models.phrases import Phraser

In [4]:
from nltk.tokenize import word_tokenize
# for LDA
from gensim import corpora
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel

# for LDA evaluation
import pyLDAvis
import pyLDAvis.gensim_models as gensimvisualize

In [5]:
# %%
# for tokenization
from nltk.tokenize import word_tokenize
nltk.download("punkt")
nltk.download('wordnet')

# for stopword removal
from nltk.corpus import stopwords
nltk.download('stopwords')

# for lemmatization and POS tagging
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package punkt to
[nltk_data]     c:\Users\Michael\github\github_lda\lda_venv\nltk_data.
[nltk_data]     ..
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     c:\Users\Michael\github\github_lda\lda_venv\nltk_data.
[nltk_data]     ..
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     c:\Users\Michael\github\github_lda\lda_venv\nltk_data.
[nltk_data]     ..
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     c:\Users\Michael\github\github_lda\lda_venv\nltk_data.
[nltk_data]     ..
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [6]:
from gensim.models import Phrases
from gensim.models.phrases import Phraser


In [65]:
def read_txt_files_to_df(txt_dir):
    data = []
    for filename in os.listdir(txt_dir):
        if filename.endswith('.txt'):
            filepath = os.path.join(txt_dir, filename)
            with open(filepath, 'r', encoding='utf-8') as file:
                content = file.read()
                data.append({'filename': filename, 'content': content})
    return pd.DataFrame(data)

# Directory containing your txt files
file_dir = r'D:\Data\CapitalIQ_Transcript\Txt_TestRun_v1'

# Read files into DataFrame
df = read_txt_files_to_df(file_dir)
print(df.head())  # preview

    filename                                            content
0  10098.txt  The military revenue for the fourth quarter wa...
1  11069.txt  It's a couple of questions in there. Let me st...
2   1157.txt  Let me start, Gregg, by saying that with respe...
3  11681.txt  Sure, Simon. First of all, thank you for your ...
4  11812.txt  Okay. Matt, I'll take that. And we're looking ...


In [51]:
# WordNet POS tagger
def wordnet_pos_tags(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

# Preprocessing pipeline (your specified order)
def txt_preprocess_pipeline(text):
    # 1. Standardize text
    standard_txt = text.lower()
    clean_txt = re.sub(r'\n', ' ', standard_txt)
    clean_txt = re.sub(r'\s+', ' ', clean_txt)
    clean_txt = clean_txt.strip()

    # 2. Tokenize
    tokens = word_tokenize(clean_txt)

    # 3. Lemmatize
    lemmatizer = WordNetLemmatizer()
    pos_tags = nltk.pos_tag(tokens)
    lemma_tokens = [lemmatizer.lemmatize(token, wordnet_pos_tags(pos_tag)) for token, pos_tag in pos_tags]

    # 4. Remove stop words
    stop_words = set(stopwords.words('english'))
    custom_stopwords = set([
        'business', 'revenue', 'sale', 'market', 'million'])
    all_stopwords = stop_words.union(custom_stopwords)
    filtered_tokens = [w for w in lemma_tokens if w not in all_stopwords]

    # 5. Remove non-alphabetic tokens
    filtered_tokens_alpha = [
        word for word in filtered_tokens
        if word.isalpha() and not re.match(r'^[ivxlcdm]+$', word)
    ]

    # 6. Return tokens for n-gram processing
    return filtered_tokens_alpha

# Example file iteration function
def iterate_txt_files(txt_dir):
    texts = []
    for filename in os.listdir(txt_dir):
        if filename.endswith('.txt'):
            with open(os.path.join(txt_dir, filename), 'r', encoding='utf-8') as file:
                text = file.read()
                txt_tokens = txt_preprocess_pipeline(text)
                texts.append(txt_tokens)
    return texts

In [66]:
# Apply preprocessing to dataframe
df['tokens'] = df['content'].apply(txt_preprocess_pipeline)

In [60]:
print(df)

     filename                                            content  \
0   10098.txt  The military revenue for the fourth quarter wa...   
1   11069.txt  It's a couple of questions in there. Let me st...   
2    1157.txt  Let me start, Gregg, by saying that with respe...   
3   11681.txt  Sure, Simon. First of all, thank you for your ...   
4   11812.txt  Okay. Matt, I'll take that. And we're looking ...   
..        ...                                                ...   
95   8022.txt  Kathryn, we got some favorable mix, but also a...   
96   8525.txt  Right. Exactly. I think we've got to make a de...   
97   8982.txt  Yes. So just to -- going back to 2021, overall...   
98   9240.txt  Joe, that's absolutely correct. You analyzed i...   
99   9728.txt  I think it’s a blend of a couple of things. I ...   

                                               tokens  
0   [military, fourth, quarter, versus, prior, yea...  
1   [couple, question, let, start, probably, easy,...  
2   [let, start

In [67]:
NGRAM_TYPE = 'trigram'   # 'bigram', 'trigram', or 'unigram'

# Build models based on the tokens column (after initial preprocessing)
bigram = Phrases(df['tokens'], min_count=20, threshold=100)
bigram_mod = Phraser(bigram)
trigram = Phrases(bigram[df['tokens']], threshold=100)
trigram_mod = Phraser(trigram)

def apply_ngrams(tokens, ngram_type):
    if ngram_type == 'bigram':
        return bigram_mod[tokens]
    elif ngram_type == 'trigram':
        return trigram_mod[bigram_mod[tokens]]
    else:
        return tokens


In [68]:
# Apply n-grams to the DataFrame
df['tokens'] = df['tokens'].apply(lambda x: apply_ngrams(x, NGRAM_TYPE))

if NGRAM_TYPE == 'bigram':
    print("Bigram transformation applied.")
elif NGRAM_TYPE == 'trigram':
    print("Trigram transformation applied.")
else:
    print("Unigram: no n-gram transformation applied.")

Trigram transformation applied.


In [69]:
texts = df['tokens'].tolist()
print(texts[:1])

[['military', 'fourth_quarter', 'versus', 'prior', 'year', 'well', 'expect', 'military', 'fairly', 'line', 'current', 'year', 'maybe', 'slightly', 'less', 'equipment', 'nonrecurring', 'look', 'military', 'fiscal', 'year', 'expect', 'imaging', 'physic', 'gain', 'traction', 'pick', 'relate', 'customer', 'meet', 'requirement', 'around', 'joint', 'commission', 'likely', 'second_half', 'fiscal', 'year', 'radiation', 'measurement', 'feel', 'international', 'go', 'pretty', 'well', 'think', 'domestic', 'well', 'believe', 'radiation', 'whole', 'negatively', 'impact', 'foreign_exchange', 'rate', 'approximately', 'base', 'estimate', 'foreign_exchange', 'rate', 'know', 'accurately', 'would', 'sit', 'rick', 'really', 'ca', 'comment', 'segment', 'level', 'data', 'apologize', 'rick', 'segment', 'level', 'data', 'able', 'put', 'press_release', 'direct', 'margin', 'dollar', 'probably', 'materially', 'correct', 'prior', 'indiscernible', 'yes', 'would', 'restate', 'income', 'tax', 'quarter', 'previously'

In [81]:
for idx, doc in enumerate(df['tokens'].tolist()[:5]):
    ngrams = [token for token in doc if '_' in token]
    print(f"Doc {idx+1} n-grams:", ngrams)


Doc 1 n-grams: ['fourth_quarter', 'second_half', 'foreign_exchange', 'foreign_exchange', 'press_release', 'cautiously_optimistic', 'second_half', 'second_half', 'cautiously_optimistic', 'cautiously_optimistic', 'relatively_flat', 'relatively_flat', 'thank_join', 'holiday_season']
Doc 2 n-grams: ['little_bit', 'little_bit']
Doc 3 n-grams: ['lung_cancer', 'remain_committed', 'fourth_quarter', 'cell_lung_cancer', 'lung_cancer', 'second_half', 'unmet_need', 'cell_lung_cancer', 'lung_cancer', 'remain_committed', 'lung_cancer', 'cell_lung_cancer', 'lung_cancer', 'lung_cancer', 'phase_study', 'lung_cancer', 'fourth_quarter', 'technical_difficulty', 'conference_call', 'across_board', 'lung_cancer', 'remain_committed', 'cell_lung_cancer', 'lung_cancer', 'lung_cancer', 'foreign_exchange', 'cell_lung_cancer', 'relatively_flat', 'fourth_quarter', 'fourth_quarter', 'fourth_quarter', 'technical_difficulty', 'lung_cancer', 'cell_lung_cancer', 'lung_cancer', 'phase_study', 'lung_cancer', 'lung_cancer'

import nltk
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download()

In [71]:
# Remove any documents that ended up empty after preprocessing
original_doc_count = len(texts)
texts = [doc for doc in texts if len(doc) > 0]
filtered_doc_count = len(texts)
if filtered_doc_count < original_doc_count:
    print(f"Removed {original_doc_count - filtered_doc_count} empty documents after preprocessing.")




In [79]:
from gensim.corpora import Dictionary
# load dictionary
id2word = corpora.Dictionary(texts)
id2word.filter_extremes(no_above = .8, no_below = 5)

# generate corpus as BoW
corpus = [id2word.doc2bow(text) for text in texts]
print(corpus[:1])

# train LDA model
lda_model = gensim.models.LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=30,
    passes=200,
    random_state=4583,
    chunksize=20,
    iterations=200,
    alpha='asymmetric',
    workers=10,  # Adjust based on your CPU cores
)
   

# print LDA topics
for topic in lda_model.print_topics(num_topics=30, num_words=10):
    print(topic)

[[(0, 3), (1, 1), (2, 1), (3, 1), (4, 2), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 3), (11, 1), (12, 2), (13, 1), (14, 2), (15, 2), (16, 1), (17, 2), (18, 1), (19, 3), (20, 1), (21, 7), (22, 2), (23, 1), (24, 1), (25, 2), (26, 1), (27, 2), (28, 3), (29, 1), (30, 1), (31, 1), (32, 3), (33, 3), (34, 1), (35, 1), (36, 1), (37, 3), (38, 2), (39, 2), (40, 1), (41, 1), (42, 1), (43, 1), (44, 3), (45, 1), (46, 1), (47, 3), (48, 1), (49, 2), (50, 3), (51, 1), (52, 8), (53, 2), (54, 3), (55, 4), (56, 1), (57, 2), (58, 2), (59, 1), (60, 1), (61, 1), (62, 3), (63, 1), (64, 2), (65, 1), (66, 1), (67, 1), (68, 1), (69, 1), (70, 1), (71, 1), (72, 1), (73, 3), (74, 2), (75, 2), (76, 1), (77, 1), (78, 1), (79, 2), (80, 1), (81, 1), (82, 2), (83, 1), (84, 1), (85, 5), (86, 8), (87, 2), (88, 1), (89, 1), (90, 1), (91, 2), (92, 2), (93, 2), (94, 1), (95, 1), (96, 1), (97, 1), (98, 1), (99, 3), (100, 3), (101, 1), (102, 1), (103, 1), (104, 1), (105, 2), (106, 1), (107, 1), (108, 4), (109, 1), (110, 1)

In [80]:
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)


Coherence Score:  0.329331630760668


In [34]:
# supporting function
def compute_coherence_values(corpus, dictionary, k, a, b):
    
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k, 
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha=a,
                                           eta=b,
                                           workers=10,  )
    
    coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=id2word, coherence='c_v')
    
    return coherence_model_lda.get_coherence()

In [35]:
import numpy as np
import tqdm

grid = {}
grid['Validation_Set'] = {}

# Topics range
min_topics = 10
max_topics = 18
step_size = 4
topics_range = range(min_topics, max_topics, step_size)

# Alpha parameter
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')

# Beta parameter
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')

# Validation sets
num_of_docs = len(corpus)
corpus_sets = [gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.75)), 
               corpus]

corpus_title = ['75% Corpus', '100% Corpus']

model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

# Can take a long time to run
if 1 == 1:
    pbar = tqdm.tqdm(total=(len(beta)*len(alpha)*len(topics_range)*len(corpus_title)))
    
    # iterate through validation corpuses
    for i in range(len(corpus_sets)):
        # iterate through number of topics
        for k in topics_range:
            # iterate through alpha values
            for a in alpha:
                # iterare through beta values
                for b in beta:
                    # get the coherence score for the given parameters
                    cv = compute_coherence_values(corpus=corpus_sets[i], dictionary=id2word, 
                                                  k=k, a=a, b=b)
                    # Save the model results
                    model_results['Validation_Set'].append(corpus_title[i])
                    model_results['Topics'].append(k)
                    model_results['Alpha'].append(a)
                    model_results['Beta'].append(b)
                    model_results['Coherence'].append(cv)
                    
                    pbar.update(1)
    pd.DataFrame(model_results).to_csv('./results/lda_tuning_results.csv', index=False)
    pbar.close()

  1%|          | 1/120 [00:59<1:58:25, 59.71s/it]
100%|██████████| 120/120 [22:55<00:00, 11.47s/it]


In [83]:
lda_visual = gensimvisualize.prepare(lda_model, corpus, id2word, mds='mmds')
pyLDAvis.display(lda_visual)




In [84]:
# generate document-topic distributions
for i, doc in enumerate(corpus):
    doc_topics = lda_model.get_document_topics(doc)
    print(f"Document {i}: {doc_topics}")

# %%

Document 0: [(1, 0.21503152), (7, 0.33552614), (26, 0.44742954)]
Document 1: [(19, 0.9964394)]
Document 2: [(2, 0.14753678), (7, 0.077082835), (13, 0.037581194), (19, 0.73722506)]
Document 3: [(2, 0.5471564), (7, 0.35227057), (29, 0.098933436)]
Document 4: [(2, 0.14139803), (7, 0.79480153), (24, 0.062975265)]
Document 5: [(17, 0.9985457)]
Document 6: [(19, 0.77163506), (25, 0.226847)]
Document 7: [(26, 0.99833816)]
Document 8: [(1, 0.12093533), (2, 0.26355723), (7, 0.10310677), (8, 0.08109716), (16, 0.1010386), (19, 0.098584525), (26, 0.22959876)]
Document 9: [(2, 0.99809295)]
Document 10: [(2, 0.25185525), (7, 0.74729425)]
Document 11: [(1, 0.22288494), (7, 0.6203682), (19, 0.07431071), (28, 0.08091626)]
Document 12: [(7, 0.5338929), (13, 0.46216115)]
Document 13: [(5, 0.93206245)]
Document 14: [(24, 0.9991879)]
Document 15: [(10, 0.77984554), (19, 0.21915126)]
Document 16: [(2, 0.26397815), (16, 0.21098779), (19, 0.27138567), (23, 0.24945186)]
Document 17: [(1, 0.28708047), (13, 0.16