In [2]:
import os
import nltk
import re
import string
import gensim
import numpy as np
import pandas as pd
from gensim.models import Phrases
from gensim.models.phrases import Phraser

In [36]:
def read_txt_files_to_df(txt_dir):
    data = []
    for filename in os.listdir(txt_dir):
        if filename.endswith('.txt'):
            filepath = os.path.join(txt_dir, filename)
            with open(filepath, 'r', encoding='utf-8') as file:
                content = file.read()
                data.append({'filename': filename, 'content': content})
    return pd.DataFrame(data)

# Directory containing your txt files
file_dir = r'D:\Data\CapitalIQ_Transcript\Txt_Full2007added'

# Read files into DataFrame
df = read_txt_files_to_df(file_dir)
print(df.head())  # preview

KeyboardInterrupt: 

In [4]:
from nltk.tokenize import word_tokenize
# for LDA
from gensim import corpora
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel

# for LDA evaluation
import pyLDAvis
import pyLDAvis.gensim_models as gensimvisualize

In [5]:
# %%
# for tokenization
from nltk.tokenize import word_tokenize
nltk.download("punkt")
nltk.download('wordnet')

# for stopword removal
from nltk.corpus import stopwords
nltk.download('stopwords')

# for lemmatization and POS tagging
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package punkt to
[nltk_data]     c:\Users\Michael\github\github_lda\lda_venv\nltk_data.
[nltk_data]     ..
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     c:\Users\Michael\github\github_lda\lda_venv\nltk_data.
[nltk_data]     ..
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     c:\Users\Michael\github\github_lda\lda_venv\nltk_data.
[nltk_data]     ..
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     c:\Users\Michael\github\github_lda\lda_venv\nltk_data.
[nltk_data]     ..
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [7]:
# load WordNet POS tags for lemmatization
def wordnet_pos_tags(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

# preprocessing function
def txt_preprocess_pipeline(text):
    # text is now a string (not a file handle)
    # standardize text to lowercase
    standard_txt = text.lower()
    # remove multiple white spaces and line breaks
    clean_txt = re.sub(r'\n', ' ', standard_txt)
    clean_txt = re.sub(r'\s+', ' ', clean_txt)
    clean_txt = clean_txt.strip()
    # tokenize text
    tokens = word_tokenize(clean_txt)
    # remove non-alphabetic tokens
    filtered_tokens_alpha = [word for word in tokens if word.isalpha() and not re.match(r'^[ivxlcdm]+$', word)]
    # load NLTK stopword list and add original stopwords
    stop_words = stopwords.words('english')
    stop_words.extend(['customer', 'business', 'revenue', 'quarter', 'year', 'actually', 'sale', 'market', 'also', 'million', 'unfortunately', 'data', 'advantage', 'anymore'])
    # remove stopwords
    filtered_tokens_final = [w for w in filtered_tokens_alpha if not w in stop_words]
    # define lemmatizer
    lemmatizer = WordNetLemmatizer()
    # conduct POS tagging
    pos_tags = nltk.pos_tag(filtered_tokens_final)
    # lemmatize word-tokens via assigned POS tags
    lemma_tokens = [lemmatizer.lemmatize(token, wordnet_pos_tags(pos_tag)) for token, pos_tag in pos_tags]
    return lemma_tokens

# file iteration function
def iterate_txt_files(txt_dir):
    texts = []
    for filename in os.listdir(txt_dir):
        if filename.endswith('.txt'):
            with open(os.path.join(txt_dir, filename), 'r', encoding='utf-8') as file:
                txt_tokens = txt_preprocess_pipeline(file)
                texts.append(txt_tokens)
    return texts




In [8]:
# Apply preprocessing
df['tokens'] = df['content'].apply(txt_preprocess_pipeline)

# Remove empty docs
original_doc_count = len(df)
df = df[df['tokens'].apply(lambda x: len(x) > 0)].reset_index(drop=True)
filtered_doc_count = len(df)
if filtered_doc_count < original_doc_count:
    print(f"Removed {original_doc_count - filtered_doc_count} empty documents after preprocessing.")

texts = df['tokens'].tolist()
print(texts[:1])

[['military', 'fourth', 'versus', 'prior', 'well', 'expect', 'military', 'fairly', 'line', 'current', 'maybe', 'slightly', 'less', 'equipment', 'sale', 'nonrecurring', 'look', 'military', 'fiscal', 'expect', 'image', 'physic', 'gain', 'traction', 'pick', 'relate', 'customer', 'meeting', 'requirement', 'around', 'joint', 'commission', 'likely', 'second', 'half', 'fiscal', 'radiation', 'measurement', 'feel', 'international', 'go', 'pretty', 'well', 'think', 'domestic', 'well', 'believe', 'radiation', 'whole', 'negatively', 'impact', 'foreign', 'exchange', 'rate', 'approximately', 'base', 'estimate', 'foreign', 'exchange', 'rate', 'know', 'accurately', 'would', 'sit', 'rick', 'really', 'ca', 'comment', 'segment', 'level', 'apologize', 'rick', 'segment', 'level', 'able', 'put', 'sale', 'press', 'release', 'direct', 'margin', 'sale', 'dollar', 'probably', 'materially', 'correct', 'prior', 'indiscernible', 'yes', 'would', 'restate', 'income', 'tax', 'quarter', 'previously', 'file', 'fiscal',

import nltk
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download()

In [9]:
from gensim.models import Phrases
from gensim.models.phrases import Phraser

# Set this to 'unigram', 'bigram', or 'trigram'
NGRAM_TYPE = 'bigram'   # Change to 'trigram' or 'unigram' as desired

# Build the bigram and trigram models
bigram = Phrases(texts, min_count=5, threshold=100)
trigram = Phrases(bigram[texts], threshold=100)

bigram_mod = Phraser(bigram)
trigram_mod = Phraser(trigram)

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

# Apply ngram transformation based on choice
if NGRAM_TYPE == 'bigram':
    texts = make_bigrams(texts)
    print("Bigram transformation applied.")
elif NGRAM_TYPE == 'trigram':
    texts = make_trigrams(texts)
    print("Trigram transformation applied.")
else:
    print("Unigram: no n-gram transformation applied.")

print(texts[:1])


Bigram transformation applied.
[['military', 'fourth', 'versus', 'prior', 'well', 'expect', 'military', 'fairly', 'line', 'current', 'maybe', 'slightly', 'less', 'equipment', 'sale', 'nonrecurring', 'look', 'military', 'fiscal', 'expect', 'image_physic', 'gain', 'traction', 'pick', 'relate', 'customer', 'meeting', 'requirement', 'around', 'joint', 'commission', 'likely', 'second_half', 'fiscal', 'radiation', 'measurement', 'feel', 'international', 'go', 'pretty', 'well', 'think', 'domestic', 'well', 'believe', 'radiation', 'whole', 'negatively', 'impact', 'foreign_exchange', 'rate', 'approximately', 'base', 'estimate', 'foreign_exchange', 'rate', 'know', 'accurately', 'would', 'sit', 'rick', 'really', 'ca', 'comment', 'segment', 'level', 'apologize', 'rick', 'segment', 'level', 'able', 'put', 'sale', 'press_release', 'direct', 'margin', 'sale', 'dollar', 'probably', 'materially', 'correct', 'prior', 'indiscernible', 'yes', 'would', 'restate', 'income', 'tax', 'quarter', 'previously', '

In [10]:
# Remove any documents that ended up empty after preprocessing
original_doc_count = len(texts)
texts = [doc for doc in texts if len(doc) > 0]
filtered_doc_count = len(texts)
if filtered_doc_count < original_doc_count:
    print(f"Removed {original_doc_count - filtered_doc_count} empty documents after preprocessing.")




In [None]:
# load dictionary
id2word = corpora.Dictionary(texts)
id2word.filter_extremes(no_above = .8, no_below = 5)

# generate corpus as BoW
corpus = [dictionary.doc2bow(text) for text in texts]
print(corpus[:1])

# train LDA model
lda_model = gensim.models.LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=30,
    passes=200,
    random_state=4583,
    chunksize=20,
    iterations=200,
    alpha=1.0
    workers=10,  # Adjust based on your CPU cores
)
   

# print LDA topics
for topic in lda_model.print_topics(num_topics=15, num_words=10):
    print(topic)

[[(0, 3), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 3), (11, 1), (12, 2), (13, 1), (14, 2), (15, 2), (16, 1), (17, 2), (18, 1), (19, 3), (20, 1), (21, 7), (22, 2), (23, 1), (24, 1), (25, 2), (26, 1), (27, 2), (28, 3), (29, 1), (30, 1), (31, 1), (32, 3), (33, 3), (34, 1), (35, 1), (36, 1), (37, 3), (38, 2), (39, 2), (40, 1), (41, 1), (42, 1), (43, 1), (44, 3), (45, 1), (46, 1), (47, 3), (48, 1), (49, 2), (50, 3), (51, 1), (52, 5), (53, 3), (54, 4), (55, 1), (56, 2), (57, 1), (58, 1), (59, 1), (60, 1), (61, 3), (62, 1), (63, 1), (64, 2), (65, 1), (66, 2), (67, 1), (68, 1), (69, 1), (70, 1), (71, 1), (72, 3), (73, 2), (74, 2), (75, 1), (76, 1), (77, 2), (78, 1), (79, 1), (80, 2), (81, 1), (82, 1), (83, 5), (84, 8), (85, 2), (86, 1), (87, 1), (88, 1), (89, 2), (90, 1), (91, 1), (92, 2), (93, 1), (94, 1), (95, 1), (96, 1), (97, 1), (98, 3), (99, 3), (100, 1), (101, 1), (102, 1), (103, 1), (104, 2), (105, 1), (106, 1), (107, 2), (108, 1), (109, 1), (110, 2)

In [None]:
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)


Coherence Score:  0.303658433802309


In [34]:
# supporting function
def compute_coherence_values(corpus, dictionary, k, a, b):
    
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k, 
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha=a,
                                           eta=b,
                                           workers=10,  )
    
    coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=id2word, coherence='c_v')
    
    return coherence_model_lda.get_coherence()

In [35]:
import numpy as np
import tqdm

grid = {}
grid['Validation_Set'] = {}

# Topics range
min_topics = 10
max_topics = 18
step_size = 4
topics_range = range(min_topics, max_topics, step_size)

# Alpha parameter
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')

# Beta parameter
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')

# Validation sets
num_of_docs = len(corpus)
corpus_sets = [gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.75)), 
               corpus]

corpus_title = ['75% Corpus', '100% Corpus']

model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

# Can take a long time to run
if 1 == 1:
    pbar = tqdm.tqdm(total=(len(beta)*len(alpha)*len(topics_range)*len(corpus_title)))
    
    # iterate through validation corpuses
    for i in range(len(corpus_sets)):
        # iterate through number of topics
        for k in topics_range:
            # iterate through alpha values
            for a in alpha:
                # iterare through beta values
                for b in beta:
                    # get the coherence score for the given parameters
                    cv = compute_coherence_values(corpus=corpus_sets[i], dictionary=id2word, 
                                                  k=k, a=a, b=b)
                    # Save the model results
                    model_results['Validation_Set'].append(corpus_title[i])
                    model_results['Topics'].append(k)
                    model_results['Alpha'].append(a)
                    model_results['Beta'].append(b)
                    model_results['Coherence'].append(cv)
                    
                    pbar.update(1)
    pd.DataFrame(model_results).to_csv('./results/lda_tuning_results.csv', index=False)
    pbar.close()

  1%|          | 1/120 [00:59<1:58:25, 59.71s/it]
100%|██████████| 120/120 [22:55<00:00, 11.47s/it]


In [12]:
lda_visual = gensimvisualize.prepare(lda_model, corpus, dictionary, mds='mmds')
pyLDAvis.display(lda_visual)




In [None]:
# generate document-topic distributions
for i, doc in enumerate(corpus):
    doc_topics = lda_model.get_document_topics(doc)
    print(f"Document {i}: {doc_topics}")

# %%