In [29]:
import spacy
import gensim
import gensim.corpora as corpora
from gensim import models
from pprint import pprint

In [30]:
# Load the spaCy pipeline named "en_core_web_lg". `nlp` is called on each transcript further
# down to tokenise it and to supply the stop-word, punctuation, part-of-speech and lemma
# attributes used to filter tokens.
# NOTE(review): presumably the model package has to be installed separately before this runs — confirm.
nlp = spacy.load("en_core_web_lg")

In [31]:
from os import listdir
from os.path import isfile, join

# Name of the folder containing the files
folder_path = "ami-transcripts"

# Get a list of filenames.
# listdir() returns entries in arbitrary order, so they are sorted to build the corpus in the
# same order on every run and machine. Entries that are not regular files (e.g. sub-folders)
# are skipped because they cannot be opened as text.
filenames = sorted(name for name in listdir(folder_path) if isfile(join(folder_path, name)))

ECallDocuments = [] # List to store all documents in the training corpus as a 'list of lists'

# For each file
for filename in filenames:
    # Create the filepath
    file_path = join(folder_path, filename)

    # Open the file ("with" closes it automatically at the end of the block).
    # The encoding is passed explicitly because the default depends on the platform/locale.
    # NOTE(review): assumes the transcripts are UTF-8 — adjust if they were saved differently.
    with open(file_path, "r", encoding="utf-8") as f:
        # Get the file content
        ECallTxt = f.read()

    # Clean text (done after the file is closed; nothing below needs the file handle)
    ECallTxt = ECallTxt.replace('\n', ' ') # Replace the \n (new line) character with space
    ECallTxt = ECallTxt.replace('\r', '') # Remove the \r (carriage return) left over from Windows line endings
    ECallTxt = ECallTxt.replace('&nbsp;', ' ') # Replace the HTML non-breaking-space entity with a normal space
    ECallTxt = ECallTxt.replace('\xa0', ' ') # Replace the non-breaking-space character (U+00A0) with a normal space
    while '  ' in ECallTxt:
        ECallTxt = ECallTxt.replace('  ', ' ') # Collapse runs of spaces into a single space
    ECallTxt = ECallTxt.strip() # Strip last, so spaces introduced by the replacements above are removed too

    # Parse document with SpaCy
    ECall = nlp(ECallTxt)

    # Further cleaning and selection of text characteristics:
    # keep tokens that are neither stop words nor punctuation and are a Noun, Adjective or Verb,
    # storing the lower-cased lemma of each one (a plain string).
    ECallDoc = [
        token.lemma_.lower()
        for token in ECall
        if not token.is_stop
        and not token.is_punct
        and token.pos_ in {"NOUN", "ADJ", "VERB"}
    ]

    # Append the content to the list
    ECallDocuments.append(ECallDoc) # Build the training corpus 'list of lists'

### NUMERIC REPRESENTATION OF TRAINING CORPUS USING BAG OF WORDS AND TF-IDF ###

# Vocabulary: dictionary built from the tokenised training documents (maps word IDs to words)
ID2word = corpora.Dictionary(ECallDocuments)

# Bag of Words: convert every training document with the dictionary's doc2bow
corpus = list(map(ID2word.doc2bow, ECallDocuments))

# TF-IDF: fit the model on the Bag-of-Words corpus ...
TFIDF = models.TfidfModel(corpus)

# ... and apply it to that same corpus to obtain the TF-IDF representation used for training
trans_TFIDF = TFIDF[corpus]

### SET UP & TRAIN LDA MODEL ###

# Hyperparameters handed to the LDA model below
SEED = 75       # Passed as random_state
NUM_topics = 3  # Number of topics to learn
ALPHA = 0.9     # Passed as alpha
ETA = 0.35      # Passed as eta

# Train LDA model on the TF-IDF-transformed training corpus, making 100 passes over it.
# NOTE(review): the topics printed in the next cell show two near-identical topics whose
# weights all round to 0.000 — worth checking whether training on the raw Bag-of-Words
# `corpus` instead of `trans_TFIDF` (or a different topic count) was intended — confirm.
lda_model = models.LdaMulticore(
    corpus=trans_TFIDF,
    id2word=ID2word,
    num_topics=NUM_topics,
    alpha=ALPHA,
    eta=ETA,
    passes=100,
    random_state=SEED,
)



In [32]:

# Print topics generated from the training corpus.
# num_words=10 asks for ten words per topic; the output lists each topic as
# (topic id, 'weight*"word" + ...').
pprint(lda_model.print_topics(num_words=10))

[(0,
  '0.000*"galaxy" + 0.000*"quasar" + 0.000*"continuum" + 0.000*"absorption" + '
  '0.000*"emission" + 0.000*"nominate" + 0.000*"flux" + 0.000*"posterior" + '
  '0.000*"damp" + 0.000*"gamma"'),
 (1,
  '0.000*"galaxy" + 0.000*"quasar" + 0.000*"continuum" + 0.000*"absorption" + '
  '0.000*"emission" + 0.000*"nominate" + 0.000*"flux" + 0.000*"posterior" + '
  '0.000*"gamma" + 0.000*"damp"'),
 (2,
  '0.002*"remote" + 0.002*"button" + 0.001*"scroll" + 0.001*"rubber" + '
  '0.001*"control" + 0.001*"wheel" + 0.001*"animal" + 0.001*"fruit" + '
  '0.001*"voice" + 0.001*"chip"')]


In [33]:
### GET TOPIC ALLOCATIONS FOR TRAINING CORPUS DOCUMENTS ###

# `corpus` holds one Bag-of-Words entry per training document, in the same order as
# ECallDocuments, so it can be walked directly instead of being indexed with a counter.
for bow_doc in corpus:
    TFIDF_doc = TFIDF[bow_doc] # Apply the fitted TF-IDF model to this single document
    print(lda_model.get_document_topics(TFIDF_doc)) # Print its topic allocation as (topic id, probability) pairs

print('-'*50)

[(0, 0.069866404), (1, 0.069869325), (2, 0.8602643)]
[(0, 0.09112013), (1, 0.0911175), (2, 0.8177624)]
[(0, 0.07858088), (1, 0.07858463), (2, 0.84283453)]
[(0, 0.109286636), (1, 0.109289415), (2, 0.7814239)]
[(0, 0.0613722), (1, 0.06137418), (2, 0.87725365)]
[(0, 0.08306099), (1, 0.083063625), (2, 0.83387536)]
[(0, 0.056336477), (1, 0.05633816), (2, 0.8873254)]
[(0, 0.101693355), (1, 0.10169673), (2, 0.7966099)]
[(0, 0.102115385), (1, 0.102127954), (2, 0.79575664)]
[(0, 0.060014315), (1, 0.060015004), (2, 0.87997067)]
[(0, 0.063970335), (1, 0.06397013), (2, 0.8720595)]
[(0, 0.058206372), (1, 0.058208425), (2, 0.8835852)]
[(0, 0.075100504), (1, 0.07510316), (2, 0.84979635)]
[(0, 0.05802477), (1, 0.058024358), (2, 0.8839508)]
[(0, 0.072745286), (1, 0.072737284), (2, 0.85451746)]
[(0, 0.07352267), (1, 0.07352242), (2, 0.8529549)]
[(0, 0.06612194), (1, 0.066121385), (2, 0.86775666)]
[(0, 0.07387414), (1, 0.07387424), (2, 0.85225165)]
[(0, 0.07770627), (1, 0.07770468), (2, 0.84458905)]
[(0,