# Part 1: Automatic Speech Recognition, Diarize and Label

Environment = "whisperx"

* Performance Benchmarks on local
* GPU Benchmark: 0.09961056709289551 seconds
* Memory Bandwidth Benchmark: 0.2920224666595459 seconds
* CPU Benchmark: 13.046526432037354 seconds
* Disk Write Benchmark: 2.3364615440368652 seconds
* Disk Read Benchmark: 0.05882525444030762 seconds \n
  
** all benchmarks are >> faster than Collab with the exception of Disk write.

## Setup ⚙️
Tested for PyTorch 2.0, Python 3.10 (use other versions at your own risk!)
GPU execution requires the NVIDIA libraries cuBLAS 11.x and cuDNN 8.x to be installed on the system. Please refer to the CTranslate2 documentation.

1.  Create Python3.10 environment

`conda create --name whisperx python=3.10`

`conda activate whisperx`

2. Install PyTorch, e.g. for Linux and Windows CUDA11.8:
   
conda install pytorch==2.0.0 torchaudio==2.0.0 pytorch-cuda=11.8 -c pytorch -c nvidia

See other methods here.

1. Install this repo

`pip install git+https://github.com/m-bain/whisperx.git`

If already installed, update package to most recent commit

`pip install git+https://github.com/m-bain/whisperx.git --upgrade`



# Preprocess initial audio file
convert to Wav using ffmpeg

In [1]:
import ffmpeg

## 1 - Convert Mp3 to WAV.

def convert_m4a_to_mp3(input_file, output_file):
    try:
        ffmpeg.input(input_file).output(output_file).run(overwrite_output=True)
        print(f"Successfully converted {input_file} to {output_file}")
    except ffmpeg.Error as e:
        print("An error occurred:", e)

# Input/ output files and usage
input_mp3 = './audio/Botswana_2024_Audio.mp3'  # Change this to your mp3 file path
output_wav = './data/Botswana_2024_Audio.wav'  # Change this to your desired output wav file path

convert_m4a_to_mp3(input_mp3, output_wav)

Successfully converted ./audio/Botswana_2024_Audio.mp3 to ./data/Botswana_2024_Audio.wav


In [None]:
import whisperx
import gc
import os
import torch

device = "cuda"
## Full file should be the input (2007 or 2024 file..)
audio_file = "./data/Botswana_2007_Audio.wav"


batch_size = 16 # reduce if low on GPU mem
compute_type = "float16" # change to "int8" if low on GPU mem (may reduce accuracy)
without_timestamps= 'True'

## Some error handling to ensure that successfully loaded the mp3 file!
try:
    # Check if the file exists
    if not os.path.isfile(audio_file):
        raise FileNotFoundError(f"The file '{audio_file}' does not exist.")
    # Optionally, you can add more checks (like file format) here

    print(f"Successfully accessed the audio file: {audio_file}")

except FileNotFoundError as e:
    print(e)
except Exception as e:
    print(f"An unexpected error occurred: {e}")

## Load the Audio File

In [None]:
import whisperx
import gc
import os
import torch

device = "cuda"
## Full file should be the input (2007 or 2024 file..)
audio_file = "./audio/Botswana_2007_Audio.wav"

## DEBUGGING, use a small file
# audio_file = "./audio/Intro.wav"

batch_size = 16 # reduce if low on GPU mem
compute_type = "float16" # change to "int8" if low on GPU mem (may reduce accuracy)
without_timestamps= 'True'

## Some error handling to ensure that successfully loaded the mp3 file!
try:
    # Check if the file exists
    if not os.path.isfile(audio_file):
        raise FileNotFoundError(f"The file '{audio_file}' does not exist.")
    # Optionally, you can add more checks (like file format) here

    print(f"Successfully accessed the audio file: {audio_file}")

except FileNotFoundError as e:
    print(e)
except Exception as e:
    print(f"An unexpected error occurred: {e}")

# BATCH PROCESS: Transcript - Align - Diarize

### Whisperx in the Terminal
This seemed to work on a single file VERY fast! 

In [None]:
import os
from HF_token import TOKEN_ID
# Set the path to your directory
directory = "./audio/"

# Iterate through each file in the directory
for filename in os.listdir(directory):
    if filename.endswith(".wav"):  # Check for .wav files
        filepath = os.path.join(directory, filename)
        
        # Construct and run the whisperx command for each file
        command = f"whisperx {filepath} --model large-v2 --diarize --highlight_words True --hf_token {TOKEN_ID} --output_dir ./outputs"
        os.system(command)

# Consolidate the Diarized JSON files 

In [6]:
import os
import pandas as pd
import json
import glob

# Directory containing the JSON files
json_directory = 'outputs/'

# Get a list of all JSON files in the directory
json_files = glob.glob(os.path.join(json_directory, '*.json'))

# Initialize a list to hold all DataFrames
df_list = []

# Iterate through each JSON file and merge segments
for json_file in json_files:
	with open(json_file, 'r') as file:
		data = json.load(file)
		# Convert the "segments" part of the JSON data to a DataFrame
		df = pd.DataFrame(data["segments"])
		df_list.append(df)

# Concatenate all DataFrames into a single DataFrame
diarized_df = pd.concat(df_list, ignore_index=True)

# Export
diarized_df.to_csv('./data/00.diarzed_output_no_names.csv')

# Display the consolidated DataFrame
diarized_df.head(100)



Unnamed: 0,start,end,text,words,speaker
0,21.467,23.068,"Hello, hello, and welcome.","[{'word': 'Hello,', 'start': 21.467, 'end': 21...",SPEAKER_04
1,23.108,24.029,Thank you very much.,"[{'word': 'Thank', 'start': 23.108, 'end': 23....",SPEAKER_05
2,24.049,34.256,"Now, as you know, the producers on this show l...","[{'word': 'Now,', 'start': 24.049, 'end': 24.1...",SPEAKER_05
3,34.597,39.640,Then they set unbelievably hard tasks to do to...,"[{'word': 'Then', 'start': 34.597, 'end': 34.7...",SPEAKER_05
4,39.900,40.080,Yeah.,"[{'word': 'Yeah.', 'start': 39.9, 'end': 40.08...",SPEAKER_06
...,...,...,...,...,...
95,377.117,378.638,And that instrument's a bit wobbly.,"[{'word': 'And', 'start': 377.117, 'end': 377....",SPEAKER_03
96,378.658,382.560,"Apart from that, everything that's actually im...","[{'word': 'Apart', 'start': 378.658, 'end': 37...",SPEAKER_03
97,383.560,386.882,"Apart from the handbrake, which I can pull lik...","[{'word': 'Apart', 'start': 383.56, 'end': 383...",SPEAKER_03
98,388.453,393.758,"Nevertheless, because we were on tarmac roads...","[{'word': 'Nevertheless,', 'start': 388.453, '...",SPEAKER_05


In [7]:
# Apply the Speaker names to the labels

'''
Speaker 00 = Narrator
Speaker 01 = Hammond
Speaker 02 = Hammond
Speaker 03 = May
Speaker 04 = Clarkson
Speaker 05 = Clarkson
06 = Hammond
07 = Hammond
08 = Hammond
09 = ?

'''

diarized_df['speaker'] = diarized_df['speaker'].replace('SPEAKER_00', 'Narrator')
diarized_df['speaker'] = diarized_df['speaker'].replace('SPEAKER_01', 'Hammond')
diarized_df['speaker'] = diarized_df['speaker'].replace('SPEAKER_02', 'Hammond')

diarized_df['speaker'] = diarized_df['speaker'].replace('SPEAKER_03', 'May')
diarized_df['speaker'] = diarized_df['speaker'].replace('SPEAKER_04', 'Clarkson')
diarized_df['speaker'] = diarized_df['speaker'].replace('SPEAKER_05', 'Clarkson')

diarized_df['speaker'] = diarized_df['speaker'].replace('SPEAKER_06', 'Hammond')
diarized_df['speaker'] = diarized_df['speaker'].replace('SPEAKER_07', 'Hammond')
diarized_df['speaker'] = diarized_df['speaker'].replace('SPEAKER_08', 'Hammond')

diarized_df.head(100)

# Export
diarized_df.to_csv('./data/01.diarzed_output_named.csv')

# Part 2: LDA (Latent Dirichlet Allocation) Preparation




### Import the diarized data from (created previously)

In [13]:

import pandas as pd
import json

# Import
diarized_df = pd.read_csv('./data/01.diarzed_output_named.csv')


In [14]:
preprocessed_df = diarized_df.copy()
# Preprocessing steps for LDA analysis

preprocessed_df

Unnamed: 0.1,Unnamed: 0,start,end,text,words,speaker
0,0,21.467,23.068,"Hello, hello, and welcome.","[{'word': 'Hello,', 'start': 21.467, 'end': 21...",Clarkson
1,1,23.108,24.029,Thank you very much.,"[{'word': 'Thank', 'start': 23.108, 'end': 23....",Clarkson
2,2,24.049,34.256,"Now, as you know, the producers on this show l...","[{'word': 'Now,', 'start': 24.049, 'end': 24.1...",Clarkson
3,3,34.597,39.640,Then they set unbelievably hard tasks to do to...,"[{'word': 'Then', 'start': 34.597, 'end': 34.7...",Clarkson
4,4,39.900,40.080,Yeah.,"[{'word': 'Yeah.', 'start': 39.9, 'end': 40.08...",Hammond
...,...,...,...,...,...,...
1035,1035,3544.028,3544.468,He's right.,"[{'word': ""He's"", 'start': 3544.028, 'end': 35...",Clarkson
1036,1036,3545.329,3547.850,You've replaced all the electrics in that car.,"[{'word': ""You've"", 'start': 3545.329, 'end': ...",Clarkson
1037,1037,3554.833,3563.498,"Tomorrow night at 8.50, it's more from the Top...","[{'word': 'Tomorrow', 'start': 3554.833, 'end'...",Narrator
1038,1038,3563.818,3566.059,Let's hope they bring enough glove compartment...,"[{'word': ""Let's"", 'start': 3563.818, 'end': 3...",Narrator


### Import all the libraries required


In [9]:
import numpy as np
import json
import glob
import re

#Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

#spacy
import spacy
from nltk.corpus import stopwords

#vis
import pyLDAvis
import pyLDAvis.gensim

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

### Cant remember why needed this ... 
import locale
def getpreferredencoding(do_setlocale=True):
    return "UTF-8"
locale.getpreferredencoding = getpreferredencoding


In [None]:
#### Below is a suggestion from gpt. 
from nltk.corpus.util import LazyCorpusLoader
from nltk.corpus import stopwords as nltk_stopwords

# 1. Remove emails, newline characters, and non-alphabetic characters
preprocessed_df['cleaned_text'] = preprocessed_df['text'].str.replace(r'\S+@\S+', '', regex=True)
preprocessed_df['cleaned_text'] = preprocessed_df['cleaned_text'].str.replace(r'http\S+|www\S+', '', regex=True)
preprocessed_df['cleaned_text'] = preprocessed_df['cleaned_text'].str.replace(r'\n', ' ', regex=True)
preprocessed_df['cleaned_text'] = preprocessed_df['cleaned_text'].str.replace(r'[^a-zA-Z\s]', '', regex=True)

# 2. Convert to lowercase
preprocessed_df['cleaned_text'] = preprocessed_df['cleaned_text'].str.lower()

# 3. Tokenize the text
preprocessed_df['tokens'] = preprocessed_df['cleaned_text'].apply(lambda x: gensim.utils.simple_preprocess(x, deacc=True))

# 4. Remove stopwords
stopwords = set(nltk_stopwords.words('english'))
preprocessed_df['tokens'] = preprocessed_df['tokens'].apply(lambda x: [word for word in x if word not in stopwords])

# 5. Lemmatize the tokens
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
preprocessed_df['lemmatized_tokens'] = preprocessed_df['tokens'].apply(lambda x: [token.lemma_ for token in nlp(" ".join(x)) if token.pos_ in ["NOUN", "ADJ", "VERB", "ADV"]])

# 6. Remove tokens that are less than 3 characters long
preprocessed_df['lemmatized_tokens'] = preprocessed_df['lemmatized_tokens'].apply(lambda x: [word for word in x if len(word) >= 3])

# 7. Remove rows where the list length in lemmatized_tokens is 0
preprocessed_df = preprocessed_df[preprocessed_df['lemmatized_tokens'].apply(len) > 0]

#  Tested each step with a few preprocessed_df.iloc[6]


In [28]:
# Display the preprocessed dataframe
preprocessed_df.head()


Unnamed: 0.1,Unnamed: 0,start,end,text,words,speaker,cleaned_text,tokens,lemmatized_tokens
1,1,23.108,24.029,Thank you very much.,"[{'word': 'Thank', 'start': 23.108, 'end': 23....",Clarkson,thank you very much,"[thank, much]","[thank, much]"
2,2,24.049,34.256,"Now, as you know, the producers on this show l...","[{'word': 'Now,', 'start': 24.049, 'end': 24.1...",Clarkson,now as you know the producers on this show lik...,"[know, producers, show, like, give, us, challe...","[know, producer, show, give, challenge, specif..."
3,3,34.597,39.64,Then they set unbelievably hard tasks to do to...,"[{'word': 'Then', 'start': 34.597, 'end': 34.7...",Clarkson,then they set unbelievably hard tasks to do to...,"[set, unbelievably, hard, tasks, see, one, us,...","[set, unbelievably, hard, task, see, get, good..."
5,5,40.541,44.043,"This week, for a Top Gear special, they came u...","[{'word': 'This', 'start': 40.541, 'end': 40.7...",Hammond,this week for a top gear special they came up ...,"[week, top, gear, special, came, real, humdinger]","[week, top, gear, special, come, real, humdinger]"
6,6,44.423,50.408,"They gave each of us 1,500 quid and told us to...","[{'word': 'They', 'start': 44.423, 'end': 44.5...",Hammond,they gave each of us quid and told us to go t...,"[gave, us, quid, told, us, go, africa, buy, car]","[give, quid, tell, buy, car]"


### Export the Pre Processed DF to CSV

* removed emails
* converted to lower
* remove stop words
* tokenized and lemmatize
* remove tokens <3 char
* remove null dictionaries (rows)

In [11]:

preprocessed_df.to_csv("./data/03.preprocess_completed.csv")

### Split the data frame into 3 (one per presenter)

In [29]:

# Create a new DataFrame for each speaker
May_df = preprocessed_df[preprocessed_df['speaker'] == 'May']
Clarkson_df = preprocessed_df[preprocessed_df['speaker'] == 'Clarkson']
Hammond_df = preprocessed_df[preprocessed_df['speaker'] == 'Hammond']

# Display the first few rows of each DataFrame (optional)
# May_df.head()

Clarkson_df.head()

# Hammond_df.head()

Unnamed: 0.1,Unnamed: 0,start,end,text,words,speaker,cleaned_text,tokens,lemmatized_tokens
1,1,23.108,24.029,Thank you very much.,"[{'word': 'Thank', 'start': 23.108, 'end': 23....",Clarkson,thank you very much,"[thank, much]","[thank, much]"
2,2,24.049,34.256,"Now, as you know, the producers on this show l...","[{'word': 'Now,', 'start': 24.049, 'end': 24.1...",Clarkson,now as you know the producers on this show lik...,"[know, producers, show, like, give, us, challe...","[know, producer, show, give, challenge, specif..."
3,3,34.597,39.64,Then they set unbelievably hard tasks to do to...,"[{'word': 'Then', 'start': 34.597, 'end': 34.7...",Clarkson,then they set unbelievably hard tasks to do to...,"[set, unbelievably, hard, tasks, see, one, us,...","[set, unbelievably, hard, task, see, get, good..."
16,16,100.86,102.06,It is a Lancia Beta.,"[{'word': 'It', 'start': 100.86, 'end': 100.96...",Clarkson,it is a lancia beta,"[lancia, beta]",[beta]
17,17,102.22,102.44,"Coupé, 1981.","[{'word': 'Coupé,', 'start': 102.22, 'end': 10...",Clarkson,coup,[coup],[coup]


### Convert the DF to a LIST

In [35]:
Clarkson_cleaned_texts = Clarkson_df["lemmatized_tokens"].to_list()
May_cleaned_texts = May_df["lemmatized_tokens"].to_list()
Hammond_cleaned_texts = Hammond_df["lemmatized_tokens"].to_list()	


print(Hammond_cleaned_texts)

[['week', 'top', 'gear', 'special', 'come', 'real', 'humdinger'], ['give', 'quid', 'tell', 'buy', 'car'], ['hell', 'man'], ['much', 'well', 'nick'], ['much', 'change', 'buy', 'many', 'bean'], ['much', 'simple', 'get'], ['get', 'move', 'part', 'year'], ['horsepower'], ['sport', 'version', 'want', 'lairy'], ['bit', 'thick'], ['sorry'], ['think', 'car', 'inspire', 'latterly', 'build', 'become', 'rubbish'], ['course', 'indirectly'], ['quite', 'lot', 'reasonably', 'average', 'car', 'owe'], ['brake', 'terrible', 'work', 'wheel'], ['work', 'well', 'wheel', 'wheel'], ['overtake', 'truck'], ['pull', 'slipstream'], ['really', 'come'], ['still', 'truck'], ['still', 'truck'], ['happy', 'car', 'world'], ['call', 'oliver'], ['ever', 'name', 'car', 'top'], ['wish', 'say'], ['know'], ['horn'], ['oliver', 'get', 'cold'], ['listen'], ['fun', 'discover', 'travel', 'bill'], ['take', 'look', 'car'], ['massive'], ['knock', 'oliver'], ['knock', 'car'], ['fine', 'fine', 'call'], ['oliver', 'think', 'talk'], [

## Step 3 - Create the Corpora, Dictionary & LDA

There are 3 data frames, so create ensure consistent and efficient treatment, create a function to pass each df through the same series of steps. These include: 

1. create the dictionary of terms
2. create the corpus (count of each dictionary term)
3. create the LDA Model
4. run the LDA model

In [36]:
def create_corpora_and_LDA(data_sets, names, num_topics = 5):
    results_dict = {}
    for data, name in zip(data_sets, names):
        # STEP 1 - Create dictionary
        id2word = corpora.Dictionary(data)
        print(f"Dictionary for {name}:")
        print(id2word)

        # STEP 2 - Create Corpus
        texts = data

        # STEP 3 - Term Document Frequency
        corpus = [id2word.doc2bow(text) for text in texts]

        print(f"Corpus for {name}:")
        print(corpus)

        # STEP 4- Create LDA Model
        lda_model = gensim.models.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=num_topics,
                                           random_state=100,
                                           chunksize=200,
                                           passes=10,
                                           per_word_topics=True)

        print(f"LDA Model for {name}:")
        print(lda_model)

        # STEP 5 - Store in dictionary
        results_dict[name] = {
            'dictionary': id2word,
            'corpus': corpus,
            'lda_model': lda_model
        }

    return results_dict

# RESULTS_DICTIONARY - Create a dictionary of the data sets, their respective names, and the info created from the above function. 

data_sets = [May_cleaned_texts, Clarkson_cleaned_texts, Hammond_cleaned_texts]
names = ['May', 'Clarkson', 'Hammond']
results_dict = create_corpora_and_LDA(data_sets, names)

Dictionary for May:
Dictionary<379 unique tokens: ['condition', 'build', 'drive', 'fourwheel', 'offroad']...>
Corpus for May:
[[(0, 1)], [(1, 1), (2, 1), (3, 1), (4, 1), (5, 1)], [(6, 1), (7, 1), (8, 1), (9, 1), (10, 1)], [(11, 1), (12, 1)], [(13, 1), (14, 1)], [(15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1)], [(11, 1), (12, 1), (22, 1)], [(24, 1), (25, 1)], [(26, 1), (27, 1)], [(28, 1)], [(29, 1)], [(30, 1), (31, 1)], [(32, 1), (33, 1)], [(34, 1), (35, 1)], [(11, 1), (36, 1), (37, 1), (38, 1)], [(39, 1)], [(40, 1)], [(41, 1)], [(42, 1)], [(35, 1), (43, 1), (44, 1)], [(22, 1)], [(45, 2), (46, 1), (47, 1), (48, 1)], [(47, 1)], [(49, 1)], [(31, 1), (39, 1), (43, 2), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1), (58, 1)], [(59, 1), (60, 1), (61, 1), (62, 1)], [(21, 1), (25, 1), (56, 1), (63, 1), (64, 1), (65, 1), (66, 1)], [(67, 1), (68, 1), (69, 1)], [(70, 1), (71, 1), (72, 1), (73, 1), (74, 1), (75, 1)], [(76, 1), (77, 1), (78, 

In [37]:
#Print the keyword in the 5 topics
# 
import pprint

for name, result in results_dict.items():
    print(f"Results for {name}:")
    pprint.pprint(result['lda_model'].print_topics())


# print(lda_model.print_topics())
# doc_lda = lda_model[corpus]

#We created 5 topics. You can see the keywords for each topic and the weightage(importance) of each keyword using lda_model.print_topics() as shown below:

Results for May:
[(0,
  '0.018*"next" + 0.018*"car" + 0.018*"hang" + 0.013*"drive" + 0.013*"good" + '
  '0.013*"mercede" + 0.013*"smell" + 0.013*"probably" + 0.013*"delighted" + '
  '0.013*"ruin"'),
 (1,
  '0.022*"look" + 0.017*"water" + 0.017*"point" + 0.012*"mean" + 0.012*"soon" '
  '+ 0.012*"gay" + 0.012*"brilliantly" + 0.012*"rougher" + 0.012*"near" + '
  '0.012*"car"'),
 (2,
  '0.023*"see" + 0.023*"get" + 0.023*"work" + 0.017*"car" + 0.012*"leave" + '
  '0.012*"think" + 0.012*"bit" + 0.012*"long" + 0.012*"even" + 0.012*"pull"'),
 (3,
  '0.038*"car" + 0.038*"know" + 0.017*"need" + 0.017*"cow" + 0.017*"hammond" + '
  '0.012*"work" + 0.012*"want" + 0.012*"badger" + 0.012*"honey" + '
  '0.012*"stopwatch"'),
 (4,
  '0.038*"get" + 0.033*"come" + 0.020*"really" + 0.015*"work" + 0.015*"black" '
  '+ 0.015*"take" + 0.015*"engine" + 0.010*"lift" + 0.010*"right" + '
  '0.010*"snake"')]
Results for Clarkson:
[(0,
  '0.084*"get" + 0.046*"true" + 0.039*"look" + 0.032*"beetle" + 0.031*"even" + '

## Visualize LDA Topic Models for each Presenter

### Create a Visualize function

In [38]:
import pyLDAvis.gensim
pyLDAvis.enable_notebook()

def visualize_lda_model(results_dict, dataset_name):
    # Access the dictionary, corpus, and LDA model for the specified dataset
    dictionary = results_dict[dataset_name]['dictionary']
    corpus = results_dict[dataset_name]['corpus']
    lda_model = results_dict[dataset_name]['lda_model']

    # Prepare the visualization
    vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary, mds='mmds', R=30)

    # Display the visualization
    return vis


### Call the LDA Visualization for a given Presenter

In [39]:
# Visualize the LDA model for 'X'
vis = visualize_lda_model(results_dict, 'Clarkson')
vis

### HTML for Presenters together

In [None]:
# Extract the Variables from the results dictionary and declare new variables (copy n paste method)
'''

# Extract the variables for May
lda_model_May = results_dict['May']['lda_model']
corpus_May = results_dict['May']['corpus']
dictionary_May = results_dict['May']['dictionary']

lda_model_Hammond = results_dict['Hammond']['lda_model']
corpus_Hammond = results_dict['Hammond']['corpus']
dictionary_Hammond = results_dict['Hammond']['dictionary']

# Similarly, you can extract and visualize for Clarkson and Hammond
lda_model_Clarkson = results_dict['Clarkson']['lda_model']
corpus_Clarkson = results_dict['Clarkson']['corpus']
dictionary_Clarkson = results_dict['Clarkson']['dictionary']

'''

### Function to generate and save PyLDAvis visualization

In [53]:
def generate_and_save_vis(results_dict, name):
    lda_model = results_dict[name]['lda_model']
    corpus = results_dict[name]['corpus']
    dictionary = results_dict[name]['dictionary']
    vis = gensimvis.prepare(lda_model, corpus, dictionary)
    pyLDAvis.save_html(vis, f'./outputs/vis_{name}.html')

# Generate and save visualizations for each dataset
for name in names:
    generate_and_save_vis(results_dict, name)

# Combine the HTML files into a single HTML file
with open('combined_vis.html', 'w') as outfile:
    for name in names:
        with open(f'./outputs/vis_{name}.html') as infile:
            outfile.write(infile.read())

In [54]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Assuming lda_model_May, lda_model_Clarkson, and lda_model_Hammond are your LDA models
# and corpus_May, corpus_Clarkson, corpus_Hammond are your corpora
# and dictionary_May, dictionary_Clarkson, dictionary_Hammond are your dictionaries

# Generate PyLDAvis visualizations
vis_May = gensimvis.prepare(lda_model_May, corpus_May, dictionary_May)
vis_Clarkson = gensimvis.prepare(lda_model_Clarkson, corpus_Clarkson, dictionary_Clarkson)
vis_Hammond = gensimvis.prepare(lda_model_Hammond, corpus_Hammond, dictionary_Hammond)

# Save each visualization as an HTML file
pyLDAvis.save_html(vis_May, './outputs/vis_May.html')
pyLDAvis.save_html(vis_Clarkson, './outputs/vis_Clarkson.html')
pyLDAvis.save_html(vis_Hammond, './outputs/vis_Hammond.html')

# Combine the HTML files into a single HTML file
with open('./outputs/combined_vis.html', 'w') as outfile:
    for fname in ['./outputs/vis_May.html', './outputs/vis_Clarkson.html', './outputs/vis_Hammond.html']:
        with open(fname) as infile:
            outfile.write(infile.read())

# Now you can open 'combined_vis.html' to see all three visualizations together

# Part 5: Evaluation - Coherence 

Topic coherence measures the average similarity between top words having the highest weights in a topic i.e relative distance between the top words.


In [43]:


for name in results_dict.keys():
	lda_model = results_dict[name]['lda_model']
	texts = eval(f"{name}_cleaned_texts")
	dictionary = results_dict[name]['dictionary']
	
	coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')
	coherence_lda = coherence_model_lda.get_coherence()
	print(f'Coherence Score for {name}: ', coherence_lda)



Coherence Score for May:  0.6808374806112967
Coherence Score for Clarkson:  0.5909402916987949
Coherence Score for Hammond:  0.6659895742797534


# Part 6: Model Improvement - How many topics? 

## Define function to Iterate Coherence: 

In [44]:

# Define the function to iterate
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=1):
	coherence_values = []
	model_list = []
	
	for name in names:
		dictionary = results_dict[name]['dictionary']
		corpus = results_dict[name]['corpus']
		texts = eval(f"{name}_cleaned_texts")
		
		for num_topics in range(start, limit, step):
			model = gensim.models.LdaModel(corpus=corpus, num_topics=num_topics, random_state=100, chunksize=200, passes=10, per_word_topics=True, id2word=dictionary)
			model_list.append(model)
			coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
			coherence_values.append(coherencemodel.get_coherence())
	
	return model_list, coherence_values


In [45]:

# Call the function and print the results for each name in results_dict
limit = 10
start = 2
step = 1

for name in names:
	print(f"Results for {name}:")
	model_list, coherence_values = compute_coherence_values(results_dict[name]['dictionary'], results_dict[name]['corpus'], eval(f"{name}_cleaned_texts"), limit, start, step)
	for m, cv in zip(range(start, limit, step), coherence_values):
		print(f"Num Topics = {m}, Coherence Value = {cv}")


Results for May:
Num Topics = 2, Coherence Value = 0.7166129819455946
Num Topics = 3, Coherence Value = 0.7157485205762296
Num Topics = 4, Coherence Value = 0.6874794865478038
Num Topics = 5, Coherence Value = 0.6808374806112967
Num Topics = 6, Coherence Value = 0.645575251871593
Num Topics = 7, Coherence Value = 0.584938584949688
Num Topics = 8, Coherence Value = 0.5339173988933685
Num Topics = 9, Coherence Value = 0.5369284102274318
Results for Clarkson:
Num Topics = 2, Coherence Value = 0.7166129819455946
Num Topics = 3, Coherence Value = 0.7157485205762296
Num Topics = 4, Coherence Value = 0.6874794865478038
Num Topics = 5, Coherence Value = 0.6808374806112967
Num Topics = 6, Coherence Value = 0.645575251871593
Num Topics = 7, Coherence Value = 0.584938584949688
Num Topics = 8, Coherence Value = 0.5339173988933685
Num Topics = 9, Coherence Value = 0.5369284102274318
Results for Hammond:
Num Topics = 2, Coherence Value = 0.7166129819455946
Num Topics = 3, Coherence Value = 0.7157485