# VERSION 1 - ARCHIVED

In [None]:
import ffmpeg

## 1 - Convert Mp3 to WAV.

def convert_m4a_to_mp3(input_file, output_file):
    """Transcode one audio file into another by invoking ffmpeg.

    The name is historical: no codec or container options are passed, so the
    conversion is whatever ffmpeg does by default for the two paths given (this
    notebook feeds it an .mp3 input and a .wav output). An existing
    ``output_file`` is overwritten.

    Args:
        input_file: Path of the audio file to read.
        output_file: Path of the audio file to write.

    Returns:
        None. Success or failure is reported on stdout; ``ffmpeg.Error`` is
        caught rather than propagated, so check that ``output_file`` exists
        before relying on it.
    """
    try:
        ffmpeg.input(input_file).output(output_file).run(overwrite_output=True)
    except ffmpeg.Error as e:
        print("An error occurred:", e)
        # str(e) is only a generic summary; if ffmpeg's stderr was captured it
        # holds the actual diagnostics, so surface it as well.
        stderr = getattr(e, "stderr", None)
        if stderr:
            print(stderr.decode("utf-8", errors="replace") if isinstance(stderr, bytes) else stderr)
    else:
        print(f"Successfully converted {input_file} to {output_file}")

# Paths for this run: the source MP3 and the WAV file the rest of the pipeline reads
input_mp3 = './audio/Botswana_2007_Audio.mp3'  # Change this to your mp3 file path
output_wav = './data/Botswana_2007_Audio.wav'  # Change this to your desired output wav file path

# Keyword arguments make the direction of the conversion explicit
convert_m4a_to_mp3(input_file=input_mp3, output_file=output_wav)

## Load the Audio File

In [None]:
import whisperx
import gc
import os
import torch

device = "cuda"
## Full file should be the input (2007 or 2024 file..)
audio_file = "./data/Botswana_2007_Audio.wav"

## DEBUGGING, use a small file
# audio_file = "./data/Intro.wav"

batch_size = 16 # reduce if low on GPU mem
compute_type = "float16" # change to "int8" if low on GPU mem (may reduce accuracy)
# NOTE(review): this is the string 'True', not the boolean, and no visible cell
# reads it — confirm whether it is still needed.
without_timestamps= 'True'

## Sanity check: report whether the audio file is where we expect it.
# The check is intentionally non-fatal (it only prints), exactly as before; it
# no longer raises an exception just to catch it two lines later.
if os.path.isfile(audio_file):
    # Optionally, you can add more checks (like file format) here
    print(f"Successfully accessed the audio file: {audio_file}")
else:
    print(f"The file '{audio_file}' does not exist.")

# A) Transcribe and Diarize 

## SINGLE FILE

For large files, i.e. the complete program, the system crashed because it ran out of memory. The alternative approach is to break the audio file into smaller clips and iterate through them. See the other notebook.

In [None]:
import torch #if not already done
import gc

# Check if CUDA is available
device = "cuda" if torch.cuda.is_available() else "cpu"

print(f"TRANSCRIBING Using device: {device}")

# float16 is a GPU compute type; fall back the same way the batch cell does
# so this cell also runs on a CPU-only machine.
if device == "cpu" and compute_type == "float16":
    compute_type = "float32"

# 1. Transcribe with original whisper (batched)
# Load the model exactly once. download_root keeps the weights under ./model/
# so later runs reuse the local copy. (Previously the model was loaded twice,
# once without and once with download_root.)
model_dir = "./model/"
model = whisperx.load_model("large-v2", device, compute_type=compute_type, download_root=model_dir)

audio = whisperx.load_audio(audio_file)
result = model.transcribe(audio, batch_size=batch_size)
print(result["segments"]) # before alignment

# delete model if low on GPU resources
# Order matters: drop the reference first, then collect and flush the CUDA
# cache; collecting while `model` is still referenced frees nothing.
del model
gc.collect()
torch.cuda.empty_cache()



In [None]:
## SAVE the TRANSCRIPT
# Persist the raw (pre-alignment) transcription so later stages can reload it
import json

with open('./outputs/Transcription_result.json', mode='w') as json_file:
    json_file.write(json.dumps(result, indent=4))



In [None]:
## DEBUGGING: Import the Transcript back in notebook

# Read the saved transcription back from disk
with open('./outputs/Transcription_result.json', 'r') as file:
    data = json.loads(file.read())

# Keep only the list stored under "segments" and show it
Transcription_reimported = data["segments"]
print(Transcription_reimported)

## BATCH

### Split the Audio file into smaller pieces


In [None]:
## 3 - Split up large files in <10min


import sqlite3
import librosa
import soundfile as sf
import math


# TODO: change max duration to 300 seconds
# TODO: update the target database folder
# TODO: check input filenames
# TODO: Update the output folder

# Function to split audio and save to database
def split_audio(audio_file, max_duration=300):  # 60second (1min) for testing; 300sec for production
    """Split an audio file into consecutive clips and index them in SQLite.

    The audio is loaded with ``librosa.load`` and cut into windows of
    ``max_duration`` seconds (the last one may be shorter). Clip ``i`` is written
    to ``./data/Botswana2007_clip_{i}.wav`` and a row with its start time, end
    time and filename is inserted into the ``clips`` table of
    ``./data/Audio_clips.db`` (the table is created if it does not exist).

    Args:
        audio_file: Path of the audio file to split.
        max_duration: Maximum clip length in seconds.

    Returns:
        A list with one dict per clip that was written successfully, holding
        the keys ``start_time``, ``end_time`` and ``filename``. An empty list
        is returned when the audio cannot be loaded. Clips that fail to write
        are reported on stdout and skipped.
    """
    conn = sqlite3.connect('./data/Audio_clips.db')
    # try/finally so the connection is released on every exit path, including
    # the early return below and any unexpected error while splitting.
    try:
        cursor = conn.cursor()
        cursor.execute('''CREATE TABLE IF NOT EXISTS clips
                     (id INTEGER PRIMARY KEY AUTOINCREMENT, start_time REAL, end_time REAL, filename TEXT)''')

        try:
            # NOTE(review): no `sr` is passed, so librosa's default load
            # settings decide the sample rate of the clips — confirm intended.
            y, sr = librosa.load(audio_file)
        except Exception as e:
            print(f"Error loading audio file: {e}")
            return []

        total_duration = librosa.get_duration(y=y, sr=sr)
        num_splits = math.ceil(total_duration / max_duration)
        results = []

        for i in range(num_splits):
            start_time = i * max_duration
            # Clamp the final window to the end of the recording
            end_time = min((i + 1) * max_duration, total_duration)

            start_sample = int(start_time * sr)
            end_sample = int(end_time * sr)

            clip = y[start_sample:end_sample]
            filename = f"./data/Botswana2007_clip_{i}.wav"

            try:
                sf.write(filename, clip, sr)
                cursor.execute("INSERT INTO clips (start_time, end_time, filename) VALUES (?, ?, ?)",
                               (start_time, end_time, filename))
                # Commit per clip so earlier rows survive a later failure
                conn.commit()
                results.append({"start_time": start_time, "end_time": end_time, "filename": filename})
            except Exception as e:
                print(f"Error processing clip {i}: {e}")

        return results
    finally:
        conn.close()
# results is a list of dictionaries: one {"start_time", "end_time", "filename"} entry per clip written
results = split_audio(audio_file)

### Iterate through the folder containing the wav clips.
Save each transcript as a JSON file. The JSON files then need to be consolidated.

In [None]:
import os
import glob
import json
import gc
import torch
import whisperx

# TODO: Update the folder for wav files.

import re  # only needed for the natural sort key below

# Directory containing .wav files
wav_directory = './data/Testing'

# Get a list of all .wav files in the directory. glob returns them in
# arbitrary order, so sort "naturally" (clip_2 before clip_10) to keep
# results_full in clip order.
wav_files = sorted(
    glob.glob(os.path.join(wav_directory, '*.wav')),
    key=lambda path: [int(part) if part.isdecimal() else part
                      for part in re.split(r'(\d+)', os.path.basename(path))],
)

# Initialize results_full list
results_full = []

# Set device and compute type
device = "cuda" if torch.cuda.is_available() else "cpu"
compute_type = "float16" if device == "cuda" else "float32"

print(f"TRANSCRIBING Using device: {device}")
print(f"Computer type is {compute_type}")

# Ensure the model directory and the transcript output directory exist
model_dir = "./model/"
os.makedirs(model_dir, exist_ok=True)
os.makedirs('./outputs', exist_ok=True)

# Load the model ONCE for the whole batch; reloading large-v2 for every clip
# only costs time. If it cannot be loaded no clip can be processed, so the
# error is reported and re-raised.
model = None
if wav_files:
    try:
        model = whisperx.load_model("large-v2", device=device, compute_type=compute_type, download_root=model_dir)
    except Exception as e:
        print(f"Error loading model: {e}")
        raise

# Iterate through each .wav file and process it
for wav_file in wav_files:
    print(f"Processing file: {wav_file}")
    try:
        # Load the audio file
        audio = whisperx.load_audio(wav_file)

        # Transcribe the audio file
        result = model.transcribe(audio, batch_size=batch_size)
        print(result["segments"])  # before alignment

        # Append the result to results_full
        results_full.append(result)

        # Save the transcription result to a JSON file named after the clip
        transcript_filename = os.path.splitext(os.path.basename(wav_file))[0]
        with open(f'./outputs/{transcript_filename}_transcript.json', 'w') as json_file:
            json.dump(result, json_file, indent=4)

    except Exception as e:
        # Best effort: report the failing clip and carry on with the next one
        print(f"Error processing file {wav_file}: {e}")
    finally:
        # Release per-clip buffers whether or not the clip succeeded
        gc.collect()
        torch.cuda.empty_cache()

# Free the model once the whole batch is done (reference first, then caches)
del model
gc.collect()
torch.cuda.empty_cache()

# Print the full results
print(results_full)

### Consolidate the JSON files. 

The "segments" entry of each file needs to be extracted and consolidated into a new file in order to be used for downstream tasks.

In [None]:
import os
import json
import glob

#  TODO: Update the output folder for json files.

import re  # only needed for the natural sort key below

# Directory containing the JSON files
json_directory = './outputs/Testing/'

# Get a list of all JSON files in the directory. glob order is arbitrary, so
# sort "naturally" (clip_2 before clip_10); otherwise the merged segments are
# not in clip order.
json_files = sorted(
    glob.glob(os.path.join(json_directory, '*.json')),
    key=lambda path: [int(part) if part.isdecimal() else part
                      for part in re.split(r'(\d+)', os.path.basename(path))],
)

# Initialize a list to hold all segments
all_segments = []

# Iterate through each JSON file and merge segments
# NOTE(review): segments are appended unchanged; if each transcript's
# timestamps are relative to the start of its own clip, they would need the
# clip offset added here — confirm against the downstream consumers.
for json_file in json_files:
    with open(json_file, 'r') as file:
        data = json.load(file)
    # Files without a "segments" key contribute nothing
    all_segments.extend(data.get("segments", []))

# Create a combined dictionary
combined_data = {
    "segments": all_segments,
    "language": "en"  # Assuming all segments are in English
}

# Save the combined dictionary to a new JSON file
with open('./outputs/Transcription_result_combined.json', 'w') as json_file:
    json.dump(combined_data, json_file, indent=4)

# Print the combined data to verify
print(json.dumps(combined_data, indent=4))

### Import the consolidated Transcript JSON (if required)

In [None]:
import json

# Location of the consolidated transcript written by the previous step
json_file_path = './outputs/Transcription_result_combined.json'

# Parse the whole file into a dict
with open(json_file_path, 'r') as file:
    data = json.loads(file.read())

# Pull out the merged segment list and show it
results_consolidate = data["segments"]
print(results_consolidate)

# B) Alignment

## Alignment with Single File 

In [None]:
# 2. Align whisper output

## NOTE: AUDIO is declared in the Transcribe section above. 

import whisperx
import gc

# Check if CUDA is available and set the device accordingly
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"ALIGNING Using device: {device}")

# Load the alignment model with the specified device
# NOTE(review): `result` is overwritten below with the aligned output; if that
# output carries no "language" key, re-running this cell without re-running
# the transcription cell first would fail on this line — confirm.
model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)

# Perform alignment using the specified device
result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)

print(result["segments"]) # after alignment

# delete model if low on GPU resources
# Drop the reference first, then collect and flush the CUDA cache; doing it
# the other way round frees nothing.
del model_a
gc.collect()
torch.cuda.empty_cache()

## SAVE the TRANSCRIPT
import json
with open('./outputs/Alignment_result_single.json', 'w') as json_file:
    json.dump(result, json_file, indent=4)


## C) Diarization - Speaker Labels
### SINGLE file

In [None]:
# 3. Assign speaker labels NOTE: this takes a long time on local machine even with GPU

from HF_token import TOKEN_ID

# Check if CUDA is available and set the device accordingly
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"LABELS Using device: {device}")

diarize_model = whisperx.DiarizationPipeline(use_auth_token=TOKEN_ID, device=device)

# Load the audio data
# NOTE(review): audio_data is not passed to anything in this cell; it is kept
# only in case another cell reads it.
audio_data = {
    'waveform': torch.from_numpy(audio[None, :]),
    'sample_rate': whisperx.audio.SAMPLE_RATE
}

# Run the diarization model (once: this is the slow step)
diarize_segments = diarize_model(audio)

# add min/max number of speakers if known, by replacing the call above with:
# diarize_segments = diarize_model(audio, min_speakers=1, max_speakers=3)
# (Previously this second call was executed as well and its result thrown
# away, doubling the run time without affecting the labels.)

# Assign speaker labels to words
result = whisperx.assign_word_speakers(diarize_segments, result)

## SAVE the TRANSCRIPT
import json
with open('./outputs/Labels_result.json', 'w') as json_file:
    json.dump(result, json_file, indent=4)



In [None]:

# Show the raw diarization output, then the transcript segments it was merged into
print(diarize_segments)
print(result["segments"]) # segments are now assigned speaker IDs

In [None]:
import pandas as pd

# `result` is whatever the most recent pipeline cell left behind; the
# diarization cell reassigns it after attaching speaker labels. For reference,
# the alignment step that produced the segments was:
# result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)

# Extracting the list of segments
segments = result["segments"]

# Creating a DataFrame from the segments
align_df = pd.DataFrame(segments)

# Show the DataFrame
align_df.head()

In [None]:
import pandas as pd

# Speaker label -> presenter name. These three names are not read by any
# other visible cell.
# NOTE(review): a later cell renames 'SPEAKER_00' (not 'SPEAKER_03') to
# 'Hammond' — confirm which diarization label actually belongs to Hammond.
SPEAKER_02 = "Clarkson"
SPEAKER_01 = "May"
SPEAKER_03 = "Hammond"



In [None]:
# Inspect only the rows whose 'speaker' value is SPEAKER_01
Speaker01_df = align_df[align_df['speaker']== 'SPEAKER_01']
Speaker01_df

In [None]:
# Swap the diarizer's anonymous labels for presenter names in a single pass
speaker_names = {
    'SPEAKER_01': 'May',
    'SPEAKER_02': 'Clarkson',
    'SPEAKER_00': 'Hammond',
}
align_df['speaker'] = align_df['speaker'].replace(speaker_names)

align_df.head(100)



In [None]:
# Export df to csv
# index=False: align_df is built straight from a list of segments, so its
# index is just a row counter. Writing it makes the later read_csv grow a
# spurious "Unnamed: 0" column.
align_df.to_csv("Aligned_Audio.csv", index=False)

## Milestone: Completed transcription, diarize, and label the audio file

Next:
* Separate the align_df into three, one for each speaker.
* Perform LDA on each data frame for the Presenters


In [None]:
# IMPORT DATA if required
#
# Reload the labelled transcript exported above, so the analysis below can
# start from the CSV without re-running transcription and diarization.
import pandas as pd

align_df= pd.read_csv('Aligned_Audio.csv')
align_df


In [None]:
# prompt: use align_df to create 3 new dataframes using the Speaker field

# Create a new DataFrame for each speaker
May_df = align_df[align_df['speaker'] == 'May']
Clarkson_df = align_df[align_df['speaker'] == 'Clarkson']
Hammond_df = align_df[align_df['speaker'] == 'Hammond']

# Display the first few rows of each DataFrame (optional).
# Only the LAST bare expression of a cell is rendered, so each preview is
# shown explicitly with display() (provided by the IPython kernel); before,
# the first two previews were silently dropped.
print("Speaker 01 [James] DataFrame:")
display(May_df.head())

print("\nSpeaker 02[Jeremy] DataFrame:")
display(Clarkson_df.head())

print("\nSpeaker 03 [Richard] DataFrame:")
display(Hammond_df.head())

In [None]:
import os
import pandas as pd
import json
import glob

import re  # only needed for the natural sort key below

# Directory containing the JSON files
json_directory = 'outputs/Testing/'

# Get a list of all JSON files in the directory. glob order is arbitrary, so
# sort "naturally" (clip_2 before clip_10) to keep the rows in clip order.
json_files = sorted(
    glob.glob(os.path.join(json_directory, '*.json')),
    key=lambda path: [int(part) if part.isdecimal() else part
                      for part in re.split(r'(\d+)', os.path.basename(path))],
)

# Initialize a list to hold all DataFrames
df_list = []

# Turn the "segments" list of every JSON file into its own DataFrame
for json_file in json_files:
    with open(json_file, 'r') as file:
        data = json.load(file)
    df = pd.DataFrame(data["segments"])
    df_list.append(df)

# Concatenate all DataFrames into a single DataFrame
# (pd.concat raises if the directory held no JSON files)
diarized_df = pd.concat(df_list, ignore_index=True)

# Display the consolidated DataFrame
diarized_df.head()

# Part 2: LDA (Latent Dirichlet Allocation) Preparation

### Import the previously created json/csv

In [None]:
import pandas as pd
import json

# Transcript of the short intro clip used for experiments
json_file_path = 'Intro_clip_0.json'

# Parse the file into a dict
with open(json_file_path, 'r') as file:
    data = json.loads(file.read())

# One DataFrame row per entry in the "segments" list
segment_records = data["segments"]
intro_clip_0_df = pd.DataFrame(segment_records)

# Preview
intro_clip_0_df.head()

In [None]:
import locale

_FORCED_ENCODING = "UTF-8"


def getpreferredencoding(do_setlocale=True):
    """Stand-in for ``locale.getpreferredencoding`` that always answers "UTF-8".

    ``do_setlocale`` is accepted only for signature compatibility and is ignored.
    """
    return _FORCED_ENCODING


# Monkey-patch the stdlib so any code asking for the preferred encoding gets UTF-8
locale.getpreferredencoding = getpreferredencoding

# !pip install nltk

In [None]:
## IMPORT all the libraries required

#https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/#1introduction
import numpy as np
import json
import glob
import re

#Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

#spacy
import spacy
from nltk.corpus import stopwords

#vis
import pyLDAvis
import pyLDAvis.gensim

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)


In [None]:
# 1. PREPROCESS
## Remove emails, newline char, stop words, and tokenize

# Reminder: these are the 3 data frames generated up to this point.
# May_df
# Clarkson_df
# Hammond_df

# Define the preprocessing function
def preprocess_text(data):
    """Lightly clean a list of transcript strings.

    Each string goes through these steps, in this order:
    1. e-mail-like tokens (a run of non-space characters around an ``@``) are deleted;
    2. newline and carriage-return characters are deleted;
    3. leading/trailing whitespace is trimmed;
    4. single quotes are deleted.

    Returns a new list with one cleaned string per input string, in the same order.
    """
    email_like = re.compile(r'\S+@\S+')

    def _clean(entry):
        without_email = email_like.sub('', entry)
        single_line = without_email.replace('\n', '').replace('\r', '')
        # Trim BEFORE removing quotes, so whitespace that sat inside a pair of
        # quotes is kept
        return single_line.strip().replace("'", "")

    return [_clean(entry) for entry in data]

# Convert each presenter's 'text' column to a plain list and run it through
# preprocess_text, giving one cleaned string per transcript row.

May_data = preprocess_text(May_df['text'].values.tolist())
Clarkson_data = preprocess_text(Clarkson_df['text'].values.tolist())
Hammond_data = preprocess_text(Hammond_df['text'].values.tolist())

# Spot-check one presenter (prints the full list)
print(May_data)
# print(Clarkson_data)
# print(Hammond_data)

## Tokenize (SINGLE PRESENTER)
Break each sentence/phrase into a list of words. 
This first pass is the POC. Skip to Part 4 after completing the POC.

### Define Tokenize function for each sentence

In [None]:
'''You need to break down each sentence into a list of words through tokenization
while clearing up all the messy text in the process. Gensim’s simple_preprocess is great for this.
I have set deacc=True to remove the punctuations.'''


def send_to_words(sentences):
    """Lazily tokenize sentences with gensim's ``simple_preprocess``.

    Each item is coerced to ``str`` first. Yields one token list per item, in
    input order.
    """
    for raw_sentence in sentences:
        tokens = gensim.utils.simple_preprocess(str(raw_sentence), deacc=True)
        # deacc=True strips accent marks from the tokens; punctuation is
        # already discarded by the tokenizer itself
        yield tokens


### Tokenize the Sentences for each Presenter

In [None]:

# Proof of concept on a single presenter: materialise the generator into a
# list of token lists (one per cleaned sentence).
May_words = list(send_to_words(May_data))
# Clarkson_words = list(send_to_words(Clarkson_data))
# Hammond_words = list(send_to_words(Hammond_data))

# Preview the first ten tokenized sentences
print(f"MAY words:{May_words[:10]}")
# print(f"CLARKSON words:{Clarkson_words[:10]}")
# print(f"HAMMOND words:{Hammond_words[:10]}")


## Stopwords from each of the 3 Presenter words

### Download the STOPWORDS

In [None]:
# Obtain the STOPWORDS
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
# NOTE: this rebinds the name `stopwords` from the imported nltk corpus object
# to the value returned by .words('english'). The import above must therefore
# run again (it does, being in the same cell) before this line can be re-run.
stopwords = stopwords.words('english')
# stopwords.extend(['from', 'subject', 're', 'edu', 'use'])

In [None]:
# Show the English stop-word collection loaded in the previous cell
print(stopwords)

### Define Function to Remove STOPWORDS

In [None]:
def remove_stopwords(texts):
    """Re-tokenize every document and drop tokens found in ``stopwords``.

    Each document is stringified with ``str()`` and passed through
    ``simple_preprocess`` before filtering, so a document may be a string or an
    already-tokenized list. Returns one filtered token list per document.
    """
    filtered_docs = []
    for doc in texts:
        kept = []
        for word in simple_preprocess(str(doc)):
            if word not in stopwords:
                kept.append(word)
        filtered_docs.append(kept)
    return filtered_docs


### Remove Stopwords from each DF

In [None]:
# Remove the Stop Words from May's tokenized sentences (proof of concept; the
# other presenters are handled in Part 4)
May_data_words_nostops = remove_stopwords(May_words)
# Clarkson_data_words_nostops = remove_stopwords(Clarkson_words)
# Hammond_data_words_nostops = remove_stopwords(Hammond_words)

In [None]:
# Label, then let the notebook render the full filtered token lists
print(f"May_data_words_nostops:")
May_data_words_nostops
# print(Clarkson_data_words_nostops[:10])
# print(Hammond_data_words_nostops)

## Lemmatization - Stemming

### Define Lemmatization Function

Note: this became more complex than anticipated given that each record in the data frame is a list, and became a nested list. Lemmatize function assumed a single level list.

Question for consideration: Can all the dialog from each presenter be consolidated into a single list? Is there additional detail necessary or superfluous?

From cmd line run:  

`python -m spacy download en_core_web_sm`

Then you can load it into VSC

In [None]:
## Lemmatization function works for a single list:

## The LHL Advanced version of the single list fucntion:

'''
import spacy
import en_core_web_sm

nlp = en_core_web_sm.load()

def lemmatization(texts, allowed_postags=['NOUN']):
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out


## A Basic syntax for a single list.
'''
def lemmatization(texts, allowed_postags=["NOUN", "ADJ", "VERB", "ADV"]):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Each document (a list of tokens) is joined with spaces and run through the
    spaCy ``en_core_web_sm`` pipeline (parser and NER disabled). The lemma of
    every token whose ``pos_`` is in ``allowed_postags`` is kept.

    The pipeline is loaded on every call. ``allowed_postags`` is only read,
    never mutated.

    Returns a list with one list of lemmas per input document (possibly empty).
    """
    pipeline = spacy.load("en_core_web_sm", disable=["parser", "ner"])
    lemmatized_docs = []
    for tokens in texts:
        parsed = pipeline(" ".join(tokens))
        kept_lemmas = []
        for token in parsed:
            if token.pos_ in allowed_postags:
                kept_lemmas.append(token.lemma_)
        lemmatized_docs.append(kept_lemmas)
    return lemmatized_docs




IF: Audio transcripts are in NESTED lists. Therefore must iterate through each.

```python
def lemmatization(texts, allowed_postags=["NOUN", "ADJ", "VERB", "ADV"]):
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

    def process_text(text):
        doc = nlp(text)
        new_text = [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        return " ".join(new_text)

    def process_nested_list(nested_texts):
        if isinstance(nested_texts, list):
            return [process_nested_list(item) for item in nested_texts]
        else:
            return process_text(nested_texts)

    return process_nested_list(texts)
```


### Lemmatize the data frames

In [None]:
# Lemmatize May's stop-word-free token lists (proof of concept for one presenter)
lemmatized_May = lemmatization(May_data_words_nostops)
# lemmatized_Clarkson = lemmatization(Clarkson_data_words_nostops)
# lemmatized_Hammond = lemmatization(Hammond_data_words_nostops)

# Render the result
lemmatized_May
# print(lemmatized_Clarkson)

In [None]:
## Eliminate the empty documents (sentences with no lemma left) from the lemmatized docs:

May_cleaned_texts = [doc for doc in lemmatized_May if len(doc) > 0]
# Clarkson_cleaned_texts = [doc for doc in lemmatized_Clarkson if len(doc) > 0]
# Hammond_cleaned_texts = [doc for doc in lemmatized_Hammond if len(doc) > 0]

# Render the result
May_cleaned_texts
# print(Clarkson_cleaned_texts)
# print(Hammond_cleaned_texts)


# Part 3: LDA and Visualization


Create a corpus for a SINGLE presenter. (MAY)

In [None]:
## Reminder: There are TWO Main inputs: DICTIONARY (id2word key:val pair for each word) and the Corpus (words and their count)

#Data sets
# May = lemmatized_May
# Hammond = lemmatized_Hammond
# Clarkson = lemmatized_Clarkson

## For later - to iterate through each data set.
# data_sets = [lemmatized_May, lemmatized_Clarkson, lemmatized_Hammond]

# NOTE(review): the dictionary and corpus below are built from lemmatized_May,
# not from May_cleaned_texts (the version with empty documents removed) —
# confirm which one is intended.

# Create dictionary
id2word_May = corpora.Dictionary(lemmatized_May)
print(id2word_May)

# Create Corpus
texts = lemmatized_May

# Term Document Frequency: one bag-of-words per document
corpus_May = [id2word_May.doc2bow(text) for text in texts]

print(f"Corpus for May: {corpus_May}")

Base Model for SINGLE Presenter

In [None]:
# Baseline LDA model for May: 5 topics, with a fixed random_state so reruns
# use the same seed value
lda_model_May = gensim.models.LdaModel(corpus=corpus_May,
                                            id2word=id2word_May,
                                            num_topics=5,
                                            random_state=100,
                                            chunksize=200,
                                            passes=10,
                                            per_word_topics=True)

In [None]:
# Show the topics found by the baseline model
print(lda_model_May.print_topics())

### Visualize Single Presenter (May)


In [None]:
# Render the interactive topic view inline in the notebook
# NOTE(review): this relies on the `pyLDAvis.gensim` submodule imported
# earlier — confirm it exists in the installed pyLDAvis version.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model_May, corpus_May, id2word_May, mds="mmds", R=30)
vis

# Part 4: Functions to Iterate ALL presenters 

### Step 1 - Tokenize

In [None]:
#Define the function to tokenize:

def send_to_words(sentences):
    """Generator yielding one gensim ``simple_preprocess`` token list per sentence.

    Items are stringified first. ``deacc=True`` strips accent marks from tokens.
    """
    yield from (
        gensim.utils.simple_preprocess(str(sentence), deacc=True)
        for sentence in sentences
    )

# Define the datasets: result key -> cleaned sentences of that presenter
datasets_words = dict(
    May_words=May_data,
    Clarkson_words=Clarkson_data,
    Hammond_words=Hammond_data,
)

# Tokenize every dataset, storing the token lists under the same key
results_words = {}
for name, data in datasets_words.items():
    results_words[name] = [tokens for tokens in send_to_words(data)]

# Extract the results into distinct variables
May_words, Clarkson_words, Hammond_words = (
    results_words[key] for key in ("May_words", "Clarkson_words", "Hammond_words")
)

# Print the results to verify
for presenter_tokens in (May_words, Clarkson_words, Hammond_words):
    print(presenter_tokens)

In [None]:
import gensim
from nltk.corpus.util import LazyCorpusLoader
from nltk.corpus.reader import PlaintextCorpusReader

# Define the function to tokenize
def send_to_words(sentences):
    """Yield, for every sentence, the token list from gensim's ``simple_preprocess``.

    Each sentence is converted with ``str()`` before tokenizing. ``deacc=True``
    strips accent marks from the tokens.
    """
    def _tokenize(sentence):
        return gensim.utils.simple_preprocess(str(sentence), deacc=True)

    yield from map(_tokenize, sentences)

# Ensure the data is fully loaded before processing
def ensure_iterable(data):
    """Materialise a ``LazyCorpusLoader`` as ``list(data.words())``; pass anything else through unchanged."""
    return list(data.words()) if isinstance(data, LazyCorpusLoader) else data

# Define the datasets: result key -> cleaned sentences of that presenter
datasets_words = dict(
    May_words=May_data,
    Clarkson_words=Clarkson_data,
    Hammond_words=Hammond_data,
)

# Tokenize each dataset (lazy corpora are materialised first) and keep the
# token lists under the same key
results_words = {}
for name, data in datasets_words.items():
    iterable_data = ensure_iterable(data)
    results_words[name] = [tokens for tokens in send_to_words(iterable_data)]

# Extract the results into distinct variables
May_words, Clarkson_words, Hammond_words = (
    results_words[key] for key in ("May_words", "Clarkson_words", "Hammond_words")
)

# Print the results to verify
for presenter_tokens in (May_words, Clarkson_words, Hammond_words):
    print(presenter_tokens)

# Check if the results are iterable
def check_iterable(data):
    """Return True when ``iter(data)`` succeeds and False when it raises TypeError."""
    try:
        iter(data)
    except TypeError:
        return False
    else:
        return True

# Report, per presenter, whether the tokenized result can be iterated
print(f"May_words is iterable: {check_iterable(May_words)}")
print(f"Clarkson_words is iterable: {check_iterable(Clarkson_words)}")
print(f"Hammond_words is iterable: {check_iterable(Hammond_words)}")

'''Explanation
Function Definition:

The send_to_words function tokenizes sentences and removes punctuation.

Ensure Data is Fully Loaded:
The ensure_iterable function checks if the data is a LazyCorpusLoader and converts it to a list if necessary.
Define Datasets:

The datasets_words dictionary stores the original datasets with their corresponding names as keys.
Apply Tokenization:

A loop iterates through the datasets_words dictionary, applies the send_to_words function to each dataset, and converts the generator to a list before storing the results in a new dictionary named results_words.
Extract Results:

The results are extracted from the results_words dictionary into distinct variables for each dataset.
Print Results:
The results are printed to verify that the tokenization has been applied correctly.
Check Iterability:

The check_iterable function checks if the results are iterable and prints the result.'''

### Step 2 - Remove Stopwords

In [None]:
import gensim
from nltk.corpus.util import LazyCorpusLoader
from nltk.corpus import stopwords as nltk_stopwords

# Define the stopwords list (held as a set; remove_stopwords below tests
# membership against it for every token)
stopwords = set(nltk_stopwords.words('english'))

# Define the function to remove stopwords
def remove_stopwords(texts):
    """Re-tokenize each document with gensim and drop tokens contained in ``stopwords``.

    Every document is stringified with ``str()`` before tokenizing. Returns one
    filtered token list per document, in input order.
    """
    def _without_stopwords(doc):
        tokens = gensim.utils.simple_preprocess(str(doc))
        return [token for token in tokens if token not in stopwords]

    return [_without_stopwords(doc) for doc in texts]

# Define the datasets: name of the output variable -> tokenized sentences of
# the presenter it will be computed from
datasets = {
    "May_data_words_nostops": May_words,
    "Clarkson_data_words_nostops": Clarkson_words,
    "Hammond_data_words_nostops": Hammond_words
}

# Ensure the data is fully loaded before processing
def ensure_iterable(data):
    """Turn a ``LazyCorpusLoader`` into ``list(data)``; return any other value untouched."""
    if not isinstance(data, LazyCorpusLoader):
        return data
    return list(data)

# Apply the function to each dataset and store the results in new variables
# NOTE: `results` and the loop variables `name` / `data` are notebook-level
# names, so this cell overwrites any earlier values bound to them.
results = {}
for name, data in datasets.items():
    # Debug output: what kind of object is being processed, plus a preview
    print(f"Processing {name}:")
    print(f"Type: {type(data)}")
    print(f"First 5 items: {data[:5] if isinstance(data, list) else 'Not a list'}")
    
    iterable_data = ensure_iterable(data)
    results[name] = remove_stopwords(iterable_data)

# Extract the results into distinct variables
May_data_words_nostops = results["May_data_words_nostops"]
Clarkson_data_words_nostops = results["Clarkson_data_words_nostops"]
Hammond_data_words_nostops = results["Hammond_data_words_nostops"]

# Print the first five filtered documents per presenter to verify
print("May_data_words_nostops:", May_data_words_nostops[:5])
print("Clarkson_data_words_nostops:", Clarkson_data_words_nostops[:5])
print("Hammond_data_words_nostops:", Hammond_data_words_nostops[:5])

In [None]:
def create_corpora_and_LDA(data_sets, names, num_topics = 5):
    """Build a gensim dictionary, bag-of-words corpus and LDA model per dataset.

    ``data_sets`` and ``names`` are paired positionally with ``zip``, so extra
    items in the longer one are ignored. The dictionary, corpus and model are
    printed as they are built.

    Args:
        data_sets: Sequence of datasets, each a list of token lists.
        names: Label for each dataset; used as the key of the returned dict.
        num_topics: Number of topics for every LDA model.

    Returns:
        Dict mapping each name to a dict with the keys ``'dictionary'``,
        ``'corpus'`` and ``'lda_model'``.
    """
    results_dict = {}
    for documents, label in zip(data_sets, names):
        # STEP 1 - dictionary
        id2word = corpora.Dictionary(documents)
        print(f"Dictionary for {label}:")
        print(id2word)

        # STEP 2/3 - corpus: one bag-of-words per document
        corpus = []
        for document in documents:
            corpus.append(id2word.doc2bow(document))

        print(f"Corpus for {label}:")
        print(corpus)

        # STEP 4 - LDA model; the training settings are the same for every dataset
        lda_settings = dict(
            num_topics=num_topics,
            random_state=100,
            chunksize=200,
            passes=10,
            per_word_topics=True,
        )
        lda_model = gensim.models.LdaModel(corpus=corpus, id2word=id2word, **lda_settings)

        print(f"LDA Model for {label}:")
        print(lda_model)

        # STEP 5 - collect everything under the dataset's label
        results_dict[label] = dict(dictionary=id2word, corpus=corpus, lda_model=lda_model)

    return results_dict

# Example usage
# Lemmatize every presenter's stop-word-free tokens first. Up to this point
# only lemmatized_May had been computed (the Clarkson/Hammond lines were
# commented out in the single-presenter pass), so building data_sets raised
# NameError.
lemmatized_May = lemmatization(May_data_words_nostops)
lemmatized_Clarkson = lemmatization(Clarkson_data_words_nostops)
lemmatized_Hammond = lemmatization(Hammond_data_words_nostops)

data_sets = [lemmatized_May, lemmatized_Clarkson, lemmatized_Hammond]
names = ['May', 'Clarkson', 'Hammond']
results_dict = create_corpora_and_LDA(data_sets, names)

In [None]:
import pprint
#Print the keyword in the 5 topics

# The loop variable is deliberately not called `result`: at notebook level
# that name holds the transcription output, and a top-level loop would
# silently overwrite it.
for name, presenter_result in results_dict.items():
    print(f"Results for {name}:")
    pprint.pprint(presenter_result['lda_model'].print_topics())


# print(lda_model.print_topics())
# doc_lda = lda_model[corpus]

#We created 5 topics. You can see the keywords for each topic and the weightage(importance) of each keyword using lda_model.print_topics() as shown below:

In [None]:
import pyLDAvis.gensim
pyLDAvis.enable_notebook()

def visualize_lda_model(results_dict, dataset_name):
    """Prepare the pyLDAvis view for one dataset stored in ``results_dict``.

    Args:
        results_dict: Mapping of dataset name to a dict holding the keys
            ``'dictionary'``, ``'corpus'`` and ``'lda_model'``.
        dataset_name: Key of the dataset to visualize.

    Returns:
        Whatever ``pyLDAvis.gensim.prepare`` returns for that dataset.

    Raises:
        KeyError: If the dataset or one of its three entries is missing.
    """
    entry = results_dict[dataset_name]
    dictionary, corpus, lda_model = (
        entry[key] for key in ('dictionary', 'corpus', 'lda_model')
    )
    return pyLDAvis.gensim.prepare(lda_model, corpus, dictionary, mds='mmds', R=30)


In [None]:
# Visualize the LDA model for 'Hammond' (pass 'May' or 'Clarkson' instead to view another presenter)
vis = visualize_lda_model(results_dict, 'Hammond')
vis

# Part 5: Evaluation - Coherence 
### Single presenter


In [None]:
# For a single presenter - MAY

# Topic coherence measures the average similarity between top words having the highest weights in a topic i.e relative distance between the top words.

# Score the baseline model with the 'c_v' coherence measure
MAY_coherence_model_lda = CoherenceModel(model=lda_model_May, texts=lemmatized_May, dictionary=id2word_May, coherence='c_v')
coherence_lda=MAY_coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)



# Part 6: Model Improvement - How many topics? 

## Define function to Iterate Coherence: 

In [None]:

# Define the function to iterate
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=1):
    '''
    Compute c_v coherence for LDA models trained with a range of topic counts.

    One model is trained for every topic count in range(start, limit, step),
    so `limit` itself is excluded.

    Parameters:
    ----------
    dictionary: Gensim dictionary; used as id2word for every model and for scoring
    corpus: Gensim corpus
    texts: list of input texts (tokenized documents)
    limit: upper bound (exclusive) on the number of topics
    start: first number of topics to try
    step: increment between topic counts

    Returns:
    ----------
    model_list: list of LDA topic models, one per topic count
    coherence_values: coherence values for the corresponding LDA models
    '''

    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        # id2word must be the dictionary passed in; it used to be hard-wired to
        # the global id2word_May, which tied the function to one presenter.
        model = gensim.models.LdaModel(corpus=corpus, num_topics=num_topics, random_state=100,
                                       chunksize=200, passes=10, per_word_topics=True,
                                       id2word=dictionary)

        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values


Evaluate Topic count for best Coherence & plot

In [None]:
# Run for Single presenter - MAY
# Topic counts tried: range(2, 8, 1), i.e. 2 through 7

model_list, coherence_values_May = compute_coherence_values(dictionary=id2word_May, corpus=corpus_May, texts=lemmatized_May, start=2, limit=8, step=1)

In [None]:
import matplotlib.pyplot as plt

#Show graph
# Must match the start/limit/step used when coherence_values_May was computed
limit = 8; start=2; step=1;
x=range(start,limit,step)
plt.plot(x,coherence_values_May)
plt.xlabel('Number of Topics')
plt.ylabel('Coherence score')
# Legend labels must be a sequence of strings. ('text') is just a string, so
# it was iterated character by character and the legend read "c".
plt.legend(['coherence_values'], loc='best')

In [None]:
## Print the Coherence scores: 

# x is the range of topic counts defined in the plotting cell; pair each count
# with its coherence score, rounded to four decimals
for m, cv in zip(x, coherence_values_May):
    cv_round = round(cv,4)
    print(f"Number of topics: {m} has a Coherence of {cv_round}")