# Part 1: Automatic Speech Recognition, Diarize and Label

Environment = "whisperx"

* Performance Benchmarks on local
* GPU Benchmark: 0.09961056709289551 seconds
* Memory Bandwidth Benchmark: 0.2920224666595459 seconds
* CPU Benchmark: 13.046526432037354 seconds
* Disk Write Benchmark: 2.3364615440368652 seconds
* Disk Read Benchmark: 0.05882525444030762 seconds \n
  
** all benchmarks are >> faster than Collab with the exception of Disk write.

## Setup ⚙️
Tested for PyTorch 2.0, Python 3.10 (use other versions at your own risk!)
GPU execution requires the NVIDIA libraries cuBLAS 11.x and cuDNN 8.x to be installed on the system. Please refer to the CTranslate2 documentation.

1.  Create Python3.10 environment

`conda create --name whisperx python=3.10`

`conda activate whisperx`

2. Install PyTorch, e.g. for Linux and Windows CUDA11.8:
   
conda install pytorch==2.0.0 torchaudio==2.0.0 pytorch-cuda=11.8 -c pytorch -c nvidia

See other methods here.

1. Install this repo

`pip install git+https://github.com/m-bain/whisperx.git`

If already installed, update package to most recent commit

`pip install git+https://github.com/m-bain/whisperx.git --upgrade`

## Post Setup - REQUIRED for DIARIZATION **Actually dont do this!!
https://github.com/m-bain/whisperX/issues/499

`pip install pyannote.audio==3.0.1`

`pip uninstall onnxruntime`

`pip install --force-reinstall onnxruntime-gpu`

# Preprocess initial audio file
convert to Wav using ffmpeg

In [None]:
import ffmpeg

## 1 - Convert Mp3 to WAV.

def convert_m4a_to_mp3(input_file, output_file):
    try:
        ffmpeg.input(input_file).output(output_file).run(overwrite_output=True)
        print(f"Successfully converted {input_file} to {output_file}")
    except ffmpeg.Error as e:
        print("An error occurred:", e)

# Input/ output files and usage
input_mp3 = './audio/Botswana_2007_Audio.mp3'  # Change this to your mp3 file path
output_wav = './data/Botswana_2007_Audio.wav'  # Change this to your desired output wav file path

convert_m4a_to_mp3(input_mp3, output_wav)

In [None]:
import whisperx
import gc
import os
import torch

device = "cuda"
## Full file should be the input (2007 or 2024 file..)
audio_file = "./data/Botswana_2007_Audio.wav"

## DEBUGGING, use a small file
# audio_file = "./data/Intro.wav"

batch_size = 16 # reduce if low on GPU mem
compute_type = "float16" # change to "int8" if low on GPU mem (may reduce accuracy)
without_timestamps= 'True'

## Some error handling to ensure that successfully loaded the mp3 file!
try:
    # Check if the file exists
    if not os.path.isfile(audio_file):
        raise FileNotFoundError(f"The file '{audio_file}' does not exist.")
    # Optionally, you can add more checks (like file format) here

    print(f"Successfully accessed the audio file: {audio_file}")

except FileNotFoundError as e:
    print(e)
except Exception as e:
    print(f"An unexpected error occurred: {e}")

## Load the Audio File

In [None]:
import whisperx
import gc
import os
import torch

device = "cuda"
## Full file should be the input (2007 or 2024 file..)
audio_file = "./data/Botswana_2007_Audio.wav"

## DEBUGGING, use a small file
# audio_file = "./data/Intro.wav"

batch_size = 16 # reduce if low on GPU mem
compute_type = "float16" # change to "int8" if low on GPU mem (may reduce accuracy)
without_timestamps= 'True'

## Some error handling to ensure that successfully loaded the mp3 file!
try:
    # Check if the file exists
    if not os.path.isfile(audio_file):
        raise FileNotFoundError(f"The file '{audio_file}' does not exist.")
    # Optionally, you can add more checks (like file format) here

    print(f"Successfully accessed the audio file: {audio_file}")

except FileNotFoundError as e:
    print(e)
except Exception as e:
    print(f"An unexpected error occurred: {e}")

## Prepare Batches

### Split the Audio file into smaller pieces


In [None]:
## 3 - Split up large files in <10min


import sqlite3
import librosa
import soundfile as sf
import math


# TODO: change max duration to 300 seconds
# TODO: update the target database folder
# TODO: check input filenames
# TODO: Update the output folder

# Function to split audio and save to database
def split_audio(audio_file, max_duration=300):  # 60second (1min) for testing; 300sec for production
    conn = sqlite3.connect('./data/Audio_clips.db')
    cursor = conn.cursor()
    cursor.execute('''CREATE TABLE IF NOT EXISTS clips
                     (id INTEGER PRIMARY KEY AUTOINCREMENT, start_time REAL, end_time REAL, filename TEXT)''')

    try:
        y, sr = librosa.load(audio_file)
    except Exception as e:
        print(f"Error loading audio file: {e}")
        return []

    total_duration = librosa.get_duration(y=y, sr=sr)
    num_splits = math.ceil(total_duration / max_duration)
    results = []

    for i in range(num_splits):
        start_time = i * max_duration
        end_time = min((i + 1) * max_duration, total_duration)

        start_sample = int(start_time * sr)
        end_sample = int(end_time * sr)

        clip = y[start_sample:end_sample]
        filename = f"./data/Botswana2007_clip_{i}.wav"

        try:
            sf.write(filename, clip, sr)
            cursor.execute("INSERT INTO clips (start_time, end_time, filename) VALUES (?, ?, ?)",
                           (start_time, end_time, filename))
            conn.commit()
            results.append({"start_time": start_time, "end_time": end_time, "filename": filename})
        except Exception as e:
            print(f"Error processing clip {i}: {e}")

    conn.close()
    return results
# results is a DIctionary
results = split_audio(audio_file)

# BATCH PROCESS: Transcript - Align - Diarize

### Approach 1 - use python

In [None]:
import os
import glob
import json
import gc
import torch
import whisperx
from HF_token import TOKEN_ID

# Directory containing .wav files
wav_directory = './data/Testing'

# Get a list of all .wav files in the directory
wav_files = glob.glob(os.path.join(wav_directory, '*.wav'))

# Initialize results_full list
aligned_results_full = []

# Set device and compute type
device = "cuda" if torch.cuda.is_available() else "cpu"
compute_type = "float16" if device == "cuda" else "float32"

print(f"TRANSCRIBING & ALIGNING using device: {device}")
print(f"Compute type is {compute_type}")

# Ensure the model directory exists
model_dir = "./model/"
os.makedirs(model_dir, exist_ok=True)



# Define batch size
batch_size = 16  # Adjust as needed

# # Load the model and save it to the local path
# try:
#     model = whisperx.load_model("large-v2", device=device, compute_type=compute_type, download_root=model_dir)
# except Exception as e:
#     print(f"Error loading model: {e}")
#     raise

# Iterate through each .wav file and process it
for wav_file in wav_files:
    print(f"Processing file: {wav_file}")
    try:
        # Load the audio file
        audio = whisperx.load_audio(wav_file)
        
        # Ensure the output directory exists
        output_dir = "./outputs/Testing"
        os.makedirs(output_dir, exist_ok=True)

        #####################     TRANSCRIPTION  #################
        # Load the model and save it to the local path
        try:
            model = whisperx.load_model("large-v2", device=device, compute_type=compute_type, download_root=model_dir)
        except Exception as e:
            print(f"Error loading model: {e}")
            raise

        print(f"STARTING Transcription on {wav_file}")

        # Transcribe the audio file
        transcribe_result = model.transcribe(audio, batch_size=batch_size)
        print(transcribe_result["segments"])  # before alignment
        
        # Save the transcription result to a JSON file
        transcript_filename = os.path.basename(wav_file).replace('.wav', '')
        with open(f'./outputs/Testing/{transcript_filename}_transcript.json', 'w') as json_file:
            json.dump(transcribe_result, json_file, indent=4)
        
        

        #####################     ALIGNMENT #################
        print(f"STARTING ALIGNMENT on {wav_file}")
        
        # Load the alignment model with the specified device
        model_a, metadata = whisperx.load_align_model(language_code="en", device=device)
        
        # Perform alignment using the specified device
        aligned_result = whisperx.align(transcribe_result["segments"], model_a, metadata, audio, device, return_char_alignments=False)
        
        # Save the Alignment result to a JSON file
        alignment_filename = os.path.basename(wav_file).replace('.wav', '')
        with open(f'./outputs/Testing/{alignment_filename}_aligned.json', 'w') as json_file:
            json.dump(aligned_result, json_file, indent=4)
        
        # Append the aligned result to results_full
        aligned_results_full.append(aligned_result)
        
        #####################     DIARIZE #################
        print(f"STARTING DIARIZE on {wav_file}")

        # Load the DIARIZE Model
        diarize_model = whisperx.DiarizationPipeline(use_auth_token=TOKEN_ID, device=device)

        # Load the audio data
        audio_data = {
            'waveform': torch.from_numpy(audio[None, :]),
            'sample_rate': whisperx.audio.SAMPLE_RATE
                    }
        # Run the diarization model
        diarize_segments = diarize_model(audio)

        # add min/max number of speakers if known
        diarize_model(audio, min_speakers=1, max_speakers=3)

        # Assign speaker labels to words
        diarize_result = whisperx.assign_word_speakers(diarize_segments, aligned_result)

        ## SAVE the TRANSCRIPT
        diarized_filename = os.path.basename(wav_file).replace('.wav', '')
        with open(f'./outputs/Testing/{diarized_filename}_diarized.json', 'w') as json_file:
            json.dump(diarize_result, json_file, indent=4)
     
       # Clean up memory after each file
        gc.collect()
        torch.cuda.empty_cache()
    
    except Exception as e:
        print(f"Error processing file {wav_file}: {e}")

# Optionally, save the full results to a single JSON file
with open('./outputs/Testing/full_alignment.json', 'w') as json_file:
    json.dump(aligned_results_full, json_file, indent=4)



### Approach 2 - use the Terminal and CLI 
This seemed to work on a single file VERY fast! 

In [None]:
import os
from HF_token import TOKEN_ID
# Set the path to your directory
directory = "./data/Testing/"

# Iterate through each file in the directory
for filename in os.listdir(directory):
    if filename.endswith(".wav"):  # Check for .wav files
        filepath = os.path.join(directory, filename)
        
        # Construct and run the whisperx command for each file
        command = f"whisperx {filepath} --model large-v2 --diarize --highlight_words True --hf_token {TOKEN_ID}"
        os.system(command)

 ### Approach 2B - add some subprocess to monitor progress

 This method is *best* as a) actually worked and b) provided insight into what is going on! 

In [None]:
import os
import subprocess
from HF_token import TOKEN_ID

# Set the path to your directory
directory = "data/"

# Iterate through each file in the directory
for filename in os.listdir(directory):
    if filename.endswith(".wav"):  # Check for .wav files
        filepath = os.path.join(directory, filename)
        
        # Print the filename to show progress
        print(f"Processing file: {filename}")
        
        # Construct the whisperx command for each file
        command = f"whisperx {filepath} --model large-v2 --diarize --highlight_words True --hf_token {TOKEN_ID} --output_dir ./outputs"
        
        # Run the command and capture real-time output
        process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        
        # Display real-time output from the command
        for line in process.stdout:
            print(line.decode().strip())
        
        process.wait()  # Wait for process to finish
        
        # Confirm completion for each file
        print(f"Completed file: {filename}\n")


# Consolidate the Diarized JSON files 

In [None]:
import os
import pandas as pd
import json
import glob

# Directory containing the JSON files
json_directory = 'outputs/'

# Get a list of all JSON files in the directory
json_files = glob.glob(os.path.join(json_directory, '*.json'))

# Initialize a list to hold all DataFrames
df_list = []

# Iterate through each JSON file and merge segments
for json_file in json_files:
	with open(json_file, 'r') as file:
		data = json.load(file)
		# Convert the "segments" part of the JSON data to a DataFrame
		df = pd.DataFrame(data["segments"])
		df_list.append(df)

# Concatenate all DataFrames into a single DataFrame
diarized_df = pd.concat(df_list, ignore_index=True)

# Export
diarized_df.to_csv('./data/diarzed_output_no_names.csv')

# Display the consolidated DataFrame
diarized_df.head(100)



In [None]:
# Apply the Speaker names to the labels

diarized_df['speaker'] = diarized_df['speaker'].replace('SPEAKER_01', 'May')
diarized_df['speaker'] = diarized_df['speaker'].replace('SPEAKER_02', 'Clarkson')
diarized_df['speaker'] = diarized_df['speaker'].replace('SPEAKER_00', 'Hammond')

diarized_df.head(100)

# Export
diarized_df.to_csv('./data/diarzed_output_named.csv')

# Part 2: LDA (Latent Dirichlet Allocation) Preparation
### Import the previously created json/csv



In [4]:

import pandas as pd
import json

# Import
diarized_df = pd.read_csv('./data/diarzed_output_named.csv')


In [7]:
preprocessed_df = diarized_df.copy()
# Preprocessing steps for LDA analysis

preprocessed_df

Unnamed: 0.1,Unnamed: 0,start,end,text,words,speaker
0,0,21.467,23.068,"Hello, hello, and welcome.","[{'word': 'Hello,', 'start': 21.467, 'end': 21...",Clarkson
1,1,23.108,24.029,Thank you very much.,"[{'word': 'Thank', 'start': 23.108, 'end': 23....",Clarkson
2,2,24.049,34.256,"Now, as you know, the producers on this show l...","[{'word': 'Now,', 'start': 24.049, 'end': 24.1...",Clarkson
3,3,34.597,39.640,Then they set unbelievably hard tasks to do to...,"[{'word': 'Then', 'start': 34.597, 'end': 34.7...",Clarkson
4,4,39.900,40.080,Yeah.,"[{'word': 'Yeah.', 'start': 39.9, 'end': 40.08...",Hammond
...,...,...,...,...,...,...
1042,1042,269.865,271.447,We're through!,"[{'word': ""We're"", 'start': 269.865, 'end': 27...",Clarkson
1043,1043,273.209,277.514,"Both our cars were flooded, but our guides wer...","[{'word': 'Both', 'start': 273.209, 'end': 273...",SPEAKER_03
1044,1044,278.215,283.842,"People of Surrey, if this happens to you, you ...","[{'word': 'People', 'start': 278.215, 'end': 2...",SPEAKER_03
1045,1045,283.922,286.085,"Well, the people of Botswana have a tip for you.","[{'word': 'Well,', 'start': 283.922, 'end': 28...",SPEAKER_03


In [None]:

preprocessed_df.to_csv("preprocess_df.csv")

### Import all the libraries required


In [15]:
import numpy as np
import json
import glob
import re

#Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

#spacy
import spacy
from nltk.corpus import stopwords

#vis
import pyLDAvis
import pyLDAvis.gensim

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

### Cant remember why needed this ... 
import locale
def getpreferredencoding(do_setlocale=True):
    return "UTF-8"
locale.getpreferredencoding = getpreferredencoding


In [17]:
#### Below is a suggestion from gpt. 
from nltk.corpus.util import LazyCorpusLoader
from nltk.corpus import stopwords as nltk_stopwords

# 1. Remove emails, newline characters, and non-alphabetic characters
preprocessed_df['cleaned_text'] = preprocessed_df['text'].str.replace(r'\S+@\S+', '', regex=True)
preprocessed_df['cleaned_text'] = preprocessed_df['cleaned_text'].str.replace(r'http\S+|www\S+', '', regex=True)
preprocessed_df['cleaned_text'] = preprocessed_df['cleaned_text'].str.replace(r'\n', ' ', regex=True)
preprocessed_df['cleaned_text'] = preprocessed_df['cleaned_text'].str.replace(r'[^a-zA-Z\s]', '', regex=True)

# 2. Convert to lowercase
preprocessed_df['cleaned_text'] = preprocessed_df['cleaned_text'].str.lower()

# 3. Tokenize the text
preprocessed_df['tokens'] = preprocessed_df['cleaned_text'].apply(lambda x: gensim.utils.simple_preprocess(x, deacc=True))

# 4. Remove stopwords
stopwords = set(nltk_stopwords.words('english'))
preprocessed_df['tokens'] = preprocessed_df['tokens'].apply(lambda x: [word for word in x if word not in stopwords])

# 5. Lemmatize the tokens
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
preprocessed_df['lemmatized_tokens'] = preprocessed_df['tokens'].apply(lambda x: [token.lemma_ for token in nlp(" ".join(x)) if token.pos_ in ["NOUN", "ADJ", "VERB", "ADV"]])

# Display the preprocessed dataframe
preprocessed_df.head()


Unnamed: 0.1,Unnamed: 0,start,end,text,words,speaker,cleaned_text,tokens,lemmatized_tokens
0,0,21.467,23.068,"Hello, hello, and welcome.","[{'word': 'Hello,', 'start': 21.467, 'end': 21...",Clarkson,hello hello and welcome,"[hello, hello, welcome]",[]
1,1,23.108,24.029,Thank you very much.,"[{'word': 'Thank', 'start': 23.108, 'end': 23....",Clarkson,thank you very much,"[thank, much]","[thank, much]"
2,2,24.049,34.256,"Now, as you know, the producers on this show l...","[{'word': 'Now,', 'start': 24.049, 'end': 24.1...",Clarkson,now as you know the producers on this show lik...,"[know, producers, show, like, give, us, challe...","[know, producer, show, give, challenge, specif..."
3,3,34.597,39.64,Then they set unbelievably hard tasks to do to...,"[{'word': 'Then', 'start': 34.597, 'end': 34.7...",Clarkson,then they set unbelievably hard tasks to do to...,"[set, unbelievably, hard, tasks, see, one, us,...","[set, unbelievably, hard, task, see, get, good..."
4,4,39.9,40.08,Yeah.,"[{'word': 'Yeah.', 'start': 39.9, 'end': 40.08...",Hammond,yeah,[yeah],[]


In [20]:

# Create a new DataFrame for each speaker
May_df = preprocessed_df[preprocessed_df['speaker'] == 'May']
Clarkson_df = preprocessed_df[preprocessed_df['speaker'] == 'Clarkson']
Hammond_df = preprocessed_df[preprocessed_df['speaker'] == 'Hammond']

# Display the first few rows of each DataFrame (optional)
# May_df.head()

Clarkson_df.head()

# Hammond_df.head()

Unnamed: 0.1,Unnamed: 0,start,end,text,words,speaker,cleaned_text,tokens,lemmatized_tokens
0,0,21.467,23.068,"Hello, hello, and welcome.","[{'word': 'Hello,', 'start': 21.467, 'end': 21...",Clarkson,hello hello and welcome,"[hello, hello, welcome]",[]
1,1,23.108,24.029,Thank you very much.,"[{'word': 'Thank', 'start': 23.108, 'end': 23....",Clarkson,thank you very much,"[thank, much]","[thank, much]"
2,2,24.049,34.256,"Now, as you know, the producers on this show l...","[{'word': 'Now,', 'start': 24.049, 'end': 24.1...",Clarkson,now as you know the producers on this show lik...,"[know, producers, show, like, give, us, challe...","[know, producer, show, give, challenge, specif..."
3,3,34.597,39.64,Then they set unbelievably hard tasks to do to...,"[{'word': 'Then', 'start': 34.597, 'end': 34.7...",Clarkson,then they set unbelievably hard tasks to do to...,"[set, unbelievably, hard, tasks, see, one, us,...","[set, unbelievably, hard, task, see, get, good..."
16,16,100.86,102.06,It is a Lancia Beta.,"[{'word': 'It', 'start': 100.86, 'end': 100.96...",Clarkson,it is a lancia beta,"[lancia, beta]",[beta]


### Split the data frame into 3 (one per presenter)

In [None]:

# Create a new DataFrame for each speaker
May_df = diarized_df[diarized_df['speaker'] == 'May']
Clarkson_df = diarized_df[diarized_df['speaker'] == 'Clarkson']
Hammond_df = diarized_df[diarized_df['speaker'] == 'Hammond']

# Display the first few rows of each DataFrame (optional)
# May_df.head()

Clarkson_df.head()

# Hammond_df.head()

Unnamed: 0.1,Unnamed: 0,start,end,text,words,speaker,cleaned_text,tokens,lemmatized_tokens
0,0,21.467,23.068,"Hello, hello, and welcome.","[{'word': 'Hello,', 'start': 21.467, 'end': 21...",Clarkson,hello hello and welcome,"[hello, hello, welcome]",[]
1,1,23.108,24.029,Thank you very much.,"[{'word': 'Thank', 'start': 23.108, 'end': 23....",Clarkson,thank you very much,"[thank, much]","[thank, much]"
2,2,24.049,34.256,"Now, as you know, the producers on this show l...","[{'word': 'Now,', 'start': 24.049, 'end': 24.1...",Clarkson,now as you know the producers on this show lik...,"[know, producers, show, like, give, us, challe...","[know, producer, show, give, challenge, specif..."
3,3,34.597,39.64,Then they set unbelievably hard tasks to do to...,"[{'word': 'Then', 'start': 34.597, 'end': 34.7...",Clarkson,then they set unbelievably hard tasks to do to...,"[set, unbelievably, hard, tasks, see, one, us,...","[set, unbelievably, hard, task, see, get, good..."
16,16,100.86,102.06,It is a Lancia Beta.,"[{'word': 'It', 'start': 100.86, 'end': 100.96...",Clarkson,it is a lancia beta,"[lancia, beta]",[beta]


## 1. PREPROCESS
Remove emails, newline char, stop words, and tokenize

In [28]:
# Define the preprocessing function
def preprocess_text(data):
   # Remove emails
    data = [re.sub(r'\S+@\S+', '', i) for i in data]

    # Remove URLs
    data = [re.sub(r'http\S+|www\S+|https\S+', '', i, flags=re.MULTILINE) for i in data]

    # Remove newline characters
    data = [i.replace('\n', ' ').replace('\r', '').strip() for i in data]

    # Remove distracting single quotes
    data = [i.replace("'", "") for i in data]

        # Remove words less than 3 characters
    data = [' '.join([word for word in i.split() if len(word) >= 3]) for i in data]

    # Remove non-alphabetic characters and convert to lowercase
    data = [' '.join([word.lower() for word in re.findall(r'\b[a-zA-Z]+\b', i)]) for i in data]

    return data


# Convert 'text' column to list and then apply the preprocessing function to each dataframe

May_data = preprocess_text(May_df['text'].values.tolist())
Clarkson_data = preprocess_text(Clarkson_df['text'].values.tolist())
Hammond_data = preprocess_text(Hammond_df['text'].values.tolist())

# print(May_data)
print(Clarkson_data)
# print(Hammond_data

['hello hello and welcome', 'thank you very much', 'now you know the producers this show like give challenges specifically where they give very small amount money and tell buy used car', 'then they set unbelievably hard tasks see which one got the best deal', 'lancia beta', '', 'the only lancia any sort the whole botswana', 'its done miles', 'one owner it', 'its the little old lady', 'and boy had she ragged it', 'no thats because the battery oh shorts the bonnet', 'shorts the bonnet bit', 'youve done well', 'what the hell that', 'dont know', 'could moskvitch', 'its opel', 'opel', 'and the front says cadet', 'yeah', 'thats the same age as thats the same age nick', 'love the speedo', 'like horizontal speedo', 'wheres the engine', 'with the cars the start line was time for our challenge', 'the people surrey think they need four wheel drive cars because they live lane which sometimes has leaves it', 'you will now attempt prove them wrong driving your two wheel drive cars from here botswana

### Step 1 - Tokenize

In [29]:
#Define the function to tokenize:

def send_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))
        # deacc=True remove punctuation

# Define the datasets
datasets_words = {
    "May_words": May_data,
    "Clarkson_words": Clarkson_data,
    "Hammond_words": Hammond_data
} 

# Apply the function to each dataset_words and store the results in new variables
results_words = {}
for name, data in datasets_words.items():
    results_words[name] = list(send_to_words(data))

# Extract the results into distinct variables
May_words = results_words["May_words"]
Clarkson_words = results_words["Clarkson_words"]
Hammond_words = results_words["Hammond_words"]

# Print the results to verify
print(May_words)
print(Clarkson_words)
print(Hammond_words)

[['yeah', 'there', 'were', 'just', 'two', 'conditions'], ['mustnt', 'four', 'wheel', 'drive', 'and', 'mustnt', 'built', 'any', 'way', 'off', 'road'], ['the', 'meeting', 'point', 'was', 'the', 'border', 'post', 'between', 'zimbabwe', 'and', 'botswana'], ['and', 'for', 'once', 'was', 'the', 'first', 'arrive'], ['now', 'youd', 'expect', 'ive', 'done', 'this', 'properly'], ['what', 'ive', 'got', 'mercedes', 'benz', 'car', 'that', 'africa', 'absolutely', 'adores', 'because', 'its', 'comfortable', 'its', 'rugged', 'its', 'dependable', 'and', 'frankly', 'the', 'other', 'two', 'have', 'brought', 'anything', 'other', 'than', 'one', 'these', 'along', 'theyre', 'idiots'], ['the', 'first', 'idiot', 'arrived'], ['can', 'you', 'open', 'the', 'door'], ['the', 'handles', 'broken'], ['yeah', 'thats', 'normal', 'isnt', 'it'], ['that', 'fizzing'], ['thats', 'yeah'], ['whats', 'the', 'piece', 'cardboard', 'for'], ['that', 'for', 'mopping', 'moisture'], ['so', 'now', 'what'], ['you', 'want', 'lift'], ['lef

Explanation
*Function Definition:*

The `send_to_words` function tokenizes sentences and removes punctuation.

Ensure Data is Fully Loaded:
The `ensure_iterable` function checks if the data is a LazyCorpusLoader and converts it to a list if necessary.
Define Datasets:

The `datasets_words` dictionary stores the original datasets with their corresponding names as keys.
Apply Tokenization:

A loop iterates through the datasets_words dictionary, applies the send_to_words function to each dataset, and converts the generator to a list before storing the results in a new dictionary named results_words.

*Extract Results:*

The results are extracted from the results_words dictionary into distinct variables for each dataset.
Print Results:
The results are printed to verify that the tokenization has been applied correctly.
Check Iterability:

The check_iterable function checks if the results are iterable and prints the result.'''

In [None]:
import gensim
from nltk.corpus.util import LazyCorpusLoader
from nltk.corpus.reader import PlaintextCorpusReader

# Define the function to tokenize
def send_to_words(sentences):
    for sentence in sentences:
        yield gensim.utils.simple_preprocess(str(sentence), deacc=True)
        # deacc=True removes punctuation

# Ensure the data is fully loaded before processing
def ensure_iterable(data):
    if isinstance(data, LazyCorpusLoader):
        return list(data.words())  # Convert to list of words
    return data

# Define the datasets
datasets_words = {
    "May_words": May_data,
    "Clarkson_words": Clarkson_data,
    "Hammond_words": Hammond_data
}

# Apply the function to each dataset and store the results in new variables
results_words = {}
for name, data in datasets_words.items():
    iterable_data = ensure_iterable(data)
    results_words[name] = list(send_to_words(iterable_data))  # Convert generator to list

# Extract the results into distinct variables
May_words = results_words["May_words"]
Clarkson_words = results_words["Clarkson_words"]
Hammond_words = results_words["Hammond_words"]

# Print the results to verify
print(May_words)
print(Clarkson_words)
print(Hammond_words)

# Check if the results are iterable
def check_iterable(data):
    try:
        iter(data)
        return True
    except TypeError:
        return False

print(f"May_words is iterable: {check_iterable(May_words)}")
print(f"Clarkson_words is iterable: {check_iterable(Clarkson_words)}")
print(f"Hammond_words is iterable: {check_iterable(Hammond_words)}")



NameError: name 'May_data' is not defined

### Step 2 - Remove Stopwords

In [25]:
import gensim
from nltk.corpus.util import LazyCorpusLoader
from nltk.corpus import stopwords as nltk_stopwords

# Define the stopwords list
stopwords = set(nltk_stopwords.words('english'))

# Define the function to remove stopwords
def remove_stopwords(texts):
    return [[word for word in gensim.utils.simple_preprocess(str(doc)) if word not in stopwords] for doc in texts]

# Define the datasets
datasets = {
    "May_data_words_nostops": May_words,
    "Clarkson_data_words_nostops": Clarkson_words,
    "Hammond_data_words_nostops": Hammond_words
}

# Ensure the data is fully loaded before processing
def ensure_iterable(data):
    if isinstance(data, LazyCorpusLoader):
        return list(data)
    return data

# Apply the function to each dataset and store the results in new variables
results = {}
for name, data in datasets.items():
    print(f"Processing {name}:")
    print(f"Type: {type(data)}")
    print(f"First 5 items: {data[:5] if isinstance(data, list) else 'Not a list'}")
    
    iterable_data = ensure_iterable(data)
    results[name] = remove_stopwords(iterable_data)

# Extract the results into distinct variables
May_data_words_nostops = results["May_data_words_nostops"]
Clarkson_data_words_nostops = results["Clarkson_data_words_nostops"]
Hammond_data_words_nostops = results["Hammond_data_words_nostops"]

# Print the results to verify
print("May_data_words_nostops:", May_data_words_nostops[:5])
print("Clarkson_data_words_nostops:", Clarkson_data_words_nostops[:5])
print("Hammond_data_words_nostops:", Hammond_data_words_nostops[:5])

NameError: name 'May_words' is not defined

### Step 3 - Lemmatize



In [23]:
## A Basic syntax for a single list.

def lemmatization(texts, allowed_postags=["NOUN", "ADJ", "VERB", "ADV"]):
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [24]:
lemmatized_May = lemmatization(May_data_words_nostops)
lemmatized_Clarkson = lemmatization(Clarkson_data_words_nostops)
lemmatized_Hammond = lemmatization(Hammond_data_words_nostops)

# lemmatized_May
lemmatized_Clarkson

NameError: name 'May_data_words_nostops' is not defined

In [21]:
import csv

## Eliminate all the null strings in the lemmatized docs:

def clean_texts(lemmatized_texts):
	return [doc for doc in lemmatized_texts if len(doc) > 0]

May_cleaned_texts = clean_texts(lemmatized_May)
Clarkson_cleaned_texts = clean_texts(lemmatized_Clarkson)
Hammond_cleaned_texts = clean_texts(lemmatized_Hammond)

# May_cleaned_texts
# print(Clarkson_cleaned_texts)
# print(Hammond_cleaned_texts)

Clarkson_cleaned_texts



NameError: name 'lemmatized_May' is not defined

In [None]:

# Define the file path for the output text file
output_file_path = './data/Clarkson_cleaned_texts.txt'

# Write the cleaned texts to the text file
with open(output_file_path, 'w') as file:
	for item in Clarkson_cleaned_texts:
		file.write("%s\n" % ' '.join(item))



## Step 3 - Create the Corpora, Dictionary & LDA

There are 3 data frames, so create ensure consistent and efficient treatment, create a function to pass each df through the same series of steps. These include: 

1. create the dictionary of terms
2. create the corpus (count of each dictionary term)
3. create the LDA Model
4. run the LDA model

In [None]:
def create_corpora_and_LDA(data_sets, names, num_topics = 5):
    results_dict = {}
    for data, name in zip(data_sets, names):
        # STEP 1 - Create dictionary
        id2word = corpora.Dictionary(data)
        print(f"Dictionary for {name}:")
        print(id2word)

        # STEP 2 - Create Corpus
        texts = data

        # STEP 3 - Term Document Frequency
        corpus = [id2word.doc2bow(text) for text in texts]

        print(f"Corpus for {name}:")
        print(corpus)

        # STEP 4- Create LDA Model
        lda_model = gensim.models.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=num_topics,
                                           random_state=100,
                                           chunksize=200,
                                           passes=10,
                                           per_word_topics=True)

        print(f"LDA Model for {name}:")
        print(lda_model)

        # STEP 5 - Store in dictionary
        results_dict[name] = {
            'dictionary': id2word,
            'corpus': corpus,
            'lda_model': lda_model
        }

    return results_dict

# RESULTS_DICTIONARY - Create a dictionary of the data sets, their respective names, and the info created from the above function. 

data_sets = [May_cleaned_texts, Clarkson_cleaned_texts, Hammond_cleaned_texts]
names = ['May', 'Clarkson', 'Hammond']
results_dict = create_corpora_and_LDA(data_sets, names)

In [None]:
#Print the keyword in the 5 topics
# 
import pprint

for name, result in results_dict.items():
    print(f"Results for {name}:")
    pprint.pprint(result['lda_model'].print_topics())


# print(lda_model.print_topics())
# doc_lda = lda_model[corpus]

#We created 5 topics. You can see the keywords for each topic and the weightage(importance) of each keyword using lda_model.print_topics() as shown below:

## Visualize LDA Topic Models for each Presenter

### Create a Visualize function

In [None]:
import pyLDAvis.gensim
pyLDAvis.enable_notebook()

def visualize_lda_model(results_dict, dataset_name):
    # Access the dictionary, corpus, and LDA model for the specified dataset
    dictionary = results_dict[dataset_name]['dictionary']
    corpus = results_dict[dataset_name]['corpus']
    lda_model = results_dict[dataset_name]['lda_model']

    # Prepare the visualization
    vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary, mds='mmds', R=30)

    # Display the visualization
    return vis


### Call the LDA Visualization for a given Presenter

In [None]:
# Visualize the LDA model for 'X'
vis = visualize_lda_model(results_dict, 'Clarkson')
vis

# Part 5: Evaluation - Coherence 

Topic coherence measures the average similarity between top words having the highest weights in a topic i.e relative distance between the top words.


In [None]:


for name in results_dict.keys():
	lda_model = results_dict[name]['lda_model']
	texts = eval(f"lemmatized_{name}")
	dictionary = results_dict[name]['dictionary']
	
	coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')
	coherence_lda = coherence_model_lda.get_coherence()
	print(f'Coherence Score for {name}: ', coherence_lda)



# Part 6: Model Improvement - How many topics? 

## Define function to Iterate Coherence: 

In [None]:

# Define the function to iterate
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=1):
	coherence_values = []
	model_list = []
	
	for name in names:
		dictionary = results_dict[name]['dictionary']
		corpus = results_dict[name]['corpus']
		texts = eval(f"lemmatized_{name}")
		
		for num_topics in range(start, limit, step):
			model = gensim.models.LdaModel(corpus=corpus, num_topics=num_topics, random_state=100, chunksize=200, passes=10, per_word_topics=True, id2word=dictionary)
			model_list.append(model)
			coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
			coherence_values.append(coherencemodel.get_coherence())
	
	return model_list, coherence_values


In [None]:

# Call the function and print the results for each name in results_dict
limit = 10
start = 2
step = 1

for name in names:
	print(f"Results for {name}:")
	model_list, coherence_values = compute_coherence_values(results_dict[name]['dictionary'], results_dict[name]['corpus'], eval(f"lemmatized_{name}"), limit, start, step)
	for m, cv in zip(range(start, limit, step), coherence_values):
		print(f"Num Topics = {m}, Coherence Value = {cv}")
