# Introduction

Required installs:
* whisperx
* transformers
* datawrangler
* the `roberta-base` model from Hugging Face: https://huggingface.co/roberta-base

# 1. Converting speech audio data to text 

In [None]:
import whisperx
import gc
import json

In [None]:
import os
import glob
import subprocess
from scipy.spatial import distance
import pandas as pd

In [None]:
# Runtime configuration for WhisperX transcription.
device = "cpu"  # set to "cuda" to run on a GPU
batch_size = 16  # reduce if low on GPU mem
# float16 needs a GPU backend: CTranslate2 (used by WhisperX) rejects it on
# CPU, so fall back to int8 there (may slightly reduce accuracy).
compute_type = "float16" if device == "cuda" else "int8"

model = whisperx.load_model("large-v2", device, compute_type=compute_type)

In [None]:
# Input directory of audio segments and output directory for transcripts.
audio_path = "/path/to/audio/segments/files/"
transcribed_path = '/path/to/audio/segments/text/'

# `import glob` binds the module, so the function is `glob.glob`.
# Sorted so that files are processed in a reproducible order.
file_list = sorted(glob.glob(os.path.join(audio_path, '*')))

In [None]:
# Transcribe every audio file, align the transcript to the audio, and save the
# aligned segments as one JSON file per audio file in `transcribed_path`.
os.makedirs(transcribed_path, exist_ok=True)

# language code -> (alignment model, metadata); loaded once per language
# instead of once per file.
align_models = {}

for audio_file in file_list:

    # Unaligned transcription
    audio = whisperx.load_audio(audio_file)
    result = model.transcribe(audio, batch_size=batch_size)

    # Aligned transcription
    language = result["language"]
    if language not in align_models:
        align_models[language] = whisperx.load_align_model(language_code=language, device=device)
    model_a, metadata = align_models[language]
    result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)

    # The full audio file name (extension included) is kept in the JSON name
    # so that "a.wav" and "a.mp3" do not overwrite each other.
    # NOTE(review): the original wrote to an undefined `audio_track` and then
    # ran a malformed `mv` towards the text directory; this assumes the intent
    # was for the transcript to end up in `transcribed_path` - confirm.
    audio_track = os.path.basename(audio_file)
    json_path = os.path.join(transcribed_path, audio_track + ".json")
    with open(json_path, "w", encoding="utf-8") as write_file:
        json.dump(result["segments"], write_file)


# 2. Generating embeddings from the text segments using RoBERTa

In [None]:
import datawrangler as dw 

# Text-model specification handed to datawrangler via `text_kwargs`.
# TODO: confirm that this configuration really loads RoBERTa.
roberta = dict(
    model='TransformerDocumentEmbeddings',
    args=['roberta-base'],
    kwargs=dict(),
)

In [None]:
# For every transcribed track, embed each text segment and write one CSV of
# embeddings plus the segment start/end times.
# NOTE(review): `json_dicts` and `json_files` are not defined in the visible
# cells; presumably the parsed transcript JSONs and their paths, in matching
# order - confirm where they are built.
for track, whisper_data in enumerate(json_dicts):
    whisper_segments = [entry["text"] for entry in whisper_data]
    start_times = [entry["start"] for entry in whisper_data]
    end_times = [entry["end"] for entry in whisper_data]

    segment_embeddings = []
    for segment in whisper_segments:
        # `bert_embeddings` is kept as a name because the cosine-distance cell
        # below reads the last value assigned here.
        bert_embeddings = dw.wrangle(segment, text_kwargs={'model': roberta})
        segment_embeddings.append(bert_embeddings)

    # DataFrame.append was removed in pandas 2.0; concatenating once is also
    # linear rather than re-copying the frame for every segment.
    # Assumes dw.wrangle returns a DataFrame/Series per segment - TODO confirm.
    if segment_embeddings:
        embeddings = pd.concat(segment_embeddings, ignore_index=True)
    else:
        # pd.concat raises on an empty list; keep the empty-track case working.
        embeddings = pd.DataFrame()

    embeddings["start"] = start_times
    embeddings["end"] = end_times

    embeddings.to_csv(json_files[track] + "_embeddings.csv", index=False)

# 3. Taking cosine distance of embeddings

In [None]:
# Rank previously stored conversations by cosine distance to the current
# embedding (`bert_embeddings`) and keep the five closest tracks.
previous_convos = pd.read_csv('')  # TODO: fill in the path to the stored embeddings

# Positional column slice, so it has to go through .iloc rather than .loc.
# NOTE(review): columns 3:724 are 721 wide, which does not obviously match
# the 768 feature columns used in the UMAP section - confirm the range.
stored_embeddings = previous_convos.iloc[:, 3:724]

# One distance per stored row. The original reset the list on every iteration
# and looped over the current track's index, so the column assignment below
# could not match the number of rows in `previous_convos`.
distance_list = [
    distance.cosine(stored_row, bert_embeddings)
    for stored_row in stored_embeddings.to_numpy()
]

previous_convos['distances'] = distance_list

previous_convos.sort_values(by=['distances'], ascending=True, inplace=True)
# Five nearest rows; `[0:4, ['tracks']]` is not valid DataFrame indexing and
# 0:4 would only have covered four rows.
top_5 = previous_convos.iloc[:5][['tracks']]

# 4. Making a UMAP of the conversation

In [None]:
from umap import UMAP

In [None]:
# Load the combined table, then keep every column from the first one through
# the column labelled "767" (label slices are inclusive) as the feature matrix.
dataset = pd.read_csv(
    filepath + "full_dataset.csv"
)
features = dataset.loc[:, slice(None, "767")]

In [None]:
# Two-component UMAP with a cosine metric; random init and a fixed
# random_state are passed so the run is seeded.
umap2d = UMAP(
    metric='cosine',
    n_neighbors=30,
    n_components=2,
    init='random',
    random_state=0,
)

# Fit on the feature matrix and get the 2-D coordinates in one step.
proj_2d = umap2d.fit_transform(features)

In [None]:
# `px` was used without being imported in any visible cell; the calls below
# (scatter with `labels=`, then `.show()`) are the Plotly Express API.
import plotly.express as px

# Attach the segment metadata to the 2-D projection.
df2d = pd.DataFrame(proj_2d)
df2d[["start", "end", "episode", "segment"]] = dataset[["start", "end", "episode", "segment"]]

# Scatter of the two UMAP components, coloured by segment start time.
fig2d = px.scatter(proj_2d, x=0, y=1, color=df2d.start, labels={'color': 'start'})
fig2d.show()
