# Introduction

necessary installs: 
* whisperx
* transformers
* datawrangler
* https://huggingface.co/roberta-base

# 1. Converting speech audio data to text 

In [2]:
import whisperx
import gc
import json

2023-11-13 00:42:57.209603: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
objc[45032]: Class AVFFrameReceiver is implemented in both /Users/f004p74/anaconda3/lib/libavdevice.58.10.100.dylib (0x123afc078) and /Users/f004p74/anaconda3/lib/python3.9/site-packages/av/.dylibs/libavdevice.59.7.100.dylib (0x13a86f118). One of the two will be used. Which one is undefined.
objc[45032]: Class AVFAudioReceiver is implemented in both /Users/f004p74/anaconda3/lib/libavdevice.58.10.100.dylib (0x123afc0c8) and /Users/f004p74/anaconda3/lib/python3.9/site-packages/av/.dylibs/libavdevice.59.7.100.dylib (0x13a86f168). One of the two will be used. Which one is undefined.
  Referenced from: <AE5A0901-5B6C-3028-ADEE-0C068D04

In [3]:
import os
import glob
import subprocess
from scipy.spatial import distance
import pandas as pd

In [None]:
device = "cpu" # default is cuda
batch_size = 16 # reduce if low on GPU mem    
compute_type = "float16" # change to "int8" if low on GPU mem (may reduce accuracy)

model = whisperx.load_model("large-v2", device, compute_type=compute_type)

In [None]:
audio_path = "/path/to/audio/segments/files/"
transcribed_path = 'convo_art/transcripts/'

file_list = glob(os.path.join(audio_path, '*'))

In [None]:
for audio_file in file_list:

    #Unaligned transcriptions
    audio = whisperx.load_audio(audio_file)
    result = model.transcribe(audio, batch_size=batch_size)
    
    #Aligned transcriptions
    model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
    result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)
    
    with open(audio_track+".json","w") as write_file:
        json.dump(result["segments"],write_file)
    
    subprocess.run(["-mv", "audio_path+audio_file", "/path/to/audio/segments/text"])


# 2. Taking in text segments and generating Embeddings using RoBERTa

In [31]:
import datawrangler as dw 

In [32]:
#need to check if this actually ports in roBERTa
roberta = {'model': 'TransformerDocumentEmbeddings', 'args': ['roberta-base'], 'kwargs': {}}

In [36]:
path = '/Users/f004p74/Documents/dartmouth/projects/arts-integration/transcripts/'
out_dir = os.listdir(path)
dirlist

['.DS_Store',
 'index-ceres.js',
 'cTom-testing',
 'personal-site',
 'cTom-experiment']

In [52]:
out_dir = os.listdir(path)
transcripts = [x for x in out_dir if '.json' in x]

segment, start, end = [], [], [] 

for file in transcripts:
    with open(path+os.sep+file, "r") as read_file:
        data = json.load(read_file)

    for line in range(len(data)):
        segment.append(data[line]["text"])
        start.append(data[line]["start"])
        end.append(data[line]["end"])

embeddings = []

for i in segment[0:2]:
    bert_embeddings = dw.wrangle(i, text_kwargs={'model': roberta})
    embeddings.append(bert_embeddings)

df = pd.concat(embeddings)
df["start"] = start[0:2]
df["end"] = end[0:2]

In [53]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,760,761,762,763,764,765,766,767,start,end
0,-0.026634,0.085026,-0.040498,-0.105231,0.06899,-0.083818,-0.017453,0.022588,0.086036,-0.106865,...,-0.036755,-0.065474,-0.000728,0.133913,0.101147,-0.047516,-0.050494,0.014184,0.789,9.501
0,-0.049831,0.048819,-0.050759,-0.133401,0.072044,-0.062051,-0.009552,-0.036435,0.076738,-0.095187,...,-0.053567,-0.081428,-0.006288,0.100437,0.131865,-0.090388,-0.05864,0.01511,9.501,15.689


In [33]:
def embed_transcripts (path):
    out_dir = os.listdir(path)
    transcripts = [x for x in out_dir if '.json' in x]

    segment, start, end = [], [], [] 

    for file in transcripts:
        with open(path+os.sep+file, "r") as read_file:
            data = json.load(read_file)

        for line in range(len(data)):
            segment.append(data[line]["text"])
            start.append(data[line]["start"])
            end.append(data[line]["end"])

    embeddings = []

    for i in segment:
        bert_embeddings = dw.wrangle(i, text_kwargs={'model': roberta})
        embeddings.append(bert_embeddings)

    df = pd.concat(embeddings)
    df["start"] = start
    df["end"] = end
    
    return df

In [None]:
embed_transcripts('/Users/f004p74/Documents/dartmouth/projects/arts-integration/transcripts/')

# 3. Taking cosine distance of embeddings

In [None]:
previous_convos = pd.read_csv('')

for ix in range(embeddings.index.max()+1):
    distance_list = []
    #past embeddings vs current track's embeddings(bert_embeddings)
    distance_list.append(distance.cosine(previous_convos.loc[ix,3:724],bert_embeddings))
    
previous_convos['distances'] = distance_list

previous_convos.sort_values(by=['distances'], ascending=True, inplace=True)
top_5 = previous_convos[0:4,['tracks']]

# 4. Making a UMAP of the conversation

In [None]:
from umap import UMAP

In [None]:
dataset = pd.read_csv(filepath+"full_dataset.csv")
features = dataset.loc[:,:"767"]

In [None]:
umap2d = UMAP(n_components=2, n_neighbors=30, init='random',random_state=0, metric='cosine')

proj_2d = umap2d.fit_transform(features)

In [None]:
df2d = pd.DataFrame(proj_2d)
df2d[["start","end","episode","segment"]] = dataset[["start","end","episode","segment"]]

fig2d = px.scatter(proj_2d, x=0, y=1, color=df2d.start, labels={'color':'start'})
fig2d.show()
