# Process embeddings.

### Description
This pipeline processes sentence embeddings and calculates predictions and performance metrics. The process consists of loading the tensor files from the "create_embeddings" pipeline output folder and calculating the predicted values for the topic segmentation task. With the predicted values we calculate the PK and WD metrics for each of the podcasts and for the overall dataset. Due to the size of the dataset (~4000 videos) we save each metric result to a specific folder so the measurements can be replicated easily. The results are saved in the "./data/dataset_name/metrics_results/pre-trained model/" folder (for the YouTube dataset an extra sub-folder named after the source file is inserted before the model folder). Outputs of this pipeline are:

- Y_hat_list: List of arrays containing each prediction array per podcast
- T_hat_list: List of arrays containing each topic change index per podcast
- sims_list: List of arrays containing the similarity vectors calculated per podcast
- results_pre-trained model name.csv: CSV with all PK and WD values per podcast.

Parameters:
- dataset_name: Name of the dataset to be loaded and processed.
- model_name: Type of model to create the sentence embeddings.
- pre_trained_model: Pre trained model to use in the sentence embeddings process
- dim_redux_method: What type of dimensionality reduction process to use to create the sentence embeddings from the token embeddings.
- print_debug: Print a message every time an embedding is created.

In [None]:
#@title Imported Packages and Libraries
!pip install nlp --quiet
!pip install transformers --quiet
!pip install datasets --quiet
!pip install -U sentence-transformers --quiet

from collections import Counter
import numpy as np
import tensorflow as tf
from tensorflow import keras
import tensorflow_hub as hub
from nlp import load_dataset
import random

import seaborn as sns
from pprint import pprint 

# Utilites
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt # plotting
import numpy as np # linear algebra
from numpy.linalg import norm
from numpy import dot
import os # accessing directory structure
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import csv
import time
import torch
from datetime import datetime
from torch._C import NoneType
from transformers import AutoTokenizer, AutoModel

# JSON
import json

# Embeddings
from transformers import BertTokenizer, TFBertModel
import sklearn as sk
import nltk
from nltk.data import find
from nltk.metrics.segmentation import windowdiff
from nltk.metrics.segmentation import pk

# Pandas CSV processing
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Formatting options for float number in numpy:
# every float that numpy prints is rendered with four decimal places.
float_formatter = "{:.4f}".format
np.set_printoptions(formatter={'float_kind':float_formatter})

# Mount Google Drive at /content/drive (this only works inside Google Colab).
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
# Root folder of the project; load_process_dataset builds its data paths from this value.
path = "/content/drive/MyDrive/W266/project/nlp_podcast_segmentation/" #@param ["/content/drive/MyDrive/W266/project/nlp_podcast_segmentation/", "other"] {allow-input: true}


[K     |████████████████████████████████| 1.7 MB 4.1 MB/s 
[K     |████████████████████████████████| 212 kB 51.6 MB/s 
[K     |████████████████████████████████| 5.8 MB 4.0 MB/s 
[K     |████████████████████████████████| 182 kB 47.3 MB/s 
[K     |████████████████████████████████| 7.6 MB 45.3 MB/s 
[K     |████████████████████████████████| 451 kB 4.2 MB/s 
[K     |████████████████████████████████| 132 kB 75.3 MB/s 
[K     |████████████████████████████████| 127 kB 77.6 MB/s 
[K     |████████████████████████████████| 85 kB 2.2 MB/s 
[K     |████████████████████████████████| 1.3 MB 8.8 MB/s 
[?25h  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone
Mounted at /content/drive


In [None]:
#@title Choose parameters
# Choose parameters
# Every "#@param" annotation below declares a Colab form field; the literal on
# the left-hand side is the value currently selected in that form.

dataset_name =        "YouTube/yt_scripts_segments_split_n5_111422.csv" #@param ["AMIDataset", "YouTube/yt_scripts_segments_yt_simple_110922.csv", "YouTube/yt_scripts_segments_split_n5_111422.csv", "YouTube/yt_scripts_segments_split_n3_111422.csv","YouTube/yt_small_spacy_dev.csv", "YouTube/yt_scripts_segments_spacy_111022.csv", "YouTube/yt_scripts_segments_split_n5_111422_subset10.csv","YouTube/yt_scripts_segments_split_n10_112922.csv"]
model_name =          "SBERT" #@param ["SBERT", "Universal Sentence Encoder"]
pre_trained_model =   "all-MiniLM-L6-v2" #@param ["all-mpnet-base-v2","stsb-mpnet-base-v2", "all-MiniLM-L6-v2", "multi-qa-mpnet-base-dot-v1", "nli-bert-large-max-pooling", "https://tfhub.dev/google/universal-sentence-encoder/4", "https://tfhub.dev/google/universal-sentence-encoder-large/5", "/content/drive/MyDrive/W266/project/nlp_podcast_segmentation/scripts/ricardo/use/"]
dim_redux_method =    'meanpooling' #@param ["meanpooling", "maxpooling"]
print_debug =         "Yes" #@param ["Yes", "No"]
similarity_window =   "1" #@param ["Average of sentences"] {allow-input: true}
z_partition_dataset = "all" #@param ["train", "test", "all"] 
Z =                   "1" #@param ["1.4"] {allow-input: true}
metric_calculation =  "average" #@param ["smooth", "average","append"] 
baseline =  "NO" #@param ["even", "random","none","NO"] 
# The form delivers Z as text; convert it once so later code can use it as a number.
Z = float(Z)

# For the Universal Sentence Encoder the selected value is a URL or path:
# keep the original in pre_trained_model_url and derive a short name from its
# last two "/"-separated components.
# NOTE(review): a value ending in "/" has an empty last component, so the
# derived name then ends with "-" - confirm this is intended for the local path option.
if model_name == "Universal Sentence Encoder":
  pre_trained_model_url = pre_trained_model
  pre_trained_model = str(pre_trained_model.split("/")[-2]) + "-" + str(pre_trained_model.split("/")[-1]) 

# A dataset_name of the form "YouTube/<file>" is split into the file name
# (filename) and the dataset name ("YouTube"); any other value is left
# untouched and filename stays empty.
filename = ""
if len(dataset_name.split("/")) > 1:
  if dataset_name.split("/")[-2] == "YouTube":
    filename = dataset_name.split("/")[-1]
    dataset_name = dataset_name.split("/")[-2]
    print("Filename:                  " + filename)
# Echo the effective configuration.
print("Dataset:                   " + dataset_name)
print("Model:                     " + model_name)
print("Pre-trained Model:         " + pre_trained_model)
print("Dimensionality Redyction:  " + dim_redux_method)

# Define loading functions

#Remove adjacent topic changes
def clean_adj_topic(topic_list):
  '''
  Zero out every topic change that immediately follows another topic change.

  The input is left untouched: the work happens on a copy obtained through
  topic_list.copy(). The scan runs left to right and compares against the
  already cleaned copy, so in a run of three 1s the first and the third
  survive.
  Example: [0,1,1,0,1,1,1,0,1] becomes [0,1,0,0,1,0,1,0,1]

  input: sequence of topic labels exposing .copy()
  returns: the cleaned copy
  '''
  cleaned = topic_list.copy()

  for position in range(1, len(topic_list)):
    follows_a_change = cleaned[position-1] == 1
    if follows_a_change and cleaned[position] == 1:
      cleaned[position] = 0

  return cleaned

# average_sentences
# Calculate average sentences
def average_sentences(topic_list):
  '''
  Average number of sentences per topic segment.

  input: topic labels, where a label equal to 1 counts as one topic change
  returns: len(topic_list) divided by the number of labels equal to 1,
           rounded to the nearest integer. When no label equals 1 the whole
           sequence is treated as one single segment and its length is
           returned (0 for an empty sequence) instead of dividing by zero.
  '''
  total_sentences = len(topic_list)
  topic_changes = sum(1 for label in topic_list if label == 1)

  # Without any topic change there is exactly one segment spanning everything.
  if topic_changes == 0:
    return total_sentences

  return int(round(total_sentences/topic_changes))

# evaluate_pk
# Evaluates PK
def evaluate_pk(pred, act, k=5):
  '''
  Fraction of sliding windows in which prediction and reference disagree on
  whether a topic change is present.

  A window is centred on every index from k up to (but excluding)
  len(act) - k and covers the 2k-1 positions [idx-(k-1), idx+k). A window
  "has a change" when the labels inside it sum to at least 1.

  input: pred (predicted labels), act (reference labels), k (half window size)
  returns: mismatching windows / evaluated windows
  raises: ZeroDivisionError when no window fits, i.e. len(act) <= 2*k
  '''
  mismatches = 0
  windows_checked = 0

  for centre in range(k, len(act) - k):
    lo = centre-(k-1)
    hi = centre+k

    pred_has_change = sum(pred[lo:hi]) >= 1
    act_has_change = sum(act[lo:hi]) >= 1

    if pred_has_change != act_has_change:
      mismatches += 1.0
    windows_checked += 1.0

  return mismatches/windows_checked

# evaluate_wd
# Evaluates WD
def evaluate_wd(pred, act, k=5):
  '''
  Window-difference style score between predicted and reference labels.

  For every index from k up to (but excluding) len(act) - k the labels in
  the 2k positions [idx-k, idx+k) are summed for both sequences; a window
  is counted whenever the two sums differ.

  input: pred (predicted labels), act (reference labels), k (half window size)
  returns: counted windows multiplied by 1/(len(act) - k)
  raises: ZeroDivisionError when len(act) == k

  NOTE(review): the loop evaluates len(act) - 2k windows while the result is
  scaled by 1/(len(act) - k) - confirm this normalisation is intended.
  '''
  N = len(act)
  differing_windows = 0

  for centre in range(k, N - k):
    lo = centre - k
    hi = centre + k
    boundary_gap = abs(sum(pred[lo:hi]) - sum(act[lo:hi]))

    # only windows whose boundary counts differ contribute
    if boundary_gap > 0:
      differing_windows += 1

  return (1/(N-k))*differing_windows

# get_meeting_sentences:
# looks up one transcript by name and returns its sentences, topics and labels
def get_meeting_sentences(embedding_name="", S_list=[], T_list=[], Y_list=[], transcripts_list=[]):
  '''
  Return the data stored for the first transcript whose name equals
  embedding_name.

  input: embedding_name (name to look for), S_list / T_list / Y_list
         (per-transcript data, indexed like transcripts_list),
         transcripts_list (transcript names)
  returns: (sentences, topics, labels converted with int(), transcript name);
           None when no transcript name matches
  '''
  for position, name in enumerate(transcripts_list):
    if name != embedding_name:
      continue
    # labels are handed back as plain integers whatever type they were stored as
    labels_as_int = [int(label) for label in Y_list[position]]
    return S_list[position], T_list[position], labels_as_int, name

# cos_sim:
# Cosine similarity of two vectors
def cos_sim(a,b):
  '''
  Dot product of a and b divided by the product of their norms.

  input: a, b (vectors accepted by numpy's dot and numpy.linalg.norm)
  returns: the cosine similarity value
  '''
  numerator = dot(a, b)
  denominator = norm(a)*norm(b)
  return numerator/denominator

# estimate_total:
# Estimates total ETA to finish embeddings
def estimate_total(S_list, done_embeddings, emb_speed):
  '''
  Estimate how long the remaining embeddings will take.

  input: S_list (sentences per transcript), done_embeddings (names of
         transcripts that are already embedded), emb_speed (words per unit
         of time)
  returns: number of whitespace-separated words in all transcripts that are
           not in done_embeddings, divided by emb_speed

  Transcript names are read from the module-level transcripts_list, which
  has to be index-aligned with S_list.
  '''
  pending_words = 0

  for position, sentences in enumerate(S_list):
    if transcripts_list[position] in done_embeddings:
      continue
    pending_words += sum(len(text.split()) for text in sentences)

  return pending_words/emb_speed

# get_done_embeddings:
# Lists the transcripts that already have an embedding file in a folder.
# Inputs: Embeddings Path
# Returns List of transcript names that have embeddings stored in file
def get_done_embeddings(embeddings_path):
  '''
  Collect the names of the ".pt" / ".npy" files found directly inside
  embeddings_path (sub-folders are not visited) that belong to the selected
  partition.

  The partition comes from module-level state: when z_partition_dataset is
  "all" every name in transcripts_list qualifies, otherwise the names are
  read from the z_partition_dataset entry of "ep_splits.json" inside
  dataset_path.

  input: embeddings_path (folder to scan)
  returns: list of file names with ".pt" and ".npy" removed; empty when the
           folder yields nothing
  '''
  if z_partition_dataset == "all":
    allowed_names = transcripts_list
  else:
    splits_file = os.path.join(dataset_path, "ep_splits.json")
    with open(splits_file) as f:
      allowed_names = json.load(f)[z_partition_dataset]

  # Only the first entry produced by os.walk (the folder itself) is used.
  _, _, top_level_files = next(os.walk(embeddings_path), (None, [], []))

  found = []
  for file_name in top_level_files:
    if file_name.split(".")[-1] not in ("pt", "npy"):
      continue
    stem = file_name.replace(".pt","").replace(".npy","")
    if stem in allowed_names:
      found.append(stem)
  return found

# get_done_metrics:
# Searches all metrics done in a csv file.
def get_done_metrics(metric_results_filename_path="",
                     Y_hat_list_filename_path="", 
                     T_hat_list_filename_path="",
                     sims_list_filename_path=""):
  '''
  Load previously stored metric results.

  input: metric_results_filename_path (CSV whose 2nd, 3rd and 4th columns
         hold the transcript name, the PK value and the WD value),
         Y_hat_list_filename_path / T_hat_list_filename_path /
         sims_list_filename_path (.npy files saved alongside the CSV)
  returns: (done_metrics, PK_metrics, WD_metrics, Y_hat_list, T_hat_list,
           sims_list) - names as strings, PK/WD as floats, the three arrays
           converted with .tolist()

  Every non-empty CSV row is treated as data; blank rows are skipped.
  '''
  done_metrics = []
  PK_metrics = []
  WD_metrics = []

  # The file is only read, so it is opened read-only; this also works when
  # the results file is write-protected.
  with open(metric_results_filename_path, 'r', encoding='UTF8', newline='') as f:
    for row in csv.reader(f, delimiter=','):
      if not row:
        # csv.reader yields [] for blank lines (e.g. a trailing newline)
        continue
      done_metrics.append(row[1])
      PK_metrics.append(float(row[2]))
      WD_metrics.append(float(row[3]))

  # NOTE(security): allow_pickle=True lets numpy unpickle arbitrary objects.
  # Only load files produced by this pipeline, never files from untrusted sources.
  Y_hat_list = np.load(Y_hat_list_filename_path,allow_pickle=True).tolist()
  T_hat_list = np.load(T_hat_list_filename_path,allow_pickle=True).tolist()
  sims_list = np.load(sims_list_filename_path,allow_pickle=True).tolist()
  return done_metrics, PK_metrics, WD_metrics, Y_hat_list, T_hat_list, sims_list

# load_ami_dataset:
# Loads Transcripts from .transcripts/ folder and converts them into
# S:                list of M utterances S = {S_1,..., S_M}
# T:                Underlying topic structure Ti ∈ [Sj , Sk]
# Y:                Label sequence Y = {y1,.., yM} yi is binary indicates whether the utterance Si is the start of a new topic segment
# S_List:           List of S_i (utterances) for the i-th transcript or meeting
# T_List:           List of T_i (Topic changes tuples) for the i-th transcript or meeting
# Y_List:           List of Y_i (Topic changes flat) for the i-th transcript or meeting
# transcripts_list: List of transcript names. Meetings or video_id.
def _clean_ami_sentence(sentence_text):
  # Tidy one AMI sentence: tighten " . " into ". " and drop annotation tags.
  # The " . " replacement runs twice on purpose: str.replace does not rescan
  # overlapping matches, so "a . . b" needs the second pass.
  sentence_text = sentence_text.replace(" . ",". ")
  sentence_text = sentence_text.replace(" . ",". ")
  sentence_text = sentence_text.replace("[gap]","")
  sentence_text = sentence_text.replace("[vocalsound]","")
  sentence_text = sentence_text.replace("[disfmarker]","")
  sentence_text = sentence_text.replace("[transformerror]"," ")
  return sentence_text


def load_ami_dataset(transcript_path=""):
  '''
  Load every transcript file below a folder and build the S / T / Y lists.

  Each file is parsed as JSON and is expected to hold a list of topics, each
  topic being a mapping with a 'sentences' list whose items carry a 'text'
  entry. Topics without sentences are skipped.

  input: transcript_path (folder containing the transcript files). When it
         is empty the module-level variable transcripts_path is used if it
         exists, which keeps older call sites working.
  returns: (S_list, T_list, Y_list, transcripts_list)
           S_list: cleaned sentence texts per transcript
           T_list: (first_index, last_index) sentence span per topic
           Y_list: 1 for the first sentence of a topic, otherwise 0
           transcripts_list: file names with ".json" removed

  Errors are reported with a timestamped message and whatever was collected
  up to that point is returned; no exception leaves this function.
  '''
  # Initiate variables
  S_list = []
  T_list = []
  Y_list = []
  transcripts_list = []
  meeting_transcripts = []

  # The argument is the folder to read; the global is only a fallback.
  transcripts_dir = transcript_path or globals().get("transcripts_path", "")

  # Reads JSON from Folder
  try:
    for current_dir, _, file_names in os.walk(transcripts_dir):
      for file_name in file_names:
        transcripts_list.append(file_name.replace(".json",""))
        # join with the folder os.walk is currently visiting so files in
        # sub-folders resolve to their real location
        with open(os.path.join(current_dir, file_name)) as f:
          meeting_transcripts.append(json.load(f))
  except Exception as error:
    print(datetime.today().strftime('%Y-%m-%d %H:%M:%S') + " ---  Error Loading JSON files - " + str(error))
    return S_list, T_list, Y_list, transcripts_list

  # Process data set to return Main Variables
  try:
    for meeting_idx, meeting_transcript in enumerate(meeting_transcripts):
      transcript_id = transcripts_list[meeting_idx]
      S = []
      T = []
      Y = []
      next_start = 0  # index (within this meeting) of the next unseen sentence

      for topics_idx, topics in enumerate(meeting_transcript):
        sentences = topics['sentences']
        if not sentences:
          # a topic without sentences has no span and must not shift later spans
          continue

        for sentence_idx, sentence in enumerate(sentences):
          sentence_text = _clean_ami_sentence(sentence['text'])
          sentence_word_count = len(sentence_text.split())

          # Check if Sentence over 512 Words
          if sentence_word_count > 512:
            print(datetime.today().strftime('%Y-%m-%d %H:%M:%S') + " ---  Error wordcount over 512 " + str(transcript_id) + " - Topic: "  + str(topics_idx) + " title: " + str(topics['topic_idx']) + " Sentence: " + str(sentence_idx) + " Word Count: " + str(sentence_word_count))

          S.append(sentence_text)
          # the first sentence of every topic marks a topic change
          Y.append(1 if sentence_idx == 0 else 0)

        T.append((next_start, next_start + len(sentences) - 1))
        next_start += len(sentences)

      S_list.append(S)
      T_list.append(T)
      Y_list.append(Y)
  except Exception as error:
    print(datetime.today().strftime('%Y-%m-%d %H:%M:%S') + " ---  Error Processing AMI Dataset - " + str(error))
    return S_list, T_list, Y_list, transcripts_list

  return S_list, T_list, Y_list, transcripts_list

# load_youtube_dataset:
# Loads the dataset file from the .YouTube/ folder and converts it into
# S:                list of M utterances S = {S_1,..., S_M}
# T:                Underlying topic structure Ti ∈ [Sj , Sk]
# Y:                Label sequence Y = {y1,.., yM} yi is binary indicates whether the utterance Si is the start of a new topic segment
# S_List:           List of S_i (utterances) for the i-th transcript or meeting
# T_List:           List of T_i (Topic changes tuples) for the i-th transcript or meeting
# Y_List:           List of Y_i (Topic changes flat) for the i-th transcript or meeting
# transcripts_list: List of transcript names. Meetings or video_id.
def load_youtube_dataset(dataset_path="", filename=""):
  '''
  Load the YouTube dataset file and split it into the S / T / Y lists.

  The file is read with pandas.read_pickle (despite the ".csv" names used in
  this project) and must provide the columns "Video_Id",
  "Sentence_Word_Lists", "Transition_Labels_Tuple" and "Transition_Labels".
  The sentence text is taken from element 0 of every entry of
  "Sentence_Word_Lists".

  input: dataset_path (folder), filename (file inside that folder)
  returns: (S_list, T_list, Y_list, transcripts_list)

  Errors are reported with a timestamped message and whatever was collected
  up to that point is returned; no exception leaves this function.
  '''
  # Initiate variables
  S_list=[]
  T_list = []
  Y_list = []
  transcripts_list=[]

  # Bound up front so the error message below can always be formatted, even
  # when the failure happens before the sentence loop starts.
  video_id = None
  sentences_idx = None

  # Reads the pickled DataFrame from Folder
  try:
    csv_path = os.path.join(dataset_path,filename)
    # NOTE(security): read_pickle executes whatever the pickle contains.
    # Only load files produced by this project, never untrusted files.
    yt_pods = pd.read_pickle(csv_path)
  except Exception as error:
    print(datetime.today().strftime('%Y-%m-%d %H:%M:%S') + " ---  Error Loading csv from YouTube Dataset - " + str(error))
    return S_list, T_list, Y_list, transcripts_list

  # Process data set to return Main Variables
  try:
    transcripts_list.extend(yt_pods["Video_Id"])

    for sentences_idx, sentences in enumerate(yt_pods["Sentence_Word_Lists"]):
      # remember which video is being processed for the error message
      video_id = transcripts_list[sentences_idx]
      S_list.append([sentence[0] for sentence in sentences])

    T_list.extend(yt_pods["Transition_Labels_Tuple"])
    Y_list.extend(yt_pods["Transition_Labels"])
  except Exception as error:
    print(datetime.today().strftime('%Y-%m-%d %H:%M:%S') + " --- Error processing Video: " + str(video_id) + " - Index: " +str(sentences_idx) + " - " + str(error))
    return S_list, T_list, Y_list, transcripts_list
  
  return S_list, T_list, Y_list, transcripts_list

# load_process_dataset:
# Loads the requested dataset and returns the main variables together with
# the transcript names and the dataset paths
def load_process_dataset(dataset_name="", filename=""):
  '''
  Resolve the dataset folders and delegate loading to the matching loader.

  The folders are derived from the module-level variable path:
  <path>data/<dataset_name>/ and, inside it, transcripts/.
  "AMIDataset" is loaded with load_ami_dataset and "YouTube" with
  load_youtube_dataset; any other name yields empty lists.

  input: dataset_name (dataset to load), filename (file handed to the
         YouTube loader)
  returns: (S_list, T_list, Y_list, W_count_list),
           (meeting_transcripts, transcripts_list),
           (dataset_path, transcripts_path)
           W_count_list and meeting_transcripts are always empty lists.
  '''
  dataset_path = os.path.join(path,'data/' + dataset_name + "/")
  transcripts_path = os.path.join(dataset_path,'transcripts/')

  # result for dataset names without a loader
  S_list, T_list, Y_list, transcripts_list = [], [], [], []

  if dataset_name == "AMIDataset":
    S_list, T_list, Y_list, transcripts_list = load_ami_dataset(transcripts_path)
  elif dataset_name == "YouTube":
    S_list, T_list, Y_list, transcripts_list = load_youtube_dataset(dataset_path=dataset_path,filename=filename)

  W_count_list = []
  meeting_transcripts = []
  main_variables = (S_list, T_list, Y_list, W_count_list)
  transcript_info = (meeting_transcripts, transcripts_list)
  locations = (dataset_path, transcripts_path)
  return main_variables, transcript_info, locations

def rand_baseline(Y):
  '''
  Random baseline: draw one random position per reference topic change.

  As many indices as there are labels equal to 1 in Y are drawn with
  random.randint over the whole index range of Y and set to 1 in an
  all-zero vector. Draws may repeat, so the result can hold fewer 1s than Y.

  input: Y (reference labels)
  returns: list with len(Y) integer labels
  '''
  #Number of topic changes in the reference labels
  n_changes = sum(1 for label in Y if label == 1)
  last_index = len(Y)-1

  #One random position per topic change
  picks = [random.randint(0,last_index) for _ in range(n_changes)]

  Y_random = np.zeros(len(Y), dtype=int)
  for pick in picks:
    Y_random[pick] = 1

  return list(Y_random)

def even_baseline(Y):
  '''
  Even baseline: place a topic change every n sentences, starting at index 0,
  where n is the average segment length reported by average_sentences(Y).

  The spacing runs to the end of the sequence. An earlier version stopped
  after n boundaries, i.e. it compared the number of boundaries against the
  segment length, which cut the baseline short whenever a transcript had
  more segments than sentences per segment.

  input: Y (reference labels)
  returns: list with len(Y) integer labels; all zeros when Y contains no
           label equal to 1 (there is no segment length to derive then)
  '''
  Y_even = np.zeros(len(Y)).astype(int)

  # Without reference topic changes there is nothing to space out.
  if not any(label == 1 for label in Y):
    return list(Y_even)

  # number of sentences between two consecutive boundaries
  n = int(average_sentences(Y))

  # every n-th position, beginning with the first sentence
  Y_even[::n] = 1

  return list(Y_even)

# None baseline: predicts no topic change at all
def none_baseline(Y):
  '''
  input: Y (reference labels, only the length is used)
  returns: list of len(Y) zeros
  '''
  return [0 for _ in range(len(Y))]

#Mae calculation for all predictions
def mae_std(y_true, y_pred):
  '''
  Mean relative error of the number of topic changes per episode.

  For every episode the labels of y_true and y_pred are summed, the absolute
  difference is divided by the reference sum and rounded to 5 decimals; the
  mean over all episodes is returned.

  input: y_true (reference labels per episode), y_pred (predicted labels per
         episode, index-aligned with y_true)
  returns: mean of the per-episode relative differences
  raises: SystemExit (after printing a message) when the two inputs differ
          in length or y_true holds at most one episode
  '''
  sizes_match = len(y_true) == len(y_pred)
  if not sizes_match or len(y_true) <= 1:
    print(datetime.today().strftime('%Y-%m-%d %H:%M:%S') + 
        " - Error calculating MAE. Y_hat_list size vs Y_list size")
    raise SystemExit

  relative_errors = []
  for episode_idx, reference_labels in enumerate(y_true):
    reference_changes = np.sum(reference_labels)
    predicted_changes = np.sum(y_pred[episode_idx])
    gap = np.abs(reference_changes - predicted_changes)
    relative_errors.append(round(gap/reference_changes,5))

  return np.mean(relative_errors)


Filename:                  yt_scripts_segments_split_n5_111422.csv
Dataset:                   YouTube
Model:                     SBERT
Pre-trained Model:         all-MiniLM-L6-v2
Dimensionality Redyction:  meanpooling


In [None]:
#@title Load Dataset
# Load the dataset chosen in the parameters cell and unpack the three tuples
# returned by load_process_dataset into notebook-level variables that the
# following cells and several helper functions read.
(S_list, T_list, Y_list, W_count_list), (meeting_transcripts, transcripts_list), (dataset_path, transcripts_path) = load_process_dataset(dataset_name = dataset_name, filename = filename)

# Print Test
# Sanity check: show the paths, the overall sizes and the details of one
# sample transcript (index_test). Raises IndexError when fewer than
# index_test + 1 transcripts were loaded.
index_test = 3
print("\n")
print("Transcripts path:            " + str(transcripts_path))
print("Dataset path:                " + str(dataset_path))
print("Numb of transcripts S_list:  " + str(len(S_list)))
print("Numb of Topics T_list:       " + str(len(T_list)))
print("Numb of Outputs Y_list:      " + str(len(Y_list)))
print("Transcripts name:            " + str(transcripts_list[index_test]))
print("Numb of sentences in Test    " + str(len(S_list[index_test])))
print("Numb of topics in Test       " + str(len(T_list[index_test])))
print("Numb of Outputs in Test      " + str(len(Y_list[index_test])))



Transcripts path:            /content/drive/MyDrive/W266/project/nlp_podcast_segmentation/data/YouTube/transcripts/
Dataset path:                /content/drive/MyDrive/W266/project/nlp_podcast_segmentation/data/YouTube/
Numb of transcripts S_list:  3757
Numb of Topics T_list:       3757
Numb of Outputs Y_list:      3757
Transcripts name:            vNhSCF9i8Qs
Numb of sentences in Test    1167
Numb of topics in Test       19
Numb of Outputs in Test      1167


In [None]:
from transformers.models.auto.modeling_auto import MODEL_MAPPING_NAMES
#@title Prepare embeddings and calculate Y_hat

# Prepare Variables
# Every name the best-effort read step below fills in gets an empty default
# here, so a failed read (e.g. no results file yet on a first run) leaves them
# defined instead of unbound for the rest of the cell.
done_embeddings = []
done_metrics = []
PK_metrics = []
WD_metrics = []
sentence_embeddings_list = []
Y_hat_list = []
T_hat_list = []
sims_list = []

# Name of the embeddings folder; also used as suffix of the result files
pre_trained_model_fullname = str(pre_trained_model + '-' + dim_redux_method)

# Name of the metrics folder: model (or baseline) plus the run settings
if baseline == "NO":
  pre_trained_model_fullname_metrics_result = str(pre_trained_model + '-' + str(dim_redux_method) + "-WS" + str(similarity_window) + "-Z" + str(Z) + "-" + z_partition_dataset)
else:
  pre_trained_model_fullname_metrics_result = 'baseline-' + str(baseline) + "-WS" + str(similarity_window) + "-Z" + str(Z) + "-" + str(z_partition_dataset)

# Define embeddings and metrics_results paths depending on Dataset.
# For YouTube both get an extra folder level named after the second-to-last
# dot-separated part of `filename`.
if dataset_name == "YouTube":
  filename_folder = filename.split(".")[-2]
  embeddings_path = os.path.join(dataset_path, 'embeddings/', str(filename_folder), str(pre_trained_model_fullname + '/'))
  metrics_results_path = os.path.join(dataset_path, 'metrics_results/', str(filename_folder + '/'), str(pre_trained_model_fullname_metrics_result + '/'))
else:
  embeddings_path = os.path.join(dataset_path, 'embeddings/', str(pre_trained_model_fullname + '/'))
  metrics_results_path = os.path.join(dataset_path, 'metrics_results/', str(pre_trained_model_fullname_metrics_result + '/'))

# Result files written (and re-read on later runs) inside metrics_results_path
metric_results_filename_path = os.path.join(metrics_results_path, str("results_" + pre_trained_model_fullname + '.csv'))
Y_hat_list_filename_path = os.path.join(metrics_results_path, str("Y_hat_list_" + pre_trained_model_fullname + '.npy'))
T_hat_list_filename_path = os.path.join(metrics_results_path, str("T_hat_list_" + pre_trained_model_fullname + '.npy'))
sims_list_filename_path = os.path.join(metrics_results_path, str("sims_list_" + pre_trained_model_fullname + '.npy'))

# Check if the metrics_results folder exists, if not create it
if not os.path.exists(metrics_results_path):
  os.makedirs(metrics_results_path)
  print(datetime.today().strftime('%Y-%m-%d %H:%M:%S') +
        " - " + str(pre_trained_model_fullname_metrics_result) +
        " - metrics_results_path folder not found. New folder created")

# List all embeddings in folder embeddings_path and the metrics already done.
# Best effort: a failure is only reported, and whatever was not read keeps the
# empty default assigned above.
try:
  print(datetime.today().strftime('%Y-%m-%d %H:%M:%S') +
        " - Reading embeddings in path: " + str(embeddings_path))

  done_embeddings = get_done_embeddings(embeddings_path)

  print(datetime.today().strftime('%Y-%m-%d %H:%M:%S') +
        " - Total embeddings: " + str(len(done_embeddings)))

  print(datetime.today().strftime('%Y-%m-%d %H:%M:%S') +
        " - Reading metrics in path: " + str(metric_results_filename_path))

  done_metrics, PK_metrics, WD_metrics, Y_hat_list, T_hat_list, sims_list = get_done_metrics(metric_results_filename_path, Y_hat_list_filename_path, T_hat_list_filename_path, sims_list_filename_path)

  print(datetime.today().strftime('%Y-%m-%d %H:%M:%S') +
        " - Total metrics: " + str(len(done_metrics)))

  print(datetime.today().strftime('%Y-%m-%d %H:%M:%S') +
        " - Total Y_hat_list: " + str(len(Y_hat_list)))

  print(datetime.today().strftime('%Y-%m-%d %H:%M:%S') +
        " - Total T_hat_list: " + str(len(T_hat_list)))

  print(datetime.today().strftime('%Y-%m-%d %H:%M:%S') +
        " - Total sims_list: " + str(len(sims_list)))
except Exception as error:
  print(datetime.today().strftime('%Y-%m-%d %H:%M:%S') +
        " - Error reading embeddings path: " + str(metric_results_filename_path) +
        "\n" +
        " - Error: " + str(error))

# The stored predictions must line up one-to-one with the stored metrics
if len(done_metrics) != len(Y_hat_list):
  print(datetime.today().strftime('%Y-%m-%d %H:%M:%S') +
      " - Error Y_hat_list size")
  raise SystemExit

# Initiate internal variables
# Reference data of the transcripts that have an embedding file, appended in
# the order the embeddings are visited below.
S_list_cropped = []
T_list_cropped = []
Y_list_cropped = []
transcripts_list_cropped = []

# For every embedding found on disk: collect its reference data and, unless it
# was already scored in a previous run, predict the topic changes (Y_hat,
# T_hat), score them with Pk / WindowDiff and persist the results right away so
# an interrupted run can be resumed.
for done_embedding_idx, done_embedding in enumerate(done_embeddings):
  if done_embedding not in transcripts_list:
    print(datetime.today().strftime('%Y-%m-%d %H:%M:%S') +
          " - " + str(done_embedding) +
          " - Skipping File. Not a transcript: ")
    continue

  # Per-transcript working state
  sims = []
  T_hat = []
  R = None

  # Get Sentences, topics, outputs from list of created embedding (Embeddings in folder)
  S, T, Y, transcript_name_cropped = get_meeting_sentences(done_embedding, S_list, T_list, Y_list, transcripts_list)
  S_list_cropped.append(S)
  T_list_cropped.append(T)
  Y_list_cropped.append(Y)
  transcripts_list_cropped.append(transcript_name_cropped)

  # Already scored in a previous run: report the stored values and move on
  if done_embedding in done_metrics:
    done_metrics_idx = done_metrics.index(done_embedding)
    if print_debug == "Yes":
      print(datetime.today().strftime('%Y-%m-%d %H:%M:%S') +
            " - " + str(done_embedding) +
            " - Skipping tensor load. Reading from: " + str(metric_results_filename_path) +
            " - " + str(done_embedding_idx + 1) + " of " + str(len(done_embeddings)) +
            " - PK: " + str(round(float(PK_metrics[done_metrics_idx]),5)) +
            " - WD: " + str(round(float(WD_metrics[done_metrics_idx]),5)) +
            " - T: " + str(len(T)) +
            " T_hat: " + str(len(T_hat_list[done_metrics_idx])))
    continue

  if baseline == "NO":
    # Load the sentence embeddings R of this transcript (one entry per sentence)
    if model_name == "SBERT":
      try:
        tensor_filename = embeddings_path + str(done_embedding) + '.pt'
        R = torch.load(tensor_filename)

        # Check if tensor size ok
        if len(S) != len(R):
          print(datetime.today().strftime('%Y-%m-%d %H:%M:%S') +
                " - " + str(done_embedding) +
                " - Error on Tensor size: " + str(R.size()[0]) +
                " - Sentence Size: " + str(len(S)))
          # SystemExit is not an Exception subclass, so it is not caught below
          raise SystemExit
      except Exception as error:
        print(datetime.today().strftime('%Y-%m-%d %H:%M:%S') +
              " - " + str(done_embedding) +
              " - Error on Loading tensor file: " + str(tensor_filename) +
              "\n" +
              " - Error : " + str(error))
        break

    elif model_name == "Universal Sentence Encoder":
      try:
        tensor_filename = embeddings_path + str(done_embedding) + '.npy'
        # NOTE(review): allow_pickle=True unpickles the file content; only
        # load embedding files produced by this project.
        R = np.load(tensor_filename,allow_pickle=True).tolist()

        # Check if tensor size ok
        if len(S) != len(R):
          print(datetime.today().strftime('%Y-%m-%d %H:%M:%S') +
                " - " + str(done_embedding) +
                " - Error on Tensor size: " + str(len(R)) +
                " - Sentence Size: " + str(len(S)))
          raise SystemExit
      except Exception as error:
        print(datetime.today().strftime('%Y-%m-%d %H:%M:%S') +
              " - " + str(done_embedding) +
              " - Error on Loading tensor file: " + str(tensor_filename) +
              "\n" +
              " - Error : " + str(error))
        break

    # Calculate Similarity Vectors (one similarity value per sentence)
    try:
      if similarity_window != "1":
        if similarity_window != "Average":
          # Windowed variant: mean similarity to up to `similarity_window`
          # previous sentences plus the similarity to the next sentence.
          similarity_window = int(similarity_window)
          for R_idx, r_i in enumerate(R):
            if R_idx == 0:
              sim_i = 1
            elif R_idx + 1 == len(R):
              sim_i = 0
            else:
              similarity_range = range(max(0, R_idx - similarity_window), R_idx)
              sim_after = cos_sim(R[R_idx], R[R_idx + 1])
              sim_before = [cos_sim(R[sim_indx], R[R_idx]) for sim_indx in similarity_range]
              sim_i = np.mean(sim_before) + sim_after
            sims.append(sim_i)
          sims_list.append(sims)
        else:
          # NOTE(review): "Average" computes no similarities for this
          # transcript and switches the window to the int 1 for the following
          # ones - looks unfinished; kept as-is, confirm the intended behavior.
          similarity_window = 1
      else:
        # Window of one: similarity of each sentence to its predecessor
        for R_idx, r_i in enumerate(R):
          if R_idx == 0:
            sim_i = 1
          else:
            sim_i = cos_sim(R[R_idx - 1], r_i)
          sims.append(sim_i)
        sims_list.append(sims)
    except Exception as error:
      print(datetime.today().strftime('%Y-%m-%d %H:%M:%S') +
            " - " + str(done_embedding) +
            " - Error calculating sims" +
            "\n" +
            " - Error: " + str(error))
      break

    # Calculate Y_hat, T_hat: a sentence is a topic change when its similarity
    # drops below mean - Z * std of this transcript's similarities.
    try:
      Y_hat = []
      T_hat = []
      miu = np.mean(sims)
      sigma = np.std(sims)
      sim_treshold = miu - (Z * sigma)
      for sim_idx, sim_i in enumerate(sims):
        if sim_idx == 0:
          # The first sentence always opens a topic
          t_start = sim_idx
          Y_hat.append(1)
        elif sim_i < sim_treshold:
          t_end = sim_idx
          T_hat.append((t_start,t_end))
          t_start = sim_idx + 1
          Y_hat.append(1)
        else:
          Y_hat.append(0)
      Y_hat_list.append(Y_hat)
      T_hat_list.append(T_hat)
    except Exception as error:
      print(datetime.today().strftime('%Y-%m-%d %H:%M:%S') +
            " - " + str(done_embedding) +
            " - Error Calculating Y_Hat, T_hat" +
            "\n" +
            " - Error: " + str(error))
      break

  elif baseline in ("even", "random", "none"):
    # Baselines use no embeddings: the similarity vector stays empty and the
    # labels come straight from the selected baseline generator.
    sims_list.append(sims)
    if baseline == "even":
      Y_hat = even_baseline(Y)
    elif baseline == "random":
      Y_hat = rand_baseline(Y)
    else:
      Y_hat = none_baseline(Y)
    if len(Y_hat) != len(Y):
      print(datetime.today().strftime('%Y-%m-%d %H:%M:%S') +
      " - Error calculating " + baseline + " baseline. Y_hat_list size vs Y_list size")
      raise SystemExit
    Y_hat_list.append(Y_hat)
    # Turn the 0/1 labels into (start, end) index pairs
    for y_hat_idx, y_hat_i in enumerate(Y_hat):
      if y_hat_idx == 0:
        t_start = y_hat_idx
      elif y_hat_i == 1:
        t_end = y_hat_idx
        T_hat.append((t_start,t_end))
        t_start = y_hat_idx + 1
    T_hat_list.append(T_hat)

  else:
    # Without this guard an unknown baseline would score a stale or undefined Y_hat
    print(datetime.today().strftime('%Y-%m-%d %H:%M:%S') +
          " - " + str(done_embedding) +
          " - Error unknown baseline: " + str(baseline))
    raise SystemExit

  # Calculate Pk per video or meeting
  try:
    # nltk PK metric, when k is not filled in, it auto calculates half of the average labeled segments
    pk_metric_meeting = pk(''.join(str(i) for i in Y), ''.join(str(i) for i in Y_hat))
  except Exception as error:
    print(datetime.today().strftime('%Y-%m-%d %H:%M:%S') +
          " - " + str(done_embedding) +
          " - Error Calculating PK" +
          "\n" +
          " - Error: " + str(error))
    raise SystemExit

  # Calculate Wd per video or meeting
  try:
    # nltk wd metric; the window k is passed explicitly as half of average_sentences(Y)
    wd_metric_meeting = windowdiff(''.join(str(i) for i in Y), ''.join(str(i) for i in Y_hat), int(average_sentences(Y)/2))
  except Exception as error:
    print(datetime.today().strftime('%Y-%m-%d %H:%M:%S') +
          " - " + str(done_embedding) +
          " - Error Calculating Wd" +
          "\n" +
          " - Error: " + str(error))
    raise SystemExit

  if print_debug == "Yes":
    print(datetime.today().strftime('%Y-%m-%d %H:%M:%S') +
          " - " + str(done_embedding) +
          " - " + str(done_embedding_idx + 1) + " of " + str(len(done_embeddings)) +
          " - PK: " + str(round(pk_metric_meeting,5)) +
          " - WD: " + str(round(wd_metric_meeting,5)) +
          " - T: " + str(len(T)) +
          " T_hat: " + str(np.sum(Y_hat)))

  # Save to results_*.csv File (one appended row per transcript)
  try:
    csv_data = [datetime.today().strftime('%Y-%m-%d %H:%M:%S'), done_embedding, pk_metric_meeting, wd_metric_meeting]
    with open(metric_results_filename_path, 'a+', encoding='UTF8', newline='') as f:
      writer = csv.writer(f)
      writer.writerow(csv_data)
  except Exception as error:
    print(datetime.today().strftime('%Y-%m-%d %H:%M:%S') +
          " - " + str(done_embedding) +
          " - Error writing to file: " + str(metric_results_filename_path) +
          "\n" +
          " - Error: " + str(error))

  # Save Y_hat_list, T_hat_list and sims_list to file. The complete lists are
  # rewritten after every transcript; a failed write is reported, not fatal.
  for npy_path, npy_data in ((Y_hat_list_filename_path, Y_hat_list),
                             (T_hat_list_filename_path, T_hat_list),
                             (sims_list_filename_path, sims_list)):
    try:
      np.save(npy_path, np.asarray(npy_data, dtype=object))
    except Exception as error:
      print(datetime.today().strftime('%Y-%m-%d %H:%M:%S') +
            " - " + str(done_embedding) +
            " - Error writing to file: " + str(npy_path) +
            "\n" +
            " - Error: " + str(error))

  # Drop the reference to the loaded embeddings before the next transcript
  R = None

  # NOTE(review): PK_metrics / WD_metrics are not extended here, although the
  # "average" aggregation later indexes them by position in done_metrics -
  # verify that mode is only run after the results file has been re-read.
  done_metrics.append(done_embedding)

# Dataset-level Pk / Wd.
#   "append"  - concatenate all transcripts and score them as one sequence
#   "smooth"  - same, after clean_adj_topic() is applied to the predictions
#   "average" - mean of the per-transcript metrics
if metric_calculation == "append" or metric_calculation == "smooth":
  # Calculate Y_Total and Y_hat_total
  Y_total = []
  Y_hat_total = []
  try:
    for Y_list_cropped_idx, Y in enumerate(Y_list_cropped):
      if len(Y) != len(Y_hat_list[Y_list_cropped_idx]):
        print(datetime.today().strftime('%Y-%m-%d %H:%M:%S') +
          " - " + str(done_embedding) +
          " - Error Calculating Y_hat, Y sizes are different" +
          "\n")
        # SystemExit is not an Exception subclass, so it is not caught below
        raise SystemExit
      # Lengths match, so extending both keeps labels and predictions aligned
      Y_total.extend(Y)
      Y_hat_total.extend(Y_hat_list[Y_list_cropped_idx])
  except Exception as error:
    print(datetime.today().strftime('%Y-%m-%d %H:%M:%S') +
          " - " + str(done_embedding) +
          " - Error Calculating Y_total, Y_hat_total" +
          "\n" +
          " - Error: " + str(error))

  # Remove adjacent topic changes
  if metric_calculation == "smooth":
    Y_hat_total = clean_adj_topic(Y_hat_total)

  # Evaluate Pk Total (stays 0 if the evaluation fails)
  pk_metric = 0
  try:
    pk_metric = evaluate_pk(Y_hat_total, Y_total, int(average_sentences(Y_total)/2))
  except Exception as error:
    print(datetime.today().strftime('%Y-%m-%d %H:%M:%S') +
          " - " + str(done_embedding) +
          " - Error Calculating PK" +
          "\n" +
          " - Error: " + str(error))

  # Evaluate Wd Total (stays 0 if the evaluation fails)
  wd_metric = 0
  try:
    wd_metric = evaluate_wd(Y_hat_total, Y_total, int(average_sentences(Y_total)/2))
  except Exception as error:
    print(datetime.today().strftime('%Y-%m-%d %H:%M:%S') +
          " - " + str(done_embedding) +
          " - Error Calculating Wd" +
          "\n" +
          " - Error: " + str(error))

elif metric_calculation == "average":
  # No concatenated sequences in this mode; kept empty for the summary print
  Y_total = []
  Y_hat_total = []
  PK_metrics_cropped = []
  WD_metrics_cropped = []
  for transcript in transcripts_list_cropped:
    metrics_cropped_idx = done_metrics.index(transcript)
    # float(): presumably the stored metrics can be strings read back from the
    # results CSV (the per-transcript debug print converts them the same way)
    PK_metrics_cropped.append(float(PK_metrics[metrics_cropped_idx]))
    WD_metrics_cropped.append(float(WD_metrics[metrics_cropped_idx]))
  pk_metric = np.mean(PK_metrics_cropped)
  wd_metric = np.mean(WD_metrics_cropped)

# Evaluate total topic count: predicted minus reference number of topic
# entries, one value per processed transcript.
topic_quantity_difference = []
for transcript_cropped in transcripts_list_cropped:
  transcript_idx = transcripts_list.index(transcript_cropped)
  done_metrics_idx = done_metrics.index(transcript_cropped)
  topic_quantity_difference.append(len(T_hat_list[done_metrics_idx]) - len(T_list[transcript_idx]))

# Calculate MAE
# mae_std pairs the two lists by position, so they must have the same length.
# The message reports both sizes; it no longer depends on loop leftovers, which
# are unbound when no transcript was processed.
if len(Y_list_cropped) != len(Y_hat_list):
  print(datetime.today().strftime('%Y-%m-%d %H:%M:%S') +
    " - Error Y_hat_list size vs Y_list size" +
    " - Y_list_cropped size: " + str(len(Y_list_cropped)) +
    " - Y_hat_list size: " + str(len(Y_hat_list)))
  raise SystemExit
mae = mae_std(Y_list_cropped, Y_hat_list)



# Print Results
# 1) Sizes of every list built above
print("\n")
print("Size of sims_list:                 ", len(sims_list), sep="")
print("Size of S_list_cropped:            ", len(S_list_cropped), sep="")
print("Numb of transcripts S_list:        ", len(S_list), sep="")
print("Size of T_list_cropped:            ", len(T_list_cropped), sep="")
print("Numb of Topics T_list:             ", len(T_list), sep="")
print("Size of Y_list_cropped:            ", len(Y_list_cropped), sep="")
print("Numb of Outputs Y_list:            ", len(Y_list), sep="")
print("Size of transcripts_list_cropped:  ", len(transcripts_list_cropped), sep="")
print("Size of T_hat_list:                ", len(T_hat_list), sep="")
print("Size of Y_hat_list:                ", len(Y_hat_list), sep="")
print("Size of Y_total:                   ", len(Y_total), sep="")
print("Size of Y_hat_total:               ", len(Y_hat_total), sep="")
print("\n")

# 2) One sample transcript: position 3 of the cropped lists, mapped back to
#    its position in the full dataset lists
index_test_cropped = 3
index_test = transcripts_list.index(transcripts_list_cropped[index_test_cropped])
print("Transcripts name:                  ", transcripts_list[index_test], sep="")
print("Transcripts_cropped name:          ", transcripts_list_cropped[index_test_cropped], sep="")
print("Numb of S_list in Test             ", len(S_list[index_test]), sep="")
print("Numb of S_list_cropped in Test     ", len(S_list_cropped[index_test_cropped]), sep="")
print("Numb of T_list in Test             ", len(T_list[index_test]), sep="")
print("Numb of T_list_cropped in Test     ", len(T_list_cropped[index_test_cropped]), sep="")
print("Numb of Y_list in Test             ", len(Y_list[index_test]), sep="")
print("Numb of Y_list_cropped in Test     ", len(Y_list_cropped[index_test_cropped]), sep="")
print("Numb of Y_hat_list:                ", len(Y_hat_list[index_test_cropped]), sep="")
print("Numb of T_list:                    ", len(T_list[index_test]), sep="")
print("Numb of T_list_np:                 ", np.sum(Y_list[index_test]), sep="")
print("Numb of T_hat_list:                ", len(T_hat_list[index_test_cropped]), sep="")
print("Numb of T_hat_list_np:             ", np.sum(Y_hat_list[index_test_cropped]), sep="")
print("\n")

print("T_list:                            ", T_list[index_test], sep="")
print("T_hat_list:                        ", T_hat_list[index_test_cropped], sep="")

print("\n")
# Reference labels are cast to int so they print like the predictions
Y_int = [int(Y_to_int) for Y_to_int in Y_list[index_test]]
print("Y_list:                            ", Y_int, sep="")
print("Y_hat_list:                        ", Y_hat_list[index_test_cropped], sep="")

print("\n")
print("sims_list:                         ", sims_list[index_test_cropped], sep="")

# Threshold statistics of the sample transcript (mean - Z * std)
miu = np.mean(sims_list[index_test_cropped])
sigma = np.std(sims_list[index_test_cropped])
treshold = miu - (Z * sigma)

# Similarity values at the predicted / reference topic borders are not
# collected at the moment, so both lists are printed empty.
sims_t_hat = []
sims_t = []

print("Miu:                               ", round(miu, 5), sep="")
print("sigma:                             ", round(sigma, 5), sep="")
print("treshold:                          ", round(treshold, 5), sep="")
print("sims_t:                            ", sims_t, sep="")
print("sims_t_hat:                        ", sims_t_hat, sep="")
print("topic_quantity_difference:         ", topic_quantity_difference, sep="")


# 3) Dataset-level metrics
print("\n")
print("PK:                                ", round(pk_metric, 4), sep="")
print("Wd:                                ", round(wd_metric, 4), sep="")
print("Mean Topic Q:                      ", round(np.mean(topic_quantity_difference), 4), sep="")
print("Std Topic Q:                       ", round(np.std(topic_quantity_difference), 4), sep="")
print("MAE:                               ", round(np.mean(mae), 4), sep="")

2022-12-04 01:57:57 - all-MiniLM-L6-v2-meanpooling-WS1-Z1.0-all - metrics_results_path folder not found. New folder created
2022-12-04 01:57:57 - Reading embeddings in path: /content/drive/MyDrive/W266/project/nlp_podcast_segmentation/data/YouTube/embeddings/yt_scripts_segments_split_n5_111422/all-MiniLM-L6-v2-meanpooling/
2022-12-04 01:57:57 - Total embeddings: 3757
2022-12-04 01:57:57 - Reading metrics in path: /content/drive/MyDrive/W266/project/nlp_podcast_segmentation/data/YouTube/metrics_results/yt_scripts_segments_split_n5_111422/all-MiniLM-L6-v2-meanpooling-WS1-Z1.0-all/results_all-MiniLM-L6-v2-meanpooling.csv
2022-12-04 01:57:57 - Error reading embeddings path: /content/drive/MyDrive/W266/project/nlp_podcast_segmentation/data/YouTube/metrics_results/yt_scripts_segments_split_n5_111422/all-MiniLM-L6-v2-meanpooling-WS1-Z1.0-all/results_all-MiniLM-L6-v2-meanpooling.csv
 - Error: [Errno 2] No such file or directory: '/content/drive/MyDrive/W266/project/nlp_podcast_segmentation/dat