In [None]:
# Mount Google Drive so the project files under /content/drive are readable.
# `google.colab` is only importable inside a Colab runtime, so this cell is
# Colab-specific; the data paths configured below all live under this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import torch
import os
from tqdm import tqdm
import ast
import json
import random
from tensorflow.keras.utils import pad_sequences

# --- Project locations on the mounted Drive --------------------------------
path = '/content/drive/MyDrive/nlp_podcast_segmentation/'
yt_path = f'{path}data/YouTube/'

# --- Which data split and which sentence-embedding variant to use ----------
splits = 'yt_scripts_segments_split_n5_111422'
# Alternative embedding variant: 'stsb-mpnet-base-v2-meanpooling'
pooling = 'all-MiniLM-L6-v2-meanpooling'

# Directory holding one embedding file per episode for the chosen
# split/pooling pair. `yt_path` already ends with '/', so the result contains
# a doubled '//'; it is kept as-is so the string is unchanged.
embed_path = yt_path + '/embeddings/' + splits + '/' + pooling + '/'

## Split Episodes

In [None]:
# Load the episode table, attach each episode's embedding file, drop episodes
# with too few segments, then make a reproducible random train/test split.

# Tunables for this cell.
MIN_SEGMENTS = 4      # keep episodes with strictly more transitions than this
TRAIN_FRACTION = 0.5  # expected share of episodes assigned to `train`
SPLIT_SEED = 42       # fixed seed so the split is identical on every run

# NOTE(review): the file name ends in .csv but it is read with read_pickle --
# presumably it is a pickled DataFrame with a misleading extension; confirm.
# Unpickling executes arbitrary code, so only load files you trust.
df = pd.read_pickle(yt_path+'yt_scripts_segments_split_n5_111422_slimcols.csv')

# List the directory once so the id column and the file-name column are built
# from the same snapshot (two separate listdir calls are not guaranteed to
# agree if the directory changes in between).
emb_files = os.listdir(embed_path)
emb_df = pd.DataFrame({
    # Drops the last 3 characters of the file name to recover the video id
    # (presumably a '.pt' extension -- confirm against the embedding files).
    'Video_Id': [name[:-3] for name in emb_files],
    'emb_file': emb_files,
})
# Inner join: keeps only episodes that have an embedding file, and discards
# any other files in embed_path whose trimmed name is not a Video_Id.
vids = pd.merge(df, emb_df, on='Video_Id', how='inner')

# Get rid of episodes with only a few segments. The sum of the transition
# labels is used as the segment count (assumes labels are 0/1 -- TODO confirm).
vids['num_segments'] = vids['Transition_Labels'].apply(sum)
vids = vids.loc[vids['num_segments'] > MIN_SEGMENTS].copy()

## Split into train and test sets
# A dedicated, seeded generator makes the mask (and therefore the saved
# split_mask.npy) reproducible across kernel restarts; the previous unseeded
# np.random.rand call produced a different split on every run.
split_rng = np.random.default_rng(SPLIT_SEED)
mask = split_rng.random(len(vids)) < TRAIN_FRACTION
np.save(f"{embed_path}split_mask.npy", mask)
print("Generating and saving split mask")

train = vids[mask]
test = vids[~mask]

# Every episode must land in exactly one of the two sets.
assert len(train) + len(test) == len(vids)

Generating and saving split mask


## Create Segment-Level DF

In [None]:
# Build one row per segment of every training episode and stream the rows to
# seg_df.csv in chunks, so the full segment table never has to sit in RAM.
SEG_COLUMNS = ['vid_id', 'seg', 'seg_embs', 'seg_labels']
SEG_CSV = embed_path+"seg_df.csv"
SEG_CHUNK_SIZE = 5000  # number of segments buffered between disk flushes


def iter_segments(labels, embs):
  """Yield ``(seg_idx, seg_embs, seg_labels)`` for each segment of an episode.

  A segment starts at every position where ``labels`` equals 1 and extends up
  to (but not including) the next such position, or to the end of ``labels``
  for the last segment. Positions before the first 1 belong to no segment and
  are not yielded; an episode with no 1 yields nothing.

  Args:
    labels: list of per-position transition labels.
    embs: list sliced with the same bounds as ``labels`` (assumed to be
      aligned one-to-one with ``labels`` -- TODO confirm).

  Yields:
    Tuples of (0-based segment index, slice of ``embs``, slice of ``labels``).
  """
  starts = np.flatnonzero(np.asarray(labels) == 1)
  # Append the episode length so consecutive pairs bound every segment.
  bounds = np.append(starts, len(labels))
  for seg_idx, (start, end) in enumerate(zip(bounds[:-1], bounds[1:])):
    yield seg_idx, embs[start:end], labels[start:end]


# Init the CSV on disk with only the header row (overwrites any previous file).
pd.DataFrame(columns=SEG_COLUMNS).to_csv(SEG_CSV, index=False)

# Rows are buffered in a plain list and turned into a DataFrame once per
# chunk; growing a DataFrame one row at a time with .loc re-allocates on every
# insert and is far slower.
pending_rows = []
num_segs = 0
for _, row in tqdm(train.iterrows(), total=train.shape[0]):
  # TODO (from the original author): need smaller embeddings.
  embs = torch.load(embed_path + row['emb_file']).numpy().tolist()
  labels = row['Transition_Labels'].tolist()

  for seg_idx, seg_e, seg_l in iter_segments(labels, embs):
    pending_rows.append({
        'vid_id': row['Video_Id'],
        'seg': seg_idx,
        'seg_embs': seg_e,
        'seg_labels': seg_l,
    })
    num_segs += 1

    # Append to the CSV on disk to save RAM, then restart the in-memory buffer.
    if num_segs % SEG_CHUNK_SIZE == 0:
      print("\nAppending chunk to df on disk")
      pd.DataFrame(pending_rows, columns=SEG_COLUMNS).to_csv(
          SEG_CSV, mode='a', header=False, index=False)
      pending_rows = []

# Save last set of segments. `seg_df` keeps holding this final partial chunk,
# as it did before, in case later cells refer to it.
seg_df = pd.DataFrame(pending_rows, columns=SEG_COLUMNS)
seg_df.to_csv(SEG_CSV, mode='a', header=False, index=False)
print(f"Num segments: {num_segs}")

 28%|██▊       | 453/1596 [04:26<11:58,  1.59it/s]


Appending chunk to df on disk


 56%|█████▌    | 887/1596 [09:40<06:39,  1.77it/s]


Appending chunk to df on disk


 72%|███████▏  | 1154/1596 [13:33<06:14,  1.18it/s]


Appending chunk to df on disk


 81%|████████  | 1294/1596 [16:16<04:15,  1.18it/s]


Appending chunk to df on disk


 91%|█████████ | 1453/1596 [19:02<01:20,  1.77it/s]


Appending chunk to df on disk


100%|██████████| 1596/1596 [21:13<00:00,  1.25it/s]


Num segments: 27855
