In [1]:
# Mount Google Drive into the Colab runtime. All project paths configured
# further down are rooted under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
# Authenticate this Colab session to access the cloud bucket
# (the final cell copies the generated files to gs://podcast_episodes/).
from google.colab import auth
auth.authenticate_user()

In [3]:
import pandas as pd
import numpy as np
import torch
import os
from tqdm import tqdm
import ast
import json
import random
from tensorflow.keras.utils import pad_sequences

# Project root on the mounted Drive, and the YouTube data folder beneath it.
path = '/content/drive/MyDrive/nlp_podcast_segmentation/'
yt_path = path + 'data/YouTube/'
# Name of the segment-split dataset; used as a sub-folder of the embeddings directory.
splits = 'yt_scripts_segments_split_n5_111422'
# Embedding variant to read. Options noted by the author:
#   'all-MiniLM-L6-v2-meanpooling' or 'stsb-mpnet-base-v2-meanpooling'
pooling = 'all-MiniLM-L6-v2-meanpooling'
# Directory that holds the per-video .pt embeddings, split_mask.npy, seg_df.csv
# and the train_test/ output folder used by the cells below.
embed_path = yt_path + f'embeddings/{splits}/{pooling}/'

## Set params

In [4]:
# Set params
continue_run = True # True: resume from the partition/label dicts saved by a previous run
max_sequence = 350 # no. sentences per episode (real episodes are padded/truncated to this; synthetic ones are built up to it) <---- maybe match up with z-stat in test set?
synth_eps = 50000 # no. synthetic episodes to generate
seg_batch = 100 # no. segments loaded per batch; each batch is reshuffled to build num_parallel synthetic episodes (max for 100 sentences is ~13 segments)
num_segs = 27855 # number of records in file (from seg_df script)
save_every = 1000 # save dicts out in increments
num_parallel = 1000 # how many episodes to build from the seg_batch

## Load Splits (from gen_seg_df.py)

In [5]:
if not continue_run:
  print("Load splits")
  # Load episode df, then merge embedding file paths.
  # NOTE(review): the file has a .csv extension but is read with read_pickle;
  # presumably it is a pickled DataFrame saved under a misleading name - confirm.
  df = pd.read_pickle(yt_path+'yt_scripts_segments_split_n5_111422_slimcols.csv')

  # List the embedding directory exactly once so both columns come from the same
  # snapshot, and keep only the '.pt' files (the directory also contains
  # split_mask.npy, seg_df.csv and train_test/, which are not embeddings).
  emb_files = [f for f in os.listdir(embed_path) if f.endswith('.pt')]
  emb_df = pd.DataFrame({'Video_Id': [f[:-len('.pt')] for f in emb_files],
                         'emb_file': emb_files})
  vids = pd.merge(df, emb_df, on='Video_Id', how='inner')

  # Get rid of episodes with only a few segments
  vids['num_segments'] = vids['Transition_Labels'].apply(lambda x: sum(x))
  vids = vids.loc[vids['num_segments'] > 4].copy()

  ## Split into train and test sets
  # The saved boolean mask is positional, so it must line up row-for-row with vids.
  mask = np.load(f"{embed_path}split_mask.npy")
  if len(mask) != len(vids):
    raise ValueError(
        f"split_mask.npy has {len(mask)} entries but {len(vids)} episodes were loaded; "
        "the mask no longer matches the filtered episode table")
  train = vids[mask]
  test = vids[~mask]

## Save Initial Train, Test Set and Partition Dicts

In [6]:
def pt_to_npy(ids, max_sequence):
  """Convert saved .pt embeddings into fixed-length .npy files.

  For every id, loads `<embed_path><id>.pt`, truncates it to `max_sequence`
  rows or zero-pads it up to `max_sequence` rows, and writes the result to
  `<embed_path>train_test/<id>.npy` as a float64 array.

  Args:
    ids: iterable of id strings (file names without the '.pt' suffix).
    max_sequence: number of rows every saved array must have.

  Notes:
    Assumes each .pt file holds a CPU tensor whose first axis is the sentence
    axis (presumably shape (num_sentences, emb_dim)) - TODO confirm.
    An embedding with zero rows is saved as an all-zero array instead of
    raising an IndexError.
  """
  for vid_id in tqdm(ids):
    # float64 matches what the previous list-based implementation wrote to disk
    emb = np.asarray(torch.load(embed_path + vid_id + '.pt').numpy(), dtype=np.float64)
    # Pre-allocate the padded result, then copy in as many rows as fit
    fixed = np.zeros((max_sequence,) + emb.shape[1:], dtype=np.float64)
    n_keep = min(len(emb), max_sequence)
    fixed[:n_keep] = emb[:n_keep]
    np.save(f"{embed_path}train_test/{vid_id}.npy", fixed) # save to npy


def save_dicts(train_ids, test_ids, all_ids, all_labs):
  """Write the partition and label dictionaries to the train_test folder.

  partitions.json maps 'train' / 'test' to their id lists; labels.json maps
  each id in `all_ids` to the entry of `all_labs` at the same position.
  """
  split_ids = dict(train=train_ids, test=test_ids)
  id_to_labels = {}
  for key, lab in zip(all_ids, all_labs):
    id_to_labels[key] = lab

  with open(f"{embed_path}train_test/partitions.json", 'w') as part_file:
    json.dump(split_ids, part_file)
  with open(f"{embed_path}train_test/labels.json", 'w') as lab_file:
    json.dump(id_to_labels, lab_file)


def load_dicts():
  """Read back the dictionaries written by save_dicts.

  Returns:
    (train_ids, test_ids, all_ids, all_labs) where all_ids / all_labs are the
    keys and values of labels.json, in file order.
  """
  with open(f"{embed_path}train_test/partitions.json", 'r') as part_file:
    split_ids = json.load(part_file)
  with open(f"{embed_path}train_test/labels.json", 'r') as lab_file:
    id_to_labels = json.load(lab_file)

  train_split = split_ids['train']
  test_split = split_ids['test']

  # Unzip the label mapping into two parallel lists
  ids, labs = [], []
  for key, value in id_to_labels.items():
    ids.append(key)
    labs.append(value)

  return train_split, test_split, ids, labs

In [7]:
# Make sure the train_test/ output folder is present before anything is saved to it
if os.path.exists(f"{embed_path}train_test/"):
  print("Directory exists")
else:
  os.mkdir(f"{embed_path}train_test/")
  print("Creating new directory")

Directory exists


In [8]:
if continue_run: # resuming where we left off
  print("Loading dictionaries from previous run")
  # Reload partitions, labels dicts
  train_ids, test_ids, all_ids, all_labs = load_dicts()

  # Get num synth episodes remaining. Synthetic ids are written as "synth_<n>",
  # so match on that prefix rather than on any occurrence of 'synth'.
  synth_gen = sum(1 for x in train_ids if x.startswith('synth_'))
  # NOTE: this rebinds synth_eps from "target" to "remaining", so re-running this
  # cell without re-running the params cell subtracts synth_gen a second time.
  synth_eps = synth_eps - synth_gen
  print(f"Synth eps generated: {synth_gen}, num remaining: {synth_eps}")

else:
  print("Beginning new run, saving ids, embeddings, labels for initial splits")
  # Save test and train embeddings to fixed-length numpy files
  pt_to_npy(test['Video_Id'].values, max_sequence)
  pt_to_npy(train['Video_Id'].values, max_sequence)
  synth_gen = 0
  print(f"Synth eps generated: {synth_gen}, num remaining: {synth_eps}")

  ## Init partition dfs
  # Track Ids (all_ids is train followed by test, matching the order of all_labs)
  train_ids = list(train['Video_Id'].values)
  test_ids = test['Video_Id'].tolist()
  all_ids = list(train['Video_Id'].values) + list(test['Video_Id'].values)

  # Track labels: pad/truncate every label sequence to max_sequence at the end
  train_labs = pad_sequences(train['Transition_Labels'].values, maxlen=max_sequence,
                        dtype='uint8', padding="post", truncating="post").tolist()
  test_labs = pad_sequences(test['Transition_Labels'].values, maxlen=max_sequence,
                        dtype='uint8', padding="post", truncating="post").tolist()
  all_labs = train_labs + test_labs

  # Save initial partition, label dicts
  save_dicts(train_ids, test_ids, all_ids, all_labs)

{'Loading dictionaries from previous run'}


OSError: ignored

In [None]:
# Train ids whose .npy file is present in train_test/.
# List the directory once into a set: calling os.listdir inside the comprehension
# re-reads the whole folder for every id.
saved_files = set(os.listdir(f"{embed_path}train_test"))
remaining = [x for x in train_ids if f"{x}.npy" in saved_files]
print(len(remaining))

## Create Synthetic Train Records

In [None]:
# Loop to create synthetic episodes.
# Each outer iteration loads a random batch of seg_batch segments from seg_df.csv;
# the batch is then reshuffled up to num_parallel times, and each shuffle is
# concatenated segment-by-segment until max_sequence sentences are reached.
# NOTE: re-running this cell without first re-running the resume cell reuses the
# same synth_gen offset, so ids would be appended to the dicts a second time.
c_list = [] # track num segs used for each synth episode
synth_id = None # last synthetic id written; stays None when nothing is generated
n_todo = max(synth_eps, 0)
# Ceil division so exactly n_todo episodes are produced, even when n_todo is not
# a multiple of num_parallel (round() could over- or under-shoot).
n_chunks = (n_todo + num_parallel - 1) // num_parallel
for i in tqdm(range(n_chunks)):
  # load and shuffle segments
  # File line 0 is the header and lines 1..num_segs are the records, so every
  # record (including the last) must be a candidate for skipping.
  skip = sorted(random.sample(range(1, num_segs + 1), num_segs-seg_batch))
  batch = pd.read_csv(embed_path + "seg_df.csv", header=0, skiprows=skip) # load in a random subset of segments
  # The list-valued columns are stored as their string repr in the CSV
  batch['seg_labels'] = batch['seg_labels'].apply(ast.literal_eval)
  batch['seg_embs'] = batch['seg_embs'].apply(ast.literal_eval)

  # use the loaded segments multiple times by shuffling and reconcatenating
  # to save runtime; the last chunk may be smaller than num_parallel
  chunk_size = min(num_parallel, n_todo - num_parallel*i)
  for z in range(chunk_size):
    synth_id = synth_gen+(num_parallel*i)+z
    shuffled = batch.sample(frac=1).reset_index(drop=True) # shuffle once more

    # loop through segments, combining them until max length reached
    ne_emb = []
    ne_lab = []
    c = 0
    while len(ne_lab) < max_sequence:
      if c >= len(shuffled):
        raise ValueError(
            f"Batch of {len(shuffled)} segments has only {len(ne_lab)} sentences, "
            f"fewer than max_sequence={max_sequence}; increase seg_batch")
      # get one row sents, labels, append to new ep
      ne_lab.extend(shuffled.loc[c, 'seg_labels'])
      ne_emb.extend(shuffled.loc[c, 'seg_embs'])
      c += 1
    c_list.append(c)

    # Save to partition dict (the last segment may overshoot, so truncate)
    np.save(f"{embed_path}train_test/synth_{synth_id}.npy", ne_emb[:max_sequence])
    train_ids.append(f"synth_{synth_id}")
    all_ids.append(f"synth_{synth_id}")
    all_labs.append([int(x) for x in ne_lab[:max_sequence]])

    # Save partition dicts periodically
    if ((synth_id+1)%save_every==0):
      save_dicts(train_ids, test_ids, all_ids, all_labs)
      #print(f"\nSaving latest batch of dicts out: {synth_id}")

save_dicts(train_ids, test_ids, all_ids, all_labs)
if c_list:
  print(f"\nSaving final batch of dicts out: {synth_id}")
  print(f"\nMax segs needed: {np.max(c_list)}")
else:
  print("\nNo synthetic episodes were generated in this run")

In [None]:
# push to cloud storage
# The source folder is interpolated from embed_path so the upload always matches the
# splits/pooling configured at the top of the notebook instead of a hard-coded copy.
!gsutil -m cp -r -n {embed_path}train_test/ gs://podcast_episodes/

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Skipping existing item: gs://podcast_episodes/train_test/synth_45000.npy
Skipping existing item: gs://podcast_episodes/train_test/synth_44997.npy
Skipping existing item: gs://podcast_episodes/train_test/synth_45003.npy
Skipping existing item: gs://podcast_episodes/train_test/synth_45006.npy
Skipping existing item: gs://podcast_episodes/train_test/synth_45007.npy
Skipping existing item: gs://podcast_episodes/train_test/synth_45004.npy
Skipping existing item: gs://podcast_episodes/train_test/synth_45002.npy
Skipping existing item: gs://podcast_episodes/train_test/synth_45005.npy
Skipping existing item: gs://podcast_episodes/train_test/synth_45009.npy
Skipping existing item: gs://podcast_episodes/train_test/synth_45008.npy
Skipping existing item: gs://podcast_episodes/train_test/synth_45010.npy
Skipping existing item: gs://podcast_episodes/train_test/synth_45011.npy
Skipping existing item: gs://podcast_episodes/train_test/sy