## Handle Files

In [1]:
# Mount Drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# # Authenticate to access cloud bucket
# from google.colab import auth
# auth.authenticate_user()

# #create a LOCAL directory in /content/  so you can move stuff from bucket to local
# !mkdir /content/nlp_podcast_episodes
# # # copy from google bucket to local directory
# !gsutil -m -q cp -r gs://podcast_episodes/ /content/nlp_podcast_episodes

## Imports and Paths

In [3]:
import pandas as pd
import numpy as np
from tensorflow import convert_to_tensor
from tensorflow.keras import layers
from tensorflow.keras.metrics import Accuracy, Recall
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Bidirectional
import tensorflow as tf
import seaborn as sns
import matplotlib.pyplot as plt
import os
from tqdm import tqdm
import json
import sys

path = '/content/drive/MyDrive/nlp_podcast_segmentation/'
yt_path = path + 'data/YouTube/'
splits = 'yt_scripts_segments_split_n5_111422'
pooling = 'all-MiniLM-L6-v2-meanpooling' # all-MiniLM-L6-v2-meanpooling or #stsb-mpnet-base-v2-meanpooling
embed_path = yt_path + f'embeddings/{splits}/{pooling}/'
dev_dir =  path + 'scripts/john/supervised/'
train_test_path = '/content/nlp_podcast_episodes/podcast_episodes/'

sys.path.append(dev_dir)
from data_loader import DataGenerator

In [4]:
# Set params
max_sequence = 100
embed_dim = 384
batch_size = 1024

## Data loaders

In [5]:
# Parameters
params = {'input_dim': (max_sequence, embed_dim),
          'output_dim': max_sequence,
          'batch_size': batch_size,
          'shuffle': True}

# Datasets
with open(f"{embed_path}partitions.json", 'r') as f:
  partitions = json.load(f)
with open(f"{embed_path}labels.json", 'r') as f:
  labels = json.load(f)

# Generators
training_generator = DataGenerator(partitions['train'], labels, **params)
test_generator = DataGenerator(partitions['test'], labels, **params)

In [6]:
# Prepare test inputs, labels
test_size = len(partitions['test'])
X_test = np.empty((test_size, max_sequence, embed_dim))
y_test = np.empty((test_size, max_sequence), dtype=int)

for i, ID in enumerate(partitions['test']):
  X_test[i] = np.load(f"{train_test_path}{ID}.npy")
  y_test[i] = np.array(labels[ID], dtype='uint8')

In [7]:
# Get pos and neg labels
pos = 0
neg = 0

# Loop through batches to get counts
for ID in partitions['train']:
  pos += np.sum(labels[ID])
  neg += len(labels[ID]) - np.sum(labels[ID])

# Get total
total = pos + neg
print(f"Pos rate: {pos/total}, Neg rate: {neg/total}, Total: {total}")

# Scaling by total/2 helps keep the loss to a similar magnitude.
# The sum of the weights of all examples stays the same.
initial_bias = np.log([pos/neg])
weight_for_0 = (1 / neg) * (total / 2.0)
weight_for_1 = (1 / pos) * (total / 2.0)

class_weight = {0: weight_for_0, 1: weight_for_1}
print('Weight for class 0: {:.2f}'.format(weight_for_0))
print('Weight for class 1: {:.2f}'.format(weight_for_1))

Pos rate: 0.048399046289834845, Neg rate: 0.9516009537101652, Total: 5158800
Weight for class 0: 0.53
Weight for class 1: 10.33


## Baseline Model (Simple RNN)

In [8]:
def create_rnn_model(max_sequence_length=1000,
                     embed_dim = 384,
                     hidden_dim=64,
                     dropout=0.10,
                     learning_rate=0.001,
                     output_bias=np.log(0.05/0.95),
                     loss=tf.keras.losses.BinaryCrossentropy(from_logits=True)
                     ):

  ## Build model
  output_bias = tf.keras.initializers.Constant(output_bias)
  input_layer = tf.keras.layers.Input(
      shape=(max_sequence_length, embed_dim), dtype='float32')
  lstm1 = LSTM(
      hidden_dim, return_sequences=True, activation='relu', name='rnn_layer_1'
      )(input_layer)
  lstm2 = LSTM(
      hidden_dim, return_sequences=True, activation='relu', name='rnn_layer_2'
      )(lstm1)
  #drop1 = tf.keras.layers.Dropout(dropout)(lstm2)
  dense1 = tf.keras.layers.Dense(128, activation='relu', name='dense_1')(lstm2)
  dense2 = tf.keras.layers.Dense(32, activation='relu', name='dense_2')(dense1)                  
  output = tf.keras.layers.Dense(1, activation='sigmoid',
                                 bias_initializer=output_bias,
                                 name='segment_classifier')(dense2)
  
  rnn_model = tf.keras.models.Model(inputs=input_layer, outputs=[output])
  rnn_model.compile(loss=loss,
                    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate),
                    metrics=[tf.keras.metrics.Recall(), 
                             tf.keras.metrics.Accuracy()]
                    )

  ### END YOUR CODE
  return rnn_model

In [9]:
# Eventual loss function strucutre
def my_loss(weight):
  def weighted_cross_entropy_with_logits(labels, logits):
    labels = tf.cast(labels, 'float32')
    #logits = tf.where(tf.less(logits, 0), 0., logits) # prevents odd error of pred < 0
    loss = tf.nn.weighted_cross_entropy_with_logits(
        labels, logits, weight
    )
    return loss
  return weighted_cross_entropy_with_logits

## LR Scheduler
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.5,
    patience=4,
    verbose=1,
    mode="auto",
    min_delta=0.0001)

In [10]:
rnn_model = create_rnn_model(max_sequence_length=max_sequence,
                              embed_dim=embed_dim,
                              hidden_dim=128,
                              dropout=0.05,
                              learning_rate=0.004,
                              output_bias=initial_bias,
                              loss=my_loss(weight=weight_for_1*4)
                              )
rnn_model.summary()



Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 100, 384)]        0         
                                                                 
 rnn_layer_1 (LSTM)          (None, 100, 128)          262656    
                                                                 
 rnn_layer_2 (LSTM)          (None, 100, 128)          131584    
                                                                 
 dense_1 (Dense)             (None, 100, 128)          16512     
                                                                 
 dense_2 (Dense)             (None, 100, 32)           4128      
                                                                 
 segment_classifier (Dense)  (None, 100, 1)            33        
                                                                 
Total params: 414,913
Trainable params: 414,913
Non-trainable

## Train

In [11]:
rnn_history = rnn_model.fit(training_generator,
                            validation_data=test_generator,
                            epochs=200,
                            callbacks=[reduce_lr],
                            )

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 6: ReduceLROnPlateau reducing learning rate to 0.0020000000949949026.
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 10: ReduceLROnPlateau reducing learning rate to 0.0010000000474974513.
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 14: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 18: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 22: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 27: ReduceLROnPlateau reducing learning rate to 6.25000029685907e-05.
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 31: ReduceLROnPlateau reducing learning rate to 3.125000148429535e-05.
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 35: ReduceL

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py", line 3326, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-11-868cf0ace880>", line 4, in <module>
    callbacks=[reduce_lr],
  File "/usr/local/lib/python3.7/dist-packages/keras/utils/traceback_utils.py", line 64, in error_handler
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 1409, in fit
    tmp_logs = self.train_function(iterator)
  File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/util/traceback_utils.py", line 150, in error_handler
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/eager/def_function.py", line 915, in __call__
    result = self._call(*args, **kwds)
  File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/eager/def_function.py", line 947, in _call
    return self._stateless_fn(*args,

KeyboardInterrupt: ignored

## Evaluate

In [None]:
def average_sentences(topic_list):
  '''
  Counts the number of sentences between topics
  input: topic labels
  returns: average number of sentences between topics
  '''

  idx = 0
  N = len(topic_list)
  sentence_counts = []
  count = 0

  while(idx < N):
    if(topic_list[idx] == 1):
      sentence_counts.append(count)
      count = 0

    count += 1
    idx += 1

  
  if len(sentence_counts) == 0:
    return 0
    
  return sum(sentence_counts) / len(sentence_counts)

# Evaluates PK
def evaluate_pk(pred, act, k=5):
  idx = k
  miss_count = 0
  measurement = 0

  while (idx+k) < len(act):
    topic_change_pred = False
    topic_change_act = False

    #Checking if there is a topic change - not including the first index
    if sum(pred[idx-(k-1):idx+k]) >= 1:
      topic_change_pred = True
    if sum(act[idx-(k-1):idx+k]) >= 1:
      topic_change_act = True

    if topic_change_pred != topic_change_act:
      miss_count += 1.0

    measurement += 1.0
    idx += 1

  # print(miss_count)
  # print(measurement)
  pk = miss_count/measurement

  return pk

# evaluate_pk
# Evaluates WD
def evaluate_wd(pred, act, k=5):
  idx = k
  N = len(act)
  count = 0

  while (idx+k) < N:
    # print(pred[idx-(k-1):idx+k])
    sum_pred = sum(pred[idx-(k):idx+k])
    sum_act = sum(act[idx-(k):idx+k])

    #adds a count only if the number of boundaries is greater than 0
    if abs(sum_pred - sum_act) > 0:
      count += 1

    idx += 1

  # print(miss_count)
  # print(measurement)
  wd = (1/(N-k))*count

  return wd

In [None]:
# Get preds, estimate a threshold for now
preds = rnn_model.predict(X_test)

# Convert preds to binary using threshold
thresh = 0.5
preds_bin = [[1 if x > thresh else 0 for x in p.reshape(-1)] for p in preds]

# Eval PK
pk_metrics = []
wd_metrics = []
for i in range(test_size):
  pk = evaluate_pk(np.array(preds_bin[i]), np.array(y_test[i]), k=int(average_sentences(y_test[i])/2))
  pk_metrics.append(pk)
  wd = evaluate_wd(np.array(preds_bin[i]), np.array(y_test[i]), k=int(average_sentences(y_test[i])/2))
  wd_metrics.append(wd)

print(f"\nThreshold: {thresh}")
print(f"PK Mean: {np.mean(pk_metrics)}")
print(f"WD Mean: {np.mean(wd_metrics)}")

In [None]:
# # Get threshold for each episode by using mu, sigma of probs
# cos_mus = tf.reduce_mean(preds, axis=0).numpy().reshape(-1)
# cos_sigs = tf.math.reduce_std(preds, axis=0).numpy().reshape(-1)
# ep_thresh = cos_mus + cos_sigs

# # Apply
# preds_bin = np.empty((test_size, max_sequence), dtype=int)
# for i in range(len(preds)):
#   preds_bin[i] = [1 if x > ep_thresh[i] else 0 for x in preds[i].reshape(-1)]

# # Eval PK
# pk_metrics = []
# wd_metrics = []
# for i in range(test_size):
#   pk = evaluate_pk(np.array(preds_bin[i]), np.array(y_test[i]), k=int(average_sentences(y_test[i])/2))
#   pk_metrics.append(pk)xa
#   wd = evaluate_wd(np.array(preds_bin[i]), np.array(y_test[i]), k=int(average_sentences(y_test[i])/2))
#   wd_metrics.append(wd)

# print(f"PK Mean: {np.mean(pk_metrics)}")
# print(f"WD Mean: {np.mean(wd_metrics)}")