## Handle Files

In [1]:
# Mount Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Authenticate to access cloud bucket
from google.colab import auth
auth.authenticate_user()

#create a LOCAL directory in /content/  so you can move stuff from bucket to local
!mkdir /content/nlp_podcast_episodes_2
# # copy from google bucket to local directory
!gsutil -m -q cp -r -n gs://podcast_episodes/train_test/ /content/nlp_podcast_episodes_2

## Imports and Paths

In [3]:
import pandas as pd
import numpy as np
from tensorflow import convert_to_tensor
from tensorflow.keras import layers
from tensorflow.keras.metrics import Accuracy, Recall
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Bidirectional
import tensorflow as tf
import seaborn as sns
import matplotlib.pyplot as plt
import os
from tqdm import tqdm
import json
import sys
from nltk.metrics.segmentation import windowdiff
from nltk.metrics.segmentation import pk

path = '/content/drive/MyDrive/nlp_podcast_segmentation/'
yt_path = path + 'data/YouTube/'
splits = 'yt_scripts_segments_split_n5_111422'
pooling = 'all-MiniLM-L6-v2-meanpooling' # all-MiniLM-L6-v2-meanpooling or #stsb-mpnet-base-v2-meanpooling
embed_path = yt_path + f'embeddings/{splits}/{pooling}/'
dev_dir =  path + 'scripts/john/supervised/modeling/'
train_test_path = '/content/nlp_podcast_episodes_2/train_test/'

sys.path.append(dev_dir)
from data_loader import DataGenerator

In [4]:
# Set params
max_sequence = 350
embed_dim = 384
batch_size = 1024
c = 1

In [5]:
## Eval function
def average_sentences(topic_list):
  '''
  Counts the number of sentences between topics
  input: topic labels
  returns: average number of sentences between topics
  '''
  idx = 0
  count = 0
  sentence_counts = []
  
  while(idx < len(topic_list)):
    if(topic_list[idx] == 1):
      sentence_counts.append(count)
      count = 0
    count += 1
    idx += 1

  if len(sentence_counts) == 0:
    return 0
    
  return sum(sentence_counts) / len(sentence_counts)

## Data loaders

In [6]:
c = 1

In [7]:
# Parameters
params = {'input_dim': (max_sequence, embed_dim),
          'output_dim': max_sequence,
          'batch_size': batch_size,
          'shuffle': True}

# Datasets
with open(f"{train_test_path}partitions.json", 'r') as f:
  partitions = json.load(f)
with open(f"{train_test_path}labels.json", 'r') as f:
  labels = json.load(f)

# Generators
train_ids = partitions['train']
test_ids = partitions['test']
training_generator = DataGenerator(train_ids, labels, train_test_path, **params)
test_generator = DataGenerator(test_ids, labels, train_test_path, **params)
print(len(train_ids), len(test_ids))

42521 1652


In [8]:
## TEST LABELS
# Prepare test inputs, labels
test_size = len(test_ids)
X_test = np.empty((test_size, max_sequence, embed_dim))
k_test = []
y_test = []
y_test_list = np.empty((test_size, max_sequence), dtype=str)

for i, ID in enumerate(test_ids):
  X_test[i] = np.load(f"{train_test_path}{ID}.npy")
  y_test_list[i] = np.array(labels[ID], dtype='uint8')
  y_test.append(''.join(str(z) for z in labels[ID]))
  k_test.append(int(average_sentences(np.array(labels[ID], dtype='uint8'))/2))

In [9]:
# Loop through batches to get label counts
pos = 0
neg = 0
for ID in partitions['train']:
  pos += np.sum(labels[ID])
  neg += len(labels[ID]) - np.sum(labels[ID])
total = pos + neg
print(f"Pos rate: {pos/total}, Neg rate: {neg/total}, Total: {total}")

# Scaling by total/2 helps keep the loss to a similar magnitude.
# The sum of the weights of all examples stays the same.
initial_bias = np.log([pos/neg])
print(f"Initial Bias: {initial_bias}")
weight_for_0 = (1 / neg) * (total / 2.0)
weight_for_1 = (1 / pos) * (total / 2.0)
class_weight = {0: weight_for_0, 1: weight_for_1}
print('Weight for class 0: {:.2f}'.format(weight_for_0))
print('Weight for class 1: {:.2f}'.format(weight_for_1))

pos_weight = c*weight_for_1/weight_for_0
print(f"Pos_weight: {pos_weight}")

Pos rate: 0.04250545108803381, Neg rate: 0.9574945489119662, Total: 14882350
Initial Bias: [-3.1146877]
Weight for class 0: 0.52
Weight for class 1: 11.76
Pos_weight: 22.526394248325516


## Baseline Model (Simple RNN)

In [10]:
def create_rnn_model(max_sequence_length=1000,
                     embed_dim = 384,
                     hidden_dim=64,
                     dropout=0.10,
                     learning_rate=0.001,
                     output_bias=np.log(0.05/0.95),
                     loss=tf.keras.losses.BinaryCrossentropy(from_logits=True)
                     ):

  ## Build model
  output_bias = tf.keras.initializers.Constant(output_bias)
  input_layer = tf.keras.layers.Input(
      shape=(max_sequence_length, embed_dim), dtype='float32')
  bilstm1 = Bidirectional(LSTM(
      hidden_dim, return_sequences=True, activation='relu', name='rnn_layer_1'
      ))(input_layer)
  bilstm2 = Bidirectional(LSTM(
      hidden_dim, return_sequences=True, activation='relu', name='rnn_layer_2'
      ))(bilstm1)
  drop1 = tf.keras.layers.Dropout(dropout)(bilstm2)
  dense1 = tf.keras.layers.Dense(128, activation='relu', name='dense_1')(drop1)
  dense2 = tf.keras.layers.Dense(32, activation='relu', name='dense_2')(dense1)                  
  output = tf.keras.layers.Dense(1, activation='sigmoid',
                                 bias_initializer=output_bias,
                                 name='segment_classifier')(dense2)
  
  rnn_model = tf.keras.models.Model(inputs=input_layer, outputs=[output])
  rnn_model.compile(loss=loss,
                    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate),
                    metrics=[tf.keras.metrics.Recall(), 
                             tf.keras.metrics.Accuracy()]
                    )

  ### END YOUR CODE
  return rnn_model

In [11]:
# Eventual loss function strucutre
def my_loss(weight):
  def weighted_cross_entropy_with_logits(labels, logits):
    labels = tf.cast(labels, 'float32')
    #logits = tf.where(tf.less(logits, 0), 0., logits) # prevents odd error of pred < 0
    loss = tf.nn.weighted_cross_entropy_with_logits(
        labels, logits, weight
    )
    return loss
  return weighted_cross_entropy_with_logits

## LR Scheduler
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.1,
    patience=3,
    verbose=1,
    mode="auto",
    min_delta=0.0001)

In [17]:
rnn_model = create_rnn_model(max_sequence_length=max_sequence,
                              embed_dim=embed_dim,
                              hidden_dim=32,
                              dropout=0.05,
                              learning_rate=0.00001,
                              output_bias=initial_bias,
                              loss=my_loss(weight=pos_weight)
                              )
rnn_model.summary()

Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 350, 384)]        0         
                                                                 
 bidirectional_6 (Bidirectio  (None, 350, 64)          106752    
 nal)                                                            
                                                                 
 bidirectional_7 (Bidirectio  (None, 350, 64)          24832     
 nal)                                                            
                                                                 
 dropout_3 (Dropout)         (None, 350, 64)           0         
                                                                 
 dense_1 (Dense)             (None, 350, 128)          8320      
                                                                 
 dense_2 (Dense)             (None, 350, 32)           4128

## Train

In [None]:
rnn_history = rnn_model.fit(training_generator,
                            validation_data=test_generator,
                            epochs=20,
                            callbacks=[reduce_lr],
                            )

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 8: ReduceLROnPlateau reducing learning rate to 9.999999747378752e-07.
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 11: ReduceLROnPlateau reducing learning rate to 9.999999974752428e-08.
Epoch 12/20
Epoch 13/20
Epoch 14/20

KeyboardInterrupt: ignored

## Evaluate

In [None]:
# MAE functions
def mae_std(y_true, y_pred):
  ep_ref_tc = tf.reduce_sum(np.array(y_true, dtype='int'), axis=1).numpy()
  ep_pred_tc = tf.reduce_sum(np.array(y_pred, dtype='int'), axis=1).numpy()
  ep_diff = np.abs(ep_ref_tc - ep_pred_tc)
  ep_rel_diff = ep_diff/ep_ref_tc
  mae_std = np.mean(ep_rel_diff)

  return mae_std

# Remove back to back transition preds (keep last one in bb set)
def remove_bb_preds(y_pred_bin_int):
  for ep_num in range(len(y_pred_bin_int)):
    pos_inds = np.where(np.array(y_pred_bin_int[ep_num])==1)[0]
    for i in range(1, len(pos_inds)):
      if pos_inds[i-1] == (pos_inds[i]-1):
        y_pred_bin_int[ep_num][pos_inds[i-1]] = 0 #set left ind to zero

  return y_pred_bin_int


def eval_preds(preds, thresh):
  preds_bin_int = [[1 if x > thresh else 0 for x in p.reshape(-1)] for p in preds]
  preds_bin_int = remove_bb_preds(preds_bin_int)
  preds_bin_str = [''.join(str(a) for a in z) for z in preds_bin_int]

  # Get PK, WD
  pk_metrics = []
  wd_metrics = []
  for i in range(test_size):
    pk_metric = pk(preds_bin_str[i], y_test[i], k=k_test[i])
    wd = windowdiff(preds_bin_str[i], y_test[i], k=k_test[i])
    pk_metrics.append(pk_metric)
    wd_metrics.append(wd)

  # Get MAE
  mae = mae_std(y_test_list, preds_bin_int)

  print(f"\nThreshold: {thresh}")
  print(f"PK Mean: {np.mean(pk_metrics)}")
  print(f"WD Mean: {np.mean(wd_metrics)}")
  print(f"MAE    : {round(mae, 3)}")

  return preds_bin_str, pk_metrics, wd_metrics, mae

In [None]:
preds = rnn_model.predict(X_test)
thresholds = [0.25, 0.5, 0.75, 0.9 , 0.99, 0.999, 0.99999999999]
for thresh in thresholds:
  preds_bin, pk_metrics, wd_metrics, mae = eval_preds(preds, thresh)


Threshold: 0.25
PK Mean: 0.3094044987009672
WD Mean: 0.3094044987009672
MAE    : 1.0

Threshold: 0.5
PK Mean: 0.3094044987009672
WD Mean: 0.3094044987009672
MAE    : 1.0

Threshold: 0.75
PK Mean: 0.3094044987009672
WD Mean: 0.3094044987009672
MAE    : 1.0

Threshold: 0.9
PK Mean: 0.3094044987009672
WD Mean: 0.3094044987009672
MAE    : 1.0

Threshold: 0.99
PK Mean: 0.3094044987009672
WD Mean: 0.3094044987009672
MAE    : 1.0

Threshold: 0.999
PK Mean: 0.3094044987009672
WD Mean: 0.3094044987009672
MAE    : 1.0

Threshold: 0.99999999999
PK Mean: 0.3094044987009672
WD Mean: 0.3094044987009672
MAE    : 1.0


In [None]:
# # Get threshold for each episode by using mu, sigma of probs
# cos_mus = tf.reduce_mean(preds, axis=0).numpy().reshape(-1)
# cos_sigs = tf.math.reduce_std(preds, axis=0).numpy().reshape(-1)
# ep_thresh = cos_mus + cos_sigs

# # Apply
# preds_bin = np.empty((test_size, max_sequence), dtype=int)
# for i in range(len(preds)):
#   preds_bin[i] = [1 if x > ep_thresh[i] else 0 for x in preds[i].reshape(-1)]

# # Eval PK
# pk_metrics = []
# wd_metrics = []
# for i in range(test_size):
#   pk = evaluate_pk(np.array(preds_bin[i]), np.array(y_test[i]), k=int(average_sentences(y_test[i])/2))
#   pk_metrics.append(pk)xa
#   wd = evaluate_wd(np.array(preds_bin[i]), np.array(y_test[i]), k=int(average_sentences(y_test[i])/2))
#   wd_metrics.append(wd)

# print(f"PK Mean: {np.mean(pk_metrics)}")
# print(f"WD Mean: {np.mean(wd_metrics)}")