# BERT Embeddings

This code was run in the Google Colab environment to avoid conflicts with the version of TensorFlow used in the rest of this repository. The encoder model code is partially based on the following [notebook](https://www.tensorflow.org/text/tutorials/bert_glue).

In [None]:
# A dependency of the preprocessing for BERT inputs
!pip install -q -U "tensorflow-text==2.8.*"

In [None]:
!pip install -q tf-models-official==2.7.0

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

import numpy as np

tf.get_logger().setLevel('ERROR')

In [None]:
#@title Choose a BERT model to fine-tune

# Name of the encoder to use. It is the lookup key into both
# map_name_to_handle and map_model_to_preprocess further down in this cell.
# NOTE(review): despite the form title, the encoder is loaded with
# trainable=False in build_encoder_model, i.e. it is only used for inference.
bert_model_name = 'bert_en_uncased_L-12_H-768_A-12'  #@param ["bert_en_uncased_L-12_H-768_A-12", "bert_en_cased_L-12_H-768_A-12", "bert_multi_cased_L-12_H-768_A-12", "small_bert/bert_en_uncased_L-2_H-128_A-2", "small_bert/bert_en_uncased_L-2_H-256_A-4", "small_bert/bert_en_uncased_L-2_H-512_A-8", "small_bert/bert_en_uncased_L-2_H-768_A-12", "small_bert/bert_en_uncased_L-4_H-128_A-2", "small_bert/bert_en_uncased_L-4_H-256_A-4", "small_bert/bert_en_uncased_L-4_H-512_A-8", "small_bert/bert_en_uncased_L-4_H-768_A-12", "small_bert/bert_en_uncased_L-6_H-128_A-2", "small_bert/bert_en_uncased_L-6_H-256_A-4", "small_bert/bert_en_uncased_L-6_H-512_A-8", "small_bert/bert_en_uncased_L-6_H-768_A-12", "small_bert/bert_en_uncased_L-8_H-128_A-2", "small_bert/bert_en_uncased_L-8_H-256_A-4", "small_bert/bert_en_uncased_L-8_H-512_A-8", "small_bert/bert_en_uncased_L-8_H-768_A-12", "small_bert/bert_en_uncased_L-10_H-128_A-2", "small_bert/bert_en_uncased_L-10_H-256_A-4", "small_bert/bert_en_uncased_L-10_H-512_A-8", "small_bert/bert_en_uncased_L-10_H-768_A-12", "small_bert/bert_en_uncased_L-12_H-128_A-2", "small_bert/bert_en_uncased_L-12_H-256_A-4", "small_bert/bert_en_uncased_L-12_H-512_A-8", "small_bert/bert_en_uncased_L-12_H-768_A-12", "albert_en_base", "electra_small", "electra_base", "experts_pubmed", "experts_wiki_books", "talking-heads_base"]

# Maps every selectable model name to the TF Hub handle of its encoder.
# Insertion order matches the option list of the model-selection form.
map_name_to_handle = {
    # Full-size BERT checkpoints (version 3 on TF Hub).
    **{
        name: f'https://tfhub.dev/tensorflow/{name}/3'
        for name in (
            'bert_en_uncased_L-12_H-768_A-12',
            'bert_en_cased_L-12_H-768_A-12',
            'bert_multi_cased_L-12_H-768_A-12',
        )
    },
    # The 24 "small BERT" variants: every depth L combined with every hidden
    # size H; the number of attention heads A is always H / 64.
    **{
        f'small_bert/bert_en_uncased_L-{layers}_H-{hidden}_A-{hidden // 64}':
            'https://tfhub.dev/tensorflow/small_bert/'
            f'bert_en_uncased_L-{layers}_H-{hidden}_A-{hidden // 64}/1'
        for layers in (2, 4, 6, 8, 10, 12)
        for hidden in (128, 256, 512, 768)
    },
    # Remaining encoder families, each with its own handle scheme.
    'albert_en_base':
        'https://tfhub.dev/tensorflow/albert_en_base/2',
    'electra_small':
        'https://tfhub.dev/google/electra_small/2',
    'electra_base':
        'https://tfhub.dev/google/electra_base/2',
    'experts_pubmed':
        'https://tfhub.dev/google/experts/bert/pubmed/2',
    'experts_wiki_books':
        'https://tfhub.dev/google/experts/bert/wiki_books/2',
    'talking-heads_base':
        'https://tfhub.dev/tensorflow/talkheads_ggelu_bert_en_base/1',
}

# Maps every selectable model name to the TF Hub handle of the preprocessing
# model that produces its inputs. Most encoders share the uncased English
# BERT preprocessor; only the cased, multilingual and ALBERT models differ.
map_model_to_preprocess = {
    'bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'bert_en_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3',
    # All 24 "small BERT" variants (depth L x hidden size H, heads A = H / 64).
    **{
        f'small_bert/bert_en_uncased_L-{layers}_H-{hidden}_A-{hidden // 64}':
            'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'
        for layers in (2, 4, 6, 8, 10, 12)
        for hidden in (128, 256, 512, 768)
    },
    'bert_multi_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_multi_cased_preprocess/3',
    'albert_en_base':
        'https://tfhub.dev/tensorflow/albert_en_preprocess/3',
    # Remaining encoders reuse the uncased English BERT preprocessor.
    **dict.fromkeys(
        (
            'electra_small',
            'electra_base',
            'experts_pubmed',
            'experts_wiki_books',
            'talking-heads_base',
        ),
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    ),
}

# Resolve the TF Hub handles for the selected model. Either lookup raises
# KeyError if bert_model_name is not a key of the corresponding map.
# These two module-level names are read later by make_bert_preprocess_model
# (tfhub_handle_preprocess) and build_encoder_model (tfhub_handle_encoder).
tfhub_handle_encoder = map_name_to_handle[bert_model_name]
tfhub_handle_preprocess = map_model_to_preprocess[bert_model_name]

# Echo the resolved handles so the notebook output records what was used.
print(f'BERT model selected           : {tfhub_handle_encoder}')
print(f'Preprocess model auto-selected: {tfhub_handle_preprocess}')

BERT model selected           : https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3
Preprocess model auto-selected: https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3


# Define encoder model

In [None]:
def make_bert_preprocess_model(input_feature, seq_length=128):
  """Returns a Keras Model mapping a single string feature to BERT inputs.

  The preprocessing SavedModel is loaded from the module-level
  `tfhub_handle_preprocess` handle.

  Args:
    input_feature: name given to the string-valued Keras input layer.
    seq_length: an integer that defines the sequence length of BERT inputs.

  Returns:
    A Keras Model that is called on a batch of strings (a 1-D string tensor
    or array) and returns the dict of tensors produced by the preprocessing
    model's `bert_pack_inputs`, ready to be fed to the encoder.
  """

  # Named text_input (not `input`) so the Python builtin is not shadowed.
  text_input = tf.keras.layers.Input(
      shape=(), dtype=tf.string, name=input_feature)

  # Tokenize the text to word pieces.
  bert_preprocess = hub.load(tfhub_handle_preprocess)
  tokenizer = hub.KerasLayer(bert_preprocess.tokenize, name='tokenizer')
  tokenized_input = tokenizer(text_input)

  # No explicit trimming step: with a single segment we let the packer apply
  # its default truncation to fit seq_length.

  # Pack inputs. The details (start/end token ids, dict of output tensors)
  # are model-dependent, so this gets loaded from the SavedModel.
  packer = hub.KerasLayer(bert_preprocess.bert_pack_inputs,
                          arguments=dict(seq_length=seq_length),
                          name='packer')
  # bert_pack_inputs takes a list of segments; here there is exactly one.
  model_inputs = packer([tokenized_input])
  return tf.keras.Model(text_input, model_inputs)

In [None]:
def build_encoder_model():
  """Builds a frozen encoder model that returns only the pooled output.

  The encoder is loaded from the module-level `tfhub_handle_encoder` handle
  with `trainable=False`.

  Returns:
    A Keras Model named "bert_encoder" that maps the dict of preprocessed
    inputs to the encoder's "pooled_output" tensor.
  """

  class BertEncoder(tf.keras.Model):

    def __init__(self):
      super().__init__(name="bert_encoder")
      # Weights stay frozen: the encoder is used for feature extraction only.
      self.encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=False)

    def call(self, preprocessed_text):
      # Drop every other encoder output and keep the pooled representation.
      return self.encoder(preprocessed_text)["pooled_output"]

  return BertEncoder()

## Test the encoder model

In [None]:
# Smoke-test the preprocessing model on a batch holding one empty string.
test_text = np.array([''])
test_preprocess_model = make_bert_preprocess_model('my_input', seq_length=512)
text_preprocessed = test_preprocess_model(test_text)

print('Keys           : ', list(text_preprocessed.keys()))
# For each packed tensor, show its shape and the first 16 values of row 0.
for shape_label, values_label, tensor_key in (
    ('Shape Word Ids : ', 'Word Ids       : ', 'input_word_ids'),
    ('Shape Mask     : ', 'Input Mask     : ', 'input_mask'),
    ('Shape Type Ids : ', 'Type Ids       : ', 'input_type_ids'),
):
  print(shape_label, text_preprocessed[tensor_key].shape)
  print(values_label, text_preprocessed[tensor_key][0, :16])

Keys           :  ['input_mask', 'input_type_ids', 'input_word_ids']
Shape Word Ids :  (1, 512)
Word Ids       :  tf.Tensor([101 102   0   0   0   0   0   0   0   0   0   0   0   0   0   0], shape=(16,), dtype=int32)
Shape Mask     :  (1, 512)
Input Mask     :  tf.Tensor([1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0], shape=(16,), dtype=int32)
Shape Type Ids :  (1, 512)
Type Ids       :  tf.Tensor([0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], shape=(16,), dtype=int32)


In [None]:
test_encoder_model = build_encoder_model()

In [None]:
bert_result = test_encoder_model(text_preprocessed)
print(bert_result)

tf.Tensor(
[[-0.9061686  -0.30534935  0.3171521   0.83511347  0.4395665  -0.29972827
   0.8893458   0.36543632 -0.22864583 -0.99962205 -0.7577885   0.8510732
   0.99460214 -0.7135146   0.9438737  -0.67603296 -0.7742429  -0.40936226
   0.44422138 -0.89821154  0.8173      0.99742836  0.36303943  0.2179146
   0.2788157   0.97933036 -0.84844613  0.9776022   0.959225    0.6766515
  -0.82754135  0.27336928 -0.99674845 -0.09600674 -0.6522807  -0.9957248
   0.34605992 -0.817617   -0.13695836 -0.19245848 -0.97509336  0.46894208
   0.9992123   0.91534936  0.6905341  -0.00565991 -0.9999982   0.13878094
  -0.8992897   0.29528618  0.4055255   0.80721307 -0.11436047  0.4293809
   0.24548663 -0.05618633  0.07560661  0.18532558 -0.08784298 -0.44802493
  -0.32473582  0.19809651 -0.49014044 -0.9536841   0.7677338  -0.80937487
  -0.4013765  -0.44057786  0.07018607 -0.09922893  0.81822723  0.31495848
   0.36488968 -0.89640534 -0.61990035  0.09573567 -0.3418816   1.
  -0.5366012  -0.9968454   0.59286726 -0

# Encode YAudit data

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
import pandas as pd
from tqdm import tqdm
import pickle
import os

In [None]:
videos = pd.read_csv('/content/gdrive/MyDrive/YAudit_data/annotated-data.csv')

In [None]:
videos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2622 entries, 0 to 2621
Data columns (total 40 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   is_deleted                 2622 non-null   bool   
 1   deleted_at                 0 non-null      float64
 2   uuid                       2622 non-null   object 
 3   published_at               2622 non-null   object 
 4   title                      2622 non-null   object 
 5   description                2589 non-null   object 
 6   default_language           771 non-null    object 
 7   default_audio_language     1781 non-null   object 
 8   duration                   2622 non-null   object 
 9   dimension                  2622 non-null   object 
 10  definition                 2622 non-null   object 
 11  caption                    2622 non-null   bool   
 12  licensed_content           2622 non-null   bool   
 13  projection                 2622 non-null   objec

In [None]:
preprocess_model = make_bert_preprocess_model('text_input', seq_length=512)
encoder_model = build_encoder_model()

In [None]:
def load_data(filepath, base_path='/content/gdrive/MyDrive/YAudit_data/'):
  """Loads a pickled object from the data directory.

  Args:
    filepath: file name (relative to `base_path`) of the pickle to read.
    base_path: directory holding the data files; defaults to the mounted
      Google Drive folder used throughout this notebook.

  Returns:
    The unpickled object, or an empty dict when the file does not exist
    (so a first run starts from an empty cache).
  """
  full_path = os.path.join(base_path, filepath)
  if not os.path.exists(full_path):
    return {}

  # NOTE: pickle.load can execute arbitrary code; only use it on files that
  # this notebook itself wrote.
  with open(full_path, 'rb') as file:
    return pickle.load(file)

def dump_data(data, filepath, base_path='/content/gdrive/MyDrive/YAudit_data/'):
  """Pickles `data` into the data directory, replacing any previous file.

  The pickle is first written to a sibling temporary file and then moved
  over the target, so an interruption in the middle of a write does not
  leave a truncated checkpoint behind for the next load.

  Args:
    data: the object to pickle.
    filepath: file name (relative to `base_path`) of the pickle to write.
    base_path: directory holding the data files; defaults to the mounted
      Google Drive folder used throughout this notebook.
  """
  full_path = os.path.join(base_path, filepath)
  tmp_path = f'{full_path}.tmp'
  with open(tmp_path, 'wb') as file:
    pickle.dump(data, file)
  # Swap the finished file into place in a single step.
  os.replace(tmp_path, full_path)

## Snippets

In [None]:
snippets_pickle = 'video_snippets.pickle'
all_video_snippets = load_data(snippets_pickle)

In [None]:
# Embed "<title>. <description>" for every video, resuming from the cache
# loaded into all_video_snippets.
with tqdm(total=len(videos)) as pbar:
  for i, video in videos.iterrows():
    # Videos already present in the cache (from an earlier run) are skipped.
    if video['uuid'] not in all_video_snippets:
      # Some descriptions are missing (NaN); formatting NaN into the f-string
      # would feed the literal text "nan" to the encoder, so use '' instead.
      # Entries cached before this fix are not recomputed automatically.
      description = '' if pd.isna(video['description']) else video['description']
      snippet_text = np.array([f"{video['title']}. {description}"])
      preprocessed_text = preprocess_model(snippet_text)
      encoded_text = encoder_model(preprocessed_text)
      all_video_snippets[video['uuid']] = encoded_text
      # Checkpoint whenever the row index is a multiple of 10.
      if (i % 10) == 0:
        dump_data(all_video_snippets, snippets_pickle)
    pbar.update(1)
  # Final save so the entries added since the last checkpoint are persisted.
  dump_data(all_video_snippets, snippets_pickle)

100%|██████████| 2622/2622 [1:43:42<00:00,  2.37s/it]


In [None]:
len(all_video_snippets)

2622

## Transcripts

In [None]:
import re


def remove_tags(text):
    """
    Strip vtt inline markup from the text and shorten cue timing lines.

    Removes `</c>`, `<c>` / `<c.colorXXXX>` and inline `<HH:MM:SS.mmm>` tags,
    replaces every cue timing line ending in `align:start position:0%` by the
    first two fields of its start time, and empties whitespace-only lines.
    """
    # Inline markup that is deleted outright, applied in this order.
    inline_markup = (
        r'</c>',
        r'<c(\.color\w+)?>',
        r'<\d{2}:\d{2}:\d{2}\.\d{3}>',
    )
    cleaned = text
    for pattern in inline_markup:
        cleaned = re.sub(pattern, '', cleaned)

    # Keep only the captured start of the cue's first timestamp.
    cue_line = r'(\d{2}:\d{2}):\d{2}\.\d{3} --> .* align:start position:0%'
    cleaned = re.sub(cue_line, r'\g<1>', cleaned)

    # Lines consisting solely of whitespace become empty lines.
    return re.sub(r'^\s+$', '', cleaned, flags=re.MULTILINE)

def remove_header(lines):
    """
    Drop the vtt file header from a list of lines.

    Everything up to and including a header marker line is removed. When
    both markers are present, the position of 'Language: en' is the one used.
    Without any marker the lines are returned unchanged (as a new list).
    """
    marker_positions = [
        lines.index(marker)
        for marker in ('##', 'Language: en')
        if marker in lines
    ]
    # The last marker checked wins; -1 makes the slice below keep everything.
    header_end = marker_positions[-1] if marker_positions else -1
    return lines[header_end + 1:]


def merge_duplicates(lines):
    """
    Remove duplicated subtitles. Duplicates are always adjacent.

    Empty lines and bare `HH:MM` timestamp lines are dropped entirely; every
    other line is yielded unless it equals the previously yielded caption.
    """
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    timestamp = re.compile(r'^\d{2}:\d{2}$')
    last_cap = ''
    for line in lines:
        if line == "" or timestamp.match(line):
            continue
        if line != last_cap:
            yield line
            last_cap = line


def merge_short_lines(lines):
    """
    Join consecutive caption lines into chunks of fewer than 80 characters.

    Lines are appended (space separated) to a buffer while the combined
    length stays below 80; otherwise the buffer is yielded and restarted
    with the current line. Empty lines and bare `HH:MM` timestamp lines are
    yielded immediately, prefixed with a newline, without touching the
    buffer. The remaining buffer is always yielded at the end, so an empty
    input yields a single empty string.
    """
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    timestamp = re.compile(r'^\d{2}:\d{2}$')
    buffer = ''
    for line in lines:
        if line == "" or timestamp.match(line):
            yield '\n' + line
            continue

        if len(line + buffer) < 80:
            buffer += ' ' + line
        else:
            yield buffer.strip()
            buffer = line
    # Strip like every other chunk; otherwise short inputs keep the leading
    # space added when the first line was appended to the empty buffer.
    yield buffer.strip()


def parse_transcript(text):
    """
    Convert raw vtt subtitle text into a single plain-text string.

    Runs the cleaning steps in order: strip markup, drop the file header,
    remove duplicated captions, merge short lines, join the chunks with
    spaces, and finally delete any remaining
    `HH:MM:SS.mmm --> HH:MM:SS.mmm ` cue timings.
    """
    lines = remove_header(remove_tags(text).splitlines())
    chunks = merge_short_lines(list(merge_duplicates(lines)))
    result = ' '.join(chunks)
    # Raw string: '\d' and '\.' in a plain literal are invalid escape sequences.
    return re.sub(
        r'\d{2}:\d{2}:\d{2}\.\d{3} --> \d{2}:\d{2}:\d{2}\.\d{3} ', '', result)

In [None]:
# Back up the original column only once, so that re-running this cell does
# not overwrite the backup with the already re-cleaned transcripts.
if 'clean_transcript_old' not in videos.columns:
  videos['clean_transcript_old'] = videos['clean_transcript']

# Missing transcripts become '' instead of going through str(), which would
# turn a NaN into the literal text "nan" and have that text embedded later.
videos['clean_transcript'] = videos['transcript'].apply(
    lambda transcript: '' if pd.isna(transcript) else parse_transcript(str(transcript)))

In [None]:
transcripts_pickle = 'video_transcripts.pickle'
all_video_transcripts = load_data(transcripts_pickle)

In [None]:
# Embed the cleaned transcript of every video, resuming from the cache
# loaded into all_video_transcripts.
with tqdm(total=len(videos)) as pbar:
  for i, video in videos.iterrows():
    # Only videos missing from the cache need to be encoded.
    if video['uuid'] not in all_video_transcripts:
      transcript_text = np.array([video['clean_transcript']])
      preprocessed_text = preprocess_model(transcript_text)
      encoded_text = encoder_model(preprocessed_text)
      all_video_transcripts[video['uuid']] = encoded_text
      # Checkpoint whenever the row index is a multiple of 10.
      if (i % 10) == 0:
        dump_data(all_video_transcripts, transcripts_pickle)
    pbar.update(1)
  # Final save so the entries added since the last checkpoint are persisted.
  dump_data(all_video_transcripts, transcripts_pickle)

100%|██████████| 2622/2622 [1:55:31<00:00,  2.64s/it]


In [None]:
len(all_video_transcripts)

2622

## Comments

In [None]:
comments = pd.read_csv('/content/gdrive/MyDrive/YAudit_data/youtube-comments.csv')

In [None]:
# Translate each comment's video_id into the uuid used in the videos table;
# ids without a match map to a missing value.
video_uuids = videos.set_index('id')['uuid'].to_dict()
comments['video_uuid'] = comments['video_id'].map(video_uuids)
# Keep only comments of known videos. The explicit copy makes the result an
# independent frame, so later column assignments on `comments` do not raise
# SettingWithCopyWarning.
comments = comments.loc[comments['video_uuid'].notna()].copy()

In [None]:
comments.shape

(444605, 26)

In [None]:
comments.loc[comments['text_original'].isna(), ['text_display']]

Unnamed: 0,text_display
200801,"<a href=""about:invalid#zCSafez""></a>"
332999,"<a href=""about:invalid#zCSafez""></a>"
381745,"<a href=""about:invalid#zCSafez""></a>"
389751,"<a href=""about:invalid#zCSafez""></a>"
396454,"<a href=""about:invalid#zCSafez""></a>"
467200,"<a href=""about:invalid#zCSafez""></a>"
478880,"<a href=""about:invalid#zCSafez""></a>"


In [None]:
comments['text_original'] = comments['text_original'].fillna('')

In [None]:
comments.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 444605 entries, 0 to 487499
Data columns (total 26 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   is_deleted                444605 non-null  object 
 1   deleted_at                0 non-null       float64
 2   uuid                      444605 non-null  object 
 3   author_display_name       444583 non-null  object 
 4   author_profile_image_url  444605 non-null  object 
 5   author_channel_url        444605 non-null  object 
 6   text_display              444605 non-null  object 
 7   text_original             444605 non-null  object 
 8   can_rate                  444605 non-null  object 
 9   like_count                444605 non-null  int64  
 10  published_at              444605 non-null  object 
 11  youtube_update_timestamp  444605 non-null  object 
 12  can_reply                 218129 non-null  object 
 13  total_reply_count         218129 non-null  f

In [None]:
# Concatenate all comment texts of each video into one space-separated string.
comments_grouped_by_video = comments.groupby('video_uuid')
comments_per_video = {
    video_uuid: ' '.join(video_comments['text_original'].values)
    for video_uuid, video_comments in tqdm(
        comments_grouped_by_video, total=len(comments_grouped_by_video))
}

100%|██████████| 2431/2431 [00:00<00:00, 3093.94it/s]


In [None]:
comments_per_video_file = 'comments_per_video.pickle'
dump_data(comments_per_video, comments_per_video_file)

In [None]:
comments_per_video['-5xNn86mNh8']

In [None]:
comments_pickle = 'video_comments.pickle'
all_video_comments = load_data(comments_pickle)

In [None]:
# Embed the concatenated comments of every video, resuming from the cache
# loaded into all_video_comments.
with tqdm(total=len(comments_per_video)) as pbar:
  for i, (video_uuid, comments_text) in enumerate(comments_per_video.items()):
    # Only videos missing from the cache need to be encoded.
    if video_uuid not in all_video_comments:
      preprocessed_text = preprocess_model(np.array([comments_text]))
      encoded_text = encoder_model(preprocessed_text)
      all_video_comments[video_uuid] = encoded_text
      # Checkpoint whenever the position is a multiple of 10.
      if (i % 10) == 0:
        dump_data(all_video_comments, comments_pickle)
    pbar.update(1)
  # Final save so the entries added since the last checkpoint are persisted.
  dump_data(all_video_comments, comments_pickle)

100%|██████████| 2431/2431 [1:45:14<00:00,  2.60s/it]


In [None]:
len(all_video_comments)

2431

### Fill missing video comments

In [None]:
filled_comments_pickle = 'video_comments_filled.pickle'
filled_video_comments = load_data(filled_comments_pickle)

In [None]:
# Embedding of the empty string: the stand-in for videos without comments.
preprocessed_empty_text = preprocess_model(np.array(['']))
encoded_empty_text = encoder_model(preprocessed_empty_text)

# Give every video that has a snippet embedding a comments embedding too.
with tqdm(total=len(all_video_snippets)) as pbar:
  for video_uuid in all_video_snippets:
    # Entries already filled on a previous run are left untouched.
    if video_uuid not in filled_video_comments:
      filled_video_comments[video_uuid] = all_video_comments.get(
          video_uuid, encoded_empty_text)
    pbar.update(1)

  dump_data(filled_video_comments, filled_comments_pickle)

100%|██████████| 2622/2622 [00:00<00:00, 25149.08it/s]
