<a href="https://colab.research.google.com/github/jmbost20/YouTransfer/blob/main/Clean_Prepare_Transcript.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#For handling
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

#For tokenizing
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('punkt')

#For vocab building and tensor operations
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Seed both random number generators used in this notebook: torch for tensor
# work, and NumPy because the chunk-length draws (np.random.poisson) and the
# corpus balancing (np.random.choice) use NumPy's global RNG, not torch's.
torch.manual_seed(42)
np.random.seed(42)

import json

from collections import Counter, OrderedDict
from torchtext.vocab import vocab

from google.colab import drive
drive.mount('/content/drive')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Mounted at /content/drive


In [None]:
def clean_words_string(id, df):
  """Return the cleaned, lower-cased transcript stored for index label `id`.

  Reads the 'Full_Transcript' value of row `id` in `df`, applies a fixed
  sequence of literal substring replacements, and lower-cases the result.
  """
  # (old, new) pairs; they are applied strictly in this order.
  substitutions = (
      ('\\n', ' '),      # literal backslash-n sequence -> single space
      ('\\xa0', ''),     # literal backslash-xa0 sequence -> removed
      ('\'', ''),        # apostrophes -> removed
      ('[Music]', ''),   # caption marker -> removed
      ('[', ''),         # any remaining square brackets -> removed
      (']', ''),
      (',,', ','),       # doubled comma -> single comma
  )

  transcript = df.loc[id]['Full_Transcript']
  for old, new in substitutions:
    transcript = transcript.replace(old, new)
  return transcript.lower()

In [None]:
def pull_draw(threshold,minimum, mean_len, max_tries=1000):
  """Draw a random chunk length that lies inside [minimum, threshold].

  Args:
    threshold: largest acceptable draw (inclusive).
    minimum: smallest acceptable draw (inclusive).
    mean_len: mean token count of the input sentences; used as the Poisson
      rate for redraws, and (capped at 15) for the first draw.
    max_tries: upper limit on the number of redraws before the last draw is
      clamped into the window instead.

  Returns:
    An int no larger than `threshold`, and no smaller than `minimum` whenever
    `minimum <= threshold`.
  """
  # An empty window cannot be satisfied by sampling; the upper bound wins
  # because callers derive it from the number of tokens still available.
  if minimum > threshold:
    return int(threshold)

  # Grab the initial draw with mean equal to either the mean length or 15,
  # whichever is smaller.
  draw = np.random.poisson(min(mean_len, 15))

  # Reject draws that fall outside the window on EITHER side. The previous
  # `and` could only be true when threshold < minimum, so out-of-range draws
  # (including 0) were returned unchanged.
  tries = 0
  while (draw > threshold or draw < minimum) and tries < max_tries:
    draw = np.random.poisson(mean_len)
    tries += 1

  # If the retry budget ran out (mean far from the window), clamp rather than
  # loop forever.
  return int(min(max(draw, minimum), threshold))


In [None]:
#words is a list of lists containing word tokens from each data entry

def generate_formatted_inputs(words, upper_bound_input_size=75, minimum_floor=15):
  """Split over-long tokenised sentences into shorter, randomly sized chunks.

  Args:
    words: list of lists; each inner list holds the word tokens of one entry.
    upper_bound_input_size: entries with more tokens than this are split.
    minimum_floor: lower limit for the minimum chunk size; the effective
      minimum is the larger of this and the shortest entry length.

  Returns:
    A list of token lists. Entries no longer than `upper_bound_input_size`
    are passed through unchanged; longer entries are replaced by consecutive
    chunks that together contain every token of the entry, in order.
  """
  if not words:
    return []

  #Calc lengths to check sizes
  token_counts = [len(entry) for entry in words]
  mean_len = sum(token_counts) / len(token_counts)
  minimum = max(min(token_counts), minimum_floor)  #Want a reasonably high lower bound

  #Stop drawing once no more than this many tokens remain
  stopping_threshold_size = minimum

  new_list = []
  for entry in words:
    length = len(entry)

    #Short enough already: keep as is
    if length <= upper_bound_input_size:
      new_list.append(entry)
      continue

    cursor = 0                      #rolling lower index into the entry
    bound = upper_bound_input_size  #upper bound for the next draw
    while True:
      draw = pull_draw(bound, minimum, mean_len)
      chunk = entry[cursor:cursor + draw]
      if chunk:                     #never emit an empty chunk
        new_list.append(chunk)
      cursor += draw

      remaining = length - cursor
      if remaining <= stopping_threshold_size:
        break
      #later draws are also bounded by the amount of space left
      bound = min(upper_bound_input_size, remaining)

    #Append everything that is left, INCLUDING the final token
    #(the previous slice ended at length-1 and silently dropped it).
    tail = entry[cursor:]
    if tail:
      new_list.append(tail)

  return new_list

   

In [None]:
#words is a list of lists containing word tokens from each data entry

def generate_formatted_inputs_NEW(words, upper_bound_input_size=20, minimum_floor=10):
  """Split over-long tokenised sentences into short chunks, returned as arrays.

  Args:
    words: list of lists; each inner list holds the word tokens of one entry.
    upper_bound_input_size: entries with more tokens than this are split
      (20 here, changed from 75).
    minimum_floor: lower limit for the minimum chunk size (10 here, changed
      from 15); the effective minimum is the larger of this and the shortest
      entry length.

  Returns:
    A 1-D NumPy array of dtype object whose elements are NumPy arrays of
    tokens, one per chunk. Split entries contribute consecutive chunks that
    together contain every token of the entry, in order.
  """
  if not words:
    return np.empty(0, dtype=object)

  #Calc lengths to check sizes
  token_counts = [len(entry) for entry in words]
  mean_len = sum(token_counts) / len(token_counts)
  minimum = max(min(token_counts), minimum_floor)  #Want a reasonably high lower bound

  #Stop drawing once no more than this many tokens remain
  stopping_threshold_size = minimum

  chunks = []
  for entry in words:
    length = len(entry)

    #Short enough already: keep as is
    if length <= upper_bound_input_size:
      chunks.append(np.array(entry))
      continue

    cursor = 0                      #rolling lower index into the entry
    bound = upper_bound_input_size  #upper bound for the next draw
    while True:
      draw = pull_draw(bound, minimum, mean_len)
      piece = entry[cursor:cursor + draw]
      if piece:                     #never emit an empty chunk
        chunks.append(np.array(piece))
      cursor += draw

      remaining = length - cursor
      if remaining <= stopping_threshold_size:
        break
      #later draws are also bounded by the amount of space left
      bound = min(upper_bound_input_size, remaining)

    #Append everything that is left, INCLUDING the final token
    #(the previous slice ended at length-1 and silently dropped it).
    tail = entry[cursor:]
    if tail:
      chunks.append(np.array(tail))

  #Fill a pre-allocated object array element by element. np.array(chunks)
  #warns (or raises on newer NumPy) for ragged input and silently becomes a
  #2-D array when every chunk happens to have the same length.
  packed = np.empty(len(chunks), dtype=object)
  for position, chunk in enumerate(chunks):
    packed[position] = chunk
  return packed

   

In [None]:
def format_transcript_data(Channel_Name, max_dim_size=20,
                           transcripts_dir='/content/drive/MyDrive/Style-Transfer/Youtuber-Transcripts'):
  """Load one channel's transcripts and return fixed-length token sequences.

  Reads `{transcripts_dir}/{Channel_Name}.csv` (first column used as index),
  cleans each transcript, tokenises it into sentences and words, splits long
  sentences into chunks, and pads/truncates every chunk.

  Args:
    Channel_Name: file stem of the channel's CSV.
    max_dim_size: length of every returned sequence, including the leading
      '<sos>' token (20 here, changed from 75).
    transcripts_dir: folder holding the per-channel CSV files. This may need
      to change depending on the machine.

  Returns:
    A list of lists of length `max_dim_size`: '<sos>', then up to
    `max_dim_size - 1` tokens, then '<pad>' fill.

  Also prints the video count, the number of sequences, and (when there is at
  least one video) the mean number of sequences per video.
  """
  file_path = f'{transcripts_dir}/{Channel_Name}.csv'
  df = pd.read_csv(file_path, index_col = 0)

  num_videos = len(df.index)
  ### Use to get analytics on tokens
  print(f"total num of videos for {Channel_Name}: ",num_videos) # number of the videos

  #Collect chunks in a plain list; stacking arrays inside the loop copied the
  #whole accumulated array on every video.
  all_transcript_inputs = []
  for video_id in df.index:
    #Get transcript as single string and perform text cleaning
    words_string = clean_words_string(video_id, df)

    #Break string into sequence of sentence tokens: NLTK sent_tokenizer preferred
    sentence_list = sent_tokenize(words_string)

    #Break every sentence into word tokens
    word_tokenization_list = [word_tokenize(sentence) for sentence in sentence_list]
    if not word_tokenization_list:
      #Nothing to format for an empty transcript
      continue

    #Iterate over the first axis so each element is one chunk of tokens, even
    #if the chunks come back as rows of a 2-D array (flattening such an array
    #would yield single tokens instead of chunks).
    all_transcript_inputs.extend(generate_formatted_inputs_NEW(word_tokenization_list))

  body_len = max_dim_size - 1  #one slot is reserved for '<sos>'
  result = []
  for chunk in all_transcript_inputs:
    tokens = list(chunk[:body_len])
    tokens.extend(['<pad>'] * (body_len - len(tokens)))
    result.append(['<sos>'] + tokens)

  num_tokens= len(result)
  print(num_tokens) #sentence tokens

  if num_videos:
    print(num_tokens/ num_videos)
  return result

In [None]:


def make_vocabulary_map(loaded_corpus, slice_index=9999,
                        filepath_bow='/content/drive/MyDrive/Style-Transfer/Data/BoW_file.json'):
  """Build a frequency-ordered vocabulary from a corpus of token sequences.

  Args:
    loaded_corpus: iterable of sentences, each an iterable of tokens.
    slice_index: maximum number of distinct tokens kept (most frequent first).
    filepath_bow: JSON file that receives the token -> count mapping.

  Returns:
    (ordered_dict, vocab_mapping): the token -> count OrderedDict in
    descending-frequency order, and the torchtext vocab built from it with
    '<unk>' registered as a special token.
  """
  #Count every token across all sentences
  counter = Counter(token for sentence in loaded_corpus for token in sentence)

  #Keep only the most frequent tokens, in descending order of frequency
  ordered_dict = OrderedDict(counter.most_common(slice_index))

  with open(filepath_bow, "w") as outfile:
      json.dump(ordered_dict, outfile)

  unk_token = '<unk>'
  vocab_mapping = vocab(ordered_dict, specials=[unk_token])

  #Out-of-vocabulary tokens must resolve to the '<unk>' entry. The previous
  #default of -1 is not a valid position in the vocabulary.
  vocab_mapping.set_default_index(vocab_mapping[unk_token])
  return ordered_dict, vocab_mapping

In [None]:
 # Two-channel run: build the padded sentences for each channel.
channels = ['Kings and Generals', '3Blue1Brown']
per_channel_sentences = [format_transcript_data(name) for name in channels]

# Number of sentences per channel, in the same order as `channels`.
lengths = [len(sentences) for sentences in per_channel_sentences]

# Flatten to one list of padded sentences (first channel's sentences first).
corpus = [sentence for sentences in per_channel_sentences for sentence in sentences]

total num of videos for Kings and Generals:  735


  return np.array(new_list)


91121
123.97414965986394
total num of videos for 3Blue1Brown:  119


  return np.array(new_list)


22060
185.3781512605042


In [None]:
# Channels processed in the full run; their order fixes the order of
# `corpus` and `lengths` below.
channels = [
    "ElectroBOOM",
    "Historia Civilis",
    "Kings and Generals",
    "PBS Eons",
    "Moth Light Media",
    "3Blue1Brown",
    "Dr Dray",
]

# One list of padded sentences per channel (not flattened).
corpus = list(map(format_transcript_data, channels))

# Number of sentences produced for each channel.
lengths = list(map(len, corpus))


total num of videos for ElectroBOOM:  198


  return np.array(new_list)


11788
59.535353535353536
total num of videos for Historia Civilis:  83


  return np.array(new_list)


16561
199.53012048192772
total num of videos for Kings and Generals:  735


  return np.array(new_list)


91230
124.12244897959184
total num of videos for PBS Eons:  225


  return np.array(new_list)


13649
60.66222222222222
total num of videos for Moth Light Media:  101


  return np.array(new_list)


199
1.9702970297029703
total num of videos for 3Blue1Brown:  119


  return np.array(new_list)


22190
186.47058823529412
total num of videos for Dr Dray:  2428


  return np.array(new_list)


676123
278.46911037891266


In [None]:
# Destination on the mounted Drive for the serialised vocabulary object.
vocab_path = '/content/drive/MyDrive/Style-Transfer/Data/Vocabulary_obj.pth'
# NOTE(review): no earlier cell assigns `vocabulary_map` (the make_vocabulary_map
# call in the two-channel cell is commented out; it is only assigned further
# down) - confirm the intended run order before relying on this cell.
torch.save(vocabulary_map, vocab_path)  #Save Vocab Index

# Disabled: saving the corpus itself alongside the vocabulary.
#corpus_path = '/content/drive/MyDrive/Style-Transfer/Data/Corpus.npy'
#np.save(corpus_path,np.array(corpus), allow_pickle=True)

In [None]:

#len(vocabulary_map)

In [None]:
# Display the per-channel sentence counts computed when the corpus was built.
lengths

[91130, 22265]

In [None]:
# One label per sentence: the first lengths[0] sentences belong to the first
# channel, the rest to the second. The boundary is read from `lengths` rather
# than a hard-coded count, which goes stale whenever the corpus is rebuilt.
labels1 = ['Kings and Generals' if i < lengths[0] else '3Blue1Brown' for i in range(sum(lengths))]



In [None]:
# Repeat each channel name once per sentence that channel contributed,
# keeping channels in their original order.
labels = []
for channel_pos, sentence_count in enumerate(lengths):
  labels.extend(channels[channel_pos] for _ in range(sentence_count))

# Check against the comprehension-based labels built in the previous cell.
labels==labels1

False

In [None]:
# Write the per-sentence channel labels to the shared Drive folder.
labels_path = '/content/drive/MyDrive/Style-Transfer/Data/labels.npy'
np.save(file=labels_path, arr=labels)

In [None]:
# NOTE(review): the setup cell at the top of the notebook already imports
# `drive` and mounts /content/drive; the recorded output of this cell is
# "Drive already mounted", so this repeat appears redundant.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Scratch checks of the vocabulary object. Only the value of the last
# expression in a cell is displayed; the earlier lines are evaluated and
# discarded.

#Get token from index
vocabulary_map.lookup_token(5)
#Get index from token
vocabulary_map['and']

vocabulary_map['in']  #Word2index: token -> index
vocabulary_map.lookup_token(10)  #index2Word: index -> token
vocabulary_map.get_itos() #BOW

In [None]:

# Load the object previously written to `vocab_path` and display it.
# NOTE(review): torch.load unpickles the file, so only load files this
# project wrote itself.
test = torch.load(vocab_path)      #Load Vocab Index
test

Vocab()

In [None]:
# Even out the number of sentences per channel by randomly dropping rows of
# the first (larger) channel until it matches the second, then save the
# balanced corpus, its vocabulary and its labels.
torch.manual_seed(42)
np.random.seed(42)  # np.random.choice below uses NumPy's RNG, not torch's

temp = np.array(corpus)

# np.random.choice(n, ...) samples from range(n), so n must be lengths[0] for
# every first-channel row (indices 0 .. lengths[0]-1) to be eligible.
random_indices = np.random.choice(lengths[0], size = lengths[0] - lengths[1], replace=False)
temp_cut = np.delete(temp, obj=random_indices, axis=0)

ordered_dict, vocabulary_map = make_vocabulary_map(temp_cut)

# After the cut each channel contributes lengths[1] rows, so there must be
# exactly one label per remaining row (not one per original row).
labels = ['Kings and Generals' if i < lengths[1] else '3Blue1Brown' for i in range(len(temp_cut))]
assert len(labels) == len(temp_cut)

corpus_path = '/content/drive/MyDrive/Style-Transfer/Data/Corpus.npy'
np.save(corpus_path,temp_cut, allow_pickle=True)

vocab_path = '/content/drive/MyDrive/Style-Transfer/Data/Vocabulary_obj.pth'#tensor object from torch
torch.save(vocabulary_map, vocab_path)  #Save Vocab Index

labels_path = '/content/drive/MyDrive/Style-Transfer/Data/labels.npy' # disparate labels from code
np.save(labels_path, labels)

In [None]:
# Rebuild the vocabulary from the balanced corpus `temp_cut` and save it.
# NOTE(review): the preceding balancing cell already runs these same
# make_vocabulary_map / torch.save steps with the same path - presumably a
# manual re-run; confirm whether this cell is still needed.
ordered_dict, vocabulary_map = make_vocabulary_map(temp_cut)
vocab_path = '/content/drive/MyDrive/Style-Transfer/Data/Vocabulary_obj.pth'
torch.save(vocabulary_map, vocab_path)  #Save Vocab Index

In [None]:
#44264//128

In [None]:
# corpus_path = '/content/drive/MyDrive/Style-Transfer/Data/Corpus.npy'

# test = np.load(corpus_path, allow_pickle = True)

In [None]:
#test[0]

array(['<sos>', 'at', 'pretty', 'much', 'every', ',', 'place', 'they',
       'had', 'attempted', 'to', 'invade', '<pad>', '<pad>', '<pad>',
       '<pad>', '<pad>', '<pad>', '<pad>', '<pad>'], dtype='<U67')

In [None]:
#Cut corpus into halves:



In [None]:
#list_period_delim = test_str.split(sep='.')
#list_comma_delim = test_str.split(sep=',')
#list_nltk_sentence_tokenizer = sent_tokenize(test_str)

#print(len(list_period_delim))
#print(len(list_comma_delim))
#print(len(list_nltk_sentence_tokenizer))

#[sent_tokenize(token) for token in list_comma_delim]

#[len(token) for token in list_nltk_sentence_tokenizer]