In [1]:
from tensorflow import keras
from typing import List
from keras.preprocessing.text import Tokenizer
import nltk
import os
import pandas as pd
import numpy as np
import spacy
from nltk.corpus import stopwords
from tqdm import tqdm

In [2]:
path = os.getcwd()
src_folder = os.path.abspath(os.path.join(path, os.pardir))
project_folder = os.path.abspath(os.path.join(src_folder, os.pardir))
outside_folder = os.path.abspath(os.path.join(project_folder, os.pardir))
data_folder = outside_folder + '/sb-mirror'
sponsor_df_save_path = data_folder + '/sponsor_dataframe.csv'

# initialize tqdm for pandas
tqdm.pandas() 

In [3]:
sponsor_df = pd.read_csv(sponsor_df_save_path)
sponsor_df.head()

Unnamed: 0.1,Unnamed: 0,videoID,Transcript,channelID,title,published,sponsored
0,0,GaGphoDeT2w,in the anime Community we make up a lot. of wo...,UCr8XdVBXUrjEYX3nxobTmIQ,Sasuke is the REAL Hokage?!,0.0,False
1,1,JzB7yS9t1YE,[MUSIC PLAYING]. LILY PENG: Hi everybody.. My ...,UC_x5XG1OV2P6uZZ5FSM9Ttw,Bringing AI and machine learning innovations t...,1525910000.0,False
2,2,9g_Q0QPsOtI,[Music]. hey guys welcome back to another vide...,UCQ2k71p7MJKU9iPpKfSWYOA,Turning My OC into a Desktop Buddy (Shimeji)! ...,0.0,False
3,3,P6aUSrw03bE,this video was sponsored by morningbrew. hey h...,UCRG_N2uO405WO4P3Ruef9NA,Phone labels: The EU's best idea yet,1662077000.0,True
4,4,RPO57PLwdY0,foreign. [Music]. welcome to audit the audit w...,UCc-0YpRpqgA5lPTpSQ5uo-Q,This Cop Doesn't Understand Basic Civilian Rights,0.0,True


In [4]:
sponsor_df.iloc[0]['Transcript']

"in the anime Community we make up a lot. of words that just don't exist in the. show or the manga words like gin cloak. jubidara jubita all things that we as a. community either shortened because we. talk about it so much or just made up. because it sounds cool however things is. a community that we make up isn't simply. limited to terms we also create entirely. fictional and non-real roles for. characters and shows what do I mean by. that well the most popular example of. this would have to be the shadow kage a. word for a character who may not be a. kage in name but as a kage enroll. essentially these Shadow kage do. everything but wear the big hat they're. usually the second most powerful person. in the village at the time whatever. respective kage's ruling and while the. actual Kake stays in the office and does. the paperwork this shadow kage goes off. and deals with the actual threats that. are threatening the respected Village. this term doesn't actually exist it's. never stated

In [4]:
def preprocess(one_transcript: str):
    # Lemmatization
    tokenized_transcript = nltk.word_tokenize(one_transcript)
    sponsor_lemma = nltk.WordNetLemmatizer()
    lemmatized_transcript = ' '.join([sponsor_lemma.lemmatize(t) for t in tokenized_transcript])
    lemmatized_transcript = lemmatized_transcript.lower()
    spacy_nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
    spacy_parse_transcript = spacy_nlp(lemmatized_transcript)
    spacy_transcript_list = [token.lemma_ for token in spacy_parse_transcript]
    
    # Remove stopwords
    stop_words_nltk = list(stopwords.words('english'))
    stop_words_spacy = spacy_nlp.Defaults.stop_words
    stop_words = stop_words_nltk + list(stop_words_spacy)
    filtered_stopwords = [word for word in spacy_transcript_list if word not in stop_words and len(word) >3]
    return filtered_stopwords

In [5]:
sponsor_df['processed_transcript'] = sponsor_df['Transcript'].progress_apply(lambda x: preprocess(x))

100%|███████████████████████████████████████████| 1372/1372 [12:56<00:00,  1.77it/s]


In [7]:
# save the processed sponsor transcripts locally
processed_transcript_save_path = data_folder + '/processed_sponsor_dataframe.csv'
sponsor_df.to_csv(processed_transcript_save_path)
sponsor_df

Unnamed: 0.1,Unnamed: 0,videoID,Transcript,channelID,title,published,sponsored,processed_transcript
0,0,GaGphoDeT2w,in the anime Community we make up a lot. of wo...,UCr8XdVBXUrjEYX3nxobTmIQ,Sasuke is the REAL Hokage?!,0.000000e+00,False,"[anime, community, word, exist, manga, word, l..."
1,1,JzB7yS9t1YE,[MUSIC PLAYING]. LILY PENG: Hi everybody.. My ...,UC_x5XG1OV2P6uZZ5FSM9Ttw,Bringing AI and machine learning innovations t...,1.525910e+09,False,"[music, play, lily, peng, everybody, lily, pen..."
2,2,9g_Q0QPsOtI,[Music]. hey guys welcome back to another vide...,UCQ2k71p7MJKU9iPpKfSWYOA,Turning My OC into a Desktop Buddy (Shimeji)! ...,0.000000e+00,False,"[music, welcome, video, tell, today, intro, li..."
3,3,P6aUSrw03bE,this video was sponsored by morningbrew. hey h...,UCRG_N2uO405WO4P3Ruef9NA,Phone labels: The EU's best idea yet,1.662077e+09,True,"[video, sponsor, morningbrew, happy, friday, t..."
4,4,RPO57PLwdY0,foreign. [Music]. welcome to audit the audit w...,UCc-0YpRpqgA5lPTpSQ5uo-Q,This Cop Doesn't Understand Basic Civilian Rights,0.000000e+00,True,"[foreign, music, welcome, audit, audit, sort, ..."
...,...,...,...,...,...,...,...,...
1367,1367,yP-lYJm8_bQ,ladies and gentlemen we have lost yet. another...,,,0.000000e+00,True,"[lady, gentleman, lose, useful, application, t..."
1368,1368,7B5cMK7gUVY,I do love it when Brands put an effort. into t...,UCwwuSBYcErVlOpveYubHv4g,"Goodies from Google! Pixel 7, Pixel 7 Pro, Pix...",0.000000e+00,False,"[love, brand, effort, unbox, experience, googl..."
1369,1369,ScHzMnAcn_s,hello everybody today we're going to. talk abo...,,,0.000000e+00,True,"[hello, everybody, today, talk, opinion, overl..."
1370,1370,C9p7WO01yA8,this episode of moist Mater brought to. you by...,UCq6VFHwMzcMXbuKyG7SQYIg,Moist Meter | M3GAN,0.000000e+00,True,"[episode, moist, mater, bring, godslap, issue,..."
