In [1]:
from tensorflow import keras
from typing import List
from keras.preprocessing.text import Tokenizer
import nltk
import os
import pandas as pd

In [2]:
path = os.getcwd()
src_folder = os.path.abspath(os.path.join(path, os.pardir))
project_folder = os.path.abspath(os.path.join(src_folder, os.pardir))
outside_folder = os.path.abspath(os.path.join(project_folder, os.pardir))
data_folder = outside_folder + '/sb-mirror'
sponsor_df_save_path = data_folder + '/sponsor_dataframe.csv'
sponsor_df_save_path

'/Users/jinlingxing/Projects/Kaggle/sb-mirror/sponsor_dataframe.csv'

In [4]:
sponsor_df = pd.read_csv(sponsor_df_save_path)
sponsor_df.head()

Unnamed: 0.1,Unnamed: 0,videoID,Transcript,channelID,title,published,sponsored
0,0,GaGphoDeT2w,in the anime Community we make up a lot. of wo...,UCr8XdVBXUrjEYX3nxobTmIQ,Sasuke is the REAL Hokage?!,0.0,False
1,1,JzB7yS9t1YE,[MUSIC PLAYING]. LILY PENG: Hi everybody.. My ...,UC_x5XG1OV2P6uZZ5FSM9Ttw,Bringing AI and machine learning innovations t...,1525910000.0,False
2,2,9g_Q0QPsOtI,[Music]. hey guys welcome back to another vide...,UCQ2k71p7MJKU9iPpKfSWYOA,Turning My OC into a Desktop Buddy (Shimeji)! ...,0.0,False
3,3,P6aUSrw03bE,this video was sponsored by morningbrew. hey h...,UCRG_N2uO405WO4P3Ruef9NA,Phone labels: The EU's best idea yet,1662077000.0,True
4,4,RPO57PLwdY0,foreign. [Music]. welcome to audit the audit w...,UCc-0YpRpqgA5lPTpSQ5uo-Q,This Cop Doesn't Understand Basic Civilian Rights,0.0,True


In [6]:
one_transcript = sponsor_df.iloc[0]['Transcript']
one_transcript

"in the anime Community we make up a lot. of words that just don't exist in the. show or the manga words like gin cloak. jubidara jubita all things that we as a. community either shortened because we. talk about it so much or just made up. because it sounds cool however things is. a community that we make up isn't simply. limited to terms we also create entirely. fictional and non-real roles for. characters and shows what do I mean by. that well the most popular example of. this would have to be the shadow kage a. word for a character who may not be a. kage in name but as a kage enroll. essentially these Shadow kage do. everything but wear the big hat they're. usually the second most powerful person. in the village at the time whatever. respective kage's ruling and while the. actual Kake stays in the office and does. the paperwork this shadow kage goes off. and deals with the actual threats that. are threatening the respected Village. this term doesn't actually exist it's. never stated

**Tokenize: split the sentence into words**

In [7]:
tokenized_transcript = nltk.word_tokenize(one_transcript)

**Lemmatization: Stemming, but resulting stems are all valid words**

In [8]:
sponsor_lemma = nltk.WordNetLemmatizer()

In [100]:
lemmatized_transcript = ' '.join([sponsor_lemma.lemmatize(t) for t in tokenized_transcript])
lemmatized_transcript = lemmatized_transcript.lower()
lemmatized_transcript

"in the anime community we make up a lot . of word that just do n't exist in the . show or the manga word like gin cloak . jubidara jubita all thing that we a a. community either shortened because we . talk about it so much or just made up . because it sound cool however thing is . a community that we make up is n't simply . limited to term we also create entirely . fictional and non-real role for . character and show what do i mean by . that well the most popular example of . this would have to be the shadow kage a. word for a character who may not be a. kage in name but a a kage enroll . essentially these shadow kage do . everything but wear the big hat they 're . usually the second most powerful person . in the village at the time whatever . respective kage 's ruling and while the . actual kake stay in the office and doe . the paperwork this shadow kage go off . and deal with the actual threat that . are threatening the respected village . this term doe n't actually exist it 's . ne

Use **Spacy** as a **Lemmatization** method since the above lemmatization didn't convert 'running' to 'run'

In [101]:
import spacy

In [102]:
# Initialize spacy 'en' model, keeping only tagger component needed for lemmatization
spacy_nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [103]:
# Parse the lemmatized transcript using the loaded 'en' model object spacy_nlp
spacy_parse_transcript = spacy_nlp(lemmatized_transcript)

In [105]:
spacy_transcript_list = [token.lemma_ for token in spacy_parse_transcript]

Here, 'running' is lemmalized to 'run'

**Filter out stop words** <br/>
Stopwords such as the, and, etc. should be removed. <br/>
Words that have fewer than 3 characters are removed.

In [106]:
from nltk.corpus import stopwords

In [107]:
stop_words_nltk = list(stopwords.words('english'))
stop_words_spacy = spacy_nlp.Defaults.stop_words
stop_words = stop_words_nltk + list(stop_words_spacy)

In [108]:
filtered_stopwords = [word for word in spacy_transcript_list if word not in stop_words and len(word) >3]
filtered_stopwords

**Bag of words**

In [109]:
vocab = set(filtered_stopwords)
len_vector = len(vocab)

In [110]:
index_word = {}
i = 0
for word in vocab:
    index_word[word] = i 
    i += 1

In [111]:
import numpy as np
from collections import defaultdict 
def bag_of_words(sent):
    count_dict = defaultdict(int)
    vec = np.zeros(len_vector)
    for item in sent:
        count_dict[item] += 1
    for key, item in count_dict.items():
        vec[index_word[key]] = item
    return vec  

In [123]:
vector = bag_of_words(filtered_stopwords)
vector

array([  1.,   1.,   2.,   3.,   1.,   1.,   1.,   1.,   2.,   1.,   1.,
         1.,   1.,   5.,   1.,   1.,   1.,   1.,   1.,   1.,   1.,   1.,
         1.,   6.,   1.,   1.,   1.,   5.,   1.,   2.,   1.,   2.,  14.,
         1.,   5.,   1.,   9.,   1.,   1.,  17.,   1.,   2.,   3.,   1.,
         1.,   2.,   1.,   2.,   1.,   1.,   3.,   1.,   1.,   2.,   2.,
         1.,   2.,   1.,   4.,   1.,   2.,   1.,   9.,  11.,   1.,   1.,
         1.,   4.,   1.,   4.,   1.,   1.,   1.,   8.,   4.,   1.,   1.,
         2.,   1.,   1.,   3.,   1.,   1.,   6.,   1.,   1.,   1.,   4.,
         1.,   1.,   1.,   1.,   7.,   1.,   7.,   2.,   1.,   4.,   1.,
         1.,  12.,   5.,   1.,   2.,   4.,   1.,   1.,   1.,   1.,   1.,
         1.,   1.,   1.,   1.,   5.,   2.,   6.,  13.,   1.,   3.,   9.,
         1.,   1.,   1.,   2.,   9.,   2.,   2.,   1.,   1.,   1.,   1.,
         4.,   1.,   1.,   2.,   1.,   1.,   1.,   1.,   2.,   1.,   3.,
         1.,   1.,   1.,   5.,   2.,   1.,   2.,   

In [113]:
index_word

{'konawas': 0,
 'involved': 1,
 'coup': 2,
 'close': 3,
 'shoe': 4,
 'anybody': 5,
 'shinobis': 6,
 'kidnapping': 7,
 'base': 8,
 'equally': 9,
 'choose': 10,
 'caveat': 11,
 'legitimate': 12,
 'kona': 13,
 'christmas': 14,
 'flying': 15,
 'corn': 16,
 'runner': 17,
 'black': 18,
 'sudo': 19,
 'focused': 20,
 'shinyoke': 21,
 'dragon': 22,
 'push': 23,
 'chiha': 24,
 'yishiki': 25,
 'ship': 26,
 'child': 27,
 'democratic': 28,
 'reconnaissance': 29,
 'susano': 30,
 'sell': 31,
 'people': 32,
 'replacement': 33,
 'essentially': 34,
 'importantly': 35,
 'important': 36,
 'cynthia': 37,
 'percent': 38,
 'minato': 39,
 'modera': 40,
 'yeah': 41,
 'respective': 42,
 'simultaneously': 43,
 'keeping': 44,
 'information': 45,
 'wielder': 46,
 'happen': 47,
 'save': 48,
 'view': 49,
 'wall': 50,
 'jaraya': 51,
 'minus': 52,
 'assassination': 53,
 'wood': 54,
 'irony': 55,
 'evil': 56,
 'diplomacy': 57,
 'point': 58,
 'gather': 59,
 'darkness': 60,
 'category': 61,
 'kage': 62,
 'actually': 63,


In [114]:
def largest_indices(ary, n):
    """Returns the n largest indices from a numpy array."""
    flat = ary.flatten()
    indices = np.argpartition(flat, -n)[-n:]
    indices = indices[np.argsort(-flat[indices])]
    return np.unravel_index(indices, ary.shape)

In [115]:
max_n_indices = largest_indices(vector, 10)
max_n_indices[0]

array([710, 421, 704, 380, 220, 385, 579, 527,  39, 624])

In [124]:
vector[421]

65.0

In [116]:
keys = [k for k, v in index_word.items() for idx in max_n_indices[0] if v == idx]
keys

['minato',
 'naruto',
 'village',
 'akatsuki',
 'shadow',
 'thing',
 'konoha',
 'toby',
 'like',
 'hokage']

We can also apply **LDA(Latent Dirichlet Allocation)** for topic modelling 