week3 objectives:
1. topic extraction for variations of transcript: phrase, sentence and chunk
2. visualize topic clusters
3. compare quality of topics for variations

In [1]:
import os

import warnings
warnings.filterwarnings('ignore')

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
import tensorflow as tf

In [2]:
# install
%pip install -q bertopic

# %pip install bertopic | grep -v 'already satisfied\|wheel'

from transformers.utils import logging
# disable tqdm
logging.disable_progress_bar()

Note: you may need to restart the kernel to use updated packages.


In [104]:
import json

with open("../w2/chunk.json") as f:
    data = json.load(f)

In [105]:
print(data['data'][0]['duration'])

data = data['data'][0]['chunks']

5.0


In [106]:
from sentence_transformers import SentenceTransformer


# paraphrase-multilingual-mpnet-base-v2

sentenc_model = SentenceTransformer('distilbert-base-nli-mean-tokens')

In [107]:
# extract embeddings
embeddings = sentenc_model.encode([i['text'] for i in data], show_progress_bar=True)

embeddings.shape  # num of phrases = 244

Batches: 100%|██████████| 4/4 [00:00<00:00,  8.68it/s]


(124, 768)

In [108]:
import umap

# dimensionality reduction
'''
dim_embeddings = umap.UMAP(n_neighbors=5, n_components=20, metric='cosine').fit_transform(embeddings)

dim_embeddings.shape
'''

"\ndim_embeddings = umap.UMAP(n_neighbors=5, n_components=20, metric='cosine').fit_transform(embeddings)\n\ndim_embeddings.shape\n"

In [109]:
import hdbscan

# clustering
'''
clusters = hdbscan.HDBSCAN(min_cluster_size=7).fit(dim_embeddings)
'''

'\nclusters = hdbscan.HDBSCAN(min_cluster_size=7).fit(dim_embeddings)\n'

In [114]:
# stitched together
from bertopic import BERTopic

from sklearn.feature_extraction.text import CountVectorizer

# sub models
vectorizer = CountVectorizer(stop_words='english')

reduc_model = umap.UMAP(n_neighbors=5, n_components=25, metric='cosine')
clustr_model = hdbscan.HDBSCAN(min_cluster_size=5)

model = BERTopic(embedding_model=sentenc_model, 
                 umap_model=reduc_model, 
                 hdbscan_model=clustr_model, 
                 vectorizer_model=vectorizer).fit([i['text'] for i in data], embeddings)

In [115]:
model.get_topic_info().head(15)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,24,-1_people_trans_gun_don,"[people, trans, gun, don, guns, control, owner...","[them as we are, say, Ukraine's trans army? A..."
1,0,28,0_npr_know_ussie_radio,"[npr, know, ussie, radio, reporter, national, ...",[But it turns out NPR didn't tell the entire s...
2,1,25,1_like_want_dangerous_fear,"[like, want, dangerous, fear, curious, people,...","[a good thing. The pedophile curious, now the..."
3,2,18,2_trans_thing_firearms_don,"[trans, thing, firearms, don, 15, bang, need, ...",[Ban assault weapons and high capacity magazin...
4,3,15,3_targeting_changing_hampshire_shooting,"[targeting, changing, hampshire, shooting, tra...",[some queer people to take up arms. New Hamps...
5,4,9,4_day_got_frivolity_yard,"[day, got, frivolity, yard, measurably, litera...",[I got it done once. I'm going to do it again...
6,5,5,5_party_instance_eastern_living,"[party, instance, eastern, living, head, neigh...","[Joe Biden, for instance, makes this point reg..."


In [121]:
model.get_topic_info().Name[1]

'0_npr_know_ussie_radio'

In [122]:
model.get_topic_info().Representative_Docs[1]

["But it turns out NPR didn't tell the entire story.  What NPR left out was that Ussie had shot a sheriff's deputy prior to killing himself.",
 "pronouns than human life.  NPR would rather you didn't know any of this.  We only know that this actually happened because reporters like Jason Rantz and Andy Ngo, one",
 'Watch out.  Wait a second, we thought.  This is NPR, national public radio, suddenly telling you that actually guns are good.']

In [116]:
# nicer labels
topic_labels = model.generate_topic_labels(nr_words=3, topic_prefix=False, word_length=15, separator=" ")

topic_labels

['people trans gun',
 'npr know ussie',
 'like want dangerous',
 'trans thing firearms',
 'targeting changing hampshire',
 'day got frivolity',
 'party instance eastern']

In [117]:
model.visualize_documents([i['text'] for i in data], embeddings=embeddings, hide_annotations=False, custom_labels=True)

In [81]:
# topics = {}

topics['hdbscan_5_chunk'] = topic_labels

In [84]:
with open("topics_de_fnc.json", 'w') as f:
    json.dump(topics, f, indent=4)

main: compare topics for variations- phrase, sentence, 5 seconds chunk, 10 seconds chunk, 15 seconds chunk, 20 seconds chunk

observations:
1. with increase in length of samples i.e. increased context per sample, number of clusters reduce
2. clusters become more sparse with increased context
3. no such trend among the count of background samples(cluster: -1)
4. quality of topics is comparable for sentence and 5 seconds chunk