week4 objectives:
1. topic extraction on an MSNBC gun-control story 
2. compare topics with the FNC story(last 3 weeks)
4. script for topic extraction

In [1]:
import os

import warnings
warnings.filterwarnings('ignore')

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
import tensorflow as tf


In [2]:
import json

with open("../w1/meta/transcript.json") as f:
    data = json.load(f)

In [3]:
data = data["pro_nbc.mp4"]

In [4]:
def get_sentences(segments):
    '''
    merge segments to sentences
    '''
    sentences, temp = [], ""

    for _, i  in enumerate(segments):
        # with period
        if not len(temp) and "." in i['text']:
            
            sentences.append({
                'text': i['text'].strip(),
                'start': round(i['start'], 3),
                'end': round(i['end'], 3)
            }) 
            
            continue
        # first condition fails
        elif not len(temp):

            temp, start = i['text'], round(i['start'], 3)

            continue

        temp += i['text']

        if "." in i['text']:

            sentences.append({
                'text': temp.strip(),
                'start': start,
                'end': round(i['end'], 3)
            })
            temp = ""
    
    return sentences


In [5]:
sentences = get_sentences(data)

In [6]:
sentences[ : 5]

[{'text': 'But we begin tonight with a tragedy that is unique to America.',
  'start': 0.0,
  'end': 4.3},
 {'text': 'Another school massacre, this time in Nashville, Tennessee, where a 28-year-old woman shot and killed three students and three staff members at a private Christian school.',
  'start': 4.3,
  'end': 15.0},
 {'text': 'The shooter was armed with two assault-style rifles and a handgun.',
  'start': 15.0,
  'end': 19.3},
 {'text': 'The Nashville Police Department said the shooter was a former student at the school who identifies as trans.',
  'start': 19.3,
  'end': 26.6},
 {'text': 'The children who were fatally shot are Evelyn Dykhouse, Hallie Scruggs, and William Kinney, all elementary school age.',
  'start': 26.8,
  'end': 35.2}]

In [7]:
import nltk

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

from nltk.tag import pos_tag

import yake


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/karanjot/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/karanjot/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/karanjot/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [8]:
def get_meta(sentence):
    '''
    fetch keywords and nouns
    '''
    extractor = yake.KeywordExtractor()
    words = extractor.extract_keywords(sentence)

    words = [i[0] for i in words if i[1] > 0.05 and len(i[0].split()) == 1]

    tagged_senten = pos_tag(sentence.split())
    
    proper_nouns = [word for word, pos in tagged_senten if pos == 'NNP']

    return [words, proper_nouns]
    

In [9]:
def make_chunks(sentences, dur=5.0):
  i, chunks = 0, []

  while i < len(sentences):
    # check duration
    phrase, start, end = sentences[i]['text'], sentences[i]['start'], sentences[i]['end']

    while end - start < dur:
      i += 1
      try:
        phrase, end = phrase + ' ' + sentences[i]['text'], sentences[i]['end']
      except IndexError: break

    words, nouns = get_meta(phrase.strip())
      
    chunks.append({
        'text': phrase.strip(),
        'start': round(start, 3),
        'end': round(end, 3),
        'words': words,
        'nouns': nouns
    })
    i += 1

  return chunks


In [10]:
chunks = make_chunks(data, 5.0)

In [11]:
chunks[ : 2]

[{'text': 'But we begin tonight with a tragedy that is unique to America.  Another school massacre, this time in Nashville, Tennessee,',
  'start': 0.0,
  'end': 7.9,
  'words': ['America',
   'Tennessee',
   'Nashville',
   'begin',
   'tonight',
   'tragedy',
   'unique',
   'massacre',
   'school',
   'time'],
  'nouns': ['America.', 'Nashville,', 'Tennessee,']},
 {'text': 'where a 28-year-old woman shot and killed three students  and three staff members at a private Christian school.',
  'start': 7.9,
  'end': 15.0,
  'words': ['Christian',
   'woman',
   'school',
   'shot',
   'killed',
   'students',
   'staff',
   'members',
   'private'],
  'nouns': []}]

In [20]:
data = chunks

In [21]:
from sentence_transformers import SentenceTransformer


# paraphrase-multilingual-mpnet-base-v2

sentenc_model = SentenceTransformer('distilbert-base-nli-mean-tokens')

In [22]:
# extract embeddings
embeddings = sentenc_model.encode([i['text'] for i in data], show_progress_bar=True)

embeddings.shape  # num of phrases = 244

Batches: 100%|██████████| 3/3 [00:00<00:00,  7.53it/s]


(92, 768)

In [23]:
# stitched together
import umap
import hdbscan
from bertopic import BERTopic

from sklearn.feature_extraction.text import CountVectorizer

# sub models
vectorizer = CountVectorizer(stop_words='english')

reduc_model = umap.UMAP(n_neighbors=5, n_components=25, metric='cosine')
clustr_model = hdbscan.HDBSCAN(min_cluster_size=5)

model = BERTopic(embedding_model=sentenc_model, 
                 umap_model=reduc_model, 
                 hdbscan_model=clustr_model, 
                 vectorizer_model=vectorizer).fit([i['text'] for i in data], embeddings)

In [24]:
model.get_topic_info().head(15)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,34,-1_guns_children_like_ar,"[guns, children, like, ar, today, school, book...",[But the rest of them have guns. And this has ...
1,0,20,0_gun_laws_lobby_guns,"[gun, laws, lobby, guns, talk, want, shannon, ...",[electing people who support the gun lobby and...
2,1,19,1_shootings_school_year_country,"[shootings, school, year, country, mass, old, ...","[It's, you know, I have, and these are three n..."
3,2,14,2_people_know_time_going,"[people, know, time, going, random, read, pani...",[that white people once hated black people dur...
4,3,5,3_tennessee_land_book_state,"[tennessee, land, book, state, dolly, ironical...","[It is a children's book, by the way. And it's..."


In [25]:
# nicer labels
topic_labels = model.generate_topic_labels(nr_words=3, topic_prefix=False, word_length=15, separator=" ")

topic_labels

['guns children like',
 'gun laws lobby',
 'shootings school year',
 'people know time',
 'tennessee land book']

In [26]:
model.visualize_documents([i['text'] for i in data], embeddings=embeddings, hide_annotations=False, custom_labels=True)

In [27]:
# topics = {}

topics["hdbscan_5_chunk"] = topic_labels

In [28]:
with open("topics_pro_nbc.json", 'w') as f:
    json.dump(topics, f, indent=4)

main: compare topics surfacing in stories on gun-control by FCN(de) and MSNBC(pro), compare topics extracted per variation: sentence and 5 seconds chunk

sentence based topics are more detailed for both stories

script: pynb/transcribe.py