week2 objectives:
1. clean transcript to get text with timestamps
2. merge phrases to form sentences
3. merge sentences to form chunks having a minimum duration
4. keyword extraction from chunks

In [1]:
import json

with open("../w1/meta/transcript.json") as f:
    data = json.load(f)


text with timestamps

In [2]:
phrases = [{'text': i['text'].strip(), 'start': round(i['start'], 3), 'end': round(i['end'], 3)} for i in data['de_fnc.mp4']]

phrases[ : 10]

[{'text': 'Good evening and welcome to Tucker Carlson tonight.',
  'start': 0.0,
  'end': 3.56},
 {'text': "If there's one thing your average liberal understands perfectly well, it's that there's",
  'start': 3.56,
  'end': 7.96},
 {'text': 'safety in numbers.', 'start': 7.96, 'end': 10.08},
 {'text': "Don't go out alone.", 'start': 10.08, 'end': 11.48},
 {'text': 'Bring 80 million people with you.', 'start': 11.48, 'end': 13.28},
 {'text': "It's safer that way.", 'start': 13.28, 'end': 15.2},
 {'text': 'There is a reason, a fundamental reason, that Democrats are natural joiners and organizers',
  'start': 15.2,
  'end': 20.54},
 {'text': 'and petition signers and that their highest virtue is conformity.',
  'start': 20.54,
  'end': 25.86},
 {'text': "They know that as long as they're all wearing the same uniform, they'll probably be okay.",
  'start': 25.86,
  'end': 30.74},
 {'text': "This is why you'll see one person in Brookline or Bethesda raise a Ukrainian flag in the",
  'start'

In [19]:
with open("phrase.json", 'w') as f:
    json.dump(phrases, f, indent=4)


merge phrases to form sentences

In [3]:
def get_sentences(segments):
    '''
    merge segments to sentences
    '''
    sentences, temp = [], ""

    for _, i  in enumerate(segments):
        # with period
        if not len(temp) and "." in i['text']:
            
            sentences.append({
                'text': i['text'].strip(),
                'start': round(i['start'], 3),
                'end': round(i['end'], 3)
            }) 
            
            continue
        # first condition fails
        elif not len(temp):

            temp, start = i['text'], round(i['start'], 3)

            continue

        temp += i['text']

        if "." in i['text']:

            sentences.append({
                'text': temp.strip(),
                'start': start,
                'end': round(i['end'], 3)
            })
            temp = ""
    
    return sentences


In [4]:
sentences = get_sentences(data['de_fnc.mp4'])

sentences[ : 10]

[{'text': 'Good evening and welcome to Tucker Carlson tonight.',
  'start': 0.0,
  'end': 3.56},
 {'text': "If there's one thing your average liberal understands perfectly well, it's that there's safety in numbers.",
  'start': 3.56,
  'end': 10.08},
 {'text': "Don't go out alone.", 'start': 10.08, 'end': 11.48},
 {'text': 'Bring 80 million people with you.', 'start': 11.48, 'end': 13.28},
 {'text': "It's safer that way.", 'start': 13.28, 'end': 15.2},
 {'text': 'There is a reason, a fundamental reason, that Democrats are natural joiners and organizers and petition signers and that their highest virtue is conformity.',
  'start': 15.2,
  'end': 25.86},
 {'text': "They know that as long as they're all wearing the same uniform, they'll probably be okay.",
  'start': 25.86,
  'end': 30.74},
 {'text': "This is why you'll see one person in Brookline or Bethesda raise a Ukrainian flag in the yard and the very next day, everybody on the street will have one too.",
  'start': 30.74,
  'end': 4

In [20]:
with open("sentence.json", 'w') as f:
    json.dump(sentences, f, indent=4)


keyword extraction

In [5]:
%pip install -q yake  # yet another

%pip install -q rake-nltk  # rapid automatic

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [6]:
senten = "This is why you'll see one person in Brookline or Bethesda raise a Ukrainian flag in the yard and the very next day, everybody on the street will have one too."

In [7]:
import nltk

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/karanjot/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/karanjot/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/karanjot/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [8]:
from rake_nltk import Rake


rake = Rake()

rake.extract_keywords_from_text(senten)
words = rake.get_ranked_phrases()

words

['see one person',
 'ukrainian flag',
 'next day',
 'bethesda raise',
 'one',
 'yard',
 'street',
 'everybody',
 'brookline']

In [9]:
import yake


extractor = yake.KeywordExtractor()
words = extractor.extract_keywords(senten)

words = [i[0] for i in words if i[1] > 0.05]

words

['Brookline',
 'Bethesda',
 'Ukrainian',
 'day',
 'person',
 'raise',
 'flag',
 'yard',
 'street']

In [10]:
from nltk.tag import pos_tag


tagged_senten = pos_tag(senten.split())
proper_nouns = [word for word, pos in tagged_senten if pos == 'NNP']

proper_nouns

['Brookline', 'Bethesda']

minimum duration chunks with meta data

In [11]:
def get_meta(sentence):
    '''
    fetch keywords and nouns
    '''
    extractor = yake.KeywordExtractor()
    words = extractor.extract_keywords(sentence)

    words = [i[0] for i in words if i[1] > 0.05 and len(i[0].split()) == 1]

    tagged_senten = pos_tag(sentence.split())
    
    proper_nouns = [word for word, pos in tagged_senten if pos == 'NNP']

    return [words, proper_nouns]
    

In [12]:
def make_chunks(sentences, dur=5.0):
  i, chunks = 0, []

  while i < len(sentences):
    # check duration
    phrase, start, end = sentences[i]['text'], sentences[i]['start'], sentences[i]['end']

    while end - start < dur:
      i += 1
      try:
        phrase, end = phrase + ' ' + sentences[i]['text'], sentences[i]['end']
      except IndexError: break

    words, nouns = get_meta(phrase.strip())
      
    chunks.append({
        'text': phrase.strip(),
        'start': round(start, 3),
        'end': round(end, 3),
        'words': words,
        'nouns': nouns
    })
    i += 1

  return chunks

In [13]:
chunks = {'data': []}

for i in [5.0, 10.0, 15.0, 20.0]:
    
    chunks['data'].append({
        'duration': i,
        'chunks': make_chunks(data['de_fnc.mp4'], dur=i)
    })


In [17]:
chunks['data'][0]['chunks'][6]

{'text': "Suddenly it's an entire neighborhood of foreign policy experts all specializing in Eastern  European border disputes.",
 'start': 40.58,
 'end': 47.3,
 'words': ['Eastern',
  'European',
  'Suddenly',
  'disputes',
  'entire',
  'neighborhood',
  'foreign',
  'policy'],
 'nouns': ['Eastern', 'European']}

In [18]:
with open("chunk.json", 'w') as f:
    json.dump(chunks, f, indent=4)


main: attain variations among transcript - phrase, sentence and chunk(duration-based), capture meta data- keywords and proper nouns(will act as targets for stance detection)