In [1]:
import spacy
import json
from pprint import pprint
from spacy import displacy


# Ask spaCy to use a GPU when one is usable; this is issued before the pipeline
# is loaded so the request is already in effect for spacy.load below.
spacy.prefer_gpu()
# Shared pipeline object: the later cells call nlp(sentence) to parse each caption.
nlp = spacy.load("en_core_web_sm")

In [2]:
# Load the caption file into `data`. The encoding is given explicitly so that
# decoding does not depend on the platform's default text encoding.
with open('../data/caption.json', encoding='utf-8') as f:
    data = json.load(f)

In [3]:
# Pretty-print the caption records stored under the 'captions' key of the loaded JSON.
pprint(data['captions'])

[{'end': '00:00:08,720',
  'index': 1,
  'start': '00:00:01,920',
  'text': 'The histogram above shows the distribution of calories for all '
          'orders.'},
 {'end': '00:00:14,480',
  'index': 2,
  'start': '00:00:08,720',
  'text': 'The spike around 1000 calories represents standard burrito orders'},
 {'end': '00:00:22,480',
  'index': 3,
  'start': '00:00:14,480',
  'text': 'a meat burrito with typical additions cheese, salsa, lettuce, sour, '
          'cream, rice and beans.'},
 {'end': '00:00:30,160',
  'index': 4,
  'start': '00:00:22,480',
  'text': "If you order a meat burrito at chipotle with these toppings, it's "
          'very likely to reach 1000 calories'},
 {'end': '00:00:33,840',
  'index': 5,
  'start': '00:00:30,160',
  'text': "but there's so much more to these data than the averages."},
 {'end': '00:00:43,280',
  'index': 6,
  'start': '00:00:33,840',
  'text': 'Chipotle customers can and do order meals with fewer than 650 '
          'calories such as a che

In [4]:
# Caption texts only, in the same order as data['captions'].
sentences = [
    entry['text']
    for entry in data['captions']
]

In [6]:
# https://spacy.io/usage/visualizers

from pathlib import Path

# Write one dependency-parse SVG per caption sentence. Files are numbered by the
# sentence's zero-based position in `sentences`.
for i, sentence in enumerate(sentences):
    doc = nlp(sentence)

    # jupyter=False: take the markup back as a string instead of rendering it inline.
    svg = displacy.render(doc, style="dep", jupyter=False)
    output_filename = "../data/sentence.dependency.{}.svg".format(i)
    print(output_filename)
    output_path = Path(output_filename)
    # write_text opens, writes and closes the file in one call, so no file
    # handle is left open after each iteration.
    output_path.write_text(svg, encoding="utf-8")

../data/sentence.dependency.0.svg
../data/sentence.dependency.1.svg
../data/sentence.dependency.2.svg
../data/sentence.dependency.3.svg
../data/sentence.dependency.4.svg
../data/sentence.dependency.5.svg
../data/sentence.dependency.6.svg


In [5]:
# Attach emphasis phrases to every caption: for each numeric modifier that sits
# to the left of the word it modifies, the phrase is the full subtree of that
# word (e.g. the noun phrase containing the number).
new_captions = []
for _caption in data['captions']:
    # Parse the caption's own text so the phrases always belong to the caption
    # they are stored on; this does not rely on 'index' matching list position
    # and cannot fail on a gap in the index sequence.
    doc = nlp(_caption['text'])
    emphasis_texts = []
    for nummod in (token for token in doc if token.dep_ == "nummod"):
        # The only token that can list `nummod` among its left children is its
        # syntactic head, so look at the head directly instead of scanning the doc.
        head = nummod.head
        if nummod.i < head.i:
            emph = ''.join(t.text_with_ws for t in head.subtree).strip()
            emphasis_texts.append(emph)

    # NOTE: the caption dict is updated in place, so data['captions'] and
    # new_captions refer to the same (now extended) dict objects.
    _caption['properties'] = {'emphasis': emphasis_texts}
    new_captions.append(_caption)


In [8]:
# `emphasis_texts` is reassigned for every caption in the extraction loop, so
# this displays only the phrases found for the last caption processed.
emphasis_texts

['10 meals', 'more than 1600 calories']

In [7]:
# Pretty-print the enriched captions one record at a time.
for enriched_caption in new_captions:
    pprint(enriched_caption)

{'end': '00:00:08,720',
 'index': 1,
 'properties': {'emphasis': []},
 'start': '00:00:01,920',
 'text': 'The histogram above shows the distribution of calories for all '
         'orders.'}
{'end': '00:00:14,480',
 'index': 2,
 'properties': {'emphasis': ['The spike around 1000 calories']},
 'start': '00:00:08,720',
 'text': 'The spike around 1000 calories represents standard burrito orders'}
{'end': '00:00:22,480',
 'index': 3,
 'properties': {'emphasis': []},
 'start': '00:00:14,480',
 'text': 'a meat burrito with typical additions cheese, salsa, lettuce, sour, '
         'cream, rice and beans.'}
{'end': '00:00:30,160',
 'index': 4,
 'properties': {'emphasis': ['1000 calories']},
 'start': '00:00:22,480',
 'text': "If you order a meat burrito at chipotle with these toppings, it's "
         'very likely to reach 1000 calories'}
{'end': '00:00:33,840',
 'index': 5,
 'properties': {'emphasis': []},
 'start': '00:00:30,160',
 'text': "but there's so much more to these data than the av

In [9]:
# Wrap the enriched captions under the same top-level key as the input file,
# serialise with two-space indentation, and show the resulting JSON text.
d = dict(captions=new_captions)
json_object = json.dumps(d, indent=2)

print(json_object)

{
  "captions": [
    {
      "index": 1,
      "start": "00:00:01,920",
      "end": "00:00:08,720",
      "text": "The histogram above shows the distribution of calories for all orders.",
      "properties": {
        "emphasis": []
      }
    },
    {
      "index": 2,
      "start": "00:00:08,720",
      "end": "00:00:14,480",
      "text": "The spike around 1000 calories represents standard burrito orders",
      "properties": {
        "emphasis": [
          "The spike around 1000 calories"
        ]
      }
    },
    {
      "index": 3,
      "start": "00:00:14,480",
      "end": "00:00:22,480",
      "text": "a meat burrito with typical additions cheese, salsa, lettuce, sour, cream, rice and beans.",
      "properties": {
        "emphasis": []
      }
    },
    {
      "index": 4,
      "start": "00:00:22,480",
      "end": "00:00:30,160",
      "text": "If you order a meat burrito at chipotle with these toppings, it's very likely to reach 1000 calories",
      "properties