In [None]:
# Install the packages listed in requirements.txt (%pip targets the running kernel's environment).
%pip install -r requirements.txt
# Download the small English spaCy model; later cells load it by name ("en_core_web_sm").
!python -m spacy download en_core_web_sm

In [None]:
import symbl

def symbl_get_topics(text: str, video_id: str | None):
    """Gets (abstract) topics keywords"""

    conversation_object = symbl.Text.process({
        "name": video_id,
        "messages": [{"payload": {"content": text}}] 
    }, wait=True)

    data = conversation_object.get_topics()
    return [x["text"] for x in data]

In [8]:
import textacy
from textacy.extract import keyterms
from textacy import text_stats


def add_topics(data: dict):
    """Extract key terms from a video's transcript and store them as topics.

    The text of every transcript segment except the first is joined with
    single spaces, lower-cased and passed to YAKE (lemma-normalised, top 20
    terms). Whatever ``keyterms.yake`` returns is written to
    ``data["metadata"]["topics"]``; the ``"metadata"`` dict is created when
    it is missing.

    Args:
        data: Video record with a ``"transcript"`` list whose items each
            carry a ``"text"`` string. Mutated in place.

    Returns:
        None.
    """
    # Ignore the first segment, usually introduction.
    segments = data["transcript"][1:]
    txt = " ".join(segment["text"] for segment in segments).lower()

    # TODO: The Symbl call below always fails, so it stays disabled and YAKE
    # is used unconditionally (no raise / bare-except detour needed).
    # If it is re-enabled, catch the specific error it raises rather than a
    # bare `except:`, which would also swallow KeyboardInterrupt.
    # topics = symbl_get_topics(txt, data["video_id"])
    doc = textacy.make_spacy_doc(txt, lang="en_core_web_sm")
    topics = keyterms.yake(doc, normalize="lemma", topn=20)

    data.setdefault("metadata", {})["topics"] = topics

def add_misc_readability(data: dict):
    """Attach a per-segment Flesch-Kincaid grade level to the video data.

    Each transcript segment's ``"text"`` is turned into a spaCy doc
    (``en_core_web_sm``) and scored with
    ``text_stats.flesch_kincaid_grade_level``. The scores, in transcript
    order, are stored under ``data["metadata"]["readability"]``; the
    ``"metadata"`` dict is created when it is missing.

    Args:
        data: Video record with a ``"transcript"`` list whose items each
            carry a ``"text"`` string. Mutated in place.

    Returns:
        None.
    """
    grades = [
        text_stats.flesch_kincaid_grade_level(
            textacy.make_spacy_doc(segment["text"], lang="en_core_web_sm")
        )
        for segment in data["transcript"]
    ]

    data.setdefault("metadata", {})["readability"] = grades

In [22]:
import json
import numpy as np

def topic_frequency(text: str, topics: list) -> float:
    """Score how strongly ``text`` mentions the given topics.

    For every topic, the number of non-overlapping substring occurrences of
    ``topic[0]`` in the lower-cased text is multiplied by ``1 - topic[1]``,
    and the products are added up.

    Args:
        text: Text to scan; it is lower-cased before counting.
        topics: Sequence of indexable entries where item 0 is the term to
            look for (used as given, not lower-cased) and item 1 is a number.

    Returns:
        The accumulated weighted count (``0.0`` when ``topics`` is empty).
    """
    haystack = text.lower()

    total = 0.0
    for entry in topics:
        term = entry[0]
        factor = 1 - entry[1]
        # Plain accumulation keeps the floating-point rounding order explicit.
        total += haystack.count(term) * factor

    return total

def implications(text: str) -> float:
    """Score ``text`` by counting weighted cue-word occurrences.

    The text is lower-cased, then every cue is counted as a non-overlapping
    substring (so ``"how"`` also matches inside ``"show"``). Counts are
    summed per category, multiplied by the category weight and added up.

    Args:
        text: Text to score.

    Returns:
        The weighted sum over all categories.
    """
    # category -> (weight, cue substrings); "compound" is weighted 0.0 and
    # therefore contributes nothing to the score.
    categories = {
        "implies": (10.0, ["implies", "imply", "thus", "therefore", "makes"]),
        "compound": (0.0, ["and", "but", "since", "so", "while"]),
        "steps": (3.0, ["first", "next", "then", "last", "final"]),
        "question": (2.0, ["why", "which", "where", "how", "when", "?"]),
    }

    lowered = text.lower()

    return sum(
        sum(lowered.count(cue) for cue in cues) * weight
        for weight, cues in categories.values()
    )

def main():
    """Build per-segment feature and target arrays for each test video.

    Reads ``testdata.json`` from the current working directory. For every
    entry it builds:

    * ``a`` - one row per transcript segment with the columns
      (readability, topic frequency, implication score);
    * ``b`` - a single column holding each segment's ``"heat"`` value.

    Each entry must provide ``metadata.readability``, ``metadata.topics``
    and a ``transcript`` list whose items carry ``"text"`` and ``"heat"``.

    Returns:
        None. NOTE: ``a`` and ``b`` are currently computed and discarded.
    """
    # Use a context manager so the file handle is closed deterministically.
    with open("testdata.json", encoding="utf-8") as fh:
        testdata = json.load(fh)

    for td in testdata:
        transcript = td["transcript"]

        readbty = np.array(td["metadata"]["readability"])
        topic_freq = np.array([
            topic_frequency(x["text"], td["metadata"]["topics"])
            for x in transcript
        ])
        impls = np.array([implications(x["text"]) for x in transcript])

        heatmap = np.array([x["heat"] for x in transcript])

        a = np.column_stack((readbty, topic_freq, impls))
        # The trailing comma matters: `(heatmap)` is just the array itself,
        # which column_stack would stack element by element into one
        # (1, n) row instead of an (n, 1) column aligned with the rows of `a`.
        b = np.column_stack((heatmap,))



# Build the feature/target arrays for every entry in testdata.json.
main()

[[ 5.53101982  8.08965349  2.        ]
 [ 7.05393035  5.48554324  6.        ]
 [ 5.01027653  4.77267763 20.        ]
 [ 9.30419012 13.99706299  8.        ]
 [ 5.36513547 10.93219189  3.        ]
 [ 8.15318433 14.82157225  9.        ]
 [ 3.22772495 10.74268329 15.        ]
 [ 5.99369565 11.46247589 14.        ]
 [ 6.64800699 20.41486411  9.        ]
 [ 4.82804167 10.7143381  13.        ]
 [ 3.13259091 14.11187714  6.        ]
 [ 4.03250972 14.24363344  9.        ]
 [ 3.97559047 18.6183754  15.        ]
 [ 2.8724359  15.53911455 14.        ]
 [ 5.5395197  11.42705033 10.        ]
 [ 2.99       14.73017     0.        ]
 [ 2.06023706 10.6253764  19.        ]
 [ 1.70379944 11.87605144 28.        ]
 [ 1.06548064  5.70017006 11.        ]
 [ 3.18181818 16.32963196  7.        ]
 [ 5.80227273 11.91567713 14.        ]
 [ 2.39921053  6.37010762 14.        ]
 [ 2.32432432 10.33530531  5.        ]
 [ 2.80660773  9.01908352 10.        ]
 [ 5.0504142  18.24176578 20.        ]
 [ 4.10321101  8.83172202