Here's the Beautiful Soup code from last time:

In [None]:
import re
import csv
import os
from bs4 import BeautifulSoup

def parse(the_soup):
    """Pull the metadata and cleaned transcript out of one parsed talk page.

    Args:
        the_soup: parsed document (a BeautifulSoup object) for one page.

    Returns:
        A tuple ``(author, title, date, length, text)`` where ``author``,
        ``title``, ``date`` and ``text`` are ``str`` and ``length`` is the
        last transcript timestamp converted to seconds.

    Raises:
        AttributeError: if one of the expected elements is not in the page
            (``select_one`` / ``find_previous`` return ``None``).
    """
    # both title and author can be parsed from separate tags.
    # Keep the author as str: encoding it to bytes here would make
    # csv.writer emit the bytes repr ("b'...'") on Python 3.
    author = the_soup.select_one("h4.h12.talk-link__speaker").text
    title = the_soup.select_one("h4.h9.m5").text
    # just need to strip the text from the date string, no regex needed.
    date = the_soup.select_one("span.meta__val").text.strip()
    # we want the last time, which is the talk-transcript__para__time
    # previous to the footer.
    last_time = the_soup.select_one("footer.footer").find_previous(
        "data", {"class": "talk-transcript__para__time"})
    # Fold the colon-separated fields base 60 so that "m:ss" and "h:mm:ss"
    # both convert to seconds.
    length = 0
    for part in last_time.text.split(":"):
        length = length * 60 + int(part)
    # to ignore (Applause) etc. we pull from the actual text fragments and
    # drop the ones that start with "(".
    text = " ".join(
        d.text
        for d in the_soup.select("span.talk-transcript__fragment")
        if not d.text.startswith("(")
    )
    # clean the text: everything except letters, "." and "'" becomes a space.
    text = re.sub(r"[^a-zA-Z.']", " ", text)
    return author.strip(), title.strip(), date, length, text

def to_csv(pth, out):
    """Parse every file in a directory and write one CSV row per file.

    Args:
        pth: directory holding the saved HTML pages.
        out: path of the CSV file to write (an existing file is overwritten).
    """
    # newline="" is what the csv module expects for its output file, and an
    # explicit encoding keeps the result independent of the platform default.
    with open(out, "w", newline="", encoding="utf-8") as csv_file:
        # create csv.writer.
        wr = csv.writer(csv_file)
        # write our headers.
        wr.writerow(["author", "title", "date", "length", "text"])
        # get all our html files; os.listdir() order is arbitrary, so sort
        # to make the row order reproducible.
        for html in sorted(os.listdir(pth)):
            html_path = os.path.join(pth, html)
            # sub-directories cannot be opened as pages: skip them.
            if not os.path.isfile(html_path):
                continue
            # binary mode: Beautiful Soup then works out the encoding from
            # the document itself instead of the platform default.
            with open(html_path, "rb") as f:
                # parse the file and write the data to a row.
                wr.writerow(parse(BeautifulSoup(f, "lxml")))
                
# Convert every page under ./test into rows of test.csv.
to_csv(pth="./test", out="test.csv")

And now we are going to pull it apart and work on only one file:

In [35]:
from bs4 import BeautifulSoup, Comment

# NB: no need to read() the file: BS4 does that.
# The context manager closes the file once it has been parsed, and binary
# mode lets BS4 work out the encoding from the document itself rather than
# relying on the platform default.
with open("transcript.0.html", "rb") as transcript_file:
    thesoup = BeautifulSoup(transcript_file, "html5lib")

# Talk metadata is in <meta> tags in the <head>. 
# This finds all <meta> tags
metas = thesoup.find_all("meta")

# Let's see what this object is...
print(type(metas))

<class 'bs4.element.ResultSet'>


In [5]:
# ... and what it contains, shown as a plain list:
print(list(metas))

[<meta charset="utf-8"/>, <meta content="TED Talk Subtitles and Transcript: Anyone who has lost a loved one to pancreatic cancer knows the devastating speed with which it can affect an otherwise healthy person. TED Fellow and biomedical entrepreneur Laura Indolfi is developing a revolutionary way to treat this complex and lethal disease: a drug delivery device that acts as a cage at the site of a tumor, preventing it from spreading and delivering medicine only where it's needed. &quot;We are hoping that one day we can make pancreatic cancer a curable disease,&quot; she says." name="description"/>, <meta content="Laura Indolfi" name="author"/>, <meta content='Transcript of "Good news in the fight against pancreatic cancer"' property="og:title"/>, <meta content="https://pi.tedcdn.com/r/talkstar-photos.s3.amazonaws.com/uploads/70d551c2-1e5c-411e-b926-7d72590f66bb/LauraIndolfi_2016U-embed.jpg?c=1050%2C550&amp;w=1050" property="og:image"/>, <meta content="https://pi.tedcdn.com/r/talkstar-ph

In [6]:
# The result set can be indexed like a list; show the first <meta> tag found.
print(metas[0])

<meta charset="utf-8"/>


In [7]:
# Compare the type of the container with the type of one of its items.
print(type(metas), type(metas[0]))

<class 'bs4.element.ResultSet'> <class 'bs4.element.Tag'>


In [13]:
# Keep only the .attrs of each <meta> tag.
metalist = [tag.attrs for tag in metas]

Finally, the trick to getting the value of one attribute based on the value of another attribute can be [found][] on Stack Overflow.

[found]: https://stackoverflow.com/questions/36768068/get-meta-tag-content-property-with-beautifulsoup-and-python

In [28]:
# Reset every field first: if a <meta> tag is missing, the print below shows
# None instead of raising NameError (or silently reusing a value left over
# from an earlier run of this cell).
speaker = length = published = views = description = None

for tag in thesoup.find_all("meta"):
    # Read each attribute once per tag rather than once per comparison.
    content = tag.get("content", None)
    itemprop = tag.get("itemprop", None)
    # The speaker is identified by the "name" attribute ...
    if tag.get("name", None) == "author":
        speaker = content
    # ... everything else by "itemprop"; one value can match only one branch.
    if itemprop == "duration":
        length = content
    elif itemprop == "uploadDate":
        published = content
    elif itemprop == "interactionCount":
        views = content
    elif itemprop == "description":
        description = content

print(speaker, length, published, views, description)

Laura Indolfi PT6M3S 2016-05-17T14:46:20+00:00 1246654 Anyone who has lost a loved one to pancreatic cancer knows the devastating speed with which it can affect an otherwise healthy person. TED Fellow and biomedical entrepreneur Laura Indolfi is developing a revolutionary way to treat this complex and lethal disease: a drug delivery device that acts as a cage at the site of a tumor, preventing it from spreading and delivering medicine only where it's needed. "We are hoping that one day we can make pancreatic cancer a curable disease," she says.


The explicit `None` passed to `tag.get()` isn't strictly necessary, because `None` is already what `get()` returns by default when the attribute is missing:

In [29]:
# Start from None so that a page without an author tag prints None instead
# of raising NameError or showing a value left over from an earlier run.
author = None
for tag in thesoup.find_all("meta"):
    # get() without a second argument returns None for a missing attribute.
    if tag.get("name") == "author":
        author = tag.get("content")
print(author)

Laura Indolfi


In [32]:
# Every <p> element: the transcript paragraphs, plus some footer
# information that is also marked up with paragraph tags.
text = thesoup.find_all(name="p")
print(text)

[<p>
											By raising your hand,
											how many of you know
at least one person on the screen?
											Wow, it's almost a full house.
											It's true, they are very famous
in their fields.
											And do you know what
all of them have in common?
											They all died of pancreatic cancer.
											However, although it's very,
very sad this news,
											it's also thanks to their personal stories
											that we have raised awareness
of how lethal this disease can be.
									</p>, <p>
											It's become the third cause
of cancer deaths,
											and only eight percent of the patients
will survive beyond five years.
											That's a very tiny number,
											especially if you compare it
with breast cancer,
											where the survival rate
is almost 90 percent.
											So it doesn't really come as a surprise
											that being diagnosed
with pancreatic cancer
											means facing an almost
certain death sentence.
											What's s

In [36]:
# Walk over every HTML comment in the page and print what comes right after
# the one whose text is exactly "Transcript text".
# find_all / string= are the current spellings of the deprecated
# findAll / text=.
for comment in thesoup.find_all(string=lambda node: isinstance(node, Comment)):
    if comment == "Transcript text":
        # NOTE(review): assumes the element after the comment is a text node
        # (something with .strip()) — confirm against the page markup.
        print(comment.next_element.strip())