Here's the Beautiful Soup code from last time:

In [None]:
import re
import csv
import os
from bs4 import BeautifulSoup

def parse(the_soup):
    """Pull the fields of one talk out of its parsed transcript page.

    Parameters
    ----------
    the_soup : BeautifulSoup
        Parsed HTML of a single transcript page.

    Returns
    -------
    tuple
        ``(author, title, date, length, text)``. ``author``, ``title``,
        ``date`` and ``text`` are ``str``; ``length`` is the last transcript
        timestamp converted to whole seconds.

    Raises
    ------
    AttributeError
        If one of the expected tags is missing from the page.
    ValueError
        If the last timestamp contains a non-numeric part.
    """
    # Title and author can each be read from their own tag. The author is kept
    # as str: encoding it to bytes makes csv.writer emit "b'...'" on Python 3.
    author = the_soup.select_one("h4.h12.talk-link__speaker").text
    title = the_soup.select_one("h4.h9.m5").text
    # Just need to strip the text from the date string, no regex needed.
    date = the_soup.select_one("span.meta__val").text.strip()
    # We want the last timestamp: the talk-transcript__para__time element
    # immediately preceding the footer.
    last_stamp = the_soup.select_one("footer.footer").find_previous(
        "data", {"class": "talk-transcript__para__time"})
    # Fold the colon-separated parts in base 60 so that "mm:ss" and
    # "h:mm:ss" both convert to seconds.
    length = 0
    for part in last_stamp.text.split(":"):
        length = length * 60 + int(part)
    # To ignore (Applause) etc., skip any fragment whose text starts with "(".
    text = " ".join(
        d.text
        for d in the_soup.select("span.talk-transcript__fragment")
        if not d.text.startswith("(")
    )
    # Clean the text: keep only letters, full stops and apostrophes.
    text = re.sub(r"[^a-zA-Z.']", " ", text)
    return author.strip(), title.strip(), date, length, text

def to_csv(pth, out):
    """Parse every file in a directory and write one CSV row per file.

    Parameters
    ----------
    pth : str
        Directory containing the HTML transcript files.
    out : str
        Path of the CSV file to create (overwritten if it exists).
    """
    # newline="" lets the csv module control line endings itself, and an
    # explicit encoding keeps non-ASCII names/titles portable across platforms.
    with open(out, "w", newline="", encoding="utf-8") as csv_file:
        # Create the csv.writer.
        wr = csv.writer(csv_file)
        # Write our headers.
        wr.writerow(["author", "title", "date", "length", "text"])
        # Get all our html files; sorted so the row order is reproducible.
        for html in sorted(os.listdir(pth)):
            # Binary mode hands raw bytes to Beautiful Soup, which then
            # detects the document encoding instead of relying on the locale.
            with open(os.path.join(pth, html), "rb") as f:
                # Parse the file and write the data to a row.
                wr.writerow(parse(BeautifulSoup(f, "lxml")))
                
# Build test.csv from every page found under ./test.
to_csv(pth="./test", out="test.csv")

And now we are going to pull it apart and work on only one file:

In [2]:
from bs4 import BeautifulSoup

# NB: no need to read() the file: BS does that.
# The with-block closes the file handle as soon as parsing is finished.
with open("test_transcript.html") as transcript_file:
    thesoup = BeautifulSoup(transcript_file, "html5lib")

In [3]:
# Collect every <meta> tag found in the parsed document.
metas = thesoup.find_all("meta")

In [4]:
# Show each <meta> tag on its own line.
for meta in metas:
    print(meta)

<meta content="Good news in the fight against pancreatic cancer" itemprop="name"/>
<meta content="Anyone who has lost a loved one to pancreatic cancer knows the devastating speed with which it can affect an otherwise healthy person. TED Fellow and biomedical entrepreneur Laura Indolfi is developing a revolutionary way to treat this complex and lethal disease: a drug delivery device that acts as a cage at the site of a tumor, preventing it from spreading and delivering medicine only where it's needed. &quot;We are hoping that one day we can make pancreatic cancer a curable disease,&quot; she says." itemprop="description"/>
<meta content="PT6M3S" itemprop="duration"/>
<meta content="2016-05-17T14:46:20+00:00" itemprop="uploadDate"/>
<meta content="1246654" itemprop="interactionCount"/>
<meta content="Laura Indolfi" itemprop="name"/>


In [5]:
# Check what kind of container find_all() handed back.
type(metas)

bs4.element.ResultSet

In [6]:
# A ResultSet behaves like a list of tags, not like a dict, so it has no
# .items(). The error is the point of this cell: catch it and print it so the
# notebook still survives Restart & Run All.
try:
    for key, value in metas.items():
        print(key, value)
except AttributeError as err:
    print("AttributeError:", err)

AttributeError: ResultSet object has no attribute 'items'. You're probably treating a list of items like a single item. Did you call find_all() when you meant to call find()?

In [16]:
# The with-block closes the file handle as soon as parsing is finished.
with open("transcript.0.html") as transcript_zero:
    soup_zero = BeautifulSoup(transcript_zero, "html5lib")

# NOTE(review): metas is still collected from `thesoup`, so `soup_zero` is not
# used by this cell — confirm which document was intended.
# find_all() is the current name; findAll() is a legacy alias.
metas = thesoup.find_all("meta")

print(metas[0])

<meta content="Good news in the fight against pancreatic cancer" itemprop="name"/>


In [19]:
# Compare the type of the container with the type of one of its elements.
print(type(metas), type(metas[0]))

<class 'bs4.element.ResultSet'> <class 'bs4.element.Tag'>


In [20]:
# Print the value of each tag's "content" attribute, one per line.
for meta in metas:
    content_value = meta["content"]
    print(content_value)

Good news in the fight against pancreatic cancer
Anyone who has lost a loved one to pancreatic cancer knows the devastating speed with which it can affect an otherwise healthy person. TED Fellow and biomedical entrepreneur Laura Indolfi is developing a revolutionary way to treat this complex and lethal disease: a drug delivery device that acts as a cage at the site of a tumor, preventing it from spreading and delivering medicine only where it's needed. "We are hoping that one day we can make pancreatic cancer a curable disease," she says.
PT6M3S
2016-05-17T14:46:20+00:00
1246654
Laura Indolfi


In [21]:
# Print each tag's full attribute dictionary, one per line.
for meta in metas:
    attributes = meta.attrs
    print(attributes)

{'itemprop': 'name', 'content': 'Good news in the fight against pancreatic cancer'}
{'itemprop': 'description', 'content': 'Anyone who has lost a loved one to pancreatic cancer knows the devastating speed with which it can affect an otherwise healthy person. TED Fellow and biomedical entrepreneur Laura Indolfi is developing a revolutionary way to treat this complex and lethal disease: a drug delivery device that acts as a cage at the site of a tumor, preventing it from spreading and delivering medicine only where it\'s needed. "We are hoping that one day we can make pancreatic cancer a curable disease," she says.'}
{'itemprop': 'duration', 'content': 'PT6M3S'}
{'itemprop': 'uploadDate', 'content': '2016-05-17T14:46:20+00:00'}
{'itemprop': 'interactionCount', 'content': '1246654'}
{'itemprop': 'name', 'content': 'Laura Indolfi'}
