In [None]:
def get_metadata(the_file):
    
    # Load the modules we need
    from bs4 import BeautifulSoup
    import json
    import re
    from datetime import datetime
    
    # Read the file, load it into BS, then grab section we want
    text = the_file.read()
    soup = BeautifulSoup(text, "html5lib")
    my_list = [i.string.lstrip('q("talkPage.init", {\n\t"el": "[data-talk-page]",\n\t "__INITIAL_DATA__":')
               .rstrip('})')
               for i in soup.select('script') 
               if i.string and i.string.startswith('q')]
    
    # Read first layer of JSON and get out those elements we want
    pre_json = '{"' + "".join(my_list)
    my_json = json.loads(pre_json)
    slug = my_json['slug']
    vcount = my_json['viewed_count']
    event = my_json['event']
    
    # Read second layer of JSON and get out listed elements:
    properties = "filmed,published" # No spaces between terms!
    talks_listed = str(my_json['talks']).split(",")
    regex_list = [".*("+i+").*" for i in properties.split(",")]
    matches = []
    for e in regex_list:
        filtered = filter(re.compile(e).match, talks_listed)
        indexed = "".join(filtered).split(":")[1]
        matches.append(indexed)
    filmed = datetime.utcfromtimestamp(float(matches[0])).strftime('%Y-%m-%d')
    published = datetime.utcfromtimestamp(float(matches[1])).strftime('%Y-%m-%d')
    return slug, vcount, event, filmed, #published

def to_csv(dir_path, output_csv):
    # LOAD required modules
    import csv
    import os
    # OPEN file to which to write:
    with open(output_csv, "w") as out:
        # create csv.writer.
        wr = csv.writer(out)
        # write our headers.
        wr.writerow(["slug", "view_count", "event", "filmed", "published"])
        # get all our html files.
        for html in os.listdir(dir_path):
            with open(os.path.join(dir_path, html),"r", errors="ignore") as f:
                # parse the file and write the data to a row.
                wr.writerow(get_metadata(f))

In [12]:
to_csv("./html","metadata-2.csv")

ValueError: Unterminated string starting at: line 1 column 2 (char 1)

These are alternate ways to open files in Python that may solve various reading errors. 

In [11]:
def to_csv_codecs(dir_path, output_csv):
    # LOAD required modules
    import csv
    import os
    import codecs
    # OPEN file to which to write:
    with open(output_csv, "w") as out:
        # create csv.writer.
        wr = csv.writer(out)
        # write our headers.
        wr.writerow(["slug", "view_count", "event", "filmed", "published"])
        # get all our html files.
        for html in os.listdir(dir_path):
            with codecs.open(os.path.join(dir_path, html),"r", 
                             encoding='utf-8', errors='ignore') as f:
                # parse the file and write the data to a row.
                wr.writerow(get_metadata(f))
                
def to_csv_io(dir_path, output_csv):
    # LOAD required modules
    import csv
    import os
    import io
    # OPEN file to which to write:
    with open(output_csv, "w") as out:
        # create csv.writer.
        wr = csv.writer(out)
        # write our headers.
        wr.writerow(["slug", "view_count", "event", "filmed", "published"])
        # get all our html files.
        for html in os.listdir(dir_path):
            with io.open(os.path.join(dir_path, html),"r") as f:
                # parse the file and write the data to a row.
                wr.writerow(get_metadata(f))