In [1]:
from bs4 import BeautifulSoup
import json
import re
from datetime import datetime
import csv
import os

In [3]:
def get_metadata(the_file):
    """Extract slug, view count, event and filming date from a saved talk page.

    The page is parsed with BeautifulSoup; the first script element whose
    text starts with ``q`` and contains the ``"__INITIAL_DATA__":`` marker
    is taken, and the JSON object that follows the marker is decoded.

    Parameters
    ----------
    the_file : file-like
        Open text handle for the HTML page; only ``.read()`` is called on it.

    Returns
    -------
    tuple
        ``(slug, viewed_count, event, filmed)``; ``filmed`` is the talk's
        ``filmed`` timestamp formatted as ``YYYY-MM-DD`` in UTC.

    Raises
    ------
    ValueError
        If no script carries the marker, or the JSON after it is malformed
        (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    KeyError
        If ``slug``, ``viewed_count``, ``event`` or ``talks`` is missing from
        the decoded data, or no ``filmed`` value exists under ``talks``.
    """
    # Read the file, load it into BS, then grab the section we want.
    text = the_file.read()
    soup = BeautifulSoup(text, "html5lib")

    my_json = None
    for script in soup.select('script'):
        body = script.string
        if body and body.startswith('q'):
            my_json = _decode_initial_data(body)
            if my_json is not None:
                break
    if my_json is None:
        raise ValueError('no script containing "__INITIAL_DATA__" was found')

    # First layer of JSON: top-level fields.
    slug = my_json['slug']
    vcount = my_json['viewed_count']
    event = my_json['event']

    # Second layer: the filming timestamp lives somewhere under 'talks'
    # (presumably a list of per-talk dicts -- the exact layout is not
    # verified here, hence the schema-agnostic search).
    filmed_raw = _first_value(my_json['talks'], 'filmed')
    if filmed_raw is None:
        raise KeyError('filmed')
    # utcfromtimestamp keeps the date independent of the local timezone.
    filmed = datetime.utcfromtimestamp(float(filmed_raw)).strftime('%Y-%m-%d')
    return slug, vcount, event, filmed


def _decode_initial_data(script_text, marker='"__INITIAL_DATA__":'):
    """Return the JSON value that follows ``marker``, or None if it is absent."""
    start = script_text.find(marker)
    if start < 0:
        return None
    # raw_decode requires the document to start at index 0 (no leading
    # whitespace) and ignores whatever trails it, e.g. the closing "})".
    payload = script_text[start + len(marker):].lstrip()
    data, _end = json.JSONDecoder().raw_decode(payload)
    return data


def _first_value(node, key):
    """Search nested dicts/lists for the first non-None value stored under ``key``."""
    if isinstance(node, dict):
        if node.get(key) is not None:
            return node[key]
        children = node.values()
    elif isinstance(node, list):
        children = node
    else:
        return None
    for child in children:
        found = _first_value(child, key)
        if found is not None:
            return found
    return None

In [9]:
def to_csv(dir_path, output_csv):
    """Write one CSV row of talk metadata for every ``.html`` file in a folder.

    Parameters
    ----------
    dir_path : str
        Directory that is listed for files ending in ``.html``.
    output_csv : str
        Path of the CSV file to create (overwritten if it exists).

    Each page is opened in text mode with undecodable bytes ignored, its
    path is printed as a progress marker, and the tuple returned by
    ``get_metadata`` is written as a row under the header
    ``slug, view_count, event, filmed``.
    """
    # newline="" hands line-ending control to csv.writer; without it,
    # platforms that translate "\n" produce a blank line after every row.
    with open(output_csv, "w", newline="") as out:
        wr = csv.writer(out)
        wr.writerow(["slug", "view_count", "event", "filmed"])
        # Sorted so the row order does not depend on the file system.
        for html in sorted(os.listdir(dir_path)):
            if not html.endswith('.html'):
                continue
            html_path = os.path.join(dir_path, html)
            with open(html_path, "r", errors="ignore") as f:
                # Print before parsing so a failing file is easy to identify.
                print(html_path)
                wr.writerow(get_metadata(f))

In [25]:
# Parse every .html file under ./html and write the rows to metadata-14.csv.
to_csv("./html","metadata-14.csv")

./html/theaster_gates_how_to_revive_a_neighborhood_with_imagination_beauty_and_art.html
./html/thelma_golden_how_art_gives_shape_to_cultural_change.html
./html/theo_e_j_wilson_a_black_man_goes_undercover_in_the_alt_right.html
./html/theo_jansen_creates_new_creatures.html
./html/they_might_be_giants_play_at_8_30_am.html
./html/thom_mayne_on_architecture_as_connection.html
./html/thomas_barnett_draws_a_new_map_for_peace.html
./html/thomas_dolby_and_rachelle_garniez.html
./html/thomas_goetz_it_s_time_to_redesign_medical_data.html
./html/thomas_heatherwick.html
./html/thomas_hellum_the_world_s_most_boring_television_and_why_it_s_hilariously_addictive.html
./html/thomas_insel_toward_a_new_understanding_of_mental_illness.html
./html/thomas_p_campbell_weaving_narratives_in_museum_galleries.html
./html/thomas_peschak_dive_into_an_ocean_photographer_s_world.html
./html/thomas_piketty_new_thoughts_on_capital_in_the_twenty_first_century.html
./html/thomas_pogge_medicine_for_the_99_percent.html
./

./html/will_potter_the_shocking_move_to_criminalize_non_violent_protest.html
./html/will_wright_makes_toys_that_make_worlds.html
./html/willard_wigan_hold_your_breath_for_micro_sculpture.html
./html/william_black_how_to_rob_a_bank_from_the_inside_that_is.html
./html/william_kamkwamba_how_i_harnessed_the_wind.html
./html/william_kamkwamba_on_building_a_windmill.html
./html/william_li.html
./html/william_mcdonough_on_cradle_to_cradle_design.html
./html/william_noel_revealing_the_lost_codex_of_archimedes.html
./html/william_ury.html
./html/willie_smits_restores_a_rainforest.html
./html/wingham_rowan_a_new_kind_of_job_market.html
./html/wolfgang_kessling_how_to_air_condition_outdoor_spaces.html
./html/woody_norris_invents_amazing_things.html
./html/xavier_de_kestelier_adventures_of_an_interplanetary_architect.html
./html/xavier_vilalta_architecture_at_home_in_its_community.html
./html/yang_lan.html
./html/yanis_varoufakis_capitalism_will_eat_democracy_unless_we_speak_up.html
./html/yann_ar

The functions below are alternative ways to open the HTML files in Python; try them if `to_csv` fails with file-reading errors (for example, decoding errors).

In [None]:
def to_csv_codecs(dir_path, output_csv):
    """Variant of the CSV export that reads each page via ``codecs.open``.

    Parameters
    ----------
    dir_path : str
        Directory that is listed for files ending in ``.html``.
    output_csv : str
        Path of the CSV file to create (overwritten if it exists).

    Pages are decoded as UTF-8 with undecodable bytes dropped, then passed
    to ``get_metadata``; one row is written per page.
    """
    # codecs is not in the notebook's import cell, so it is loaded here.
    import codecs
    # newline="" hands line-ending control to csv.writer.
    with open(output_csv, "w", newline="") as out:
        wr = csv.writer(out)
        # Four columns, matching the 4-tuple that get_metadata returns.
        wr.writerow(["slug", "view_count", "event", "filmed"])
        # Sorted for a reproducible row order; non-HTML files are skipped.
        for html in sorted(os.listdir(dir_path)):
            if not html.endswith('.html'):
                continue
            with codecs.open(os.path.join(dir_path, html), "r",
                             encoding='utf-8', errors='ignore') as f:
                wr.writerow(get_metadata(f))
                
def to_csv_io(dir_path, output_csv):
    """Variant of the CSV export that reads each page via ``io.open``.

    Parameters
    ----------
    dir_path : str
        Directory that is listed for files ending in ``.html``.
    output_csv : str
        Path of the CSV file to create (overwritten if it exists).

    Pages are opened in text mode with the default encoding and error
    handling, then passed to ``get_metadata``; one row is written per page.
    """
    # io is not in the notebook's import cell, so it is loaded here.
    import io
    # newline="" hands line-ending control to csv.writer.
    with open(output_csv, "w", newline="") as out:
        wr = csv.writer(out)
        # Four columns, matching the 4-tuple that get_metadata returns.
        wr.writerow(["slug", "view_count", "event", "filmed"])
        # Sorted for a reproducible row order; non-HTML files are skipped.
        for html in sorted(os.listdir(dir_path)):
            if not html.endswith('.html'):
                continue
            with io.open(os.path.join(dir_path, html), "r") as f:
                wr.writerow(get_metadata(f))