# TED Talk Transcripts

The challenge here was to transform 2000+ HTML files into manageable, meaningful data stores. The first section focuses on creating the CSV file that contains information about the talks, while the second section focuses on storing information about the speakers. Later, we created a CSV of just the descriptions to see if we could build a network based on a Jaccard analysis of the talks.

## Talks

In [None]:
# =-=-=-=-=-=-=-=-=-=-=
# Beautiful Soup 1
# =-=-=-=-=-=-=-=-=-=-= 

import glob, re, csv
from bs4 import BeautifulSoup as soup

# Proof of concept: parse a single transcript page and write its author and
# title to ./output.csv.
the_file = "/Users/john/Code/tedtalks/test/transcript?language=en.0"

# Read through a context manager so the file handle is closed promptly.
with open(the_file) as html_file:
    holding = soup(html_file.read(), "lxml")

# The <title> text is split on the first ':' (author before it) and the
# first '|' (title between the two).
at = holding.find("title").text
author = at[0:at.find(':')]
title = at[at.find(":")+1 : at.find("|")]

# Replace every non-alphanumeric character in the date string with a space.
date = re.sub('[^a-zA-Z0-9]', ' ', holding.select_one("span.meta__val").text)

# The talk length is taken from the last paragraph timestamp that looks
# like "mm:ss", converted to seconds.
length_data = holding.find_all('data', {'class' : 'talk-transcript__para__time'})
stamps = [x.get_text().strip("\n\r") for x in length_data]
stamps = [stamp for stamp in stamps if re.search(r"(?s)\d{2}:\d{2}", stamp)]
(m, s) = stamps[-1].split(':')
length = int(m) * 60 + int(s)

# Drop parenthetical asides such as "(Applause)", then keep only letters,
# periods and apostrophes. Raw strings avoid the invalid "\." escape.
firstpass = re.sub(r'\([^)]*\)', '', holding.find('div', class_ = 'talk-transcript__body').text)
text = re.sub(r"[^a-zA-Z\.']", ' ', firstpass)

data = [str(author).strip(), str(title).strip()]
# print(data)
with open("./output.csv", "w", newline = "") as csv_file:
    writer = csv.writer(csv_file, delimiter=',')
    # writerow expects a sequence of fields; passing each string on its own
    # would split it into one column per character.
    writer.writerow(data)


After getting that to work, I imported some boilerplate that has worked for me in the past:

```
file_list = glob.glob('/Users/john/Code/tedtalks/test/*') # produces list
print(file_list)

['/Users/john/Code/tedtalks/test/transcript?language=en.0', '/Users/john/Code/tedtalks/test/transcript?language=en.1', '/Users/john/Code/tedtalks/test/transcript?language=en.2']
```

But handing that off to the initial script proved very tricky. It was clearly time to learn how to `define` functions. With more gratitude than I can express, [Padraic Cunningham][] not only developed the script below, but was also very patient in diagnosing a particular problem I encountered.

The script below is available in the repo as `talks_to_csv.py`.

In [None]:
# =-=-=-=-=-=-=-=-=-=-=
# Beautiful Soup 2
# =-=-=-=-=-=-=-=-=-=-= 

import re
import csv
import os
from bs4 import BeautifulSoup

def parse(the_soup):
    """Extract one talk's metadata and cleaned transcript from a parsed page.

    Args:
        the_soup: BeautifulSoup document for a single talk transcript page.

    Returns:
        A tuple ``(author, title, date, length, text)``. ``length`` is the
        timestamp of the last transcript paragraph before the footer, in
        seconds. ``text`` is the transcript fragments joined by spaces,
        skipping fragments that start with "(" and replacing every character
        other than letters, periods and apostrophes with a space.

    Raises:
        AttributeError: if one of the expected elements is not found
            (the lookups return ``None`` in that case).
    """
    # Author and title are parsed from separate heading tags.
    # The author stays a str: encoding it to bytes makes csv.writer emit
    # the literal "b'...'" representation on Python 3.
    author = the_soup.select_one("h4.h12.talk-link__speaker").text
    title = the_soup.select_one("h4.h9.m5").text
    # Just strip the text of the date string, no regex needed.
    date = the_soup.select_one("span.meta__val").text.strip()
    # We want the last time, which is the talk-transcript__para__time
    # element previous to the footer.
    last_stamp = the_soup.select_one("footer.footer").find_previous(
        "data", {"class": "talk-transcript__para__time"}).text
    # Fold the colon-separated parts base-60 so that both "mm:ss" and
    # "h:mm:ss" stamps convert to seconds.
    length = 0
    for part in last_stamp.split(":"):
        length = length * 60 + int(part)
    # To ignore (Applause) etc. we pull from the text fragments and skip
    # those that start with "(".
    fragments = the_soup.select("span.talk-transcript__fragment")
    text = " ".join(d.text for d in fragments if not d.text.startswith("("))
    # Clean the text; the raw string avoids the invalid "\." escape.
    text = re.sub(r"[^a-zA-Z\.']", ' ', text)
    return author.strip(), title.strip(), date, length, text

def to_csv(pth, out):
    """Parse every file in a directory and write one CSV row per talk.

    Args:
        pth: Directory containing the saved HTML transcript pages.
        out: Path of the CSV file to create (overwritten if it exists).
    """
    # newline="" lets the csv module control line endings, as its
    # documentation requires; a distinct handle name keeps the `out`
    # parameter from being rebound.
    with open(out, "w", newline="", encoding="utf-8") as csv_file:
        wr = csv.writer(csv_file)
        # Write our headers.
        wr.writerow(["author", "title", "date", "length", "text"])
        # Sort the listing so the row order is reproducible between runs.
        for html in sorted(os.listdir(pth)):
            html_path = os.path.join(pth, html)
            # Sub-directories cannot be opened as pages; skip them.
            if not os.path.isfile(html_path):
                continue
            with open(html_path) as f:
                # Parse the file and write the data to a row.
                wr.writerow(parse(BeautifulSoup(f, "lxml")))
                
# Convert the pages under ./test into test.csv.
to_csv(pth="./test", out="test.csv")

The fix below removes parenthetical asides (such as "(Applause)") and numbers from the transcript text.

In [None]:
import re
import csv
import os
from bs4 import BeautifulSoup


def parse(soup):
    """Extract one talk's metadata and cleaned transcript from a parsed page.

    Args:
        soup: BeautifulSoup document for a single talk transcript page.

    Returns:
        A tuple ``(author, title, date, length, text)``. ``length`` is the
        timestamp of the last transcript paragraph before the footer, in
        seconds. ``text`` is the transcript fragments joined by spaces with
        parenthesised spans removed and every character other than letters,
        periods and apostrophes replaced by a space.

    Raises:
        AttributeError: if one of the expected elements is not found
            (the lookups return ``None`` in that case).
    """
    # Author and title are parsed from separate heading tags.
    author = soup.select_one("h4.h12.talk-link__speaker").text
    title = soup.select_one("h4.h9.m5").text
    # Just strip the text of the date string, no regex needed.
    date = soup.select_one("span.meta__val").text.strip()
    # We want the last time, which is the talk-transcript__para__time
    # element previous to the footer.
    last_stamp = soup.select_one("footer.footer").find_previous(
        "data", {"class": "talk-transcript__para__time"}).text
    # Fold the colon-separated parts base-60 so that both "mm:ss" and
    # "h:mm:ss" stamps convert to seconds.
    length = 0
    for part in last_stamp.split(":"):
        length = length * 60 + int(part)
    # Join the text fragments, then remove noise such as "(Applause)".
    joined = " ".join(d.text for d in soup.select("span.talk-transcript__fragment"))
    text = re.sub(r'\([^)]*\)', "", joined)
    # Keep only letters, periods and apostrophes; the raw string avoids the
    # invalid "\." escape of the non-raw literal.
    cleaned = re.sub(r"[^a-zA-Z\.']", ' ', text)
    return author.strip(), title.strip(), date, length, cleaned

def to_csv(pth, out):
    """Parse every file in a directory and write one CSV row per talk.

    The name of each file is printed before it is parsed, so a failure can
    be traced to the page that caused it.

    Args:
        pth: Directory containing the saved HTML transcript pages.
        out: Path of the CSV file to create (overwritten if it exists).
    """
    # newline="" lets the csv module control line endings, as its
    # documentation requires; a distinct handle name keeps the `out`
    # parameter from being rebound.
    with open(out, "w", newline="", encoding="utf-8") as csv_file:
        wr = csv.writer(csv_file)
        # Write our headers.
        wr.writerow(["author", "title", "date", "length", "text"])
        # Sort the listing so the row order is reproducible between runs.
        for html in sorted(os.listdir(pth)):
            html_path = os.path.join(pth, html)
            # Sub-directories cannot be opened as pages; skip them.
            if not os.path.isfile(html_path):
                continue
            with open(html_path) as f:
                print(html)
                # Parse the file and write the data to a row.
                wr.writerow(parse(BeautifulSoup(f, "lxml")))
                
# Convert the pages under ./talks into talks.csv.
to_csv(pth="./talks", out="talks.csv")

## Speakers

In [None]:
import re
import csv
import os
from bs4 import BeautifulSoup

# name: <h1 class="h2 profile-header__name">
# occupation: <div class="p2 profile-header__summary">
# intro: <div class="profile-intro">
# profile: <div class="section section--minor">


def parse(soup):
    """Pull the four profile fields out of one parsed speaker page.

    Args:
        soup: BeautifulSoup document for a single speaker profile page.

    Returns:
        A tuple ``(name, occupation, intro, profile)`` holding the text of
        the matching elements with leading and trailing newlines stripped.
    """
    def field(tag, css_class):
        # Text of the first <tag> whose class attribute matches css_class.
        return soup.find(tag, {'class': css_class}).text.strip('\n')

    name = field('h1', "h2 profile-header__name")
    occupation = field('div', "p2 profile-header__summary")
    intro = field('div', "profile-intro")
    profile = field('div', "section section--minor")
    return name, occupation, intro, profile

def to_csv(pth, out):
    """Parse every file in a directory and write one CSV row per speaker.

    The name of each file is printed before it is parsed, so a failure can
    be traced to the page that caused it.

    Args:
        pth: Directory containing the saved HTML speaker pages.
        out: Path of the CSV file to create (overwritten if it exists).
    """
    # newline="" lets the csv module control line endings, as its
    # documentation requires; a distinct handle name keeps the `out`
    # parameter from being rebound.
    with open(out, "w", newline="", encoding="utf-8") as csv_file:
        wr = csv.writer(csv_file)
        # The header must name the four values parse() returns, in order.
        wr.writerow(["name", "occupation", "intro", "profile"])
        # Sort the listing so the row order is reproducible between runs.
        for html in sorted(os.listdir(pth)):
            html_path = os.path.join(pth, html)
            # Sub-directories cannot be opened as pages; skip them.
            if not os.path.isfile(html_path):
                continue
            with open(html_path) as f:
                print(html)
                # Parse the file and write the data to a row.
                wr.writerow(parse(BeautifulSoup(f, "lxml")))

# Entry point: convert the saved speaker pages into speakers.csv.
to_csv(pth="./html_files/speakers/", out="speakers.csv")

In [None]:
import pandas

# Load the descriptions CSV with explicit column labels, then expose each
# column as a plain Python list.
colnames = ['title', 'views', 'descr']
data = pandas.read_csv('./descriptions.csv', names=colnames)
titles, views, descriptions = (data[column].tolist() for column in colnames)