$\Large{\text{Web Scrapping Ted Talks.}}$

See [Ted Talks Website here](https://www.ted.com/talks?event=tedx&amp;sort=newest).


    first page:  https://www.ted.com/talks?event=tedx&amp;sort=newest 
    second page: https://www.ted.com/talks?event=tedx&page=2&sort=newest 
    last page:   https://www.ted.com/talks?event=tedx&page=17&sort=newest 

First page is different from rest:
    `&amp;sort` verses `&page=2&sort` for the other pages.

In [1]:
import requests
from bs4 import BeautifulSoup

In [2]:
# get all websites to scrape from.
websites = []
website  = 'https://www.ted.com/talks?event=tedx&amp;sort=newest'
websites = [website,]
for i in range(2, 18):
    websites.append(website.replace('amp;','page=%i&'%i))
websites

['https://www.ted.com/talks?event=tedx&amp;sort=newest',
 'https://www.ted.com/talks?event=tedx&page=2&sort=newest',
 'https://www.ted.com/talks?event=tedx&page=3&sort=newest',
 'https://www.ted.com/talks?event=tedx&page=4&sort=newest',
 'https://www.ted.com/talks?event=tedx&page=5&sort=newest',
 'https://www.ted.com/talks?event=tedx&page=6&sort=newest',
 'https://www.ted.com/talks?event=tedx&page=7&sort=newest',
 'https://www.ted.com/talks?event=tedx&page=8&sort=newest',
 'https://www.ted.com/talks?event=tedx&page=9&sort=newest',
 'https://www.ted.com/talks?event=tedx&page=10&sort=newest',
 'https://www.ted.com/talks?event=tedx&page=11&sort=newest',
 'https://www.ted.com/talks?event=tedx&page=12&sort=newest',
 'https://www.ted.com/talks?event=tedx&page=13&sort=newest',
 'https://www.ted.com/talks?event=tedx&page=14&sort=newest',
 'https://www.ted.com/talks?event=tedx&page=15&sort=newest',
 'https://www.ted.com/talks?event=tedx&page=16&sort=newest',
 'https://www.ted.com/talks?event=te

In [3]:
from collections import OrderedDict

out = OrderedDict()

out['page'] = []
out['pagesource'] = []
out['title'] = []
out['speaker'] = []
out['dateposted'] = []
out['rated'] = []
out['duration'] = []
out['link'] = []


for i_site,site in enumerate(websites):
    r       = requests.get(site)
    soup    = BeautifulSoup(r.text, 'html.parser')
    results = soup.find_all('div', attrs={'class':'media media--sm-v'}) 

    for i_res,res in enumerate(results):
        
        title      = res.findAll('a')[1].text.replace('\n','')
        speaker    = res.find('h4').text
        dateposted = res.find(name='span', attrs={'class':'meta__val'}).text.replace('\n', '')
        duration   = res.find(name='span', attrs={'class', 'thumb__duration'}).text
        link       = res.find('a')['href']
        link       = site.split('/talks')[0] + link # add 'https://www.ted.com to beginning
        
        # NOT ALL TALKS HAVE RATED.
        if res.find(name='span', attrs={'class':'meta__row'}):
            rated = res.find(name='span', attrs={'class':'meta__row'}).text.replace('\n','')
        else:
            rated = ''
        
        
        # APPEND TO DICTIONARY.
        out['page'].append(i_site+1)
        out['pagesource'].append(site)  # or r.url
        out['title'].append(title)
        out['speaker'].append(speaker)
        out['dateposted'].append(dateposted)
        out['rated'].append(rated)
        out['duration'].append(duration)
        out['link'].append(link)
        

In [5]:
import pandas as pd

In [6]:
data = pd.DataFrame(out)

In [7]:
data.head()

Unnamed: 0,page,pagesource,title,speaker,dateposted,rated,duration,link
0,1,https://www.ted.com/talks?event=tedx&amp;sort=...,The power of appreciation,Mike Robbins,Jan 2019,,18:06,https://www.ted.com/talks/mike_robbins_the_pow...
1,1,https://www.ted.com/talks?event=tedx&amp;sort=...,The ruralities of autism,Amy Price Azano,Jan 2019,,12:31,https://www.ted.com/talks/amy_price_azano_the_...
2,1,https://www.ted.com/talks?event=tedx&amp;sort=...,How stigma shaped modern medicine,Nathalia Holt,Jan 2019,,15:30,https://www.ted.com/talks/nathalia_holt_how_st...
3,1,https://www.ted.com/talks?event=tedx&amp;sort=...,3 ways to build a happy marriage and avoid div...,George Blair-West,Jan 2019,,11:13,https://www.ted.com/talks/george_blair_west_3_...
4,1,https://www.ted.com/talks?event=tedx&amp;sort=...,A mother and son's photographic journey throug...,Tony Luciani,Jan 2019,,13:32,https://www.ted.com/talks/tony_luciani_a_mothe...


In [10]:
data.page.value_counts() # this is correct. last page only has 32 videos.

9     36
8     36
2     36
3     36
4     36
5     36
6     36
7     36
1     36
16    36
10    36
11    36
12    36
13    36
14    36
15    36
17    32
Name: page, dtype: int64

In [11]:
data.to_csv('data/ted_talks.csv', index=False, encoding='utf-8')