In [1]:
import json
import os

import requests
from bs4 import BeautifulSoup

In [2]:
# Pages to scrape, grouped by site: TED talk transcripts (pt-br) first, then
# Olhar Digital articles and columns, then a single StartSe article.
urls = [
    'https://www.ted.com/talks/helen_czerski_the_fascinating_physics_of_everyday_life/transcript?language=pt-br#t-81674',
    'https://www.ted.com/talks/kevin_kelly_how_ai_can_bring_on_a_second_industrial_revolution/transcript?language=pt-br',
    'https://www.ted.com/talks/sarah_parcak_help_discover_ancient_ruins_before_it_s_too_late/transcript?language=pt-br',
    'https://www.ted.com/talks/sylvain_duranton_how_humans_and_ai_can_work_together_to_create_better_businesses/transcript?language=pt-br',
    'https://www.ted.com/talks/chieko_asakawa_how_new_technology_helps_blind_people_explore_the_world/transcript?language=pt-br',
    'https://www.ted.com/talks/pierre_barreau_how_ai_could_compose_a_personalized_soundtrack_to_your_life/transcript?language=pt-br',
    'https://www.ted.com/talks/tom_gruber_how_ai_can_enhance_our_memory_work_and_social_lives/transcript?language=pt-br',
    'https://olhardigital.com.br/colunistas/wagner_sanchez/post/o_futuro_cada_vez_mais_perto/78972',
    'https://olhardigital.com.br/colunistas/wagner_sanchez/post/os_riscos_do_machine_learning/80584',
    'https://olhardigital.com.br/ciencia-e-espaco/noticia/nova-teoria-diz-que-passado-presente-e-futuro-coexistem/97786',
    'https://olhardigital.com.br/noticia/inteligencia-artificial-da-ibm-consegue-prever-cancer-de-mama/87030',
    'https://olhardigital.com.br/ciencia-e-espaco/noticia/inteligencia-artificial-ajuda-a-nasa-a-projetar-novos-trajes-espaciais/102772',
    'https://olhardigital.com.br/colunistas/jorge_vargas_neto/post/como_a_inteligencia_artificial_pode_mudar_o_cenario_de_oferta_de_credito/78999',
    'https://olhardigital.com.br/ciencia-e-espaco/noticia/cientistas-criam-programa-poderoso-que-aprimora-deteccao-de-galaxias/100683',
    'https://www.startse.com/noticia/startups/mobtech/deep-learning-o-cerebro-dos-carros-autonomos'
]

In [60]:
class Page(object):
    """Base scraper: downloads one URL and exposes its parsed content.

    Subclasses set ``self.type`` and override ``get_body``, ``get_author`` and
    ``get_title`` with site-specific extraction; the base versions return ''.
    """

    # Fallback so get_type() (and therefore __str__ / get_data) works on a
    # bare Page; subclasses overwrite it per instance in their __init__.
    type = ''

    def __init__(self, url):
        """Fetch ``url`` with requests and parse the response body."""
        self.url = url
        page = requests.get(self.url)
        # NOTE(review): no parser is named here, so bs4 chooses one itself and
        # the parse tree may differ between environments -- confirm which
        # parser the saved outputs were produced with before pinning one.
        self.soup = BeautifulSoup(page.content)

    def __str__(self):
        """Return a short multi-line summary with an abbreviated body."""
        body = self.get_body()
        edge = 25
        if len(body) > 2 * edge:
            preview = body[:edge] + '... ...' + body[-edge:]
        else:
            # Short bodies are shown whole; slicing both ends would repeat
            # the same characters on either side of the ellipsis.
            preview = body
        return 'title: {0}\nauthor: {1}\ntype: {2}\nbody: {3}\nurl: {4}\n'.format(
            self.get_title(),
            self.get_author(),
            self.get_type(),
            preview,
            self.get_url()
        )

    def get_url(self):
        """Return the URL this page was built from."""
        return self.url

    def get_type(self):
        """Return the content type label ('' unless a subclass set one)."""
        return self.type

    def get_all_paragraphs(self, soup=None, skip=()):
        """Join the stripped text of every ``<p>`` element with newlines.

        Args:
            soup: element to search; defaults to the whole parsed page.
            skip: iterable of substrings; a paragraph whose text contains
                any of them is left out.

        Returns:
            The remaining paragraph texts joined by '\\n', stripped.
        """
        if soup is None:
            soup = self.soup
        kept = [p.text.strip() for p in soup.find_all('p')
                if not any(expr in p.text for expr in skip)]
        return '\n'.join(kept).strip()

    def get_data(self):
        """Collect title, author, body, type and url into ``self.page_data``.

        Returns:
            ``self``, so calls can be chained.
        """
        self.page_data = {
            'title': self.get_title(),
            'author': self.get_author(),
            'body': self.get_body(),
            'type': self.get_type(),
            'url': self.get_url()
        }
        return self

    def to_json(self, file_name):
        """Write ``self.page_data`` to ``file_name`` as indented UTF-8 JSON.

        If ``get_data()`` has not been called yet it is called first.

        Returns:
            ``self``, so calls can be chained.
        """
        if not hasattr(self, 'page_data'):
            self.get_data()
        # ensure_ascii=False emits raw non-ASCII characters, so the file
        # encoding must not be left to the platform's locale default.
        with open(file_name, 'w', encoding='utf-8') as json_file:
            json.dump(self.page_data, json_file, indent=2, ensure_ascii=False)
        return self

    def get_body(self):
        """Return the main text; '' in the base class."""
        return ''

    def get_author(self):
        """Return the author name; '' in the base class."""
        return ''

    def get_title(self):
        """Return the page title; '' in the base class."""
        return ''


class Ted(Page):
    """Scraper for TED talk transcript pages; reports type 'video'."""

    def __init__(self, url):
        """Download and parse the transcript page at ``url``."""
        super().__init__(url)
        self.type = 'video'

    def get_body(self):
        """Return the paragraph text with tabs and the TED footer removed."""
        # Footer text that would otherwise end up inside the scraped body.
        copyright_notice = ('\nTED.com translations' +
        ' are made possible by volunteer\ntranslators.' +
        ' Learn more about the\nOpen Translation Project.' +
        '\n© TED Conferences, LLC. All rights reserved.')
        text = self.get_all_paragraphs()\
            .replace('\t', '')\
            .replace(copyright_notice, '')
        return text

    def get_author(self):
        """Return the part of ``<title>`` before the first ': '.

        If the title has no ': ' the whole title text is returned.
        """
        return self.soup.find('title').text.split(': ', 1)[0]

    def get_title(self):
        """Return the talk title taken from the page ``<title>``.

        The text after the first ': ' is used when that separator exists,
        otherwise the whole ``<title>`` text. The TED site suffix is removed
        in both cases.
        """
        suffix = ' | TED Talk Subtitles and Transcript | TED'
        page_title = self.soup.find('title').text
        parts = page_title.split(': ', 1)
        talk_title = parts[1] if len(parts) > 1 else page_title
        return talk_title.replace(suffix, '')

class OlharDigital(Page):
    """Scraper for olhardigital.com.br pages; reports type 'article'."""

    def __init__(self, url):
        """Download and parse the article page at ``url``."""
        super().__init__(url)
        self.type = 'article'

    def get_body(self):
        """Return the first ``<h2>`` text followed by the page paragraphs.

        Paragraphs containing 'Via:' or 'Fonte:' are left out.
        """
        subtitle = self.soup.h2.text
        return subtitle + '\n' + self.get_all_paragraphs(
            skip=['Via:', 'Fonte:'])

    def get_title(self):
        """Return the text of the first ``<h1 class="mat-tit">``.

        Raises:
            IndexError: if the page has no such element.
        """
        return self.soup.find_all('h1', {'class': 'mat-tit'})[0].text

    def get_author(self):
        """Return the byline from the first ``<span class="meta-aut">``.

        Anything from ', editado por' onwards is cut off.

        Raises:
            IndexError: if the page has no such element.
        """
        authors = self.soup.find_all('span', {'class': 'meta-aut'})[0].text
        return authors.split(', editado por')[0]


class StartSe(Page):
    """Scraper for startse.com pages; reports type 'article'."""

    def __init__(self, url):
        """Download and parse the article page at ``url``."""
        super().__init__(url)
        self.type = 'article'

    def get_body(self):
        """Return the paragraphs of the first article content ``<div>``.

        Paragraphs containing 'Compartilhe em sua rede:' or '*Foto:' are
        left out.

        Raises:
            IndexError: if the content container is not on the page.
        """
        content = self.soup.find_all(
            'div',
            {'class': 'content-single__sidebar-content__content'}
        )
        return self.get_all_paragraphs(
            content[0],
            skip=['Compartilhe em sua rede:', '*Foto:']
        )

    def get_title(self):
        """Return the text of the first ``<h2 class="title-single__title__name">``.

        Raises:
            IndexError: if the page has no such element.
        """
        return self.soup.find_all('h2', {'class': 'title-single__title__name'})[0].text

    def get_author(self):
        """Return the text of the first link inside the author-name ``<h4>``.

        Raises:
            IndexError: if the heading or its link is missing.
        """
        author_heading = self.soup.find_all(
            'h4', {'class': 'title-single__info__author__about__name'})[0]
        return author_heading.find_all('a')[0].text

In [61]:
# Scrape every URL with the scraper for its site and save one JSON file per
# page under dados/, numbered from 1 in list order.

# Create the output directory up front so a fresh checkout can run the cell.
os.makedirs('dados', exist_ok=True)

for i, url in enumerate(urls, start=1):
    # Match on the URL prefix so a site name appearing later in some other
    # URL (e.g. inside a query string) cannot select the wrong scraper.
    if url.startswith('https://www.ted.com'):
        page = Ted(url)
    elif url.startswith('https://olhardigital.com.br'):
        page = OlharDigital(url)
    else:
        # Anything else is handled as a StartSe article.
        page = StartSe(url)
    page.get_data()
    print(page)
    page.to_json('dados/{}.json'.format(i))

title: A fascinante física do dia a dia
author: Helen Czerski
type: video
body: Como ouviram, sou uma fís... ...igada.
(Aplausos) (Vivas)
url: https://www.ted.com/talks/helen_czerski_the_fascinating_physics_of_everyday_life/transcript?language=pt-br#t-81674

title: Como a Inteligência Artificial pode provocar uma segunda Revolução Industrial
author: Kevin Kelly
type: video
body: Falarei um pouco sobre o ... ...igado.
(Risos)
(Aplausos)
url: https://www.ted.com/talks/kevin_kelly_how_ai_can_bring_on_a_second_industrial_revolution/transcript?language=pt-br

title: Ajudem a descobrir ruínas antigas  — antes que seja tarde demais.
author: Sarah Parcak
type: video
body: Como arqueóloga,
vivem me... ...igada.
(Aplausos) (Vivas)
url: https://www.ted.com/talks/sarah_parcak_help_discover_ancient_ruins_before_it_s_too_late/transcript?language=pt-br

title: Como as pessoas e a inteligência artificial podem trabalhar juntas para criar melhores negócios
author: Sylvain Duranton
type: video
body: Per