<b style="font-size:40px;">Web scraping de canales políticos</b><br><br>
Este notebook automatiza la recopilación de datos de diferentes canales de YouTube, accediendo a varias pestañas como videos, shorts, streams, podcasts, playlists y publicaciones. Utiliza peticiones HTTP y técnicas de web scraping para guardar la información en archivos JSON organizados por canal y tipo de contenido. El flujo incluye la selección dinámica de agentes de usuario, manejo de rutas de almacenamiento y pausas aleatorias para evitar bloqueos por parte de la plataforma.


In [None]:
import json

import requests
import re
import json
import random

from datetime import datetime
import os

import time
from itertools import product
from pathlib import Path


In [None]:
# Pool of desktop browser User-Agent strings; one is picked at random for
# each HTTP request made by the scraper.
USER_AGENTS = [
    # Chrome 124 on Windows 10
    (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/124.0.0.0 Safari/537.36"
    ),
    # Chrome 123 on Windows 10
    (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/123.0.0.0 Safari/537.36"
    ),
    # Chrome 124 on macOS
    (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_3) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/124.0.0.0 Safari/537.36"
    ),
    # Firefox 125 on Windows 10
    (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:125.0) "
        "Gecko/20100101 Firefox/125.0"
    ),
    # Safari 16.3 on macOS
    (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_3) "
        "AppleWebKit/605.1.15 (KHTML, like Gecko) "
        "Version/16.3 Safari/605.1.15"
    ),
    # Edge 124 on Windows 10
    (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0"
    ),
]

# YouTube channel handles (the part after "@" in the channel URL) to scrape.
CHANNELS = [
    "turnoenvivo",
    "delosquesobran",
    "SINFILTROSTV",
    "FNM_Chile",
    "politika_chilena_2025",
    "ChileanPolitics",
    "ElVillegaschile",
    "CanalElCiudadano",
    "Lamanoinvisible1723",
]

# Channel tabs to download; each value is used as the last path segment of
# the channel URL.
TABS = [
    "featured",
    "videos",
    "shorts",
    "streams",
    "podcasts",
    "playlists",
    "posts",
]

In [183]:
def get_data_from_video_tab(canal, tab='videos', timeout=30):
    """Fetch one tab of a YouTube channel page and return its ``ytInitialData``.

    The page HTML is downloaded with a randomly chosen User-Agent and the
    JSON literal assigned to ``var ytInitialData`` is extracted and decoded.

    Args:
        canal: Channel handle, without the leading ``@``.
        tab: Tab name used as the last path segment of the channel URL.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The Python object decoded from the embedded ``ytInitialData`` JSON.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            4xx/5xx response status.
        ValueError: If the page has no ``ytInitialData`` assignment, or the
            captured text is not valid JSON (``json.JSONDecodeError`` is a
            subclass of ``ValueError``).
    """
    url = f"https://www.youtube.com/@{canal}/{tab}"

    headers = {
        "User-Agent": random.choice(USER_AGENTS),
        "Accept-Language": "es-ES,es;q=0.9",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Cache-Control": "no-cache",
        "Pragma": "no-cache",
        "Connection": "keep-alive",
    }

    # Without a timeout a stalled connection would block the run forever.
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    response.raise_for_status()

    # Non-greedy capture of the object literal up to the end of its <script>.
    pattern = r"var ytInitialData = ({.*?});</script>"
    match = re.search(pattern, response.text, re.DOTALL)
    if match is None:
        raise ValueError(f"ytInitialData not found in {url}")

    return json.loads(match.group(1))


In [5]:
# Root directory for the scraped output: the current directory when the
# GITHUB_ACTIONS environment variable is "true", otherwise the parent
# directory (presumably because the notebook sits one level below the
# repository root when run locally -- confirm against the repo layout).
base_path = Path("") if os.environ.get("GITHUB_ACTIONS") == "true" else Path("..")

In [None]:
# Scrape every (channel, tab) combination and store each payload as a
# timestamped JSON file under
# <base_path>/data-scraped/channels/<channel in lowercase>/tab-<tab>/.
for _channel, _tab in product(CHANNELS, TABS):

    data = get_data_from_video_tab(_channel, _tab)

    folder = base_path / "data-scraped" / "channels" / _channel.lower() / f"tab-{_tab}"
    folder.mkdir(parents=True, exist_ok=True)

    # The file name is the local timestamp of the download (yymmdd_HHMMSS).
    now = datetime.now()
    file_name = folder / f"{now.strftime('%y%m%d_%H%M%S')}.json"

    with open(file_name, 'w', encoding='utf-8') as file:
        json.dump(data, file, ensure_ascii=False, indent=4)

    # Report only after the file has actually been written.
    print("JSON exportado en:", file_name)

    # Random pause between requests to reduce the chance of being blocked.
    time.sleep(random.uniform(1.5, 4.0))


JSON exportado en: ..\data-scraped\channels\turnoenvivo\tab-featured\250711_210722.json
JSON exportado en: ..\data-scraped\channels\turnoenvivo\tab-videos\250711_210724.json
JSON exportado en: ..\data-scraped\channels\turnoenvivo\tab-shorts\250711_210727.json
JSON exportado en: ..\data-scraped\channels\turnoenvivo\tab-streams\250711_210730.json
JSON exportado en: ..\data-scraped\channels\turnoenvivo\tab-podcasts\250711_210734.json
JSON exportado en: ..\data-scraped\channels\turnoenvivo\tab-playlists\250711_210736.json
JSON exportado en: ..\data-scraped\channels\turnoenvivo\tab-posts\250711_210739.json
JSON exportado en: ..\data-scraped\channels\delosquesobran\tab-featured\250711_210741.json
JSON exportado en: ..\data-scraped\channels\delosquesobran\tab-videos\250711_210745.json
JSON exportado en: ..\data-scraped\channels\delosquesobran\tab-shorts\250711_210748.json
JSON exportado en: ..\data-scraped\channels\delosquesobran\tab-streams\250711_210752.json
JSON exportado en: ..\data-scrap