<a href="https://colab.research.google.com/github/hybrits/H-ALLY/blob/main/web_scraping_Techsupport_Forum_(Descarga%20el%20CSV%20cada%2010%20paginas).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
from bs4 import BeautifulSoup
import csv
import time
from urllib.parse import urljoin
import io

def get_headers():
    """Return the HTTP request headers sent with every request.

    The values imitate a desktop Chrome browser on Windows. A new dict is
    built on each call, so callers may mutate the result freely.
    """
    headers = {}
    headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
    headers['Accept-Language'] = 'en-US,en;q=0.5'
    headers['Accept-Encoding'] = 'gzip, deflate, br'
    headers['DNT'] = '1'
    headers['Connection'] = 'keep-alive'
    headers['Upgrade-Insecure-Requests'] = '1'
    return headers

def get_next_page_url(soup, base_url):
    """Return the absolute URL of the next listing page, or None.

    Args:
        soup: Parsed listing page; searched for an ``<a>`` element with the
            ``pageNav-jump--next`` class.
        base_url: URL against which the link's ``href`` is resolved.

    Returns:
        The resolved URL, or None when the link is absent or has no ``href``.
    """
    anchor = soup.find('a', class_='pageNav-jump--next')
    if not anchor or 'href' not in anchor.attrs:
        return None
    return urljoin(base_url, anchor['href'])

def _parse_thread(thread_soup, thread_url):
    """Extract the title and every post found in one parsed thread page.

    Args:
        thread_soup: Parsed HTML of the thread page.
        thread_url: URL the page was fetched from; stored in the result.

    Returns:
        A dict with keys ``title``, ``url`` and ``responses``; ``responses``
        is a list of dicts with keys ``author``, ``date`` and ``content``.
        Missing elements are replaced by placeholder strings.
    """
    title_element = thread_soup.find(
        ['h1', 'h2', 'h3'],
        class_=['p-title-value', 'thread-title', 'MessageCard__thread-title'],
    )
    title = title_element.text.strip() if title_element else "Título no encontrado"

    posts = []
    for post in thread_soup.find_all('div', class_='MessageCard__container'):
        author_element = post.find('a', class_='MessageCard__user-info__name')
        author = author_element.text.strip() if author_element else "Autor desconocido"

        date_element = post.find('time', class_='u-dt')
        # .get() so a <time> tag lacking a datetime attribute yields the
        # placeholder instead of raising KeyError and aborting the crawl.
        date = date_element.get('datetime', "Fecha desconocida") if date_element else "Fecha desconocida"

        content_element = post.find('div', class_='bbWrapper')
        content = content_element.text.strip() if content_element else "Contenido no encontrado"

        posts.append({
            'author': author,
            'date': date,
            'content': content
        })

    return {
        'title': title,
        'url': thread_url,
        'responses': posts
    }

def scrape_forum(base_url, timeout=30, pages_per_file=10):
    """Crawl a forum listing page by page and dump the threads to CSV files.

    Every thread linked from a listing page is fetched and parsed. Collected
    threads are written with ``save_to_csv`` each time ``pages_per_file``
    pages have been processed; whatever is still pending when the crawl ends
    (no next page, or a listing page could not be fetched) is written to
    ``foro_data_paginas_finales.csv``.

    Args:
        base_url: URL of the first listing page; also the base against which
            thread and pagination links are resolved.
        timeout: Seconds to wait for each HTTP request before giving up.
        pages_per_file: Number of listing pages grouped into each CSV file.

    Returns:
        The number of listing pages that were fetched successfully.

    Raises:
        ValueError: If ``pages_per_file`` is smaller than 1.
    """
    if pages_per_file < 1:
        raise ValueError("pages_per_file must be at least 1")

    print(f"Accediendo a la URL base: {base_url}")
    headers = get_headers()
    all_data = []
    current_url = base_url
    page_count = 0

    while current_url:
        try:
            # Without a timeout a stalled connection would block forever.
            response = requests.get(current_url, headers=headers, timeout=timeout)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Error al acceder a la URL: {e}")
            break

        # Count the page only once it has actually been retrieved.
        page_count += 1
        soup = BeautifulSoup(response.text, 'html.parser')
        thread_links = soup.find_all('a', class_='thread-title--gtm')

        print(f"Encontrados {len(thread_links)} enlaces de hilos en la página {page_count}")

        for i, link in enumerate(thread_links, 1):
            href = link.get('href')
            if not href:
                # An anchor without href cannot be followed; skip it.
                continue
            thread_url = urljoin(base_url, href)
            print(f"Procesando hilo {i}/{len(thread_links)}: {thread_url}")

            try:
                thread_response = requests.get(thread_url, headers=headers, timeout=timeout)
                thread_response.raise_for_status()
            except requests.RequestException as e:
                print(f"Error al acceder al hilo: {e}")
                continue

            thread_soup = BeautifulSoup(thread_response.text, 'html.parser')
            thread_data = _parse_thread(thread_soup, thread_url)

            all_data.append(thread_data)
            print(f"Extraídas {len(thread_data['responses'])} respuestas del hilo")
            time.sleep(2)  # pause between thread requests

        if page_count % pages_per_file == 0:
            first_page = page_count - pages_per_file + 1
            save_to_csv(all_data, f'foro_data_paginas_{first_page}_a_{page_count}.csv')
            all_data = []  # start a fresh batch after flushing

        current_url = get_next_page_url(soup, base_url)
        if current_url:
            print(f"Pasando a la siguiente página: {current_url}")
            time.sleep(5)  # pause between listing pages
        else:
            print("No hay más páginas para procesar")

    # Flush after the loop so pending data is kept both on normal completion
    # and when a failed listing-page request ended the crawl early.
    if all_data:
        save_to_csv(all_data, 'foro_data_paginas_finales.csv')

    return page_count

def save_to_csv(data, filename):
    """Write scraped threads to a UTF-8 CSV file, one row per post.

    Args:
        data: List of thread dicts, each with ``title``, ``url`` and a
            ``responses`` list of dicts holding ``author``, ``date`` and
            ``content``.
        filename: Path of the CSV file to create or overwrite.

    When ``data`` is empty only a message is printed and no file is written.
    """
    if data:
        print(f"Guardando datos de {len(data)} hilos en {filename}")
        # Lazily flatten threads -> posts into CSV rows.
        rows = (
            [hilo['title'], hilo['url'], post['author'], post['date'], post['content']]
            for hilo in data
            for post in hilo['responses']
        )
        with open(filename, 'w', newline='', encoding='utf-8') as out:
            csv_out = csv.writer(out)
            csv_out.writerow(['Título', 'URL', 'Autor', 'Fecha', 'Contenido'])
            csv_out.writerows(rows)
        print(f"Archivo '{filename}' guardado localmente")
    else:
        print(f"No hay datos para guardar en {filename}")

# Script usage: crawl the forum listing starting at base_url, then print how
# many pages scrape_forum reported. This runs as soon as the cell/module is
# executed.
base_url = "https://www.techsupportforum.com/forums/overclocking.273/"
total_pages = scrape_forum(base_url)
print(f"Proceso completado. Se procesaron {total_pages} páginas en total.")

Accediendo a la URL base: https://www.techsupportforum.com/forums/overclocking.273/
Encontrados 38 enlaces de hilos en la página 1
Procesando hilo 1/38: https://www.techsupportforum.com/threads/how-to-overclock-an-i2500k.652418/
Extraídas 2 respuestas del hilo
Procesando hilo 2/38: https://www.techsupportforum.com/threads/how-to-overclock-a-core-2-duo-or-quad.637592/
Extraídas 2 respuestas del hilo
Procesando hilo 3/38: https://www.techsupportforum.com/threads/if-you-are-new-to-overclocking-read-here-for-starters.232382/
Extraídas 3 respuestas del hilo
Procesando hilo 4/38: https://www.techsupportforum.com/threads/will-the-rx-7700-xt-for-gpu-and-5-7600-for-cpu-bottleneck.1260964/
Extraídas 5 respuestas del hilo
Procesando hilo 5/38: https://www.techsupportforum.com/threads/good-question.1260609/
Extraídas 3 respuestas del hilo
Procesando hilo 6/38: https://www.techsupportforum.com/threads/few-question-before-i-undervolt.1259990/
Extraídas 20 respuestas del hilo
Procesando hilo 7/38: ht