In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import time
import pandas as pd
from bs4 import BeautifulSoup
import re
import sqlite3

### Scraping Cuenta de Resultados

In [2]:
# Función para limpiar y convertir valores a numéricos
def convertir_a_numerico(valor):
  if isinstance(valor, str):
    # Extraer números, puntos y comas
    valor = re.sub(r'[^\d.,-]', '', valor)
    # Reemplazar comas por puntos si es necesario
    valor = valor.replace('.', '')
    valor = valor.replace(',', '.')
    try:
      return float(valor)
    except ValueError:
      return None
  return valor

def scrap_resultados(tk):
    dfs = []
    driver = webdriver.Chrome()
    try:
        # Abrir la URL
        url = f"https://es.investing.com/equities/{tk}-income-statement"
        driver.get(url)

        # Esperar a que al menos una tabla esté presente
        
        wait = WebDriverWait(driver, 5)
        wait.until(EC.presence_of_element_located((By.TAG_NAME, "table")))

        # Obtener el HTML de la página
        page_html = driver.page_source

        # Parsear el HTML de la página con BeautifulSoup
        soup = BeautifulSoup(page_html, "html.parser")

        # Encontrar todas las tablas en la página
        tables = soup.find_all("table")

        # Procesar cada tabla
        for i, table in enumerate(tables):
            # Extraer las filas de la tabla
            rows = table.find_all("tr")
            data = []
            for row in rows:
                cells = row.find_all(["td", "th"])  # Buscar celdas (td) y encabezados (th)
                cell_texts = [cell.get_text(strip=True) for cell in cells]
                data.append(cell_texts)

            # Convertir los datos en un DataFrame de Pandas
            df = pd.DataFrame(data)
            dfs.append(df)
        print("exito al procesar!")

    except TimeoutException:
        print("Error: No se encontraron tablas en la página.")
    except Exception as e:
        print(f"Error: {e}")
    finally:
        # Cerrar el navegador
        driver.quit()
        time.sleep(5)
    return dfs
  
def procesar_tab_resultados(df):
  data = pd.DataFrame()
   
  data = df[1].replace("aa.aa",None).reset_index(drop=True)
  data = data.drop(data[data[1]==""].index,axis=0)
  data = data[data[1].notna()]
  data = data.drop(columns=[0])
  data.columns = data.iloc[0]
  data.drop(0,inplace=True,axis=0)
  data = data.dropna(axis=1,how="all")
  data.rename(columns={data.columns[0]:"indicador"},inplace=True)


  # Aplicar la función a todas las columnas excepto 'company'
  ejercicios = [col for col in data.columns if col.startswith("20")]
  for col in ejercicios:  
    data[col] = data[col].apply(convertir_a_numerico)
  
  data.rename(columns=lambda x: x[:4] if x.startswith('20') else x, inplace=True)

  return data.set_index("indicador")

def get_resultados(tk):
  dfs = scrap_resultados(tk)
  if len(dfs) > 0:
    df = procesar_tab_resultados(dfs)
    df["company"] = tk
    df["informe"] = "resultados"
    df["seccion"] = ""
    return df
  else:
    return None
  

### Scraping hoja de Balance (situacion patrimonial)

In [3]:

def scrap_balance_sheet(tk):
    dfs = []
    # Configurar el navegador
    # chrome_options = Options()
    # chrome_options.add_argument("--headless")
    driver = webdriver.Chrome()
    try:
        # Abrir la URL
        url = f"https://es.investing.com/equities/{tk}-balance-sheet"
        driver.get(url)

        # Esperar a que al menos una tabla esté presente
        
        wait = WebDriverWait(driver, 5)
        wait.until(EC.presence_of_element_located((By.TAG_NAME, "table")))

        # Obtener el HTML de la página
        page_html = driver.page_source

        # Parsear el HTML de la página con BeautifulSoup
        soup = BeautifulSoup(page_html, "html.parser")

        # Encontrar todas las tablas en la página
        tables = soup.find_all("table")

        # Procesar cada tabla
        for i, table in enumerate(tables):
            # Extraer las filas de la tabla
            rows = table.find_all("tr")
            data = []
            for row in rows:
                cells = row.find_all(["td", "th"])  # Buscar celdas (td) y encabezados (th)
                cell_texts = [cell.get_text(strip=True) for cell in cells]
                data.append(cell_texts)

            # Convertir los datos en un DataFrame de Pandas
            df = pd.DataFrame(data)
            dfs.append(df)
        print("exito al procesar!")

    except TimeoutException:
        print("Error: No se encontraron tablas en la página.")
    except Exception as e:
        print(f"Error: {e}")
    finally:
        # Cerrar el navegador
        driver.quit()
        time.sleep(5)
    return dfs
  
def procesar_tab(df):
  titulos = df[1].T[0].dropna().reset_index(drop=True).tolist()[1:]
  titulos[0] = "indicador"

  result = pd.DataFrame()
   
  estructura = [
  {"seccion":"activo_corriente", "rubros":2,"valores":3},
  {"seccion":"activo_total", "rubros":4,"valores":5},
  {"seccion":"pasivo_corriente", "rubros":6,"valores":7},
  {"seccion":"pasivo_total", "rubros":8,"valores":9},
  {"seccion":"patrimonio_neto", "rubros":10,"valores":11},
  {"seccion":"crecimiento_deuda", "rubros":14,"valores":15},
]
  for e in estructura:
    try:
      data =  pd.concat([df[e["rubros"]][1],df[e["valores"]]], axis=1)
      data.columns = titulos
      data.replace("aa.aa",None,inplace=True)
      data.dropna(inplace=True, axis=1)
      data["seccion"] = e["seccion"]
      
      result = pd.concat([result, data.reset_index(drop=True)], axis=0)
    except:
      pass
  
    # Aplicar la función a todas las columnas excepto 'company'
  ejercicios = [col for col in result.columns if col.startswith("20")]
  for col in ejercicios:  # se aplica a las columnas de ejercicios
    result[col] = result[col].apply(convertir_a_numerico)
  
  result.rename(columns=lambda x: x[:4] if x.startswith('20') else x, inplace=True)
  
  return result.set_index("indicador")

def get_balance_sheet(tk):
  dfs = scrap_balance_sheet(tk)
  if len(dfs) > 0:
    df = procesar_tab(dfs)
    df["company"] = tk
    df["informe"] = "balance"
    return df
  else:
    return None

### Guardar datos anuales en bbdd

In [4]:
def save_datos_anuales(df):
    # Conectar a la base de datos (o crearla si no existe)
    conn = sqlite3.connect('webmining.db')
    cursor = conn.cursor()
    cursor.execute('''
    CREATE TABLE IF NOT EXISTS datos_anuales (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        company TEXT NOT NULL,
        informe TEXT NOT NULL,
        ejercicio INTEGER NOT NULL,
        seccion TEXT NULL,
        indicador TEXT NOT NULL,
        valor REAL
    )
    ''')
    # Insertar datos
    ejercicios = [col for col in df.columns if col.isnumeric()]
    for ejercicio in ejercicios:
        # eliminar los datos para evitar duplicados
        company = df['company'][0]
        informe = df['informe'][0]
        cursor.execute(f"""
                    DELETE FROM datos_anuales 
                    WHERE company = '{company}' AND 
                    informe = '{informe}' AND 
                    ejercicio = {ejercicio}
                    """)
        conn.commit()
        
        # Estructurar e insertar los datos
        df_ej = df[[ejercicio, "company", "informe","seccion"]].copy().reset_index()
        df_ej["ejercicio"] = int(ejercicio)
        df_ej.rename(columns={ejercicio: "valor"}, inplace=True)
        df_ej.dropna(subset=["valor"], inplace=True)
        df_ej.to_sql('datos_anuales', conn, if_exists='append', index=False)
        # Guardar los cambios
        conn.commit()
    # Cerrar la conexión
    conn.close()

In [5]:
companies = [
  # "apple-computer-inc",
  # "microsoft-corp",
  # "google-inc",
  # "tesla-motors",
  # "visa-inc",
  # "berkshire-hathaway-inc",
  # "ypf-sa",
  # "pfizer",
  # "amazon-com-inc",
  # "disney",
  # "nike",
  # "procter-gamble",
  # "coca-cola-co",
  # "chevron",
  # "3m-co"
  "nvidia-corp",
  "intel-corp",
  "adv-micro-device",
  "caterpillar",
  "ford-motor-co",
  "deere---co",
  "delta-air-lines-new",
  "boeing-co",
  "at-t",
  "mcdonalds",
    
  
]

In [6]:
for company in companies:
    print(f"Procesando {company}...")
    try:
        # Procesar los datos
        df_resultados = get_resultados(company)
        if df_resultados is not None:
            save_datos_anuales(df_resultados)
        df_balance = get_balance_sheet(company)
        if df_balance is not None:
            save_datos_anuales(df_balance)
        print(f"{company} procesado con éxito!")
    except Exception as e:
        print(f"Error al procesar {company}: {e}")
        

Procesando nvidia-corp...
exito al procesar!


  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]


exito al procesar!


  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]


nvidia-corp procesado con éxito!
Procesando intel-corp...
exito al procesar!


  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]


exito al procesar!


  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]


intel-corp procesado con éxito!
Procesando adv-micro-device...
exito al procesar!


  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]


exito al procesar!


  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]


adv-micro-device procesado con éxito!
Procesando caterpillar...
exito al procesar!


  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]


exito al procesar!


  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]


caterpillar procesado con éxito!
Procesando ford-motor-co...
exito al procesar!


  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]


exito al procesar!


  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]


ford-motor-co procesado con éxito!
Procesando deere---co...
exito al procesar!


  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]


exito al procesar!


  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]


deere---co procesado con éxito!
Procesando delta-air-lines-new...
exito al procesar!


  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]


exito al procesar!


  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]


delta-air-lines-new procesado con éxito!
Procesando boeing-co...
exito al procesar!


  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]


exito al procesar!


  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]


boeing-co procesado con éxito!
Procesando at-t...
exito al procesar!


  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]


exito al procesar!


  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]


at-t procesado con éxito!
Procesando mcdonalds...
exito al procesar!


  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]


exito al procesar!


  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]


mcdonalds procesado con éxito!


  company = df['company'][0]
  informe = df['informe'][0]
  company = df['company'][0]
  informe = df['informe'][0]


In [4]:

# Consultar datos
conn = sqlite3.connect('webmining.db')
cursor = conn.cursor()
cursor.execute("SELECT * FROM datos_anuales WHERE company = '3m-co' AND informe = 'resultados'")
filas = cursor.fetchall()

for fila in filas:
    print(fila)

In [3]:
conn = sqlite3.connect('webmining.db')
cursor = conn.cursor()
cursor.execute("SELECT COUNT(*) FROM datos_anuales")
numero_registros = cursor.fetchone()[0]
numero_registros

5309

In [None]:
companies = [
  {"apple-computer-inc","AAPL"},
  {"microsoft-corp","MSFT"},
  {"google-inc","GOOGL"},
  {"tesla-motors","TSLA"},
  {"visa-inc","V"},
  {"berkshire-hathaway-inc","BRK.A"},
  {"johnson-johnson","JNJ"},
  {"pfizer","PFE"},
  {"amazon-com-inc","AMZN"},
  {"disney","DIS"},
  {"nike","NKE"},
  {"procter-gamble","PG"},
  {"coca-cola-co","KO"},
  {"chevron","CVX"},
  {"3m-co","MMM"}
]