In [69]:
import os
from dotenv import load_dotenv, find_dotenv
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory
import nest_asyncio
from pinecone import Pinecone,ServerlessSpec
from langchain_pinecone import PineconeVectorStore
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_openai import OpenAIEmbeddings
import pinecone
from pinecone import Pinecone,ServerlessSpec
from langchain_pinecone import PineconeVectorStore
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_openai import OpenAIEmbeddings


nest_asyncio.apply()

load_dotenv(find_dotenv(), override=True)

# Load environment variables
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Define the index name
PINECONE_INDEX_NAME = "dia"

### Cargar Multiples URL (SCRAPING DIA)

In [66]:
def desde_web2(web):
    from langchain_community.document_loaders import WebBaseLoader

    # Create an instance of WebBaseLoader with the list of URLs
    loader = WebBaseLoader(web)

    # Set the number of requests per second to control the scraping rate
    loader.requests_per_second = 1

    # Use the aload method to asynchronously load the documents
    # Note: You might need to handle exceptions or errors here
    try:
        data = loader.aload()
    except Exception as e:
        print(f"An error occurred: {e}")
        data = None

    return data

In [67]:
import requests
from bs4 import BeautifulSoup
from requests import get

def read_urls_from_file(filepath):
    with open(filepath, 'r') as file:
        # Elimina comas y comillas extras de cada línea
        urls = [line.strip().rstrip(',').replace("'", "").replace('"', '') for line in file if line.strip()]
    return urls

# Assuming the 'urls' list is defined as containing the initial URLs you want to scrape
urls = read_urls_from_file('urls.txt') # List of URLs you've already defined
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/109.0'}
base_url = 'https://www.dia.es'
all_unique_links = set()

for url in urls:
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        category_segment = url.split('/')[-2]
        # Extract and adjust links to ensure they are absolute
        for a in soup.find_all('a', href=True):
            href = a['href']
            if href.startswith('/'):
                href = base_url + href  # Converts relative links to absolute
            # Check if URL does not contain 'sort' in the last segment
            if href.startswith(base_url) and category_segment in href and 'sort' not in href.split('/')[-1]:
                all_unique_links.add(href)

# Convert set to list (if needed)
all_unique_links = list(all_unique_links)

In [68]:
len(all_unique_links)

4393

### Fragmentar los datos

In [70]:
def fragmentar(data, chunk_size=500):
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=60)
    fragmentos = text_splitter.split_documents(data)
    return fragmentos

In [71]:
def cargar_y_fragmentar(documento):
    """Carga el contenido desde la web o un archivo y lo fragmenta."""
    if isinstance(documento, str) and documento.endswith('.txt'):
        contenido = desde_txt(documento)
    else:
        contenido = desde_web2(documento)
    
    fragmentos = fragmentar(contenido)
    costo_embedding(fragmentos)
    return fragmentos

### Costos OpenAI

In [72]:
def costo_embedding(texts):
    import tiktoken
    enc = tiktoken.encoding_for_model('text-embedding-ada-002')
    total_tokens = sum([len(enc.encode(page.page_content)) for page in texts])
    print(f'Total Tokens: {total_tokens}')
    print(f'Embedding Cost in USD: {total_tokens / 1000 * 0.0001:.5f}')

### Borrando Index de Pinecone

In [73]:
def borrar_indices(index_name='todos'):
    import pinecone
    pc = Pinecone(api_key=PINECONE_API_KEY)

    if index_name == 'todos':
        indexes = pinecone.list_indexes()
        print('Borrando todos los índices ... ')
        for index in indexes:
            pc.delete_index(index)
        print('Listo!')
    else:
        print(f'Borrando el índice: {index_name} ...', end='')
        pc.delete_index(index_name)
        print('Listo')

In [74]:
borrar_indices(PINECONE_INDEX_NAME)

Borrando el índice: dia ...Listo


### Creando Vectores (Embeddings) y subirlos a (Pinecone)

In [75]:
def creando_vectores(index_name,fragmentos):

    embeddings = OpenAIEmbeddings(api_key=OPENAI_API_KEY, model='text-embedding-ada-002')

    pc = Pinecone(api_key=PINECONE_API_KEY)

    if index_name in pc.list_indexes().names():
        index = pc.Index(index_name)
        print(f'El índice {index_name} ya existe. Cargando los embeddings ... ', end='')
        vectores = PineconeVectorStore.from_existing_index(index_name, embeddings)

        
    elif index_name not in pc.list_indexes().names():
        print(f'Creando el índice {index_name} y los embeddings ...', end='')
        pc.create_index(name=index_name,dimension=1536,metric="cosine", spec=ServerlessSpec(cloud="aws",region="us-east-1"))
        vectores = PineconeVectorStore.from_documents(fragmentos, embeddings, index_name=index_name)
        
        
    return vectores

### Resumen Final (Dia)

In [76]:
documento_dia = all_unique_links  # Asumimos que all_unique_links es una lista de URLs
fragmentos_dia = cargar_y_fragmentar(documento_dia)
costo_embedding(fragmentos_dia)
vectores_dia = creando_vectores(PINECONE_INDEX_NAME, fragmentos_dia)

Fetching pages: 100%|##############################################################| 4393/4393 [14:46<00:00,  4.96it/s]


Total Tokens: 3351781
Embedding Cost in USD: 0.33518
Total Tokens: 3351781
Embedding Cost in USD: 0.33518
Creando el índice dia y los embeddings ...

In [64]:
pip freeze > requirements.txt


Note: you may need to restart the kernel to use updated packages.
