## Create the CSV File from the markdown file

In [2]:
import csv
import os

# Convert a markdown chapter into a CSV with one row per paragraph.
# Columns: Chapter (source file name), Index (zero-padded position), Text.
filepath = "<FILEPATH HERE>"
filename = os.path.basename(filepath)

# Read the whole markdown document into memory
with open(filepath, 'r', encoding='utf-8') as md_file:
    markdown_content = md_file.read()

# Paragraphs are separated by a blank line; whitespace-only chunks are dropped
paragraphs = [p.strip() for p in markdown_content.split('\n\n') if p.strip()]

# The source file name doubles as the chapter identifier
chapter_number = filename

# Path to the CSV file where the output will be saved
csv_file_path = filename + '.csv'

with open(csv_file_path, mode='w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    # Header row
    writer.writerow(['Chapter', 'Index', 'Text'])

    # The first paragraph is assumed to be the chapter title and is skipped;
    # the remaining paragraphs are numbered from 001.
    # A generator expression is used on purpose: its loop variables do not
    # leak into the notebook namespace, so this cell can be re-run without
    # clobbering the Pinecone `index` handle created further down.
    writer.writerows(
        [chapter_number, f'{position:03}', paragraph_text]
        for position, paragraph_text in enumerate(paragraphs[1:], start=1)
    )

print(f'CSV file "{csv_file_path}" has been created.')

CSV file "Chapter1.md.csv" has been created.


In [3]:
from datasets import load_dataset
# Load the CSV produced by the conversion cell. Reusing `csv_file_path`
# (instead of repeating the file name as a literal) keeps this cell in sync
# when a different markdown file is converted.
dataset = load_dataset("csv", data_files=csv_file_path)
dataset

  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 149 examples [00:00, 2624.79 examples/s]


DatasetDict({
    train: Dataset({
        features: ['Chapter', 'Index', 'Text'],
        num_rows: 149
    })
})

## Setup Pinecone

In [6]:
from pinecone import Pinecone

# configure client
# Prefer the PINECONE_API_KEY environment variable so the secret is not
# stored in the notebook; the placeholder is only used when it is unset.
import os

api_key = os.environ.get('PINECONE_API_KEY', '<PINECONE API KEY HERE>')
pc = Pinecone(api_key=api_key)

In [10]:
from pinecone import ServerlessSpec, PodSpec

# Toggle between a serverless index and a pod-based index
use_serverless = False

# Serverless targets AWS us-west-2; otherwise the pod-based
# 'gcp-starter' environment is used.
spec = (
    ServerlessSpec(cloud='aws', region='us-west-2')
    if use_serverless
    else PodSpec(environment='gcp-starter')
)

## Create Index on Pinecone

In [11]:
import time

index_name = 'book-rag'

# list_indexes() returns index descriptions, not plain strings, so a bare
# `index_name not in pc.list_indexes()` never matches and create_index would
# be attempted even when the index already exists. Compare against .names().
existing_index_names = pc.list_indexes().names()

if index_name not in existing_index_names:
    pc.create_index(
        index_name,
        dimension=1536,  # must equal the length of the vectors upserted later
        spec=spec,
        metric='cosine'
    )
    # Poll until the newly created index reports that it is ready
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)

# Handle used by later cells for stats and upserts
index = pc.Index(index_name)

In [12]:
# Show the index statistics (dimension, fullness, namespaces, vector count)
# before any vectors have been upserted.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [15]:
from langchain_openai import OpenAIEmbeddings

# Prefer the OPENAI_API_KEY environment variable so the secret is not stored
# in the notebook; the placeholder is only used when it is unset.
import os

apikey = os.environ.get('OPENAI_API_KEY', '<OPENAI API KEY HERE>')

# Embedding model used for both the indexed paragraphs and the queries
embed_model = OpenAIEmbeddings(model="text-embedding-ada-002", openai_api_key=apikey)

In [18]:
# Display the 'train' split (its features and row count) as a sanity check
dataset['train']

Dataset({
    features: ['Chapter', 'Index', 'Text'],
    num_rows: 149
})

In [20]:
# Import tqdm for showing progress bars during loops
from tqdm.auto import tqdm

# Convert the dataset into a pandas DataFrame for easier manipulation
data = dataset['train'].to_pandas()

# Number of rows embedded and upserted per request
batch_size = 100

# Embed the paragraphs and upsert them into Pinecone one batch at a time
for batch_start in tqdm(range(0, len(data), batch_size)):
    # Clamp the end of the slice to the dataset length
    batch_end = min(len(data), batch_start + batch_size)
    batch = data.iloc[batch_start:batch_end]

    # Vector ids combine the chapter name and the paragraph index,
    # e.g. "<Chapter> - <Index>"
    ids = [
        f"{chapter} - {paragraph_index}"
        for chapter, paragraph_index in zip(batch['Chapter'], batch['Index'])
    ]

    # Paragraph text for every row in the batch
    texts = batch['Text'].tolist()

    # One embedding vector per paragraph
    embeds = embed_model.embed_documents(texts)

    # Store the raw text alongside each vector so it can be returned at query time
    metadata = [{'text': text} for text in texts]

    # Insert or update the vectors; the zip is materialised into a list so the
    # client receives a concrete sequence of (id, values, metadata) tuples
    index.upsert(vectors=list(zip(ids, embeds, metadata)))

  0%|          | 0/2 [00:00<?, ?it/s]

100%|██████████| 2/2 [00:12<00:00,  6.38s/it]


In [21]:
# Re-check the index statistics after the upsert; the reported vector count
# should now equal the number of rows in the dataset.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.00149,
 'namespaces': {'': {'vector_count': 149}},
 'total_vector_count': 149}

In [22]:
# Import under an alias: a bare `Pinecone` here would rebind the name already
# used for the Pinecone client class, so re-running the client setup cell
# afterwards would instantiate the wrong class.
from langchain.vectorstores import Pinecone as PineconeVectorStore

# Metadata key under which the paragraph text was stored during the upsert
text_field = "text"

# Pass the embeddings object itself rather than its bound `embed_query`
# method; handing the vector store a bare callable is deprecated.
vectorstore = PineconeVectorStore(index, embed_model, text_field)

  warn_deprecated(


In [None]:
# Example question to run against the indexed chapter
query = "what is the future of longevity?"
# Ask the vector store for the k=3 entries most similar to the query
vectorstore.similarity_search(query, k=3)