In [1]:
from openai import OpenAI
from langchain_community.embeddings import OpenAIEmbeddings
import os
from datasets import load_dataset, Dataset
from dotenv import load_dotenv
load_dotenv()
from pinecone import Pinecone, ServerlessSpec
import tiktoken  
import pandas as pd
import numpy as np
import sys


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Pull both service credentials from the process environment.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")

# OpenAI client, used further down for the embedding requests.
client = OpenAI(api_key=OPENAI_API_KEY)

# Pinecone client, used to manage and query the vector index.
pc = Pinecone(api_key=PINECONE_API_KEY)

In [3]:
index_name = 'ai-customer-support-chatbot-2'

# Start from a clean slate: drop any index left over from a previous run.
# NOTE: this is destructive -- every vector already stored under this name is lost.
if index_name in pc.list_indexes().names():
    pc.delete_index(index_name)

# Compare against the index *names* (same as the check above). Testing membership
# on the raw list_indexes() result does not compare against plain name strings,
# so the guard would always be true.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=1536,  # must match the length of the embedding vectors being upserted
        metric="dotproduct",
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )

# Connect to Pinecone index
index = pc.Index(index_name)

In [4]:
# Load the 'train' split of the dataset into a DataFrame and discard every row
# that is missing any of the four fields used downstream.
df = (
    pd.DataFrame(load_dataset('wikipedia', '20220301.simple')['train'])
    .dropna(subset=['id', 'title', 'url', 'text'])
)

# Plain list of per-row dicts, ready to be sliced into batches.
dataset = df.to_dict(orient='records')


In [5]:
# Token counter backed by tiktoken
def count_tokens(text):
    """Return how many tokens `text` encodes to under the cl100k_base encoding."""
    return len(tiktoken.get_encoding("cl100k_base").encode(text))

# Token-based text splitter
def split_into_chunks(text, max_tokens):
    """Split `text` into consecutive pieces of at most `max_tokens` tokens each.

    The text is encoded with cl100k_base, the token sequence is cut into
    windows of `max_tokens`, and each window is decoded back to a string.
    Returns the list of decoded pieces (empty when `text` encodes to no tokens).
    """
    encoding = tiktoken.get_encoding("cl100k_base")
    token_ids = encoding.encode(text)

    pieces = []
    for start in range(0, len(token_ids), max_tokens):
        window = token_ids[start:start + max_tokens]
        pieces.append(encoding.decode(window))
    return pieces

In [6]:
def generate_embeddings(text, model="text-embedding-ada-002", max_tokens=8191):
    """Embed `text`, splitting it into token-bounded chunks first.

    Args:
        text: The text to embed.
        model: Name of the OpenAI embedding model to call.
        max_tokens: Upper bound on tokens per chunk. Defaults to 8191, the
            documented input limit of text-embedding-ada-002; the previous
            hard-coded value of 8192 was one token over that limit, so a
            full-size chunk would be rejected by the API.

    Returns:
        A list with one embedding (list of floats) per chunk that was embedded
        successfully. Chunks whose request fails are reported on stdout and
        left out, so the list can be shorter than the number of chunks.
    """
    chunks = split_into_chunks(text, max_tokens)

    embeddings = []
    for chunk in chunks:
        try:
            response = client.embeddings.create(
                input=[chunk],
                model=model
            )
            # One input was sent, so the single result is at position 0.
            embedding = response.data[0].embedding
            if embedding:
                embeddings.append(embedding)
        except Exception as e:
            # Best effort: a failing chunk must not abort the remaining ones.
            print(f"Error generating embedding for chunk: {e}")

    return embeddings


In [7]:
def get_metadata_size(metadata):
    """Return the size in bytes of `metadata` once serialized to JSON.

    The size is the length of the UTF-8 encoded JSON text. `sys.getsizeof` is
    deliberately not used: it reports the in-memory footprint of the Python
    string object (including interpreter overhead), not the payload size.
    """
    import json
    return len(json.dumps(metadata).encode("utf-8"))
def validate_embeddings(embedding):
    """Return True when no component of `embedding` is None or NaN.

    Only float components (Python floats and NumPy float scalars) are tested
    for NaN; any other non-None value is accepted as-is. An empty embedding
    is considered valid.
    """
    # The explicit parentheses matter: without them `and` binds tighter than
    # `or`, which let a None component fall through to np.isnan and raise
    # TypeError instead of simply reporting the embedding as invalid.
    return all(
        value is not None
        and not (isinstance(value, (float, np.floating)) and np.isnan(value))
        for value in embedding
    )


In [8]:

MAX_METADATA_SIZE = 40960  # 40KB in bytes
EMBEDDING_DIMENSION = 1536  # vectors of any other length are skipped


def store_in_pinecone(id, title, url, text):
    """Embed `text` and upsert the resulting vector(s) into the Pinecone index.

    Args:
        id: Identifier of the record; used as the vector id.
        title: Stored in the vector metadata under "title".
        url: Stored in the vector metadata under "url".
        text: Text to embed; also stored in full in the metadata under "text".

    Records whose metadata exceeds MAX_METADATA_SIZE are skipped before any
    embedding request is made. When the text yields more than one embedding,
    the first is stored under `id` and each later one under "<id>-<n>", so
    they no longer overwrite each other. Every vector of a record carries the
    same metadata (the full text, not just its own chunk).

    Returns:
        None. Progress and skip reasons are printed to stdout.
    """
    metadata = {
        "title": title,
        "url": url,
        'text': text
    }

    # The metadata is identical for every chunk, so check it once, up front,
    # rather than paying for embeddings that would be thrown away.
    if get_metadata_size(metadata) > MAX_METADATA_SIZE:
        print(f"Skipping entry for id: {id} due to metadata size exceeding limit.")
        return

    # chunk_no counts the embeddings that were returned; chunks that failed to
    # embed are already absent from this list.
    for chunk_no, embedding in enumerate(generate_embeddings(text)):
        if not validate_embeddings(embedding):
            print(f"Skipping invalid embedding for id: {id}")
            continue

        # Ensure the embedding matches the expected dimension
        if len(embedding) != EMBEDDING_DIMENSION:
            print(f"Skipping entry for id: {id} due to incorrect embedding dimension.")
            continue

        # Keep the plain id for the first vector so single-chunk records are
        # stored exactly as before; suffix the rest to keep ids unique.
        vector_id = id if chunk_no == 0 else f"{id}-{chunk_no}"
        index.upsert(
            vectors=[
                {
                    "id": vector_id,
                    "values": embedding,
                    "metadata": metadata
                }
            ]
        )
        print(f"Data stored in Pinecone for id: {vector_id} successfully")



In [9]:
# import math
# def process_in_batches(dataset, batch_size=100):
#     total_articles = len(dataset)
#     num_batches = math.ceil(total_articles / batch_size)
    
#     for i in range(num_batches):
#         start_index = i * batch_size
#         end_index = min(start_index + batch_size, total_articles)
#         batch = dataset[start_index:end_index]

#         for article in batch:
#             store_in_pinecone(article['id'], article['title'], article['url'], article['text'])
        
#         print(f"Processed batch {i + 1}/{num_batches}")

In [None]:
# Carve the first 10,000 records into ten consecutive slices of 1,000 each.
(batch1, batch2, batch3, batch4, batch5,
 batch6, batch7, batch8, batch9, batch10) = (
    dataset[start:start + 1000] for start in range(0, 10000, 1000)
)


In [None]:
# Print the index statistics (dimension, per-namespace and total vector counts)
# as a quick check of what is currently stored in the index.
print(index.describe_index_stats())

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 1388}},
 'total_vector_count': 1388}
