In [1]:
import json
import os
from dotenv import load_dotenv
from openai import OpenAI
from pinecone import Pinecone
from tqdm import tqdm

# Machine-specific locations. The defaults are the original author's paths;
# set the environment variables below to run the notebook elsewhere without
# editing it.
#   RAGDEV_ENV_FILE        -> dotenv file providing OPENAI_API_KEY / PINECONE_API_KEY
#   THIRD_PARTY_JSONL_PATH -> JSONL export to ingest
ENV_FILE_PATH = os.environ.get(
    'RAGDEV_ENV_FILE',
    '/Users/adamhunter/miniconda3/envs/ragdev/ragdev.env',
)

# Load environment variables and initialize clients.
# load_dotenv must run before the os.environ lookups that follow.
load_dotenv(ENV_FILE_PATH)
openai_client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))
pc = Pinecone(api_key=os.environ.get('PINECONE_API_KEY'))
index = pc.Index("3rd-party-data-v2")

# Constants
EMBEDDING_MODEL = "text-embedding-3-large"
BATCH_SIZE = 100  # chunks per embedding request / upsert call
# Read after load_dotenv so the dotenv file may also supply the override.
JSONL_FILE_PATH = os.environ.get(
    'THIRD_PARTY_JSONL_PATH',
    "/Users/adamhunter/Documents/ingestionrepos/pathlabsingestion/scraping_tradedesk/api access/data/third_party_segments/third_party_data_tpzavzt.jsonl",
)

def process_jsonl_file(file_path):
    """Read a JSON Lines file and build one chunk per record.

    Args:
        file_path: Path to a text file holding one JSON object per line.

    Returns:
        list: The value returned by ``create_chunk`` for every record, in
        file order. Blank / whitespace-only lines are skipped.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON. The
            message is extended with the file path and line number.
    """
    chunks = []
    # Decode explicitly as UTF-8 instead of relying on the platform default.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line_number, line in enumerate(file, start=1):
            # A trailing newline or stray empty line is not a record.
            if not line.strip():
                continue
            try:
                data = json.loads(line)
            except json.JSONDecodeError as exc:
                # Same exception type as before, but say where the bad record is.
                raise json.JSONDecodeError(
                    f"{exc.msg} (line {line_number} of {file_path})",
                    exc.doc,
                    exc.pos,
                ) from exc
            chunks.append(create_chunk(data))
    return chunks

def flatten_dict(d, parent_key='', sep='_'):
    """Collapse a nested dict into a single-level dict.

    Keys of nested dicts are joined to their ancestors' keys with ``sep``.
    Values that are not dicts (lists included) are copied through unchanged,
    and an empty nested dict contributes no entries.

    Args:
        d: The dict to flatten.
        parent_key: Prefix for every produced key; no prefix when falsy.
        sep: Separator placed between a prefix and a key.

    Returns:
        dict: The flattened mapping. When two paths produce the same key,
        the later one wins.
    """
    flat = {}
    for name, val in d.items():
        path = name if not parent_key else f"{parent_key}{sep}{name}"
        if not isinstance(val, dict):
            flat[path] = val
            continue
        # Recurse with the accumulated path as the new prefix.
        flat.update(flatten_dict(val, path, sep=sep))
    return flat

def handle_complex_values(value):
    """Normalise a record value into a flat, scalar-only form.

    Args:
        value: Any value decoded from a JSON record.

    Returns:
        * ``"null"`` (the string) when ``value`` is ``None``;
        * a flat dict when ``value`` is a dict: it is flattened with
          ``flatten_dict`` and every leaf is normalised by these same rules;
        * a JSON string when ``value`` is a list;
        * ``value`` itself otherwise.
    """
    if value is None:
        return "null"
    if isinstance(value, dict):
        # Leaves of a flattened dict are never dicts, so the recursive call
        # only applies the None / list / pass-through rules. Without it a
        # nested None or list would escape the normalisation that top-level
        # values receive.
        return {
            key: handle_complex_values(leaf)
            for key, leaf in flatten_dict(value).items()
        }
    if isinstance(value, list):
        return json.dumps(value)
    return value

def create_chunk(data):
    """Turn one decoded record into an embeddable chunk.

    Args:
        data: Dict for a single record. Must contain the keys ``FullPath``,
            ``Description`` and ``ThirdPartyDataId``.

    Returns:
        dict with three keys:
            ``id``         - the record's ``ThirdPartyDataId``;
            ``raw_string`` - the text to embed, built from ``FullPath`` and
                             ``Description``;
            ``metadata``   - every other field, passed through
                             ``handle_complex_values``. Fields whose value is
                             a dict are expanded into ``<field>_<subkey>``
                             entries.

    Raises:
        KeyError: If one of the three required keys is missing.
    """
    # NOTE: a None Description is interpolated as the literal text "None".
    raw_string = f"Full Path: {data['FullPath']}, Description: {data['Description']}"
    chunk_id = data['ThirdPartyDataId']

    metadata = {}
    for key, value in data.items():
        # These two fields already live in raw_string.
        if key in ('FullPath', 'Description'):
            continue
        processed_value = handle_complex_values(value)
        if isinstance(processed_value, dict):
            # Prefix sub-keys with the owning field name. Merging them bare
            # lets two dict fields with the same inner keys overwrite each
            # other in the metadata.
            for sub_key, sub_value in processed_value.items():
                metadata[f"{key}_{sub_key}"] = sub_value
        else:
            metadata[key] = processed_value

    return {
        "id": chunk_id,
        "raw_string": raw_string,
        "metadata": metadata
    }

def generate_embeddings(batch, dimensions=256):
    """Embed the ``raw_string`` of every chunk in ``batch``.

    Args:
        batch: Iterable of chunk dicts, each with a ``raw_string`` key.
        dimensions: Vector size requested from the embeddings endpoint.
            Defaults to 256, the value this function previously hard-coded.
            NOTE(review): presumably must equal the target index's dimension.

    Returns:
        list: One embedding per entry of the response's ``data``; an empty
        list when ``batch`` is empty.
    """
    texts = [chunk['raw_string'] for chunk in batch]
    # Nothing to embed: skip the network round-trip entirely.
    if not texts:
        return []
    response = openai_client.embeddings.create(
        model=EMBEDDING_MODEL,
        input=texts,
        encoding_format="float",
        dimensions=dimensions
    )
    return [data.embedding for data in response.data]

def prepare_and_upsert(batch, embeddings):
    """Pair chunks with their embeddings and upsert them into ``index``.

    Each record sent has the chunk's ``id``, the embedding as ``values`` and
    the chunk's metadata extended with a ``raw_string`` entry.

    Args:
        batch: Chunk dicts with ``id``, ``raw_string`` and ``metadata`` keys.
        embeddings: One embedding per chunk, in the same order as ``batch``.

    Raises:
        ValueError: If ``batch`` and ``embeddings`` differ in length.
    """
    batch = list(batch)
    embeddings = list(embeddings)
    # zip() would silently drop the unmatched tail, losing vectors unnoticed.
    if len(batch) != len(embeddings):
        raise ValueError(
            f"batch has {len(batch)} chunks but {len(embeddings)} embeddings were supplied"
        )
    # Nothing to send.
    if not batch:
        return
    upserts = [
        {
            "id": chunk['id'],
            "values": embedding,
            "metadata": {**chunk['metadata'], 'raw_string': chunk['raw_string']}
        }
        for chunk, embedding in zip(batch, embeddings)
    ]
    index.upsert(upserts)

def embed_and_upload_chunks(chunks):
    """Embed and upsert ``chunks`` in consecutive slices of ``BATCH_SIZE``.

    Each slice is passed to ``generate_embeddings`` and the result handed to
    ``prepare_and_upsert``; a tqdm progress bar tracks the slices.

    Args:
        chunks: Sequence of chunk dicts (supports ``len`` and slicing).
    """
    batch_starts = range(0, len(chunks), BATCH_SIZE)
    for start in tqdm(batch_starts, desc="Processing and uploading chunks"):
        current = chunks[start:start + BATCH_SIZE]
        vectors = generate_embeddings(current)
        prepare_and_upsert(current, vectors)


In [2]:

# Main execution: parse the JSONL export into a list of chunk dicts.
# `chunks` is reused by the token-count, inspection and upload cells below.
chunks = process_jsonl_file(JSONL_FILE_PATH)
print(f"Total chunks created: {len(chunks)}")


Total chunks created: 586277


In [4]:
import tiktoken

def count_tokens(text):
    """Return how many tokens ``text`` encodes to.

    The tokenizer is the one tiktoken associates with "gpt-3.5-turbo".
    NOTE(review): embeddings are generated with EMBEDDING_MODEL, not
    gpt-3.5-turbo - confirm both models share a tokenizer before trusting
    the count as an embedding-cost estimate.
    """
    tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")
    token_ids = tokenizer.encode(text)
    return len(token_ids)

# Add up the token counts of every chunk's raw_string (the text that gets embedded).
total_tokens = sum(count_tokens(chunk['raw_string']) for chunk in chunks)
print(f"Total tokens in all raw strings: {total_tokens}")
# Cost = tokens in millions * 0.13.
# NOTE(review): 0.13 is presumably the USD price per 1M tokens for
# EMBEDDING_MODEL - confirm against current pricing.
total_cost = total_tokens/1000000 * 0.13
print(f"Total cost for embedding all raw strings: {total_cost}")

Total tokens in all raw strings: 25446228
Total cost for all raw strings: 3.3080096400000003


In [5]:
# Spot-check one chunk to eyeball the id / raw_string / metadata layout.
chunks[20000]

{'id': '11167698|da31shth',
 'raw_string': 'Full Path: Asia > Life Event > College Graduation, Description: None',
 'metadata': {'ThirdPartyDataId': '11167698|da31shth',
  'BrandId': 'da31shth',
  'BrandName': 'Data Alliance',
  'Name': 'College Graduation',
  'DevicesBrowsers30DayCount': 15906000,
  'UniqueUserCount': 8327300,
  'UniqueUserInAppCount': 2914500,
  'UniqueUserWebCount': 3829900,
  'UniqueConnectedTvCount': 1582900,
  'CPMRate': '{"Amount": 1.85, "CurrencyCode": "USD"}',
  'CPMRateInAdvertiserCurrency': '{"Amount": 1.85, "CurrencyCode": "USD"}',
  'PercentOfMediaCostRate': 0.16,
  'PersonsCount': 6051700,
  'HouseholdCount': 4038400,
  'ReceivedIDsCount': 15906000,
  'ActiveIDsCount': 8327300,
  'ActiveIDsInAppCount': 2914500,
  'ActiveIDsWebCount': 3829900,
  'ActiveIDsConnectedTvCount': 1582900,
  'ActivePersonsCount': 6051700,
  'ActiveHouseholdCount': 4038400,
  'ActiveIDsCountExpanded': 'null'}}

In [6]:

# Embed every chunk and upsert it into the index, BATCH_SIZE chunks per call.
embed_and_upload_chunks(chunks)

# Check index stats
# NOTE(review): the recorded run reports fewer vectors (585168) than chunks
# created (586277); presumably repeated ThirdPartyDataId values overwrote each
# other on upsert - confirm before relying on the index being complete.
print(index.describe_index_stats())

Processing and uploading chunks: 100%|██████████| 5863/5863 [1:52:09<00:00,  1.15s/it]   

{'dimension': 256,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 585168}},
 'total_vector_count': 585168}



