In [1]:
import base64
import gzip
import json
import os
import sys

import requests

# NOTE(security): a real credential should not live in source control. The key
# is read from the JSONBIN_API_KEY environment variable when it is set; the
# literal below is kept only as a backward-compatible fallback and should be
# rotated and removed once the environment variable is in use.
API_KEY = os.environ.get(
    'JSONBIN_API_KEY',
    '$2a$10$bhPnwdTOKrnJ6a7RgKY8MeqyL.bjndmAI47QRVXV4snklbkmxK2DK',
)
# ID of the existing collection that every uploaded bin is attached to.
# It can be changed to any other existing collection ID.
COLLECTION_ID = "67224ee2e41b4d34e44b5df2"

# Function to estimate size of JSON data
def get_size_mb(obj):
    """Return the size of *obj*, serialized as JSON, in mebibytes.

    The size is the byte length of the UTF-8 encoded ``json.dumps`` output.
    ``sys.getsizeof`` is deliberately not used: it reports the memory
    footprint of the Python ``str`` object (including interpreter overhead),
    not the number of bytes in the serialized text.

    Args:
        obj: Any JSON-serializable object.

    Returns:
        The serialized size as a float, in units of 1024 * 1024 bytes.
    """
    payload = json.dumps(obj).encode('utf-8')
    return len(payload) / (1024 * 1024)

# Load the JSONL data: one JSON document per line. Lines that fail to parse
# are reported with their line number and skipped, so one bad record does not
# abort the whole load.
podcasts = []
with open('final_podcasts.jsonl', 'r', encoding='utf-8') as f:
    for line_number, line in enumerate(f, start=1):
        record = line.strip()
        if not record:
            # A blank line is not a record; skip it instead of reporting a
            # spurious parse error.
            continue
        try:
            podcasts.append(json.loads(record))
        except json.JSONDecodeError as e:
            print(f"Error parsing line {line_number}: {e}")

# Number of podcasts stored in each uploaded bin. Kept small and fixed so
# that each request payload stays small.
CHUNK_SIZE = 10

print(f"Total items: {len(podcasts)}")
print(f"Total size: {get_size_mb(podcasts):.2f}MB")

def split_into_chunks(data, chunk_size):
    """Split *data* into consecutive slices of at most *chunk_size* items.

    Every slice except possibly the last has exactly *chunk_size* items.
    An empty *data* yields an empty list.
    """
    pieces = []
    for start in range(0, len(data), chunk_size):
        stop = start + chunk_size
        pieces.append(data[start:stop])
    return pieces

def compress_data(data):
    """Serialize *data* to JSON, gzip it, and return it as base64 text.

    The result is a ``str``, so it can be embedded as a value in another
    JSON document. To reverse it: base64-decode, gunzip, then parse the
    UTF-8 JSON text.
    """
    raw_bytes = json.dumps(data).encode('utf-8')
    encoded = base64.b64encode(gzip.compress(raw_bytes))
    return encoded.decode('utf-8')

def upload_chunk(chunk, chunk_id, timeout=30, compress_threshold_mb=0.5):
    """Upload one chunk as a new bin in the configured JSONBin collection.

    The chunk is sent as plain JSON unless the serialized payload is larger
    than ``compress_threshold_mb``. In that case the ``data`` field is
    replaced by the output of ``compress_data`` and ``compressed`` is set to
    ``True`` so a reader knows the field must be decoded first.

    Args:
        chunk: JSON-serializable list of podcast records.
        chunk_id: Identifier stored in the bin under ``chunk_id``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the script forever.
        compress_threshold_mb: Payload size, in the units returned by
            ``get_size_mb``, above which the chunk is compressed.

    Returns:
        The decoded JSON body of the response.

    Raises:
        Exception: If the request fails or times out, the response status is
            not 200, or the response body is not valid JSON.
    """
    url = 'https://api.jsonbin.io/v3/b'
    headers = {
        'Content-Type': 'application/json',
        'X-Master-Key': API_KEY,
        'X-Collection-Id': COLLECTION_ID
    }

    # Start with the uncompressed payload; it is swapped for a compressed one
    # below only if it turns out to be too large.
    data = {
        'chunk_id': chunk_id,
        'data': chunk,
        'compressed': False
    }

    chunk_size_mb = get_size_mb(data)
    print(f"Uploading chunk {chunk_id} (size: {chunk_size_mb:.2f}MB)")

    # If chunk is too large, try compression
    if chunk_size_mb > compress_threshold_mb:
        data['data'] = compress_data(chunk)
        data['compressed'] = True
        chunk_size_mb = get_size_mb(data)
        print(f"Compressed size: {chunk_size_mb:.2f}MB")

    try:
        response = requests.post(
            url, headers=headers, json=data, timeout=timeout
        )
        if response.status_code != 200:
            print(f"Upload failed for chunk {chunk_id}")
            print(f"Status code: {response.status_code}")
            print(f"Response: {response.text}")
            raise Exception(f"Failed to upload chunk {chunk_id}")

        result = response.json()
        print(f"Successfully uploaded chunk {chunk_id}")
        return result

    except Exception as e:
        # Report the failure here, then let the caller decide what to do.
        print(f"Error during upload: {str(e)}")
        raise

def create_index(chunks, timeout=30):
    """Build a podcast lookup index and upload it as its own bin.

    The index maps each podcast's ``podcast_id`` to the chunk that holds it
    and its position inside that chunk. If the same ``podcast_id`` occurs
    more than once, the last occurrence wins.

    Args:
        chunks: List of chunks, each a list of podcast dicts that contain a
            ``podcast_id`` key.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the script forever.

    Returns:
        The decoded JSON body of the response.

    Raises:
        KeyError: If a podcast has no ``podcast_id`` key.
        Exception: If the response status is not 200. Network errors raised
            by ``requests`` propagate unchanged.
    """
    index = {}
    for chunk_id, chunk in enumerate(chunks):
        # enumerate gives the position directly; chunk.index(podcast) would
        # rescan the chunk and deep-compare dicts for every single item.
        for podcast_index, podcast in enumerate(chunk):
            index[podcast['podcast_id']] = {
                'chunk_id': chunk_id,
                'podcast_index': podcast_index
            }

    url = 'https://api.jsonbin.io/v3/b'
    headers = {
        'Content-Type': 'application/json',
        'X-Master-Key': API_KEY,
        'X-Collection-Id': COLLECTION_ID
    }

    data = {
        'type': 'index',
        'data': index
    }

    response = requests.post(url, headers=headers, json=data, timeout=timeout)
    if response.status_code != 200:
        print(f"Error creating index. Status code: {response.status_code}")
        print(f"Response: {response.text}")
        raise Exception("Failed to create index")

    return response.json()

def main():
    """Upload every podcast chunk, then the index, and record the bin IDs.

    Steps: split the loaded podcasts into chunks, upload each chunk as its
    own bin, upload the lookup index, print a summary, and write all bin IDs
    to ``podcast_bins.json``. Any failure is printed and re-raised.
    """
    try:
        print(f"Using existing collection ID: {COLLECTION_ID}")

        # Partition the loaded podcasts.
        chunks = split_into_chunks(podcasts, CHUNK_SIZE)
        total = len(chunks)
        print(f"Split data into {total} chunks")

        # One bin per chunk; remember each bin ID in upload order.
        chunk_bins = []
        for position, batch in enumerate(chunks):
            try:
                upload_result = upload_chunk(batch, position)
                new_bin = upload_result['metadata']['id']
                chunk_bins.append(new_bin)
                print(f"Uploaded chunk {position+1}/{total}")
                print(f"Bin ID for chunk {position}: {new_bin}")
            except Exception as upload_error:
                print(f"Error uploading chunk {position}: {str(upload_error)}")
                raise

        # The index is uploaded last, as a separate bin.
        index_bin_id = create_index(chunks)['metadata']['id']

        print("\nUpload Complete!")
        print("Collection ID:", COLLECTION_ID)
        print("Index Bin ID:", index_bin_id)
        print("Chunk Bin IDs:", chunk_bins)

        # Persist the IDs so the bins can be located later.
        summary = {
            'collection_id': COLLECTION_ID,
            'index_bin_id': index_bin_id,
            'chunk_bins': chunk_bins
        }
        with open('podcast_bins.json', 'w') as out_file:
            json.dump(summary, out_file, indent=2)

    except Exception as failure:
        print(f"An error occurred: {str(failure)}")
        raise

# Run the upload only when this file is executed as the main module,
# not when it is imported.
if __name__ == "__main__":
    main()


Total items: 198
Total size: 10.21MB
Using existing collection ID: 67224ee2e41b4d34e44b5df2
Split data into 20 chunks
Uploading chunk 0 (size: 0.33MB)
Successfully uploaded chunk 0
Uploaded chunk 1/20
Bin ID for chunk 0: 673e348bacd3cb34a8abdac2
Uploading chunk 1 (size: 0.61MB)
Compressed size: 0.17MB
Successfully uploaded chunk 1
Uploaded chunk 2/20
Bin ID for chunk 1: 673e348dacd3cb34a8abdac6
Uploading chunk 2 (size: 0.57MB)
Compressed size: 0.19MB
Successfully uploaded chunk 2
Uploaded chunk 3/20
Bin ID for chunk 2: 673e3490e41b4d34e4579f0a
Uploading chunk 3 (size: 0.41MB)
Successfully uploaded chunk 3
Uploaded chunk 4/20
Bin ID for chunk 3: 673e3492ad19ca34f8cd5c35
Uploading chunk 4 (size: 0.38MB)
Successfully uploaded chunk 4
Uploaded chunk 5/20
Bin ID for chunk 4: 673e3495acd3cb34a8abdacc
Uploading chunk 5 (size: 0.74MB)
Compressed size: 0.21MB
Successfully uploaded chunk 5
Uploaded chunk 6/20
Bin ID for chunk 5: 673e3497e41b4d34e4579f11
Uploading chunk 6 (size: 0.74MB)
Compresse