In [None]:
import json
import firebase_admin
from firebase_admin import credentials, firestore

# Initialize Firebase Admin SDK
# NOTE(review): the service-account key path is hard-coded; make sure the key
# file itself is kept out of version control.
cred = credentials.Certificate('podcast-4c7aa-firebase-adminsdk-nj0md-8a8359518a.json')
try:
    # Re-running this cell in a live kernel must not fail: initialize_app()
    # raises ValueError when the default app already exists, so reuse it.
    firebase_admin.get_app()
except ValueError:
    # No default app yet (fresh kernel) -- create it.
    firebase_admin.initialize_app(cred)

# Get Firestore client
db = firestore.client()

import sys

# Function to calculate the size of the document
def get_document_size(document):
    """Estimate the serialized size of ``document`` in bytes.

    The estimate is the length of the document's JSON text encoded as UTF-8.
    It is an approximation only: it measures the JSON serialization, not the
    storage size the Firestore backend computes for the same data.

    Args:
        document: A JSON-serializable object (here, a podcast dict).

    Returns:
        int: Number of bytes in the UTF-8 encoded JSON text.

    Raises:
        TypeError: If ``document`` contains values ``json.dumps`` cannot
            serialize.
    """
    # ensure_ascii=False keeps non-ASCII characters as themselves so they are
    # counted at their UTF-8 width instead of as 6-byte "\uXXXX" escapes.
    # (sys.getsizeof would instead report the in-memory size of the Python
    # str object, which includes interpreter overhead.)
    return len(json.dumps(document, ensure_ascii=False).encode('utf-8'))

# Function to upload a single podcast
def upload_podcast(podcast):
    """Write one podcast dict to the 'podcasts' collection.

    If the locally estimated size is above the document size limit, the
    podcast is first passed through ``optimize_podcast``; if it is still too
    large afterwards, the upload is skipped and a message is printed.

    The size check is only a local estimate, so the backend may still reject
    a document that passes it.

    Args:
        podcast: Podcast dict. Must contain 'podcast_id', which is used as
            the document ID. 'title' is optional and only used in messages.

    Returns:
        None.

    Raises:
        KeyError: If 'podcast_id' is missing.
    """
    # Maximum allowed document size in bytes (1 MiB).
    max_doc_bytes = 1048576
    # Resolve the title once, with a fallback, so every message (including the
    # success message) works for podcasts that have no 'title' key.
    title = podcast.get('title', 'Unknown')

    if get_document_size(podcast) > max_doc_bytes:
        print(f"Podcast '{title}' exceeds the maximum document size. Optimizing...")
        optimized_podcast = optimize_podcast(podcast)
        if get_document_size(optimized_podcast) > max_doc_bytes:
            print(f"Cannot upload podcast '{title}' even after optimization.")
            return
        podcast = optimized_podcast

    # Use podcast_id as the document ID
    doc_ref = db.collection('podcasts').document(podcast['podcast_id'])
    doc_ref.set(podcast)
    print(f"Uploaded podcast: {title}")

# Function to optimize podcast data
def optimize_podcast(podcast):
    """Return a slimmed-down copy of ``podcast``.

    The copy differs from the input as follows:
      * 'transcript', 'long_description' and 'full_content' are dropped;
      * a string 'description' longer than 1000 characters is cut to its
        first 1000 characters followed by '...';
      * 'episodes' is replaced by 'episodes_count', the number of episodes.

    The input dict is not modified, so the caller still has the full record
    if the slimmed version turns out to be unusable.

    Args:
        podcast: Podcast dict.

    Returns:
        dict: A new dict (shallow copy) with the reductions applied.
    """
    fields_to_remove = ('transcript', 'long_description', 'full_content')

    # Build a new dict instead of deleting keys from the caller's object.
    slimmed = {key: value for key, value in podcast.items()
               if key not in fields_to_remove}

    # Only truncate real strings; a missing or None description is left alone.
    description = slimmed.get('description')
    if isinstance(description, str) and len(description) > 1000:
        slimmed['description'] = description[:1000] + '...'

    # Keep just the episode count; the list itself is dropped from the copy.
    if 'episodes' in slimmed:
        episodes = slimmed.pop('episodes')
        slimmed['episodes_count'] = len(episodes or ())

    return slimmed

print("Starting upload...")

# Read and upload podcasts from JSONL file (one JSON object per line)
with open('final_podcasts.jsonl', 'r', encoding='utf-8') as file:
    for line in file:
        # Blank lines are not JSON records; json.loads would raise on them.
        if not line.strip():
            continue
        podcast = json.loads(line)
        upload_podcast(podcast)

print("Upload complete!")


Uploaded podcast: The Good Advice Podcast: Do Business Better with Blake Binns
Uploaded podcast: Leading Women in Tech Podcast
Uploaded podcast: Full Disclosure
Uploaded podcast: The Brotherhood Podcast
Uploaded podcast: David C Barnett Small Business and Deal Making M&A SMB
Uploaded podcast: The Ridiculously Human Podcast
Uploaded podcast: Employee To Boss: Entrepreneurship, Side Hustles, and Business Growth Tips from Experts
Uploaded podcast: The Road to Now
Uploaded podcast: La Corneta Extendida
Uploaded podcast: Books & Looks
Uploaded podcast: WorkCookie - Get Ahead with Industrial/Organizational Psychology in the Workplace
Uploaded podcast: Tipping Point with Jimmy Evans
Uploaded podcast: What Happens Next in 6 Minutes
Uploaded podcast: The Danny Brown Show
Uploaded podcast: Cashflow Ninja
Uploaded podcast: Masters in Travel


InvalidArgument: 400 Document 'projects/podcast-4c7aa/databases/(default)/documents/podcasts/e6233fd57c7345aa837e60e610cfc634' cannot be written because its size (2,391,318 bytes) exceeds the maximum allowed size of 1,048,576 bytes.

In [1]:
import json
from datetime import datetime
import pandas as pd
from dateutil import parser

# Read the final_podcasts.jsonl file (one JSON object per line)
with open('final_podcasts.jsonl', 'r', encoding='utf-8') as f:
    # Blank lines are skipped; json.loads would raise on them.
    final_data = [json.loads(line) for line in f if line.strip()]

# Collect all episodes into a list, one flat record per episode
episodes_list = []
for podcast in final_data:
    # 'episodes' is treated as optional here, as it is in the upload cell.
    for episode in podcast.get('episodes') or []:
        episodes_list.append({
            'podcast_id': podcast['podcast_id'],
            'podcast_title': podcast['title'],
            'episode_id': episode['episode_id'],
            'episode_title': episode['title'],
            'published_at': episode['published_at']
        })

# Convert to DataFrame. Columns are given explicitly so the frame still has
# the expected columns when no episodes were collected.
df_episodes = pd.DataFrame(
    episodes_list,
    columns=['podcast_id', 'podcast_title', 'episode_id', 'episode_title', 'published_at'],
)

# Parse 'published_at' into datetime objects
def parse_published_at(date_str):
    """Parse ``date_str`` with ``parser.parse``; return None on any failure.

    Args:
        date_str: The raw 'published_at' value of an episode.

    Returns:
        The value returned by ``parser.parse(date_str)``, or None if that
        call raised an exception.
    """
    try:
        parsed = parser.parse(date_str)
    except Exception:
        # Best effort: anything unparseable is mapped to None.
        parsed = None
    return parsed

# Build the final table in one chain:
#   1. add 'published_at_parsed' by applying parse_published_at to every row,
#   2. drop rows where parsing failed,
#   3. order rows from earliest to latest parsed date,
#   4. renumber the index from 0.
df_episodes = (
    df_episodes
    .assign(published_at_parsed=df_episodes['published_at'].apply(parse_published_at))
    .dropna(subset=['published_at_parsed'])
    .sort_values('published_at_parsed')
    .reset_index(drop=True)
)

# Persist the sorted episodes without the index column
output_csv = 'final_episodes_sorted.csv'
df_episodes.to_csv(output_csv, index=False)

print("Successfully saved sorted episodes to 'final_episodes_sorted.csv'")


Successfully saved sorted episodes to 'final_episodes_sorted.csv'
