# Data Gathering

I leveraged Claude to put together some code to extract data from iNaturalist (https://www.inaturalist.org) for common Arizona plant families and genera

The ```iNaturalistSpeciesScraper``` is a utility class to talk to the iNaturalist API

In [4]:
import requests
import json
import time
import pandas as pd
from collections import defaultdict

class iNaturalistSpeciesScraper:
    def __init__(self):
        """Configure the API endpoint and a reusable HTTP session.

        Every request made through ``self.session`` carries a descriptive
        ``User-Agent`` header.
        """
        self.base_url = "https://api.inaturalist.org/v1"

        http = requests.Session()
        http.headers.update({'User-Agent': 'Plant RAG Dataset Builder'})
        self.session = http
    
    def search_arizona_desert_plants(self):
        """Search for common Arizona desert plant families and genera"""
        
        # Common Arizona desert plant groups
        search_terms = [
            # Cacti
            'Carnegiea', 'Ferocactus', 'Echinocactus', 'Mammillaria', 
            'Opuntia', 'Cylindropuntia',
            # Agaves and relatives
            'Agave', 'Yucca', 'Dasylirion', 'Nolina',
            # Desert trees and shrubs
            'Parkinsonia', 'Prosopis', 'Olneya', 'Cercidium',
            'Fouquieria', 'Larrea', 'Acacia', 'Simmondsia',
            # Wildflowers and perennials
            'Penstemon', 'Lupinus', 'Eschscholzia', 'Baileya',
            'Encelia', 'Sphaeralcea', 'Calliandra',
            # Grasses and groundcovers
            'Muhlenbergia', 'Bouteloua', 'Hesperaloe',
        ]
        
        all_taxa_ids = set()
        
        for term in search_terms:
            print(f"Searching for: {term}")
            params = {
                'q': term,
                'place_id': 35,  # Arizona
                'is_active': True,
                'per_page': 50
            }
            
            try:
                response = self.session.get(f"{self.base_url}/taxa/autocomplete", params=params)
                response.raise_for_status()
                data = response.json()
                
                results = data.get('results', [])
                for taxon in results:
                    # Only include species and subspecies
                    if taxon.get('rank') in ['species', 'subspecies', 'variety']:
                        all_taxa_ids.add(taxon['id'])
                
                print(f"  Found {len(results)} taxa")
                time.sleep(0.5)
                
            except Exception as e:
                print(f"Error searching for {term}: {e}")
        
        print(f"\nTotal unique taxa found: {len(all_taxa_ids)}")
        return list(all_taxa_ids)
    
    def get_taxon_details(self, taxon_id):
        """Get detailed information about a specific taxon"""
        try:
            response = self.session.get(f"{self.base_url}/taxa/{taxon_id}")
            response.raise_for_status()
            data = response.json()
            
            if data.get('results'):
                return data['results'][0]
            return None
            
        except Exception as e:
            print(f"Error getting taxon {taxon_id}: {e}")
            return None
    
    def extract_rich_plant_data(self, taxon):
        """Extract comprehensive plant information from taxon data"""
        if not taxon:
            return None
        
        plant_data = {
            'taxon_id': taxon.get('id'),
            'scientific_name': taxon.get('name'),
            'common_name': taxon.get('preferred_common_name'),
            'rank': taxon.get('rank'),
            
            # Rich text content
            'wikipedia_summary': taxon.get('wikipedia_summary', ''),
            'wikipedia_url': taxon.get('wikipedia_url', ''),
            
            # Taxonomy
            'kingdom': None,
            'phylum': None,
            'class': None,
            'order': None,
            'family': None,
            'genus': None,
            
            # Conservation and status
            'conservation_status': None,
            'conservation_status_source': None,
            'threatened': taxon.get('threatened', False),
            'endemic': taxon.get('endemic', False),
            'native': taxon.get('native', False),
            'introduced': taxon.get('introduced', False),
            
            # Observations
            'observations_count': taxon.get('observations_count', 0),
            
            # Images
            'default_photo_url': taxon.get('default_photo', {}).get('medium_url') if taxon.get('default_photo') else None,
            'photo_urls': [photo.get('medium_url') for photo in taxon.get('taxon_photos', [])[:5]],
            
            # Additional info
            'iconic_taxon_name': taxon.get('iconic_taxon_name'),
        }
        
        # Extract taxonomic hierarchy
        if taxon.get('ancestors'):
            for ancestor in taxon['ancestors']:
                rank = ancestor.get('rank')
                if rank in ['kingdom', 'phylum', 'class', 'order', 'family', 'genus']:
                    plant_data[rank] = ancestor.get('name')
        
        # Conservation status
        if taxon.get('conservation_status'):
            status = taxon['conservation_status']
            plant_data['conservation_status'] = status.get('status')
            plant_data['conservation_status_source'] = status.get('authority')
        
        return plant_data
    
    def fetch_all_species_details(self, taxa_ids, max_species=None):
        """Fetch detailed information for all taxa"""
        all_plants = []
        total = len(taxa_ids)
        
        if max_species:
            taxa_ids = taxa_ids[:max_species]
            total = max_species
        
        print(f"\nFetching details for {total} species...")
        
        for i, taxon_id in enumerate(taxa_ids, 1):
            print(f"Progress: {i}/{total} - Taxon ID: {taxon_id}")
            
            taxon = self.get_taxon_details(taxon_id)
            if taxon:
                plant_data = self.extract_rich_plant_data(taxon)
                if plant_data:
                    # Only include if it has some descriptive content
                    if plant_data['wikipedia_summary']:
                        all_plants.append(plant_data)
                        print(f"  ✓ {plant_data['scientific_name']} - {len(plant_data['wikipedia_summary'])} chars")
                    else:
                        print(f"  ✗ {plant_data.get('scientific_name', 'Unknown')} - No description")
            
            # Rate limiting
            time.sleep(1)
        
        return all_plants
    
    def save_to_csv(self, plants_data, filename="arizona_desert_plants.csv"):
        """Save plant data to CSV"""
        df = pd.DataFrame(plants_data)
        df.to_csv(filename, index=False)
        print(f"\n✓ Saved {len(plants_data)} plant records to {filename}")
    
    def save_to_json(self, plants_data, filename="arizona_desert_plants.json"):
        """Save plant data to JSON"""
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(plants_data, f, indent=2, ensure_ascii=False)
        print(f"✓ Saved {len(plants_data)} plant records to {filename}")
    
    def generate_summary_stats(self, plants_data):
        """Generate summary statistics"""
        print("\n" + "="*50)
        print("DATASET SUMMARY")
        print("="*50)
        
        print(f"\nTotal species: {len(plants_data)}")
        
        # Species with descriptions
        with_desc = sum(1 for p in plants_data if p.get('wikipedia_summary'))
        print(f"Species with descriptions: {with_desc}")
        
        # Average description length
        desc_lengths = [len(p.get('wikipedia_summary', '')) for p in plants_data if p.get('wikipedia_summary')]
        if desc_lengths:
            avg_length = sum(desc_lengths) / len(desc_lengths)
            print(f"Average description length: {avg_length:.0f} characters")
        
        # Family distribution
        families = defaultdict(int)
        for p in plants_data:
            if p.get('family'):
                families[p['family']] += 1
        
        print(f"\nTop 10 families:")
        for family, count in sorted(families.items(), key=lambda x: x[1], reverse=True)[:10]:
            print(f"  {family}: {count}")
        
        # Conservation status
        conserved = sum(1 for p in plants_data if p.get('conservation_status'))
        print(f"\nSpecies with conservation status: {conserved}")

## Fetching the data

The code below uses the scraper utility class to obtain ~637 plant species/families from Arizona

Two files are generated: arizona_desert_plants.csv and arizona_desert_plants.json and saved locally

**Note**: this takes ~10 minutes to run; there are rate limits in the API and the code takes a conservative approach of one request per species/family

In [None]:
# Main execution: search -> fetch details -> save -> summarise
scraper = iNaturalistSpeciesScraper()
banner = "=" * 50

print("Step 1: Searching for Arizona desert plant species...")
taxa_ids = scraper.search_arizona_desert_plants()

species_total = len(taxa_ids)
print(f"\nStep 2: Fetching detailed information for {species_total} species...")
print("(This will take a while - about 1 second per species)")

plants_data = scraper.fetch_all_species_details(taxa_ids)

# Persist the same records in both formats
print("\nStep 3: Saving data...")
scraper.save_to_csv(plants_data)
scraper.save_to_json(plants_data)

scraper.generate_summary_stats(plants_data)

print("\n" + banner)
print("Done! Check the generated CSV and JSON files.")
print(banner)

In [None]:
plants_data[:10]

## Data Cleanup

Let's examine the data we have obtained so far and see whether we can use it as-is or whether it needs some cleanup. We will read the arizona_desert_plants.json file since we have it handy already.

In [5]:
import pandas as pd

In [None]:
# Load the JSON file written by save_to_json into a DataFrame
dp = pd.read_json('arizona_desert_plants.json')

In [None]:
dp.head(10)

In [None]:
dp.count()

In [None]:
# Inspect the ones that don't have a common name
dp[dp['common_name'].isnull()]

In [None]:
# Keep only the rows that have a common name
has_common_name = dp['common_name'].notnull()
dp_filtered = dp[has_common_name]

In [None]:
dp_filtered.count()

In [None]:
# Save the filtered records: one JSON object per row, pretty-printed
clean_json_options = {
    'orient': 'records',
    'indent': 2,
}
dp_filtered.to_json('arizona_desert_plants_clean.json', **clean_json_options)

# Additional data
While the iNaturalist data is a good start, the descriptions are not enough to be able to provide sound knowledge and answers for the intent of the project. 

Again leveraging Claude, I put together code to extract data, in PDF format, from the University of Arizona Cooperative Extension.

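The ```UAExtensionScraper``` is a utility class that downloads PDFs containing information about desert plant care, plant types, landscaping and design, etc., and extracts their text. The cell below relies on `os`, `re`, `pathlib.Path`, `urllib.parse.urlparse`, `pdfplumber` and `PyPDF2`, so these must be imported before running it.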

**Note:** The final number of downloaded PDFs is 14. I originally started with a list that only yielded 6 PDFs, so I then tried another set, after which I obtained the final 14.

In [6]:
class UAExtensionScraper:
    def __init__(self, download_dir="extension_pdfs", text_dir="extension_texts"):
        """Create the working directories, HTTP session and URL list.

        Args:
            download_dir: Directory where downloaded PDFs are stored.
            text_dir: Directory where extracted text files are written.
        """
        # parents=True so nested paths such as "data/pdfs" also work
        self.download_dir = Path(download_dir)
        self.download_dir.mkdir(parents=True, exist_ok=True)
        self.text_dir = Path(text_dir)
        self.text_dir.mkdir(parents=True, exist_ok=True)

        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'UA Extension PDF Scraper'
        })

        # Curated list of Arizona Extension publications about desert plants
        # URLs verified as working in October 2024
        # I put together a bigger list based on experimentation
        #
        # Several publications appear under two site paths. download_pdf
        # names the local file after the URL's basename, so such pairs end
        # up as a single file on disk.
        self.publication_urls = [
            # Core desert plant care (Working)
            "https://extension.arizona.edu/sites/default/files/2024-08/az1048.pdf",  # Care of Desert Adapted Plants
            "https://extension.arizona.edu/sites/default/files/2024-08/az1022.pdf",  # Planting & Establishing Native & Adapted Trees

            # Specific plant types (Working)
            "https://extension.arizona.edu/sites/extension.arizona.edu/files/pubs/az1048-2018.pdf",  # Desert Adapted Plants
            "https://extension.arizona.edu/sites/extension.arizona.edu/files/pubs/az1673.pdf",  # Saguaro Cactus
            "https://extension.arizona.edu/sites/extension.arizona.edu/files/pubs/az1429.pdf",  # Palm Trees
            "https://extension.arizona.edu/sites/extension.arizona.edu/files/pubs/az1722.pdf",  # Agaves
            "https://extension.arizona.edu/sites/default/files/2024-08/az1021.pdf",  # Arizona Landscape Palms
            "https://extension.arizona.edu/sites/default/files/2024-08/az2021-2023.pdf",  # Arizona Landscape Palms Management
            "https://extension.arizona.edu/sites/extension.arizona.edu/files/attachment/Ocotillo.pdf",  # Ocotillo Care

            # Landscaping & Design (Working)
            "https://extension.arizona.edu/sites/default/files/2024-08/az1455.pdf",  # Desert Landscaping
            "https://extension.arizona.edu/sites/default/files/2024-08/az1624.pdf",  # Xeriscaping
            "https://extension.arizona.edu/sites/default/files/2024-08/az1110.pdf",  # Ground Covers
            "https://extension.arizona.edu/sites/default/files/2024-08/az1100a.pdf",  # Flower Planting Guide
            "https://extension.arizona.edu/sites/extension.arizona.edu/files/pubs/az1455.pdf",  # Desert Landscaping
            "https://extension.arizona.edu/sites/extension.arizona.edu/files/pubs/az1624.pdf",  # Xeriscaping
            "https://extension.arizona.edu/sites/extension.arizona.edu/files/pubs/az1298.pdf",  # Watering
            "https://extension.arizona.edu/sites/extension.arizona.edu/files/pubs/az1020.pdf",  # Fertilizing

            # Container & Special Topics (Working)
            # (the Container Gardening URL used to be listed twice here)
            "https://extension.arizona.edu/sites/extension.arizona.edu/files/pubs/az1713-2016.pdf",  # Container Gardening
            "https://extension.arizona.edu/sites/extension.arizona.edu/files/pubs/az1800-2019.pdf",  # Prickly Pear Cactus
            "https://extension.arizona.edu/sites/extension.arizona.edu/files/attachment/nopal_final_English.pdf",  # Desert Foods - Nopal
            "https://extension.arizona.edu/sites/extension.arizona.edu/files/attachment/Wildflowers.pdf",  # Wildflowers & Native Grasses

            # Specific guides
            "https://extension.arizona.edu/sites/extension.arizona.edu/files/pubs/az1456.pdf",  # Native Plant Guide
            "https://extension.arizona.edu/sites/extension.arizona.edu/files/pubs/az1123.pdf",  # Plant Selection

            # Additional Resources
            "https://extension.arizona.edu/sites/default/files/2024-08/az1429.pdf",  # Original Palm Trees (if different)
        ]
    
    def download_pdf(self, url, custom_filename=None):
        """Download a single PDF"""
        try:
            print(f"Downloading: {url}")
            response = self.session.get(url, timeout=30)
            response.raise_for_status()
            
            if custom_filename:
                filename = custom_filename
            else:
                # Extract filename from URL
                filename = os.path.basename(urlparse(url).path)
                if not filename.endswith('.pdf'):
                    filename += '.pdf'
            
            filepath = self.download_dir / filename
            
            with open(filepath, 'wb') as f:
                f.write(response.content)
            
            print(f"  ✓ Saved: {filename} ({len(response.content)/1024:.1f} KB)")
            return filepath
            
        except requests.exceptions.RequestException as e:
            print(f"  ✗ Error downloading {url}: {e}")
            return None
        except Exception as e:
            print(f"  ✗ Unexpected error: {e}")
            return None
    
    def extract_text_pdfplumber(self, pdf_path):
        """Extract page text with pdfplumber.

        Returns:
            ``(text, metadata)``; ``text`` is ``None`` if pdfplumber raised.
            Each non-empty page is preceded by a ``--- Page N ---`` marker.
        """
        metadata = {
            'pages': 0,
            'extraction_method': 'pdfplumber'
        }
        sections = []

        try:
            with pdfplumber.open(pdf_path) as pdf:
                metadata['pages'] = len(pdf.pages)

                for page_num, page in enumerate(pdf.pages, 1):
                    page_text = page.extract_text()
                    if not page_text:
                        continue
                    sections.append(f"\n--- Page {page_num} ---\n")
                    sections.append(page_text + "\n")

        except Exception as e:
            print(f"  ✗ pdfplumber error on {pdf_path.name}: {e}")
            return None, metadata

        return "".join(sections), metadata
    
    def extract_text_pypdf2(self, pdf_path):
        """Extract page text with PyPDF2 (used as the fallback extractor).

        Returns:
            ``(text, metadata)``; ``text`` is ``None`` if PyPDF2 raised.
            Each non-empty page is preceded by a ``--- Page N ---`` marker.
        """
        metadata = {
            'pages': 0,
            'extraction_method': 'pypdf2'
        }
        sections = []

        try:
            with open(pdf_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                metadata['pages'] = len(pdf_reader.pages)

                for page_num, page in enumerate(pdf_reader.pages, 1):
                    page_text = page.extract_text()
                    if not page_text:
                        continue
                    sections.append(f"\n--- Page {page_num} ---\n")
                    sections.append(page_text + "\n")

        except Exception as e:
            print(f"  ✗ PyPDF2 error on {pdf_path.name}: {e}")
            return None, metadata

        return "".join(sections), metadata
    
    def clean_text(self, text):
        """Clean extracted text"""
        # Remove excessive whitespace
        text = re.sub(r'\n\s*\n\s*\n', '\n\n', text)
        text = re.sub(r' +', ' ', text)
        
        # Remove common header/footer patterns
        lines = text.split('\n')
        cleaned_lines = []
        
        for line in lines:
            line = line.strip()
            
            # Skip page markers we added
            if line.startswith('--- Page') and line.endswith('---'):
                cleaned_lines.append(line)
                continue
            
            # Skip very short lines that might be page numbers
            if len(line) < 3:
                continue
            
            # Skip lines that are just numbers (page numbers)
            if re.match(r'^\d+$', line):
                continue
            
            # Skip common headers/footers
            if any(pattern in line.lower() for pattern in [
                'university of arizona',
                'cooperative extension',
                'college of agriculture',
                'issued in furtherance'
            ]):
                continue
            
            cleaned_lines.append(line)
        
        return '\n'.join(cleaned_lines)
    
    def process_pdf(self, pdf_path):
        """Extract and clean text from PDF"""
        print(f"Processing: {pdf_path.name}")
        
        # Try pdfplumber first (better quality)
        text, metadata = self.extract_text_pdfplumber(pdf_path)
        
        # Fallback to PyPDF2 if pdfplumber fails
        if not text or len(text.strip()) < 100:
            print(f"  → Trying PyPDF2 fallback...")
            text, metadata = self.extract_text_pypdf2(pdf_path)
        
        if not text or len(text.strip()) < 100:
            print(f"  ✗ No meaningful text extracted")
            return None
        
        # Clean the text
        clean_text = self.clean_text(text)
        
        # Save to text file
        txt_filename = pdf_path.stem + '.txt'
        txt_path = self.text_dir / txt_filename
        
        with open(txt_path, 'w', encoding='utf-8') as f:
            f.write(clean_text)
        
        print(f"  ✓ Extracted {len(clean_text)} characters ({metadata['pages']} pages)")
        
        return {
            'pdf_filename': pdf_path.name,
            'txt_filename': txt_filename,
            'source_url': None,  # Will be set by caller
            'text_length': len(clean_text),
            'pages': metadata['pages'],
            'extraction_method': metadata['extraction_method']
        }
    
    def download_all_publications(self):
        """Download all curated publications"""
        print("="*60)
        print("Downloading UA Extension Publications")
        print("="*60)
        
        downloaded = []
        
        for i, url in enumerate(self.publication_urls, 1):
            print(f"\n[{i}/{len(self.publication_urls)}]")
            filepath = self.download_pdf(url)
            if filepath:
                downloaded.append((filepath, url))
        
        print(f"\n✓ Downloaded {len(downloaded)}/{len(self.publication_urls)} PDFs")
        return downloaded
    
    def process_all_pdfs(self):
        """Process all PDFs in download directory"""
        pdf_files = list(self.download_dir.glob("*.pdf"))
        
        if not pdf_files:
            print("No PDFs found in download directory!")
            return []
        
        print("\n" + "="*60)
        print(f"Processing {len(pdf_files)} PDFs")
        print("="*60 + "\n")
        
        results = []
        
        for i, pdf_path in enumerate(pdf_files, 1):
            print(f"[{i}/{len(pdf_files)}]")
            result = self.process_pdf(pdf_path)
            if result:
                results.append(result)
        
        return results
    
    def save_metadata(self, results, filename="extension_publications_metadata.json"):
        """Save processing results metadata"""
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=2)
        print(f"\n✓ Saved metadata to {filename}")
    
    def generate_summary(self, results):
        """Generate summary statistics"""
        print("\n" + "="*60)
        print("EXTRACTION SUMMARY")
        print("="*60)
        
        total_pdfs = len(results)
        total_pages = sum(r['pages'] for r in results)
        total_chars = sum(r['text_length'] for r in results)
        
        print(f"\nSuccessfully processed: {total_pdfs} publications")
        print(f"Total pages: {total_pages}")
        print(f"Total text content: {total_chars:,} characters")
        print(f"Average per document: {total_chars//total_pdfs:,} characters")
        
        print(f"\nText files saved to: {self.text_dir}/")


In [None]:
# Main execution: download -> extract text -> save metadata -> summarise
scraper = UAExtensionScraper()
banner = "=" * 60

print("Step 1: Downloading publications...")
downloaded = scraper.download_all_publications()

if not downloaded:
    print("\n✗ No PDFs were downloaded")
else:
    print("\nStep 2: Extracting text from PDFs...")
    results = scraper.process_all_pdfs()

    if not results:
        print("\n✗ No text was successfully extracted")
    else:
        print("\nStep 3: Saving metadata...")
        scraper.save_metadata(results)

        scraper.generate_summary(results)

        print("\n" + banner)
        print("✓ Done! Check the 'extension_texts' folder for extracted content")
        print(banner)

# Putting all the data together

Now, we need to put it all together into a single dataset. Again, I enlisted Claude to help me write a script to combine the two data sets.

The resulting dataset is stored in 3 different formats: csv, json and json lines (jsonl); the format of the filenames is arizona_plants_unified_YYYYMMDD.<csv|json|jsonl>

**Note:** the code below uses simple chunking, for now

In [10]:
import pandas as pd
import json
from pathlib import Path
from datetime import datetime

class ArizonaPlantDatasetBuilder:
    def __init__(self, 
                 inaturalist_json,
                 extension_text_dir):
        self.inaturalist_json = inaturalist_json
        self.extension_text_dir = Path(extension_text_dir)
        self.unified_dataset = []
    
    def load_inaturalist_data(self):
        """Load and process iNaturalist species data"""
        print("Loading iNaturalist data...")
        
        with open(self.inaturalist_json, 'r', encoding='utf-8') as f:
            species_data = json.load(f)
        
        # Filter: only species with common names
        species_data = [s for s in species_data if s.get('common_name')]
        
        print(f"Loaded {len(species_data)} species with common names")
        
        documents = []
        for species in species_data:
            # Create a rich text document for each species
            doc_text = self._format_species_document(species)
            
            document = {
                'id': f"species_{species['taxon_id']}",
                'type': 'species',
                'source': 'iNaturalist',
                'title': f"{species['common_name']} ({species['scientific_name']})",
                'content': doc_text,
                'metadata': {
                    'scientific_name': species['scientific_name'],
                    'common_name': species['common_name'],
                    'family': species.get('family'),
                    'genus': species.get('genus'),
                    'native': species.get('native'),
                    'conservation_status': species.get('conservation_status'),
                    'wikipedia_url': species.get('wikipedia_url'),
                    'observations_count': species.get('observations_count', 0)
                }
            }
            
            documents.append(document)
        
        return documents
    
    def _format_species_document(self, species):
        """Format species data into a readable document"""
        lines = []
        
        # Title
        lines.append(f"# {species['common_name']}")
        lines.append(f"*{species['scientific_name']}*")
        lines.append("")
        
        # Taxonomy
        taxonomy = []
        if species.get('family'):
            taxonomy.append(f"Family: {species['family']}")
        if species.get('genus'):
            taxonomy.append(f"Genus: {species['genus']}")
        if taxonomy:
            lines.append("**Taxonomy:** " + ", ".join(taxonomy))
            lines.append("")
        
        # Description (Wikipedia summary)
        if species.get('wikipedia_summary'):
            lines.append("## Description")
            # Remove HTML tags from wikipedia summary
            import re
            clean_summary = re.sub(r'<[^>]+>', '', species['wikipedia_summary'])
            lines.append(clean_summary)
            lines.append("")
        
        # Status
        status_info = []
        if species.get('native'):
            status_info.append("Native to Arizona")
        elif species.get('introduced'):
            status_info.append("Introduced species")
        
        if species.get('conservation_status'):
            status_info.append(f"Conservation Status: {species['conservation_status']}")
        
        if status_info:
            lines.append("**Status:** " + ", ".join(status_info))
            lines.append("")
        
        # Observations
        if species.get('observations_count'):
            lines.append(f"*{species['observations_count']} observations recorded in Arizona*")
            lines.append("")
        
        return "\n".join(lines)
    
    def load_extension_documents(self):
        """Load and process Extension publication texts"""
        print("Loading Extension publications...")
        
        text_files = list(self.extension_text_dir.glob("*.txt"))
        print(f"Found {len(text_files)} text files")
        
        documents = []
        
        for txt_file in text_files:
            with open(txt_file, 'r', encoding='utf-8') as f:
                content = f.read()
            
            # Extract title from filename (e.g., az1048.txt -> az1048)
            doc_id = txt_file.stem
            
            # Try to extract title from first few lines
            title = self._extract_title_from_content(content, doc_id)
            
            document = {
                'id': f"extension_{doc_id}",
                'type': 'extension_publication',
                'source': 'University of Arizona Cooperative Extension',
                'title': title,
                'content': content,
                'metadata': {
                    'publication_id': doc_id,
                    'filename': txt_file.name,
                    'content_length': len(content)
                }
            }
            
            documents.append(document)
        
        return documents
    
    def _extract_title_from_content(self, content, default_id):
        """Try to extract title from document content"""
        lines = content.split('\n')
        
        # Look for title in first 10 lines
        for line in lines[:10]:
            line = line.strip()
            # Title is usually a longer line with capitals
            if len(line) > 20 and len(line) < 100:
                if any(word[0].isupper() for word in line.split() if word):
                    return line
        
        return f"Extension Publication {default_id}"
    
    def chunk_long_documents(self, documents, max_chunk_size=2000):
        """Split long documents into smaller chunks for better retrieval"""
        chunked_docs = []
        
        for doc in documents:
            content = doc['content']
            
            # If document is short enough, keep as is
            if len(content) <= max_chunk_size:
                chunked_docs.append(doc)
                continue
            
            # Split into chunks by paragraphs
            paragraphs = content.split('\n\n')
            current_chunk = []
            current_length = 0
            chunk_num = 1
            
            for para in paragraphs:
                para_length = len(para)
                
                # If adding this paragraph exceeds limit, save current chunk
                if current_length + para_length > max_chunk_size and current_chunk:
                    chunk_content = '\n\n'.join(current_chunk)
                    
                    chunk_doc = {
                        'id': f"{doc['id']}_chunk_{chunk_num}",
                        'type': doc['type'],
                        'source': doc['source'],
                        'title': f"{doc['title']} (Part {chunk_num})",
                        'content': chunk_content,
                        'metadata': {
                            **doc['metadata'],
                            'chunk_number': chunk_num,
                            'is_chunk': True,
                            'parent_id': doc['id']
                        }
                    }
                    
                    chunked_docs.append(chunk_doc)
                    
                    # Start new chunk
                    current_chunk = [para]
                    current_length = para_length
                    chunk_num += 1
                else:
                    current_chunk.append(para)
                    current_length += para_length
            
            # Add final chunk
            if current_chunk:
                chunk_content = '\n\n'.join(current_chunk)
                chunk_doc = {
                    'id': f"{doc['id']}_chunk_{chunk_num}",
                    'type': doc['type'],
                    'source': doc['source'],
                    'title': f"{doc['title']} (Part {chunk_num})" if chunk_num > 1 else doc['title'],
                    'content': chunk_content,
                    'metadata': {
                        **doc['metadata'],
                        'chunk_number': chunk_num,
                        'is_chunk': chunk_num > 1,
                        'parent_id': doc['id']
                    }
                }
                chunked_docs.append(chunk_doc)
        
        return chunked_docs
    
    def build_unified_dataset(self, chunk_size=2000):
        """Assemble the combined dataset from every source, then chunk it.

        Species documents come from ``load_inaturalist_data`` and extension
        documents from ``load_extension_documents``; they are concatenated
        (species first) and handed to ``chunk_long_documents``. Progress and
        document counts are printed along the way.

        Args:
            chunk_size: Value forwarded to ``chunk_long_documents`` and
                reported in the progress output as the maximum chunk size
                in characters.

        Returns:
            The result of ``chunk_long_documents`` for the combined list.
        """
        rule = "=" * 60
        print(rule)
        print("Building Unified Arizona Desert Plants Dataset")
        print(rule + "\n")

        # Gather each source separately so the per-source counts can be shown.
        species = self.load_inaturalist_data()
        publications = self.load_extension_documents()
        combined = species + publications

        print(
            f"\nTotal documents before chunking: {len(combined)}",
            f"  - Species: {len(species)}",
            f"  - Extension publications: {len(publications)}",
            sep="\n",
        )

        # Split anything longer than the requested size.
        print(f"\nChunking documents (max size: {chunk_size} chars)...")
        result = self.chunk_long_documents(combined, chunk_size)
        print(f"Total documents after chunking: {len(result)}")

        return result
    
    def save_dataset(self, documents, directory, output_format='json'):
        """Save the unified dataset to a date-stamped file.

        The file is named ``arizona_plants_unified_<YYYYMMDD>.<format>`` and
        is written inside ``directory``.

        Args:
            documents: List of document dicts. JSON and JSONL output serialise
                each dict as-is; CSV output reads the ``id``, ``type``,
                ``source``, ``title``, ``content`` and ``metadata`` keys.
            directory: Output directory. Created if it does not exist yet.
            output_format: One of ``'json'``, ``'jsonl'`` or ``'csv'``.

        Returns:
            The path of the file that was written, as a string.

        Raises:
            ValueError: If ``output_format`` is not a supported format.
        """
        import os  # local import: only this method needs it

        supported_formats = ('json', 'jsonl', 'csv')
        # Fail early with a clear message; previously an unknown format fell
        # through every branch and crashed on the unbound ``filename``.
        if output_format not in supported_formats:
            raise ValueError(
                f"Unsupported output_format {output_format!r}; "
                f"expected one of {supported_formats}"
            )

        # An empty directory string cannot be created, so leave it alone.
        if directory:
            os.makedirs(directory, exist_ok=True)

        timestamp = datetime.now().strftime("%Y%m%d")
        filename = f"{directory}/arizona_plants_unified_{timestamp}.{output_format}"

        if output_format == 'json':
            with open(filename, 'w', encoding='utf-8') as f:
                json.dump(documents, f, indent=2, ensure_ascii=False)

        elif output_format == 'jsonl':
            # One JSON object per line.
            with open(filename, 'w', encoding='utf-8') as f:
                for doc in documents:
                    f.write(json.dumps(doc, ensure_ascii=False) + '\n')

        else:
            # CSV: flatten each document, lifting metadata keys into
            # ``meta_``-prefixed columns.
            df = pd.DataFrame([
                {
                    'id': doc['id'],
                    'type': doc['type'],
                    'source': doc['source'],
                    'title': doc['title'],
                    'content': doc['content'],
                    'content_length': len(doc['content']),
                    **{f"meta_{k}": v for k, v in doc['metadata'].items()}
                }
                for doc in documents
            ])
            df.to_csv(filename, index=False)

        print(f"\n✓ Saved {output_format.upper()} dataset: {filename}")
        return filename
    
    def generate_dataset_report(self, documents):
        """Print summary statistics for a list of documents.

        Reports the document count, total and average content length,
        a per-``type`` breakdown, how many documents are flagged as chunks
        (``metadata['is_chunk']`` truthy), and the minimum / median / maximum
        content length. Nothing is returned.

        Args:
            documents: List of document dicts, each with ``content``,
                ``type`` and ``metadata`` keys. An empty list is reported as
                zero documents instead of raising.
        """
        print("\n" + "="*60)
        print("DATASET STATISTICS")
        print("="*60)

        total_docs = len(documents)
        if not documents:
            # Averages and min/median/max are undefined without documents;
            # previously this path raised ZeroDivisionError.
            print(f"\nTotal Documents: {total_docs}")
            print("No documents to summarize.")
            return

        # Overall stats (lengths are computed once and reused below)
        lengths = sorted(len(doc['content']) for doc in documents)
        total_chars = sum(lengths)
        avg_chars = total_chars // total_docs

        print(f"\nTotal Documents: {total_docs}")
        print(f"Total Content: {total_chars:,} characters")
        print(f"Average per Document: {avg_chars:,} characters")

        # By type
        by_type = {}
        for doc in documents:
            doc_type = doc['type']
            by_type[doc_type] = by_type.get(doc_type, 0) + 1

        print("\nDocument Types:")
        for doc_type, count in sorted(by_type.items()):
            print(f"  {doc_type}: {count}")

        # Chunked vs original
        chunked = sum(1 for doc in documents if doc['metadata'].get('is_chunk'))
        print(f"\nChunked documents: {chunked}")
        print(f"Original documents: {total_docs - chunked}")

        # Content length distribution. For an even number of documents the
        # "median" is the upper of the two middle values.
        print("\nContent length distribution:")
        print(f"  Minimum: {lengths[0]:,} chars")
        print(f"  Median: {lengths[len(lengths)//2]:,} chars")
        print(f"  Maximum: {lengths[-1]:,} chars")


In [12]:
# Configure the builder with the iNaturalist JSON file and the directory of
# extension publication texts.
builder = ArizonaPlantDatasetBuilder(
    inaturalist_json="data-preparation/arizona_desert_plants_clean.json",
    extension_text_dir="extension_texts"
)

# Build unified dataset with 2000 char chunks
documents = builder.build_unified_dataset(chunk_size=2000)

# Save in multiple formats.
# Named ``output_dir`` rather than ``dir`` so the builtin ``dir()`` stays
# usable in later notebook cells.
output_dir = "data-preparation"
print("\nSaving dataset in multiple formats...")
for output_format in ('json', 'jsonl', 'csv'):
    builder.save_dataset(documents, output_dir, output_format=output_format)

# Generate report
builder.generate_dataset_report(documents)

print("\n" + "="*60)
print("✓ Dataset integration complete!")
print("="*60)


In [None]:
# Load a previously saved JSON dataset into a DataFrame.
# NOTE(review): the save cell above writes into "data-preparation/" and stamps
# the filename with the run date, while this reads a fixed-date file from the
# working directory. Confirm the path points at the intended file.
combined_dataset = pd.read_json('arizona_plants_unified_20251018.json')

In [None]:
# Show the number of non-null values in each column of the loaded dataset.
combined_dataset.count()