In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# Care requirements to look for on a page.
#
# Each entry maps a requirement name to:
#   "keywords": synonyms/related words for the requirement (not consulted by
#               the scraping code shown in this file; kept for reference), and
#   "patterns": regular expressions whose matches are recorded as the
#               "Measure" for that requirement.
#
# Pattern conventions:
#   * Alternations use non-capturing groups `(?:...)` so that `re.findall`
#     returns the whole matched measurement (e.g. "12 inches") rather than
#     only the captured unit.
#   * No trailing `\b` after a non-word character such as `%`: a word boundary
#     there would require a letter/digit to follow, so "60% " would never match.
care_requirements = {
    "Habitat": {
        "keywords": [
            "enclosure", "vivarium", "tank", "habitat", "housing",
            "terrarium", "cage", "container", "dwelling", "lair",
            "shelter", "pen", "compartment", "quarters", "environment",
            "set-up", "home", "living space", "space"
        ],
        "patterns": [
            r'\b\d+x\d+x\d+\b',  # e.g., 12x12x12
            r'\b\d+ gallons?\b',  # e.g., 20 gallons
            r'\b\d+\s?(?:inches|in|cm|centimeters|feet|ft)\b'  # e.g., 12 inches, 12 cm
        ]
    },
    "Diet": {
        "keywords": [
            "food", "feeding", "diet", "nutrition", "meals",
            "eating", "consumption", "intake", "grub", "nourishment",
            "provisions", "sustenance", "fare", "feed", "appetite",
            "rations", "dietary", "snacks", "bites", "foodstuff"
        ],
        "patterns": [
            r'\b\d+\s?(?:crickets|mealworms|mice|rats|worms|insects|waxworms|dubia roaches)\b per day',  # e.g., 5 crickets per day
            r'\b\d+\s?(?:crickets|mealworms|mice|rats|worms|insects|waxworms|dubia roaches|fruit|vegetables|dehydrated food|gel-based food)\b',  # e.g., 5 crickets, 2 vegetables
            r'\b\d+\s?grams?\b',  # e.g., 50 grams
            r'\bfruit\b',  # Fruit
            r'\bvegetables\b',  # Vegetables
            r'\bdehydrated food\b',  # Dehydrated food
            r'\bgel-based food\b'  # Gel-based food
        ]
    },
    "Temperature": {
        "keywords": [
            "heat", "temperature", "climate", "warmth", "cooling",
            "thermoregulation", "thermal", "hot", "cold", "chill",
            "basking", "ambient temperature", "warm", "cool", "degrees",
            "Fahrenheit", "Celsius", "heat lamp", "temperature control",
            "thermometer", "heating", "cooling down", "warming up"
        ],
        "patterns": [
            r'\b\d+(?:-\d+)? ?°?(?:F|C)\b',  # e.g., 75-85°F or 25°C
            r'\b\d+\s?(?:°?F|°?C)\b'  # e.g., 75°F, 25 °C
        ]
    },
    "Humidity": {
        "keywords": [
            "humidity", "moisture", "dampness", "wetness",
            "hydration", "humid", "dew", "steam", "condensation",
            "mist", "fog", "water vapor", "relative humidity",
            "dryness", "humidifier", "dehumidifier", "water content",
            "moisture level", "saturation", "humid conditions",
            "dry", "air moisture"
        ],
        "patterns": [
            r'\b\d+%'  # e.g., 60% (no trailing \b: '%' is not a word character)
        ]
    },
    "Lighting": {
        "keywords": [
            "lighting", "light", "UV", "sunlight", "illumination",
            "brightness", "luminance", "glow", "radiance", "shining",
            "daylight", "bulbs", "lamps", "fluorescent", "incandescent",
            "LED", "light cycle", "photoperiod", "light source",
            "light exposure", "light duration", "visible light",
            "ultraviolet", "infrared", "natural light", "artificial light"
        ],
        "patterns": [
            r'\b\d+\s?hours?\b',  # e.g., 12 hours
            r'\b\d+\s?hrs?\b'  # e.g., 12 hrs
        ]
    },
    "Water": {
        "keywords": [
            "water", "drinking", "hydration", "moisture", "soak",
            "dish", "clean", "replenish"
        ],
        "patterns": [
            r'\bwater dish\b',  # Water dish
            r'\bshallow dish\b',  # Shallow dish
            r'\bclean and replenish\b',  # Clean and replenish
            r'\bsoak\b',  # Soak
        ]
    }
}

def extract_values(text, patterns):
    """Return every substring of ``text`` matched by any of ``patterns``.

    Args:
        text: The string to search.
        patterns: An iterable of regular-expression strings.

    Returns:
        A list of the full matched substrings (whitespace-stripped), grouped
        by pattern in the order the patterns were given and, within a
        pattern, in left-to-right order of occurrence. Overlapping patterns
        may yield the same substring more than once.
    """
    values = []
    for pattern in patterns:
        # Use the whole match (group 0). ``re.findall`` returns only the
        # capture groups when a pattern has any, which would turn
        # "12 inches" into "inches" and "75-85°F" into "-85 F".
        values.extend(
            match.group(0).strip() for match in re.finditer(pattern, text)
        )
    return values

def scrape_species_info(url, species, timeout=10):
    """Fetch ``url`` and collect care measurements from text mentioning ``species``.

    Every text node of the page that contains the species name
    (case-insensitive) is tested against the patterns in
    ``care_requirements``. The first requirement, in dictionary order, whose
    patterns produce at least one match is assigned to that text node, and
    one record is emitted per matched value.

    Args:
        url: Address of the page to scrape.
        species: Species name that a text node must mention to be considered.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        A list of dicts with the keys "Species", "Care Requirement",
        "Measure", "Match Score", "Source", "Text" and "Top Contributors".
        The list is empty when the request fails, the server does not answer
        with HTTP 200, or nothing matches.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Report network failures the same way as a bad status code instead
        # of letting the exception escape.
        print(f"Failed to retrieve data from {url}. Error: {exc}")
        return []

    if response.status_code != 200:
        print(f"Failed to retrieve data from {url}. HTTP Status Code: {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    care_info = []

    # Lower-case the species once rather than once per text node.
    species_lower = species.lower()
    texts = soup.find_all(string=True)
    relevant_texts = [text for text in texts if species_lower in text.lower()]
    print(f"Found {len(relevant_texts)} relevant text elements mentioning the species.")

    for text_element in relevant_texts:
        text = text_element.strip()
        # Skip empty nodes and nodes that look like inline script/JSON.
        if not text or "{" in text or "}" in text or "window." in text:
            continue

        print(f"Evaluating text: {text[:100]}...")

        best_match = None
        best_values = []
        best_score = 0
        top_contributors = []

        # NOTE(review): the first requirement with any match wins (dict
        # order); requirements are not compared against each other and the
        # "keywords" lists are not used here.
        for requirement, details in care_requirements.items():
            patterns = details['patterns']
            values = extract_values(text, patterns)
            if values:
                best_match = requirement
                best_values = values
                best_score = 50 * len(values)
                top_contributors = patterns
                break

        if not best_match or not best_values:
            continue  # Skip if no measure is found

        for value in best_values:
            care_info.append({
                "Species": species,
                "Care Requirement": best_match,
                "Measure": value,
                "Match Score": best_score,
                "Source": url,
                "Text": text,
                "Top Contributors": top_contributors
            })

    if care_info:
        print(f"Relevant info found for {species}:")
        for info in care_info:
            print(info)
    else:
        print(f"No relevant information found for {species}.")

    return care_info

def main():
    """Scrape the Gargoyle Gecko care page and save the results as CSV.

    Writes ``reptile_care_info.csv`` in the current directory when at least
    one record was extracted; otherwise only prints a notice.
    """
    species = 'Gargoyle Gecko'
    url = "https://www.thesprucepets.com/caring-for-pet-gargoyle-geckos-5120265"
    records = scrape_species_info(url, species)

    if not records:
        print("No valid reptile care information found.")
        return

    # 'Top Contributors' is not part of the duplicate key.
    dedupe_columns = ['Species', 'Care Requirement', 'Measure', 'Match Score', 'Source', 'Text']
    frame = pd.DataFrame(records).drop_duplicates(subset=dedupe_columns)
    frame.to_csv('reptile_care_info.csv', index=False)
    print("Reptile care information saved to reptile_care_info.csv")


if __name__ == "__main__":
    main()

Found 25 relevant text elements mentioning the species.
Evaluating text: A Guide to Caring for Gargoyle Geckos as Pets...
Evaluating text: Gargoyle Gecko: Species Profile...
Evaluating text: Choosing Your Gargoyle Gecko...
Evaluating text: The gargoyle gecko is coveted for both its beautiful, vibrant coloration as well as its easygoing an...
Evaluating text: Most often found in the southern parts of New Caledonia, a small island nation east of the Coral Sea...
Evaluating text: Gargoyle Gecko...
Evaluating text: Gargoyle Gecko Behavior and Temperament...
Evaluating text: The gargoyle gecko is known for its friendly nature. They adapt well to life as a pet and should liv...
Evaluating text: When stressed, you may notice your gargoyle gecko...
Evaluating text: Though they are usually rather docile, the gargoyle gecko can exhibit territorial behavior when hous...
Evaluating text: Gargoyle geckos are crepuscular, meaning that they are most active during both dawn and dusk. So don...
Evaluat