In [7]:
import pandas as pd
import wikipediaapi
import os
import groq
from dotenv import load_dotenv

# User-Agent string that identifies this application to the Wikipedia API.
# Replace the project name and contact address with your own.
user_agent = "FinalProject/1.0 (baruchgottesman@gmail.com)"

# English-language Wikipedia client, reused by get_wikipedia_text
wiki_api = wikipediaapi.Wikipedia(user_agent=user_agent, language='en')

# Species names to look up, one per line
bird_species = [
    "White-faced Whistling",
    "Black-bellied Whistling",
    "West Indian Whistling",
    "Fulvous Whistling",
    "Emperor Goose",
]

    # "Snow Goose", "Ross's Goose", "Graylag Goose", "Greater White-fronted Goose", "Lesser White-fronted Goose", "Taiga Bean-goose",
    # "Tundra Bean-goose", "Pink-footed Goose", "Brant", "Barnacle Goose", "Cackling Goose", "Canada Goose", "Hawaiian Goose",\
    # "Mute Swan", "Black Swan", "Trumpeter Swan", "Tundra Swan", "Whooper Swan", "Egyptian Goose", "Common Shelduck", "Muscovy Duck",
    # "Wood Duck", "Baikal Teal", "Garganey", "Blue-winged Teal", "Cinnamon Teal", "Northern Shoveler", "Gadwall", "Falcated Duck",
    # "Eurasian Wigeon", "American Wigeon", "Pacific Black Duck", "Laysan Duck", "Hawaiian Duck", "Eastern Spot-billed Duck",
    # "Mallard", "Mexican Duck", "American Black Duck", "Mottled Duck", "White-cheeked Pintail", "Northern Pintail", "Green-winged Teal",
    # "Canvasback", "Redhead", "Common Pochard", "Ring-necked Duck", "Tufted Duck", "Greater Scaup", "Lesser Scaup", "Steller's Eider",
    # "Spectacled Eider", "King Eider", "Common Eider", "Harlequin Duck", "Labrador Duck", "Surf Scoter", "Velvet Scoter",
    # "White-winged Scoter", "Stejneger's Scoter", "Common Scoter", "Black Scoter", "Long-tailed Duck", "Bufflehead",
    # "Common Goldeneye", "Barrow's Goldeneye", "Smew", "Hooded Merganser", "Common Merganser", "Red-breasted Merganser",
    # "Masked Duck", "Ruddy Duck"

In [9]:
# Accumulates one [species name, Wikipedia text] row per species;
# the fetch loop further down appends to it and the rows become the DataFrame
data = []

# Function to get Wikipedia text with name variations
def get_wikipedia_text(species, wiki=None):
    """
    Return the text of the first Wikipedia page found for a species name.

    The name is tried as given and then under a series of spelling variants
    (added "bird"/"duck" suffixes, underscores or dashes instead of spaces,
    lowercase and per-word capitalized forms). Variants are tried in a fixed
    order and the first one whose page exists wins.

    Args:
        species: Common name of the species to look up.
        wiki: Optional client object exposing ``page(title)``; the returned
            page must offer ``exists()`` and ``text``. Defaults to the
            notebook-level ``wiki_api`` client.

    Returns:
        The page text of the first existing page, or the string
        "Wikipedia page not found" when no variant matches or when
        ``species`` is not a non-blank string.
    """
    not_found = "Wikipedia page not found"

    # A missing or blank name cannot match any page, so skip the lookups.
    if not isinstance(species, str) or not species.strip():
        return not_found

    # Only touch the notebook-level client when the caller supplied none.
    if wiki is None:
        wiki = wiki_api

    # Split the species name into individual words for more flexible capitalization control
    words = species.split()
    capitalized = [word.capitalize() for word in words]
    lowered = species.lower()

    # List of variations based on potential naming patterns
    variations = [
        species,                             # Original name
        species + " bird",                   # Add "bird"
        species + " duck",                   # Add "duck"
        species.replace(" ", "_"),           # Replace space with underscore
        species.replace(" ", "-"),           # Replace space with dash
        lowered,                             # Lowercase the original name
        lowered + " bird",                   # Lowercase with "bird"
        lowered + " duck",                   # Lowercase with "duck"
        "_".join(words).lower() + "_duck",   # Lowercase words with "_duck"
        "-".join(words).lower() + "-duck",   # Lowercase words with "-duck"
        "_".join(capitalized),               # Capitalize each word, join with underscore
        "-".join(capitalized),               # Capitalize each word, join with dash
        "_".join(words) + "_duck",           # Original case with "_duck"
        "-".join(words) + "-duck",           # Original case with "-duck"
        "_".join(words) + "_bird",           # Original case with "_bird"
        "-".join(words) + "-bird",           # Original case with "-bird"
        " ".join(capitalized) + " Duck",     # Capitalized with "Duck"
        " ".join(capitalized) + " Bird",     # Capitalized with "Bird"
    ]

    # Several patterns can produce the same title (for example the
    # underscore-joined and capitalized-underscore forms of an already
    # capitalized name). dict.fromkeys drops the repeats while keeping the
    # original try-order, so each distinct title is requested at most once.
    for name in dict.fromkeys(variations):
        page = wiki.page(name)
        if page.exists():
            return page.text  # Return the text if the page exists

    return not_found  # No variation was successful

# Look up every species in list order and record a [name, article text] row for it
data.extend([name, get_wikipedia_text(name)] for name in bird_species)

# Turn the collected rows into a two-column table
df = pd.DataFrame(data=data, columns=["Species Name", "Wikipedia Text"])

# Preview the first rows as the cell output
df.head()

Unnamed: 0,Species Name,Wikipedia Text
0,White-faced Whistling,The white-faced whistling duck (Dendrocygna vi...
1,Black-bellied Whistling,The black-bellied whistling duck (Dendrocygna ...
2,West Indian Whistling,The West Indian whistling duck (Dendrocygna ar...
3,Fulvous Whistling,The fulvous whistling duck or fulvous tree duc...
4,Emperor Goose,"The emperor goose (Anser canagicus), also know..."


In [21]:
import os
import groq
import pandas as pd
from dotenv import load_dotenv

# Pull the variables defined in the local env file into the process environment
load_dotenv("GROQ_API_KEY.env")
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")

# Groq client used by extract_latin_name for its chat-completion requests
client = groq.Groq(api_key=GROQ_API_KEY)

def extract_latin_name(species_text, max_chars=750, model="mixtral-8x7b-32768", llm_client=None):
    """
    Extract the scientific (Latin) name from a species description using Groq.

    Only the first ``max_chars`` characters of the text are placed in the
    prompt, which asks the model to reply with nothing but the Latin name.

    Args:
        species_text: Free text describing the species.
        max_chars: Number of leading characters of ``species_text`` to send.
        model: Identifier of the chat model to query.
            NOTE(review): confirm the default model id is still served.
        llm_client: Optional client exposing ``chat.completions.create``;
            defaults to the notebook-level ``client``.

    Returns:
        The model's reply with surrounding whitespace removed; "Not found"
        when the input text is missing/blank or the reply is empty; "Error"
        when the request raises (the exception message is printed).
    """
    # Missing or blank text cannot contain a name: answer with the same
    # sentinel the prompt asks the model to use, without spending an API call.
    if not isinstance(species_text, str) or not species_text.strip():
        return "Not found"

    # Only the beginning of the text is sent to keep the prompt short
    truncated_text = species_text[:max_chars]
    
    prompt = f"""
    Given the following text about a bird species, extract ONLY the scientific (Latin) name.
    Return ONLY the Latin name with no additional text or punctuation.
    If no Latin name is found, return "Not found".
    
    Text:
    {truncated_text}
    """
    
    try:
        # Resolved inside the try so that a missing client is reported the
        # same way as any other request failure.
        active_client = client if llm_client is None else llm_client
        response = active_client.chat.completions.create(
            messages=[
                {"role": "user", "content": prompt}
            ],
            model=model,
            temperature=0.1,
        )
        # The reply content may be None or blank; map both to the sentinel
        # instead of failing on .strip() or returning an empty name.
        content = response.choices[0].message.content
        return (content or "").strip() or "Not found"
    except Exception as e:
        # Best effort: one failed request must not abort the whole run.
        print(f"Error extracting Latin name: {str(e)}")
        return "Error"

# Result table: a 1-based running number and the species name, with the
# Latin name column left empty until the loop below fills it in
new_df = pd.DataFrame(
    {
        'Number': range(1, len(df) + 1),
        'Species Name': df['Species Name'],
        'Latin Name': None,
    }
)

# Ask the model once per row of df and store the answer under the same index label
for i, row in df.iterrows():
    species_name = row["Species Name"]
    wikipedia_text = row["Wikipedia Text"] if row["Wikipedia Text"] else ""

    # The species name is placed ahead of the article text in what gets sent
    new_df.at[i, 'Latin Name'] = extract_latin_name(
        f"Species: {species_name}\n\nWikipedia:\n{wikipedia_text}"
    )

    # Optional progress message after every tenth entry
    if not (i + 1) % 10:
        print(f"Processed {i + 1} species")

# Show the finished table under a heading
print("\nSpecies Data:", new_df, sep="\n")

# Optionally save to CSV
# new_df.to_csv('species_latin_names.csv', index=False)


Species Data:
   Number             Species Name              Latin Name
0       1    White-faced Whistling     Dendrocygna viduata
1       2  Black-bellied Whistling  Dendrocygna autumnalis
2       3    West Indian Whistling     Dendrocygna arborea
3       4        Fulvous Whistling     Dendrocygna bicolor
4       5            Emperor Goose         Anser canagicus


In [19]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from urllib.parse import quote

def get_birdlife_data(scientific_name: str, timeout: float = 10.0) -> dict:
    """
    Fetch family and Red List category from BirdLife for a scientific name.

    The first two words of the name are used as search keywords on the
    BirdLife DataZone results page, and the values are read from the third
    and fourth cells of the first data row of the first table on that page.

    Args:
        scientific_name: Binomial name, e.g. "Anser canagicus".
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A dict with the keys 'Family' and 'Red List Category'. Both values
        are 'Not Found' when the name is not a usable two-word string or the
        page holds no matching row, and 'Error' when the request or parsing
        raises (the exception message is printed).
    """
    not_found = {'Family': 'Not Found', 'Red List Category': 'Not Found'}

    # Missing values (None, NaN, ...) cannot be searched for.
    if not isinstance(scientific_name, str):
        return not_found

    # A search needs at least a genus and a species word. The literal
    # "Not found" is a placeholder, not a name, so it is not searched either.
    cleaned = scientific_name.strip()
    search_terms = cleaned.split()
    if len(search_terms) < 2 or cleaned.lower() == "not found":
        return not_found

    search_url = f"https://datazone.birdlife.org/species/results?thrlev1=&thrlev2=&kw=+{quote(search_terms[0])}+{quote(search_terms[1])}"

    try:
        # Without a timeout a stalled connection would block indefinitely.
        response = requests.get(search_url, timeout=timeout)
        response.raise_for_status()

        # Parse HTML and locate the results table
        soup = BeautifulSoup(response.content, 'html.parser')
        table = soup.find('table')
        if table is None:
            return not_found

        # Row 0 is the header; the first data row is taken as the match.
        rows = table.find_all('tr')
        if len(rows) <= 1:
            return not_found

        cols = rows[1].find_all('td')
        if len(cols) < 4:
            return not_found

        return {
            'Family': cols[2].text.strip(),
            'Red List Category': cols[3].text.strip()
        }

    except Exception as e:
        # Best effort: report the failure and let the caller continue.
        print(f"Error fetching data for {scientific_name}: {str(e)}")
        return {'Family': 'Error', 'Red List Category': 'Error'}

def enrich_species_data(df: pd.DataFrame, delay: float = 2.0) -> pd.DataFrame:
    """
    Add BirdLife data to a copy of the given DataFrame.

    Each value of the 'Latin Name' column is passed to get_birdlife_data, and
    the results are stored in two new columns, 'Family' and
    'Red List Category'. The input frame itself is left unchanged.

    Args:
        df: Frame with a 'Latin Name' column.
        delay: Seconds to pause between consecutive lookups.

    Returns:
        A copy of ``df`` with the two added columns, keeping its index.
    """
    # Work on a copy so the caller's frame is not modified
    enriched_df = df.copy()

    families = []
    categories = []
    total = len(enriched_df)

    # Progress is counted by position rather than index label, so the
    # messages stay correct for frames without a 0-based integer index.
    for position, latin_name in enumerate(enriched_df['Latin Name'], start=1):
        print(f"Processing {position} of {total}: {latin_name}")

        # Get BirdLife data
        result = get_birdlife_data(latin_name)
        families.append(result['Family'])
        categories.append(result['Red List Category'])

        # Pause between requests only; nothing follows the last one.
        if position < total and delay > 0:
            time.sleep(delay)

    # Assigning whole columns by position (instead of cell-by-cell by label)
    # also keeps rows apart when index labels repeat.
    enriched_df['Family'] = pd.Series(families, index=enriched_df.index, dtype=object)
    enriched_df['Red List Category'] = pd.Series(categories, index=enriched_df.index, dtype=object)

    return enriched_df

# Run the BirdLife enrichment on the Latin-name table and report the outcome
if __name__ == "__main__":
    # Look up family and Red List category for every row of new_df
    enriched_df = enrich_species_data(new_df)

    # Full table under its heading
    print("\nEnriched Species Data:", enriched_df, sep="\n")

    # Number of species per Red List category
    print(
        "\nRed List Category Summary:",
        enriched_df['Red List Category'].value_counts(),
        sep="\n",
    )

    # Persist the enriched table without the index column
    enriched_df.to_csv('species_with_birdlife_data.csv', index=False)

Processing 1 of 5: Dendrocygna viduata
Processing 2 of 5: Dendrocygna autumnalis
Processing 3 of 5: Dendrocygna arborea
Processing 4 of 5: Dendrocygna bicolor
Processing 5 of 5: Anser canagicus

Enriched Species Data:
   Number             Species Name              Latin Name  \
0       1    White-faced Whistling     Dendrocygna viduata   
1       2  Black-bellied Whistling  Dendrocygna autumnalis   
2       3    West Indian Whistling     Dendrocygna arborea   
3       4        Fulvous Whistling     Dendrocygna bicolor   
4       5            Emperor Goose         Anser canagicus   

                           Family Red List Category  
0  Anatidae (Ducks, Geese, Swans)                LC  
1  Anatidae (Ducks, Geese, Swans)                LC  
2  Anatidae (Ducks, Geese, Swans)                NT  
3  Anatidae (Ducks, Geese, Swans)                LC  
4  Anatidae (Ducks, Geese, Swans)                LC  

Red List Category Summary:
Red List Category
LC    4
NT    1
Name: count, dtype: int