<a href="https://colab.research.google.com/github/honi-sm/AmphiBIO_TraitAnalysisFinal/blob/main/CleanedDataPython.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pandas numpy openpyxl




In [None]:
# Colab-only step: open the notebook's file-upload dialog so the raw workbook
# can be supplied to the runtime before the cleaning cell runs.
from google.colab import files
# NOTE(review): the recorded output of this cell shows the upload being saved
# as "AmphiBIOv1Raw (3).xlsx", i.e. under a suffixed name, while the cleaning
# cell reads "AmphiBIOv1Raw.xlsx". The script therefore presumably picks up an
# earlier upload rather than this one -- confirm the intended file is read.
uploaded = files.upload()


Saving AmphiBIOv1Raw.xlsx to AmphiBIOv1Raw (3).xlsx


In [None]:
"""
AmphiBIO Data Cleaning Script
Author: Lia Grace Stratos
Institution: University of Nebraska Omaha
Date: November 2025

Purpose:
    This script reproduces the same cleaning process used in Excel for the AmphiBIO amphibian
    traits dataset. It filters out incomplete or invalid entries, standardizes text and numeric
    fields, and combines binary columns into categorical variables for habitat type and
    reproductive method. The cleaned dataset is then used to generate a reproducible
    random sample of 500 species for statistical testing.

Use:
    Run this script in the same directory as the raw data file (AmphiBIOv1Raw.xlsx).
    The output includes two CSV files:
        1. cleanDataPython.csv — all cleaned species
        2. randomSample_500_python.csv — reproducible 500-species subset

Notes:
    - The original dataset contained 6,777 rows (38 columns).
    - After cleaning, roughly 1,481 valid species remain.
    - Missing-value markers ('NA', 'N/A', blanks) and zeros are all converted to NaN before rows are filtered.
"""

import pandas as pd
import numpy as np
import sys
import os

# File paths and constants
# Input workbook handed to load_data(); a bare filename, so it is looked up in
# the current working directory.
RAW_FILE = "AmphiBIOv1Raw.xlsx"
# Destination CSV for the full cleaned table.
OUTPUT_CLEAN = "cleanDataPython.csv"
# Destination CSV for the random subset drawn from the cleaned table.
OUTPUT_SAMPLE = "randomSample_500_python.csv"
# Passed as random_state to DataFrame.sample() so the subset is reproducible.
RANDOM_SEED = 2025


# Function: safely load data
def load_data(file_path: str) -> pd.DataFrame:
    """Read the raw Excel workbook at *file_path* into a DataFrame.

    The cells "NA", "N/A", "na", "NaN", the empty string and a single space
    are read as missing values. A one-line summary of the loaded shape is
    printed on success.

    The process is terminated through ``sys.exit`` (with an explanatory
    message) when the path does not exist or when the workbook cannot be read.
    """
    # Guard clause: nothing to read if the path is absent.
    if not os.path.exists(file_path):
        sys.exit(f"Error: File '{file_path}' not found in working directory.")

    missing_markers = ["NA", "N/A", "na", "NaN", "", " "]
    try:
        frame = pd.read_excel(file_path, engine="openpyxl", na_values=missing_markers)
        n_rows, n_cols = frame.shape
        print(f"Loaded file: {file_path} ({n_rows} rows, {n_cols} columns)")
    except Exception as e:
        # Any read/parse failure ends the run with the underlying reason.
        sys.exit(f"Error reading '{file_path}': {e}")
    return frame


# Function: check required columns exist
def validate_columns(df: pd.DataFrame, required: list[str]) -> None:
    """Abort the run when *df* lacks any column named in *required*.

    Exits through ``sys.exit`` with a message listing the absent names in the
    order they appear in *required*; otherwise returns ``None``.
    """
    available = df.columns
    absent = []
    for name in required:
        if name not in available:
            absent.append(name)

    if absent:
        sys.exit(f"Error: Missing required columns: {absent}")


# Function: derive habitatType
def make_habitat(row: pd.Series) -> str:
    """Combine the four habitat flags of *row* into a single label.

    Every one of ``Fos``, ``Ter``, ``Aqu`` and ``Arb`` that equals 1
    contributes its habitat name; the names are joined with " + " in that
    fixed order. When no flag equals 1 the result is ``np.nan`` instead of a
    string.
    """
    flag_labels = (
        ("Fos", "fossorial"),
        ("Ter", "terrestrial"),
        ("Aqu", "aquatic"),
        ("Arb", "arboreal"),
    )
    active = [label for column, label in flag_labels if row[column] == 1]
    if not active:
        return np.nan
    return " + ".join(active)


# Function: derive reproductiveMethod
def make_reproduction(row: pd.Series) -> str:
    """Pick the reproductive-method label for *row*.

    The flags are checked in priority order ``Dir`` -> "direct",
    ``Lar`` -> "larval", ``Viv`` -> "viviparous"; the first one equal to 1
    wins and later flags are not read. When none equals 1 the result is
    ``np.nan`` instead of a string.
    """
    priority = (("Dir", "direct"), ("Lar", "larval"), ("Viv", "viviparous"))
    for column, label in priority:
        if row[column] == 1:
            return label
    return np.nan


# Function: safely export CSV
def safe_export(df: pd.DataFrame, filename: str) -> None:
    """Save *df* as a CSV file named *filename*, without the index column.

    On success a confirmation with the row count is printed. Failures are
    reported on stdout rather than raised: a ``PermissionError`` gets a hint
    about closing the file, any other exception is printed as unexpected.
    """
    try:
        df.to_csv(filename, index=False)
        row_count = len(df)
        print(f"Saved: {filename} ({row_count} rows)")
    except Exception as err:
        # One handler, two messages: permission problems get specific advice.
        if isinstance(err, PermissionError):
            message = f"Error: Permission denied while saving '{filename}'. Close any open file and retry."
        else:
            message = f"Unexpected error saving '{filename}': {err}"
        print(message)


# ---------------------------------------------------------------------------
# Cleaning pipeline. Runs top to bottom at module level so that `raw`, `data`
# and `cleaned` (and `sample`, when sampling succeeds) stay available for
# inspection afterwards.
# ---------------------------------------------------------------------------

# Load raw dataset
raw = load_data(RAW_FILE)

# Drop metadata columns not used for cleaning; errors="ignore" skips any of
# them that are not present in the workbook.
drop_cols = ["id", "Order", "Family", "Genus", "OBS"]
raw = raw.drop(columns=drop_cols, errors="ignore")

# Keep relevant columns
cols = [
    "Species", "Fos", "Ter", "Aqu", "Arb",
    "Body_size_mm", "Litter_size_min_n", "Litter_size_max_n",
    "Dir", "Lar", "Viv"
]
validate_columns(raw, cols)
data = raw[cols].copy()

# Rows without a species name have to go BEFORE the text normalisation below:
# astype(str) would turn a missing value into the literal string "nan", which
# would then pass through the rest of the cleaning as if it were a species.
no_name = data["Species"].isna()
if no_name.any():
    print(f"Removed {int(no_name.sum())} rows with no species name.")
    data = data[~no_name].copy()

# Normalize text in Species: trim, lowercase, collapse runs of whitespace.
# This is the only normalisation pass needed before de-duplication.
data["Species"] = (
    data["Species"]
    .astype(str)
    .str.strip()
    .str.lower()
    .str.replace(r"\s+", " ", regex=True)
)

# Type casting for numeric and binary fields; unparseable cells become NaN.
num_cols = ["Body_size_mm", "Litter_size_min_n", "Litter_size_max_n"]
bin_cols = ["Fos", "Ter", "Aqu", "Arb", "Dir", "Lar", "Viv"]

for col in num_cols:
    try:
        data[col] = pd.to_numeric(data[col], errors="coerce")
    except Exception as e:
        print(f"Warning: could not convert {col} to numeric: {e}")

for col in bin_cols:
    data[col] = pd.to_numeric(data[col], errors="coerce").astype("float64")

# Replace zeros with NaN (Excel treated them as blanks)
data = data.replace(0, np.nan)

# Drop rows missing key numeric data. The explicit .copy() makes the result an
# independent frame, so the column assignments further down cannot raise
# SettingWithCopyWarning on pandas versions without copy-on-write.
before = len(data)
has_all_numeric = data[num_cols].notna().all(axis=1)
data = data[has_all_numeric].copy()
after = len(data)
print(f"Removed {before - after} rows with missing or invalid numeric data.")

# Apply derived categorical columns (row-wise, via the helper functions)
data["habitatType"] = data.apply(make_habitat, axis=1)
data["reproductiveMethod"] = data.apply(make_reproduction, axis=1)

# Drop missing reproductive data
data = data[data["reproductiveMethod"].notna()]

# Drop duplicates based on the (already normalised) species name
before_dupes = len(data)
data = data.drop_duplicates(subset="Species", keep="first")
print(f"Removed {before_dupes - len(data)} duplicate species entries.")

# Rename and reorder columns
data = data.rename(columns={
    "Body_size_mm": "bodyLengthMm",
    "Litter_size_min_n": "clutchSizeMinN",
    "Litter_size_max_n": "clutchSizeMaxN"
})
cleaned = data[[
    "Species", "bodyLengthMm", "clutchSizeMinN", "clutchSizeMaxN",
    "habitatType", "reproductiveMethod"
]].reset_index(drop=True)

# Save full cleaned dataset
safe_export(cleaned, OUTPUT_CLEAN)

# Random sample creation. The size matches the "500" in OUTPUT_SAMPLE's name.
SAMPLE_SIZE = 500
try:
    # Only this call is guarded: DataFrame.sample raises ValueError when the
    # cleaned table holds fewer than SAMPLE_SIZE rows.
    sample = cleaned.sample(n=SAMPLE_SIZE, random_state=RANDOM_SEED)
except ValueError as e:
    print(f"Error creating random sample: {e}")
else:
    safe_export(sample, OUTPUT_SAMPLE)

print(f"Total retained: {len(cleaned)} species (expected ≈1481).")
print("Cleaning complete.")

Loaded file: AmphiBIOv1Raw.xlsx (6776 rows, 38 columns)
Removed 5181 rows with missing or invalid numeric data.
Removed 0 duplicate species entries.
Saved: cleanDataPython.csv (1484 rows)
Saved: randomSample_500_python.csv (500 rows)
Total retained: 1484 species (expected ≈1481).
Cleaning complete.
