In [3]:
import os
import numpy as np
import pandas as pd

# Paths (update if needed)
DATA1_PATH = "dataset1.csv"
DATA2_PATH = "dataset2.csv"

SEASON_MAP = {
    "winter": "Winter",
    "spring": "Spring",
    1: "Winter",
    2: "Spring",
}

In [4]:
def _parse_datetime(series: pd.Series) -> pd.Series:
    """Parse potentially messy datetime strings as day-first, UTC; coerce errors."""
    return pd.to_datetime(series, errors="coerce", dayfirst=True, utc=True)

def _minutes_delta(later: pd.Series, earlier: pd.Series) -> pd.Series:
    return (later - earlier).dt.total_seconds() / 60.0

def _normalize_token(tok: str) -> str:
    tok = tok.strip().lower()
    tok = tok.replace("&", "_and_").replace("-", "_").replace(" ", "_")
    tok = "".join(ch for ch in tok if ch.isalnum() or ch == "_")
    tok = tok.replace("__", "_")
    return tok

def recode_habit(series: pd.Series, rare_threshold: int = 10):
    """Recode messy 'habit' text field into consistent categories."""
    raw = series.fillna("").astype(str)

    def is_coord_like(s: str) -> bool:
        s2 = s.replace(",", ".")
        digits = sum(c.isdigit() for c in s2)
        return digits >= max(1, len(s2)//3) and any(ch in s2 for ch in ",.;:")

    normed = []
    for val in raw:
        v = val.strip()
        if v == "" or v.isnumeric() or is_coord_like(v):
            normed.append("Unknown")
            continue
        v = _normalize_token(v)
        parts = [p for p in v.replace("_and_", "_").split("_") if p]
        parts = sorted(parts)
        normed.append("_".join(parts) if parts else "Unknown")

    s = pd.Series(normed, index=series.index)

    synonym_map = {
        "bats": "bat",
        "pick_bat": "bat_pick",
        "pick_and_bat": "bat_pick",
        "bat_and_rat": "bat_rat",
        "rat_and_bat": "bat_rat",
    }
    s = s.replace(synonym_map)

    counts = s.value_counts(dropna=False)
    rare = counts[counts < rare_threshold].index
    s_collapsed = s.where(~s.isin(rare), other="Other")

    mapping = pd.DataFrame({
        "original": series,
        "recoded_raw": pd.Series(normed, index=series.index),
        "recoded_final": s_collapsed,
    })
    return s_collapsed.astype("category"), mapping


In [5]:
# ---- Load dataset1 (bat landings)
d1 = pd.read_csv(DATA1_PATH)

# Parse datetimes
for col in ["start_time", "rat_period_start", "rat_period_end", "sunset_time"]:
    if col in d1.columns:
        d1[col] = _parse_datetime(d1[col])

# Derived features
d1["date"] = d1["start_time"].dt.date
d1["minutes_after_sunset"] = _minutes_delta(d1["start_time"], d1["sunset_time"])

if "seconds_after_rat_arrival" in d1.columns:
    d1["minutes_after_rat_arrival"] = d1["seconds_after_rat_arrival"] / 60.0
else:
    d1["minutes_after_rat_arrival"] = np.nan

# Chronology check & rat presence flag
d1["chronology_ok"] = (
    (d1["rat_period_start"] <= d1["start_time"]) &
    (d1["start_time"] <= d1["rat_period_end"])
)
d1["rat_present_at_landing"] = d1["chronology_ok"].fillna(False)

# Habit recode (if present)
if "habit" in d1.columns:
    d1["habit_recoded"], habit_map = recode_habit(d1["habit"], rare_threshold=10)
else:
    habit_map = pd.DataFrame(columns=["original", "recoded_raw", "recoded_final"])

# Season labels cleanup
if "season" in d1.columns:
    d1["season_clean"] = d1["season"].map(dict(SEASON_MAP)).fillna(d1["season"]).astype(str)
else:
    d1["season_clean"] = pd.NA

# Transform skewed latency (if available)
if "bat_landing_to_food" in d1.columns:
    d1["latency_sec"] = pd.to_numeric(d1["bat_landing_to_food"], errors="coerce")
    d1["latency_log1p"] = np.log1p(d1["latency_sec"])
else:
    d1["latency_sec"] = np.nan
    d1["latency_log1p"] = np.nan

d1.head()


Unnamed: 0,start_time,bat_landing_to_food,habit,rat_period_start,rat_period_end,seconds_after_rat_arrival,risk,reward,month,sunset_time,...,season,date,minutes_after_sunset,minutes_after_rat_arrival,chronology_ok,rat_present_at_landing,habit_recoded,season_clean,latency_sec,latency_log1p
0,2017-12-30 18:37:00+00:00,16.0,rat,2017-12-30 18:35:00+00:00,2017-12-30 18:38:00+00:00,108,1,0,0,2017-12-30 16:45:00+00:00,...,0,2017-12-30,112.0,1.8,True,True,rat,0,16.0,2.833213
1,2017-12-30 19:51:00+00:00,0.074016,fast,2017-12-30 19:50:00+00:00,2017-12-30 19:55:00+00:00,17,0,1,0,2017-12-30 16:45:00+00:00,...,0,2017-12-30,186.0,0.283333,True,True,fast,0,0.074016,0.071405
2,2017-12-30 19:51:00+00:00,4.0,fast,2017-12-30 19:50:00+00:00,2017-12-30 19:55:00+00:00,41,0,1,0,2017-12-30 16:45:00+00:00,...,0,2017-12-30,186.0,0.683333,True,True,fast,0,4.0,1.609438
3,2017-12-30 19:52:00+00:00,10.0,rat,2017-12-30 19:50:00+00:00,2017-12-30 19:55:00+00:00,111,1,0,0,2017-12-30 16:45:00+00:00,...,0,2017-12-30,187.0,1.85,True,True,rat,0,10.0,2.397895
4,2017-12-30 19:54:00+00:00,15.0,rat,2017-12-30 19:50:00+00:00,2017-12-30 19:55:00+00:00,194,1,0,0,2017-12-30 16:45:00+00:00,...,0,2017-12-30,189.0,3.233333,True,True,rat,0,15.0,2.772589


In [7]:
output_path = "dataset1_cleaning.csv"
d1.to_csv(output_path, index=False)

In [8]:
# ---- Load dataset2 (30-min windows)
d2 = pd.read_csv(DATA2_PATH)
d2.rename(columns={"time": "window_start"}, inplace=True)
d2["window_start"] = _parse_datetime(d2["window_start"])

# Explicit window_end = start + 30 minutes
d2["window_end"] = d2["window_start"] + pd.to_timedelta(30, unit="m")

# Ensure helpful numeric types
for col in ["bat_landing_number", "food_availability", "rat_minutes", "rat_arrival_number"]:
    if col in d2.columns:
        d2[col] = pd.to_numeric(d2[col], errors="coerce")

d2.head()


Unnamed: 0,window_start,month,hours_after_sunset,bat_landing_number,food_availability,rat_minutes,rat_arrival_number,window_end
0,2017-12-26 16:13:00+00:00,0,-0.5,20,4.0,0.0,0,2017-12-26 16:43:00+00:00
1,2017-12-26 16:43:00+00:00,0,0.0,28,4.0,0.0,0,2017-12-26 17:13:00+00:00
2,2017-12-26 17:13:00+00:00,0,0.5,25,4.0,0.0,0,2017-12-26 17:43:00+00:00
3,2017-12-26 17:43:00+00:00,0,1.0,71,4.0,0.0,0,2017-12-26 18:13:00+00:00
4,2017-12-26 18:13:00+00:00,0,1.5,44,3.753857,0.0,0,2017-12-26 18:43:00+00:00


In [9]:
output_path = "dataset2_cleaning.csv"
d2.to_csv(output_path, index=False)