In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import missingno as msno
import matplotlib.pyplot as plt
from scipy import stats, fft
from datetime import timedelta

def load_data(file_path):
    """Read a CSV file into a DataFrame, reporting the outcome on stdout.

    Args:
        file_path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The loaded DataFrame, or ``None`` when the file does not exist or any
        other error occurs while reading it (a message is printed either way).
    """
    try:
        frame = pd.read_csv(file_path)
        print(f"Loaded data from {file_path}, shape: {frame.shape}")
    except FileNotFoundError:
        # Missing file gets its own, friendlier message.
        print(f"Error: File {file_path} not found.")
        frame = None
    except Exception as e:
        # Best-effort loader: any other failure is reported, not raised.
        print(f"Error loading {file_path}: {str(e)}")
        frame = None
    return frame

# Example usage for initial data preview: load one dataset and show its first rows.
path = r"dataset2-1.csv"
df = load_data(path)
# load_data returns None on failure, so only render the preview when a frame was loaded.
if df is not None:
    # NOTE(review): `display` is not defined or imported in this file; presumably the
    # IPython/Jupyter built-in, so this cell is expected to run inside a notebook.
    display(df.head(4))


**Labelling**

In [None]:
def preprocess_data(df):
    """Return a cleaned, ordered copy of *df* (the input is not modified).

    Column names are stripped of surrounding whitespace, the ``date`` column is
    parsed to datetimes (values that cannot be parsed become NaT), and rows are
    ordered by ``cow`` then ``date`` with a fresh 0..n-1 index.

    ``None`` is passed through unchanged so the result of a failed load can be
    handed in directly.
    """
    if df is None:
        return None

    cleaned = df.copy()
    stripped_names = cleaned.columns.str.strip()
    cleaned.columns = stripped_names

    parsed_dates = pd.to_datetime(cleaned['date'], errors='coerce')
    cleaned['date'] = parsed_dates

    ordered = cleaned.sort_values(by=['cow', 'date'])
    return ordered.reset_index(drop=True)

def get_health_state_durations(df, health_state_column):
    """Compute durations for a specific health state.

    For every cow, rows are visited in date order and consecutive rows whose
    ``health_state_column`` equals 1 are merged into one episode. An episode
    ends when a row with a different value is met, when two flagged rows are
    more than one day apart, or when the cow's rows run out. Rows with a
    missing date are ignored.

    Args:
        df: DataFrame with ``cow``, ``date`` (datetime-like) and
            ``health_state_column`` columns.
        health_state_column: Name of the 0/1 column describing the state.

    Returns:
        DataFrame with one row per episode and the columns ``cow``,
        ``health_state``, ``start_date``, ``end_date`` and ``duration_days``
        (inclusive day count). The columns are present even when no episode
        was found, so callers can always select them.
    """
    columns = ['cow', 'health_state', 'start_date', 'end_date', 'duration_days']
    durations = []

    def close_episode(cow_id, start_date, end_date):
        # Single place where an episode record is built (inclusive duration).
        durations.append({
            'cow': cow_id,
            'health_state': health_state_column,
            'start_date': start_date,
            'end_date': end_date,
            'duration_days': (end_date - start_date).days + 1
        })

    for cow_id, group in df.groupby('cow'):
        # Stable sort: rows sharing a date keep their input order, so the
        # episode boundaries do not depend on the sort implementation.
        group = group.sort_values('date', kind='mergesort')

        start_date = None  # not None <=> an episode is currently open
        prev_date = None   # date of the previously visited (dated) row

        for date, flag in zip(group['date'], group[health_state_column]):
            if pd.isna(date):
                continue
            if flag == 1:
                if start_date is None:
                    start_date = date
                elif (date - prev_date).days > 1:
                    # Gap of more than a day between flagged rows: close the
                    # running episode at the last flagged date, start a new one.
                    close_episode(cow_id, start_date, prev_date)
                    start_date = date
            elif start_date is not None:
                # State switched off: the episode ended on the previous row.
                close_episode(cow_id, start_date, prev_date)
                start_date = None
            prev_date = date

        if start_date is not None:
            # Episode still open when this cow's data ends.
            close_episode(cow_id, start_date, prev_date)

    return pd.DataFrame(durations, columns=columns)

def compute_durations(df, health_states):
    """Build per-state episode tables and print a duration summary for each.

    Args:
        df: DataFrame handed to ``get_health_state_durations`` for every state.
        health_states: Iterable of health-state column names.

    Returns:
        Dict mapping each state name to the DataFrame of its episodes.
    """
    # Phase 1: compute every table and its shortest/longest episode.
    state_durations = {}
    extremes = {}
    for state in health_states:
        episodes = get_health_state_durations(df, state)
        state_durations[state] = episodes

        if episodes.empty:
            extremes[state] = {'min_days': None, 'max_days': None}
            continue

        day_counts = episodes['duration_days']
        extremes[state] = {'min_days': day_counts.min(), 'max_days': day_counts.max()}

    # Phase 2: report, only after all states were computed successfully.
    for state, bounds in extremes.items():
        shortest, longest = bounds['min_days'], bounds['max_days']
        if shortest is None or longest is None:
            print(f"No data available for \"{state}\".")
            continue
        if shortest == longest:
            print(f"{state.capitalize()} lasts {shortest} day{'s' if shortest > 1 else ''}.")
            continue
        print(f"{state.capitalize()} lasts between {shortest} and {longest} days.")

    return state_durations

# Define health states (each name is used as a column name by compute_durations)
health_states = ['oestrus', 'calving', 'lameness', 'mastitis', 'other_disease', 'OK']

# Process durations for a sample dataset
df = load_data(r"filtered_dataset_more_than_18_obs.csv")
# preprocess_data passes None through, so a failed load is handled by the check below.
df = preprocess_data(df)
if df is not None:
    # Prints one summary line per state; the per-state episode tables are kept in `durations`.
    durations = compute_durations(df, health_states)


In [None]:
def assign_labels(df, condition_columns):
    """Assign a single LABEL column based on condition columns.

    Each row is labelled with the first column of ``condition_columns`` (in the
    given order) whose value equals 1. Rows with no condition set are labelled
    ``'OK'`` when their ``OK`` column equals 1 and ``'healthy'`` otherwise.

    The labels are computed with whole-column masks instead of a row-wise
    ``apply``, which also makes the function work on an empty DataFrame.

    Args:
        df: Input DataFrame; it is copied, never modified in place.
        condition_columns: Names of the condition columns, in priority order.
            Their values are coerced to numbers; unparseable or missing
            values count as 0.

    Returns:
        A copy of ``df`` with numeric condition columns and a new ``LABEL``
        column. If ``df`` has no ``OK`` column, rows without any condition
        are labelled ``'healthy'``.
    """
    df = df.copy()
    condition_columns = list(condition_columns)

    # Ensure condition columns are numeric
    for col in condition_columns:
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)

    def is_set(column):
        # Positional boolean mask of the rows where `column` equals 1.
        return df[column].eq(1).fillna(False).to_numpy(dtype=bool)

    labels = np.full(len(df), 'healthy', dtype=object)
    if 'OK' in df.columns:
        labels[is_set('OK')] = 'OK'

    # Apply conditions from lowest to highest priority so that, when several
    # are set on one row, the earliest column in condition_columns wins.
    for col in reversed(condition_columns):
        labels[is_set(col)] = col

    df['LABEL'] = labels
    return df

def create_sliding_windows(df, window_size_hours, min_valid_hours, shift_hours, condition_columns):
    """Create sliding windows with activity data and labels.

    An hourly timestamp is built for every row from ``date`` and ``hour``
    (hour 24 becomes 00:00 of the following day). For each cow, a window is
    then started at observed timestamps and walked forward one hour at a time
    for at most ``window_size_hours`` hours, collecting ``ACTIVITY_LEVEL`` for
    every hour that has a record. Missing hours are skipped, except that a
    missing hour falling on a later calendar day than the previously visited
    hour ends the window early.

    Args:
        df: DataFrame with ``cow``, ``date``, ``hour``, ``ACTIVITY_LEVEL`` and
            the condition columns. It is copied, never modified in place.
        window_size_hours: Maximum span of a window, in hours.
        min_valid_hours: Windows with fewer collected hours are discarded.
        shift_hours: Minimum distance, in hours, between the start of one
            window and the start of the next one for the same cow. With
            whole-hour timestamps a value of 1 starts a window at every
            observed timestamp. A falsy value (0 / None) disables the check.
        condition_columns: Names of the 0/1 condition columns used to label
            each window.

    Returns:
        DataFrame with one row per kept window and the columns ``cow``,
        ``start_time``, ``end_time`` (timestamp of the last collected hour),
        ``duration_hours`` (number of collected hours), ``activity_window``
        (list of collected values), one 0/1 column per condition and ``OK``.
        Exactly one of the condition/``OK`` columns is 1: the condition with
        the highest count inside the window, or ``OK`` when none occurs.
        The columns are present even when no window was kept.
    """
    df = df.copy()
    condition_columns = list(condition_columns)

    # Build the timestamp arithmetically: midnight of `date` plus `hour` hours.
    # This maps hour 24 to 00:00 of the next day and tolerates float hours;
    # rows with an unusable date or hour get NaT and are dropped.
    midnight = pd.to_datetime(df['date'], errors='coerce').dt.normalize()
    hour_offset = pd.to_timedelta(pd.to_numeric(df['hour'], errors='coerce'), unit='h')
    df['timestamp'] = midnight + hour_offset
    df = df.dropna(subset=['timestamp'])
    df = df.sort_values(['cow', 'timestamp']).reset_index(drop=True)

    one_hour = timedelta(hours=1)
    window_span = timedelta(hours=window_size_hours)
    min_start_gap = timedelta(hours=shift_hours) if shift_hours else None

    shifted_windows = []

    for cow_id, group in df.groupby('cow'):
        group = group.set_index('timestamp')
        # Index.unique() yields pandas Timestamps on every pandas version.
        timestamps = group.index.unique()
        # Scalar lookup per hour; if a timestamp is duplicated the last row wins.
        activity_at = dict(zip(group.index, group['ACTIVITY_LEVEL']))
        print(f"\n🐄 Processing cow {cow_id} with {len(timestamps)} records...")

        last_start = None
        for start_time in timestamps:
            if (min_start_gap is not None and last_start is not None
                    and start_time - last_start < min_start_gap):
                continue
            last_start = start_time

            end_time = start_time + window_span
            collected_hours = []
            last_collected = None
            current_time = start_time
            previous_date = current_time.date()

            while current_time < end_time:
                current_date = current_time.date()
                if current_time in activity_at:
                    collected_hours.append(activity_at[current_time])
                    last_collected = current_time
                else:
                    print(f"⚠️  Missing hour at {current_time}, skipping.")
                    if (current_date - previous_date).days > 0:
                        print(f"🛑 Stopping at {current_time} due to day gap after missing hour.")
                        break
                previous_date = current_date
                current_time += one_hour

            if len(collected_hours) < min_valid_hours:
                print(f"❌ Discarded window for cow {cow_id} from {start_time}: only {len(collected_hours)} valid hours.")
                continue

            # The window ends at the last hour actually collected. Deriving it
            # from the number of collected hours would cut the window short
            # whenever an hour inside it is missing.
            end_effective = last_collected
            condition_window = group.loc[start_time:end_effective]
            condition_counts = condition_window[condition_columns].sum()

            final_conditions = {col: 0 for col in condition_columns}
            final_conditions['OK'] = 0

            has_condition = bool((condition_counts > 0).any())
            if has_condition:
                most_frequent_condition = condition_counts.idxmax()
                final_conditions[most_frequent_condition] = 1
            else:
                final_conditions['OK'] = 1

            shifted_windows.append({
                'cow': cow_id,
                'start_time': start_time,
                'end_time': end_effective,
                'duration_hours': len(collected_hours),
                'activity_window': collected_hours,
                **final_conditions
            })

            label = most_frequent_condition if has_condition else 'healthy'
            print(f"✅ Window for cow {cow_id} → {label}")

    # Fixed schema so an empty result still exposes every output column.
    label_columns = list(dict.fromkeys([*condition_columns, 'OK']))
    output_columns = ['cow', 'start_time', 'end_time', 'duration_hours', 'activity_window', *label_columns]
    shifted_df = pd.DataFrame(shifted_windows, columns=output_columns)
    print(f"\n✅ DONE.")
    print(f"Total valid windows collected: {len(shifted_df)}")
    return shifted_df

def save_output(df, output_path):
    """Write *df* to *output_path* as CSV without the index column.

    The outcome is reported on stdout; a failed write is printed rather than
    raised, so the caller's loop keeps going. Returns ``None`` in both cases.
    """
    try:
        df.to_csv(output_path, index=False)
    except Exception as e:
        print(f"Error saving to {output_path}: {str(e)}")
    else:
        print(f"CSV file saved to {output_path}")

# Parameters (all forwarded positionally to create_sliding_windows below)
window_size_hours = 24  # maximum span of one window, in hours
min_valid_hours = 18    # windows with fewer collected hours are discarded
shift_hours = 1
condition_columns = ['oestrus', 'calving', 'lameness', 'mastitis', 'other_disease']

# List of datasets to process (paths are relative to the working directory)
datasets = [
    "dataset1.csv",
    "dataset2-1.csv",
    "dataset3.csv",
    "filtered_dataset_more_than_18_obs.csv"
]

# Process each dataset: load -> clean -> label rows -> build windows -> save.
for dataset in datasets:
    print(f"\n📊 Processing dataset: {dataset}")
    df = load_data(dataset)
    # preprocess_data passes None through, so a dataset that failed to load is skipped below.
    df = preprocess_data(df)
    if df is not None:
        df_labeled = assign_labels(df, condition_columns)
        # NOTE(review): `display` is not defined in this file; presumably the IPython/Jupyter built-in.
        display(df_labeled.head(4))
        shifted_df = create_sliding_windows(df_labeled, window_size_hours, min_valid_hours, shift_hours, condition_columns)
        # Output goes next to the working directory, named after the input file.
        output_path = f"OneHourShift_{dataset}"
        save_output(shifted_df, output_path)
