# Dataset Transformation

This notebook processes a dataset of cow health and activity data to create sliding windows of activity levels and label them based on health conditions. The goal is to transform the data into a format suitable for analysis, with each window representing a period of time and labeled with the dominant health condition.

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns
import missingno as msno
import matplotlib.pyplot as plt
from scipy import stats, fft
from datetime import timedelta

## Load Data

Load the dataset and display the first few rows to understand its structure.

In [None]:
def load_and_preview_data(file_path):
    """Read a CSV file into a DataFrame and show a short preview of it.

    The preview (the first 4 rows) is rendered with the notebook's
    ``display`` helper, so this function is meant to be run inside a
    notebook session.

    Args:
        file_path (str): Path to the CSV file.

    Returns:
        pd.DataFrame: The full DataFrame read from ``file_path``.
    """
    frame = pd.read_csv(file_path)
    preview = frame.head(4)
    display(preview)
    return frame

# Load the initial dataset from a path relative to the working directory and
# preview its first rows.
# NOTE(review): the visible cells below read
# 'filtered_dataset_more_than_18_obs.csv' rather than using this frame, so `df`
# appears to serve only as a preview here — confirm before relying on it.
initial_path = r"dataset2-1.csv"
df = load_and_preview_data(initial_path)

## Preprocess Data

Clean column names, convert dates, and sort the data.

In [None]:
def preprocess_data(file_path='filtered_dataset_more_than_18_obs.csv'):
    """Load the filtered dataset and put it into a tidy, ordered form.

    Three steps are applied in order: surrounding whitespace is stripped from
    every column name, the 'date' column is parsed into datetimes, and the rows
    are ordered by cow and then by date.

    Args:
        file_path (str): Path to the filtered dataset CSV.

    Returns:
        pd.DataFrame: The cleaned DataFrame, sorted by 'cow' and 'date'.
    """
    raw = pd.read_csv(file_path)
    # Headers may carry stray spaces; strip them so columns can be addressed
    # by their plain names.
    tidy = raw.rename(columns=str.strip)
    tidy = tidy.assign(date=pd.to_datetime(tidy['date']))
    return tidy.sort_values(by=['cow', 'date'])

# Load and preprocess the filtered dataset; `df_processed` is the frame passed
# to the duration and sliding-window steps in the following cells.
df_processed = preprocess_data(file_path='filtered_dataset_more_than_18_obs.csv')

## Compute Health State Durations

Calculate the duration of each health state for each cow and display the results.

In [None]:
def _build_duration_record(cow_id, health_state_column, start_date, end_date):
    """Build one period record; both the start and the end day are counted."""
    return {
        'cow': cow_id,
        'health_state': health_state_column,
        'start_date': start_date,
        'end_date': end_date,
        'duration_days': (end_date - start_date).days + 1,
    }


def get_health_state_durations(df, health_state_column):
    """Calculate durations for a specific health state for each cow.

    For every cow the rows are scanned in date order. A period opens at the
    first row whose ``health_state_column`` equals 1 and closes when a row with
    a different value is met, when two consecutive flagged rows are more than
    one day apart, or when the cow's rows run out. The period's end is the
    date of the last flagged row, and its duration counts both boundary days.

    Args:
        df (pd.DataFrame): Input DataFrame with 'cow' and 'date' columns
            ('date' must support subtraction yielding a value with ``.days``).
        health_state_column (str): Column name of the health state.

    Returns:
        pd.DataFrame: One row per period with the columns 'cow',
        'health_state', 'start_date', 'end_date' and 'duration_days'. The
        columns are present even when no period was found.
    """
    columns = ['cow', 'health_state', 'start_date', 'end_date', 'duration_days']
    durations = []

    for cow_id, group in df.groupby('cow'):
        # A stable sort keeps the incoming order of rows that share a date
        # (presumably several hourly rows per day — verify against the data).
        group = group.sort_values('date', kind='stable')

        start_date = None  # None means no period is currently open
        prev_date = None

        for date, flag in zip(group['date'], group[health_state_column]):
            if flag == 1:
                if start_date is None:
                    start_date = date
                elif (date - prev_date).days > 1:
                    # Gap between flagged rows: close the running period at the
                    # last flagged date and open a new one here.
                    durations.append(
                        _build_duration_record(cow_id, health_state_column, start_date, prev_date)
                    )
                    start_date = date
            elif start_date is not None:
                # State switched off: the period ended on the previous row.
                durations.append(
                    _build_duration_record(cow_id, health_state_column, start_date, prev_date)
                )
                start_date = None

            prev_date = date

        # The cow's data ended while a period was still open.
        if start_date is not None:
            durations.append(
                _build_duration_record(cow_id, health_state_column, start_date, prev_date)
            )

    # Passing the column list keeps the schema stable for an empty result.
    return pd.DataFrame(durations, columns=columns)

def compute_and_display_durations(df, health_states):
    """Summarise and print how long each health state lasts.

    For every state the per-cow periods are obtained from
    ``get_health_state_durations``; the shortest and longest period (in days)
    are kept and a one-line summary per state is printed.

    Args:
        df (pd.DataFrame): Input DataFrame.
        health_states (list): List of health state column names.

    Returns:
        dict: Maps each health state to ``{'min_days': ..., 'max_days': ...}``;
        both values are ``None`` when no period was found for that state.
    """
    summary = {}

    # First pass: gather the extreme durations per state.
    for state in health_states:
        periods = get_health_state_durations(df, state)
        if periods.empty:
            summary[state] = {'min_days': None, 'max_days': None}
            continue
        day_counts = periods['duration_days']
        summary[state] = {'min_days': day_counts.min(), 'max_days': day_counts.max()}

    # Second pass: report the findings.
    print("**Labelling**\n")
    for state, bounds in summary.items():
        shortest, longest = bounds['min_days'], bounds['max_days']

        if shortest is None or longest is None:
            message = f"No data available for \"{state}\"."
        elif shortest == longest:
            plural = 's' if shortest > 1 else ''
            message = f"{state.capitalize()} lasts {shortest} day{plural}."
        else:
            message = f"{state.capitalize()} lasts between {shortest} and {longest} days."
        print(message)

    return summary

# Health-state indicator columns to summarise; each is scanned for runs of
# rows whose value equals 1.
health_states = ['oestrus', 'calving', 'lameness', 'mastitis', 'other_disease', 'OK']

# Compute the min/max period length per state and print one summary line each.
durations_min_max = compute_and_display_durations(df_processed, health_states)

## Create Sliding Windows

Generate sliding windows of activity data and label them based on health conditions.

In [None]:
def _collect_window_hours(group, start_time, end_time):
    """Walk hour by hour over [start_time, end_time) and gather activity values.

    Returns the collected 'ACTIVITY_LEVEL' values and the timestamp of the last
    hour that was actually collected (None when nothing was collected).
    """
    collected_hours = []
    last_collected = None
    current_time = start_time
    previous_date = current_time.date()

    while current_time < end_time:
        current_date = current_time.date()

        if current_time in group.index:
            # Safeguard kept from the original logic; with one-hour steps and
            # previous_date refreshed on every step, consecutive dates differ
            # by at most one day.
            if (current_date - previous_date).days > 1:
                print(f"🛑 Stopping at {current_time} due to missing day.")
                break

            collected_hours.append(group.at[current_time, 'ACTIVITY_LEVEL'])
            last_collected = current_time
        else:
            # Hour is missing: skip it, unless it is the first hour examined on
            # a new calendar day, in which case the window is closed here.
            print(f"⚠️  Missing hour at {current_time}, skipping.")
            if (current_date - previous_date).days > 0:
                print(f"🛑 Stopping at {current_time} due to day gap after missing hour.")
                break

        previous_date = current_date
        current_time += timedelta(hours=1)

    return collected_hours, last_collected


def create_sliding_windows(df, window_size_hours=24, min_valid_hours=18, shift_hours=1, condition_columns=None):
    """Create sliding windows of activity data and label them based on conditions.

    Each record gets a timestamp built from its 'date' (time of day dropped)
    plus its 'hour' in hours, so hour 24 becomes 00:00 of the following day.
    Per cow, a window is started at a record's timestamp and walked forward
    one hour at a time for up to ``window_size_hours`` hours; windows with
    fewer than ``min_valid_hours`` collected hours are discarded. A kept
    window is labelled with the condition that has the highest sum over the
    rows from its start to its last collected hour, or with 'OK' when every
    condition sums to zero.

    The input DataFrame is not modified.

    Args:
        df (pd.DataFrame): Input DataFrame with the columns 'cow', 'date',
            'hour', 'ACTIVITY_LEVEL' and the condition columns.
        window_size_hours (int): Size of the sliding window in hours.
        min_valid_hours (int): Minimum number of valid hours required for a window.
        shift_hours (int): Minimum number of hours between the start of one
            window and the start of the next for the same cow. Records that
            fall closer than this to the previous window start do not open a
            window. With hourly records and the default of 1, every record
            opens a window.
        condition_columns (list): List of condition columns for labeling.
            Defaults to oestrus, calving, lameness, mastitis, other_disease.

    Returns:
        pd.DataFrame: One row per kept window with 'cow', 'start_time',
        'end_time' (timestamp of the last collected hour), 'duration_hours'
        (number of collected hours), 'activity_window' (list of activity
        values), one 0/1 column per condition, and 'OK'.
    """
    if condition_columns is None:
        condition_columns = ['oestrus', 'calving', 'lameness', 'mastitis', 'other_disease']
    condition_columns = list(condition_columns)

    # Work on a copy so the helper column below never leaks into the caller's frame.
    df = df.copy()

    # date (midnight) + hour offset; an hour value of 24 rolls over to 00:00 of
    # the next day without any special-casing or string parsing.
    df['timestamp'] = pd.to_datetime(df['date']).dt.normalize() + pd.to_timedelta(df['hour'], unit='h')
    df = df.sort_values(['cow', 'timestamp'])

    window_span = timedelta(hours=window_size_hours)
    shift_span = timedelta(hours=shift_hours)
    shifted_windows = []

    for cow_id, group in df.groupby('cow'):
        timestamps = group['timestamp'].tolist()
        group = group.set_index('timestamp')  # Index by timestamp for easy lookup

        print(f"\n🐄 Processing cow {cow_id} with {len(timestamps)} records...")

        next_allowed_start = None
        for start_time in timestamps:
            # Honour shift_hours: skip records too close to the previous start.
            if next_allowed_start is not None and start_time < next_allowed_start:
                continue
            next_allowed_start = start_time + shift_span

            collected_hours, end_effective = _collect_window_hours(
                group, start_time, start_time + window_span
            )

            if not collected_hours or len(collected_hours) < min_valid_hours:
                print(f"❌ Discarded window for cow {cow_id} from {start_time}: only {len(collected_hours)} valid hours.")
                continue

            # Label from every row up to the last collected hour. Skipped hours
            # make the window span longer than the number of collected hours,
            # so the end must be the real last timestamp, not start + count.
            condition_counts = group.loc[start_time:end_effective, condition_columns].sum()

            # Prepare final label dictionary: all 0s initially
            final_conditions = {col: 0 for col in condition_columns}

            if (condition_counts > 0).any():
                most_frequent_condition = condition_counts.idxmax()
                final_conditions[most_frequent_condition] = 1
                final_conditions['OK'] = 0
                label = most_frequent_condition
            else:
                final_conditions['OK'] = 1
                label = 'healthy'

            shifted_windows.append({
                'cow': cow_id,
                'start_time': start_time,
                'end_time': end_effective,
                'duration_hours': len(collected_hours),
                'activity_window': collected_hours,
                **final_conditions
            })

            print(f"✅ Window for cow {cow_id} → {label}")

    return pd.DataFrame(shifted_windows)

# Build the labelled sliding windows from the preprocessed data using the
# function's defaults (24-hour windows, at least 18 valid hours, 1-hour shift).
shifted_df = create_sliding_windows(df_processed)

## Save Results

Display a summary of the transformed dataset and save it to a CSV file.

In [None]:
def save_to_csv(df, output_path):
    """Write a DataFrame to disk as CSV and report where it went.

    The DataFrame's index is not written to the file.

    Args:
        df (pd.DataFrame): DataFrame to save.
        output_path (str): Path to save the CSV file.
    """
    df.to_csv(output_path, index=False)
    confirmation = f"CSV file saved to {output_path}"
    print(confirmation)

# Report how many windows were kept and preview the first rows of the result.
print("\n✅ DONE.")
print(f"Total valid windows collected: {len(shifted_df)}")
display(shifted_df.head())

# Persist the windows to CSV (path is relative to the working directory).
output_csv_path = "OneHourShift.csv"
save_to_csv(shifted_df, output_csv_path)