In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import missingno as msno
import matplotlib.pyplot as plt
from scipy import stats, fft
from datetime import timedelta
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

**Labelling**

In [None]:
def load_and_preprocess_data(file_path):
    """Read the CSV at ``file_path`` and return it cleaned and ordered.

    Column names are stripped of surrounding whitespace, the ``date`` column
    is parsed into datetimes, and the rows are sorted by ``cow`` then ``date``.

    Parameters
    ----------
    file_path : str or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The sorted frame; the original row index is kept (not reset).
    """
    raw = pd.read_csv(file_path)
    # Header cells may carry stray spaces; normalise them before any lookup.
    tidy = raw.rename(columns=str.strip)
    tidy = tidy.assign(date=pd.to_datetime(tidy['date']))
    return tidy.sort_values(by=['cow', 'date'])

def get_health_state_durations(df, health_state_column):
    """Find every uninterrupted period during which a health state is active.

    For each cow the rows are walked in ``date`` order. A period starts at the
    first row whose ``health_state_column`` equals 1 and is closed when either
    a row with a different value is met, or two consecutive flagged rows are
    more than one day apart, or the cow's rows run out.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``cow``, ``date`` (datetime-like values supporting
        subtraction) and ``health_state_column``.
    health_state_column : str
        Name of the 0/1 indicator column to analyse.

    Returns
    -------
    pandas.DataFrame
        One row per period with columns ``cow``, ``health_state``,
        ``start_date``, ``end_date`` and ``duration_days`` (inclusive day
        count). The columns are present even when no period is found, so
        callers can index them without first checking ``.empty``.
    """
    columns = ['cow', 'health_state', 'start_date', 'end_date', 'duration_days']
    durations = []

    def _close_period(cow_id, start_date, end_date):
        # Single place that records a finished period (inclusive of both ends).
        durations.append({
            'cow': cow_id,
            'health_state': health_state_column,
            'start_date': start_date,
            'end_date': end_date,
            'duration_days': (end_date - start_date).days + 1,
        })

    for cow_id, group in df.groupby('cow'):
        group = group.sort_values('date')
        start_date = None  # None means "no period currently open"
        prev_date = None

        # Only two columns are needed, so iterate them directly instead of
        # materialising a full row object per record.
        for date, flag in zip(group['date'], group[health_state_column]):
            if flag == 1:
                if start_date is None:
                    start_date = date
                elif (date - prev_date).days > 1:
                    # Gap between flagged rows: close the old period at the
                    # last row seen and open a new one here.
                    _close_period(cow_id, start_date, prev_date)
                    start_date = date
            elif start_date is not None:
                _close_period(cow_id, start_date, prev_date)
                start_date = None
            prev_date = date

        # The cow's records ended while a period was still open.
        if start_date is not None:
            _close_period(cow_id, start_date, prev_date)

    return pd.DataFrame(durations, columns=columns)

def compute_health_durations(df, health_states):
    """Summarise, per health state, how long its periods last.

    For every name in ``health_states`` the period table is obtained from
    ``get_health_state_durations``; the shortest and longest ``duration_days``
    are recorded (``None`` for both when the table is empty) and a one-line
    summary per state is printed.

    Returns
    -------
    tuple(dict, dict)
        ``state_durations`` maps state -> period DataFrame;
        ``durations_min_max`` maps state -> ``{'min_days': ..., 'max_days': ...}``.
    """
    state_durations = {
        state: get_health_state_durations(df, state) for state in health_states
    }

    durations_min_max = {}
    for state, periods in state_durations.items():
        if periods.empty:
            durations_min_max[state] = {'min_days': None, 'max_days': None}
            continue
        lengths = periods['duration_days']
        durations_min_max[state] = {'min_days': lengths.min(), 'max_days': lengths.max()}

    # Report the range found for each state.
    for state, bounds in durations_min_max.items():
        shortest, longest = bounds['min_days'], bounds['max_days']
        if shortest is None or longest is None:
            print(f"No data available for \"{state}\".")
            continue
        if shortest == longest:
            print(f"{state.capitalize()} lasts {shortest} day{'s' if shortest > 1 else ''}.")
        else:
            print(f"{state.capitalize()} lasts between {shortest} and {longest} days.")

    return state_durations, durations_min_max

# Load and preprocess data
# df = load_and_preprocess_data(r"filtered_dataset_more_than_18_obs.csv")
# Assuming health_states is defined (from dataset columns or predefined list)
# health_states = ['oestrus', 'calving', 'lameness', 'mastitis', 'other_disease', 'acidosis', 'LPS', 'accidents', 'disturbance', 'mixing', 'management_changes']
# state_durations, durations_min_max = compute_health_durations(df, health_states)

In [None]:
def label_and_align_data(df, health_states):
    """Give every hourly row a daily ``LABEL`` and spread event flags around episodes.

    Steps:

    1. Every column that is not one of the known non-event columns is treated
       as an event indicator.
    2. Events are collapsed to one row per ``(cow, date)`` using ``max``.
    3. A full cow x day grid is built (7 days of margin on each side); days
       without data get 0 for every event, ``LABEL='control'`` and ``OK=1``.
    4. For each event that has a spread rule, consecutive flagged days of a cow
       form an episode. The episode is widened by ``before``/``after`` days;
       those days get the event set to 1 and ``OK`` set to 0, and ``LABEL`` is
       set to the event only where it is still ``'control'`` (so the first
       event in column order wins on overlapping days).
    5. The daily result is merged back onto the hourly rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Hourly data with at least ``cow`` and ``date`` plus event columns.
        It is NOT modified; a new frame is returned.
    health_states : list
        Accepted for interface compatibility; the function does not read it
        (event columns are discovered from ``df.columns``).

    Returns
    -------
    pandas.DataFrame
        ``df`` without its original event/``OK`` columns, left-merged with the
        daily ``LABEL``, spread event columns and ``OK``.
    """
    # Step 1: Identify event columns dynamically
    non_event_cols = ['cow', 'date', 'hour', 'IN_ALLEYS', 'REST', 'EAT', 'ACTIVITY_LEVEL', 'OK']
    event_cols = [col for col in df.columns if col not in non_event_cols]

    # Step 2: Aggregate daily events
    daily = df.groupby(['cow', 'date'])[event_cols].max().reset_index()
    daily['date'] = pd.to_datetime(daily['date'])

    # Step 3: Create full cow x day table
    margin = timedelta(days=7)
    all_dates = pd.date_range(daily['date'].min() - margin, daily['date'].max() + margin)
    cows = daily['cow'].unique()
    full_daily = pd.MultiIndex.from_product([cows, all_dates], names=['cow', 'date']).to_frame(index=False)

    # Merge and fill missing
    full_daily = full_daily.merge(daily, on=['cow', 'date'], how='left')
    full_daily[event_cols] = full_daily[event_cols].fillna(0)

    # Add LABEL and default OK
    full_daily['LABEL'] = 'control'
    full_daily['OK'] = 1

    # Step 4: Spread rules (days added before / after each episode)
    spread_rules = {
        'oestrus': {'before': 1, 'after': 1},
        'calving': {'before': 2, 'after': 1},
        'lameness': {'before': 2, 'after': 1},
        'mastitis': {'before': 2, 'after': 1},
        'acidosis': {'before': 2, 'after': 1},
        'LPS': {'before': 2, 'after': 1},
        'other_disease': {'before': 2, 'after': 1},
        'accidents': {'before': 2, 'after': 1},
        'disturbance': {'before': 0, 'after': 0},
        'mixing': {'before': 0, 'after': 0},
        'management_changes': {'before': 0, 'after': 0},
    }

    def _spread_episode(cow_id, cond, first_day, last_day):
        # Mark the widened episode window for one cow directly in full_daily.
        rule = spread_rules[cond]
        spread_days = pd.date_range(first_day - timedelta(days=rule['before']),
                                    last_day + timedelta(days=rule['after']))
        mask = (full_daily['cow'] == cow_id) & (full_daily['date'].isin(spread_days))
        full_daily.loc[mask, cond] = 1
        # Only overwrite days that no earlier event has claimed.
        full_daily.loc[mask & (full_daily['LABEL'] == 'control'), 'LABEL'] = cond
        full_daily.loc[mask, 'OK'] = 0

    for cond in event_cols:
        if cond not in spread_rules:
            continue
        # Snapshot of flagged days taken before any spreading for this event,
        # so days added by spreading do not seed further episodes.
        sub = full_daily[full_daily[cond] == 1][['cow', 'date']].sort_values(['cow', 'date'])
        for cow_id in sub['cow'].unique():
            cow_days = sub[sub['cow'] == cow_id]['date'].sort_values()
            episode_start = None
            prev_day = None
            for day in cow_days:
                if prev_day is None:
                    episode_start = day
                elif (day - prev_day).days > 1:
                    # Days are sorted, so the episode spans episode_start..prev_day.
                    _spread_episode(cow_id, cond, episode_start, prev_day)
                    episode_start = day
                prev_day = day
            if episode_start is not None:
                _spread_episode(cow_id, cond, episode_start, prev_day)

    # Step 5: Prepare the hourly frame before merging. ``drop`` returns a new
    # frame, so converting ``date`` here leaves the caller's ``df`` untouched.
    hourly = df.drop(columns=event_cols + ['OK'], errors='ignore')
    hourly['date'] = pd.to_datetime(hourly['date'])

    # Merge cleanly
    final = hourly.merge(full_daily[['cow', 'date', 'LABEL'] + event_cols + ['OK']], on=['cow', 'date'], how='left')

    # Step 6: If a day was labeled with an event, update hourly events if missing
    for cond in event_cols:
        final.loc[(final['LABEL'] == cond) & (final[cond] == 0), cond] = 1

    return final

# Label and align data
# final = label_and_align_data(df, health_states)
# final.to_csv(r"labelled&aligned_dataset.csv", index=False)
# final.head(24)

In [None]:
def visual_check_labeling(df, cow_id=156):
    """Show, day by day, the most frequent ``LABEL`` of one cow.

    The rows of ``cow_id`` are grouped by ``(cow, date)``; for each day the
    first mode of ``LABEL`` is kept. The resulting table, sorted by date, is
    rendered with ``display`` after a heading line is printed.
    """
    wanted_columns = ['cow', 'date', 'hour', 'LABEL']
    cow_rows = df.loc[df['cow'] == cow_id, wanted_columns]

    def _dominant_label(labels):
        # mode() is sorted; index 0 is the first of the most frequent values.
        return labels.mode()[0]

    per_day = (
        cow_rows
        .groupby(['cow', 'date'])['LABEL']
        .agg(_dominant_label)
        .reset_index()
    )

    print(f"Consecutive days labeling for Cow {cow_id}:")
    display(per_day.sort_values('date'))

# Visual check for a specific cow
# visual_check_labeling(final, cow_id=156)

**Investigating before imputation**

In [None]:
def investigate_before_imputation(file_path):
    """Count the rows that will need a new label once unwanted classes are removed.

    The CSV at ``file_path`` is loaded and two helper columns are added:

    * ``had_removed_class`` - row-wise max over the classes to be removed
      (1 if any of them is active);
    * ``has_useful_class`` - row-wise max over the classes that are kept
      (1 if any of them, including ``OK``, is active).

    A row needs reassignment when the first is 1 and the second is 0. The
    totals and the percentage are printed; an empty dataset reports 0.00%
    instead of failing on a division by zero.

    Parameters
    ----------
    file_path : str or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    tuple(pandas.DataFrame, list, list)
        The loaded frame (with the two helper columns), the list of useful
        classes and the list of removed classes.
    """
    # Step 0: Load the dataset
    df = pd.read_csv(file_path)

    # Step 1: Define useful and unwanted classes
    useful_classes = ['mastitis', 'lameness', 'oestrus', 'calving', 'other_disease', 'OK']
    removed_classes = ['management_changes', 'mixing', 'disturbance', 'accidents', 'LPS', 'acidosis']

    # Step 2: Tag samples based on unwanted classes
    df['had_removed_class'] = df[removed_classes].max(axis=1)  # 1 if any unwanted class was active

    # Step 3: After dropping unwanted classes, check if any useful class remains
    df['has_useful_class'] = df[useful_classes].max(axis=1)  # 1 if any useful class active

    # Step 4: Identify rows needing reassignment
    rows_to_reassign = df[(df['had_removed_class'] == 1) & (df['has_useful_class'] == 0)]

    # Step 5: Summary statistics
    total_rows = len(df)
    total_to_reassign = len(rows_to_reassign)
    # Guard the percentage: a header-only file has zero rows.
    proportion = (total_to_reassign / total_rows * 100) if total_rows else 0.0

    print(f"Total rows in dataset: {total_rows}")
    print(f"Rows needing reassignment: {total_to_reassign}")
    print(f"Proportion needing reassignment: {proportion:.2f}%")

    return df, useful_classes, removed_classes

# Investigate before imputation
# df, useful_classes, removed_classes = investigate_before_imputation(r"labelled&aligned_dataset.csv")

In [None]:
def prepare_knn_data(df, useful_classes, removed_classes):
    """Split the data into rows to relabel and clean rows, and build the KNN training set.

    Two columns are written onto ``df`` itself:

    * ``needs_replacement`` - row-wise max over ``removed_classes``;
    * ``physio_label`` - the first entry of ``useful_classes`` whose column
      equals 1 in that row, or ``None`` when there is none.

    Returns
    -------
    tuple
        ``to_replace`` (copy of rows with ``needs_replacement == 1`` and no
        ``physio_label``), ``clean_physio`` (copy of rows with
        ``needs_replacement == 0`` and a ``physio_label``), the standardized
        training matrix, the training labels, the fitted ``StandardScaler``
        and the list of behaviour feature names.
    """
    # Flag rows where any of the classes to be removed is active.
    df['needs_replacement'] = df[removed_classes].max(axis=1)

    def get_physio_label(row):
        # Order of useful_classes decides which class wins when several are set.
        return next((cond for cond in useful_classes if row[cond] == 1), None)

    df['physio_label'] = df.apply(get_physio_label, axis=1)

    # Rows to relabel vs. rows that can be trusted as they are.
    lacks_label = df['physio_label'].isnull()
    to_replace = df[(df['needs_replacement'] == 1) & lacks_label].copy()
    clean_physio = df[(df['needs_replacement'] == 0) & ~lacks_label].copy()

    behavior_features = ['IN_ALLEYS', 'REST', 'EAT', 'ACTIVITY_LEVEL']

    # Training uses the disease/physiological classes only (``OK`` is excluded).
    disease_classes = ['mastitis', 'lameness', 'oestrus', 'calving', 'other_disease']
    is_disease = clean_physio['physio_label'].isin(disease_classes)
    train_disease = clean_physio[is_disease]

    y_train = train_disease['physio_label']

    # Fit the scaler on the training features and keep it for later transforms.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(train_disease[behavior_features])

    return to_replace, clean_physio, X_train_scaled, y_train, scaler, behavior_features

# Prepare data for KNN
# to_replace, clean_physio, X_train_scaled, y_train, scaler, behavior_features = prepare_knn_data(df, useful_classes, removed_classes)

# Label reassignment with KNN

In [None]:
def apply_knn_and_save(to_replace, clean_physio, X_train_scaled, y_train, scaler, behavior_features, useful_classes, output_path='final2_cleaned_and_reassigned_dataset.csv', removed_classes=None):
    """Relabel the rows in ``to_replace`` with a 5-nearest-neighbours model and save the result.

    Parameters
    ----------
    to_replace : pandas.DataFrame
        Rows whose ``physio_label`` must be predicted. A copy is modified, the
        caller's frame is left untouched.
    clean_physio : pandas.DataFrame
        Rows that already carry a trusted ``physio_label``.
    X_train_scaled, y_train
        Training features (already scaled) and labels for the classifier.
    scaler
        Fitted scaler applied to ``to_replace[behavior_features]`` before
        prediction.
    behavior_features : list of str
        Feature columns fed to the classifier.
    useful_classes : list of str
        Indicator columns; the one matching the predicted label is set to 1.
    output_path : str
        CSV destination for the combined dataset.
    removed_classes : list of str, optional
        Columns dropped from the output together with ``needs_replacement``.
        Passed explicitly so the function no longer depends on a notebook
        global; defaults to the six classes removed elsewhere in this
        notebook.

    Returns
    -------
    pandas.DataFrame
        A random sample of up to 10 rows (``cow``, ``date``, ``hour``,
        ``physio_label``) of the saved dataset.
    """
    if removed_classes is None:
        removed_classes = ['management_changes', 'mixing', 'disturbance', 'accidents', 'LPS', 'acidosis']

    # Work on a private copy so the caller's frame is not modified.
    to_replace = to_replace.copy()

    if to_replace.empty:
        # Nothing to predict: skip fitting/predicting entirely.
        predicted_labels = np.array([], dtype=object)
    else:
        # Train KNN
        knn = KNeighborsClassifier(n_neighbors=5)
        knn.fit(X_train_scaled, y_train)

        # Step 6: Predict missing labels
        X_missing = scaler.transform(to_replace[behavior_features])
        predicted_labels = knn.predict(X_missing)

    # Assign the new labels
    to_replace['physio_label'] = predicted_labels

    # Step 6.1: Analyze reassignment
    reassignment_summary = pd.Series(predicted_labels).value_counts().reset_index()
    reassignment_summary.columns = ['Physiological_Class', 'Number_of_Reassigned_Samples']
    print(reassignment_summary)

    # Step 7: Merge back - switch on the indicator column of the predicted class
    for cond in useful_classes:
        to_replace.loc[to_replace['physio_label'] == cond, cond] = 1

    # Clean removed columns
    df_final = pd.concat([clean_physio, to_replace], axis=0)
    df_final = df_final.drop(columns=list(removed_classes) + ['needs_replacement'])

    # Optional: reorder if needed
    df_final = df_final.sort_values(by=['cow', 'date', 'hour']).reset_index(drop=True)

    # Step 8: Save the cleaned and reconstructed dataset
    df_final.to_csv(output_path, index=False)

    # Show sample; cap the size so small datasets do not make sample() raise.
    sample_size = min(10, len(df_final))
    return df_final[['cow', 'date', 'hour', 'physio_label']].sample(sample_size)

# Apply KNN and save
# sample = apply_knn_and_save(to_replace, clean_physio, X_train_scaled, y_train, scaler, behavior_features, useful_classes)

# One-hour shifting process

In [None]:
def one_hour_shift_process(df, window_size_hours=24, min_valid_hours=18, shift_hours=1, verbose=True):
    """Build sliding activity windows per cow, one window per recorded hour.

    Every recorded timestamp of a cow is used as a window start. The window
    walks forward hour by hour for ``window_size_hours``; recorded hours
    contribute their ``ACTIVITY_LEVEL``, missing hours are skipped, and the
    walk stops early when a missing hour falls on a later calendar day than
    the previously visited hour. Windows with fewer than ``min_valid_hours``
    collected values are discarded.

    Each kept window is labelled with the condition that has the highest
    count over the rows from the window start to the last collected hour
    (``OK=1`` when no condition is present at all).

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``cow``, ``date``, ``hour``, ``ACTIVITY_LEVEL`` and the five
        condition columns. ``hour == 24`` is interpreted as 00:00 of the
        following day. The frame is not modified.
    window_size_hours : int
        Length of the window in hours.
    min_valid_hours : int
        Minimum number of collected hours for a window to be kept.
    shift_hours : int
        Accepted for interface compatibility; currently not used (windows
        start at every recorded timestamp).
    verbose : bool
        When True (default) progress messages and a preview of the result are
        emitted, as before. Set to False to silence them.

    Returns
    -------
    pandas.DataFrame
        One row per kept window with ``cow``, ``start_time``, ``end_time``
        (timestamp of the last collected hour), ``duration_hours``,
        ``activity_window`` (list of values), the condition flags and ``OK``.
    """
    condition_columns = ['oestrus', 'calving', 'lameness', 'mastitis', 'other_disease']

    def log(message):
        if verbose:
            print(message)

    # Midnight of the record's date plus the hour offset. Hour 24 therefore
    # lands on 00:00 of the next day without special-casing, and float-typed
    # hours (e.g. 5.0) are handled, unlike string concatenation.
    hour_offsets = pd.to_timedelta(pd.to_numeric(df['hour']), unit='h')
    timestamps_all = pd.to_datetime(df['date']).dt.normalize() + hour_offsets

    # ``assign`` returns a new frame, so the caller's ``df`` gains no columns.
    work = df.assign(timestamp=timestamps_all).sort_values(['cow', 'timestamp'])

    window_length = timedelta(hours=window_size_hours)
    one_hour = timedelta(hours=1)
    shifted_windows = []

    for cow_id, group in work.groupby('cow'):
        timestamps = group['timestamp'].tolist()
        group = group.set_index('timestamp')  # Index by timestamp for easy lookup

        log(f"\n🐄 Processing cow {cow_id} with {len(timestamps)} records...")

        for start_time in timestamps:
            end_time = start_time + window_length
            collected_hours = []
            last_collected = None
            current_time = start_time
            previous_date = current_time.date()

            while current_time < end_time:
                current_date = current_time.date()

                if current_time in group.index:
                    collected_hours.append(group.loc[current_time]['ACTIVITY_LEVEL'])
                    last_collected = current_time
                else:
                    # Hour is missing, skip it but advance time
                    log(f"⚠️  Missing hour at {current_time}, skipping.")
                    if (current_date - previous_date).days > 0:
                        log(f"🛑 Stopping at {current_time} due to day gap after missing hour.")
                        break

                previous_date = current_date
                current_time += one_hour

            if len(collected_hours) >= min_valid_hours:
                # The window really ends at the last hour that was collected.
                # Deriving it from the count would cut the window short
                # whenever a missing hour had been skipped.
                end_effective = last_collected

                # Get condition values across the window
                condition_window = group.loc[start_time:end_effective]
                condition_counts = condition_window[condition_columns].sum()

                # Prepare final label dictionary: all 0s initially
                final_conditions = {col: 0 for col in condition_columns}
                final_conditions['OK'] = 1
                window_label = 'healthy'

                if (condition_counts > 0).any():
                    most_frequent_condition = condition_counts.idxmax()
                    final_conditions[most_frequent_condition] = 1
                    final_conditions['OK'] = 0
                    window_label = most_frequent_condition

                shifted_windows.append({
                    'cow': cow_id,
                    'start_time': start_time,
                    'end_time': end_effective,
                    'duration_hours': len(collected_hours),
                    'activity_window': collected_hours,
                    **final_conditions
                })

                log(f"✅ Window for cow {cow_id} → {window_label}")
            else:
                log(f"❌ Discarded window for cow {cow_id} from {start_time}: only {len(collected_hours)} valid hours.")

    # Final DataFrame
    shifted_df = pd.DataFrame(shifted_windows)

    log("\n✅ DONE.")
    log(f"Total valid windows collected: {len(shifted_df)}")
    if verbose:
        try:
            display(shifted_df.head())
        except NameError:
            # ``display`` is only injected by IPython; fall back to plain text.
            print(shifted_df.head())

    return shifted_df

# Perform one-hour shift process
# shifted_df = one_hour_shift_process(df)

In [None]:
def save_shifted_data(shifted_df, output_path='OneHourShift.csv'):
    """Write ``shifted_df`` to ``output_path`` as CSV, without the index, and report where it went."""
    shifted_df.to_csv(path_or_buf=output_path, index=False)
    confirmation = f"CSV file saved to {output_path}"
    print(confirmation)

# Save shifted data
# save_shifted_data(shifted_df)