In [10]:
import numpy as np
import pandas as pd
import seaborn as sns
import missingno as msno
import matplotlib.pyplot as plt
from scipy import stats, fft
from datetime import timedelta
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

In [11]:
def load_and_preprocess_data(filepath):
    """Read the CSV at ``filepath`` and return it ordered by cow, then date.

    Surrounding whitespace is stripped from the column names and the
    ``date`` column is parsed with ``pd.to_datetime`` before sorting.
    """
    raw = pd.read_csv(filepath)
    raw.columns = raw.columns.str.strip()
    parsed = raw.assign(date=pd.to_datetime(raw['date']))
    return parsed.sort_values(by=['cow', 'date'])

In [20]:
def get_health_state_durations(df, health_state_column):
    """Find the episodes during which a health-state flag equals 1, per cow.

    Rows are walked in date order for each cow. An episode starts on the
    first row whose flag is 1 and ends on the last such row before either
    a row whose flag is not 1, or a jump of more than one day between two
    consecutive flagged rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``cow``, ``date`` (datetime-like, so that differences
        expose ``.days``) and ``health_state_column``.
    health_state_column : str
        Name of the flag column to analyse.

    Returns
    -------
    pandas.DataFrame
        One row per episode with columns ``cow``, ``health_state``,
        ``start_date``, ``end_date`` and ``duration_days`` (inclusive day
        count). The columns are present even when no episode is found.
    """
    columns = ['cow', 'health_state', 'start_date', 'end_date', 'duration_days']
    durations = []

    def _record(cow_id, start_date, end_date):
        # Both endpoints belong to the episode, hence the +1.
        durations.append({
            'cow': cow_id,
            'health_state': health_state_column,
            'start_date': start_date,
            'end_date': end_date,
            'duration_days': (end_date - start_date).days + 1
        })

    for cow_id, group in df.groupby('cow'):
        group = group.sort_values('date')
        start_date = None  # None means "not currently inside an episode"
        prev_date = None

        for date, flag in zip(group['date'], group[health_state_column]):
            if flag == 1:
                if start_date is None:
                    start_date = date
                elif (date - prev_date).days > 1:
                    # Gap in the records: close the running episode on the
                    # previous row and open a new one on this row.
                    _record(cow_id, start_date, prev_date)
                    start_date = date
            elif start_date is not None:
                _record(cow_id, start_date, prev_date)
                start_date = None
            prev_date = date

        # An episode still open at the end of the cow's records ends on the last row.
        if start_date is not None:
            _record(cow_id, start_date, prev_date)

    return pd.DataFrame(durations, columns=columns)

In [21]:
def analyze_health_states(df, health_states):
    """Compute episode durations for each health state and print a summary.

    Returns a pair ``(state_durations, durations_min_max)``: the first maps
    each state to the frame returned by ``get_health_state_durations``, the
    second maps each state to ``{'min_days': ..., 'max_days': ...}`` (both
    ``None`` when the state has no episode).
    """
    state_durations = {}
    durations_min_max = {}

    # Pass 1: collect the episodes and their shortest / longest duration.
    for state in health_states:
        episodes = get_health_state_durations(df, state)
        state_durations[state] = episodes

        if episodes.empty:
            durations_min_max[state] = {'min_days': None, 'max_days': None}
            continue

        lengths = episodes['duration_days']
        durations_min_max[state] = {'min_days': lengths.min(), 'max_days': lengths.max()}

    # Pass 2: report one line per state.
    for state, bounds in durations_min_max.items():
        shortest, longest = bounds['min_days'], bounds['max_days']

        if shortest is None or longest is None:
            message = f"No data available for \"{state}\"."
        elif shortest == longest:
            plural = 's' if shortest > 1 else ''
            message = f"{state.capitalize()} lasts {shortest} day{plural}."
        else:
            message = f"{state.capitalize()} lasts between {shortest} and {longest} days."
        print(message)

    return state_durations, durations_min_max

In [None]:
def _spread_episode(full_daily, cow_mask, cond, first_day, last_day, spread):
    """Flag one episode of ``cond``, widened by its spread rule, in ``full_daily`` (in place)."""
    spread_days = pd.date_range(
        first_day - timedelta(days=spread['before']),
        last_day + timedelta(days=spread['after']),
    )
    mask = cow_mask & full_daily['date'].isin(spread_days)
    full_daily.loc[mask, cond] = 1
    # Only days still labelled 'control' take this label, so on overlapping
    # days the condition that was processed first keeps the LABEL.
    full_daily.loc[mask & (full_daily['LABEL'] == 'control'), 'LABEL'] = cond
    full_daily.loc[mask, 'OK'] = 0


def label_and_align_data(df):
    """Label every row with a daily condition, spreading events over neighbouring days.

    Every column that is not one of ``cow``, ``date``, ``hour``,
    ``in_alleys``, ``rest``, ``eat``, ``activity_level`` or ``ok`` is treated
    as an event flag. Flags are reduced to one value per cow and day (max),
    laid out on a complete cow x day grid padded by 7 days on each side, and
    each run of consecutive flagged days (an episode) is widened by the
    ``before`` / ``after`` day counts in ``spread_rules``. Event columns that
    have no spread rule are carried through unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Row-level data; it is not modified.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` with ``date`` converted to datetime, the event
        columns replaced by their daily (spread) values, plus ``LABEL``
        (``'control'`` or a condition name) and ``OK`` (0 on any day covered
        by a spread episode, otherwise 1).
    """
    non_event_cols = ['cow', 'date', 'hour', 'in_alleys', 'rest', 'eat', 'activity_level', 'ok']
    event_cols = [col for col in df.columns if col not in non_event_cols]

    # One row per cow and day: an event counts for the day if any row flags it.
    daily = df.groupby(['cow', 'date'])[event_cols].max().reset_index()
    daily['date'] = pd.to_datetime(daily['date'])

    # Complete cow x day grid, padded so spreading never falls off the edge.
    all_dates = pd.date_range(daily['date'].min() - timedelta(days=7), daily['date'].max() + timedelta(days=7))
    cows = daily['cow'].unique()
    full_daily = pd.MultiIndex.from_product([cows, all_dates], names=['cow', 'date']).to_frame(index=False)

    full_daily = full_daily.merge(daily, on=['cow', 'date'], how='left')
    full_daily[event_cols] = full_daily[event_cols].fillna(0)

    full_daily['LABEL'] = 'control'
    full_daily['OK'] = 1

    spread_rules = {
        'oestrus': {'before': 1, 'after': 1},
        'calving': {'before': 2, 'after': 1},
        'lameness': {'before': 2, 'after': 1},
        'mastitis': {'before': 2, 'after': 1},
        'acidosis': {'before': 2, 'after': 1},
        'LPS': {'before': 2, 'after': 1},
        'other_disease': {'before': 2, 'after': 1},
        'accidents': {'before': 2, 'after': 1},
        'disturbance': {'before': 0, 'after': 0},
        'mixing': {'before': 0, 'after': 0},
        'management_changes': {'before': 0, 'after': 0},
    }

    for cond in event_cols:
        spread = spread_rules.get(cond)
        if spread is None:
            continue

        # Snapshot the flagged days before spreading, so the padding added
        # below is never itself treated as a new event day.
        flagged = full_daily.loc[full_daily[cond] == 1, ['cow', 'date']].sort_values(['cow', 'date'])

        for cow_id in flagged['cow'].unique():
            cow_days = flagged.loc[flagged['cow'] == cow_id, 'date']
            cow_mask = full_daily['cow'] == cow_id

            first_day = None
            last_day = None
            for day in cow_days:
                if last_day is None:
                    first_day = day
                elif (day - last_day).days > 1:
                    # Gap of more than one day: the running episode is over.
                    _spread_episode(full_daily, cow_mask, cond, first_day, last_day, spread)
                    first_day = day
                last_day = day

            if first_day is not None:
                _spread_episode(full_daily, cow_mask, cond, first_day, last_day, spread)

    # Build the row-level frame on a fresh object so the caller's frame is left untouched.
    rows = (
        df.drop(columns=event_cols + ['OK'], errors='ignore')
          .assign(date=lambda frame: pd.to_datetime(frame['date']))
    )

    final = rows.merge(full_daily[['cow', 'date', 'LABEL'] + event_cols + ['OK']], on=['cow', 'date'], how='left')

    # Keep the flag column consistent with the label it carries.
    for cond in event_cols:
        final.loc[(final['LABEL'] == cond) & (final[cond] == 0), cond] = 1

    return final

In [None]:
def clean_and_reassign_data(df, useful_classes, removed_classes):
    """Drop the removed classes and relabel the rows they leave unlabelled, using KNN.

    A row "needs replacement" when any column in ``removed_classes`` is 1.
    Its ``physio_label`` is the first entry of ``useful_classes`` whose
    column equals 1 (``None`` if there is none). Rows that need replacement
    and have no label receive one from a 5-nearest-neighbour classifier
    trained on the standardised behaviour features of the clean, labelled
    rows; the matching class column is then set to 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``cow``, ``date``, ``hour``, the behaviour features
        (``in_alleys``, ``rest``, ``eat``, ``activity_level``) and every
        column named in the two class lists. It is not modified.
    useful_classes : list of str
        Class columns to keep, in priority order for ``physio_label``.
    removed_classes : list of str
        Class columns to drop from the result.

    Returns
    -------
    pandas.DataFrame
        Clean rows plus reassigned rows, without the removed class columns,
        sorted by cow, date and hour. The helper columns
        ``had_removed_class``, ``has_useful_class`` and ``physio_label``
        are kept.

    Raises
    ------
    ValueError
        If rows need reassignment but no clean labelled row is available
        to train the classifier.
    """
    # Work on a copy: the helper columns added below must not leak into the caller's frame.
    df = df.copy()

    removed_flag = df[removed_classes].max(axis=1)
    df['had_removed_class'] = removed_flag
    df['has_useful_class'] = df[useful_classes].max(axis=1)

    total_rows = len(df)
    total_to_reassign = int(((df['had_removed_class'] == 1) & (df['has_useful_class'] == 0)).sum())
    # Guard the percentage against an empty frame.
    proportion = total_to_reassign / total_rows * 100 if total_rows else 0.0

    print(f"Total rows in dataset: {total_rows}")
    print(f"Rows needing reassignment: {total_to_reassign}")
    print(f"Proportion needing reassignment: {proportion:.2f}%")

    df['needs_replacement'] = removed_flag
    print("Available columns:", df.columns.tolist())

    # First useful class whose flag is 1 (argmax returns the first True), or None when no flag is set.
    useful_flags = df[useful_classes].eq(1).to_numpy()
    labels = np.array(useful_classes, dtype=object)[useful_flags.argmax(axis=1)]
    labels[~useful_flags.any(axis=1)] = None
    df['physio_label'] = labels

    to_replace = df[(df['needs_replacement'] == 1) & (df['physio_label'].isnull())].copy()
    clean_physio = df[(df['needs_replacement'] == 0) & (df['physio_label'].notnull())].copy()
    # NOTE(review): rows carrying both a removed class and a useful class, and
    # rows carrying neither, belong to neither subset and are therefore absent
    # from the result - confirm this is intended.

    behavior_features = ['in_alleys', 'rest', 'eat', 'activity_level']

    # Labels the classifier may be trained on and predict; this list is
    # independent of ``useful_classes``.
    disease_classes = ['mastitis', 'lameness', 'oestrus', 'other_disease', 'ok']

    if to_replace.empty:
        # Nothing to predict: skip the model so an empty batch cannot make it fail.
        print("No rows require reassignment.")
        df_final = clean_physio
    else:
        train_disease = clean_physio[clean_physio['physio_label'].isin(disease_classes)]
        if train_disease.empty:
            raise ValueError("No clean labelled rows available to train the KNN reassignment model.")

        X_train = train_disease[behavior_features]
        y_train = train_disease['physio_label']

        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)

        knn = KNeighborsClassifier(n_neighbors=5)
        knn.fit(X_train_scaled, y_train)

        X_missing = scaler.transform(to_replace[behavior_features])
        predicted_labels = knn.predict(X_missing)
        to_replace['physio_label'] = predicted_labels

        reassignment_summary = pd.Series(predicted_labels).value_counts().reset_index()
        reassignment_summary.columns = ['Physiological_Class', 'Number_of_Reassigned_Samples']
        print(reassignment_summary)

        # Mirror the predicted label into the matching class column.
        for cond in useful_classes:
            to_replace.loc[to_replace['physio_label'] == cond, cond] = 1

        df_final = pd.concat([clean_physio, to_replace], axis=0)

    df_final = df_final.drop(columns=removed_classes + ['needs_replacement'])
    df_final = df_final.sort_values(by=['cow', 'date', 'hour']).reset_index(drop=True)

    return df_final

In [24]:
def create_one_hour_shifted_windows(df, window_size_hours=24, min_valid_hours=18, shift_hours=1):
    """Build overlapping activity windows, one starting at every recorded hour of each cow.

    Each row's timestamp is its ``date`` (time of day discarded) plus
    ``hour`` hours, so hour 24 lands on 00:00 of the following day. From
    every recorded timestamp the function walks forward hour by hour for up
    to ``window_size_hours`` hours, collecting ``activity_level`` for the
    hours that exist. A missing reading on the first hour of a new calendar
    day ends the walk early. Windows with fewer than ``min_valid_hours``
    collected readings are discarded.

    Window label: if any disease column (``oestrus``, ``lameness``,
    ``mastitis``, ``other_disease``) is set inside the window, the disease
    with the highest row count gets 1 (the first listed wins ties) and
    ``ok`` is 0; otherwise ``ok`` is 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``cow``, ``date``, ``hour``, ``activity_level`` and the
        five condition columns. It is not modified.
    window_size_hours : int
        Maximum span of a window, in hours.
    min_valid_hours : int
        Minimum number of collected readings for a window to be kept.
    shift_hours : int
        Accepted for interface compatibility; not used - a window is started
        at every recorded timestamp.

    Returns
    -------
    pandas.DataFrame
        One row per window with ``cow``, ``start_time``, ``end_time``
        (timestamp of the last collected reading), ``duration_hours``
        (number of collected readings), ``activity_window`` (list of
        readings) and the five condition columns.
    """
    condition_columns = ['oestrus', 'lameness', 'mastitis', 'other_disease', 'ok']
    # 'ok' must not compete with the diseases when picking the window label:
    # otherwise a mostly healthy window would end up with every flag at 0.
    disease_columns = [col for col in condition_columns if col != 'ok']
    output_columns = ['cow', 'start_time', 'end_time', 'duration_hours', 'activity_window'] + condition_columns
    one_hour = pd.Timedelta(hours=1)

    # date + hour as a single timestamp; adding 24 hours rolls over to the next day.
    timestamps = pd.to_datetime(df['date']).dt.normalize() + pd.to_timedelta(df['hour'], unit='h')
    # assign() returns a new frame, so the caller's frame gains no columns.
    hourly = df.assign(timestamp=timestamps).sort_values(['cow', 'timestamp'])

    shifted_windows = []

    for cow_id, group in hourly.groupby('cow'):
        start_times = group['timestamp'].tolist()
        group = group.set_index('timestamp')

        for start_time in start_times:
            end_time = start_time + pd.Timedelta(hours=window_size_hours)
            collected_hours = []
            last_collected = None
            current_time = start_time
            previous_date = current_time.date()

            while current_time < end_time:
                current_date = current_time.date()
                if current_time in group.index:
                    collected_hours.append(group.loc[current_time, 'activity_level'])
                    last_collected = current_time
                elif current_date > previous_date:
                    # The first hour of a new day is missing: stop the window here.
                    break
                previous_date = current_date
                current_time += one_hour

            if not collected_hours or len(collected_hours) < min_valid_hours:
                continue

            # Slice up to the last reading actually collected, so gaps inside
            # the window do not cut the labelled span short.
            condition_window = group.loc[start_time:last_collected]
            disease_counts = condition_window[disease_columns].sum()

            final_conditions = {col: 0 for col in condition_columns}
            if (disease_counts > 0).any():
                final_conditions[disease_counts.idxmax()] = 1
            else:
                final_conditions['ok'] = 1

            shifted_windows.append({
                'cow': cow_id,
                'start_time': start_time,
                'end_time': last_collected,
                'duration_hours': len(collected_hours),
                'activity_window': collected_hours,
                **final_conditions
            })

    return pd.DataFrame(shifted_windows, columns=output_columns)

In [29]:
def process_dataset(input_path, output_path, health_states, useful_classes, removed_classes):
    """Run the load -> analyse -> label -> clean pipeline on one CSV and save the result.

    The cleaned frame is written to ``output_path`` (without the index) and
    also returned. The windowing step (``create_one_hour_shifted_windows``)
    is currently not part of this pipeline.
    """
    print(f"\nProcessing dataset: {input_path}")

    # 1. Load
    raw_df = load_and_preprocess_data(input_path)

    # 2. Report episode durations (printed; the returned summaries are not used here)
    print("\nAnalyzing health states:")
    analyze_health_states(raw_df, health_states)

    # 3. Daily labels with event spreading
    print("\nLabeling and aligning data...")
    labeled_df = label_and_align_data(raw_df)

    # 4. Drop removed classes and relabel the affected rows
    print("\nCleaning and reassigning data...")
    cleaned_df = clean_and_reassign_data(labeled_df, useful_classes, removed_classes)

    # 5. Persist
    cleaned_df.to_csv(output_path, index=False)
    print(f"\n✅ Processing complete. Results saved to {output_path}")

    return cleaned_df

In [33]:
# Pipeline configuration for dataset 1.
health_states = [
    'oestrus', 'calving', 'lameness', 'mastitis', 'ok',
    'other_disease', 'accidents', 'disturbance', 'mixing', 'management_changes',
]

# Classes kept in the output vs. classes dropped and relabelled.
useful_classes = ['mastitis', 'lameness', 'oestrus', 'calving', 'other_disease', 'ok']
removed_classes = ['management_changes', 'mixing', 'disturbance', 'accidents']

# Input / output pairs to run; further datasets can be appended to this list.
datasets = [
    {
        "input": r"C:/Users/lamia/Desktop/datasets/dataset1_physiological.csv",
        "output": r"C:/Users/lamia/Desktop/datasets/processed_dataset1.csv",
    },
]

# Run the pipeline on each pair; a failure is reported and the loop moves on.
for dataset in datasets:
    try:
        process_dataset(dataset["input"], dataset["output"], health_states, useful_classes, removed_classes)
    except Exception as e:
        print(f"Error processing {dataset['input']}: {e}")


Processing dataset: C:/Users/lamia/Desktop/datasets/dataset1_physiological.csv

Analyzing health states:
Oestrus lasts 1 day.
Calving lasts 1 day.
Lameness lasts 1 day.
Mastitis lasts between 2 and 3 days.
Ok lasts between 1 and 44 days.
Other_disease lasts 1 day.
No data available for "accidents".
Disturbance lasts between 1 and 2 days.
Mixing lasts between 1 and 2 days.
No data available for "management_changes".

Labeling and aligning data...

Cleaning and reassigning data...
Total rows: 106269
Rows to reassign: 5546 (5.22%)
Error processing C:/Users/lamia/Desktop/datasets/dataset1_physiological.csv: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().


In [25]:
# Pipeline configuration for dataset 2 (no 'calving' class in this run).
health_states = [
    'oestrus', 'lameness', 'mastitis', 'ok',
    'other_disease', 'accidents', 'disturbance', 'mixing', 'management_changes',
]

# Classes kept in the output vs. classes dropped and relabelled.
useful_classes = ['mastitis', 'lameness', 'oestrus', 'other_disease', 'ok']
removed_classes = ['management_changes', 'mixing', 'disturbance', 'accidents']

# Input / output pairs to run; further datasets can be appended to this list.
datasets = [
    {
        "input": r"C:/Users/lamia/Desktop/datasets/dataset2_physiological.csv",
        "output": r"C:/Users/lamia/Desktop/datasets/processed_dataset2.csv",
    },
]

# Run the pipeline on each pair; a failure is reported and the loop moves on.
for dataset in datasets:
    try:
        process_dataset(dataset["input"], dataset["output"], health_states, useful_classes, removed_classes)
    except Exception as e:
        print(f"Error processing {dataset['input']}: {e}")


Processing dataset: C:/Users/lamia/Desktop/datasets/dataset2_physiological.csv

Analyzing health states:
Oestrus lasts between 1 and 2 days.
Lameness lasts 1 day.
Mastitis lasts 1 day.
Ok lasts between 1 and 6 days.
Other_disease lasts 1 day.
No data available for "accidents".
Disturbance lasts between 1 and 4 days.
No data available for "mixing".
Management_changes lasts between 1 and 11 days.

Labeling and aligning data...

Cleaning and reassigning data...
Total rows in dataset: 40247
Rows needing reassignment: 15840
Proportion needing reassignment: 39.36%
Available columns: ['cow', 'date', 'hour', 'in_alleys', 'rest', 'eat', 'activity_level', 'ok', 'LABEL', 'oestrus', 'lameness', 'mastitis', 'other_disease', 'accidents', 'disturbance', 'mixing', 'management_changes', 'OK', 'had_removed_class', 'has_useful_class', 'needs_replacement']
  Physiological_Class  Number_of_Reassigned_Samples
0                  ok                         15819
1            lameness                         