In [9]:
import pandas as pd
from datetime import timedelta
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# ====================================================================================================
#  Core Functions
# ====================================================================================================

def _episode_record(cow_id, health_state_column, start_date, end_date):
    """Build one episode row; the duration counts both the first and last day."""
    return {
        'cow': cow_id,
        'health_state': health_state_column,
        'start_date': start_date,
        'end_date': end_date,
        'duration_days': (end_date - start_date).days + 1,
    }


def _get_health_state_durations(df, health_state_column):
    """
    Collect every episode of one health state, cow by cow.

    An episode is a run of rows flagged 1 for the state. It is closed either
    by a row that is not flagged 1, or by a gap of more than one day between
    two consecutive flagged rows.

    Args:
        df (pd.DataFrame): Rows already sorted chronologically within each cow;
            must contain 'cow', 'date' and the health state column.
        health_state_column (str): Name of the 0/1 health state column.

    Returns:
        pd.DataFrame: One row per episode (cow, health_state, start_date,
            end_date, duration_days); empty when no episode exists.
    """
    durations = []
    for cow_id, group in df.groupby('cow'):
        start_date = None  # None means no episode is currently open
        prev_date = None

        # Iterating the two columns directly avoids the per-row Series
        # construction that iterrows() performs.
        for day, flag in zip(group['date'], group[health_state_column]):
            if flag == 1:
                if start_date is None:
                    start_date = day
                elif (day - prev_date).days > 1:
                    # Missing days inside a flagged run split it in two.
                    durations.append(
                        _episode_record(cow_id, health_state_column, start_date, prev_date)
                    )
                    start_date = day
            elif start_date is not None:
                # The episode ended on the previous row, not on this one.
                durations.append(
                    _episode_record(cow_id, health_state_column, start_date, prev_date)
                )
                start_date = None
            prev_date = day

        # An episode still open at the cow's last row ends on that row.
        if start_date is not None:
            durations.append(
                _episode_record(cow_id, health_state_column, start_date, prev_date)
            )
    return pd.DataFrame(durations)


def analyze_health_state_durations(file_path):
    """
    Analyzes and prints the duration of each health state from the dataset.

    Column names are stripped and lower-cased before use, the same
    normalisation the labeling and K-NN steps apply, so a header such as
    ' Oestrus ' is recognised as 'oestrus'.

    Args:
        file_path (str): The path to the input CSV file. The file must provide
            'cow' and 'date' columns; an 'hour' column, when present, is used
            to order rows within a day.

    Returns:
        dict: Maps each known health state to {'min_days': ..., 'max_days': ...}.
            Both values are None when the column is absent or has no episode.
            An empty dict is returned when the file does not exist.
    """
    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: The file at {file_path} was not found.")
        return {}

    df.columns = df.columns.str.strip().str.lower()
    df['date'] = pd.to_datetime(df['date'])

    # Order rows chronologically; without 'hour' the order inside a day would
    # depend on the order of the rows in the file.
    sort_keys = ['cow', 'date', 'hour'] if 'hour' in df.columns else ['cow', 'date']
    df = df.sort_values(by=sort_keys)

    health_states = [
        'oestrus', 'calving', 'lameness', 'mastitis', 'lps',
        'acidosis', 'other_disease', 'accidents', 'disturbance',
        'mixing', 'management_changes'
    ]
    durations_min_max = {}

    for state in health_states:
        summary = {'min_days': None, 'max_days': None}
        if state in df.columns:
            durations_df = _get_health_state_durations(df, state)
            if not durations_df.empty:
                summary = {
                    'min_days': durations_df['duration_days'].min(),
                    'max_days': durations_df['duration_days'].max(),
                }
        durations_min_max[state] = summary

    print("Health State Durations:")
    for state, durations in durations_min_max.items():
        min_days = durations['min_days']
        max_days = durations['max_days']
        if min_days is None or max_days is None:
            print(f"No data available for \"{state}\".")
        elif min_days == max_days:
            print(f"{state.capitalize()} lasts {min_days} day{'s' if min_days > 1 else ''}.")
        else:
            print(f"{state.capitalize()} lasts between {min_days} and {max_days} days.")

    return durations_min_max

def _consecutive_day_runs(days):
    """Split chronologically sorted days into runs with no gap of more than one day."""
    runs = []
    for day in days:
        starts_new_run = not runs or (day - runs[-1][-1]).days > 1
        if starts_new_run:
            runs.append([day])
        else:
            runs[-1].append(day)
    return runs


def _spread_episode(full_daily, cow_id, cond, first_day, last_day, spread):
    """Mark one episode, widened by its before/after window, on the cow's daily rows (in place)."""
    spread_days = pd.date_range(
        first_day - timedelta(days=spread['before']),
        last_day + timedelta(days=spread['after']),
    )
    mask = (full_daily['cow'] == cow_id) & (full_daily['date'].isin(spread_days))
    full_daily.loc[mask, cond] = 1
    # Only days still labelled 'control' take this label: the first condition
    # to claim a day keeps it.
    full_daily.loc[mask & (full_daily['LABEL'] == 'control'), 'LABEL'] = cond
    full_daily.loc[mask, 'ok'] = 0


def label_and_align_data(file_path):
    """
    Labels and aligns the dataset by spreading event labels over a specific
    time window before and after the event.

    Every column other than cow/date/hour/behaviour/ok is treated as an event
    flag. Flags are reduced to one value per cow and day, grouped into
    episodes of consecutive days, widened by the per-condition window in
    `spread_rules`, and merged back onto the original rows.

    Args:
        file_path (str): The path to the input CSV file.

    Returns:
        pd.DataFrame: The original rows with the event columns replaced by
            their spread versions, plus a 'LABEL' column ('control' or the
            name of a condition) and an 'ok' column (0 inside any spread
            window, 1 elsewhere). An empty DataFrame is returned when the
            file is missing or holds no rows.
    """
    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: The file at {file_path} was not found.")
        return pd.DataFrame()
    except pd.errors.EmptyDataError:
        print(f"Error: The file at {file_path} is empty.")
        return pd.DataFrame()

    if df.empty:
        # Without rows the date range below would be built from NaT bounds
        # and raise; report it the same way as a missing file instead.
        print(f"Error: The file at {file_path} contains no rows.")
        return pd.DataFrame()

    df.columns = df.columns.str.strip().str.lower()
    df['date'] = pd.to_datetime(df['date'])

    non_event_cols = ['cow', 'date', 'hour', 'in_alleys', 'rest', 'eat', 'activity_level', 'ok']
    event_cols = [col for col in df.columns if col not in non_event_cols]

    # One row per cow and day: an event counts for the day if any row has it.
    daily = df.groupby(['cow', 'date'])[event_cols].max().reset_index()
    daily['date'] = pd.to_datetime(daily['date'])

    # Full cow x day grid, padded by a week on both sides so spread windows
    # at the edges of the recording period still have rows to land on.
    all_dates = pd.date_range(daily['date'].min() - timedelta(days=7), daily['date'].max() + timedelta(days=7))
    cows = daily['cow'].unique()
    full_daily = pd.MultiIndex.from_product([cows, all_dates], names=['cow', 'date']).to_frame(index=False)

    full_daily = full_daily.merge(daily, on=['cow', 'date'], how='left')
    full_daily[event_cols] = full_daily[event_cols].fillna(0)
    full_daily['LABEL'] = 'control'
    full_daily['ok'] = 1

    spread_rules = {
        'oestrus': {'before': 1, 'after': 1},
        'calving': {'before': 2, 'after': 1},
        'lameness': {'before': 2, 'after': 1},
        'mastitis': {'before': 2, 'after': 1},
        'lps': {'before': 2, 'after': 1},
        'acidosis': {'before': 2, 'after': 1},
        'other_disease': {'before': 2, 'after': 1},
        'accidents': {'before': 2, 'after': 1},
        'disturbance': {'before': 0, 'after': 0},
        'mixing': {'before': 0, 'after': 0},
        'management_changes': {'before': 0, 'after': 0},
    }

    for cond in event_cols:
        spread = spread_rules.get(cond)
        if spread is None:
            continue
        # Snapshot the flagged days before spreading, so widened days are not
        # themselves treated as new event days for this condition.
        sub = full_daily[full_daily[cond] == 1][['cow', 'date']].sort_values(['cow', 'date'])
        for cow_id, cow_days in sub.groupby('cow')['date']:
            for run in _consecutive_day_runs(cow_days):
                _spread_episode(full_daily, cow_id, cond, run[0], run[-1], spread)

    df = df.drop(columns=event_cols + ['ok'], errors='ignore')
    final_df = df.merge(full_daily[['cow', 'date', 'LABEL'] + event_cols + ['ok']], on=['cow', 'date'], how='left')

    # Safety net: a row labelled with a condition must carry that condition's flag.
    for cond in event_cols:
        final_df.loc[(final_df['LABEL'] == cond) & (final_df[cond] == 0), cond] = 1

    return final_df

def reassign_labels_with_knn(df):
    """
    Reassigns labels for samples belonging to removed classes using a K-NN classifier.

    Rows flagged only with a removed class receive a 'physio_label' predicted
    by a K-NN model trained on the rows that carry a useful class and no
    removed class, using the four behaviour columns as features. The removed
    class columns, the helper column and the label column are dropped from
    the result.

    The input frame is not modified; all work happens on a copy whose column
    names are stripped and lower-cased.

    Args:
        df (pd.DataFrame): The DataFrame with 'LABEL' and event columns. It
            must contain the behaviour columns 'in_alleys', 'rest', 'eat' and
            'activity_level', and 'cow', 'date' and 'hour' for the final sort.
            Class columns that are absent are simply skipped.

    Returns:
        pd.DataFrame: The final DataFrame with reassigned labels and cleaned
            columns. When no clean training rows exist, the normalised data
            is returned without the removed class columns and no K-NN is run.
    """
    # Work on a copy so the caller's frame keeps its column names and is not
    # extended with helper columns.
    df = df.copy()
    df.columns = df.columns.str.strip().str.lower()

    useful_classes = ['mastitis', 'lameness', 'oestrus', 'calving', 'other_disease', 'ok']
    removed_classes = ['management_changes', 'mixing', 'disturbance', 'accidents', 'lps', 'acidosis']
    behavior_features = ['in_alleys', 'rest', 'eat', 'activity_level']

    # Not every dataset necessarily carries every class column.
    present_removed = [col for col in removed_classes if col in df.columns]
    present_useful = [col for col in useful_classes if col in df.columns]

    if present_removed:
        df['needs_replacement'] = df[present_removed].max(axis=1)
    else:
        df['needs_replacement'] = 0

    # The first flagged class in useful_classes order wins; assigning in
    # reverse order lets earlier classes overwrite later ones.
    physio_label = pd.Series(index=df.index, dtype=object)
    for cond in reversed(present_useful):
        physio_label.loc[(df[cond] == 1).to_numpy()] = cond
    df['physio_label'] = physio_label

    # NOTE(review): rows that carry both a removed class and a useful class
    # fall into neither subset below and are therefore absent from the final
    # result -- confirm this exclusion is intended.
    to_replace = df[(df['needs_replacement'] == 1) & (df['physio_label'].isnull())].copy()
    clean_physio = df[(df['needs_replacement'] == 0) & (df['physio_label'].notnull())].copy()

    if clean_physio.empty:
        print("No clean data to train K-NN on. Returning original data.")
        return df.drop(columns=removed_classes + ['needs_replacement'], errors='ignore')

    X_train = clean_physio[behavior_features]
    y_train = clean_physio['physio_label']

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)

    # K-NN cannot query more neighbours than there are training samples.
    knn = KNeighborsClassifier(n_neighbors=min(5, len(X_train)))
    knn.fit(X_train_scaled, y_train)

    if not to_replace.empty:
        X_missing = scaler.transform(to_replace[behavior_features])
        predicted_labels = knn.predict(X_missing)
        to_replace['physio_label'] = predicted_labels

        reassignment_summary = pd.Series(predicted_labels).value_counts().reset_index()
        reassignment_summary.columns = ['Physiological_Class', 'Number_of_Reassigned_Samples']
        print("\nReassignment Summary:")
        print(reassignment_summary)

        # Keep the 0/1 class columns consistent with the predicted label.
        for cond in present_useful:
            to_replace.loc[to_replace['physio_label'] == cond, cond] = 1
    else:
        print("\nNo samples need replacement.")

    df_final = pd.concat([clean_physio, to_replace], axis=0)
    # Column names were lower-cased above, so the label column is 'label'.
    df_final = df_final.drop(columns=removed_classes + ['needs_replacement', 'label'], errors='ignore')
    df_final = df_final.sort_values(by=['cow', 'date', 'hour']).reset_index(drop=True)

    return df_final

# ====================================================================================================
#  Main Pipeline & Execution Example
# ====================================================================================================

def main_processing_pipeline(input_file_path, output_file_path):
    """
    Executes the entire data processing workflow.

    Runs the duration analysis (printed only), labels and aligns the data,
    saves that intermediate result next to the final output with a
    '_labeled.csv' suffix, reassigns removed classes with K-NN and saves the
    final result.

    Args:
        input_file_path (str): The path to the raw dataset.
        output_file_path (str): The path to save the final cleaned dataset.

    Returns:
        pd.DataFrame: The final, cleaned, and labeled DataFrame. An empty
            DataFrame is returned when the labeling step yields no rows.
    """
    print("Step 1: Analyzing health state durations...")
    analyze_health_state_durations(input_file_path)

    print("\n" + "="*50 + "\n")
    print("Step 2: Labeling and aligning data...")
    labelled_df = label_and_align_data(input_file_path)
    if labelled_df.empty:
        return pd.DataFrame()

    # Strip '.csv' only when it is the suffix; a '.csv' elsewhere in the path
    # (for example in a directory name) must be left alone.
    if output_file_path.endswith('.csv'):
        base_path = output_file_path[:-len('.csv')]
    else:
        base_path = output_file_path
    labeled_path = f"{base_path}_labeled.csv"

    labelled_df.to_csv(labeled_path, index=False)
    print(f"Intermediate labeled dataset saved to {labeled_path}")

    print("\n" + "="*50 + "\n")
    print("Step 3: Reassigning labels with K-NN...")
    final_df = reassign_labels_with_knn(labelled_df)

    if not final_df.empty:
        final_df.to_csv(output_file_path, index=False)
        print(f"\nFinal cleaned dataset saved to {output_file_path}")
        print("\nSample of final dataset:")
        # sample() raises when asked for more rows than the frame holds.
        preview = final_df[['cow', 'date', 'hour', 'physio_label']].sample(min(10, len(final_df)))
        # display() exists only inside IPython/Jupyter; fall back to print().
        try:
            show = display
        except NameError:
            show = print
        show(preview)

    return final_df



In [3]:
if __name__ == '__main__':
    # Dataset 1: raw observations in, K-NN-relabelled table out.
    # Both are absolute local paths; point them at your own files before running.
    input_data_path = r"C:\Users\lamia\Desktop\datasets\dataset1_more_than_18_obs.csv"
    output_data_path = r"C:\Users\lamia\Desktop\datasets\dataset1_knn.csv"

    # Run all three stages: duration analysis, labeling, K-NN reassignment.
    processed_df = main_processing_pipeline(
        input_file_path=input_data_path,
        output_file_path=output_data_path,
    )

Step 1: Analyzing health state durations...
Health State Durations:
Oestrus lasts 1 day.
Calving lasts 1 day.
Lameness lasts 1 day.
Mastitis lasts between 2 and 3 days.
No data available for "lps".
No data available for "acidosis".
Other_disease lasts 1 day.
No data available for "accidents".
Disturbance lasts between 1 and 2 days.
Mixing lasts between 1 and 2 days.
No data available for "management_changes".


Step 2: Labeling and aligning data...
Intermediate labeled dataset saved to C:\Users\lamia\Desktop\datasets\dataset1_knn_labeled.csv


Step 3: Reassigning labels with K-NN...

Reassignment Summary:
  Physiological_Class  Number_of_Reassigned_Samples
0                  ok                          7357
1             oestrus                             2
2            lameness                             1
3             calving                             1

Final cleaned dataset saved to C:\Users\lamia\Desktop\datasets\dataset1_knn.csv

Sample of final dataset:


Unnamed: 0,cow,date,hour,physio_label
94316,6714,2019-03-13,5,ok
102081,6750,2019-04-15,7,ok
20564,6629,2018-12-09,21,ok
24827,6633,2018-12-22,17,ok
61539,6675,2019-03-17,12,ok
46841,6646,2019-04-16,7,ok
40751,6643,2019-01-13,3,ok
31730,6637,2018-11-04,18,ok
47072,6656,2018-11-02,22,ok
10467,6612,2019-02-08,15,ok


In [4]:
# Run the duration analysis again, this time on the file the pipeline wrote,
# so the result can be compared with the figures for the raw data.
print("Step 1: Analyzing health state durations...")
durations = analyze_health_state_durations(file_path=output_data_path)

Step 1: Analyzing health state durations...
Health State Durations:
Oestrus lasts between 1 and 3 days.
Calving lasts between 1 and 3 days.
Lameness lasts between 1 and 4 days.
Mastitis lasts between 2 and 6 days.
No data available for "lps".
No data available for "acidosis".
Other_disease lasts between 2 and 6 days.
No data available for "accidents".
No data available for "disturbance".
No data available for "mixing".
No data available for "management_changes".


In [10]:
if __name__ == '__main__':
    # Dataset 2: raw observations in, K-NN-relabelled table out.
    # Both are absolute local paths; point them at your own files before running.
    input_data_path = r"C:\Users\lamia\Desktop\datasets\dataset2_more_than_18_obs.csv"
    output_data_path = r"C:\Users\lamia\Desktop\datasets\dataset2_knn.csv"

    # Run all three stages: duration analysis, labeling, K-NN reassignment.
    processed_df = main_processing_pipeline(
        input_file_path=input_data_path,
        output_file_path=output_data_path,
    )

    # Measure the episode durations again on the file that was just written.
    print("Step 1: Analyzing health state durations after knn...")
    durations = analyze_health_state_durations(file_path=output_data_path)

Step 1: Analyzing health state durations...
Health State Durations:
Oestrus lasts between 1 and 2 days.
No data available for "calving".
Lameness lasts 1 day.
Mastitis lasts 1 day.
No data available for "lps".
Acidosis lasts between 1 and 17 days.
Other_disease lasts 1 day.
No data available for "accidents".
Disturbance lasts between 1 and 4 days.
No data available for "mixing".
Management_changes lasts between 1 and 11 days.


Step 2: Labeling and aligning data...
Intermediate labeled dataset saved to C:\Users\lamia\Desktop\datasets\dataset2_knn_labeled.csv


Step 3: Reassigning labels with K-NN...

Reassignment Summary:
  Physiological_Class  Number_of_Reassigned_Samples
0                  ok                         22876
1            lameness                            18
2             oestrus                             2

Final cleaned dataset saved to C:\Users\lamia\Desktop\datasets\dataset2_knn.csv

Sample of final dataset:


Unnamed: 0,cow,date,hour,physio_label
19448,2170,2015-03-09,10,ok
584,151,2015-03-26,9,ok
729,151,2015-04-01,10,ok
2099,153,2015-03-29,12,ok
11881,2152,2015-04-08,3,ok
31948,7163,2015-03-14,6,ok
20412,2170,2015-04-18,14,ok
9111,1177,2015-04-07,17,ok
28338,2187,2015-04-09,20,ok
26212,2185,2015-03-04,6,ok


Step 1: Analyzing health state durations after knn...
Health State Durations:
Oestrus lasts between 1 and 2 days.
No data available for "calving".
Lameness lasts between 1 and 3 days.
Mastitis lasts 1 day.
No data available for "lps".
No data available for "acidosis".
No data available for "other_disease".
No data available for "accidents".
No data available for "disturbance".
No data available for "mixing".
No data available for "management_changes".


In [3]:
if __name__ == '__main__':
    # Dataset 3: raw observations in, K-NN-relabelled table out.
    # Both are absolute local paths; point them at your own files before running.
    input_data_path = r"C:\Users\lamia\Desktop\datasets\dataset3_more_than_18_obs.csv"
    output_data_path = r"C:\Users\lamia\Desktop\datasets\dataset3_knn.csv"

    # Run all three stages: duration analysis, labeling, K-NN reassignment.
    processed_df = main_processing_pipeline(
        input_file_path=input_data_path,
        output_file_path=output_data_path,
    )

    # Measure the episode durations again on the file that was just written.
    print("Step 1: Analyzing health state durations after knn...")
    durations = analyze_health_state_durations(file_path=output_data_path)

Step 1: Analyzing health state durations...
Health State Durations:
Oestrus lasts 1 day.
No data available for "calving".
No data available for "lameness".
No data available for "mastitis".
No data available for "lps".
No data available for "acidosis".
No data available for "other_disease".
No data available for "accidents".
No data available for "disturbance".
No data available for "mixing".
No data available for "management_changes".


Step 2: Labeling and aligning data...
Intermediate labeled dataset saved to C:\Users\lamia\Desktop\datasets\dataset3_knn_labeled.csv


Step 3: Reassigning labels with K-NN...

No samples need replacement.

Final cleaned dataset saved to C:\Users\lamia\Desktop\datasets\dataset3_knn.csv

Sample of final dataset:


Unnamed: 0,cow,date,hour,physio_label
22244,9502,2013-10-03,20,ok
20444,8595,2013-10-12,20,ok
21854,9481,2013-10-29,14,ok
602,1797,2013-10-05,3,ok
21366,9481,2013-10-09,6,ok
22345,9502,2013-10-08,1,ok
18163,5541,2013-10-13,19,ok
21219,9481,2013-10-03,3,ok
11570,4372,2013-10-22,15,ok
4090,2395,2013-10-15,14,ok


Step 1: Analyzing health state durations after knn...
Health State Durations:
Oestrus lasts 3 days.
No data available for "calving".
No data available for "lameness".
No data available for "mastitis".
No data available for "lps".
No data available for "acidosis".
No data available for "other_disease".
No data available for "accidents".
No data available for "disturbance".
No data available for "mixing".
No data available for "management_changes".


In [5]:
if __name__ == '__main__':
    # Dataset 4 (truncated): raw observations in, K-NN-relabelled table out.
    # Both are absolute local paths; point them at your own files before running.
    input_data_path = r"C:\Users\lamia\Desktop\datasets\dataset4_truncated_more_than_18_obs.csv"
    output_data_path = r"C:\Users\lamia\Desktop\datasets\dataset4_knn.csv"

    # Run all three stages: duration analysis, labeling, K-NN reassignment.
    processed_df = main_processing_pipeline(
        input_file_path=input_data_path,
        output_file_path=output_data_path,
    )

    # Measure the episode durations again on the file that was just written.
    print("Step 1: Analyzing health state durations after knn...")
    durations = analyze_health_state_durations(file_path=output_data_path)

Step 1: Analyzing health state durations...
Health State Durations:
Oestrus lasts 1 day.
Calving lasts 1 day.
Lameness lasts 1 day.
Mastitis lasts 1 day.
No data available for "lps".
No data available for "acidosis".
Other_disease lasts 1 day.
Accidents lasts 1 day.
Disturbance lasts between 1 and 3 days.
No data available for "mixing".
Management_changes lasts 1 day.


Step 2: Labeling and aligning data...
Intermediate labeled dataset saved to C:\Users\lamia\Desktop\datasets\dataset4_knn_labeled.csv


Step 3: Reassigning labels with K-NN...

Reassignment Summary:
  Physiological_Class  Number_of_Reassigned_Samples
0                  ok                         28122
1             calving                             6

Final cleaned dataset saved to C:\Users\lamia\Desktop\datasets\dataset4_knn.csv

Sample of final dataset:


Unnamed: 0,cow,date,hour,physio_label
124168,49097,2014-12-26,17,ok
171403,49428,2015-05-04,21,ok
46657,47899,2015-03-30,2,ok
156104,49382,2015-10-01,10,ok
20144,44432,2015-12-12,9,ok
49406,47899,2015-07-25,15,ok
106250,48589,2015-03-18,3,ok
32166,46649,2015-09-27,7,ok
146453,49378,2015-09-18,7,ok
174509,49428,2015-09-13,7,ok


Step 1: Analyzing health state durations after knn...
Health State Durations:
Oestrus lasts between 2 and 3 days.
Calving lasts between 1 and 4 days.
Lameness lasts between 1 and 4 days.
Mastitis lasts between 1 and 4 days.
No data available for "lps".
No data available for "acidosis".
Other_disease lasts between 1 and 6 days.
No data available for "accidents".
No data available for "disturbance".
No data available for "mixing".
No data available for "management_changes".
