**Analyzing health state durations**


In [None]:
import pandas as pd

# Read the observations, then in one pass:
#   - trim stray whitespace from the header names,
#   - parse the 'date' column into datetimes,
#   - order the rows by cow and, within each cow, by date.
df = (
    pd.read_csv("filtered_dataset_3_more_than_18_obs.csv")
    .rename(columns=str.strip)
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .sort_values(by=['cow', 'date'])
)

# Function to get durations for a specific health state
def get_health_state_durations(df, health_state_column):
    """Collect the periods during which a health-state flag equals 1, per cow.

    Rows are processed cow by cow in date order. A period opens on the first
    row whose ``health_state_column`` value equals 1 and is closed when either
    a row with a different value is met, or two successive flagged rows are
    more than one day apart, or the cow's rows run out.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'cow', 'date' and ``health_state_column``.
        'date' values must support subtraction yielding an object with a
        ``.days`` attribute (e.g. pandas Timestamps).
    health_state_column : str
        Name of the flag column to scan.

    Returns
    -------
    pandas.DataFrame
        One row per period with the columns 'cow', 'health_state',
        'start_date', 'end_date' and 'duration_days' (inclusive day count).
        When no period is found the frame is empty and has no columns.
    """
    periods = []

    def _record(cow, first_day, last_day):
        # Inclusive duration: a period starting and ending the same day lasts 1 day.
        periods.append({
            'cow': cow,
            'health_state': health_state_column,
            'start_date': first_day,
            'end_date': last_day,
            'duration_days': (last_day - first_day).days + 1
        })

    for cow_id, cow_rows in df.groupby('cow'):
        cow_rows = cow_rows.sort_values('date').reset_index(drop=True)

        in_period = False
        period_start = None
        last_seen = None  # date of the previously visited row

        for _, obs in cow_rows.iterrows():
            today = obs['date']

            if obs[health_state_column] != 1:
                # Flag is off: close whatever period was running.
                if in_period:
                    _record(cow_id, period_start, last_seen if last_seen is not None else today)
                    in_period = False
                    period_start = None
            elif not in_period:
                # Flag just turned on: open a period.
                in_period = True
                period_start = today
            elif last_seen is not None and (today - last_seen).days > 1:
                # Flag still on but days are missing in between:
                # end the running period at the last seen day and start over.
                _record(cow_id, period_start, last_seen)
                period_start = today

            last_seen = today

        # The cow's last rows were flagged: close the period still open.
        if in_period:
            _record(cow_id, period_start, last_seen)

    return pd.DataFrame(periods)

# Health-state flag columns to analyse
health_states = [
    'oestrus',
    'calving',
    'lameness',
    'mastitis',
    'other_disease',
    'accidents',
    'disturbance',
    'mixing',
    'management_changes',
]

# state -> DataFrame of periods, and state -> shortest/longest period length
state_durations = {}
durations_min_max = {}

# Extract the periods of every health state and keep their extreme lengths
for state in health_states:
    durations_df = get_health_state_durations(df, state)
    state_durations[state] = durations_df

    if durations_df.empty:
        # No period at all for this state
        durations_min_max[state] = {'min_days': None, 'max_days': None}
        continue

    day_counts = durations_df['duration_days']
    durations_min_max[state] = {'min_days': day_counts.min(), 'max_days': day_counts.max()}

# Report one sentence per health state
for state, durations in durations_min_max.items():
    min_days, max_days = durations['min_days'], durations['max_days']

    if min_days is None or max_days is None:
        message = f"No data available for \"{state}\"."
    elif min_days == max_days:
        message = f"{state.capitalize()} lasts {min_days} day{'s' if min_days > 1 else ''}."
    else:
        message = f"{state.capitalize()} lasts between {min_days} and {max_days} days."

    print(message)

Oestrus lasts 1 day.
No data available for "calving".
No data available for "lameness".
No data available for "mastitis".
No data available for "other_disease".
No data available for "accidents".
No data available for "disturbance".
No data available for "mixing".
No data available for "management_changes".


**Spreading event labels across adjacent days**

In [None]:
# import pandas as pd
# from datetime import timedelta

# # Reload the dataset
# file_path = 'dataset3-1 (1).csv'
# df = pd.read_csv(file_path)

# # Step 1: Aggregate per cow-date (no hour) to detect events
# agg_cols = ['oestrus', 'calving', 'lameness', 'mastitis', 'other_disease', 'accidents', 'disturbance', 'mixing', 'management_changes']
# daily = df.groupby(['cow', 'date'])[agg_cols].max().reset_index()

# # Convert date to datetime
# daily['date'] = pd.to_datetime(daily['date'])

# # Step 2: Build a dataframe with all possible cow x day
# all_dates = pd.date_range(daily['date'].min() - timedelta(days=7), daily['date'].max() + timedelta(days=7))
# cows = daily['cow'].unique()

# # Create full cow-date combination
# full_daily = pd.MultiIndex.from_product([cows, all_dates], names=['cow', 'date']).to_frame(index=False)

# # Merge recorded events
# full_daily = full_daily.merge(daily, on=['cow', 'date'], how='left')

# # Fill missing values for event columns with 0
# for col in agg_cols:
#     if col not in full_daily:
#         full_daily[col] = 0
#     else:
#         full_daily[col] = full_daily[col].fillna(0)

# # Add default label
# full_daily['LABEL'] = 'control'


# # Step 3: Detect episodes and spread labels
# conditions = ['oestrus', 'calving', 'lameness', 'mastitis', 'other_disease', 'accidents', 'disturbance', 'mixing', 'management_changes']

# # Prepare a new column to receive the aligned label
# full_daily['LABEL'] = 'control'

# # Define how many days before and after depending on the condition
# spread_rules = {
#     'oestrus': {'before': 1, 'after': 1},
#     'calving': {'before': 2, 'after': 1},
#     'lameness': {'before': 2, 'after': 1},
#     'mastitis': {'before': 2, 'after': 1},
#     'other_disease': {'before': 2, 'after': 1},
#     'accidents': {'before': 2, 'after': 1},
#     'disturbance': {'before': 0, 'after': 0},
#     'mixing': {'before': 0, 'after': 0},
#     'management_changes': {'before': 0, 'after': 0},
# }

# # for cond in conditions:
# #     sub = full_daily[full_daily[cond] == 1][['cow', 'date']].sort_values(['cow', 'date'])
# #     for cow_id in sub['cow'].unique():
# #         cow_days = sub[sub['cow'] == cow_id]['date'].sort_values()
# #         # Detect episodes
# #         episode = []
# #         prev_day = None
# #         for day in cow_days:
# #             if prev_day is None or (day - prev_day).days > 1:
# #                 # New episode starts
# #                 if episode:
# #                     # Process previous episode
# #                     min_day = min(episode)
# #                     max_day = max(episode)
# #                     spread = spread_rules[cond]
# #                     spread_days = pd.date_range(min_day - timedelta(days=spread['before']), max_day + timedelta(days=spread['after']))
# #                     mask = (full_daily['cow'] == cow_id) & (full_daily['date'].isin(spread_days))
# #                     full_daily.loc[mask & (full_daily['LABEL'] == 'control'), 'LABEL'] = cond
# #                     full_daily.loc[mask, cond] = 1  # Force the corresponding condition column to 1

# #                 episode = [day]
# #             else:
# #                 episode.append(day)
# #             prev_day = day
# #         # Process the last episode
# #         if episode:
# #             min_day = min(episode)
# #             max_day = max(episode)
# #             spread = spread_rules[cond]
# #             spread_days = pd.date_range(min_day - timedelta(days=spread['before']), max_day + timedelta(days=spread['after']))
# #             mask = (full_daily['cow'] == cow_id) & (full_daily['date'].isin(spread_days))
# #             full_daily.loc[mask & (full_daily['LABEL'] == 'control'), 'LABEL'] = cond
# for cond in conditions:
#     sub = full_daily[full_daily[cond] == 1][['cow', 'date']].sort_values(['cow', 'date'])
#     for cow_id in sub['cow'].unique():
#         cow_days = sub[sub['cow'] == cow_id]['date'].sort_values()
#         episode = []
#         prev_day = None
#         for day in cow_days:
#             if prev_day is None or (day - prev_day).days > 1:
#                 if episode:
#                     min_day = min(episode)
#                     max_day = max(episode)
#                     spread = spread_rules[cond]
#                     spread_days = pd.date_range(min_day - timedelta(days=spread['before']), max_day + timedelta(days=spread['after']))
#                     mask = (full_daily['cow'] == cow_id) & (full_daily['date'].isin(spread_days))
#                     full_daily.loc[mask & (full_daily['LABEL'] == 'control'), 'LABEL'] = cond
#                     full_daily.loc[mask, cond] = 1  # <--- THIS IS THE NEW LINE
#                 episode = [day]
#             else:
#                 episode.append(day)
#             prev_day = day
#         if episode:
#             min_day = min(episode)
#             max_day = max(episode)
#             spread = spread_rules[cond]
#             spread_days = pd.date_range(min_day - timedelta(days=spread['before']), max_day + timedelta(days=spread['after']))
#             mask = (full_daily['cow'] == cow_id) & (full_daily['date'].isin(spread_days))
#             full_daily.loc[mask & (full_daily['LABEL'] == 'control'), 'LABEL'] = cond
#             full_daily.loc[mask, cond] = 1  # <--- AND THIS


# # Step 4: Map back to hourly level
# # Merge back to original hourly dataframe
# final = df.copy()
# final['date'] = pd.to_datetime(final['date'])
# final = final.merge(full_daily[['cow', 'date', 'LABEL']], on=['cow', 'date'], how='left')

# # Display a sample
# final[['cow', 'date', 'hour', 'LABEL']].sample(100)
# # Save the final aligned dataset to a CSV
# final.to_csv(r'c:/users/lamia/Downloads/labelled&aligned_dataset.csv', index=False)


In [None]:
import pandas as pd
from datetime import timedelta

# Step 0: read the dataset; header names are trimmed and lower-cased
file_path = "filtered_dataset3_more_than_18_obs.csv"
df = pd.read_csv(file_path).rename(columns=lambda name: name.strip().lower())

# Step 1: any column that is not listed here is treated as an event flag
non_event_cols = ['cow', 'date', 'hour', 'in_alleys', 'rest', 'eat', 'activity_level', 'ok']
event_cols = list(filter(lambda name: name not in non_event_cols, df.columns))

# Step 2: one row per cow and date, keeping the maximum of each event flag
# over the rows of that day
daily = df.groupby(['cow', 'date'])[event_cols].max().reset_index()
daily['date'] = pd.to_datetime(daily['date'])

# Step 3: full cow x day grid, padded by 7 days before the first and after
# the last recorded date
first_day = daily['date'].min() - timedelta(days=7)
last_day = daily['date'].max() + timedelta(days=7)
all_dates = pd.date_range(first_day, last_day)
cows = daily['cow'].unique()
full_daily = pd.MultiIndex.from_product([cows, all_dates], names=['cow', 'date']).to_frame(index=False)

# Attach the recorded events; cow/day pairs without a record get 0 for every event
full_daily = pd.merge(full_daily, daily, how='left', on=['cow', 'date'])
full_daily[event_cols] = full_daily[event_cols].fillna(0)

# Defaults: every day starts labelled 'control' with ok = 1
full_daily = full_daily.assign(LABEL='control', ok=1)

# Step 4: number of days each event is extended before its first day and
# after its last day
spread_rules = {
    'oestrus':            {'before': 1, 'after': 1},
    'calving':            {'before': 2, 'after': 1},
    'lameness':           {'before': 2, 'after': 1},
    'mastitis':           {'before': 2, 'after': 1},
    'lps':                {'before': 2, 'after': 1},
    'acidosis':           {'before': 2, 'after': 1},
    'other_disease':      {'before': 2, 'after': 1},
    'accidents':          {'before': 2, 'after': 1},
    'disturbance':        {'before': 0, 'after': 0},
    'mixing':             {'before': 0, 'after': 0},
    'management_changes': {'before': 0, 'after': 0},
}


def _mark_episode(cond, cow_id, episode):
    """Flag in ``full_daily`` every day of one episode plus its spread window.

    ``episode`` is the non-empty list of days of one episode of ``cond`` for
    ``cow_id``. Within the widened window the event column is set to 1, 'ok'
    is set to 0, and LABEL becomes ``cond`` only where it is still 'control'.
    """
    rule = spread_rules[cond]
    window = pd.date_range(
        min(episode) - timedelta(days=rule['before']),
        max(episode) + timedelta(days=rule['after']),
    )
    in_window = (full_daily['cow'] == cow_id) & (full_daily['date'].isin(window))
    full_daily.loc[in_window, cond] = 1
    full_daily.loc[in_window & (full_daily['LABEL'] == 'control'), 'LABEL'] = cond
    full_daily.loc[in_window, 'ok'] = 0


for cond in event_cols:
    # Events without a spread rule are left as they are
    if cond not in spread_rules:
        continue

    # Days on which the event is recorded, taken before any spreading of this event
    flagged = full_daily.loc[full_daily[cond] == 1, ['cow', 'date']].sort_values(['cow', 'date'])

    for cow_id in flagged['cow'].unique():
        episode = []
        for day in flagged.loc[flagged['cow'] == cow_id, 'date'].sort_values():
            # More than one day since the last day of the running episode:
            # that episode is complete, process it and start a new one.
            if episode and (day - episode[-1]).days > 1:
                _mark_episode(cond, cow_id, episode)
                episode = []
            episode.append(day)

        # Process the episode still open when the cow's days run out
        if episode:
            _mark_episode(cond, cow_id, episode)

# Step 5: bring the daily labels back to the hourly rows.
# The hourly event columns and 'ok' are removed first so that the merge does
# not produce clashing duplicate columns.
df['date'] = pd.to_datetime(df['date'])
df = df.drop(columns=[*event_cols, 'ok'], errors='ignore')

daily_labels = full_daily[['cow', 'date', 'LABEL', *event_cols, 'ok']]
final = pd.merge(df, daily_labels, how='left', on=['cow', 'date'])

# Step 6: wherever the label names an event whose flag is still 0, set the flag to 1
for cond in event_cols:
    needs_flag = final['LABEL'].eq(cond) & final[cond].eq(0)
    final.loc[needs_flag, cond] = 1

# Write the labelled and aligned dataset to disk
final.to_csv('c:/users/lamia/Downloads/labelled&aligned_dataset3.csv', index=False)

# Optional sample display (disabled)
#final[['cow', 'date', 'hour', 'LABEL', 'oestrus']].sample(10)


In [None]:
# Visual check of the labels on consecutive days, for one fixed cow

# Cow to inspect; a random one can be drawn with final['cow'].sample(1).iloc[0]
cow_to_check = 10127

# Rows of that cow, limited to the columns needed for the check
cow_data = final.loc[final['cow'] == cow_to_check, ['cow', 'date', 'hour', 'LABEL']]

# Daily view: for each date, the first mode of LABEL among that day's rows
daily_view = (
    cow_data
    .groupby(['cow', 'date'])['LABEL']
    .agg(lambda labels: labels.mode()[0])
    .reset_index()
)

# Show the days in chronological order
print(f"Consecutive days labeling for Cow {cow_to_check}:")
display(daily_view.sort_values('date'))


Consecutive days labeling for Cow 10127:


Unnamed: 0,cow,date,LABEL
0,10127,2013-10-01,control
1,10127,2013-10-02,control
2,10127,2013-10-03,control
3,10127,2013-10-04,control
4,10127,2013-10-05,control
5,10127,2013-10-06,control
6,10127,2013-10-07,control
7,10127,2013-10-08,control
8,10127,2013-10-09,control
9,10127,2013-10-12,control


In [None]:
# Visual check of the labels on consecutive days, for one randomly drawn cow

# Draw the cow to inspect (no seed is set, so the pick can differ between runs)
cow_to_check = final['cow'].sample(1).iloc[0]

# Rows of that cow, limited to the columns needed for the check
cow_data = final.loc[final['cow'] == cow_to_check, ['cow', 'date', 'hour', 'LABEL']]

# Daily view: for each date, the first mode of LABEL among that day's rows
daily_view = (
    cow_data
    .groupby(['cow', 'date'])['LABEL']
    .agg(lambda labels: labels.mode()[0])
    .reset_index()
)

# Show the days in chronological order
print(f"Consecutive days labeling for Cow {cow_to_check}:")
display(daily_view.sort_values('date'))


Consecutive days labeling for Cow 4279:


Unnamed: 0,cow,date,LABEL
0,4279,2013-10-01,control
1,4279,2013-10-02,control
2,4279,2013-10-03,control
3,4279,2013-10-04,control
4,4279,2013-10-05,control
5,4279,2013-10-06,control
6,4279,2013-10-07,control
7,4279,2013-10-08,control
8,4279,2013-10-09,control
9,4279,2013-10-10,control


**Removing unwanted class columns**

In [None]:
import pandas as pd

# Step 0: read the labelled dataset; header names are trimmed and lower-cased
# (so the 'LABEL' column becomes 'label')
file_path = r"C:\Users\lamia\Downloads\labelled&aligned_dataset3.csv"
df = pd.read_csv(file_path)
df.columns = [name.strip().lower() for name in df.columns]

# Step 1: classes to keep and classes to discard
# (useful_classes is declared for reference; it is not used in this cell)
useful_classes = ['mastitis', 'lameness', 'oestrus', 'calving', 'other_disease', 'ok']
unwanted_classes = ['management_changes', 'mixing', 'disturbance', 'accidents', 'lps', 'acidosis']

# Step 2: remove the columns of the discarded classes; absent columns are ignored.
# NOTE(review): only columns are removed here. Rows whose 'label' value names
# one of these classes keep that value. Confirm this is intended.
df_cleaned = df.drop(columns=unwanted_classes, errors='ignore')

# Step 3: write the cleaned dataset to disk
df_cleaned.to_csv('c:/users/lamia/Downloads/dataset3_aligned_cleaned_keep_physiological.csv', index=False)

# Optional: print the first rows
print(df_cleaned.head())


     cow        date  hour  in_alleys      rest      eat  activity_level  \
0  10127  2013-10-01     1      0.000  3600.000    0.000      -828.00000   
1  10127  2013-10-01     2   2931.783   444.528  223.689       460.79322   
2  10127  2013-10-01     3    257.740  2930.966  411.294      -460.14030   
3  10127  2013-10-01     4     63.220  3536.780    0.000      -803.34420   
4  10127  2013-10-01     5      0.000  3600.000    0.000      -828.00000   

     label  mastitis  lameness  oestrus  calving  other_disease  ok  
0  control       0.0       0.0      0.0      0.0            0.0   1  
1  control       0.0       0.0      0.0      0.0            0.0   1  
2  control       0.0       0.0      0.0      0.0            0.0   1  
3  control       0.0       0.0      0.0      0.0            0.0   1  
4  control       0.0       0.0      0.0      0.0            0.0   1  
