In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import missingno as msno
import matplotlib.pyplot as plt

# Function to dynamically identify health-related columns
def identify_health_labels(df, exclude_cols=['cow', 'date', 'hour', 'IN_ALLEYS', 'REST', 'EAT', 'ACTIVITY_LEVEL']):
    """Return the names of columns that look like binary (0/1) health indicators.

    A column qualifies when it is not listed in ``exclude_cols``, contains at
    least one non-missing value, and every non-missing value is 0 or 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose columns are inspected.
    exclude_cols : list of str, optional
        Column names that are never reported. Names are compared
        case-insensitively. The default list is only read, never modified.

    Returns
    -------
    list
        Qualifying column names, in the order they appear in ``df``.
    """
    # Compare names case-insensitively so the exclusion list keeps working when
    # a caller has normalised the headers (e.g. lower-cased them).
    excluded = {str(name).lower() for name in exclude_cols}

    health_labels = []
    for col in df.columns:
        if str(col).lower() in excluded:
            continue
        observed = df[col].dropna()
        # `.all()` on an empty Series is True, so an all-missing column would
        # otherwise be reported as a health label; require actual data.
        if not observed.empty and observed.isin([0, 1]).all():
            health_labels.append(col)
    return health_labels

In [3]:
def load_dataset(file_path):
    """Read the CSV at ``file_path`` and return it with 'date' parsed as datetimes."""
    raw = pd.read_csv(file_path)
    # Replace the textual dates with parsed timestamps; column order is kept.
    return raw.assign(date=pd.to_datetime(raw['date']))

### Check Completeness per Cow

In [4]:
def check_completeness_per_cow(df):
    """Print and return, per cow, how many calendar days have no records.

    The expected number of days is the length of the daily range between the
    earliest and latest 'date' in the whole table; each cow's count of distinct
    dates is subtracted from it. Only cows with at least one missing day are
    printed and returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'cow' and 'date' columns. 'date' may hold strings or
        datetimes; it is parsed on a copy, so ``df`` itself is not modified.

    Returns
    -------
    pandas.Series
        Missing-day counts indexed by cow, restricted to cows with a count
        above zero. Empty when ``df`` has no usable dates.
    """
    # Parse into a separate Series so the caller's DataFrame is left untouched.
    dates = pd.to_datetime(df['date'])

    if dates.notna().sum() == 0:
        # No dates means no range to build (date_range rejects NaT bounds).
        missing = pd.Series(dtype='int64', name='date')
        print("\nMissing days per cow:")
        print(missing)
        return missing

    # Calculate expected date range over the whole dataset
    all_dates = pd.date_range(start=dates.min(), end=dates.max(), freq='D')
    expected_days = len(all_dates)

    # Group by cow and count unique dates
    cow_date_counts = df[['cow']].assign(date=dates).groupby('cow')['date'].nunique()

    # Compare against expected number of days
    missing_days_per_cow = expected_days - cow_date_counts
    missing = missing_days_per_cow[missing_days_per_cow > 0]

    print("\nMissing days per cow:")
    print(missing)
    return missing

### The number of hourly entries per cow per day

In [5]:
def count_hourly_entries(df):
    """Count the rows recorded for every (cow, date) pair.

    Prints the resulting table and returns it as a DataFrame with columns
    'cow', 'date' and 'hourly_records'.
    """
    rows_per_cow_day = df.groupby(['cow', 'date']).size()
    hourly_counts = rows_per_cow_day.rename('hourly_records').reset_index()
    print(hourly_counts)
    return hourly_counts

### Filter for days with exactly 24 hourly records

In [6]:
def filter_full_days(df):
    """Return the (cow, date) pairs that have exactly 24 rows.

    Prints how many such pairs exist. The returned DataFrame has columns
    'cow', 'date' and 'hourly_records' and keeps the row labels of the
    per-day count table.
    """
    records_per_day = df.groupby(['cow', 'date']).size().reset_index(name='hourly_records')
    is_complete = records_per_day['hourly_records'].eq(24)
    complete_days = records_per_day.loc[is_complete]
    print(f"\nDays with full days of records: {len(complete_days)}")
    return complete_days

### The number of full 24-hour days per cow

In [7]:
def count_full_days_per_cow(df):
    """Count, for each cow, the dates on which it has exactly 24 rows.

    Prints the table and returns it as a DataFrame with columns 'cow' and
    'full_24h_days'. Cows without any 24-row date do not appear.
    """
    per_day = df.groupby(['cow', 'date']).size().reset_index(name='hourly_records')
    complete = per_day[per_day['hourly_records'].eq(24)]
    per_cow_sizes = complete.groupby('cow').size()
    full_days_per_cow = per_cow_sizes.reset_index(name='full_24h_days')
    print("\nFull 24h days per cow:", full_days_per_cow)
    return full_days_per_cow

In [8]:
def calculate_percentage_complete_days(df):
    """Print and return the share of (cow, date) pairs that have exactly 24 rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'cow' and 'date' columns; each row is one record.

    Returns
    -------
    float
        Percentage (0-100) of cow-day pairs with 24 rows, or NaN when ``df``
        contains no cow-day pairs at all.
    """
    hourly_counts = df.groupby(['cow', 'date']).size()
    total_days = len(hourly_counts)

    if total_days == 0:
        # Nothing to divide by: report that instead of raising ZeroDivisionError.
        print("Percentage of complete days: N/A (no cow-day records)")
        return float('nan')

    full_days = int((hourly_counts == 24).sum())
    percentage = full_days / total_days * 100
    print(f"Percentage of complete days: {percentage:.2f}%")
    return percentage

### Count how many 24h samples have less than 12 observations

In [9]:
def count_less_than_12_obs(df):
    """Find the (cow, date) pairs that have fewer than 12 rows.

    Prints how many such pairs exist (and the pairs themselves when there are
    any), then returns them as a DataFrame with columns 'cow', 'date' and
    'hourly_records'.
    """
    records_per_day = df.groupby(['cow', 'date']).size().reset_index(name='hourly_records')
    sparse_days = records_per_day[records_per_day['hourly_records'].lt(12)]
    n_sparse = len(sparse_days)
    print(f"Number of cow-day combinations with less than 12 hourly records: {n_sparse}")
    if n_sparse:
        print(sparse_days)
    return sparse_days

# We will run this on any dataset that contains 24-hour periods (cow-days) with fewer than 12 observations

In [10]:
def filter_less_than_12_obs(df):
    """Drop every row belonging to a (cow, date) pair with fewer than 12 rows.

    When no pair falls below 12 rows, a plain copy of ``df`` is returned.
    Otherwise the surviving rows are obtained by an inner merge against the
    pairs with at least 12 rows; that result is printed and returned.
    """
    records_per_day = df.groupby(['cow', 'date']).size().reset_index(name='hourly_records')
    has_enough = records_per_day['hourly_records'] >= 12

    # Guard clause: nothing is below the threshold, so there is nothing to remove.
    if has_enough.all():
        return df.copy()

    # Keep only the keys of valid cow-date combinations (at least 12 rows) ...
    valid_days = records_per_day.loc[has_enough, ['cow', 'date']]
    # ... and join the original rows back onto them.
    df_filtered = df.merge(valid_days, on=['cow', 'date'], how='inner')
    print(df_filtered)
    return df_filtered

**Perform cleaning by keeping only records with more than 18 samples**

In [11]:
def clean_by_observation_count(file_path, min_obs=18, output_path='filtered_dataset_more_than_18_obs.csv'):
    """Split a dataset by how many rows each (cow, date) pair has and save the densest part.

    The CSV at ``file_path`` is read, 'date' is parsed, and rows are grouped
    into three buckets by the row count of their (cow, date) pair:
    more than ``min_obs``, between 12 and ``min_obs`` inclusive, and fewer
    than 12. Dataset shapes are printed and the "more than ``min_obs``" bucket
    is written to ``output_path`` without the index.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(rows with > min_obs, original rows minus the < 12 bucket,
        rows with 12..min_obs, rows with < 12)``.
    """
    key_cols = ['cow', 'date']

    df_cleaned = pd.read_csv(file_path)
    df_cleaned['date'] = pd.to_datetime(df_cleaned['date'])

    hourly_counts = df_cleaned.groupby(key_cols).size().reset_index(name='hour_count')
    n_hours = hourly_counts['hour_count']

    def rows_for(day_mask):
        # Inner-join the original rows onto the cow-day keys selected by the mask.
        return df_cleaned.merge(hourly_counts.loc[day_mask, key_cols], on=key_cols)

    df_more_than_18_obs = rows_for(n_hours > min_obs)
    df_12_to_18_obs = rows_for((n_hours >= 12) & (n_hours <= min_obs))
    df_less_than_12_obs = rows_for(n_hours < 12)

    # Remove the sparse (<12 rows) cow-days from the main dataset by key lookup.
    observed_keys = df_cleaned.set_index(key_cols).index
    sparse_keys = df_less_than_12_obs.set_index(key_cols).index
    df_filtered = df_cleaned[~observed_keys.isin(sparse_keys)]

    # Report the size of every subset
    print("Original dataset shape:", df_cleaned.shape)
    print("After filtering (<12 obs removed):", df_filtered.shape)
    print("Deleted rows (<12 obs):", df_less_than_12_obs.shape)
    print(f"Filtered dataset (>{min_obs} obs):", df_more_than_18_obs.shape)
    print(f"Filtered dataset (12–{min_obs} obs):", df_12_to_18_obs.shape)

    # Persist only the densest bucket
    df_more_than_18_obs.to_csv(output_path, index=False)
    return df_more_than_18_obs, df_filtered, df_12_to_18_obs, df_less_than_12_obs

**Perform another cleaning by keeping only physiological classes**

Retain: mastitis, lameness, oestrus, calving, other_disease, OK.

Remove: management_changes, mixing, disturbance, accidents, LPS, acidosis.

In [12]:
def clean_by_physiological_classes(file_path, output_path='dataset3_aligned_cleaned_keep_physiological.csv', unwanted_classes=['management_changes', 'mixing', 'disturbance', 'accidents', 'lps', 'acidosis']):
    """Drop unwanted health-label columns from a dataset and save the result.

    The CSV at ``file_path`` is read and its headers are stripped and
    lower-cased. Binary label columns are detected with
    ``identify_health_labels``; those named in ``unwanted_classes`` are
    removed, while the identifier/sensor columns and the remaining label
    columns are kept. The result is written to ``output_path`` (no index),
    its first rows are printed, and it is returned.

    Parameters
    ----------
    file_path : str
        CSV file to read.
    output_path : str, optional
        Destination CSV file.
    unwanted_classes : list of str, optional
        Label columns to remove; matched case-insensitively. The default list
        is only read, never modified.

    Returns
    -------
    pandas.DataFrame
        The cleaned table. All column names are lower-case because the headers
        are normalised on load.
    """
    # Step 0: Load the dataset and normalise the headers
    df = pd.read_csv(file_path)
    df.columns = df.columns.str.strip().str.lower()

    # The headers are lower-case from here on, so the base columns must be
    # looked up in lower case too. Using 'IN_ALLEYS', 'REST', ... verbatim
    # would never match and those columns would be silently dropped.
    base_cols = [name.lower() for name in
                 ['cow', 'date', 'hour', 'IN_ALLEYS', 'REST', 'EAT', 'ACTIVITY_LEVEL']]
    unwanted = {str(name).strip().lower() for name in unwanted_classes}

    # Step 1: Separate useful label columns from unwanted ones
    health_labels = identify_health_labels(df, exclude_cols=base_cols)
    useful_classes = [col for col in health_labels if col not in unwanted]

    # Step 2: Keep base columns + useful label columns (only those present)
    keep_cols = [col for col in base_cols + useful_classes if col in df.columns]
    df_cleaned = df[keep_cols]

    # Step 3: Save the cleaned dataset
    df_cleaned.to_csv(output_path, index=False)

    # Optional: Show a sample
    print(df_cleaned.head())
    return df_cleaned

In [14]:
# Example usage: run every completeness diagnostic on each dataset in turn.
datasets = [
    r"C:/Users/lamia/Desktop/datasets/dataset1.csv",
    "C:/Users/lamia/Desktop/datasets/dataset2.csv",
    "C:/Users/lamia/Desktop/datasets/dataset3.csv",
    "C:/Users/lamia/Desktop/datasets/truncated_dataset4.csv",
]

# (heading printed before the step, function applied to the loaded frame)
diagnostics = (
    ("\nCheck Completeness per Cow:", check_completeness_per_cow),
    ("\nHourly Entries per Cow per Day:", count_hourly_entries),
    ("\nDays with Exactly 24 Hours Records:", filter_full_days),
    ("\nFull 24-Hour Days per Cow:", count_full_days_per_cow),
    ("\nPercentage of Complete Days:", calculate_percentage_complete_days),
    ("\nCow-Day Combinations with Less than 12 Hourly Records:", count_less_than_12_obs),
)

for file_path in datasets:
    print(f"\nAnalyzing {file_path}")
    df = load_dataset(file_path)
    for heading, run_check in diagnostics:
        print(heading)
        run_check(df)
    # Optional cleaning steps, currently disabled:
    # print("\nFiltered Dataset (Excluding <12 obs):")
    # df_filtered = filter_less_than_12_obs(df)
    # print("\nCleaning by Observation Count (>18 obs):")
    # output_path_obs = f"filtered_{file_path.split('.')[0]}_more_than_18_obs.csv"
    # df_more_than_18, df_filtered_obs, df_12_to_18, df_less_than_12 = clean_by_observation_count(file_path, output_path=output_path_obs)
    # print("\nCleaning by Physiological Classes:")
    # output_path_phys = f"cleaned_{file_path.split('.')[0]}_physiological.csv"
    # df_phys_cleaned = clean_by_physiological_classes(file_path, output_path=output_path_phys)


Analyzing C:/Users/lamia/Desktop/datasets/dataset1.csv

Check Completeness per Cow:

Missing days per cow:
cow
6601     3
6610     3
6612     3
6613    30
6621     3
6629     3
6633     3
6634     3
6637     3
6638    18
6643    18
6646     3
6656     3
6664    19
6674    19
6675     3
6683    19
6686    18
6689     3
6690     3
6693    19
6695    38
6699    29
6701    18
6714    18
6721    30
6750    18
7600    19
Name: date, dtype: int64

Hourly Entries per Cow per Day:
       cow       date  hourly_records
0     6601 2018-10-25              13
1     6601 2018-10-26              24
2     6601 2018-10-27              24
3     6601 2018-10-28              24
4     6601 2018-10-29              24
...    ...        ...             ...
4526  7600 2019-04-13              24
4527  7600 2019-04-14              24
4528  7600 2019-04-15              24
4529  7600 2019-04-16              24
4530  7600 2019-04-17              24

[4531 rows x 3 columns]

Days with Exactly 24 Hours Records:

Day