**Count the number of cows and the number of complete 24-hour days recorded per cow**

In [None]:
import pandas as pd


df = pd.read_csv("dataset2-1.csv")

# Number of distinct animals in the dataset.
num_cows = df['cow'].nunique()
print(f"Total number of unique cows: {num_cows}")

# A cow-day is treated as complete when it has this many rows.
# NOTE(review): this counts rows, not distinct hours — assumes one row per hour; confirm.
HOURS_PER_DAY = 24

# Step 1: Count the number of hourly entries per cow per day
hourly_counts = df.groupby(['cow', 'date']).size().reset_index(name='hourly_records')

# Step 2: Keep the cow-days that have exactly HOURS_PER_DAY hourly records
is_full_day = hourly_counts['hourly_records'].eq(HOURS_PER_DAY)
full_days = hourly_counts[is_full_day]

# Step 3: Count the complete days per cow.
# The 0/1 flag is summed over *all* cow-days (not only the complete ones) so that a
# cow without a single complete day is reported with 0 instead of being dropped.
full_days_per_cow = (
    is_full_day.astype(int)
    .groupby(hourly_counts['cow'])
    .sum()
    .reset_index(name='full_24h_days')
)
print(full_days_per_cow)


Total number of unique cows: 28
     cow  full_24h_days
0    151             60
1    153             60
2    156             60
3    162             60
4    173             60
5    189             60
6   1177             56
7   1624             60
8   2152             60
9   2155             60
10  2158             60
11  2162             60
12  2164             60
13  2165             60
14  2170             60
15  2175             60
16  2179             60
17  2182             60
18  2183             60
19  2185             60
20  2187             60
21  2603             60
22  2622             60
23  7163             60
24  8200             60
25  8605             60
26  8677             60
27  9195             60


**Count how many cow-day (24h) samples have fewer than 12 hourly observations**

In [None]:

# Select the cow-day combinations that have fewer than 12 hourly rows.
sparse_mask = hourly_counts['hourly_records'].lt(12)
less_than_12_obs = hourly_counts.loc[sparse_mask]

# Report how many such combinations exist, then list them.
num_less_than_12_obs = less_than_12_obs.shape[0]
print(f"Number of cow-day combinations with less than 12 hourly records: {num_less_than_12_obs}")
print(less_than_12_obs)






Number of cow-day combinations with less than 12 hourly records: 0
Empty DataFrame
Columns: [cow, date, hourly_records]
Index: []


**Since we only have 2 observations for that date, we will delete the record**

In [None]:
# Cow-date pairs backed by at least 12 hourly rows are considered valid.
enough_hours = hourly_counts['hourly_records'] >= 12
valid_days = hourly_counts[enough_hours]

# Inner-join the hourly data against the valid (cow, date) keys so that only
# rows belonging to a valid cow-day remain.
valid_keys = valid_days[['cow', 'date']]
df_filtered = df.merge(valid_keys, how='inner', on=['cow', 'date'])

# df_filtered holds no cow-date combination with fewer than 12 hourly rows.
print(df_filtered)

        cow        date  hour  IN_ALLEYS      REST     EAT  ACTIVITY_LEVEL  \
0      7163  2015-03-02     1    185.517  3414.482   0.000      -755.64814   
1      7163  2015-03-02     2      0.000  3599.999   0.000      -827.99977   
2      7163  2015-03-02     3     10.661  3589.338   0.000      -823.84198   
3      7163  2015-03-02     4     43.800  3556.199   0.000      -810.91777   
4      7163  2015-03-02     5     17.167  3582.832   0.000      -821.30464   
...     ...         ...   ...        ...       ...     ...             ...   
40242  2622  2015-04-30    20    882.440  2703.864  13.695      -474.94642   
40243  2622  2015-04-30    21    288.649  3306.791   4.559      -712.46331   
40244  2622  2015-04-30    22    217.923  3382.076   0.000      -743.00980   
40245  2622  2015-04-30    23    793.545  2776.794  29.660      -499.23822   
40246  2622  2015-04-30    24    335.972  3251.471  12.556      -688.80929   

       oestrus  calving  lameness  mastitis  LPS  acidosis  oth

**Aggregate the daily records and compute statistical metrics**

In [None]:
import pandas as pd
import numpy as np
# Collapse the hourly rows into one row per (cow, date) holding the daily total
# of each behaviour column.
# NOTE(review): days with fewer than 24 hourly rows are summed as-is, so their
# totals are presumably not directly comparable with complete days — confirm.
behaviour_columns = ['IN_ALLEYS', 'REST', 'EAT', 'ACTIVITY_LEVEL']
daily_df = (
    df_filtered
    .groupby(['cow', 'date'])[behaviour_columns]
    .sum()
    .reset_index()
)


def rmse(x):
    """Return the root mean square of ``x``: sqrt(mean(x ** 2)).

    ``x`` must support ``** 2`` and ``np.mean`` (e.g. a NumPy array, a pandas
    Series or a scalar). No reference value is subtracted, so this is the root
    mean square of the raw values.
    """
    squared = x ** 2
    return np.sqrt(np.mean(squared))

# Per-cow summary statistics of the daily totals.
# Maps each source column of daily_df to the suffix used in the output column names.
feature_labels = {
    'IN_ALLEYS': 'IN_ALLEYS',
    'REST': 'REST',
    'EAT': 'EAT',
    'ACTIVITY_LEVEL': 'ACTIVITY',
}

# Build the named-aggregation spec: for every feature, in order,
# min, std, the 25th/50th/75th percentiles and the root mean square.
metric_spec = {}
for source_col, label in feature_labels.items():
    metric_spec[f'min_{label}'] = (source_col, 'min')
    metric_spec[f'std_{label}'] = (source_col, 'std')
    for pct in (25, 50, 75):
        # q is bound as a default argument so each lambda keeps its own quantile.
        metric_spec[f'quantile{pct}_{label}'] = (
            source_col,
            lambda x, q=pct / 100: x.quantile(q),
        )
    metric_spec[f'rmse_{label}'] = (source_col, rmse)

stats_df = daily_df.groupby('cow').agg(**metric_spec).reset_index()


print(stats_df)

     cow  min_IN_ALLEYS  std_IN_ALLEYS  quantile25_IN_ALLEYS  \
0    151       7574.799    2345.407887           11045.54825   
1    153      16712.898    5191.240394           22288.08300   
2    156       6011.982    3389.196217            9051.34300   
3    162      13606.759    4471.708500           20085.80350   
4    173      11488.044    4311.332041           15915.96050   
5    189      13002.405    6796.124207           16632.85550   
6   1177      10262.755    7318.486998           14547.54000   
7   1624       9714.373    5283.987892           14390.37375   
8   2152       8069.131    2890.308639           10729.03950   
9   2155      12459.108    5270.466088           21218.84225   
10  2158      14376.517    3596.330260           16987.78425   
11  2162      10289.977    4063.107435           13461.27800   
12  2164      11360.127    3687.254742           16456.14000   
13  2165       7934.005    2557.469918           10538.65525   
14  2170      10179.216    4764.390243  

**Applying a 2-hour rolling window with a one-hour overlap**

In [None]:
# Step 1: Create a proper datetime column.
# 1 is subtracted from `hour` so that hour 1 maps to 00:00 of the same date
# (the printed data shows hours running from 1 to 24).
df['datetime'] = pd.to_datetime(df['date']) + pd.to_timedelta(df['hour'] - 1, unit='h')

# Step 2: Sort by cow and datetime; a time-based rolling window requires the
# `on` column to be monotonic within each group.
df = df.sort_values(['cow', 'datetime'])

# Step 3: Apply a rolling window of 2 hours per cow. Consecutive windows share
# one hourly row (1-hour overlap).


def rmse(x):
    """Return the root mean square of ``x``: sqrt(mean(x ** 2)).

    Same computation as the helper used for the daily statistics; kept here so
    this cell does not depend on where that helper was defined.
    """
    return np.sqrt(np.mean(x**2))


# A Timedelta is used instead of the string alias '2H': the upper-case hour
# alias is deprecated in recent pandas releases and triggers a FutureWarning.
ROLLING_WINDOW = pd.Timedelta(hours=2)
ROLLING_FEATURES = ['IN_ALLEYS', 'REST', 'EAT', 'ACTIVITY_LEVEL']

rolling_metrics = (
    df.groupby('cow')
    # min_periods=2: a window with a single row (e.g. a cow's first hour) yields NaN.
    .rolling(window=ROLLING_WINDOW, on='datetime', min_periods=2)[ROLLING_FEATURES]
    .agg(['mean', 'std', 'min', 'max', rmse])
)

# Flatten the (feature, statistic) column MultiIndex into 'FEATURE_statistic'.
# `rmse` is a named function, so its columns are already labelled 'rmse'.
rolling_metrics.columns = ['_'.join(col) for col in rolling_metrics.columns]


rolling_metrics = rolling_metrics.reset_index()

print(rolling_metrics.head())

  .rolling(window='2H', on='datetime', min_periods=2)[['IN_ALLEYS', 'REST', 'EAT', 'ACTIVITY_LEVEL']]
  df.groupby('cow')
  .agg(['mean', 'std', 'min', 'max', rmse])
  .agg(['mean', 'std', 'min', 'max', rmse])
  .agg(['mean', 'std', 'min', 'max', rmse])
  .agg(['mean', 'std', 'min', 'max', rmse])
  .agg(['mean', 'std', 'min', 'max', rmse])
  .agg(['mean', 'std', 'min', 'max', rmse])
  .agg(['mean', 'std', 'min', 'max', rmse])
  .agg(['mean', 'std', 'min', 'max', rmse])
  .agg(['mean', 'std', 'min', 'max', rmse])
  .agg(['mean', 'std', 'min', 'max', rmse])
  .agg(['mean', 'std', 'min', 'max', rmse])
  .agg(['mean', 'std', 'min', 'max', rmse])
  .agg(['mean', 'std', 'min', 'max', rmse])
  .agg(['mean', 'std', 'min', 'max', rmse])
  .agg(['mean', 'std', 'min', 'max', rmse])
  .agg(['mean', 'std', 'min', 'max', rmse])
  .agg(['mean', 'std', 'min', 'max', rmse])
  .agg(['mean', 'std', 'min', 'max', rmse])
  .agg(['mean', 'std', 'min', 'max', rmse])
  .agg(['mean', 'std', 'min', 'max', rmse]

   cow            datetime  IN_ALLEYS_mean  IN_ALLEYS_std  IN_ALLEYS_min  \
0  151 2015-03-02 00:00:00             NaN            NaN            NaN   
1  151 2015-03-02 01:00:00        244.0040     306.888586         27.001   
2  151 2015-03-02 02:00:00         19.5575      10.526699         12.114   
3  151 2015-03-02 03:00:00        403.5930     553.634911         12.114   
4  151 2015-03-02 04:00:00        635.9330     225.056532        476.794   

   IN_ALLEYS_max  IN_ALLEYS_rmse  REST_mean    REST_std  REST_min  ...  \
0            NaN             NaN        NaN         NaN       NaN  ...   
1        461.007      326.539820  3349.6580  315.850457  3126.318  ...   
2         27.001       20.926096  3576.8620    5.464521  3572.998  ...   
3        795.072      562.266055  3052.9575  746.377370  2525.189  ...   
4        795.072      655.542525  2746.0120  312.290881  2525.189  ...   

   EAT_mean     EAT_std  EAT_min  EAT_max    EAT_rmse  ACTIVITY_LEVEL_mean  \
0       NaN         