In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from scipy.optimize import curve_fit, OptimizeWarning
from tqdm import tqdm
import warnings
from scipy.stats import zscore
from statsmodels.tsa.stattools import acf, pacf

sns.set_theme()
sns.set_context("notebook")
%load_ext autoreload
%autoreload 2

In [2]:
dtype_dict = {
    'FarmName_Pseudo': 'str',
    'SE_Number': 'str',
    'AnimalNumber': 'Int64',          
    'StartDate': 'str',
    'StartTime': 'str',
    'DateTime': 'str',
    'LactationNumber': 'Int64',       
    'DaysInMilk': 'Int64', 
    'YearSeason': 'str',           
    'TotalYield': 'float',
    'DateTime': 'str',
    'BreedName': 'str',
    'Age': 'Int64',
    'Mother': 'str',
    'Father': 'str',
    'CullDecisionDate': 'str',
    'Temperature': 'float',
    'RelativeHumidity': 'float',      
    'THI_adj': 'float',
    'HW': 'Int64',                    
    'cum_HW': 'Int64',                
    'Temp15Threshold': 'Int64'        
}


# Load the CSV with specified dtypes
data = pd.read_csv('../Data/MergedData/CleanedYieldData.csv', dtype=dtype_dict)

# Convert date and time columns back to datetime and time objects
data['DateTime'] = pd.to_datetime(data['DateTime'], errors='coerce')
data['StartTime'] = pd.to_datetime(data['StartTime'], format='%H:%M:%S', errors='coerce').dt.time
data['StartDate'] = pd.to_datetime(data['StartDate'], errors='coerce')
data['CullDecisionDate'] = pd.to_datetime(data['CullDecisionDate'], errors='coerce')
data['DateTime'] = pd.to_datetime(data['DateTime'], errors='coerce')
data.head()

Unnamed: 0,FarmName_Pseudo,SE_Number,AnimalNumber,StartDate,StartTime,LactationNumber,DaysInMilk,TotalYield,DateTime,YearSeason,...,Mother,Father,CullDecisionDate,Temperature,RelativeHumidity,THI_adj,HW,cum_HW,Temp15Threshold,Age
0,a624fb9a,SE-064c0cec-1189,5189,2022-01-01,06:25:00,7,191,13.9,2022-01-01 06:25:00,2022-1,...,,,2022-12-20,-3.025,0.930917,28.012944,0,0,0,3095
1,a624fb9a,SE-064c0cec-1189,5189,2022-01-01,16:41:00,7,191,16.87,2022-01-01 16:41:00,2022-1,...,,,2022-12-20,-3.025,0.930917,28.012944,0,0,0,3095
2,a624fb9a,SE-064c0cec-1189,5189,2022-01-02,15:29:00,7,192,20.41,2022-01-02 15:29:00,2022-1,...,,,2022-12-20,-0.279167,0.990542,32.898193,0,0,0,3096
3,a624fb9a,SE-064c0cec-1189,5189,2022-01-02,03:31:00,7,192,16.28,2022-01-02 03:31:00,2022-1,...,,,2022-12-20,-0.279167,0.990542,32.898193,0,0,0,3096
4,a624fb9a,SE-064c0cec-1189,5189,2022-01-02,22:44:00,7,192,11.53,2022-01-02 22:44:00,2022-1,...,,,2022-12-20,-0.279167,0.990542,32.898193,0,0,0,3096


In [3]:
# Define the THI threshold
THI_THRESHOLD = 63

# Calculate the daily heat load based on the THI threshold
data['HeatLoad'] = data['THI_adj'].apply(lambda x: x - THI_THRESHOLD if x > THI_THRESHOLD else -(THI_THRESHOLD - x))

# Initialize the cumulative heat load column
data['CumulativeHeatLoad'] = 0

# Iterate through the data to calculate cumulative heat load correctly
for i in range(1, len(data)):
    previous_cumulative = data.at[i-1, 'CumulativeHeatLoad']
    current_heat_load = data.at[i, 'HeatLoad']
    if previous_cumulative + current_heat_load > 0:
        data.at[i, 'CumulativeHeatLoad'] = previous_cumulative + current_heat_load
    else:
        data.at[i, 'CumulativeHeatLoad'] = 0

data.head(-5)

  data.at[i, 'CumulativeHeatLoad'] = previous_cumulative + current_heat_load


Unnamed: 0,FarmName_Pseudo,SE_Number,AnimalNumber,StartDate,StartTime,LactationNumber,DaysInMilk,TotalYield,DateTime,YearSeason,...,CullDecisionDate,Temperature,RelativeHumidity,THI_adj,HW,cum_HW,Temp15Threshold,Age,HeatLoad,CumulativeHeatLoad
0,a624fb9a,SE-064c0cec-1189,5189,2022-01-01,06:25:00,7,191,13.90,2022-01-01 06:25:00,2022-1,...,2022-12-20,-3.025000,0.930917,28.012944,0,0,0,3095,-34.987056,0.0
1,a624fb9a,SE-064c0cec-1189,5189,2022-01-01,16:41:00,7,191,16.87,2022-01-01 16:41:00,2022-1,...,2022-12-20,-3.025000,0.930917,28.012944,0,0,0,3095,-34.987056,0.0
2,a624fb9a,SE-064c0cec-1189,5189,2022-01-02,15:29:00,7,192,20.41,2022-01-02 15:29:00,2022-1,...,2022-12-20,-0.279167,0.990542,32.898193,0,0,0,3096,-30.101807,0.0
3,a624fb9a,SE-064c0cec-1189,5189,2022-01-02,03:31:00,7,192,16.28,2022-01-02 03:31:00,2022-1,...,2022-12-20,-0.279167,0.990542,32.898193,0,0,0,3096,-30.101807,0.0
4,a624fb9a,SE-064c0cec-1189,5189,2022-01-02,22:44:00,7,192,11.53,2022-01-02 22:44:00,2022-1,...,2022-12-20,-0.279167,0.990542,32.898193,0,0,0,3096,-30.101807,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1364930,f454e660,SE-fcdf259d-0044-0,1044,2023-06-07,00:51:00,10,351,5.63,2023-06-07 00:51:00,2023-3,...,NaT,15.645833,0.731917,61.559237,0,0,1,4154,-1.440763,0.0
1364931,f454e660,SE-fcdf259d-0044-0,1044,2023-06-07,11:17:00,10,351,3.34,2023-06-07 11:17:00,2023-3,...,NaT,15.645833,0.731917,61.559237,0,0,1,4154,-1.440763,0.0
1364932,f454e660,SE-fcdf259d-0044-0,1044,2023-06-08,17:01:00,10,352,6.96,2023-06-08 17:01:00,2023-3,...,NaT,15.570833,0.601708,59.383267,0,0,1,4155,-3.616733,0.0
1364933,f454e660,SE-fcdf259d-0044-0,1044,2023-06-08,02:23:00,10,352,8.18,2023-06-08 02:23:00,2023-3,...,NaT,15.570833,0.601708,59.383267,0,0,1,4155,-3.616733,0.0


In [4]:
# When CumulativeHeatLoad is greater than 5, it indicates that the cow is under heat stress
data['HeatStress'] = (data['CumulativeHeatLoad'] > 5).astype(int)
data.head(-5)

Unnamed: 0,FarmName_Pseudo,SE_Number,AnimalNumber,StartDate,StartTime,LactationNumber,DaysInMilk,TotalYield,DateTime,YearSeason,...,Temperature,RelativeHumidity,THI_adj,HW,cum_HW,Temp15Threshold,Age,HeatLoad,CumulativeHeatLoad,HeatStress
0,a624fb9a,SE-064c0cec-1189,5189,2022-01-01,06:25:00,7,191,13.90,2022-01-01 06:25:00,2022-1,...,-3.025000,0.930917,28.012944,0,0,0,3095,-34.987056,0.0,0
1,a624fb9a,SE-064c0cec-1189,5189,2022-01-01,16:41:00,7,191,16.87,2022-01-01 16:41:00,2022-1,...,-3.025000,0.930917,28.012944,0,0,0,3095,-34.987056,0.0,0
2,a624fb9a,SE-064c0cec-1189,5189,2022-01-02,15:29:00,7,192,20.41,2022-01-02 15:29:00,2022-1,...,-0.279167,0.990542,32.898193,0,0,0,3096,-30.101807,0.0,0
3,a624fb9a,SE-064c0cec-1189,5189,2022-01-02,03:31:00,7,192,16.28,2022-01-02 03:31:00,2022-1,...,-0.279167,0.990542,32.898193,0,0,0,3096,-30.101807,0.0,0
4,a624fb9a,SE-064c0cec-1189,5189,2022-01-02,22:44:00,7,192,11.53,2022-01-02 22:44:00,2022-1,...,-0.279167,0.990542,32.898193,0,0,0,3096,-30.101807,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1364930,f454e660,SE-fcdf259d-0044-0,1044,2023-06-07,00:51:00,10,351,5.63,2023-06-07 00:51:00,2023-3,...,15.645833,0.731917,61.559237,0,0,1,4154,-1.440763,0.0,0
1364931,f454e660,SE-fcdf259d-0044-0,1044,2023-06-07,11:17:00,10,351,3.34,2023-06-07 11:17:00,2023-3,...,15.645833,0.731917,61.559237,0,0,1,4154,-1.440763,0.0,0
1364932,f454e660,SE-fcdf259d-0044-0,1044,2023-06-08,17:01:00,10,352,6.96,2023-06-08 17:01:00,2023-3,...,15.570833,0.601708,59.383267,0,0,1,4155,-3.616733,0.0,0
1364933,f454e660,SE-fcdf259d-0044-0,1044,2023-06-08,02:23:00,10,352,8.18,2023-06-08 02:23:00,2023-3,...,15.570833,0.601708,59.383267,0,0,1,4155,-3.616733,0.0,0


In [5]:
# Calculate the DailyYield for each cow each day
data['DailyYield'] = data.groupby(['SE_Number', 'StartDate'])['TotalYield'].transform('sum')

# Sort the data by AnimalNumber and StartDate
data.sort_values(['AnimalNumber', 'StartDate'], inplace=True)

# Calculate the previous day's total yield for each cow
data['PreviousDailyYield'] = data.groupby('AnimalNumber')['DailyYield'].shift(1)

# Calculate the daily yield change for each cow
data['DailyYieldChange'] = data['DailyYield'] - data['PreviousDailyYield']

# Group and aggregate data
data = data.groupby(['SE_Number', 'FarmName_Pseudo', 'StartDate']).agg({
    'DailyYield': 'first',
    'PreviousDailyYield': 'first',
    'DailyYieldChange': 'first',
    'HW': 'max',
    'Temperature': 'mean',
    'THI_adj': 'mean',
    'DaysInMilk': 'first',
    'YearSeason': 'first',
    'cum_HW': 'max',
    'Temp15Threshold': 'max',
    'Age': 'first',
    'BreedName': 'first',
    'LactationNumber': 'first',
    'HeatLoad': 'max',
    'CumulativeHeatLoad': 'max',
    'HeatStress': 'max'
}).reset_index()

# Renaming and formatting
data.rename(columns={
    'Temperature': 'MeanTemperature',
    'THI_adj': 'MeanTHI_adj',
    'StartDate': 'Date'
}, inplace=True)
data['Date'] = pd.to_datetime(data['Date'])

# Display the first few rows of the transformed data
data.head()

Unnamed: 0,SE_Number,FarmName_Pseudo,Date,DailyYield,PreviousDailyYield,DailyYieldChange,HW,MeanTemperature,MeanTHI_adj,DaysInMilk,YearSeason,cum_HW,Temp15Threshold,Age,BreedName,LactationNumber,HeatLoad,CumulativeHeatLoad,HeatStress
0,SE-064c0cec-1189,a624fb9a,2022-01-01,30.77,30.77,0.0,0,-3.025,28.012944,191,2022-1,0,0,3095,02 SLB,7,-34.987056,0.0,0
1,SE-064c0cec-1189,a624fb9a,2022-01-02,48.22,30.77,17.45,0,-0.279167,32.898193,192,2022-1,0,0,3096,02 SLB,7,-30.101807,0.0,0
2,SE-064c0cec-1189,a624fb9a,2022-01-03,30.53,48.22,-17.69,0,2.033333,36.760487,193,2022-1,0,0,3097,02 SLB,7,-26.239513,0.0,0
3,SE-064c0cec-1189,a624fb9a,2022-01-04,42.26,30.53,11.73,0,0.066667,31.939524,194,2022-1,0,0,3098,02 SLB,7,-31.060476,0.0,0
4,SE-064c0cec-1189,a624fb9a,2022-01-05,38.49,42.26,-3.77,0,-3.7,26.498206,195,2022-1,0,0,3099,02 SLB,7,-36.501794,0.0,0


In [6]:
# Check if DailyYield is centered around approx the same for each farm
print("Mean of DailyYield:", data.groupby('FarmName_Pseudo')['DailyYield'].mean())
print("Standard Deviation of DailyYield:", data.groupby('FarmName_Pseudo')['DailyYield'].std())

Mean of DailyYield: FarmName_Pseudo
5c06d92d    37.389675
752efd72    31.151716
a624fb9a    33.413694
f454e660    30.485127
Name: DailyYield, dtype: float64
Standard Deviation of DailyYield: FarmName_Pseudo
5c06d92d     9.960240
752efd72     7.799288
a624fb9a    11.050811
f454e660    11.833056
Name: DailyYield, dtype: float64


## Wilmink Lactation Curve
$$
Y(t) = a + bt + c \exp(-dt)
$$
- \(Y(t)\): Milk yield at time \(t\) post-calving, so t = DaysInMilk
- \(a\): Intercept, representing baseline milk yield
- \(b\): Linear increase rate of milk yield over time
- \(c\): Initial exponential increase in milk yield
- \(d\): Rate at which the exponential increase declines over time

The Wilmink model captures the lactation curve by considering both linear and exponential components, providing a flexible representation of milk production dynamics over the lactation period.

In [7]:
# Define the Wilmink Lactation Curve function
def wilmink_lactation_curve(dim, a, b, c, d):
    dim = np.array(dim, dtype=float)
    return a + b * dim + c * np.exp(-d * dim)

# Function to detect and remove outliers
def remove_outliers(group, threshold=3.5):
    mean = np.mean(group['DailyYield'])
    std_dev = np.std(group['DailyYield'])
    return group[(group['DailyYield'] > mean - threshold * std_dev) & (group['DailyYield'] < mean + threshold * std_dev)]

# Function to smooth the data using a rolling average
def smooth_data(group, window=5):
    group = group.copy()
    group['DailyYield'] = group['DailyYield'].rolling(window, min_periods=1).mean()
    return group

# Function to fit the Wilmink Lactation Curve to the dataset
def fit_wilmink_lactation_curve(dataset):
    # Initialize the 'ExpectedYield' column to NaN
    dataset['ExpectedYield'] = np.nan
    params_dict = {}
    
    valid_indices = []

    # Group the dataset by 'SE_Number' and 'LactationNumber' and fit the curve for each segment
    for (animal_number, lactation_number), group in tqdm(dataset.groupby(['SE_Number', 'LactationNumber']), unit=" Segments"):
        # Prepare the data for fitting
        group = remove_outliers(group, threshold=3.5)  # Remove outliers with threshold 4
        group = smooth_data(group)  # Smooth the data
        x_data = group['DaysInMilk'].values
        y_data = group['DailyYield'].values
        
        # Ensure there are no NaN or infinite values in the data
        if not np.isfinite(x_data).all() or not np.isfinite(y_data).all():
            print(f"Non-finite values found for cow {animal_number}, lactation {lactation_number}, skipping.")
            continue
        
        # Ensure there are enough data points to fit the curve
        if len(x_data) < 10 or len(y_data) < 10:
            print(f"Insufficient data points for cow {animal_number}, lactation {lactation_number}, skipping.")
            continue

        valid_indices.extend(group.index)
        
        # Fit the model
        try:
            # Initial parameter guesses
            initial_guesses = [np.mean(y_data), 0, np.mean(y_data) / 2, 0.1]
            # Bounds on the parameters to prevent overflow
            bounds = ([-np.inf, -np.inf, -np.inf, 0], [np.inf, np.inf, np.inf, np.inf])
            
            with warnings.catch_warnings():
                warnings.filterwarnings('error', category=OptimizeWarning)
                try:
                    popt, pcov = curve_fit(
                        wilmink_lactation_curve, x_data, y_data,
                        p0=initial_guesses, bounds=bounds, maxfev=30000
                    )
                    
                    # Store the parameters in the dictionary
                    params_dict[(animal_number, lactation_number)] = {'a': popt[0], 'b': popt[1], 'c': popt[2], 'd': popt[3]}
                    
                    # Predict the expected yield using the fitted model
                    dataset.loc[group.index, 'ExpectedYield'] = wilmink_lactation_curve(group['DaysInMilk'], *popt)
                    
                    # Normalize the DailyYield
                    dataset.loc[group.index, 'NormalizedDailyYield'] = group['DailyYield'] / dataset.loc[group.index, 'ExpectedYield']
                    
                    # Calculate the daily yield change and normalize it
                    dataset.loc[group.index, 'PreviousDailyYield'] = group['DailyYield'].shift(1)
                    dataset.loc[group.index, 'DailyYieldChange'] = group['DailyYield'] - dataset.loc[group.index, 'PreviousDailyYield']
                    dataset.loc[group.index, 'NormalizedDailyYieldChange'] = dataset.loc[group.index, 'DailyYieldChange'] / dataset.loc[group.index, 'ExpectedYield']
                
                except OptimizeWarning:
                    print(f"OptimizeWarning for cow {animal_number}, lactation {lactation_number}, skipping.")
            
        except RuntimeError as e:
            print(f"Curve fit failed for cow {animal_number}, lactation {lactation_number}: {e}")
        except ValueError as e:
            print(f"Value error for cow {animal_number}, lactation {lactation_number}: {e}")
    
    # Keep only valid indices
    dataset = dataset.loc[valid_indices].reset_index(drop=True)
    
    # Fill any NaN values in the newly created columns with 0
    dataset['ExpectedYield'] = dataset['ExpectedYield'].fillna(0)
    dataset['NormalizedDailyYield'] = dataset['NormalizedDailyYield'].fillna(0)
    dataset['PreviousDailyYield'] = dataset['PreviousDailyYield'].fillna(0)
    dataset['DailyYieldChange'] = dataset['DailyYieldChange'].fillna(0)
    dataset['NormalizedDailyYieldChange'] = dataset['NormalizedDailyYieldChange'].fillna(0)
    
    return dataset, params_dict

# Apply the curve fitting function to your dataset
data, params_dict = fit_wilmink_lactation_curve(data)
data

  4%|▍         | 102/2315 [00:06<01:52, 19.73 Segments/s]

Insufficient data points for cow SE-5c06d92d-2621, lactation 3, skipping.


  5%|▍         | 114/2315 [00:09<03:53,  9.44 Segments/s]

Insufficient data points for cow SE-5c06d92d-2639, lactation 3, skipping.


 10%|▉         | 230/2315 [00:19<03:36,  9.62 Segments/s]

Insufficient data points for cow SE-5c06d92d-2804, lactation 5, skipping.
Insufficient data points for cow SE-5c06d92d-2815, lactation 2, skipping.


 11%|█         | 247/2315 [00:22<04:25,  7.79 Segments/s]

Insufficient data points for cow SE-5c06d92d-2824, lactation 3, skipping.


 12%|█▏        | 284/2315 [00:24<01:54, 17.78 Segments/s]

Insufficient data points for cow SE-5c06d92d-2845, lactation 2, skipping.


 13%|█▎        | 290/2315 [00:25<02:31, 13.32 Segments/s]

Insufficient data points for cow SE-5c06d92d-2870, lactation 2, skipping.


 15%|█▌        | 352/2315 [00:25<00:36, 53.12 Segments/s]

Insufficient data points for cow SE-5c06d92d-2911, lactation 2, skipping.
Insufficient data points for cow SE-5c06d92d-2914, lactation 2, skipping.
Insufficient data points for cow SE-5c06d92d-2919, lactation 2, skipping.


 19%|█▊        | 432/2315 [00:35<04:31,  6.94 Segments/s]

Insufficient data points for cow SE-5c06d92d-3039, lactation 4, skipping.


 19%|█▉        | 450/2315 [00:37<04:14,  7.32 Segments/s]

Insufficient data points for cow SE-5c06d92d-3045, lactation 1, skipping.
Insufficient data points for cow SE-5c06d92d-3047, lactation 1, skipping.
Insufficient data points for cow SE-5c06d92d-3049, lactation 1, skipping.


 22%|██▏       | 500/2315 [00:38<00:54, 33.02 Segments/s]

Insufficient data points for cow SE-5c06d92d-3063, lactation 1, skipping.
Insufficient data points for cow SE-5c06d92d-3063, lactation 3, skipping.
Insufficient data points for cow SE-5c06d92d-3065, lactation 1, skipping.
Insufficient data points for cow SE-5c06d92d-3068, lactation 1, skipping.


 23%|██▎       | 536/2315 [00:39<00:39, 44.72 Segments/s]

Insufficient data points for cow SE-5c06d92d-3116, lactation 3, skipping.


 27%|██▋       | 630/2315 [00:45<02:30, 11.23 Segments/s]

Insufficient data points for cow SE-5c06d92d-3173, lactation 3, skipping.


 32%|███▏      | 738/2315 [00:49<00:31, 50.31 Segments/s]

Insufficient data points for cow SE-5c06d92d-3267, lactation 2, skipping.


 35%|███▌      | 815/2315 [00:52<00:44, 33.50 Segments/s]

Insufficient data points for cow SE-5c06d92d-3357, lactation 2, skipping.
Insufficient data points for cow SE-5c06d92d-3365, lactation 2, skipping.
Insufficient data points for cow SE-5c06d92d-3370, lactation 2, skipping.
Insufficient data points for cow SE-5c06d92d-3371, lactation 2, skipping.


 39%|███▉      | 905/2315 [00:54<00:40, 34.60 Segments/s]

Insufficient data points for cow SE-5c06d92d-3524, lactation 1, skipping.


 40%|███▉      | 917/2315 [00:55<00:48, 28.80 Segments/s]

Insufficient data points for cow SE-5c06d92d-3530, lactation 1, skipping.
Insufficient data points for cow SE-5c06d92d-3536, lactation 1, skipping.
Insufficient data points for cow SE-5c06d92d-3537, lactation 1, skipping.


 40%|████      | 937/2315 [00:58<01:49, 12.63 Segments/s]

Insufficient data points for cow SE-752efd72-0051, lactation 3, skipping.


 43%|████▎     | 1004/2315 [01:04<02:04, 10.54 Segments/s]

Insufficient data points for cow SE-752efd72-0117, lactation 2, skipping.


 44%|████▍     | 1013/2315 [01:05<01:45, 12.38 Segments/s]

Insufficient data points for cow SE-752efd72-0129, lactation 2, skipping.


 45%|████▌     | 1046/2315 [01:05<00:48, 26.06 Segments/s]

Insufficient data points for cow SE-752efd72-0136, lactation 2, skipping.
Insufficient data points for cow SE-752efd72-0143, lactation 2, skipping.


 46%|████▌     | 1063/2315 [01:07<01:23, 15.02 Segments/s]

Insufficient data points for cow SE-752efd72-0166, lactation 1, skipping.


 51%|█████     | 1174/2315 [01:22<00:59, 19.33 Segments/s]

Insufficient data points for cow SE-752efd72-0232, lactation 1, skipping.
Insufficient data points for cow SE-752efd72-0234, lactation 1, skipping.
Insufficient data points for cow SE-752efd72-0239, lactation 1, skipping.
Insufficient data points for cow SE-752efd72-0243, lactation 1, skipping.


 56%|█████▋    | 1305/2315 [01:34<00:47, 21.44 Segments/s]

Insufficient data points for cow SE-752efd72-0298, lactation 1, skipping.


 57%|█████▋    | 1319/2315 [01:36<01:40,  9.91 Segments/s]

Insufficient data points for cow SE-752efd72-0317, lactation 1, skipping.


 58%|█████▊    | 1339/2315 [01:38<01:36, 10.14 Segments/s]

Insufficient data points for cow SE-752efd72-0329, lactation 1, skipping.


 61%|██████▏   | 1419/2315 [01:40<00:16, 54.38 Segments/s]

Insufficient data points for cow SE-752efd72-0369, lactation 1, skipping.


 65%|██████▌   | 1506/2315 [01:43<00:20, 39.50 Segments/s]

Insufficient data points for cow SE-752efd72-0416, lactation 2, skipping.


 66%|██████▌   | 1518/2315 [01:43<00:23, 33.58 Segments/s]

Insufficient data points for cow SE-752efd72-0439, lactation 2, skipping.
Insufficient data points for cow SE-752efd72-0440, lactation 2, skipping.


 66%|██████▌   | 1527/2315 [01:45<00:43, 17.94 Segments/s]

Insufficient data points for cow SE-752efd72-0446, lactation 2, skipping.


 69%|██████▉   | 1597/2315 [01:47<00:16, 44.25 Segments/s]

Insufficient data points for cow SE-752efd72-0533, lactation 1, skipping.


 70%|██████▉   | 1609/2315 [01:47<00:19, 35.55 Segments/s]

Insufficient data points for cow SE-752efd72-0544, lactation 1, skipping.


 71%|███████   | 1640/2315 [01:50<00:44, 15.28 Segments/s]

Insufficient data points for cow SE-752efd72-2751, lactation 5, skipping.


 72%|███████▏  | 1659/2315 [01:55<01:45,  6.22 Segments/s]

Insufficient data points for cow SE-752efd72-2794, lactation 6, skipping.


 73%|███████▎  | 1692/2315 [01:57<00:38, 16.10 Segments/s]

Insufficient data points for cow SE-752efd72-2797, lactation 3, skipping.
Insufficient data points for cow SE-7fd04cd3-679, lactation 4, skipping.
Insufficient data points for cow SE-a624fb9a-1162, lactation 7, skipping.
Insufficient data points for cow SE-a624fb9a-1200, lactation 4, skipping.


 74%|███████▎  | 1706/2315 [02:03<01:52,  5.41 Segments/s]

Insufficient data points for cow SE-a624fb9a-1251, lactation 3, skipping.


 75%|███████▍  | 1735/2315 [02:03<00:49, 11.70 Segments/s]

Insufficient data points for cow SE-a624fb9a-1267, lactation 3, skipping.


 76%|███████▌  | 1761/2315 [02:07<01:14,  7.45 Segments/s]

Insufficient data points for cow SE-a624fb9a-1312, lactation 2, skipping.


 77%|███████▋  | 1777/2315 [02:08<00:54,  9.87 Segments/s]

Insufficient data points for cow SE-a624fb9a-1330, lactation 2, skipping.
Insufficient data points for cow SE-a624fb9a-1333, lactation 1, skipping.


 79%|███████▊  | 1822/2315 [02:09<00:19, 25.34 Segments/s]

Insufficient data points for cow SE-a624fb9a-1373, lactation 1, skipping.
Insufficient data points for cow SE-a624fb9a-1374, lactation 1, skipping.


 86%|████████▌ | 1992/2315 [02:18<00:19, 16.92 Segments/s]

Insufficient data points for cow SE-f454e660-0420, lactation 5, skipping.


 86%|████████▋ | 1999/2315 [02:19<00:29, 10.82 Segments/s]

Insufficient data points for cow SE-f454e660-0451, lactation 5, skipping.


 87%|████████▋ | 2019/2315 [02:20<00:17, 16.45 Segments/s]

Insufficient data points for cow SE-f454e660-0465, lactation 2, skipping.
Insufficient data points for cow SE-f454e660-0494, lactation 2, skipping.


 89%|████████▊ | 2053/2315 [02:23<00:20, 12.73 Segments/s]

Insufficient data points for cow SE-f454e660-0545, lactation 4, skipping.
Insufficient data points for cow SE-f454e660-0551, lactation 1, skipping.
Insufficient data points for cow SE-f454e660-0559, lactation 1, skipping.


 90%|████████▉ | 2075/2315 [02:24<00:17, 13.91 Segments/s]

Insufficient data points for cow SE-f454e660-0585, lactation 1, skipping.


 93%|█████████▎| 2150/2315 [02:28<00:08, 20.07 Segments/s]

Insufficient data points for cow SE-f454e660-0713, lactation 2, skipping.
Insufficient data points for cow SE-f454e660-0717, lactation 2, skipping.


 94%|█████████▍| 2178/2315 [02:28<00:04, 32.86 Segments/s]

Insufficient data points for cow SE-f454e660-0726, lactation 2, skipping.


 96%|█████████▌| 2216/2315 [02:29<00:02, 46.77 Segments/s]

Insufficient data points for cow SE-f454e660-0829, lactation 1, skipping.


 98%|█████████▊| 2265/2315 [02:35<00:03, 13.30 Segments/s]

Insufficient data points for cow SE-f454e660-509, lactation 3, skipping.
Insufficient data points for cow SE-f454e660-510, lactation 2, skipping.


 98%|█████████▊| 2274/2315 [02:35<00:03, 12.34 Segments/s]

Insufficient data points for cow SE-f454e660-567, lactation 1, skipping.


100%|██████████| 2315/2315 [02:39<00:00, 14.55 Segments/s]


Insufficient data points for cow SE-f454e660-729, lactation 1, skipping.


Unnamed: 0,SE_Number,FarmName_Pseudo,Date,DailyYield,PreviousDailyYield,DailyYieldChange,HW,MeanTemperature,MeanTHI_adj,DaysInMilk,...,Temp15Threshold,Age,BreedName,LactationNumber,HeatLoad,CumulativeHeatLoad,HeatStress,ExpectedYield,NormalizedDailyYield,NormalizedDailyYieldChange
0,SE-064c0cec-1189,a624fb9a,2022-01-01,30.77,0.000000,0.000000,0,-3.025000,28.012944,191,...,0,3095,02 SLB,7,-34.987056,0.000000,0,35.914865,0.856748,0.000000
1,SE-064c0cec-1189,a624fb9a,2022-01-02,48.22,30.770000,8.725000,0,-0.279167,32.898193,192,...,0,3096,02 SLB,7,-30.101807,0.000000,0,35.799613,1.103224,0.243718
2,SE-064c0cec-1189,a624fb9a,2022-01-03,30.53,39.495000,-2.988333,0,2.033333,36.760487,193,...,0,3097,02 SLB,7,-26.239513,0.000000,0,35.684360,1.023044,-0.083744
3,SE-064c0cec-1189,a624fb9a,2022-01-04,42.26,36.506667,1.438333,0,0.066667,31.939524,194,...,0,3098,02 SLB,7,-31.060476,0.000000,0,35.569108,1.066796,0.040438
4,SE-064c0cec-1189,a624fb9a,2022-01-05,38.49,37.945000,0.109000,0,-3.700000,26.498206,195,...,0,3099,02 SLB,7,-36.501794,0.000000,0,35.453856,1.073339,0.003074
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
483102,SE-fcdf259d-0044-0,f454e660,2023-06-08,15.14,13.252000,0.494000,0,15.570833,59.383267,352,...,1,4155,41 Fjällko,10,-3.616733,0.000000,0,13.149489,1.045364,0.037568
483103,SE-fcdf259d-0044-0,f454e660,2023-06-09,7.47,13.746000,-2.968000,0,13.254167,54.534255,353,...,1,4156,41 Fjällko,10,-8.465745,0.000000,0,13.057668,0.825415,-0.227299
483104,SE-fcdf259d-0044-0,f454e660,2023-06-10,14.73,10.778000,0.378000,0,13.258333,54.082367,354,...,1,4157,41 Fjällko,10,-8.917633,0.000000,0,12.965846,0.860414,0.029154
483105,SE-fcdf259d-0044-0,f454e660,2023-06-12,12.27,11.156000,0.560000,0,15.820833,62.015093,356,...,1,4159,41 Fjällko,10,-0.984907,0.000000,0,12.782204,0.916587,0.043811


In [8]:
# Check if NormalizedDailyYield is centered around 1 for each unique farm
print("Mean of NormalizedDailyYield:", data.groupby('FarmName_Pseudo')['NormalizedDailyYield'].mean())
print("Standard Deviation of NormalizedDailyYield:", data.groupby('FarmName_Pseudo')['NormalizedDailyYield'].std())

Mean of NormalizedDailyYield: FarmName_Pseudo
5c06d92d    0.999873
752efd72    1.000064
a624fb9a    1.000214
f454e660    0.999874
Name: NormalizedDailyYield, dtype: float64
Standard Deviation of NormalizedDailyYield: FarmName_Pseudo
5c06d92d    0.110106
752efd72    0.073153
a624fb9a    0.101905
f454e660    0.114833
Name: NormalizedDailyYield, dtype: float64


In [9]:
# Make a dataframe from the parameters dictionary, it should contain Se_Number, LactationNumber, a, b, c, d
params_df = pd.DataFrame(params_dict).T.reset_index()
params_df.columns = ['SE_Number', 'LactationNumber', 'a', 'b', 'c', 'd']
params_df.head(-5)

Unnamed: 0,SE_Number,LactationNumber,a,b,c,d
0,SE-064c0cec-1189,7,57.928060,-0.115252,-6.063436,17.367795
1,SE-064c0cec-1189,8,41.195196,-0.080255,-44.398922,0.159012
2,SE-30dc5787-1389,5,115.924643,-0.343237,12.205804,0.099999
3,SE-30dc5787-1389,6,52.877694,-0.084404,-32.593600,0.093018
4,SE-30dc5787-1389,7,47.551089,-0.094727,-45.321451,0.203812
...,...,...,...,...,...,...
2227,SE-f454e660-686,1,24.464747,-0.029503,-30.107339,0.322460
2228,SE-f454e660-688,1,22.360134,0.006013,-0.083601,26.265057
2229,SE-f454e660-693,1,722399.241220,-24.952569,-722371.384516,0.000035
2230,SE-f454e660-701,1,25.337203,-0.007307,4.771692,5.566311


In [10]:
# Calculate Z-scores for each parameter
params_df['z_a'] = zscore(params_df['a'])
params_df['z_b'] = zscore(params_df['b'])
params_df['z_c'] = zscore(params_df['c'])
params_df['z_d'] = zscore(params_df['d'])

params_df.head(-5)

Unnamed: 0,SE_Number,LactationNumber,a,b,c,d,z_a,z_b,z_c,z_d
0,SE-064c0cec-1189,7,57.928060,-0.115252,-6.063436,17.367795,-0.188741,0.112085,-0.047044,1.264524
1,SE-064c0cec-1189,8,41.195196,-0.080255,-44.398922,0.159012,-0.188937,0.114493,-0.047044,-0.223807
2,SE-30dc5787-1389,5,115.924643,-0.343237,12.205804,0.099999,-0.188061,0.096396,-0.047044,-0.228911
3,SE-30dc5787-1389,6,52.877694,-0.084404,-32.593600,0.093018,-0.188800,0.114207,-0.047044,-0.229515
4,SE-30dc5787-1389,7,47.551089,-0.094727,-45.321451,0.203812,-0.188863,0.113497,-0.047044,-0.219933
...,...,...,...,...,...,...,...,...,...,...
2227,SE-f454e660-686,1,24.464747,-0.029503,-30.107339,0.322460,-0.189133,0.117985,-0.047044,-0.209671
2228,SE-f454e660-688,1,22.360134,0.006013,-0.083601,26.265057,-0.189158,0.120429,-0.047044,2.034019
2229,SE-f454e660-693,1,722399.241220,-24.952569,-722371.384516,0.000035,8.278968,-1.597036,-0.047044,-0.237557
2230,SE-f454e660-701,1,25.337203,-0.007307,4.771692,5.566311,-0.189123,0.119513,-0.047044,0.243852


In [11]:
# Identify outliers (using Z-score > 3.5 or < -3.5 as threshold)
outliers = params_df[(np.abs(params_df[['z_a', 'z_b', 'z_c', 'z_d']]) > 3.5).any(axis=1)]

x = outliers.count()
print("Number of outliers:", x)

# Optionally, drop the outliers
params_df_cleaned = params_df.drop(outliers.index)
params_df_cleaned.head(-5)

Number of outliers: SE_Number          119
LactationNumber    119
a                  119
b                  119
c                  119
d                  119
z_a                119
z_b                119
z_c                119
z_d                119
dtype: int64


Unnamed: 0,SE_Number,LactationNumber,a,b,c,d,z_a,z_b,z_c,z_d
0,SE-064c0cec-1189,7,57.928060,-0.115252,-6.063436,17.367795,-0.188741,0.112085,-0.047044,1.264524
1,SE-064c0cec-1189,8,41.195196,-0.080255,-44.398922,0.159012,-0.188937,0.114493,-0.047044,-0.223807
2,SE-30dc5787-1389,5,115.924643,-0.343237,12.205804,0.099999,-0.188061,0.096396,-0.047044,-0.228911
3,SE-30dc5787-1389,6,52.877694,-0.084404,-32.593600,0.093018,-0.188800,0.114207,-0.047044,-0.229515
4,SE-30dc5787-1389,7,47.551089,-0.094727,-45.321451,0.203812,-0.188863,0.113497,-0.047044,-0.219933
...,...,...,...,...,...,...,...,...,...,...
2226,SE-f454e660-671,1,39.063211,-0.042430,-35.990729,0.055196,-0.188962,0.117096,-0.047044,-0.232786
2227,SE-f454e660-686,1,24.464747,-0.029503,-30.107339,0.322460,-0.189133,0.117985,-0.047044,-0.209671
2228,SE-f454e660-688,1,22.360134,0.006013,-0.083601,26.265057,-0.189158,0.120429,-0.047044,2.034019
2230,SE-f454e660-701,1,25.337203,-0.007307,4.771692,5.566311,-0.189123,0.119513,-0.047044,0.243852


In [12]:
# Identify unique SE_Number and LactationNumber combinations from the outliers
outlier_combinations = outliers[['SE_Number', 'LactationNumber']].drop_duplicates()

# Merge with the original data to find rows that match these outlier combinations
data_cleaned = data.merge(outlier_combinations, on=['SE_Number', 'LactationNumber'], how='left', indicator=True)

# Keep only the rows that do not match the outlier combinations
data_cleaned = data_cleaned[data_cleaned['_merge'] == 'left_only'].drop(columns=['_merge'])

# Now data_cleaned contains the original data with the outlier combinations removed
print("Number of rows removed:", len(data) - len(data_cleaned))
data_cleaned.head(-5)

Number of rows removed: 15717


Unnamed: 0,SE_Number,FarmName_Pseudo,Date,DailyYield,PreviousDailyYield,DailyYieldChange,HW,MeanTemperature,MeanTHI_adj,DaysInMilk,...,Temp15Threshold,Age,BreedName,LactationNumber,HeatLoad,CumulativeHeatLoad,HeatStress,ExpectedYield,NormalizedDailyYield,NormalizedDailyYieldChange
0,SE-064c0cec-1189,a624fb9a,2022-01-01,30.77,0.000000,0.000000,0,-3.025000,28.012944,191,...,0,3095,02 SLB,7,-34.987056,0.0,0,35.914865,0.856748,0.000000
1,SE-064c0cec-1189,a624fb9a,2022-01-02,48.22,30.770000,8.725000,0,-0.279167,32.898193,192,...,0,3096,02 SLB,7,-30.101807,0.0,0,35.799613,1.103224,0.243718
2,SE-064c0cec-1189,a624fb9a,2022-01-03,30.53,39.495000,-2.988333,0,2.033333,36.760487,193,...,0,3097,02 SLB,7,-26.239513,0.0,0,35.684360,1.023044,-0.083744
3,SE-064c0cec-1189,a624fb9a,2022-01-04,42.26,36.506667,1.438333,0,0.066667,31.939524,194,...,0,3098,02 SLB,7,-31.060476,0.0,0,35.569108,1.066796,0.040438
4,SE-064c0cec-1189,a624fb9a,2022-01-05,38.49,37.945000,0.109000,0,-3.700000,26.498206,195,...,0,3099,02 SLB,7,-36.501794,0.0,0,35.453856,1.073339,0.003074
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
483097,SE-fcdf259d-0044-0,f454e660,2023-06-03,12.67,14.652000,-0.622000,0,12.666667,53.132530,347,...,1,4150,41 Fjällko,10,-9.867470,0.0,0,13.608593,1.030966,-0.045706
483098,SE-fcdf259d-0044-0,f454e660,2023-06-04,22.31,14.030000,0.954000,0,13.079167,56.726870,348,...,1,4151,41 Fjällko,10,-6.273130,0.0,0,13.516773,1.108549,0.070579
483099,SE-fcdf259d-0044-0,f454e660,2023-06-05,12.84,14.984000,-0.092000,0,14.237500,58.482418,349,...,1,4152,41 Fjällko,10,-4.517582,0.0,0,13.424952,1.109278,-0.006853
483100,SE-fcdf259d-0044-0,f454e660,2023-06-06,9.47,14.892000,-0.284000,0,15.345833,60.546358,350,...,1,4153,41 Fjällko,10,-2.453642,0.0,0,13.333131,1.095617,-0.021300


In [13]:
# Check if NormalizedDailyYield is centered around 1 for each unique farm
print("Mean of NormalizedDailyYield:", data_cleaned.groupby('FarmName_Pseudo')['NormalizedDailyYield'].mean())
print("Standard Deviation of NormalizedDailyYield:", data_cleaned.groupby('FarmName_Pseudo')['NormalizedDailyYield'].std())

Mean of NormalizedDailyYield: FarmName_Pseudo
5c06d92d    0.999882
752efd72    1.000083
a624fb9a    1.000167
f454e660    0.999931
Name: NormalizedDailyYield, dtype: float64
Standard Deviation of NormalizedDailyYield: FarmName_Pseudo
5c06d92d    0.110462
752efd72    0.072759
a624fb9a    0.101240
f454e660    0.115457
Name: NormalizedDailyYield, dtype: float64


In [14]:
data_cleaned['Residuals'] = data_cleaned['DailyYield'] - data_cleaned['ExpectedYield']
data_cleaned

Unnamed: 0,SE_Number,FarmName_Pseudo,Date,DailyYield,PreviousDailyYield,DailyYieldChange,HW,MeanTemperature,MeanTHI_adj,DaysInMilk,...,Age,BreedName,LactationNumber,HeatLoad,CumulativeHeatLoad,HeatStress,ExpectedYield,NormalizedDailyYield,NormalizedDailyYieldChange,Residuals
0,SE-064c0cec-1189,a624fb9a,2022-01-01,30.77,0.000000,0.000000,0,-3.025000,28.012944,191,...,3095,02 SLB,7,-34.987056,0.000000,0,35.914865,0.856748,0.000000,-5.144865
1,SE-064c0cec-1189,a624fb9a,2022-01-02,48.22,30.770000,8.725000,0,-0.279167,32.898193,192,...,3096,02 SLB,7,-30.101807,0.000000,0,35.799613,1.103224,0.243718,12.420387
2,SE-064c0cec-1189,a624fb9a,2022-01-03,30.53,39.495000,-2.988333,0,2.033333,36.760487,193,...,3097,02 SLB,7,-26.239513,0.000000,0,35.684360,1.023044,-0.083744,-5.154360
3,SE-064c0cec-1189,a624fb9a,2022-01-04,42.26,36.506667,1.438333,0,0.066667,31.939524,194,...,3098,02 SLB,7,-31.060476,0.000000,0,35.569108,1.066796,0.040438,6.690892
4,SE-064c0cec-1189,a624fb9a,2022-01-05,38.49,37.945000,0.109000,0,-3.700000,26.498206,195,...,3099,02 SLB,7,-36.501794,0.000000,0,35.453856,1.073339,0.003074,3.036144
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
483102,SE-fcdf259d-0044-0,f454e660,2023-06-08,15.14,13.252000,0.494000,0,15.570833,59.383267,352,...,4155,41 Fjällko,10,-3.616733,0.000000,0,13.149489,1.045364,0.037568,1.990511
483103,SE-fcdf259d-0044-0,f454e660,2023-06-09,7.47,13.746000,-2.968000,0,13.254167,54.534255,353,...,4156,41 Fjällko,10,-8.465745,0.000000,0,13.057668,0.825415,-0.227299,-5.587668
483104,SE-fcdf259d-0044-0,f454e660,2023-06-10,14.73,10.778000,0.378000,0,13.258333,54.082367,354,...,4157,41 Fjällko,10,-8.917633,0.000000,0,12.965846,0.860414,0.029154,1.764154
483105,SE-fcdf259d-0044-0,f454e660,2023-06-12,12.27,11.156000,0.560000,0,15.820833,62.015093,356,...,4159,41 Fjällko,10,-0.984907,0.000000,0,12.782204,0.916587,0.043811,-0.512204


In [15]:
# Group by 'FarmName_Pseudo' to perform calculations at the farm level
farm_results = []

for farm_name, farm_group in data_cleaned.groupby('FarmName_Pseudo'):
    farm_residuals = []
    
    for se_number, cow_group in farm_group.groupby('SE_Number'):
        residuals = cow_group['Residuals']
        
        if len(residuals) > 1:  # Ensure there are residuals to analyze
            farm_residuals.append(residuals)
    
    if len(farm_residuals) > 0:
        # Combine residuals from all cows in the farm
        combined_residuals = np.concatenate(farm_residuals)
        
        # Calculate farm-level statistics
        mean_residuals = combined_residuals.mean()
        std_residuals = combined_residuals.std()
        acf_values = acf(combined_residuals, nlags=30, fft=False)
        pacf_values = pacf(combined_residuals, nlags=min(30, len(combined_residuals)//2))

        # Print the farm-level statistics
        print(f"Farm: {farm_name}")
        print(f"Mean Residuals: {mean_residuals}")
        print(f"Standard Deviation of Residuals: {std_residuals}")
        print(f"ACF (first 5 lags): {acf_values[:5]}")
        print(f"PACF (first 5 lags): {pacf_values[:5]}")
        print("=" * 50)


Farm: 5c06d92d
Mean Residuals: -0.007358816187875687
Standard Deviation of Residuals: 5.284313563996281
ACF (first 5 lags): [1.         0.26277774 0.24544434 0.18996471 0.15302048]
PACF (first 5 lags): [1.         0.26277914 0.18947808 0.09793554 0.0550846 ]
Farm: 752efd72
Mean Residuals: -0.025321862666249733
Standard Deviation of Residuals: 3.517667102901537
ACF (first 5 lags): [1.         0.29635308 0.25974486 0.20619966 0.1613189 ]
PACF (first 5 lags): [1.         0.29635515 0.18847521 0.09988853 0.04970753]
Farm: a624fb9a
Mean Residuals: -0.02967946086057544
Standard Deviation of Residuals: 5.723818107888552
ACF (first 5 lags): [ 1.         -0.05558486  0.24159506  0.20277682  0.18253758]
PACF (first 5 lags): [ 1.         -0.05558582  0.23925285  0.24078754  0.17611915]
Farm: f454e660
Mean Residuals: -0.013491751557388564
Standard Deviation of Residuals: 6.964295467505133
ACF (first 5 lags): [ 1.         -0.13958374  0.17589227  0.15993898  0.13689694]
PACF (first 5 lags): [ 1.   

In [16]:
# Group by 'FarmName_Pseudo', 'SE_Number', and 'LactationNumber' to perform individual calculations
farm_results = []

for farm_name, farm_group in data_cleaned.groupby('FarmName_Pseudo'):
    print(f"Farm: {farm_name}")
    
    for (se_number, lactation_number), cow_group in farm_group.groupby(['SE_Number', 'LactationNumber']):
        residuals = cow_group['Residuals']
        
        if len(residuals) > 1:  # Ensure there are residuals to analyze
            mean_residuals = residuals.mean()
            std_residuals = residuals.std()
            acf_values = acf(residuals, nlags=30, fft=False)
            pacf_values = pacf(residuals, nlags=min(30, len(residuals)//2))

            # Print the statistics
            print(f"\nCow: {se_number}, Lactation Number: {lactation_number}")
            print(f"Mean Residuals: {mean_residuals}")
            print(f"Standard Deviation of Residuals: {std_residuals}")
            print(f"ACF (first 5 lags): {acf_values[:5]}")
            print(f"PACF (first 5 lags): {pacf_values[:5]}")
            print("-" * 50)
            
    print("=" * 50)

Farm: 5c06d92d

Cow: SE-5c06d92d-2055, Lactation Number: 6
Mean Residuals: -0.3578012321028742
Standard Deviation of Residuals: 4.788415669121281
ACF (first 5 lags): [ 1.          0.076273    0.31997299  0.02182998 -0.00740504]
PACF (first 5 lags): [ 1.          0.07671388  0.31968996 -0.02151787 -0.12452141]
--------------------------------------------------

Cow: SE-5c06d92d-2055, Lactation Number: 7
Mean Residuals: -0.07712953746114429
Standard Deviation of Residuals: 8.538929848843226
ACF (first 5 lags): [1.         0.37688227 0.31437413 0.26089741 0.23573651]
PACF (first 5 lags): [1.         0.37790087 0.20213581 0.11109077 0.08573913]
--------------------------------------------------

Cow: SE-5c06d92d-2058, Lactation Number: 7
Mean Residuals: -0.20389820995257132
Standard Deviation of Residuals: 4.423873471197473
ACF (first 5 lags): [ 1.         -0.047472    0.1369884   0.04060856  0.04892148]
PACF (first 5 lags): [ 1.         -0.04769487  0.13631604  0.05451853  0.03615355]
---

In [17]:
# Define the thresholds
mean_residual_threshold = 0.05
std_residual_threshold = 5.0
acf_threshold = 0.2
pacf_threshold = 0.2

# List to collect flagged combinations
flagged_combinations = []

for farm_name, farm_group in data_cleaned.groupby('FarmName_Pseudo'):
    for (se_number, lactation_number), cow_group in farm_group.groupby(['SE_Number', 'LactationNumber']):
        residuals = cow_group['Residuals'].dropna()
        
        if len(residuals) > 1:  # Ensure there are residuals to analyze
            mean_residuals = residuals.mean()
            std_residuals = residuals.std()
            acf_values = acf(residuals, nlags=30, fft=False)
            pacf_values = pacf(residuals, nlags=min(30, len(residuals)//2))

            # Check against thresholds
            if (abs(mean_residuals) > mean_residual_threshold or 
                std_residuals > std_residual_threshold or 
                abs(acf_values[1]) > acf_threshold or 
                abs(pacf_values[1]) > pacf_threshold):
                
                # Collect the combination if it exceeds any threshold
                flagged_combinations.append({
                    'Farm': farm_name,
                    'SE_Number': se_number,
                    'LactationNumber': lactation_number,
                    'Mean Residuals': mean_residuals,
                    'Std Residuals': std_residuals,
                    'ACF[1]': acf_values[1],
                    'PACF[1]': pacf_values[1]
                })

# Convert to a DataFrame for easier inspection
flagged_df = pd.DataFrame(flagged_combinations)
flagged_df

Unnamed: 0,Farm,SE_Number,LactationNumber,Mean Residuals,Std Residuals,ACF[1],PACF[1]
0,5c06d92d,SE-5c06d92d-2055,6,-0.357801,4.788416,0.076273,0.076714
1,5c06d92d,SE-5c06d92d-2055,7,-0.077130,8.538930,0.376882,0.377901
2,5c06d92d,SE-5c06d92d-2058,7,-0.203898,4.423873,-0.047472,-0.047695
3,5c06d92d,SE-5c06d92d-2058,8,-0.039645,7.674564,0.289566,0.290418
4,5c06d92d,SE-5c06d92d-2211,7,0.093558,8.589714,0.334567,0.336116
...,...,...,...,...,...,...,...
1953,f454e660,SE-f454e660-714,1,-0.062436,4.866536,-0.097928,-0.098395
1954,f454e660,SE-f454e660-735,1,0.146763,5.659393,-0.158367,-0.158913
1955,f454e660,SE-f454e660-735,2,0.287714,8.506736,-0.394546,-0.399950
1956,f454e660,SE-fcdf259d-0044-0,9,-0.369940,3.736061,-0.370248,-0.374656


In [18]:
# Define the Wilmink Lactation Curve function
def wilmink_lactation_curve(dim, a, b, c, d):
    dim = np.array(dim, dtype=float)
    return a + b * dim + c * np.exp(-d * dim)

# Function to directly refit the Wilmink Lactation Curve (Standard Process)
def refit_wilmink(cow_data):
    x_data = cow_data['DaysInMilk'].values
    y_data = cow_data['DailyYield'].values

    # Use initial guesses and bounds from the original fitting process
    initial_guesses = [np.mean(y_data), 0, np.mean(y_data) / 2, 0.1]
    bounds = ([-np.inf, -np.inf, -np.inf, 0], [np.inf, np.inf, np.inf, np.inf])

    popt, _ = curve_fit(wilmink_lactation_curve, x_data, y_data, p0=initial_guesses, bounds=bounds, maxfev=30000)
    
    # Calculate the expected yield with the refitted parameters
    cow_data['ExpectedYield'] = wilmink_lactation_curve(cow_data['DaysInMilk'], *popt)
    
    # Calculate new residuals
    cow_data['Residuals'] = cow_data['DailyYield'] - cow_data['ExpectedYield']
    
    return cow_data

# Function to add lagged variables for addressing autocorrelation
def add_lagged_variables(cow_data, max_lag=3):
    for lag in range(1, max_lag + 1):
        cow_data[f'lag_{lag}'] = cow_data['DailyYield'].shift(lag)
    return cow_data.dropna()

# Define the Robust Wilmink Lactation Curve function
def robust_wilmink_lactation_curve(dim, a, b, c, d, lag1, lag2, lag3):
    dim = np.array(dim, dtype=np.float64)
    days_in_milk = dim[0]
    lag_1 = dim[1]
    lag_2 = dim[2]
    lag_3 = dim[3]
    
    return a + b * days_in_milk + c * np.exp(-d * days_in_milk) + lag1 * lag_1 + lag2 * lag_2 + lag3 * lag_3

# Function to fit the robust Wilmink model
def fit_robust_wilmink(cow_data, lags=3):
    cow_data = add_lagged_variables(cow_data, max_lag=lags)
    
    x_data = cow_data[['DaysInMilk', 'lag_1', 'lag_2', 'lag_3']].values.T
    y_data = cow_data['DailyYield'].values
    
    initial_guesses = [np.mean(y_data), 0, np.mean(y_data) / 2, 0.1, 0, 0, 0]
    bounds = ([-np.inf, -np.inf, -np.inf, 0, -np.inf, -np.inf, -np.inf], 
              [np.inf, np.inf, np.inf, np.inf, np.inf, np.inf, np.inf])
    
    try:
        popt, _ = curve_fit(robust_wilmink_lactation_curve, x_data, y_data, p0=initial_guesses, bounds=bounds, maxfev=50000)
        cow_data.loc[:, 'ExpectedYield'] = robust_wilmink_lactation_curve(x_data, *popt)
        cow_data.loc[:, 'Residuals'] = cow_data['DailyYield'] - cow_data['ExpectedYield']
    except RuntimeError as e:
        print(f"Curve fitting failed: {e}")
        cow_data.loc[:, 'ExpectedYield'] = np.nan
        cow_data.loc[:, 'Residuals'] = np.nan
    
    return cow_data

# Function to add lagged variables for addressing autocorrelation
def add_lagged_variables(cow_data, max_lag=3):
    for lag in range(1, max_lag + 1):
        cow_data[f'lag_{lag}'] = cow_data['DailyYield'].shift(lag)
    return cow_data.dropna()

# Example usage: Applying the robust model to flagged cases
for index, row in flagged_df.iterrows():
    se_number = row['SE_Number']
    lactation_number = row['LactationNumber']
    
    cow_data = data_cleaned[(data_cleaned['SE_Number'] == se_number) & 
                            (data_cleaned['LactationNumber'] == lactation_number)].copy()
    
    if abs(row['ACF[1]']) > 0.2:  # Significant autocorrelation
        cow_data = add_lagged_variables(cow_data, max_lag=3)
        cow_data_refitted = fit_robust_wilmink(cow_data, lags=3)
        data_cleaned.update(cow_data_refitted)
    else:
        cow_data_refitted = refit_wilmink(cow_data)
        data_cleaned.update(cow_data_refitted)

data_cleaned['NormalizedDailyYield'] = data_cleaned['DailyYield'] / data_cleaned['ExpectedYield']
data_cleaned['NormalizedDailyYieldChange'] = data_cleaned['DailyYieldChange'] / data_cleaned['ExpectedYield']

data_cleaned

  popt, _ = curve_fit(robust_wilmink_lactation_curve, x_data, y_data, p0=initial_guesses, bounds=bounds, maxfev=50000)


Curve fitting failed: Optimal parameters not found: The maximum number of function evaluations is exceeded.


  popt, _ = curve_fit(robust_wilmink_lactation_curve, x_data, y_data, p0=initial_guesses, bounds=bounds, maxfev=50000)


Unnamed: 0,SE_Number,FarmName_Pseudo,Date,DailyYield,PreviousDailyYield,DailyYieldChange,HW,MeanTemperature,MeanTHI_adj,DaysInMilk,...,Age,BreedName,LactationNumber,HeatLoad,CumulativeHeatLoad,HeatStress,ExpectedYield,NormalizedDailyYield,NormalizedDailyYieldChange,Residuals
0,SE-064c0cec-1189,a624fb9a,2022-01-01,30.77,0.000000,0.000000,0,-3.025000,28.012944,191,...,3095,02 SLB,7,-34.987056,0.000000,0,35.914865,0.856748,0.000000,-5.144865
1,SE-064c0cec-1189,a624fb9a,2022-01-02,48.22,30.770000,8.725000,0,-0.279167,32.898193,192,...,3096,02 SLB,7,-30.101807,0.000000,0,35.799613,1.346942,0.243718,12.420387
2,SE-064c0cec-1189,a624fb9a,2022-01-03,30.53,39.495000,-2.988333,0,2.033333,36.760487,193,...,3097,02 SLB,7,-26.239513,0.000000,0,35.684360,0.855557,-0.083744,-5.154360
3,SE-064c0cec-1189,a624fb9a,2022-01-04,42.26,36.506667,1.438333,0,0.066667,31.939524,194,...,3098,02 SLB,7,-31.060476,0.000000,0,35.569108,1.188110,0.040438,6.690892
4,SE-064c0cec-1189,a624fb9a,2022-01-05,38.49,37.945000,0.109000,0,-3.700000,26.498206,195,...,3099,02 SLB,7,-36.501794,0.000000,0,35.453856,1.085637,0.003074,3.036144
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
483102,SE-fcdf259d-0044-0,f454e660,2023-06-08,15.14,13.252000,0.494000,0,15.570833,59.383267,352,...,4155,41 Fjällko,10,-3.616733,0.000000,0,12.479001,1.213238,0.039587,2.660999
483103,SE-fcdf259d-0044-0,f454e660,2023-06-09,7.47,13.746000,-2.968000,0,13.254167,54.534255,353,...,4156,41 Fjällko,10,-8.465745,0.000000,0,12.383706,0.603212,-0.239670,-4.913706
483104,SE-fcdf259d-0044-0,f454e660,2023-06-10,14.73,10.778000,0.378000,0,13.258333,54.082367,354,...,4157,41 Fjällko,10,-8.917633,0.000000,0,12.288410,1.198690,0.030761,2.441590
483105,SE-fcdf259d-0044-0,f454e660,2023-06-12,12.27,11.156000,0.560000,0,15.820833,62.015093,356,...,4159,41 Fjällko,10,-0.984907,0.000000,0,12.097818,1.014232,0.046289,0.172182


In [19]:
# Define the thresholds
mean_residual_threshold = 0.05
std_residual_threshold = 5.0
acf_threshold = 0.2
pacf_threshold = 0.2

# List to collect flagged combinations
flagged_combinations = []

for farm_name, farm_group in data_cleaned.groupby('FarmName_Pseudo'):
    for (se_number, lactation_number), cow_group in farm_group.groupby(['SE_Number', 'LactationNumber']):
        residuals = cow_group['Residuals'].dropna()
        
        if len(residuals) > 1:  # Ensure there are residuals to analyze
            mean_residuals = residuals.mean()
            std_residuals = residuals.std()
            acf_values = acf(residuals, nlags=30, fft=False)
            pacf_values = pacf(residuals, nlags=min(30, len(residuals)//2))

            # Check against thresholds
            if (abs(mean_residuals) > mean_residual_threshold or 
                std_residuals > std_residual_threshold or 
                abs(acf_values[1]) > acf_threshold or 
                abs(pacf_values[1]) > pacf_threshold):
                
                # Collect the combination if it exceeds any threshold
                flagged_combinations.append({
                    'Farm': farm_name,
                    'SE_Number': se_number,
                    'LactationNumber': lactation_number,
                    'Mean Residuals': float(f"{mean_residuals:.6f}"),  # Convert to regular decimal form
                    'Std Residuals': std_residuals,
                    'ACF[1]': acf_values[1],
                    'PACF[1]': pacf_values[1]
                })

# Convert to a DataFrame for easier inspection
flagged_df = pd.DataFrame(flagged_combinations)
flagged_df

Unnamed: 0,Farm,SE_Number,LactationNumber,Mean Residuals,Std Residuals,ACF[1],PACF[1]
0,5c06d92d,SE-5c06d92d-2055,7,0.129817,7.646087,-0.008971,-0.008996
1,5c06d92d,SE-5c06d92d-2058,8,-0.172312,6.998856,-0.012969,-0.013007
2,5c06d92d,SE-5c06d92d-2211,7,-0.142066,7.138250,-0.030489,-0.030630
3,5c06d92d,SE-5c06d92d-2246,6,-0.676966,9.827048,0.129483,0.130232
4,5c06d92d,SE-5c06d92d-2254,7,-0.132351,10.386472,0.002686,0.002697
...,...,...,...,...,...,...,...
1162,f454e660,SE-f454e660-712,1,0.089389,4.922108,0.005358,0.005376
1163,f454e660,SE-f454e660-735,1,0.000055,5.653616,-0.161290,-0.161846
1164,f454e660,SE-f454e660-735,2,0.093748,7.629328,-0.022853,-0.023166
1165,f454e660,SE-fcdf259d-0044-0,9,-0.080118,3.412328,-0.001702,-0.001722


In [20]:
def remove_outliers(data, threshold=3.5):
    # Calculate z-scores of residuals
    data = data.copy()  # Create a copy to avoid the SettingWithCopyWarning
    data['z_score'] = (data['Residuals'] - data['Residuals'].mean()) / data['Residuals'].std()
    
    # Identify the number of outliers
    num_outliers = (data['z_score'].abs() >= threshold).sum()
    print(f"Number of outliers detected: {num_outliers}")
    
    # Remove rows where the z-score of the residual is greater than the threshold
    cleaned_data = data.loc[(data['z_score'].abs() < threshold)].drop(columns=['z_score'])
    
    # Print the number of rows before and after
    print(f"Number of rows before outlier removal: {len(data)}")
    print(f"Number of rows after outlier removal: {len(cleaned_data)}")
    
    return cleaned_data

# Apply to flagged cases
for index, row in flagged_df.iterrows():
    se_number = row['SE_Number']
    lactation_number = row['LactationNumber']
    
    # Select the cow data for the specific SE_Number and LactationNumber
    cow_data = data_cleaned.loc[(data_cleaned['SE_Number'] == se_number) & 
                                (data_cleaned['LactationNumber'] == lactation_number)]
    
    # Remove outliers
    cow_data_trimmed = remove_outliers(cow_data, threshold=3.5)
    
    # Recalculate the residuals and update the dataset
    cow_data_trimmed['Residuals'] = cow_data_trimmed['DailyYield'] - cow_data_trimmed['ExpectedYield']
    
    # Remove the old data for this cow from data_cleaned
    data_cleaned = data_cleaned.loc[~((data_cleaned['SE_Number'] == se_number) & 
                                      (data_cleaned['LactationNumber'] == lactation_number))]
    
    # Append the cleaned data back to data_cleaned
    data_cleaned = pd.concat([data_cleaned, cow_data_trimmed], ignore_index=True)

Number of outliers detected: 4
Number of rows before outlier removal: 371
Number of rows after outlier removal: 367
Number of outliers detected: 4
Number of rows before outlier removal: 341
Number of rows after outlier removal: 337
Number of outliers detected: 3
Number of rows before outlier removal: 217
Number of rows after outlier removal: 214
Number of outliers detected: 0
Number of rows before outlier removal: 174
Number of rows after outlier removal: 174
Number of outliers detected: 0
Number of rows before outlier removal: 240
Number of rows after outlier removal: 240
Number of outliers detected: 1
Number of rows before outlier removal: 118
Number of rows after outlier removal: 117
Number of outliers detected: 4
Number of rows before outlier removal: 362
Number of rows after outlier removal: 358
Number of outliers detected: 0
Number of rows before outlier removal: 340
Number of rows after outlier removal: 340
Number of outliers detected: 3
Number of rows before outlier removal: 42

In [21]:
data_cleaned

Unnamed: 0,SE_Number,FarmName_Pseudo,Date,DailyYield,PreviousDailyYield,DailyYieldChange,HW,MeanTemperature,MeanTHI_adj,DaysInMilk,...,Age,BreedName,LactationNumber,HeatLoad,CumulativeHeatLoad,HeatStress,ExpectedYield,NormalizedDailyYield,NormalizedDailyYieldChange,Residuals
0,SE-064c0cec-1189,a624fb9a,2022-05-28,15.22,0.0000,0.0000,0,9.912500,50.478673,3,...,3242,02 SLB,8,-12.521327,0.000000,0,13.399562,1.135858,0.000000,1.820438
1,SE-064c0cec-1189,a624fb9a,2022-05-29,18.96,15.2200,1.8700,0,10.066667,53.841648,4,...,3243,02 SLB,8,-9.158352,0.000000,0,17.370255,1.091521,0.107655,1.589745
2,SE-064c0cec-1189,a624fb9a,2022-05-30,22.64,17.0900,1.8500,0,10.466667,52.935959,5,...,3244,02 SLB,8,-10.064041,0.000000,0,20.745402,1.091326,0.089176,1.894598
3,SE-064c0cec-1189,a624fb9a,2022-05-31,26.49,18.9400,1.8875,0,11.183333,52.872112,6,...,3245,02 SLB,8,-10.127888,0.000000,0,23.612557,1.121861,0.079936,2.877443
4,SE-064c0cec-1189,a624fb9a,2022-06-01,33.61,20.8275,2.5565,0,12.704167,56.056547,7,...,3246,02 SLB,8,-6.943453,0.000000,0,26.046402,1.290389,0.098152,7.563598
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
465315,SE-fcdf259d-0044-0,f454e660,2023-06-08,15.14,13.2520,0.4940,0,15.570833,59.383267,352,...,4155,41 Fjällko,10,-3.616733,0.000000,0,12.479001,1.213238,0.039587,2.660999
465316,SE-fcdf259d-0044-0,f454e660,2023-06-09,7.47,13.7460,-2.9680,0,13.254167,54.534255,353,...,4156,41 Fjällko,10,-8.465745,0.000000,0,12.383706,0.603212,-0.239670,-4.913706
465317,SE-fcdf259d-0044-0,f454e660,2023-06-10,14.73,10.7780,0.3780,0,13.258333,54.082367,354,...,4157,41 Fjällko,10,-8.917633,0.000000,0,12.288410,1.198690,0.030761,2.441590
465318,SE-fcdf259d-0044-0,f454e660,2023-06-12,12.27,11.1560,0.5600,0,15.820833,62.015093,356,...,4159,41 Fjällko,10,-0.984907,0.000000,0,12.097818,1.014232,0.046289,0.172182


In [22]:
# Reorder columns
new_order = [
    "Date", "FarmName_Pseudo", "SE_Number", "Age", "BreedName", "LactationNumber", "DaysInMilk",'YearSeason', "DailyYield", "PreviousDailyYield", 
    "DailyYieldChange", "ExpectedYield", "NormalizedDailyYield", 
    "NormalizedDailyYieldChange", "Residuals", "HeatStress", "Temp15Threshold", "HW", 
    "cum_HW", "MeanTemperature", "MeanTHI_adj", "HeatLoad", "CumulativeHeatLoad"
]
data_cleaned = data_cleaned[new_order]
data_cleaned.head()

Unnamed: 0,Date,FarmName_Pseudo,SE_Number,Age,BreedName,LactationNumber,DaysInMilk,YearSeason,DailyYield,PreviousDailyYield,...,NormalizedDailyYieldChange,Residuals,HeatStress,Temp15Threshold,HW,cum_HW,MeanTemperature,MeanTHI_adj,HeatLoad,CumulativeHeatLoad
0,2022-05-28,a624fb9a,SE-064c0cec-1189,3242,02 SLB,8,3,2022-2,15.22,0.0,...,0.0,1.820438,0,0,0,0,9.9125,50.478673,-12.521327,0.0
1,2022-05-29,a624fb9a,SE-064c0cec-1189,3243,02 SLB,8,4,2022-2,18.96,15.22,...,0.107655,1.589745,0,0,0,0,10.066667,53.841648,-9.158352,0.0
2,2022-05-30,a624fb9a,SE-064c0cec-1189,3244,02 SLB,8,5,2022-2,22.64,17.09,...,0.089176,1.894598,0,1,0,0,10.466667,52.935959,-10.064041,0.0
3,2022-05-31,a624fb9a,SE-064c0cec-1189,3245,02 SLB,8,6,2022-2,26.49,18.94,...,0.079936,2.877443,0,0,0,0,11.183333,52.872112,-10.127888,0.0
4,2022-06-01,a624fb9a,SE-064c0cec-1189,3246,02 SLB,8,7,2022-3,33.61,20.8275,...,0.098152,7.563598,0,1,0,0,12.704167,56.056547,-6.943453,0.0


In [23]:
# Check if NormalizedDailyYield is centered around 1 for each unique farm
print("Mean of NormalizedDailyYield:", data_cleaned.groupby('FarmName_Pseudo')['NormalizedDailyYield'].mean())
print("Standard Deviation of NormalizedDailyYield:", data_cleaned.groupby('FarmName_Pseudo')['NormalizedDailyYield'].std())

Mean of NormalizedDailyYield: FarmName_Pseudo
5c06d92d    1.002293
752efd72    1.003195
a624fb9a    1.000694
f454e660    1.000827
Name: NormalizedDailyYield, dtype: float64
Standard Deviation of NormalizedDailyYield: FarmName_Pseudo
5c06d92d    0.128028
752efd72    0.109442
a624fb9a    0.171171
f454e660    0.229254
Name: NormalizedDailyYield, dtype: float64


In [24]:
# Save the reordered DataFrame to a CSV file
data_cleaned.to_csv('../Data/MergedData/HeatApproachYieldDataTest.csv', index=False)