In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from scipy.optimize import curve_fit, OptimizeWarning
from tqdm import tqdm
import warnings
from scipy.stats import zscore
from statsmodels.tsa.stattools import acf, pacf
from scipy.optimize import minimize


sns.set_theme()
sns.set_context("notebook")
%load_ext autoreload
%autoreload 2

In [2]:
dtype_dict = {
    'FarmName_Pseudo': 'str',
    'SE_Number': 'str',
    'AnimalNumber': 'Int64',          
    'StartDate': 'str',
    'StartTime': 'str',
    'DateTime': 'str',
    'LactationNumber': 'Int64',       
    'DaysInMilk': 'Int64', 
    'YearSeason': 'str',           
    'TotalYield': 'float',
    'DateTime': 'str',
    'BreedName': 'str',
    'Age': 'Int64',
    'Mother': 'str',
    'Father': 'str',
    'CullDecisionDate': 'str',
    'Temperature': 'float',
    'RelativeHumidity': 'float',      
    'THI_adj': 'float',
    'HW': 'Int64',                    
    'cum_HW': 'Int64',                
    'Temp15Threshold': 'Int64'        
}


# Load the CSV with specified dtypes
data = pd.read_csv('../Data/MergedData/CleanedYieldData.csv', dtype=dtype_dict)

# Convert date and time columns back to datetime and time objects
data['DateTime'] = pd.to_datetime(data['DateTime'], errors='coerce')
data['StartTime'] = pd.to_datetime(data['StartTime'], format='%H:%M:%S', errors='coerce').dt.time
data['StartDate'] = pd.to_datetime(data['StartDate'], errors='coerce')
data['CullDecisionDate'] = pd.to_datetime(data['CullDecisionDate'], errors='coerce')
data['DateTime'] = pd.to_datetime(data['DateTime'], errors='coerce')
data.head()

Unnamed: 0,FarmName_Pseudo,SE_Number,AnimalNumber,StartDate,StartTime,LactationNumber,DaysInMilk,TotalYield,DateTime,YearSeason,...,Mother,Father,CullDecisionDate,Temperature,RelativeHumidity,THI_adj,HW,cum_HW,Temp15Threshold,Age
0,a624fb9a,SE-064c0cec-1189,5189,2022-01-01,06:25:00,7,191,13.9,2022-01-01 06:25:00,2022-1,...,,,2022-12-20,-3.025,0.930917,28.012944,0,0,0,3095
1,a624fb9a,SE-064c0cec-1189,5189,2022-01-01,16:41:00,7,191,16.87,2022-01-01 16:41:00,2022-1,...,,,2022-12-20,-3.025,0.930917,28.012944,0,0,0,3095
2,a624fb9a,SE-064c0cec-1189,5189,2022-01-02,15:29:00,7,192,20.41,2022-01-02 15:29:00,2022-1,...,,,2022-12-20,-0.279167,0.990542,32.898193,0,0,0,3096
3,a624fb9a,SE-064c0cec-1189,5189,2022-01-02,22:44:00,7,192,11.53,2022-01-02 22:44:00,2022-1,...,,,2022-12-20,-0.279167,0.990542,32.898193,0,0,0,3096
4,a624fb9a,SE-064c0cec-1189,5189,2022-01-02,03:31:00,7,192,16.28,2022-01-02 03:31:00,2022-1,...,,,2022-12-20,-0.279167,0.990542,32.898193,0,0,0,3096


In [3]:
# Calculate the DailyYield for each cow each day
data['DailyYield'] = data.groupby(['SE_Number', 'StartDate'])['TotalYield'].transform('sum')

# Sort the data by AnimalNumber and StartDate
data.sort_values(['AnimalNumber', 'StartDate'], inplace=True)

# Calculate the previous day's total yield for each cow
data['PreviousDailyYield'] = data.groupby('AnimalNumber')['DailyYield'].shift(1)

# Calculate the daily yield change for each cow
data['DailyYieldChange'] = data['DailyYield'] - data['PreviousDailyYield']

# Group and aggregate data
data = data.groupby(['SE_Number', 'FarmName_Pseudo', 'StartDate']).agg({
    'DailyYield': 'first',
    'PreviousDailyYield': 'first',
    'DailyYieldChange': 'first',
    'HW': 'max',
    'Temperature': 'mean',
    'THI_adj': 'mean',
    'DaysInMilk': 'first',
    'YearSeason': 'first',
    'cum_HW': 'max',
    'Temp15Threshold': 'max',
    'Age': 'first',
    'BreedName': 'first',
    'LactationNumber': 'first'
}).reset_index()

# Renaming and formatting
data.rename(columns={
    'Temperature': 'MeanTemperature',
    'THI_adj': 'MeanTHI_adj',
    'StartDate': 'Date'
}, inplace=True)
data['Date'] = pd.to_datetime(data['Date'])

# Display the first few rows of the transformed data
data.head()

Unnamed: 0,SE_Number,FarmName_Pseudo,Date,DailyYield,PreviousDailyYield,DailyYieldChange,HW,MeanTemperature,MeanTHI_adj,DaysInMilk,YearSeason,cum_HW,Temp15Threshold,Age,BreedName,LactationNumber
0,SE-064c0cec-1189,a624fb9a,2022-01-01,30.77,30.77,0.0,0,-3.025,28.012944,191,2022-1,0,0,3095,02 SLB,7
1,SE-064c0cec-1189,a624fb9a,2022-01-02,48.22,30.77,17.45,0,-0.279167,32.898193,192,2022-1,0,0,3096,02 SLB,7
2,SE-064c0cec-1189,a624fb9a,2022-01-03,30.53,48.22,-17.69,0,2.033333,36.760487,193,2022-1,0,0,3097,02 SLB,7
3,SE-064c0cec-1189,a624fb9a,2022-01-04,42.26,30.53,11.73,0,0.066667,31.939524,194,2022-1,0,0,3098,02 SLB,7
4,SE-064c0cec-1189,a624fb9a,2022-01-05,38.49,42.26,-3.77,0,-3.7,26.498206,195,2022-1,0,0,3099,02 SLB,7


In [4]:
# Check if DailyYield is centered around approx the same for each farm
print("Mean of DailyYield:", data.groupby('FarmName_Pseudo')['DailyYield'].mean())
print("Standard Deviation of DailyYield:", data.groupby('FarmName_Pseudo')['DailyYield'].std())

Mean of DailyYield: FarmName_Pseudo
5c06d92d    37.322718
752efd72    31.412607
a624fb9a    34.164215
f454e660    30.811276
Name: DailyYield, dtype: float64
Standard Deviation of DailyYield: FarmName_Pseudo
5c06d92d     9.854998
752efd72     7.760655
a624fb9a    11.417583
f454e660    11.923900
Name: DailyYield, dtype: float64


## Wilmink Lactation Curve
$$
Y(t) = a + bt + c \exp(-dt)
$$
- \(Y(t)\): Milk yield at time \(t\) post-calving, so t = DaysInMilk
- \(a\): Intercept, representing baseline milk yield
- \(b\): Linear increase rate of milk yield over time
- \(c\): Initial exponential increase in milk yield
- \(d\): Rate at which the exponential increase declines over time

The Wilmink model captures the lactation curve by considering both linear and exponential components, providing a flexible representation of milk production dynamics over the lactation period.

In [5]:
# Define the Wilmink Lactation Curve function
def wilmink_lactation_curve(dim, a, b, c, d):
    dim = np.array(dim, dtype=float)
    return a + b * dim + c * np.exp(-d * dim)

# Function to detect and remove outliers
def remove_outliers(group, threshold=3.5):
    mean = np.mean(group['DailyYield'])
    std_dev = np.std(group['DailyYield'])
    return group[(group['DailyYield'] > mean - threshold * std_dev) & (group['DailyYield'] < mean + threshold * std_dev)]

# Function to smooth the data using a rolling average
def smooth_data(group, window=5):
    group = group.copy()
    group['DailyYield'] = group['DailyYield'].rolling(window, min_periods=1).mean()
    return group

# Function to fit the Wilmink Lactation Curve to the dataset
def fit_wilmink_lactation_curve(dataset):
    # Initialize the 'ExpectedYield' column to NaN
    dataset['ExpectedYield'] = np.nan
    params_dict = {}
    
    valid_indices = []

    # Group the dataset by 'SE_Number' and 'LactationNumber' and fit the curve for each segment
    for (animal_number, lactation_number), group in tqdm(dataset.groupby(['SE_Number', 'LactationNumber']), unit=" Segments"):
        # Prepare the data for fitting
        group = remove_outliers(group, threshold=3.5)  # Remove outliers with threshold 4
        group = smooth_data(group)  # Smooth the data
        x_data = group['DaysInMilk'].values
        y_data = group['DailyYield'].values
        
        # Ensure there are no NaN or infinite values in the data
        if not np.isfinite(x_data).all() or not np.isfinite(y_data).all():
            print(f"Non-finite values found for cow {animal_number}, lactation {lactation_number}, skipping.")
            continue
        
        # Ensure there are enough data points to fit the curve
        if len(x_data) < 10 or len(y_data) < 10:
            print(f"Insufficient data points for cow {animal_number}, lactation {lactation_number}, skipping.")
            continue

        valid_indices.extend(group.index)
        
        # Fit the model
        try:
            # Initial parameter guesses
            initial_guesses = [np.mean(y_data), 0, np.mean(y_data) / 2, 0.1]
            # Bounds on the parameters to prevent overflow
            bounds = ([-np.inf, -np.inf, -np.inf, 0], [np.inf, np.inf, np.inf, np.inf])
            
            with warnings.catch_warnings():
                warnings.filterwarnings('error', category=OptimizeWarning)
                try:
                    popt, pcov = curve_fit(
                        wilmink_lactation_curve, x_data, y_data,
                        p0=initial_guesses, bounds=bounds, maxfev=30000
                    )
                    
                    # Store the parameters in the dictionary
                    params_dict[(animal_number, lactation_number)] = {'a': popt[0], 'b': popt[1], 'c': popt[2], 'd': popt[3]}
                    
                    # Predict the expected yield using the fitted model
                    dataset.loc[group.index, 'ExpectedYield'] = wilmink_lactation_curve(group['DaysInMilk'], *popt)
                    
                    # Normalize the DailyYield
                    dataset.loc[group.index, 'NormalizedDailyYield'] = group['DailyYield'] / dataset.loc[group.index, 'ExpectedYield']
                    
                    # Calculate the daily yield change and normalize it
                    dataset.loc[group.index, 'PreviousDailyYield'] = group['DailyYield'].shift(1)
                    dataset.loc[group.index, 'DailyYieldChange'] = group['DailyYield'] - dataset.loc[group.index, 'PreviousDailyYield']
                    dataset.loc[group.index, 'NormalizedDailyYieldChange'] = dataset.loc[group.index, 'DailyYieldChange'] / dataset.loc[group.index, 'ExpectedYield']
                
                except OptimizeWarning:
                    print(f"OptimizeWarning for cow {animal_number}, lactation {lactation_number}, skipping.")
            
        except RuntimeError as e:
            print(f"Curve fit failed for cow {animal_number}, lactation {lactation_number}: {e}")
        except ValueError as e:
            print(f"Value error for cow {animal_number}, lactation {lactation_number}: {e}")
    
    # Keep only valid indices
    dataset = dataset.loc[valid_indices].reset_index(drop=True)
    
    # Fill any NaN values in the newly created columns with 0
    dataset['ExpectedYield'] = dataset['ExpectedYield'].fillna(0)
    dataset['NormalizedDailyYield'] = dataset['NormalizedDailyYield'].fillna(0)
    dataset['PreviousDailyYield'] = dataset['PreviousDailyYield'].fillna(0)
    dataset['DailyYieldChange'] = dataset['DailyYieldChange'].fillna(0)
    dataset['NormalizedDailyYieldChange'] = dataset['NormalizedDailyYieldChange'].fillna(0)
    
    return dataset, params_dict

# Apply the curve fitting function to your dataset
data, params_dict = fit_wilmink_lactation_curve(data)
data

  4%|▍         | 108/2746 [00:06<02:06, 20.92 Segments/s]

Insufficient data points for cow SE-5c06d92d-2621, lactation 3, skipping.


  4%|▍         | 121/2746 [00:08<04:09, 10.51 Segments/s]

Insufficient data points for cow SE-5c06d92d-2639, lactation 3, skipping.


  8%|▊         | 212/2746 [00:16<03:31, 11.96 Segments/s]

Insufficient data points for cow SE-5c06d92d-2776, lactation 5, skipping.


 10%|▉         | 261/2746 [00:20<03:10, 13.07 Segments/s]

Insufficient data points for cow SE-5c06d92d-2815, lactation 2, skipping.


 10%|▉         | 268/2746 [00:22<06:05,  6.78 Segments/s]

Insufficient data points for cow SE-5c06d92d-2824, lactation 3, skipping.


 11%|█▏        | 310/2746 [00:25<02:16, 17.85 Segments/s]

Insufficient data points for cow SE-5c06d92d-2845, lactation 2, skipping.


 12%|█▏        | 316/2746 [00:25<03:03, 13.23 Segments/s]

Insufficient data points for cow SE-5c06d92d-2870, lactation 2, skipping.


 14%|█▍        | 382/2746 [00:26<00:43, 54.59 Segments/s]

Insufficient data points for cow SE-5c06d92d-2911, lactation 2, skipping.
Insufficient data points for cow SE-5c06d92d-2914, lactation 2, skipping.
Insufficient data points for cow SE-5c06d92d-2919, lactation 2, skipping.


 19%|█▉        | 517/2746 [00:37<01:48, 20.49 Segments/s]

Insufficient data points for cow SE-5c06d92d-3045, lactation 1, skipping.
Insufficient data points for cow SE-5c06d92d-3047, lactation 1, skipping.
Insufficient data points for cow SE-5c06d92d-3048, lactation 5, skipping.
Insufficient data points for cow SE-5c06d92d-3049, lactation 1, skipping.


 19%|█▉        | 527/2746 [00:38<01:48, 20.52 Segments/s]

Insufficient data points for cow SE-5c06d92d-3063, lactation 1, skipping.
Insufficient data points for cow SE-5c06d92d-3063, lactation 3, skipping.
Insufficient data points for cow SE-5c06d92d-3065, lactation 1, skipping.


 20%|█▉        | 543/2746 [00:38<01:43, 21.19 Segments/s]

Insufficient data points for cow SE-5c06d92d-3068, lactation 1, skipping.


 23%|██▎       | 620/2746 [00:39<00:36, 58.56 Segments/s]

Insufficient data points for cow SE-5c06d92d-3116, lactation 3, skipping.


 32%|███▏      | 865/2746 [00:49<00:18, 101.14 Segments/s]

Insufficient data points for cow SE-5c06d92d-3273, lactation 3, skipping.


 34%|███▎      | 922/2746 [00:50<00:18, 98.90 Segments/s] 

Insufficient data points for cow SE-5c06d92d-3288, lactation 3, skipping.
Insufficient data points for cow SE-5c06d92d-3310, lactation 3, skipping.
Insufficient data points for cow SE-5c06d92d-3327, lactation 3, skipping.


 42%|████▏     | 1149/2746 [00:53<00:15, 104.70 Segments/s]

Insufficient data points for cow SE-5c06d92d-3655, lactation 1, skipping.
Insufficient data points for cow SE-5c06d92d-3662, lactation 1, skipping.


 43%|████▎     | 1188/2746 [00:56<01:27, 17.87 Segments/s] 

Insufficient data points for cow SE-752efd72-0051, lactation 3, skipping.


 46%|████▌     | 1257/2746 [01:03<02:21, 10.52 Segments/s]

Insufficient data points for cow SE-752efd72-0117, lactation 2, skipping.


 46%|████▋     | 1276/2746 [01:03<01:36, 15.21 Segments/s]

Insufficient data points for cow SE-752efd72-0129, lactation 2, skipping.


 47%|████▋     | 1282/2746 [01:04<01:44, 13.94 Segments/s]

Insufficient data points for cow SE-752efd72-0136, lactation 2, skipping.


 48%|████▊     | 1306/2746 [01:05<00:55, 25.75 Segments/s]

Insufficient data points for cow SE-752efd72-0143, lactation 2, skipping.


 48%|████▊     | 1320/2746 [01:06<01:45, 13.52 Segments/s]

Insufficient data points for cow SE-752efd72-0166, lactation 1, skipping.


 50%|█████     | 1375/2746 [01:17<05:23,  4.24 Segments/s]

Insufficient data points for cow SE-752efd72-0196, lactation 5, skipping.


 53%|█████▎    | 1466/2746 [01:22<00:34, 36.94 Segments/s]

Insufficient data points for cow SE-752efd72-0232, lactation 1, skipping.
Insufficient data points for cow SE-752efd72-0234, lactation 1, skipping.
Insufficient data points for cow SE-752efd72-0239, lactation 1, skipping.
Insufficient data points for cow SE-752efd72-0243, lactation 1, skipping.


 57%|█████▋    | 1578/2746 [01:34<01:18, 14.94 Segments/s]

Insufficient data points for cow SE-752efd72-0289, lactation 4, skipping.
Insufficient data points for cow SE-752efd72-0298, lactation 1, skipping.


 58%|█████▊    | 1593/2746 [01:34<00:48, 23.82 Segments/s]

Insufficient data points for cow SE-752efd72-0312, lactation 4, skipping.


 59%|█████▉    | 1620/2746 [01:37<01:09, 16.19 Segments/s]

Insufficient data points for cow SE-752efd72-0317, lactation 1, skipping.
Insufficient data points for cow SE-752efd72-0322, lactation 4, skipping.


 59%|█████▉    | 1629/2746 [01:37<01:13, 15.10 Segments/s]

Insufficient data points for cow SE-752efd72-0329, lactation 1, skipping.


 64%|██████▎   | 1744/2746 [01:38<00:11, 89.19 Segments/s]

Insufficient data points for cow SE-752efd72-0369, lactation 1, skipping.


 65%|██████▌   | 1785/2746 [01:38<00:07, 123.79 Segments/s]

Insufficient data points for cow SE-752efd72-0394, lactation 3, skipping.
Insufficient data points for cow SE-752efd72-0409, lactation 3, skipping.
Insufficient data points for cow SE-752efd72-0411, lactation 3, skipping.


 69%|██████▉   | 1901/2746 [01:43<00:25, 32.88 Segments/s] 

Insufficient data points for cow SE-752efd72-0468, lactation 2, skipping.


 73%|███████▎  | 2005/2746 [01:46<00:12, 58.90 Segments/s]

Insufficient data points for cow SE-752efd72-0502, lactation 2, skipping.
Insufficient data points for cow SE-752efd72-0521, lactation 2, skipping.
Insufficient data points for cow SE-752efd72-0526, lactation 2, skipping.


 74%|███████▍  | 2042/2746 [01:47<00:11, 62.32 Segments/s]

Insufficient data points for cow SE-752efd72-0590, lactation 1, skipping.


 75%|███████▍  | 2056/2746 [01:47<00:14, 46.20 Segments/s]

Insufficient data points for cow SE-752efd72-0612, lactation 1, skipping.


 75%|███████▌  | 2067/2746 [01:48<00:17, 38.35 Segments/s]

Insufficient data points for cow SE-752efd72-0621, lactation 1, skipping.
Insufficient data points for cow SE-752efd72-0622, lactation 1, skipping.


 76%|███████▋  | 2095/2746 [01:50<00:40, 16.17 Segments/s]

Insufficient data points for cow SE-752efd72-2751, lactation 5, skipping.


 78%|███████▊  | 2143/2746 [01:58<00:44, 13.43 Segments/s]

Insufficient data points for cow SE-752efd72-2797, lactation 3, skipping.
Insufficient data points for cow SE-7fd04cd3-679, lactation 4, skipping.
Insufficient data points for cow SE-a624fb9a-1162, lactation 7, skipping.
Insufficient data points for cow SE-a624fb9a-1200, lactation 4, skipping.


 79%|███████▉  | 2166/2746 [02:02<01:43,  5.61 Segments/s]

Insufficient data points for cow SE-a624fb9a-1251, lactation 3, skipping.


 80%|████████  | 2197/2746 [02:03<00:35, 15.27 Segments/s]

Insufficient data points for cow SE-a624fb9a-1267, lactation 3, skipping.


 81%|████████  | 2220/2746 [02:07<01:29,  5.85 Segments/s]

Insufficient data points for cow SE-a624fb9a-1312, lactation 2, skipping.


 81%|████████▏ | 2235/2746 [02:08<00:57,  8.88 Segments/s]

Insufficient data points for cow SE-a624fb9a-1330, lactation 2, skipping.
Insufficient data points for cow SE-a624fb9a-1333, lactation 1, skipping.


 83%|████████▎ | 2291/2746 [02:10<00:19, 23.83 Segments/s]

Insufficient data points for cow SE-a624fb9a-1373, lactation 1, skipping.
Insufficient data points for cow SE-a624fb9a-1374, lactation 1, skipping.


 89%|████████▊ | 2431/2746 [02:23<00:05, 56.73 Segments/s]

Insufficient data points for cow SE-f454e660-0448, lactation 5, skipping.


 95%|█████████▍| 2598/2746 [02:31<00:05, 25.15 Segments/s]

Insufficient data points for cow SE-f454e660-509, lactation 3, skipping.
Insufficient data points for cow SE-f454e660-510, lactation 2, skipping.


 96%|█████████▌| 2638/2746 [02:35<00:07, 14.84 Segments/s]

Insufficient data points for cow SE-f454e660-551, lactation 1, skipping.
Insufficient data points for cow SE-f454e660-559, lactation 1, skipping.
Insufficient data points for cow SE-f454e660-567, lactation 1, skipping.


 97%|█████████▋| 2667/2746 [02:37<00:03, 19.77 Segments/s]

Insufficient data points for cow SE-f454e660-585, lactation 1, skipping.


100%|██████████| 2746/2746 [02:40<00:00, 17.08 Segments/s]

Insufficient data points for cow SE-f454e660-729, lactation 1, skipping.





Unnamed: 0,SE_Number,FarmName_Pseudo,Date,DailyYield,PreviousDailyYield,DailyYieldChange,HW,MeanTemperature,MeanTHI_adj,DaysInMilk,YearSeason,cum_HW,Temp15Threshold,Age,BreedName,LactationNumber,ExpectedYield,NormalizedDailyYield,NormalizedDailyYieldChange
0,SE-064c0cec-1189,a624fb9a,2022-01-01,30.77,0.000000,0.000000,0,-3.025000,28.012944,191,2022-1,0,0,3095,02 SLB,7,35.914865,0.856748,0.000000
1,SE-064c0cec-1189,a624fb9a,2022-01-02,48.22,30.770000,8.725000,0,-0.279167,32.898193,192,2022-1,0,0,3096,02 SLB,7,35.799613,1.103224,0.243718
2,SE-064c0cec-1189,a624fb9a,2022-01-03,30.53,39.495000,-2.988333,0,2.033333,36.760487,193,2022-1,0,0,3097,02 SLB,7,35.684360,1.023044,-0.083744
3,SE-064c0cec-1189,a624fb9a,2022-01-04,42.26,36.506667,1.438333,0,0.066667,31.939524,194,2022-1,0,0,3098,02 SLB,7,35.569108,1.066796,0.040438
4,SE-064c0cec-1189,a624fb9a,2022-01-05,38.49,37.945000,0.109000,0,-3.700000,26.498206,195,2022-1,0,0,3099,02 SLB,7,35.453856,1.073339,0.003074
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
654671,SE-fcdf259d-0044-0,f454e660,2023-06-08,15.14,13.252000,0.494000,0,15.570833,59.383267,352,2023-3,0,1,4155,41 Fjällko,10,13.149489,1.045364,0.037568
654672,SE-fcdf259d-0044-0,f454e660,2023-06-09,7.47,13.746000,-2.968000,0,13.254167,54.534255,353,2023-3,0,1,4156,41 Fjällko,10,13.057668,0.825415,-0.227299
654673,SE-fcdf259d-0044-0,f454e660,2023-06-10,14.73,10.778000,0.378000,0,13.258333,54.082367,354,2023-3,0,1,4157,41 Fjällko,10,12.965846,0.860414,0.029154
654674,SE-fcdf259d-0044-0,f454e660,2023-06-12,12.27,11.156000,0.560000,0,15.820833,62.015093,356,2023-3,0,1,4159,41 Fjällko,10,12.782204,0.916587,0.043811


In [6]:
# Check if NormalizedDailyYield is centered around 1 for each unique farm
print("Mean of NormalizedDailyYield:", data.groupby('FarmName_Pseudo')['NormalizedDailyYield'].mean())
print("Standard Deviation of NormalizedDailyYield:", data.groupby('FarmName_Pseudo')['NormalizedDailyYield'].std())

Mean of NormalizedDailyYield: FarmName_Pseudo
5c06d92d    1.000023
752efd72    1.000081
a624fb9a    1.000075
f454e660    1.000228
Name: NormalizedDailyYield, dtype: float64
Standard Deviation of NormalizedDailyYield: FarmName_Pseudo
5c06d92d    0.107134
752efd72    0.074296
a624fb9a    0.165915
f454e660    0.136271
Name: NormalizedDailyYield, dtype: float64


In [7]:
# Define the THI threshold
THI_THRESHOLD = 61

# Calculate the daily heat load based on the THI threshold
data['HeatLoad'] = data['MeanTHI_adj'].apply(lambda x: x - THI_THRESHOLD if x > THI_THRESHOLD else -(THI_THRESHOLD - x))

# Initialize the cumulative heat load column with float type
data['CumulativeHeatLoad'] = 0.0  # Explicitly set as float

# Iterate through the data to calculate cumulative heat load correctly
for i in range(1, len(data)):
    previous_cumulative = data.at[i-1, 'CumulativeHeatLoad']
    current_heat_load = data.at[i, 'HeatLoad']
    
    if current_heat_load < 0:  # If current heat load is negative
        new_cumulative = previous_cumulative + 2 * current_heat_load
    else:
        new_cumulative = previous_cumulative + current_heat_load
    
    # Ensure the cumulative heat load never goes below zero
    if new_cumulative > 0:
        data.at[i, 'CumulativeHeatLoad'] = new_cumulative
    else:
        data.at[i, 'CumulativeHeatLoad'] = 0.0  # Ensure float is maintained

data.head(-5)

Unnamed: 0,SE_Number,FarmName_Pseudo,Date,DailyYield,PreviousDailyYield,DailyYieldChange,HW,MeanTemperature,MeanTHI_adj,DaysInMilk,...,cum_HW,Temp15Threshold,Age,BreedName,LactationNumber,ExpectedYield,NormalizedDailyYield,NormalizedDailyYieldChange,HeatLoad,CumulativeHeatLoad
0,SE-064c0cec-1189,a624fb9a,2022-01-01,30.77,0.000000,0.000000,0,-3.025000,28.012944,191,...,0,0,3095,02 SLB,7,35.914865,0.856748,0.000000,-32.987056,0.000000
1,SE-064c0cec-1189,a624fb9a,2022-01-02,48.22,30.770000,8.725000,0,-0.279167,32.898193,192,...,0,0,3096,02 SLB,7,35.799613,1.103224,0.243718,-28.101807,0.000000
2,SE-064c0cec-1189,a624fb9a,2022-01-03,30.53,39.495000,-2.988333,0,2.033333,36.760487,193,...,0,0,3097,02 SLB,7,35.684360,1.023044,-0.083744,-24.239513,0.000000
3,SE-064c0cec-1189,a624fb9a,2022-01-04,42.26,36.506667,1.438333,0,0.066667,31.939524,194,...,0,0,3098,02 SLB,7,35.569108,1.066796,0.040438,-29.060476,0.000000
4,SE-064c0cec-1189,a624fb9a,2022-01-05,38.49,37.945000,0.109000,0,-3.700000,26.498206,195,...,0,0,3099,02 SLB,7,35.453856,1.073339,0.003074,-34.501794,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
654666,SE-fcdf259d-0044-0,f454e660,2023-06-03,12.67,14.652000,-0.622000,0,12.666667,53.132530,347,...,0,1,4150,41 Fjällko,10,13.608593,1.030966,-0.045706,-7.867470,0.000000
654667,SE-fcdf259d-0044-0,f454e660,2023-06-04,22.31,14.030000,0.954000,0,13.079167,56.726870,348,...,0,1,4151,41 Fjällko,10,13.516773,1.108549,0.070579,-4.273130,0.000000
654668,SE-fcdf259d-0044-0,f454e660,2023-06-05,12.84,14.984000,-0.092000,0,14.237500,58.482418,349,...,0,1,4152,41 Fjällko,10,13.424952,1.109278,-0.006853,-2.517582,0.000000
654669,SE-fcdf259d-0044-0,f454e660,2023-06-06,9.47,14.892000,-0.284000,0,15.345833,60.546358,350,...,0,1,4153,41 Fjällko,10,13.333131,1.095617,-0.021300,-0.453642,0.000000


In [8]:
# When CumulativeHeatLoad is greater than 3, it indicates that the cow is under heat stress
data['HeatStress'] = (data['CumulativeHeatLoad'] > 3).astype(int)
data.head(-5)

Unnamed: 0,SE_Number,FarmName_Pseudo,Date,DailyYield,PreviousDailyYield,DailyYieldChange,HW,MeanTemperature,MeanTHI_adj,DaysInMilk,...,Temp15Threshold,Age,BreedName,LactationNumber,ExpectedYield,NormalizedDailyYield,NormalizedDailyYieldChange,HeatLoad,CumulativeHeatLoad,HeatStress
0,SE-064c0cec-1189,a624fb9a,2022-01-01,30.77,0.000000,0.000000,0,-3.025000,28.012944,191,...,0,3095,02 SLB,7,35.914865,0.856748,0.000000,-32.987056,0.000000,0
1,SE-064c0cec-1189,a624fb9a,2022-01-02,48.22,30.770000,8.725000,0,-0.279167,32.898193,192,...,0,3096,02 SLB,7,35.799613,1.103224,0.243718,-28.101807,0.000000,0
2,SE-064c0cec-1189,a624fb9a,2022-01-03,30.53,39.495000,-2.988333,0,2.033333,36.760487,193,...,0,3097,02 SLB,7,35.684360,1.023044,-0.083744,-24.239513,0.000000,0
3,SE-064c0cec-1189,a624fb9a,2022-01-04,42.26,36.506667,1.438333,0,0.066667,31.939524,194,...,0,3098,02 SLB,7,35.569108,1.066796,0.040438,-29.060476,0.000000,0
4,SE-064c0cec-1189,a624fb9a,2022-01-05,38.49,37.945000,0.109000,0,-3.700000,26.498206,195,...,0,3099,02 SLB,7,35.453856,1.073339,0.003074,-34.501794,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
654666,SE-fcdf259d-0044-0,f454e660,2023-06-03,12.67,14.652000,-0.622000,0,12.666667,53.132530,347,...,1,4150,41 Fjällko,10,13.608593,1.030966,-0.045706,-7.867470,0.000000,0
654667,SE-fcdf259d-0044-0,f454e660,2023-06-04,22.31,14.030000,0.954000,0,13.079167,56.726870,348,...,1,4151,41 Fjällko,10,13.516773,1.108549,0.070579,-4.273130,0.000000,0
654668,SE-fcdf259d-0044-0,f454e660,2023-06-05,12.84,14.984000,-0.092000,0,14.237500,58.482418,349,...,1,4152,41 Fjällko,10,13.424952,1.109278,-0.006853,-2.517582,0.000000,0
654669,SE-fcdf259d-0044-0,f454e660,2023-06-06,9.47,14.892000,-0.284000,0,15.345833,60.546358,350,...,1,4153,41 Fjällko,10,13.333131,1.095617,-0.021300,-0.453642,0.000000,0


In [9]:
# Make a dataframe from the parameters dictionary, it should contain Se_Number, LactationNumber, a, b, c, d
params_df = pd.DataFrame(params_dict).T.reset_index()
params_df.columns = ['SE_Number', 'LactationNumber', 'a', 'b', 'c', 'd']
params_df.head(-5)

Unnamed: 0,SE_Number,LactationNumber,a,b,c,d
0,SE-064c0cec-1189,7,57.928060,-0.115252,-6.063436,17.367795
1,SE-064c0cec-1189,8,41.195196,-0.080255,-44.398922,0.159012
2,SE-30dc5787-1389,5,115.924643,-0.343237,12.205804,0.099999
3,SE-30dc5787-1389,6,52.877694,-0.084404,-32.593599,0.093018
4,SE-30dc5787-1389,7,47.551089,-0.094727,-45.321451,0.203812
...,...,...,...,...,...,...
2662,SE-f454e660-780,1,19.385586,0.004089,-15.651796,0.078672
2663,SE-f454e660-787,1,36.029973,-0.007639,-13.181145,0.029809
2664,SE-f454e660-788,1,20.085017,0.014708,-48.693344,0.238306
2665,SE-f454e660-789,1,54.011233,-0.069178,-33.117659,0.015508


In [10]:
# Calculate Z-scores for each parameter
params_df['z_a'] = zscore(params_df['a'])
params_df['z_b'] = zscore(params_df['b'])
params_df['z_c'] = zscore(params_df['c'])
params_df['z_d'] = zscore(params_df['d'])

params_df.head(-5)

Unnamed: 0,SE_Number,LactationNumber,a,b,c,d,z_a,z_b,z_c,z_d
0,SE-064c0cec-1189,7,57.928060,-0.115252,-6.063436,17.367795,-0.069572,0.071634,-0.02277,1.483761
1,SE-064c0cec-1189,8,41.195196,-0.080255,-44.398922,0.159012,-0.069709,0.074625,-0.02277,-0.206574
2,SE-30dc5787-1389,5,115.924643,-0.343237,12.205804,0.099999,-0.069097,0.052148,-0.02277,-0.212371
3,SE-30dc5787-1389,6,52.877694,-0.084404,-32.593599,0.093018,-0.069613,0.074271,-0.02277,-0.213056
4,SE-30dc5787-1389,7,47.551089,-0.094727,-45.321451,0.203812,-0.069657,0.073388,-0.02277,-0.202174
...,...,...,...,...,...,...,...,...,...,...
2662,SE-f454e660-780,1,19.385586,0.004089,-15.651796,0.078672,-0.069887,0.081834,-0.02277,-0.214466
2663,SE-f454e660-787,1,36.029973,-0.007639,-13.181145,0.029809,-0.069751,0.080832,-0.02277,-0.219265
2664,SE-f454e660-788,1,20.085017,0.014708,-48.693344,0.238306,-0.069882,0.082742,-0.02277,-0.198785
2665,SE-f454e660-789,1,54.011233,-0.069178,-33.117659,0.015508,-0.069604,0.075572,-0.02277,-0.220670


In [11]:
# Identify outliers (using Z-score > 3.5 or < -3.5 as threshold)
outliers = params_df[(np.abs(params_df[['z_a', 'z_b', 'z_c', 'z_d']]) > 3.5).any(axis=1)]

x = outliers.count()
print("Number of outliers:", x)

# Optionally, drop the outliers
params_df_cleaned = params_df.drop(outliers.index)
params_df_cleaned.head(-5)

Number of outliers: SE_Number          133
LactationNumber    133
a                  133
b                  133
c                  133
d                  133
z_a                133
z_b                133
z_c                133
z_d                133
dtype: int64


Unnamed: 0,SE_Number,LactationNumber,a,b,c,d,z_a,z_b,z_c,z_d
0,SE-064c0cec-1189,7,57.928060,-0.115252,-6.063436,17.367795,-0.069572,0.071634,-0.02277,1.483761
1,SE-064c0cec-1189,8,41.195196,-0.080255,-44.398922,0.159012,-0.069709,0.074625,-0.02277,-0.206574
2,SE-30dc5787-1389,5,115.924643,-0.343237,12.205804,0.099999,-0.069097,0.052148,-0.02277,-0.212371
3,SE-30dc5787-1389,6,52.877694,-0.084404,-32.593599,0.093018,-0.069613,0.074271,-0.02277,-0.213056
4,SE-30dc5787-1389,7,47.551089,-0.094727,-45.321451,0.203812,-0.069657,0.073388,-0.02277,-0.202174
...,...,...,...,...,...,...,...,...,...,...
2662,SE-f454e660-780,1,19.385586,0.004089,-15.651796,0.078672,-0.069887,0.081834,-0.02277,-0.214466
2663,SE-f454e660-787,1,36.029973,-0.007639,-13.181145,0.029809,-0.069751,0.080832,-0.02277,-0.219265
2664,SE-f454e660-788,1,20.085017,0.014708,-48.693344,0.238306,-0.069882,0.082742,-0.02277,-0.198785
2665,SE-f454e660-789,1,54.011233,-0.069178,-33.117659,0.015508,-0.069604,0.075572,-0.02277,-0.220670


In [12]:
# Identify unique SE_Number and LactationNumber combinations from the outliers
outlier_combinations = outliers[['SE_Number', 'LactationNumber']].drop_duplicates()

# Merge with the original data to find rows that match these outlier combinations
data_cleaned = data.merge(outlier_combinations, on=['SE_Number', 'LactationNumber'], how='left', indicator=True)

# Keep only the rows that do not match the outlier combinations
data_cleaned = data_cleaned[data_cleaned['_merge'] == 'left_only'].drop(columns=['_merge'])

# Now data_cleaned contains the original data with the outlier combinations removed
print("Number of rows removed:", len(data) - len(data_cleaned))
data_cleaned.head(-5)

Number of rows removed: 24896


Unnamed: 0,SE_Number,FarmName_Pseudo,Date,DailyYield,PreviousDailyYield,DailyYieldChange,HW,MeanTemperature,MeanTHI_adj,DaysInMilk,...,Temp15Threshold,Age,BreedName,LactationNumber,ExpectedYield,NormalizedDailyYield,NormalizedDailyYieldChange,HeatLoad,CumulativeHeatLoad,HeatStress
0,SE-064c0cec-1189,a624fb9a,2022-01-01,30.77,0.000000,0.000000,0,-3.025000,28.012944,191,...,0,3095,02 SLB,7,35.914865,0.856748,0.000000,-32.987056,0.000000,0
1,SE-064c0cec-1189,a624fb9a,2022-01-02,48.22,30.770000,8.725000,0,-0.279167,32.898193,192,...,0,3096,02 SLB,7,35.799613,1.103224,0.243718,-28.101807,0.000000,0
2,SE-064c0cec-1189,a624fb9a,2022-01-03,30.53,39.495000,-2.988333,0,2.033333,36.760487,193,...,0,3097,02 SLB,7,35.684360,1.023044,-0.083744,-24.239513,0.000000,0
3,SE-064c0cec-1189,a624fb9a,2022-01-04,42.26,36.506667,1.438333,0,0.066667,31.939524,194,...,0,3098,02 SLB,7,35.569108,1.066796,0.040438,-29.060476,0.000000,0
4,SE-064c0cec-1189,a624fb9a,2022-01-05,38.49,37.945000,0.109000,0,-3.700000,26.498206,195,...,0,3099,02 SLB,7,35.453856,1.073339,0.003074,-34.501794,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
654666,SE-fcdf259d-0044-0,f454e660,2023-06-03,12.67,14.652000,-0.622000,0,12.666667,53.132530,347,...,1,4150,41 Fjällko,10,13.608593,1.030966,-0.045706,-7.867470,0.000000,0
654667,SE-fcdf259d-0044-0,f454e660,2023-06-04,22.31,14.030000,0.954000,0,13.079167,56.726870,348,...,1,4151,41 Fjällko,10,13.516773,1.108549,0.070579,-4.273130,0.000000,0
654668,SE-fcdf259d-0044-0,f454e660,2023-06-05,12.84,14.984000,-0.092000,0,14.237500,58.482418,349,...,1,4152,41 Fjällko,10,13.424952,1.109278,-0.006853,-2.517582,0.000000,0
654669,SE-fcdf259d-0044-0,f454e660,2023-06-06,9.47,14.892000,-0.284000,0,15.345833,60.546358,350,...,1,4153,41 Fjällko,10,13.333131,1.095617,-0.021300,-0.453642,0.000000,0


In [13]:
# Check if NormalizedDailyYield is centered around 1 for each unique farm
print("Mean of NormalizedDailyYield:", data_cleaned.groupby('FarmName_Pseudo')['NormalizedDailyYield'].mean())
print("Standard Deviation of NormalizedDailyYield:", data_cleaned.groupby('FarmName_Pseudo')['NormalizedDailyYield'].std())

Mean of NormalizedDailyYield: FarmName_Pseudo
5c06d92d    1.000037
752efd72    1.000098
a624fb9a    1.000247
f454e660    1.000286
Name: NormalizedDailyYield, dtype: float64
Standard Deviation of NormalizedDailyYield: FarmName_Pseudo
5c06d92d    0.107455
752efd72    0.073929
a624fb9a    0.162169
f454e660    0.137267
Name: NormalizedDailyYield, dtype: float64


In [14]:
data_cleaned['Residuals'] = data_cleaned['DailyYield'] - data_cleaned['ExpectedYield']
data_cleaned

Unnamed: 0,SE_Number,FarmName_Pseudo,Date,DailyYield,PreviousDailyYield,DailyYieldChange,HW,MeanTemperature,MeanTHI_adj,DaysInMilk,...,Age,BreedName,LactationNumber,ExpectedYield,NormalizedDailyYield,NormalizedDailyYieldChange,HeatLoad,CumulativeHeatLoad,HeatStress,Residuals
0,SE-064c0cec-1189,a624fb9a,2022-01-01,30.77,0.000000,0.000000,0,-3.025000,28.012944,191,...,3095,02 SLB,7,35.914865,0.856748,0.000000,-32.987056,0.000000,0,-5.144865
1,SE-064c0cec-1189,a624fb9a,2022-01-02,48.22,30.770000,8.725000,0,-0.279167,32.898193,192,...,3096,02 SLB,7,35.799613,1.103224,0.243718,-28.101807,0.000000,0,12.420387
2,SE-064c0cec-1189,a624fb9a,2022-01-03,30.53,39.495000,-2.988333,0,2.033333,36.760487,193,...,3097,02 SLB,7,35.684360,1.023044,-0.083744,-24.239513,0.000000,0,-5.154360
3,SE-064c0cec-1189,a624fb9a,2022-01-04,42.26,36.506667,1.438333,0,0.066667,31.939524,194,...,3098,02 SLB,7,35.569108,1.066796,0.040438,-29.060476,0.000000,0,6.690892
4,SE-064c0cec-1189,a624fb9a,2022-01-05,38.49,37.945000,0.109000,0,-3.700000,26.498206,195,...,3099,02 SLB,7,35.453856,1.073339,0.003074,-34.501794,0.000000,0,3.036144
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
654671,SE-fcdf259d-0044-0,f454e660,2023-06-08,15.14,13.252000,0.494000,0,15.570833,59.383267,352,...,4155,41 Fjällko,10,13.149489,1.045364,0.037568,-1.616733,0.000000,0,1.990511
654672,SE-fcdf259d-0044-0,f454e660,2023-06-09,7.47,13.746000,-2.968000,0,13.254167,54.534255,353,...,4156,41 Fjällko,10,13.057668,0.825415,-0.227299,-6.465745,0.000000,0,-5.587668
654673,SE-fcdf259d-0044-0,f454e660,2023-06-10,14.73,10.778000,0.378000,0,13.258333,54.082367,354,...,4157,41 Fjällko,10,12.965846,0.860414,0.029154,-6.917633,0.000000,0,1.764154
654674,SE-fcdf259d-0044-0,f454e660,2023-06-12,12.27,11.156000,0.560000,0,15.820833,62.015093,356,...,4159,41 Fjällko,10,12.782204,0.916587,0.043811,1.015093,1.015093,0,-0.512204


In [15]:
# Group by 'FarmName_Pseudo' to perform calculations at the farm level
farm_results = []

for farm_name, farm_group in data_cleaned.groupby('FarmName_Pseudo'):
    farm_residuals = []
    
    for se_number, cow_group in farm_group.groupby('SE_Number'):
        residuals = cow_group['Residuals']
        
        if len(residuals) > 1:  # Ensure there are residuals to analyze
            farm_residuals.append(residuals)
    
    if len(farm_residuals) > 0:
        # Combine residuals from all cows in the farm
        combined_residuals = np.concatenate(farm_residuals)
        
        # Calculate farm-level statistics
        mean_residuals = combined_residuals.mean()
        std_residuals = combined_residuals.std()
        acf_values = acf(combined_residuals, nlags=30, fft=False)
        pacf_values = pacf(combined_residuals, nlags=min(30, len(combined_residuals)//2))

        # Print the farm-level statistics
        print(f"Farm: {farm_name}")
        print(f"Mean Residuals: {mean_residuals}")
        print(f"Standard Deviation of Residuals: {std_residuals}")
        print(f"ACF (first 5 lags): {acf_values[:5]}")
        print(f"PACF (first 5 lags): {pacf_values[:5]}")
        print("=" * 50)


Farm: 5c06d92d
Mean Residuals: -0.010804757208422347
Standard Deviation of Residuals: 5.19047833282636
ACF (first 5 lags): [1.         0.27940885 0.25794868 0.20024289 0.16691753]
PACF (first 5 lags): [1.         0.27940992 0.19511322 0.09891065 0.06094869]
Farm: 752efd72
Mean Residuals: -0.020661700496045047
Standard Deviation of Residuals: 3.49575025944653
ACF (first 5 lags): [1.         0.2927943  0.25594383 0.20468709 0.16244152]
PACF (first 5 lags): [1.         0.29279573 0.18617791 0.1010562  0.05316634]
Farm: a624fb9a
Mean Residuals: -0.00934553404094611
Standard Deviation of Residuals: 7.516084623742135
ACF (first 5 lags): [1.         0.36334711 0.54017127 0.51149508 0.4965544 ]
PACF (first 5 lags): [1.         0.36335212 0.47024545 0.35008695 0.23181268]
Farm: f454e660
Mean Residuals: -0.01057281221043747
Standard Deviation of Residuals: 7.528876809893905
ACF (first 5 lags): [ 1.         -0.00881951  0.27348774  0.25666815  0.24058209]
PACF (first 5 lags): [ 1.         -0.0088

In [16]:
# Group by 'FarmName_Pseudo', 'SE_Number', and 'LactationNumber' to perform individual calculations
farm_results = []

for farm_name, farm_group in data_cleaned.groupby('FarmName_Pseudo'):
    print(f"Farm: {farm_name}")
    
    for (se_number, lactation_number), cow_group in farm_group.groupby(['SE_Number', 'LactationNumber']):
        residuals = cow_group['Residuals']
        
        if len(residuals) > 1:  # Ensure there are residuals to analyze
            mean_residuals = residuals.mean()
            std_residuals = residuals.std()
            acf_values = acf(residuals, nlags=30, fft=False)
            pacf_values = pacf(residuals, nlags=min(30, len(residuals)//2))

            # Print the statistics
            print(f"\nCow: {se_number}, Lactation Number: {lactation_number}")
            print(f"Mean Residuals: {mean_residuals}")
            print(f"Standard Deviation of Residuals: {std_residuals}")
            print(f"ACF (first 5 lags): {acf_values[:5]}")
            print(f"PACF (first 5 lags): {pacf_values[:5]}")
            print("-" * 50)
            
    print("=" * 50)

Farm: 5c06d92d

Cow: SE-5c06d92d-2055, Lactation Number: 6
Mean Residuals: -0.3578012321028742
Standard Deviation of Residuals: 4.788415669121281
ACF (first 5 lags): [ 1.          0.076273    0.31997299  0.02182998 -0.00740504]
PACF (first 5 lags): [ 1.          0.07671388  0.31968996 -0.02151787 -0.12452141]
--------------------------------------------------

Cow: SE-5c06d92d-2055, Lactation Number: 7
Mean Residuals: -0.07712953746443753
Standard Deviation of Residuals: 8.53892984965088
ACF (first 5 lags): [1.         0.37688227 0.31437412 0.26089741 0.23573651]
PACF (first 5 lags): [1.         0.37790087 0.20213581 0.11109077 0.08573913]
--------------------------------------------------

Cow: SE-5c06d92d-2055, Lactation Number: 8
Mean Residuals: -0.05713641530419862
Standard Deviation of Residuals: 10.899393218574385
ACF (first 5 lags): [1.         0.63672597 0.4583209  0.31991586 0.24888028]
PACF (first 5 lags): [ 1.          0.64105744  0.09107044 -0.00437799  0.04230007]
--------

In [17]:
# Define the thresholds
mean_residual_threshold = 0.075
std_residual_threshold = 7.5
acf_threshold = 0.25
pacf_threshold = 0.25

# List to collect flagged combinations
flagged_combinations = []

for farm_name, farm_group in data_cleaned.groupby('FarmName_Pseudo'):
    for (se_number, lactation_number), cow_group in farm_group.groupby(['SE_Number', 'LactationNumber']):
        residuals = cow_group['Residuals'].dropna()
        
        if len(residuals) > 1:  # Ensure there are residuals to analyze
            mean_residuals = residuals.mean()
            std_residuals = residuals.std()
            acf_values = acf(residuals, nlags=30, fft=False)
            pacf_values = pacf(residuals, nlags=min(30, len(residuals)//2))

            # Check against thresholds
            if (abs(mean_residuals) > mean_residual_threshold or 
                std_residuals > std_residual_threshold or 
                abs(acf_values[1]) > acf_threshold or 
                abs(pacf_values[1]) > pacf_threshold):
                
                # Collect the combination if it exceeds any threshold
                flagged_combinations.append({
                    'Farm': farm_name,
                    'SE_Number': se_number,
                    'LactationNumber': lactation_number,
                    'Mean Residuals': mean_residuals,
                    'Std Residuals': std_residuals,
                    'ACF[1]': acf_values[1],
                    'PACF[1]': pacf_values[1]
                })

# Convert to a DataFrame for easier inspection
flagged_df = pd.DataFrame(flagged_combinations)
flagged_df

Unnamed: 0,Farm,SE_Number,LactationNumber,Mean Residuals,Std Residuals,ACF[1],PACF[1]
0,5c06d92d,SE-5c06d92d-2055,6,-0.357801,4.788416,0.076273,0.076714
1,5c06d92d,SE-5c06d92d-2055,7,-0.077130,8.538930,0.376882,0.377901
2,5c06d92d,SE-5c06d92d-2055,8,-0.057136,10.899393,0.636726,0.641057
3,5c06d92d,SE-5c06d92d-2058,7,-0.204307,4.423878,-0.047470,-0.047693
4,5c06d92d,SE-5c06d92d-2058,8,-0.039645,7.674564,0.289566,0.290418
...,...,...,...,...,...,...,...
1910,f454e660,SE-f454e660-796,1,0.177523,9.949975,0.646401,0.647908
1911,f454e660,SE-f454e660-799,1,0.113697,8.020783,0.166562,0.167001
1912,f454e660,SE-f454e660-804,1,0.141757,6.755059,-0.073437,-0.073649
1913,f454e660,SE-f454e660-810,1,0.146478,8.238240,0.161859,0.162261


In [18]:
# Define the Wilmink Lactation Curve function
def wilmink_lactation_curve(dim, a, b, c, d):
    dim = np.array(dim, dtype=float)
    return a + b * dim + c * np.exp(-d * dim)

# Function to directly refit the Wilmink Lactation Curve (Standard Process)
def refit_wilmink(cow_data):
    x_data = cow_data['DaysInMilk'].values
    y_data = cow_data['DailyYield'].values

    # Use initial guesses and bounds from the original fitting process
    initial_guesses = [np.mean(y_data), 0, np.mean(y_data) / 2, 0.1]
    bounds = ([-np.inf, -np.inf, -np.inf, 0], [np.inf, np.inf, np.inf, np.inf])

    popt, _ = curve_fit(wilmink_lactation_curve, x_data, y_data, p0=initial_guesses, bounds=bounds, maxfev=30000)
    
    # Calculate the expected yield with the refitted parameters
    cow_data['ExpectedYield'] = wilmink_lactation_curve(cow_data['DaysInMilk'], *popt)
    
    # Calculate new residuals
    cow_data['Residuals'] = cow_data['DailyYield'] - cow_data['ExpectedYield']
    
    return cow_data

# Function to add lagged variables for addressing autocorrelation
def add_lagged_variables(cow_data, max_lag=3):
    for lag in range(1, max_lag + 1):
        cow_data[f'lag_{lag}'] = cow_data['DailyYield'].shift(lag)
    return cow_data.dropna()

# Define the Robust Wilmink Lactation Curve function
def robust_wilmink_lactation_curve(dim, a, b, c, d, lag1, lag2, lag3):
    dim = np.array(dim, dtype=np.float64)
    days_in_milk = dim[0]
    lag_1 = dim[1]
    lag_2 = dim[2]
    lag_3 = dim[3]
    
    return a + b * days_in_milk + c * np.exp(-d * days_in_milk) + lag1 * lag_1 + lag2 * lag_2 + lag3 * lag_3

# Function to fit the robust Wilmink model
def fit_robust_wilmink(cow_data, lags=3):
    cow_data = add_lagged_variables(cow_data, max_lag=lags)
    
    x_data = cow_data[['DaysInMilk', 'lag_1', 'lag_2', 'lag_3']].values.T
    y_data = cow_data['DailyYield'].values
    
    initial_guesses = [np.mean(y_data), 0, np.mean(y_data) / 2, 0.1, 0, 0, 0]
    bounds = ([-np.inf, -np.inf, -np.inf, 0, -np.inf, -np.inf, -np.inf], 
              [np.inf, np.inf, np.inf, np.inf, np.inf, np.inf, np.inf])
    
    try:
        popt, _ = curve_fit(robust_wilmink_lactation_curve, x_data, y_data, p0=initial_guesses, bounds=bounds, maxfev=50000)
        cow_data.loc[:, 'ExpectedYield'] = robust_wilmink_lactation_curve(x_data, *popt)
        cow_data.loc[:, 'Residuals'] = cow_data['DailyYield'] - cow_data['ExpectedYield']
    except RuntimeError as e:
        print(f"Curve fitting failed: {e}")
        cow_data.loc[:, 'ExpectedYield'] = np.nan
        cow_data.loc[:, 'Residuals'] = np.nan
    
    return cow_data

# Function to add lagged variables for addressing autocorrelation
def add_lagged_variables(cow_data, max_lag=3):
    for lag in range(1, max_lag + 1):
        cow_data[f'lag_{lag}'] = cow_data['DailyYield'].shift(lag)
    return cow_data.dropna()

# Example usage: Applying the robust model to flagged cases
for index, row in flagged_df.iterrows():
    se_number = row['SE_Number']
    lactation_number = row['LactationNumber']
    
    cow_data = data_cleaned[(data_cleaned['SE_Number'] == se_number) & 
                            (data_cleaned['LactationNumber'] == lactation_number)].copy()
    
    if abs(row['ACF[1]']) > 0.2:  # Significant autocorrelation
        cow_data = add_lagged_variables(cow_data, max_lag=3)
        cow_data_refitted = fit_robust_wilmink(cow_data, lags=3)
        data_cleaned.update(cow_data_refitted)
    else:
        cow_data_refitted = refit_wilmink(cow_data)
        data_cleaned.update(cow_data_refitted)

data_cleaned['NormalizedDailyYield'] = data_cleaned['DailyYield'] / data_cleaned['ExpectedYield']
data_cleaned['NormalizedDailyYieldChange'] = data_cleaned['DailyYieldChange'] / data_cleaned['ExpectedYield']

data_cleaned

  popt, _ = curve_fit(robust_wilmink_lactation_curve, x_data, y_data, p0=initial_guesses, bounds=bounds, maxfev=50000)
  popt, _ = curve_fit(robust_wilmink_lactation_curve, x_data, y_data, p0=initial_guesses, bounds=bounds, maxfev=50000)
  popt, _ = curve_fit(robust_wilmink_lactation_curve, x_data, y_data, p0=initial_guesses, bounds=bounds, maxfev=50000)
  popt, _ = curve_fit(robust_wilmink_lactation_curve, x_data, y_data, p0=initial_guesses, bounds=bounds, maxfev=50000)
  popt, _ = curve_fit(robust_wilmink_lactation_curve, x_data, y_data, p0=initial_guesses, bounds=bounds, maxfev=50000)
  popt, _ = curve_fit(robust_wilmink_lactation_curve, x_data, y_data, p0=initial_guesses, bounds=bounds, maxfev=50000)
  popt, _ = curve_fit(robust_wilmink_lactation_curve, x_data, y_data, p0=initial_guesses, bounds=bounds, maxfev=50000)


Unnamed: 0,SE_Number,FarmName_Pseudo,Date,DailyYield,PreviousDailyYield,DailyYieldChange,HW,MeanTemperature,MeanTHI_adj,DaysInMilk,...,Age,BreedName,LactationNumber,ExpectedYield,NormalizedDailyYield,NormalizedDailyYieldChange,HeatLoad,CumulativeHeatLoad,HeatStress,Residuals
0,SE-064c0cec-1189,a624fb9a,2022-01-01,30.77,0.000000,0.000000,0,-3.025000,28.012944,191,...,3095,02 SLB,7,35.914865,0.856748,0.000000,-32.987056,0.000000,0,-5.144865
1,SE-064c0cec-1189,a624fb9a,2022-01-02,48.22,30.770000,8.725000,0,-0.279167,32.898193,192,...,3096,02 SLB,7,35.799613,1.346942,0.243718,-28.101807,0.000000,0,12.420387
2,SE-064c0cec-1189,a624fb9a,2022-01-03,30.53,39.495000,-2.988333,0,2.033333,36.760487,193,...,3097,02 SLB,7,35.684360,0.855557,-0.083744,-24.239513,0.000000,0,-5.154360
3,SE-064c0cec-1189,a624fb9a,2022-01-04,42.26,36.506667,1.438333,0,0.066667,31.939524,194,...,3098,02 SLB,7,35.569108,1.188110,0.040438,-29.060476,0.000000,0,6.690892
4,SE-064c0cec-1189,a624fb9a,2022-01-05,38.49,37.945000,0.109000,0,-3.700000,26.498206,195,...,3099,02 SLB,7,35.453856,1.085637,0.003074,-34.501794,0.000000,0,3.036144
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
654671,SE-fcdf259d-0044-0,f454e660,2023-06-08,15.14,13.252000,0.494000,0,15.570833,59.383267,352,...,4155,41 Fjällko,10,13.149489,1.151376,0.037568,-1.616733,0.000000,0,1.990511
654672,SE-fcdf259d-0044-0,f454e660,2023-06-09,7.47,13.746000,-2.968000,0,13.254167,54.534255,353,...,4156,41 Fjällko,10,13.057668,0.572078,-0.227299,-6.465745,0.000000,0,-5.587668
654673,SE-fcdf259d-0044-0,f454e660,2023-06-10,14.73,10.778000,0.378000,0,13.258333,54.082367,354,...,4157,41 Fjällko,10,12.965846,1.136062,0.029154,-6.917633,0.000000,0,1.764154
654674,SE-fcdf259d-0044-0,f454e660,2023-06-12,12.27,11.156000,0.560000,0,15.820833,62.015093,356,...,4159,41 Fjällko,10,12.782204,0.959928,0.043811,1.015093,1.015093,0,-0.512204


In [19]:
# Define the thresholds
mean_residual_threshold = 0.075
std_residual_threshold = 7.5
acf_threshold = 0.25
pacf_threshold = 0.25

# List to collect flagged combinations
flagged_combinations = []

for farm_name, farm_group in data_cleaned.groupby('FarmName_Pseudo'):
    for (se_number, lactation_number), cow_group in farm_group.groupby(['SE_Number', 'LactationNumber']):
        residuals = cow_group['Residuals'].dropna()
        
        if len(residuals) > 1:  # Ensure there are residuals to analyze
            mean_residuals = residuals.mean()
            std_residuals = residuals.std()
            acf_values = acf(residuals, nlags=30, fft=False)
            pacf_values = pacf(residuals, nlags=min(30, len(residuals)//2))

            # Check against thresholds
            if (abs(mean_residuals) > mean_residual_threshold or 
                std_residuals > std_residual_threshold or 
                abs(acf_values[1]) > acf_threshold or 
                abs(pacf_values[1]) > pacf_threshold):
                
                # Collect the combination if it exceeds any threshold
                flagged_combinations.append({
                    'Farm': farm_name,
                    'SE_Number': se_number,
                    'LactationNumber': lactation_number,
                    'Mean Residuals': mean_residuals,
                    'Std Residuals': std_residuals,
                    'ACF[1]': acf_values[1],
                    'PACF[1]': pacf_values[1]
                })

# Convert to a DataFrame for easier inspection
flagged_df = pd.DataFrame(flagged_combinations)
flagged_df

Unnamed: 0,Farm,SE_Number,LactationNumber,Mean Residuals,Std Residuals,ACF[1],PACF[1]
0,5c06d92d,SE-5c06d92d-2055,7,1.298174e-01,7.646087,-0.008971,-0.008996
1,5c06d92d,SE-5c06d92d-2055,8,-1.193259e+00,9.454095,0.381596,0.384192
2,5c06d92d,SE-5c06d92d-2058,8,-1.723122e-01,6.998856,-0.012969,-0.013007
3,5c06d92d,SE-5c06d92d-2211,7,-1.420659e-01,7.138278,-0.030475,-0.030616
4,5c06d92d,SE-5c06d92d-2254,7,-8.305082e-02,8.678431,-0.000748,-0.000750
...,...,...,...,...,...,...,...
578,f454e660,SE-f454e660-750,1,-4.032712e-11,7.535962,-0.166229,-0.166535
579,f454e660,SE-f454e660-787,1,-9.143292e-03,8.252705,-0.019901,-0.019960
580,f454e660,SE-f454e660-799,1,-1.325353e-05,8.017082,0.165831,0.166269
581,f454e660,SE-f454e660-810,1,2.824120e-08,8.230547,0.160006,0.160403


In [20]:
def remove_outliers(data, threshold=3.5):
    # Calculate z-scores of residuals
    data = data.copy()  # Create a copy to avoid the SettingWithCopyWarning
    data['z_score'] = (data['Residuals'] - data['Residuals'].mean()) / data['Residuals'].std()
    
    # Identify the number of outliers
    num_outliers = (data['z_score'].abs() >= threshold).sum()
    print(f"Number of outliers detected: {num_outliers}")
    
    # Remove rows where the z-score of the residual is greater than the threshold
    cleaned_data = data.loc[(data['z_score'].abs() < threshold)].drop(columns=['z_score'])
    
    # Print the number of rows before and after
    print(f"Number of rows before outlier removal: {len(data)}")
    print(f"Number of rows after outlier removal: {len(cleaned_data)}")
    
    return cleaned_data

# Apply to flagged cases
for index, row in flagged_df.iterrows():
    se_number = row['SE_Number']
    lactation_number = row['LactationNumber']
    
    # Select the cow data for the specific SE_Number and LactationNumber
    cow_data = data_cleaned.loc[(data_cleaned['SE_Number'] == se_number) & 
                                (data_cleaned['LactationNumber'] == lactation_number)]
    
    # Remove outliers
    cow_data_trimmed = remove_outliers(cow_data, threshold=3.5)
    
    # Recalculate the residuals and update the dataset
    cow_data_trimmed['Residuals'] = cow_data_trimmed['DailyYield'] - cow_data_trimmed['ExpectedYield']
    
    # Remove the old data for this cow from data_cleaned
    data_cleaned = data_cleaned.loc[~((data_cleaned['SE_Number'] == se_number) & 
                                      (data_cleaned['LactationNumber'] == lactation_number))]
    
    # Append the cleaned data back to data_cleaned
    data_cleaned = pd.concat([data_cleaned, cow_data_trimmed], ignore_index=True)

Number of outliers detected: 4
Number of rows before outlier removal: 371
Number of rows after outlier removal: 367
Number of outliers detected: 3
Number of rows before outlier removal: 148
Number of rows after outlier removal: 145
Number of outliers detected: 4
Number of rows before outlier removal: 341
Number of rows after outlier removal: 337
Number of outliers detected: 3
Number of rows before outlier removal: 217
Number of rows after outlier removal: 214
Number of outliers detected: 2
Number of rows before outlier removal: 483
Number of rows after outlier removal: 481
Number of outliers detected: 4
Number of rows before outlier removal: 362
Number of rows after outlier removal: 358
Number of outliers detected: 0
Number of rows before outlier removal: 80
Number of rows after outlier removal: 80
Number of outliers detected: 0
Number of rows before outlier removal: 340
Number of rows after outlier removal: 340
Number of outliers detected: 3
Number of rows before outlier removal: 428


In [21]:
data_cleaned

Unnamed: 0,SE_Number,FarmName_Pseudo,Date,DailyYield,PreviousDailyYield,DailyYieldChange,HW,MeanTemperature,MeanTHI_adj,DaysInMilk,...,Age,BreedName,LactationNumber,ExpectedYield,NormalizedDailyYield,NormalizedDailyYieldChange,HeatLoad,CumulativeHeatLoad,HeatStress,Residuals
0,SE-064c0cec-1189,a624fb9a,2022-05-28,15.22,0.0000,0.0000,0,9.912500,50.478673,3,...,3242,02 SLB,8,13.399562,1.135858,0.000000,-10.521327,0.0,0,1.820438
1,SE-064c0cec-1189,a624fb9a,2022-05-29,18.96,15.2200,1.8700,0,10.066667,53.841648,4,...,3243,02 SLB,8,17.370255,1.091521,0.107655,-7.158352,0.0,0,1.589745
2,SE-064c0cec-1189,a624fb9a,2022-05-30,22.64,17.0900,1.8500,0,10.466667,52.935959,5,...,3244,02 SLB,8,20.745402,1.091326,0.089176,-8.064041,0.0,0,1.894598
3,SE-064c0cec-1189,a624fb9a,2022-05-31,26.49,18.9400,1.8875,0,11.183333,52.872112,6,...,3245,02 SLB,8,23.612557,1.121861,0.079936,-8.127888,0.0,0,2.877443
4,SE-064c0cec-1189,a624fb9a,2022-06-01,33.61,20.8275,2.5565,0,12.704167,56.056547,7,...,3246,02 SLB,8,26.046402,1.290389,0.098152,-4.943453,0.0,0,7.563598
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
628676,SE-fcdf259d-0044-0,f454e660,2022-03-24,13.29,13.9480,-1.4240,0,5.550000,44.186100,279,...,3714,41 Fjällko,9,14.122139,0.941076,-0.100835,-16.813900,0.0,0,-0.832139
628677,SE-fcdf259d-0044-0,f454e660,2022-03-25,10.19,12.5240,-0.3960,0,7.504167,43.802362,280,...,3715,41 Fjällko,9,12.810366,0.795450,-0.030912,-17.197638,0.0,0,-2.620366
628678,SE-fcdf259d-0044-0,f454e660,2022-03-26,13.83,12.1280,0.0480,0,4.445833,37.090483,281,...,3716,41 Fjällko,9,13.332476,1.037317,0.003600,-23.909517,0.0,0,0.497524
628679,SE-fcdf259d-0044-0,f454e660,2022-03-27,13.72,12.1760,0.1040,0,2.620833,37.266066,282,...,3717,41 Fjällko,9,12.232194,1.121630,0.008502,-23.733934,0.0,0,1.487806


In [22]:
# Reorder columns
new_order = [
    "Date", "FarmName_Pseudo", "SE_Number", "Age", "BreedName", "LactationNumber", "DaysInMilk",'YearSeason', "DailyYield", "PreviousDailyYield", 
    "DailyYieldChange", "ExpectedYield", "NormalizedDailyYield", 
    "NormalizedDailyYieldChange", "Residuals", "HeatStress", "Temp15Threshold", "HW", 
    "cum_HW", "MeanTemperature", "MeanTHI_adj", "HeatLoad", "CumulativeHeatLoad"
]
data_cleaned = data_cleaned[new_order]
data_cleaned.head()

Unnamed: 0,Date,FarmName_Pseudo,SE_Number,Age,BreedName,LactationNumber,DaysInMilk,YearSeason,DailyYield,PreviousDailyYield,...,NormalizedDailyYieldChange,Residuals,HeatStress,Temp15Threshold,HW,cum_HW,MeanTemperature,MeanTHI_adj,HeatLoad,CumulativeHeatLoad
0,2022-05-28,a624fb9a,SE-064c0cec-1189,3242,02 SLB,8,3,2022-2,15.22,0.0,...,0.0,1.820438,0,0,0,0,9.9125,50.478673,-10.521327,0.0
1,2022-05-29,a624fb9a,SE-064c0cec-1189,3243,02 SLB,8,4,2022-2,18.96,15.22,...,0.107655,1.589745,0,0,0,0,10.066667,53.841648,-7.158352,0.0
2,2022-05-30,a624fb9a,SE-064c0cec-1189,3244,02 SLB,8,5,2022-2,22.64,17.09,...,0.089176,1.894598,0,1,0,0,10.466667,52.935959,-8.064041,0.0
3,2022-05-31,a624fb9a,SE-064c0cec-1189,3245,02 SLB,8,6,2022-2,26.49,18.94,...,0.079936,2.877443,0,0,0,0,11.183333,52.872112,-8.127888,0.0
4,2022-06-01,a624fb9a,SE-064c0cec-1189,3246,02 SLB,8,7,2022-3,33.61,20.8275,...,0.098152,7.563598,0,1,0,0,12.704167,56.056547,-4.943453,0.0


In [23]:
# Check if NormalizedDailyYield is centered around 1 for each unique farm
print("Mean of NormalizedDailyYield:", data_cleaned.groupby('FarmName_Pseudo')['NormalizedDailyYield'].mean())
print("Standard Deviation of NormalizedDailyYield:", data_cleaned.groupby('FarmName_Pseudo')['NormalizedDailyYield'].std())

Mean of NormalizedDailyYield: FarmName_Pseudo
5c06d92d    1.001135
752efd72    1.001769
a624fb9a    1.000680
f454e660    1.000930
Name: NormalizedDailyYield, dtype: float64
Standard Deviation of NormalizedDailyYield: FarmName_Pseudo
5c06d92d    0.132582
752efd72    0.109979
a624fb9a    0.178958
f454e660    0.235095
Name: NormalizedDailyYield, dtype: float64


In [24]:
# Count the number of HeatStress occurrences in each farm
heat_stress_counts = data_cleaned.groupby('FarmName_Pseudo')['HeatStress'].sum()
heat_stress_counts

FarmName_Pseudo
5c06d92d    71152
752efd72    52083
a624fb9a    17415
f454e660    22160
Name: HeatStress, dtype: int64

In [25]:
# Save the reordered DataFrame to a CSV file
data_cleaned.to_csv('../Data/MergedData/HeatApproachYieldDataTest.csv', index=False)