In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from scipy.optimize import curve_fit, OptimizeWarning
from tqdm import tqdm
import warnings
from scipy.stats import zscore
from statsmodels.tsa.stattools import acf, pacf
from scipy.optimize import minimize
from vqr import VectorQuantileRegressor
from vqr.solvers.regularized_lse import RegularizedDualVQRSolver

sns.set_theme()
sns.set_context("notebook")
%load_ext autoreload
%autoreload 2

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dtype_dict = {
    'FarmName_Pseudo': 'str',
    'SE_Number': 'str',
    'AnimalNumber': 'Int64',          
    'StartDate': 'str',
    'StartTime': 'str',
    'DateTime': 'str',
    'LactationNumber': 'Int64',       
    'DaysInMilk': 'Int64', 
    'YearSeason': 'str',           
    'TotalYield': 'float',
    'DateTime': 'str',
    'BreedName': 'str',
    'Age': 'Int64',
    'Mother': 'str',
    'Father': 'str',
    'CullDecisionDate': 'str',
    'Temperature': 'float',
    'RelativeHumidity': 'float',      
    'THI_adj': 'float',
    'HW': 'Int64',                    
    'cum_HW': 'Int64',                
    'Temp15Threshold': 'Int64'        
}


# Load the CSV with specified dtypes
data = pd.read_csv('../Data/MergedData/CleanedYieldData.csv', dtype=dtype_dict)

# Convert date and time columns back to datetime and time objects
data['DateTime'] = pd.to_datetime(data['DateTime'], errors='coerce')
data['StartTime'] = pd.to_datetime(data['StartTime'], format='%H:%M:%S', errors='coerce').dt.time
data['StartDate'] = pd.to_datetime(data['StartDate'], errors='coerce')
data['CullDecisionDate'] = pd.to_datetime(data['CullDecisionDate'], errors='coerce')
data['DateTime'] = pd.to_datetime(data['DateTime'], errors='coerce')
data.head()

Unnamed: 0,FarmName_Pseudo,SE_Number,AnimalNumber,StartDate,StartTime,LactationNumber,DaysInMilk,TotalYield,DateTime,YearSeason,...,Mother,Father,CullDecisionDate,Temperature,RelativeHumidity,THI_adj,HW,cum_HW,Temp15Threshold,Age
0,a624fb9a,SE-064c0cec-1189,5189,2022-01-01,06:25:00,7,191,13.9,2022-01-01 06:25:00,2022-1,...,,,2022-12-20,-3.025,0.930917,28.012944,0,0,0,3095
1,a624fb9a,SE-064c0cec-1189,5189,2022-01-01,16:41:00,7,191,16.87,2022-01-01 16:41:00,2022-1,...,,,2022-12-20,-3.025,0.930917,28.012944,0,0,0,3095
2,a624fb9a,SE-064c0cec-1189,5189,2022-01-02,15:29:00,7,192,20.41,2022-01-02 15:29:00,2022-1,...,,,2022-12-20,-0.279167,0.990542,32.898193,0,0,0,3096
3,a624fb9a,SE-064c0cec-1189,5189,2022-01-02,03:31:00,7,192,16.28,2022-01-02 03:31:00,2022-1,...,,,2022-12-20,-0.279167,0.990542,32.898193,0,0,0,3096
4,a624fb9a,SE-064c0cec-1189,5189,2022-01-02,22:44:00,7,192,11.53,2022-01-02 22:44:00,2022-1,...,,,2022-12-20,-0.279167,0.990542,32.898193,0,0,0,3096


In [3]:
# Define the THI threshold
THI_THRESHOLD = 61

# Calculate the daily heat load based on the THI threshold
data['HeatLoad'] = data['THI_adj'].apply(lambda x: x - THI_THRESHOLD if x > THI_THRESHOLD else -(THI_THRESHOLD - x))

# Initialize the cumulative heat load column with float type
data['CumulativeHeatLoad'] = 0.0  # Explicitly set as float

# Iterate through the data to calculate cumulative heat load correctly
for i in range(1, len(data)):
    previous_cumulative = data.at[i-1, 'CumulativeHeatLoad']
    current_heat_load = data.at[i, 'HeatLoad']
    if previous_cumulative + current_heat_load > 0:
        data.at[i, 'CumulativeHeatLoad'] = previous_cumulative + current_heat_load
    else:
        data.at[i, 'CumulativeHeatLoad'] = 0.0  # Ensure float is maintained

data.head(-5)

Unnamed: 0,FarmName_Pseudo,SE_Number,AnimalNumber,StartDate,StartTime,LactationNumber,DaysInMilk,TotalYield,DateTime,YearSeason,...,CullDecisionDate,Temperature,RelativeHumidity,THI_adj,HW,cum_HW,Temp15Threshold,Age,HeatLoad,CumulativeHeatLoad
0,a624fb9a,SE-064c0cec-1189,5189,2022-01-01,06:25:00,7,191,13.90,2022-01-01 06:25:00,2022-1,...,2022-12-20,-3.025000,0.930917,28.012944,0,0,0,3095,-32.987056,0.000000
1,a624fb9a,SE-064c0cec-1189,5189,2022-01-01,16:41:00,7,191,16.87,2022-01-01 16:41:00,2022-1,...,2022-12-20,-3.025000,0.930917,28.012944,0,0,0,3095,-32.987056,0.000000
2,a624fb9a,SE-064c0cec-1189,5189,2022-01-02,15:29:00,7,192,20.41,2022-01-02 15:29:00,2022-1,...,2022-12-20,-0.279167,0.990542,32.898193,0,0,0,3096,-28.101807,0.000000
3,a624fb9a,SE-064c0cec-1189,5189,2022-01-02,03:31:00,7,192,16.28,2022-01-02 03:31:00,2022-1,...,2022-12-20,-0.279167,0.990542,32.898193,0,0,0,3096,-28.101807,0.000000
4,a624fb9a,SE-064c0cec-1189,5189,2022-01-02,22:44:00,7,192,11.53,2022-01-02 22:44:00,2022-1,...,2022-12-20,-0.279167,0.990542,32.898193,0,0,0,3096,-28.101807,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1364930,f454e660,SE-fcdf259d-0044-0,1044,2023-06-07,00:51:00,10,351,5.63,2023-06-07 00:51:00,2023-3,...,NaT,15.645833,0.731917,61.559237,0,0,1,4154,0.559237,0.559237
1364931,f454e660,SE-fcdf259d-0044-0,1044,2023-06-07,11:17:00,10,351,3.34,2023-06-07 11:17:00,2023-3,...,NaT,15.645833,0.731917,61.559237,0,0,1,4154,0.559237,1.118475
1364932,f454e660,SE-fcdf259d-0044-0,1044,2023-06-08,17:01:00,10,352,6.96,2023-06-08 17:01:00,2023-3,...,NaT,15.570833,0.601708,59.383267,0,0,1,4155,-1.616733,0.000000
1364933,f454e660,SE-fcdf259d-0044-0,1044,2023-06-08,02:23:00,10,352,8.18,2023-06-08 02:23:00,2023-3,...,NaT,15.570833,0.601708,59.383267,0,0,1,4155,-1.616733,0.000000


In [4]:
# When CumulativeHeatLoad is greater than 5, it indicates that the cow is under heat stress
data['HeatStress'] = (data['CumulativeHeatLoad'] > 5).astype(int)
data.head(-5)

Unnamed: 0,FarmName_Pseudo,SE_Number,AnimalNumber,StartDate,StartTime,LactationNumber,DaysInMilk,TotalYield,DateTime,YearSeason,...,Temperature,RelativeHumidity,THI_adj,HW,cum_HW,Temp15Threshold,Age,HeatLoad,CumulativeHeatLoad,HeatStress
0,a624fb9a,SE-064c0cec-1189,5189,2022-01-01,06:25:00,7,191,13.90,2022-01-01 06:25:00,2022-1,...,-3.025000,0.930917,28.012944,0,0,0,3095,-32.987056,0.000000,0
1,a624fb9a,SE-064c0cec-1189,5189,2022-01-01,16:41:00,7,191,16.87,2022-01-01 16:41:00,2022-1,...,-3.025000,0.930917,28.012944,0,0,0,3095,-32.987056,0.000000,0
2,a624fb9a,SE-064c0cec-1189,5189,2022-01-02,15:29:00,7,192,20.41,2022-01-02 15:29:00,2022-1,...,-0.279167,0.990542,32.898193,0,0,0,3096,-28.101807,0.000000,0
3,a624fb9a,SE-064c0cec-1189,5189,2022-01-02,03:31:00,7,192,16.28,2022-01-02 03:31:00,2022-1,...,-0.279167,0.990542,32.898193,0,0,0,3096,-28.101807,0.000000,0
4,a624fb9a,SE-064c0cec-1189,5189,2022-01-02,22:44:00,7,192,11.53,2022-01-02 22:44:00,2022-1,...,-0.279167,0.990542,32.898193,0,0,0,3096,-28.101807,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1364930,f454e660,SE-fcdf259d-0044-0,1044,2023-06-07,00:51:00,10,351,5.63,2023-06-07 00:51:00,2023-3,...,15.645833,0.731917,61.559237,0,0,1,4154,0.559237,0.559237,0
1364931,f454e660,SE-fcdf259d-0044-0,1044,2023-06-07,11:17:00,10,351,3.34,2023-06-07 11:17:00,2023-3,...,15.645833,0.731917,61.559237,0,0,1,4154,0.559237,1.118475,0
1364932,f454e660,SE-fcdf259d-0044-0,1044,2023-06-08,17:01:00,10,352,6.96,2023-06-08 17:01:00,2023-3,...,15.570833,0.601708,59.383267,0,0,1,4155,-1.616733,0.000000,0
1364933,f454e660,SE-fcdf259d-0044-0,1044,2023-06-08,02:23:00,10,352,8.18,2023-06-08 02:23:00,2023-3,...,15.570833,0.601708,59.383267,0,0,1,4155,-1.616733,0.000000,0


In [5]:
# Calculate the DailyYield for each cow each day
data['DailyYield'] = data.groupby(['SE_Number', 'StartDate'])['TotalYield'].transform('sum')

# Sort the data by AnimalNumber and StartDate
data.sort_values(['AnimalNumber', 'StartDate'], inplace=True)

# Calculate the previous day's total yield for each cow
data['PreviousDailyYield'] = data.groupby('AnimalNumber')['DailyYield'].shift(1)

# Calculate the daily yield change for each cow
data['DailyYieldChange'] = data['DailyYield'] - data['PreviousDailyYield']

# Group and aggregate data
data = data.groupby(['SE_Number', 'FarmName_Pseudo', 'StartDate']).agg({
    'DailyYield': 'first',
    'PreviousDailyYield': 'first',
    'DailyYieldChange': 'first',
    'HW': 'max',
    'Temperature': 'mean',
    'THI_adj': 'mean',
    'DaysInMilk': 'first',
    'YearSeason': 'first',
    'cum_HW': 'max',
    'Temp15Threshold': 'max',
    'Age': 'first',
    'BreedName': 'first',
    'LactationNumber': 'first',
    'HeatLoad': 'mean',
    'CumulativeHeatLoad': 'mean',
    'HeatStress': 'max'
}).reset_index()

# Renaming and formatting
data.rename(columns={
    'Temperature': 'MeanTemperature',
    'THI_adj': 'MeanTHI_adj',
    'StartDate': 'Date'
}, inplace=True)
data['Date'] = pd.to_datetime(data['Date'])

# Display the first few rows of the transformed data
data.head()

Unnamed: 0,SE_Number,FarmName_Pseudo,Date,DailyYield,PreviousDailyYield,DailyYieldChange,HW,MeanTemperature,MeanTHI_adj,DaysInMilk,YearSeason,cum_HW,Temp15Threshold,Age,BreedName,LactationNumber,HeatLoad,CumulativeHeatLoad,HeatStress
0,SE-064c0cec-1189,a624fb9a,2022-01-01,30.77,30.77,0.0,0,-3.025,28.012944,191,2022-1,0,0,3095,02 SLB,7,-32.987056,0.0,0
1,SE-064c0cec-1189,a624fb9a,2022-01-02,48.22,30.77,17.45,0,-0.279167,32.898193,192,2022-1,0,0,3096,02 SLB,7,-28.101807,0.0,0
2,SE-064c0cec-1189,a624fb9a,2022-01-03,30.53,48.22,-17.69,0,2.033333,36.760487,193,2022-1,0,0,3097,02 SLB,7,-24.239513,0.0,0
3,SE-064c0cec-1189,a624fb9a,2022-01-04,42.26,30.53,11.73,0,0.066667,31.939524,194,2022-1,0,0,3098,02 SLB,7,-29.060476,0.0,0
4,SE-064c0cec-1189,a624fb9a,2022-01-05,38.49,42.26,-3.77,0,-3.7,26.498206,195,2022-1,0,0,3099,02 SLB,7,-34.501794,0.0,0


In [6]:
# Check if DailyYield is centered around approx the same for each farm
print("Mean of DailyYield:", data.groupby('FarmName_Pseudo')['DailyYield'].mean())
print("Standard Deviation of DailyYield:", data.groupby('FarmName_Pseudo')['DailyYield'].std())

Mean of DailyYield: FarmName_Pseudo
5c06d92d    37.389675
752efd72    31.151716
a624fb9a    33.413694
f454e660    30.485127
Name: DailyYield, dtype: float64
Standard Deviation of DailyYield: FarmName_Pseudo
5c06d92d     9.960240
752efd72     7.799288
a624fb9a    11.050811
f454e660    11.833056
Name: DailyYield, dtype: float64


In [7]:
import numpy as np
import pandas as pd
import warnings
from scipy.optimize import minimize
from tqdm import tqdm
from torch.optim.lr_scheduler import ExponentialLR

# Assuming that RegularizedDualVQRSolver and VectorQuantileRegressor have been imported

# Define the Wilmink Lactation Curve function
def wilmink_lactation_curve(dim, a, b, c, d):
    dim = np.array(dim, dtype=float)
    return a + b * dim + c * np.exp(-d * dim)

# Function to remove outliers
def remove_outliers(group, threshold=3.5):
    mean = np.mean(group['DailyYield'])
    std_dev = np.std(group['DailyYield'])
    return group[(group['DailyYield'] > mean - threshold * std_dev) & (group['DailyYield'] < mean + threshold * std_dev)]

# Function to smooth the data
def smooth_data(group, window=5):
    group = group.copy()
    group['DailyYield'] = group['DailyYield'].rolling(window, min_periods=1).mean()
    return group

# Quantile loss function
def quantile_loss(params, dim, yield_data_quantile):
    a, b, c, d = params
    predictions = wilmink_lactation_curve(dim, a, b, c, d)
    residuals = yield_data_quantile - predictions
    loss = np.sum(np.abs(residuals))
    return loss

# Function to fit the Wilmink curve using VQR for quantile estimation
def fit_wilmink_lactation_curve_with_vqr(dataset, quantile=0.7):
    dataset['ExpectedYield'] = np.nan
    params_dict = {}

    # Initialize the VQR solver and model
    vqr_solver = RegularizedDualVQRSolver(verbose=False, epsilon=1e-2, num_epochs=1000, lr=0.9)
    vqr = VectorQuantileRegressor(solver=vqr_solver)

    for (animal_number, lactation_number), group in tqdm(dataset.groupby(['SE_Number', 'LactationNumber']), unit=" Segments"):
        group = remove_outliers(group)
        group = smooth_data(group)
        x_data = group['DaysInMilk'].values.reshape(-1, 1).astype(float)
        y_data = group['DailyYield'].values.reshape(-1, 1).astype(float)

        if len(x_data) < 10 or len(y_data) < 10:
            continue

        try:
            # Fit the VQR model
            vqr.fit(x_data, y_data)
            all_quantiles = vqr.predict(x_data)

            # Squeeze out the unnecessary dimension
            all_quantiles = np.squeeze(all_quantiles, axis=1)

            # Ensure that `all_quantiles` is 2D (n_samples, n_quantiles)
            if all_quantiles.ndim == 2:
                quantile_index = int(quantile * (all_quantiles.shape[1] - 1))
                y_data_quantile = all_quantiles[:, quantile_index]

                # Fit the Wilmink curve to the quantile data
                initial_guesses = [np.mean(y_data), 0, np.mean(y_data) / 2, 0.1]
                bounds = ([-np.inf, -np.inf, -np.inf, 0], [np.inf, np.inf, np.inf, np.inf])

                result = minimize(
                    quantile_loss, initial_guesses, args=(x_data.flatten(), y_data_quantile),
                    method='L-BFGS-B', bounds=bounds
                )

                if result.success:
                    popt = result.x
                    params_dict[(animal_number, lactation_number)] = {'a': popt[0], 'b': popt[1], 'c': popt[2], 'd': popt[3]}
                    dataset.loc[group.index, 'ExpectedYield'] = wilmink_lactation_curve(group['DaysInMilk'], *popt)
                else:
                    print(f"Optimization failed for cow {animal_number}, lactation {lactation_number}.")
            else:
                print(f"Unexpected shape after squeezing: {all_quantiles.shape}")

        except Exception as e:
            print(f"Error with cow {animal_number}, lactation {lactation_number}: {e}")

    dataset['ExpectedYield'] = dataset['ExpectedYield'].fillna(0)
    return dataset, params_dict

# Apply the quantile-based curve fitting function
data, params_dict = fit_wilmink_lactation_curve_with_vqr(data, quantile=0.7)


  0%|          | 1/2315 [00:00<31:51,  1.21 Segments/s]

Error with cow SE-064c0cec-1189, lactation 7: too many values to unpack (expected 2)


  0%|          | 2/2315 [00:01<21:23,  1.80 Segments/s]

Error with cow SE-064c0cec-1189, lactation 8: too many values to unpack (expected 2)


  0%|          | 3/2315 [00:01<16:19,  2.36 Segments/s]

Error with cow SE-30dc5787-1389, lactation 5: too many values to unpack (expected 2)


  0%|          | 4/2315 [00:01<15:43,  2.45 Segments/s]

Error with cow SE-30dc5787-1389, lactation 6: too many values to unpack (expected 2)


  0%|          | 5/2315 [00:02<15:03,  2.56 Segments/s]

Error with cow SE-30dc5787-1389, lactation 7: too many values to unpack (expected 2)


  0%|          | 6/2315 [00:02<15:00,  2.56 Segments/s]

Error with cow SE-30dc5787-1396, lactation 5: too many values to unpack (expected 2)


  0%|          | 7/2315 [00:03<15:20,  2.51 Segments/s]

Error with cow SE-30dc5787-1396, lactation 6: too many values to unpack (expected 2)


  0%|          | 8/2315 [00:03<15:36,  2.46 Segments/s]

Error with cow SE-30dc5787-1402, lactation 5: too many values to unpack (expected 2)


  0%|          | 9/2315 [00:03<15:21,  2.50 Segments/s]

Error with cow SE-5c06d92d-2000, lactation 8: too many values to unpack (expected 2)


  0%|          | 10/2315 [00:04<15:11,  2.53 Segments/s]

Error with cow SE-5c06d92d-2016, lactation 8: too many values to unpack (expected 2)


  0%|          | 11/2315 [00:04<14:55,  2.57 Segments/s]

Error with cow SE-5c06d92d-2055, lactation 6: too many values to unpack (expected 2)


  1%|          | 12/2315 [00:05<15:28,  2.48 Segments/s]

Error with cow SE-5c06d92d-2055, lactation 7: too many values to unpack (expected 2)


  1%|          | 13/2315 [00:05<15:08,  2.53 Segments/s]

Error with cow SE-5c06d92d-2058, lactation 7: too many values to unpack (expected 2)


  1%|          | 14/2315 [00:05<15:44,  2.43 Segments/s]

Error with cow SE-5c06d92d-2058, lactation 8: too many values to unpack (expected 2)


  1%|          | 15/2315 [00:06<14:06,  2.72 Segments/s]

Error with cow SE-5c06d92d-2058, lactation 9: too many values to unpack (expected 2)


  1%|          | 16/2315 [00:06<14:35,  2.63 Segments/s]

Error with cow SE-5c06d92d-2211, lactation 7: too many values to unpack (expected 2)


  1%|          | 17/2315 [00:06<14:35,  2.63 Segments/s]

Error with cow SE-5c06d92d-2246, lactation 6: too many values to unpack (expected 2)


  1%|          | 18/2315 [00:07<14:26,  2.65 Segments/s]

Error with cow SE-5c06d92d-2254, lactation 6: too many values to unpack (expected 2)


  1%|          | 19/2315 [00:07<14:28,  2.64 Segments/s]

Error with cow SE-5c06d92d-2254, lactation 7: too many values to unpack (expected 2)


  1%|          | 20/2315 [00:07<13:59,  2.73 Segments/s]

Error with cow SE-5c06d92d-2268, lactation 6: too many values to unpack (expected 2)


  1%|          | 21/2315 [00:08<14:59,  2.55 Segments/s]

Error with cow SE-5c06d92d-2268, lactation 7: too many values to unpack (expected 2)


  1%|          | 22/2315 [00:08<13:31,  2.83 Segments/s]

Error with cow SE-5c06d92d-2268, lactation 8: too many values to unpack (expected 2)


  1%|          | 23/2315 [00:09<13:35,  2.81 Segments/s]

Error with cow SE-5c06d92d-2283, lactation 6: too many values to unpack (expected 2)


  1%|          | 24/2315 [00:09<14:24,  2.65 Segments/s]

Error with cow SE-5c06d92d-2283, lactation 7: too many values to unpack (expected 2)


  1%|          | 25/2315 [00:09<14:07,  2.70 Segments/s]

Error with cow SE-5c06d92d-2325, lactation 5: too many values to unpack (expected 2)


  1%|          | 26/2315 [00:10<15:11,  2.51 Segments/s]

Error with cow SE-5c06d92d-2325, lactation 6: too many values to unpack (expected 2)


  1%|          | 27/2315 [00:10<13:39,  2.79 Segments/s]

Error with cow SE-5c06d92d-2325, lactation 7: too many values to unpack (expected 2)


  1%|          | 28/2315 [00:10<13:02,  2.92 Segments/s]

Error with cow SE-5c06d92d-2327, lactation 6: too many values to unpack (expected 2)


  1%|▏         | 29/2315 [00:11<12:17,  3.10 Segments/s]

Error with cow SE-5c06d92d-2328, lactation 5: too many values to unpack (expected 2)


  1%|▏         | 30/2315 [00:11<13:06,  2.90 Segments/s]

Error with cow SE-5c06d92d-2333, lactation 6: too many values to unpack (expected 2)


  1%|▏         | 31/2315 [00:11<13:36,  2.80 Segments/s]

Error with cow SE-5c06d92d-2333, lactation 7: too many values to unpack (expected 2)


  1%|▏         | 31/2315 [00:12<14:56,  2.55 Segments/s]


KeyboardInterrupt: 