In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from scipy.optimize import curve_fit, OptimizeWarning
from tqdm import tqdm
import warnings
from scipy.stats import zscore
from statsmodels.tsa.stattools import acf, pacf
from scipy.optimize import minimize
from vqr import VectorQuantileRegressor
from vqr.solvers.regularized_lse import RegularizedDualVQRSolver
import statsmodels.api as sm


sns.set_theme()
sns.set_context("notebook")
%load_ext autoreload
%autoreload 2

In [2]:
dtype_dict = {
    'FarmName_Pseudo': 'str',
    'SE_Number': 'str',
    'AnimalNumber': 'Int64',          
    'StartDate': 'str',
    'StartTime': 'str',
    'DateTime': 'str',
    'LactationNumber': 'Int64',       
    'DaysInMilk': 'Int64', 
    'YearSeason': 'str',           
    'TotalYield': 'float',
    'DateTime': 'str',
    'BreedName': 'str',
    'Age': 'Int64',
    'Mother': 'str',
    'Father': 'str',
    'CullDecisionDate': 'str',
    'Temperature': 'float',
    'RelativeHumidity': 'float',      
    'THI_adj': 'float',
    'HW': 'Int64',                    
    'cum_HW': 'Int64',                
    'Temp15Threshold': 'Int64'        
}


# Load the CSV with specified dtypes
data = pd.read_csv('../Data/MergedData/CleanedYieldData.csv', dtype=dtype_dict)

# Convert date and time columns back to datetime and time objects
data['DateTime'] = pd.to_datetime(data['DateTime'], errors='coerce')
data['StartTime'] = pd.to_datetime(data['StartTime'], format='%H:%M:%S', errors='coerce').dt.time
data['StartDate'] = pd.to_datetime(data['StartDate'], errors='coerce')
data['CullDecisionDate'] = pd.to_datetime(data['CullDecisionDate'], errors='coerce')
data['DateTime'] = pd.to_datetime(data['DateTime'], errors='coerce')
data.head()

Unnamed: 0,FarmName_Pseudo,SE_Number,AnimalNumber,StartDate,StartTime,LactationNumber,DaysInMilk,YearSeason,TotalYield,DateTime,...,Mother,Father,CullDecisionDate,Temperature2,RelativeHumidity,THI_adj2,HW,cum_HW,Temp15Threshold,Age
0,5b581702,SE-27c3257a-1492,1492,2022-11-25,07:13:00,1,32,202204,18.57,2022-11-25 07:13:00,...,SE-27c3257a-1355,SE-458cc45c-7488,NaT,1.2,0.968,35.706,0,0,0,773
1,5b581702,SE-27c3257a-1492,1492,2022-11-25,14:13:00,1,32,202204,9.73,2022-11-25 14:13:00,...,SE-27c3257a-1355,SE-458cc45c-7488,NaT,1.7,0.971,38.17406,0,0,0,773
2,5b581702,SE-27c3257a-1492,1492,2022-11-25,23:02:00,1,32,202204,11.3,2022-11-25 23:02:00,...,SE-27c3257a-1355,SE-458cc45c-7488,NaT,1.6,0.982,37.8276,0,0,0,773
3,5b581702,SE-27c3257a-1492,1492,2022-11-26,08:01:00,1,33,202204,12.78,2022-11-26 08:01:00,...,SE-27c3257a-1355,SE-458cc45c-7488,NaT,1.6,0.979,36.56612,0,0,0,774
4,5b581702,SE-27c3257a-1492,1492,2022-11-26,14:53:00,1,33,202204,9.41,2022-11-26 14:53:00,...,SE-27c3257a-1355,SE-458cc45c-7488,NaT,1.3,0.976,37.1724,0,0,0,774


In [4]:
# Calculate the DailyYield for each cow each day
data['DailyYield'] = data.groupby(['SE_Number', 'StartDate'])['TotalYield'].transform('sum')

# Sort the data by AnimalNumber and StartDate
data.sort_values(['AnimalNumber', 'StartDate'], inplace=True)

# Calculate the previous day's total yield for each cow
data['PreviousDailyYield'] = data.groupby('AnimalNumber')['DailyYield'].shift(1)

# Calculate the daily yield change for each cow
data['DailyYieldChange'] = data['DailyYield'] - data['PreviousDailyYield']

# Group and aggregate data
data = data.groupby(['SE_Number', 'FarmName_Pseudo', 'StartDate']).agg({
    'DailyYield': 'first',
    'PreviousDailyYield': 'first',
    'DailyYieldChange': 'first',
    'HW': 'max',
    'Temperature2': 'mean',
    'THI_adj2': 'mean',
    'DaysInMilk': 'first',
    'YearSeason': 'first',
    'cum_HW': 'max',
    'Temp15Threshold': 'max',
    'Age': 'first',
    'BreedName': 'first',
    'LactationNumber': 'first'
}).reset_index()

# Renaming and formatting
data.rename(columns={
    'Temperature2': 'MeanTemperature',
    'THI_adj2': 'MeanTHI_adj',
    'StartDate': 'Date'
}, inplace=True)
data['Date'] = pd.to_datetime(data['Date'])

# Display the first few rows of the transformed data
data.head()

Unnamed: 0,SE_Number,FarmName_Pseudo,Date,DailyYield,PreviousDailyYield,DailyYieldChange,HW,MeanTemperature,MeanTHI_adj,DaysInMilk,YearSeason,cum_HW,Temp15Threshold,Age,BreedName,LactationNumber
0,SE-27c3257a-1492,5b581702,2022-11-25,39.6,39.6,0.0,0,1.5,37.235887,32,202204,0,0,773,DairyCross,1
1,SE-27c3257a-1492,5b581702,2022-11-26,22.19,39.6,-17.41,0,1.45,36.86926,33,202204,0,0,774,DairyCross,1
2,SE-27c3257a-1492,5b581702,2022-11-27,29.4,22.19,7.21,0,-0.15,30.6789,34,202204,0,0,775,DairyCross,1
3,SE-27c3257a-1492,5b581702,2022-11-28,27.03,29.4,-2.37,0,2.7,37.0075,35,202204,0,0,776,DairyCross,1
4,SE-27c3257a-1492,5b581702,2022-11-29,26.88,27.03,-0.15,0,2.85,38.53602,36,202204,0,0,777,DairyCross,1


In [5]:
# Check if DailyYield is centered around approx the same for each farm
print("Mean of DailyYield:", data.groupby('FarmName_Pseudo')['DailyYield'].mean())
print("Standard Deviation of DailyYield:", data.groupby('FarmName_Pseudo')['DailyYield'].std())

Mean of DailyYield: FarmName_Pseudo
5b581702    35.678012
5c06d92d    37.452461
6d38bc90    13.700241
a624fb9a    34.210868
a756bc39    35.051945
ad0a39f5    39.726063
f454e660    31.359891
Name: DailyYield, dtype: float64
Standard Deviation of DailyYield: FarmName_Pseudo
5b581702    11.797328
5c06d92d     9.368416
6d38bc90     4.865680
a624fb9a    10.963657
a756bc39    12.326717
ad0a39f5     9.542836
f454e660    11.422065
Name: DailyYield, dtype: float64


In [6]:
# Define the Wilmink Lactation Curve function
def wilmink_lactation_curve(dim, a, b, c, d):
    return a + b * dim + c * np.exp(-d * dim)

# Function to remove outliers
def remove_outliers(group, threshold=3.5):
    mean = np.mean(group['DailyYield'])
    std_dev = np.std(group['DailyYield'])
    return group[(group['DailyYield'] > mean - threshold * std_dev) & (group['DailyYield'] < mean + threshold * std_dev)]

# Function to smooth the data using .loc to avoid SettingWithCopyWarning
def smooth_data(group, window=5):
    group.loc[:, 'DailyYield'] = group['DailyYield'].rolling(window, min_periods=1).mean()
    return group

# Function to fit curve_fit before applying Quantile Regression
def fit_with_curve_fit_before_quantreg(dataset, quantile=0.7, max_iter=100000):
    params_dict = {}
    valid_indices = []

    for (animal_number, lactation_number), group in tqdm(dataset.groupby(['SE_Number', 'LactationNumber']), unit=" Segments"):
        try:
            group = remove_outliers(group)
            group = smooth_data(group)
            x_data = group['DaysInMilk'].values.astype(float)
            y_data = group['DailyYield'].values.astype(float)

            # Ensure there are enough data points to fit the curve
            if (len(x_data) < 150) or (len(y_data) < 150):
                print(f"Insufficient data points for cow {animal_number}, lactation {lactation_number}, skipping.")
                continue

            valid_indices.extend(group.index)

            # Fit the model using curve_fit
            try:
                # Initial parameter guesses
                initial_guesses = [np.mean(y_data), 0, np.mean(y_data) / 2, 0.1]
                # Bounds on the parameters to prevent overflow
                bounds = ([-np.inf, -np.inf, -np.inf, 0], [np.inf, np.inf, np.inf, np.inf])

                with warnings.catch_warnings():
                    warnings.filterwarnings('error', category=OptimizeWarning)
                    popt, _ = curve_fit(
                        wilmink_lactation_curve, x_data, y_data,
                        p0=initial_guesses, bounds=bounds, maxfev=30000
                    )

                # Store the parameters in the dictionary
                params_dict[(animal_number, lactation_number)] = {'a': popt[0], 'b': popt[1], 'c': popt[2], 'd': popt[3]}

            except Exception as e:
                print(f"Curve fitting failed for cow {animal_number}, lactation {lactation_number}: {e}")
                continue

            # Now use the parameters from curve_fit for quantile regression
            X = np.column_stack([np.ones_like(x_data), x_data, np.exp(-x_data), -x_data * np.exp(-x_data)])
            quantreg_model = sm.QuantReg(y_data, X)
            quantreg_fit = quantreg_model.fit(q=quantile, max_iter=max_iter, start_params=popt)

            # Update parameters after quantile regression
            a, b, c, d = quantreg_fit.params
            dataset.loc[group.index, 'ExpectedYield'] = wilmink_lactation_curve(group['DaysInMilk'], a, b, c, d)
            params_dict[(animal_number, lactation_number)] = {'a': a, 'b': b, 'c': c, 'd': d}

        except Exception as e:
            print(f"Error processing cow {animal_number}, lactation {lactation_number}: {e}")

    return dataset, params_dict

# Apply the curve fitting before quantile regression
data, params_dict = fit_with_curve_fit_before_quantreg(data, quantile=0.7, max_iter=100000)

# Remove rows where ExpectedYield is NaN
data = data.dropna(subset=['ExpectedYield'])

# Calculate NormalizedDailyYield, PreviousDailyYield, DailyYieldChange, and NormalizedDailyYieldChange
data.loc[:, 'NormalizedDailyYield'] = data['DailyYield'] / data['ExpectedYield']
data.loc[:, 'PreviousDailyYield'] = data.groupby('SE_Number')['DailyYield'].shift(1)
data.loc[:, 'DailyYieldChange'] = data['DailyYield'] - data['PreviousDailyYield']
data.loc[:, 'NormalizedDailyYieldChange'] = data['DailyYieldChange'] / data['ExpectedYield']
data

  2%|▏         | 27/1207 [00:06<01:45, 11.15 Segments/s]

Insufficient data points for cow SE-5b581702-1912, lactation 3, skipping.


  4%|▎         | 44/1207 [00:09<02:48,  6.89 Segments/s]

Insufficient data points for cow SE-5b581702-2002, lactation 3, skipping.


  result = getattr(ufunc, method)(*inputs2, **kwargs)
 10%|▉         | 116/1207 [00:18<01:32, 11.75 Segments/s]

Insufficient data points for cow SE-5c06d92d-2744, lactation 4, skipping.
Insufficient data points for cow SE-5c06d92d-2762, lactation 5, skipping.


 10%|█         | 125/1207 [00:18<01:04, 16.85 Segments/s]

Insufficient data points for cow SE-5c06d92d-2782, lactation 5, skipping.


 13%|█▎        | 158/1207 [00:19<00:34, 30.30 Segments/s]

Insufficient data points for cow SE-5c06d92d-2866, lactation 4, skipping.


 14%|█▍        | 175/1207 [00:20<00:36, 28.55 Segments/s]

Insufficient data points for cow SE-5c06d92d-2941, lactation 3, skipping.


 17%|█▋        | 206/1207 [00:23<02:04,  8.06 Segments/s]

Insufficient data points for cow SE-5c06d92d-3017, lactation 4, skipping.


 20%|██        | 247/1207 [00:29<00:54, 17.62 Segments/s]

Insufficient data points for cow SE-5c06d92d-3054, lactation 3, skipping.


 21%|██▏       | 259/1207 [00:29<00:33, 27.96 Segments/s]

Insufficient data points for cow SE-5c06d92d-3076, lactation 4, skipping.


 22%|██▏       | 271/1207 [00:30<00:29, 31.54 Segments/s]

Insufficient data points for cow SE-5c06d92d-3106, lactation 4, skipping.


 27%|██▋       | 327/1207 [00:35<01:36,  9.07 Segments/s]

Insufficient data points for cow SE-5c06d92d-3176, lactation 3, skipping.


 32%|███▏      | 388/1207 [00:39<00:28, 28.51 Segments/s]

Insufficient data points for cow SE-5c06d92d-3226, lactation 3, skipping.


 34%|███▍      | 408/1207 [00:39<00:26, 29.78 Segments/s]

Insufficient data points for cow SE-5c06d92d-3266, lactation 2, skipping.


 37%|███▋      | 445/1207 [00:40<00:17, 43.68 Segments/s]

Insufficient data points for cow SE-5c06d92d-3292, lactation 1, skipping.


 39%|███▉      | 468/1207 [00:41<00:19, 38.89 Segments/s]

Insufficient data points for cow SE-5c06d92d-3330, lactation 2, skipping.


 41%|████▏     | 500/1207 [00:43<00:30, 23.41 Segments/s]

Insufficient data points for cow SE-5c06d92d-3377, lactation 2, skipping.


 43%|████▎     | 518/1207 [00:43<00:17, 38.90 Segments/s]

Insufficient data points for cow SE-5c06d92d-3403, lactation 2, skipping.
Insufficient data points for cow SE-5c06d92d-3404, lactation 2, skipping.


 44%|████▍     | 532/1207 [00:45<00:45, 14.88 Segments/s]

Insufficient data points for cow SE-5c06d92d-3427, lactation 2, skipping.


  result = getattr(ufunc, method)(*inputs2, **kwargs)
 51%|█████     | 613/1207 [00:47<00:08, 66.85 Segments/s]

Insufficient data points for cow SE-5c06d92d-3589, lactation 1, skipping.
Insufficient data points for cow SE-5c06d92d-3593, lactation 1, skipping.
Insufficient data points for cow SE-5c06d92d-3594, lactation 1, skipping.
Insufficient data points for cow SE-5c06d92d-3599, lactation 1, skipping.
Insufficient data points for cow SE-5c06d92d-3601, lactation 1, skipping.
Insufficient data points for cow SE-5c06d92d-3602, lactation 1, skipping.
Insufficient data points for cow SE-5c06d92d-3605, lactation 1, skipping.
Insufficient data points for cow SE-5c06d92d-3607, lactation 1, skipping.
Insufficient data points for cow SE-61d8a0fa-0972, lactation 5, skipping.
Insufficient data points for cow SE-6d38bc90-2373, lactation 5, skipping.
Insufficient data points for cow SE-6d38bc90-2592, lactation 3, skipping.


 51%|█████▏    | 621/1207 [00:47<00:10, 56.99 Segments/s]

Insufficient data points for cow SE-6d38bc90-2869, lactation 2, skipping.
Insufficient data points for cow SE-6d38bc90-2923, lactation 2, skipping.
Insufficient data points for cow SE-6d38bc90-3120, lactation 1, skipping.
Insufficient data points for cow SE-6d38bc90-3146, lactation 1, skipping.
Insufficient data points for cow SE-7b463eec-1624, lactation 2, skipping.
Insufficient data points for cow SE-a624fb9a-1187, lactation 6, skipping.


 54%|█████▍    | 650/1207 [00:50<00:44, 12.54 Segments/s]

Insufficient data points for cow SE-a624fb9a-1275, lactation 5, skipping.
Insufficient data points for cow SE-a624fb9a-1287, lactation 5, skipping.


 55%|█████▌    | 665/1207 [00:54<01:48,  4.97 Segments/s]

Insufficient data points for cow SE-a624fb9a-1322, lactation 4, skipping.


 56%|█████▌    | 675/1207 [00:54<01:06,  8.04 Segments/s]

Insufficient data points for cow SE-a624fb9a-1330, lactation 5, skipping.


 57%|█████▋    | 691/1207 [00:57<00:56,  9.08 Segments/s]

Insufficient data points for cow SE-a624fb9a-1342, lactation 4, skipping.
Insufficient data points for cow SE-a624fb9a-1348, lactation 3, skipping.


 68%|██████▊   | 817/1207 [01:06<00:10, 37.23 Segments/s]

Insufficient data points for cow SE-a624fb9a-1521, lactation 1, skipping.
Insufficient data points for cow SE-a756bc39-0884, lactation 7, skipping.
Insufficient data points for cow SE-a756bc39-0894, lactation 6, skipping.


 69%|██████▉   | 832/1207 [01:06<00:08, 45.95 Segments/s]

Insufficient data points for cow SE-a756bc39-0975, lactation 5, skipping.
Insufficient data points for cow SE-a756bc39-0986, lactation 5, skipping.
Insufficient data points for cow SE-a756bc39-0999, lactation 5, skipping.
Insufficient data points for cow SE-a756bc39-1024, lactation 4, skipping.


 70%|██████▉   | 843/1207 [01:17<02:20,  2.59 Segments/s]

Insufficient data points for cow SE-a756bc39-1070, lactation 4, skipping.
Insufficient data points for cow SE-a756bc39-1112, lactation 3, skipping.
Insufficient data points for cow SE-a756bc39-1127, lactation 3, skipping.
Insufficient data points for cow SE-a756bc39-1129, lactation 3, skipping.
Insufficient data points for cow SE-a756bc39-1135, lactation 3, skipping.


 71%|███████   | 853/1207 [01:17<01:22,  4.30 Segments/s]

Insufficient data points for cow SE-a756bc39-1154, lactation 3, skipping.


 71%|███████   | 859/1207 [01:20<01:31,  3.82 Segments/s]

Insufficient data points for cow SE-a756bc39-1157, lactation 3, skipping.


 72%|███████▏  | 867/1207 [01:22<01:30,  3.74 Segments/s]

Insufficient data points for cow SE-a756bc39-1159, lactation 3, skipping.
Insufficient data points for cow SE-a756bc39-1162, lactation 3, skipping.
Insufficient data points for cow SE-a756bc39-1195, lactation 3, skipping.


 73%|███████▎  | 879/1207 [01:22<00:44,  7.43 Segments/s]

Insufficient data points for cow SE-a756bc39-1206, lactation 3, skipping.
Insufficient data points for cow SE-a756bc39-1207, lactation 2, skipping.
Insufficient data points for cow SE-a756bc39-1222, lactation 2, skipping.
Insufficient data points for cow SE-a756bc39-1224, lactation 2, skipping.


 74%|███████▍  | 893/1207 [01:23<00:21, 14.50 Segments/s]

Insufficient data points for cow SE-a756bc39-1226, lactation 2, skipping.
Insufficient data points for cow SE-a756bc39-1230, lactation 2, skipping.
Insufficient data points for cow SE-a756bc39-1231, lactation 2, skipping.


 75%|███████▍  | 903/1207 [01:23<00:15, 20.26 Segments/s]

Insufficient data points for cow SE-a756bc39-1252, lactation 2, skipping.
Insufficient data points for cow SE-a756bc39-1264, lactation 1, skipping.
Insufficient data points for cow SE-a756bc39-1268, lactation 1, skipping.


 75%|███████▌  | 909/1207 [01:23<00:11, 25.53 Segments/s]

Insufficient data points for cow SE-a756bc39-1275, lactation 1, skipping.
Insufficient data points for cow SE-a756bc39-1276, lactation 1, skipping.
Insufficient data points for cow SE-a756bc39-1284, lactation 1, skipping.
Insufficient data points for cow SE-a756bc39-1285, lactation 1, skipping.
Insufficient data points for cow SE-a756bc39-1288, lactation 1, skipping.


 77%|███████▋  | 935/1207 [01:25<00:15, 17.83 Segments/s]

Insufficient data points for cow SE-ad0a39f5-2139, lactation 5, skipping.
Insufficient data points for cow SE-ad0a39f5-2147, lactation 5, skipping.
Insufficient data points for cow SE-ad0a39f5-2176, lactation 4, skipping.
Insufficient data points for cow SE-ad0a39f5-2230, lactation 4, skipping.


 79%|███████▊  | 948/1207 [01:26<00:10, 25.56 Segments/s]

Insufficient data points for cow SE-ad0a39f5-2280, lactation 4, skipping.
Insufficient data points for cow SE-ad0a39f5-2282, lactation 3, skipping.
Insufficient data points for cow SE-ad0a39f5-2283, lactation 4, skipping.
Insufficient data points for cow SE-ad0a39f5-2288, lactation 4, skipping.
Insufficient data points for cow SE-ad0a39f5-2295, lactation 4, skipping.
Insufficient data points for cow SE-ad0a39f5-2321, lactation 4, skipping.


 79%|███████▉  | 957/1207 [01:28<00:28,  8.90 Segments/s]

Insufficient data points for cow SE-ad0a39f5-2339, lactation 4, skipping.
Insufficient data points for cow SE-ad0a39f5-2346, lactation 3, skipping.
Insufficient data points for cow SE-ad0a39f5-2349, lactation 4, skipping.


 80%|████████  | 970/1207 [01:31<00:31,  7.41 Segments/s]

Insufficient data points for cow SE-ad0a39f5-2392, lactation 4, skipping.
Insufficient data points for cow SE-ad0a39f5-2420, lactation 3, skipping.
Insufficient data points for cow SE-ad0a39f5-2438, lactation 3, skipping.


 83%|████████▎ | 1007/1207 [01:38<00:31,  6.33 Segments/s]

Insufficient data points for cow SE-ad0a39f5-2533, lactation 2, skipping.
Insufficient data points for cow SE-ad0a39f5-2537, lactation 3, skipping.


  result = getattr(ufunc, method)(*inputs2, **kwargs)
 84%|████████▍ | 1018/1207 [01:38<00:16, 11.41 Segments/s]

Insufficient data points for cow SE-ad0a39f5-2548, lactation 3, skipping.
Insufficient data points for cow SE-ad0a39f5-2576, lactation 2, skipping.
Insufficient data points for cow SE-ad0a39f5-2580, lactation 2, skipping.
Insufficient data points for cow SE-ad0a39f5-2599, lactation 2, skipping.


 85%|████████▌ | 1031/1207 [01:38<00:08, 20.83 Segments/s]

Insufficient data points for cow SE-ad0a39f5-2621, lactation 2, skipping.
Insufficient data points for cow SE-ad0a39f5-2625, lactation 2, skipping.
Insufficient data points for cow SE-ad0a39f5-2628, lactation 2, skipping.
Insufficient data points for cow SE-ad0a39f5-2629, lactation 2, skipping.
Insufficient data points for cow SE-ad0a39f5-2642, lactation 2, skipping.
Insufficient data points for cow SE-ad0a39f5-2643, lactation 2, skipping.
Insufficient data points for cow SE-ad0a39f5-2644, lactation 2, skipping.
Insufficient data points for cow SE-ad0a39f5-2647, lactation 2, skipping.


 86%|████████▋ | 1042/1207 [01:39<00:05, 31.68 Segments/s]

Insufficient data points for cow SE-ad0a39f5-2653, lactation 2, skipping.
Insufficient data points for cow SE-ad0a39f5-2654, lactation 2, skipping.
Insufficient data points for cow SE-ad0a39f5-2657, lactation 2, skipping.


 88%|████████▊ | 1057/1207 [01:41<00:12, 11.55 Segments/s]

Insufficient data points for cow SE-ad0a39f5-2664, lactation 2, skipping.
Insufficient data points for cow SE-ad0a39f5-2668, lactation 2, skipping.
Insufficient data points for cow SE-ad0a39f5-2676, lactation 2, skipping.
Insufficient data points for cow SE-ad0a39f5-2677, lactation 2, skipping.


 89%|████████▉ | 1080/1207 [01:42<00:05, 23.19 Segments/s]

Insufficient data points for cow SE-ad0a39f5-2710, lactation 2, skipping.


 91%|█████████ | 1095/1207 [01:42<00:03, 31.92 Segments/s]

Insufficient data points for cow SE-ad0a39f5-2734, lactation 1, skipping.


 92%|█████████▏| 1110/1207 [01:43<00:03, 32.28 Segments/s]

Insufficient data points for cow SE-ad0a39f5-2765, lactation 1, skipping.
Insufficient data points for cow SE-ad0a39f5-2768, lactation 1, skipping.
Insufficient data points for cow SE-ad0a39f5-2781, lactation 1, skipping.
Insufficient data points for cow SE-ad0a39f5-2787, lactation 1, skipping.
Insufficient data points for cow SE-ad0a39f5-2790, lactation 1, skipping.
Insufficient data points for cow SE-ad0a39f5-2792, lactation 1, skipping.
Insufficient data points for cow SE-ad0a39f5-2795, lactation 1, skipping.
Insufficient data points for cow SE-ad0a39f5-2797, lactation 1, skipping.


 94%|█████████▍| 1136/1207 [01:43<00:01, 70.03 Segments/s]

Insufficient data points for cow SE-ad0a39f5-2806, lactation 1, skipping.
Insufficient data points for cow SE-ad0a39f5-2807, lactation 1, skipping.
Insufficient data points for cow SE-ad0a39f5-2810, lactation 1, skipping.
Insufficient data points for cow SE-ad0a39f5-2811, lactation 1, skipping.
Insufficient data points for cow SE-ad0a39f5-2813, lactation 1, skipping.
Insufficient data points for cow SE-ad0a39f5-2815, lactation 1, skipping.
Insufficient data points for cow SE-ad0a39f5-2816, lactation 1, skipping.
Insufficient data points for cow SE-ad0a39f5-2817, lactation 1, skipping.
Insufficient data points for cow SE-ad0a39f5-2831, lactation 1, skipping.
Insufficient data points for cow SE-ad0a39f5-2832, lactation 1, skipping.
Insufficient data points for cow SE-ad0a39f5-2842, lactation 1, skipping.


100%|██████████| 1207/1207 [02:11<00:00,  9.14 Segments/s]


Unnamed: 0,SE_Number,FarmName_Pseudo,Date,DailyYield,PreviousDailyYield,DailyYieldChange,HW,MeanTemperature,MeanTHI_adj,DaysInMilk,YearSeason,cum_HW,Temp15Threshold,Age,BreedName,LactationNumber,ExpectedYield,NormalizedDailyYield,NormalizedDailyYieldChange
0,SE-27c3257a-1492,5b581702,2022-11-25,39.60,,,0,1.500000,37.235887,32,202204,0,0,773,DairyCross,1,32.969051,1.201126,
1,SE-27c3257a-1492,5b581702,2022-11-26,22.19,39.60,-17.41,0,1.450000,36.869260,33,202204,0,0,774,DairyCross,1,32.956278,0.673316,-0.528276
2,SE-27c3257a-1492,5b581702,2022-11-27,29.40,22.19,7.21,0,-0.150000,30.678900,34,202204,0,0,775,DairyCross,1,32.943505,0.892437,0.21886
3,SE-27c3257a-1492,5b581702,2022-11-28,27.03,29.40,-2.37,0,2.700000,37.007500,35,202204,0,0,776,DairyCross,1,32.930733,0.820814,-0.071969
4,SE-27c3257a-1492,5b581702,2022-11-29,26.88,27.03,-0.15,0,2.850000,38.536020,36,202204,0,0,777,DairyCross,1,32.91796,0.816576,-0.004557
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
311101,SE-f454e660-0829,f454e660,2024-08-14,31.69,30.01,1.68,0,17.325000,64.076020,284,202403,0,1,939,SLB,1,27.922181,1.13494,0.060167
311102,SE-f454e660-0829,f454e660,2024-08-15,26.34,31.69,-5.35,0,21.066667,65.716753,285,202403,0,1,940,SLB,1,27.895739,0.94423,-0.191786
311103,SE-f454e660-0829,f454e660,2024-08-16,24.10,26.34,-2.24,0,19.966667,64.264293,286,202403,0,1,941,SLB,1,27.869296,0.864751,-0.080375
311104,SE-f454e660-0829,f454e660,2024-08-17,25.73,24.10,1.63,0,16.333333,60.634800,287,202403,0,1,942,SLB,1,27.842854,0.924115,0.058543


In [7]:
# Check if NormalizedDailyYield is centered around 1 for each unique farm
print("Mean of NormalizedDailyYield:", data.groupby('FarmName_Pseudo')['NormalizedDailyYield'].mean())
print("Standard Deviation of NormalizedDailyYield:", data.groupby('FarmName_Pseudo')['NormalizedDailyYield'].std())

Mean of NormalizedDailyYield: FarmName_Pseudo
5b581702    0.936165
5c06d92d    0.945045
a624fb9a     0.92939
ad0a39f5    0.951541
f454e660    0.929767
Name: NormalizedDailyYield, dtype: Float64
Standard Deviation of NormalizedDailyYield: FarmName_Pseudo
5b581702    0.188149
5c06d92d    0.141736
a624fb9a    0.183671
ad0a39f5    0.122988
f454e660     0.21784
Name: NormalizedDailyYield, dtype: Float64


In [8]:
# Define the THI threshold
THI_THRESHOLD = 61

# Calculate the daily heat load based on the THI threshold
data['HeatLoad'] = data['MeanTHI_adj'].apply(lambda x: x - THI_THRESHOLD if x > THI_THRESHOLD else -(THI_THRESHOLD - x))

# Initialize the cumulative heat load column with float type
data['CumulativeHeatLoad'] = 0.0  # Explicitly set as float

data = data.reset_index(drop=True)

# Iterate through the data to calculate cumulative heat load correctly
for i in range(1, len(data)):
    previous_cumulative = data.at[i-1, 'CumulativeHeatLoad']
    current_heat_load = data.at[i, 'HeatLoad']
    
    if current_heat_load < 0:  # If current heat load is negative
        new_cumulative = previous_cumulative + 2 * current_heat_load
    else:
        new_cumulative = previous_cumulative + current_heat_load
    
    # Ensure the cumulative heat load never goes below zero
    if new_cumulative > 0:
        data.at[i, 'CumulativeHeatLoad'] = new_cumulative
    else:
        data.at[i, 'CumulativeHeatLoad'] = 0.0  # Ensure float is maintained

# Drop rows where the 'DailyYield' column has NaN values
data = data.dropna(subset=['DailyYield'])

data.head(-5)

Unnamed: 0,SE_Number,FarmName_Pseudo,Date,DailyYield,PreviousDailyYield,DailyYieldChange,HW,MeanTemperature,MeanTHI_adj,DaysInMilk,...,cum_HW,Temp15Threshold,Age,BreedName,LactationNumber,ExpectedYield,NormalizedDailyYield,NormalizedDailyYieldChange,HeatLoad,CumulativeHeatLoad
0,SE-27c3257a-1492,5b581702,2022-11-25,39.60,,,0,1.500000,37.235887,32,...,0,0,773,DairyCross,1,32.969051,1.201126,,-23.764113,0.000000
1,SE-27c3257a-1492,5b581702,2022-11-26,22.19,39.60,-17.41,0,1.450000,36.869260,33,...,0,0,774,DairyCross,1,32.956278,0.673316,-0.528276,-24.130740,0.000000
2,SE-27c3257a-1492,5b581702,2022-11-27,29.40,22.19,7.21,0,-0.150000,30.678900,34,...,0,0,775,DairyCross,1,32.943505,0.892437,0.21886,-30.321100,0.000000
3,SE-27c3257a-1492,5b581702,2022-11-28,27.03,29.40,-2.37,0,2.700000,37.007500,35,...,0,0,776,DairyCross,1,32.930733,0.820814,-0.071969,-23.992500,0.000000
4,SE-27c3257a-1492,5b581702,2022-11-29,26.88,27.03,-0.15,0,2.850000,38.536020,36,...,0,0,777,DairyCross,1,32.91796,0.816576,-0.004557,-22.463980,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297961,SE-f454e660-0829,f454e660,2024-08-09,23.96,32.13,-8.17,0,19.066667,64.864167,279,...,0,1,934,SLB,1,28.054395,0.854055,-0.29122,3.864167,85.509565
297962,SE-f454e660-0829,f454e660,2024-08-10,24.37,23.96,0.41,0,18.466667,54.884093,280,...,0,1,935,SLB,1,28.027952,0.869489,0.014628,-6.115907,73.277752
297963,SE-f454e660-0829,f454e660,2024-08-11,26.92,24.37,2.55,0,17.933333,55.203273,281,...,0,1,936,SLB,1,28.001509,0.961377,0.091067,-5.796727,61.684298
297964,SE-f454e660-0829,f454e660,2024-08-12,23.94,26.92,-2.98,0,17.166667,58.966293,282,...,0,1,937,SLB,1,27.975067,0.855762,-0.106523,-2.033707,57.616885


In [9]:
# When CumulativeHeatLoad is greater than 3, it indicates that the cow is under heat stress
data['HeatStress'] = (data['CumulativeHeatLoad'] > 3).astype(int)
data.head(-5)

Unnamed: 0,SE_Number,FarmName_Pseudo,Date,DailyYield,PreviousDailyYield,DailyYieldChange,HW,MeanTemperature,MeanTHI_adj,DaysInMilk,...,Temp15Threshold,Age,BreedName,LactationNumber,ExpectedYield,NormalizedDailyYield,NormalizedDailyYieldChange,HeatLoad,CumulativeHeatLoad,HeatStress
0,SE-27c3257a-1492,5b581702,2022-11-25,39.60,,,0,1.500000,37.235887,32,...,0,773,DairyCross,1,32.969051,1.201126,,-23.764113,0.000000,0
1,SE-27c3257a-1492,5b581702,2022-11-26,22.19,39.60,-17.41,0,1.450000,36.869260,33,...,0,774,DairyCross,1,32.956278,0.673316,-0.528276,-24.130740,0.000000,0
2,SE-27c3257a-1492,5b581702,2022-11-27,29.40,22.19,7.21,0,-0.150000,30.678900,34,...,0,775,DairyCross,1,32.943505,0.892437,0.21886,-30.321100,0.000000,0
3,SE-27c3257a-1492,5b581702,2022-11-28,27.03,29.40,-2.37,0,2.700000,37.007500,35,...,0,776,DairyCross,1,32.930733,0.820814,-0.071969,-23.992500,0.000000,0
4,SE-27c3257a-1492,5b581702,2022-11-29,26.88,27.03,-0.15,0,2.850000,38.536020,36,...,0,777,DairyCross,1,32.91796,0.816576,-0.004557,-22.463980,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297961,SE-f454e660-0829,f454e660,2024-08-09,23.96,32.13,-8.17,0,19.066667,64.864167,279,...,1,934,SLB,1,28.054395,0.854055,-0.29122,3.864167,85.509565,1
297962,SE-f454e660-0829,f454e660,2024-08-10,24.37,23.96,0.41,0,18.466667,54.884093,280,...,1,935,SLB,1,28.027952,0.869489,0.014628,-6.115907,73.277752,1
297963,SE-f454e660-0829,f454e660,2024-08-11,26.92,24.37,2.55,0,17.933333,55.203273,281,...,1,936,SLB,1,28.001509,0.961377,0.091067,-5.796727,61.684298,1
297964,SE-f454e660-0829,f454e660,2024-08-12,23.94,26.92,-2.98,0,17.166667,58.966293,282,...,1,937,SLB,1,27.975067,0.855762,-0.106523,-2.033707,57.616885,1


In [10]:
# Make a dataframe from the parameters dictionary, it should contain Se_Number, LactationNumber, a, b, c, d
params_df = pd.DataFrame(params_dict).T.reset_index()
params_df.columns = ['SE_Number', 'LactationNumber', 'a', 'b', 'c', 'd']
params_df.head(-5)

Unnamed: 0,SE_Number,LactationNumber,a,b,c,d
0,SE-27c3257a-1492,1,33.377780,-0.012773,-3.909940e-26,1.275143e-24
1,SE-27c3257a-1492,2,52.204415,-0.085917,5.498649e+02,3.991520e+02
2,SE-30dc5787-1389,6,53.550501,-0.078500,1.622484e+03,7.041568e+02
3,SE-4b8091ac-1472,1,34.455809,-0.022543,3.570575e-14,-7.041866e-13
4,SE-5b581702-1742,3,51.529751,-0.062250,3.196039e-11,-4.776671e-10
...,...,...,...,...,...,...
1064,SE-f454e660-0791,1,28.983995,-0.012756,-2.198956e+02,1.319373e+03
1065,SE-f454e660-0793,1,34.934424,-0.002913,2.042574e+04,4.936441e+03
1066,SE-f454e660-0794,1,40.528130,-0.030210,-1.715867e+02,1.029520e+03
1067,SE-f454e660-0795,1,21.530605,-0.004697,3.096113e+03,8.352267e+02


In [11]:
# Calculate Z-scores for each parameter
params_df['z_a'] = zscore(params_df['a'])
params_df['z_b'] = zscore(params_df['b'])
params_df['z_c'] = zscore(params_df['c'])
params_df['z_d'] = zscore(params_df['d'])

params_df.head(-5)

Unnamed: 0,SE_Number,LactationNumber,a,b,c,d,z_a,z_b,z_c,z_d
0,SE-27c3257a-1492,1,33.377780,-0.012773,-3.909940e-26,1.275143e-24,-1.315942,1.078161,-0.580496,-0.356803
1,SE-27c3257a-1492,2,52.204415,-0.085917,5.498649e+02,3.991520e+02,0.513397,-0.879186,-0.516239,-0.284445
2,SE-30dc5787-1389,6,53.550501,-0.078500,1.622484e+03,7.041568e+02,0.644193,-0.680715,-0.390895,-0.229154
3,SE-4b8091ac-1472,1,34.455809,-0.022543,3.570575e-14,-7.041866e-13,-1.211192,0.816708,-0.580496,-0.356803
4,SE-5b581702-1742,3,51.529751,-0.062250,3.196039e-11,-4.776671e-10,0.447841,-0.245861,-0.580496,-0.356803
...,...,...,...,...,...,...,...,...,...,...
1064,SE-f454e660-0791,1,28.983995,-0.012756,-2.198956e+02,1.319373e+03,-1.742875,1.078623,-0.606192,-0.117627
1065,SE-f454e660-0793,1,34.934424,-0.002913,2.042574e+04,4.936441e+03,-1.164686,1.342001,1.806424,0.538073
1066,SE-f454e660-0794,1,40.528130,-0.030210,-1.715867e+02,1.029520e+03,-0.621160,0.611550,-0.600547,-0.170172
1067,SE-f454e660-0795,1,21.530605,-0.004697,3.096113e+03,8.352267e+02,-2.467103,1.294265,-0.218689,-0.205393


In [12]:
# Identify outliers (using Z-score > 3.5 or < -3.5 as threshold)
outliers = params_df[(np.abs(params_df[['z_a', 'z_b', 'z_c', 'z_d']]) > 3.5).any(axis=1)]

x = outliers.count()
print("Number of outliers:", x)

# Optionally, drop the outliers
params_df_cleaned = params_df.drop(outliers.index)
params_df_cleaned.head(-5)

Number of outliers: SE_Number          23
LactationNumber    23
a                  23
b                  23
c                  23
d                  23
z_a                23
z_b                23
z_c                23
z_d                23
dtype: int64


Unnamed: 0,SE_Number,LactationNumber,a,b,c,d,z_a,z_b,z_c,z_d
0,SE-27c3257a-1492,1,33.377780,-0.012773,-3.909940e-26,1.275143e-24,-1.315942,1.078161,-0.580496,-0.356803
1,SE-27c3257a-1492,2,52.204415,-0.085917,5.498649e+02,3.991520e+02,0.513397,-0.879186,-0.516239,-0.284445
2,SE-30dc5787-1389,6,53.550501,-0.078500,1.622484e+03,7.041568e+02,0.644193,-0.680715,-0.390895,-0.229154
3,SE-4b8091ac-1472,1,34.455809,-0.022543,3.570575e-14,-7.041866e-13,-1.211192,0.816708,-0.580496,-0.356803
4,SE-5b581702-1742,3,51.529751,-0.062250,3.196039e-11,-4.776671e-10,0.447841,-0.245861,-0.580496,-0.356803
...,...,...,...,...,...,...,...,...,...,...
1064,SE-f454e660-0791,1,28.983995,-0.012756,-2.198956e+02,1.319373e+03,-1.742875,1.078623,-0.606192,-0.117627
1065,SE-f454e660-0793,1,34.934424,-0.002913,2.042574e+04,4.936441e+03,-1.164686,1.342001,1.806424,0.538073
1066,SE-f454e660-0794,1,40.528130,-0.030210,-1.715867e+02,1.029520e+03,-0.621160,0.611550,-0.600547,-0.170172
1067,SE-f454e660-0795,1,21.530605,-0.004697,3.096113e+03,8.352267e+02,-2.467103,1.294265,-0.218689,-0.205393


In [13]:
# Identify unique SE_Number and LactationNumber combinations from the outliers
outlier_combinations = outliers[['SE_Number', 'LactationNumber']].drop_duplicates()

# Merge with the original data to find rows that match these outlier combinations
data_cleaned = data.merge(outlier_combinations, on=['SE_Number', 'LactationNumber'], how='left', indicator=True)

# Keep only the rows that do not match the outlier combinations
data_cleaned = data_cleaned[data_cleaned['_merge'] == 'left_only'].drop(columns=['_merge'])

# Now data_cleaned contains the original data with the outlier combinations removed
print("Number of rows removed:", len(data) - len(data_cleaned))
data_cleaned.head(-5)

Number of rows removed: 6008


Unnamed: 0,SE_Number,FarmName_Pseudo,Date,DailyYield,PreviousDailyYield,DailyYieldChange,HW,MeanTemperature,MeanTHI_adj,DaysInMilk,...,Temp15Threshold,Age,BreedName,LactationNumber,ExpectedYield,NormalizedDailyYield,NormalizedDailyYieldChange,HeatLoad,CumulativeHeatLoad,HeatStress
0,SE-27c3257a-1492,5b581702,2022-11-25,39.60,,,0,1.500000,37.235887,32,...,0,773,DairyCross,1,32.969051,1.201126,,-23.764113,0.000000,0
1,SE-27c3257a-1492,5b581702,2022-11-26,22.19,39.60,-17.41,0,1.450000,36.869260,33,...,0,774,DairyCross,1,32.956278,0.673316,-0.528276,-24.130740,0.000000,0
2,SE-27c3257a-1492,5b581702,2022-11-27,29.40,22.19,7.21,0,-0.150000,30.678900,34,...,0,775,DairyCross,1,32.943505,0.892437,0.21886,-30.321100,0.000000,0
3,SE-27c3257a-1492,5b581702,2022-11-28,27.03,29.40,-2.37,0,2.700000,37.007500,35,...,0,776,DairyCross,1,32.930733,0.820814,-0.071969,-23.992500,0.000000,0
4,SE-27c3257a-1492,5b581702,2022-11-29,26.88,27.03,-0.15,0,2.850000,38.536020,36,...,0,777,DairyCross,1,32.91796,0.816576,-0.004557,-22.463980,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297961,SE-f454e660-0829,f454e660,2024-08-09,23.96,32.13,-8.17,0,19.066667,64.864167,279,...,1,934,SLB,1,28.054395,0.854055,-0.29122,3.864167,85.509565,1
297962,SE-f454e660-0829,f454e660,2024-08-10,24.37,23.96,0.41,0,18.466667,54.884093,280,...,1,935,SLB,1,28.027952,0.869489,0.014628,-6.115907,73.277752,1
297963,SE-f454e660-0829,f454e660,2024-08-11,26.92,24.37,2.55,0,17.933333,55.203273,281,...,1,936,SLB,1,28.001509,0.961377,0.091067,-5.796727,61.684298,1
297964,SE-f454e660-0829,f454e660,2024-08-12,23.94,26.92,-2.98,0,17.166667,58.966293,282,...,1,937,SLB,1,27.975067,0.855762,-0.106523,-2.033707,57.616885,1


In [14]:
# Check if NormalizedDailyYield is centered around 1 for each unique farm
print("Mean of NormalizedDailyYield:", data_cleaned.groupby('FarmName_Pseudo')['NormalizedDailyYield'].mean())
print("Standard Deviation of NormalizedDailyYield:", data_cleaned.groupby('FarmName_Pseudo')['NormalizedDailyYield'].std())

Mean of NormalizedDailyYield: FarmName_Pseudo
5b581702    0.935824
5c06d92d    0.945253
a624fb9a    0.929762
ad0a39f5    0.951643
f454e660    0.930904
Name: NormalizedDailyYield, dtype: Float64
Standard Deviation of NormalizedDailyYield: FarmName_Pseudo
5b581702    0.188627
5c06d92d    0.141514
a624fb9a    0.183263
ad0a39f5    0.123008
f454e660    0.217892
Name: NormalizedDailyYield, dtype: Float64


In [15]:
data_cleaned['Residuals'] = data_cleaned['DailyYield'] - data_cleaned['ExpectedYield']
data_cleaned

Unnamed: 0,SE_Number,FarmName_Pseudo,Date,DailyYield,PreviousDailyYield,DailyYieldChange,HW,MeanTemperature,MeanTHI_adj,DaysInMilk,...,Age,BreedName,LactationNumber,ExpectedYield,NormalizedDailyYield,NormalizedDailyYieldChange,HeatLoad,CumulativeHeatLoad,HeatStress,Residuals
0,SE-27c3257a-1492,5b581702,2022-11-25,39.60,,,0,1.500000,37.235887,32,...,773,DairyCross,1,32.969051,1.201126,,-23.764113,0.000000,0,6.630949
1,SE-27c3257a-1492,5b581702,2022-11-26,22.19,39.60,-17.41,0,1.450000,36.869260,33,...,774,DairyCross,1,32.956278,0.673316,-0.528276,-24.130740,0.000000,0,-10.766278
2,SE-27c3257a-1492,5b581702,2022-11-27,29.40,22.19,7.21,0,-0.150000,30.678900,34,...,775,DairyCross,1,32.943505,0.892437,0.21886,-30.321100,0.000000,0,-3.543505
3,SE-27c3257a-1492,5b581702,2022-11-28,27.03,29.40,-2.37,0,2.700000,37.007500,35,...,776,DairyCross,1,32.930733,0.820814,-0.071969,-23.992500,0.000000,0,-5.900733
4,SE-27c3257a-1492,5b581702,2022-11-29,26.88,27.03,-0.15,0,2.850000,38.536020,36,...,777,DairyCross,1,32.91796,0.816576,-0.004557,-22.463980,0.000000,0,-6.03796
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297966,SE-f454e660-0829,f454e660,2024-08-14,31.69,30.01,1.68,0,17.325000,64.076020,284,...,939,SLB,1,27.922181,1.13494,0.060167,3.076020,66.740018,1,3.767819
297967,SE-f454e660-0829,f454e660,2024-08-15,26.34,31.69,-5.35,0,21.066667,65.716753,285,...,940,SLB,1,27.895739,0.94423,-0.191786,4.716753,71.456772,1,-1.555739
297968,SE-f454e660-0829,f454e660,2024-08-16,24.10,26.34,-2.24,0,19.966667,64.264293,286,...,941,SLB,1,27.869296,0.864751,-0.080375,3.264293,74.721065,1,-3.769296
297969,SE-f454e660-0829,f454e660,2024-08-17,25.73,24.10,1.63,0,16.333333,60.634800,287,...,942,SLB,1,27.842854,0.924115,0.058543,-0.365200,73.990665,1,-2.112854


In [16]:
farm_results = []

for farm_name, farm_group in data_cleaned.groupby('FarmName_Pseudo'):
    farm_residuals = []
    
    for se_number, cow_group in farm_group.groupby('SE_Number'):
        residuals = cow_group['Residuals'].dropna()  # Drop NaN values
        
        if len(residuals) > 1:  # Ensure there are residuals to analyze
            farm_residuals.append(residuals)
    
    if len(farm_residuals) > 0:
        # Combine residuals from all cows in the farm
        combined_residuals = np.concatenate(farm_residuals)
        
        if len(combined_residuals) > 1:  # Ensure enough data to perform calculations
            # Calculate farm-level statistics
            acf_values = acf(combined_residuals, nlags=30, fft=False)
            pacf_values = pacf(combined_residuals, nlags=min(30, len(combined_residuals)//2))

            # Print the farm-level statistics
            print(f"Farm: {farm_name}")
            print(f"ACF (first 5 lags): {acf_values[:5]}")
            print(f"PACF (first 5 lags): {pacf_values[:5]}")
        else:
            print(f"Farm: {farm_name} does not have enough data for reliable calculations.")
        
        print("=" * 50)

Farm: 5b581702
ACF (first 5 lags): [1.         0.15829563 0.3676841  0.32646119 0.31282548]
PACF (first 5 lags): [1.         0.15830333 0.35146768 0.27281069 0.18781303]


  xo = x - x.mean()
  x -= x.mean()


Farm: 5c06d92d
ACF (first 5 lags): [nan nan nan nan nan]
PACF (first 5 lags): [ 1. nan nan nan nan]
Farm: a624fb9a
ACF (first 5 lags): [1.         0.25034882 0.44442352 0.40662651 0.38223703]
PACF (first 5 lags): [1.         0.25035358 0.40729127 0.30535097 0.19067094]


  xo = x - x.mean()
  x -= x.mean()


Farm: ad0a39f5
ACF (first 5 lags): [nan nan nan nan nan]
PACF (first 5 lags): [ 1. nan nan nan nan]
Farm: f454e660
ACF (first 5 lags): [1.         0.02190352 0.282593   0.27776274 0.25034053]
PACF (first 5 lags): [1.         0.02190457 0.28227572 0.29044734 0.2183942 ]


In [17]:
# Group by 'FarmName_Pseudo', 'SE_Number', and 'LactationNumber' to perform individual calculations
farm_results = []

for farm_name, farm_group in data_cleaned.groupby('FarmName_Pseudo'):
    print(f"Farm: {farm_name}")
    
    for (se_number, lactation_number), cow_group in farm_group.groupby(['SE_Number', 'LactationNumber']):
        residuals = cow_group['Residuals']
        residuals = cow_group['Residuals'].dropna()  # Drop NaN values
        
        if len(residuals) > 1:  # Ensure there are residuals to analyze
            acf_values = acf(residuals, nlags=30, fft=False)
            pacf_values = pacf(residuals, nlags=min(30, len(residuals)//2))

            # Print the statistics
            print(f"\nCow: {se_number}, Lactation Number: {lactation_number}")
            print(f"ACF (first 5 lags): {acf_values[:5]}")
            print(f"PACF (first 5 lags): {pacf_values[:5]}")
            print("-" * 50)
            
    print("=" * 50)

Farm: 5b581702

Cow: SE-27c3257a-1492, Lactation Number: 1
ACF (first 5 lags): [ 1.         -0.04794779  0.17260587  0.19176177  0.18892561]
PACF (first 5 lags): [ 1.         -0.04812279  0.17195279  0.2159505   0.20111666]
--------------------------------------------------

Cow: SE-27c3257a-1492, Lactation Number: 2
ACF (first 5 lags): [1.         0.06616381 0.19955605 0.34026079 0.17919348]
PACF (first 5 lags): [1.         0.06642533 0.19759938 0.3349812  0.1508133 ]
--------------------------------------------------

Cow: SE-4b8091ac-1472, Lactation Number: 1
ACF (first 5 lags): [1.         0.11608628 0.34493712 0.29705093 0.14983035]
PACF (first 5 lags): [1.         0.11648114 0.33831398 0.26779714 0.01779562]
--------------------------------------------------

Cow: SE-5b581702-1742, Lactation Number: 3
ACF (first 5 lags): [ 1.         -0.33950868  0.08122775  0.00108425  0.06716684]
PACF (first 5 lags): [ 1.         -0.34061457 -0.03875604  0.01905789  0.08788735]
----------------

  xo = x - x.mean()
  x -= x.mean()



Cow: SE-5c06d92d-2798, Lactation Number: 5
ACF (first 5 lags): [1.         0.50524803 0.28924453 0.25252476 0.25518363]
PACF (first 5 lags): [1.         0.50716185 0.0460845  0.12283486 0.10761791]
--------------------------------------------------

Cow: SE-5c06d92d-2801, Lactation Number: 4
ACF (first 5 lags): [1.         0.56854026 0.44105822 0.39799461 0.30348086]
PACF (first 5 lags): [ 1.          0.57058537  0.17596294  0.14236579 -0.00133713]
--------------------------------------------------

Cow: SE-5c06d92d-2801, Lactation Number: 5
ACF (first 5 lags): [1.         0.62927916 0.49794437 0.43149708 0.37503457]
PACF (first 5 lags): [1.         0.63181658 0.17107653 0.11277215 0.05793778]
--------------------------------------------------

Cow: SE-5c06d92d-2804, Lactation Number: 4
ACF (first 5 lags): [1.         0.43013219 0.32906163 0.24540129 0.26560414]
PACF (first 5 lags): [1.         0.43176767 0.17841001 0.06800885 0.13281012]
----------------------------------------------

  xo = x - x.mean()
  x -= x.mean()



Cow: SE-a624fb9a-1200, Lactation Number: 6
ACF (first 5 lags): [1.         0.38378689 0.42969691 0.408782   0.42781665]
PACF (first 5 lags): [1.         0.38486798 0.33338303 0.22858291 0.21129355]
--------------------------------------------------

Cow: SE-a624fb9a-1201, Lactation Number: 6
ACF (first 5 lags): [1.         0.54597182 0.62902957 0.58681781 0.63739074]
PACF (first 5 lags): [1.         0.54797907 0.47646387 0.27816959 0.30360324]
--------------------------------------------------

Cow: SE-a624fb9a-1231, Lactation Number: 5
ACF (first 5 lags): [1.         0.52724265 0.58566299 0.52714635 0.50979394]
PACF (first 5 lags): [1.         0.52876208 0.42955914 0.2160457  0.13534633]
--------------------------------------------------

Cow: SE-a624fb9a-1244, Lactation Number: 5
ACF (first 5 lags): [1.         0.10654156 0.42277747 0.18667321 0.33510091]
PACF (first 5 lags): [1.         0.1069115  0.41908374 0.14404163 0.18729764]
--------------------------------------------------


  xo = x - x.mean()
  x -= x.mean()



Cow: SE-ad0a39f5-2660, Lactation Number: 2
ACF (first 5 lags): [1.         0.49535546 0.44036564 0.4469044  0.39624214]
PACF (first 5 lags): [1.         0.49857205 0.26289746 0.22974171 0.10960976]
--------------------------------------------------

Cow: SE-ad0a39f5-2661, Lactation Number: 1
ACF (first 5 lags): [1.         0.56447372 0.55172089 0.54290257 0.45502847]
PACF (first 5 lags): [1.         0.56651891 0.34575885 0.24585513 0.04296629]
--------------------------------------------------

Cow: SE-ad0a39f5-2664, Lactation Number: 1
ACF (first 5 lags): [1.         0.4350309  0.45143812 0.42331084 0.38631905]
PACF (first 5 lags): [1.         0.43661861 0.32631731 0.20982807 0.12077789]
--------------------------------------------------

Cow: SE-ad0a39f5-2670, Lactation Number: 1
ACF (first 5 lags): [1.         0.57688679 0.55436279 0.53380028 0.51427064]
PACF (first 5 lags): [1.         0.5783852  0.33468145 0.21994319 0.15237406]
--------------------------------------------------


In [18]:
# Define the thresholds
mean_residual_threshold = 0.075
std_residual_threshold = 7.5
acf_threshold = 0.25
pacf_threshold = 0.25

# List to collect flagged combinations
flagged_combinations = []

for farm_name, farm_group in data_cleaned.groupby('FarmName_Pseudo'):
    for (se_number, lactation_number), cow_group in farm_group.groupby(['SE_Number', 'LactationNumber']):
        residuals = cow_group['Residuals'].dropna()
        
        if len(residuals) > 1:  # Ensure there are residuals to analyze
            acf_values = acf(residuals, nlags=30, fft=False)
            pacf_values = pacf(residuals, nlags=min(30, len(residuals)//2))

            # Check against thresholds
            if (abs(acf_values[1]) > acf_threshold or 
                abs(pacf_values[1]) > pacf_threshold):
                
                # Collect the combination if it exceeds any threshold
                flagged_combinations.append({
                    'Farm': farm_name,
                    'SE_Number': se_number,
                    'LactationNumber': lactation_number,
                    'ACF[1]': acf_values[1],
                    'PACF[1]': pacf_values[1]
                })

# Convert to a DataFrame for easier inspection
flagged_df = pd.DataFrame(flagged_combinations)
flagged_df

  xo = x - x.mean()
  x -= x.mean()
  xo = x - x.mean()
  x -= x.mean()
  xo = x - x.mean()
  x -= x.mean()


Unnamed: 0,Farm,SE_Number,LactationNumber,ACF[1],PACF[1]
0,5b581702,SE-5b581702-1742,3,-0.339509,-0.340615
1,5b581702,SE-5b581702-1816,3,-0.256527,-0.257829
2,5b581702,SE-5b581702-1820,4,0.312182,0.313365
3,5b581702,SE-5b581702-1829,3,0.403051,0.404189
4,5b581702,SE-5b581702-1855,4,-0.298455,-0.299678
...,...,...,...,...,...
739,f454e660,SE-f454e660-0579,2,-0.257063,-0.258026
740,f454e660,SE-f454e660-0616,3,-0.266043,-0.266870
741,f454e660,SE-f454e660-0743,2,-0.262702,-0.263783
742,f454e660,SE-f454e660-0798,1,-0.286468,-0.287313


In [20]:
# JOAKIM'S EDITS
# Define the Wilmink Lactation Curve function
def wilmink_lactation_curve(dim, a, b, c, d):
    dim = np.array(dim, dtype=float)
    return a + b * dim + c * np.exp(-d * dim)

# Function to directly refit the Wilmink Lactation Curve (Standard Process)
def refit_wilmink(cow_data):
    x_data = cow_data['DaysInMilk'].values
    y_data = cow_data['DailyYield'].values

    # Use initial guesses and bounds from the original fitting process
    initial_guesses = [np.mean(y_data), 0, np.mean(y_data) / 2, 0.1]
    bounds = ([-np.inf, -np.inf, -np.inf, 0], [np.inf, np.inf, np.inf, np.inf])

    popt, _ = curve_fit(wilmink_lactation_curve, x_data, y_data, p0=initial_guesses, bounds=bounds, maxfev=30000)
    
    # Calculate the expected yield with the refitted parameters
    cow_data['ExpectedYield'] = wilmink_lactation_curve(cow_data['DaysInMilk'], *popt)
    
    # Calculate new residuals
    cow_data['Residuals'] = cow_data['DailyYield'] - cow_data['ExpectedYield']
    
    return cow_data

# Function to add lagged variables for addressing autocorrelation
def add_lagged_variables(cow_data, max_lag=3):
    for lag in range(1, max_lag + 1):
        cow_data[f'lag_{lag}'] = cow_data['DailyYield'].shift(lag)
    return cow_data.dropna()

# Define the Robust Wilmink Lactation Curve function
def robust_wilmink_lactation_curve(dim, a, b, c, d, lag1, lag2, lag3):
    dim = np.array(dim, dtype=np.float64)
    days_in_milk = dim[0]
    lag_1 = dim[1]
    lag_2 = dim[2]
    lag_3 = dim[3]
    
    return a + b * days_in_milk + c * np.exp(-d * days_in_milk) + lag1 * lag_1 + lag2 * lag_2 + lag3 * lag_3

def fit_robust_wilmink(cow_data, lags=3):
    cow_data = add_lagged_variables(cow_data, max_lag=lags)

    # Extract individual columns from cow_data as separate arrays
    days_in_milk = cow_data['DaysInMilk'].values
    lag_1 = cow_data['lag_1'].values
    lag_2 = cow_data['lag_2'].values
    lag_3 = cow_data['lag_3'].values
    y_data = cow_data['DailyYield'].values

    # Ensure all arrays have the same shape
    assert len(days_in_milk) == len(lag_1) == len(lag_2) == len(lag_3) == len(y_data), "Mismatch in data lengths"

    # Prepare initial guesses and bounds
    initial_guesses = [np.mean(y_data), 0, np.mean(y_data) / 2, 0.1, 0, 0, 0]
    bounds = ([-np.inf, -np.inf, -np.inf, 0, -np.inf, -np.inf, -np.inf], 
              [np.inf, np.inf, np.inf, np.inf, np.inf, np.inf, np.inf])
    
    try:
        # Pass individual components of x_data to curve_fit
        popt, _ = curve_fit(
            lambda dim, a, b, c, d, lag1, lag2, lag3: robust_wilmink_lactation_curve(dim, a, b, c, d, lag1, lag2, lag3), 
            (days_in_milk, lag_1, lag_2, lag_3), 
            y_data, 
            p0=initial_guesses, 
            bounds=bounds, 
            maxfev=50000
        )

        cow_data.loc[:, 'ExpectedYield'] = robust_wilmink_lactation_curve(
            (days_in_milk, lag_1, lag_2, lag_3), *popt
        )
        cow_data.loc[:, 'Residuals'] = cow_data['DailyYield'] - cow_data['ExpectedYield']

    
    except RuntimeError as e:
        print(f"Curve fitting failed: {e}")
        cow_data['ExpectedYield'] = np.nan
        cow_data['Residuals'] = np.nan
    
    return cow_data



# Function to add lagged variables for addressing autocorrelation
def add_lagged_variables(cow_data, max_lag=3):
    for lag in range(1, max_lag + 1):
        cow_data[f'lag_{lag}'] = cow_data['DailyYield'].shift(lag)
    
    # Check for missing values and drop rows with NaNs in the lagged columns or DailyYield
    cow_data_cleaned = cow_data.dropna(subset=['DailyYield'] + [f'lag_{lag}' for lag in range(1, max_lag + 1)])
    
    # Ensure we're not dropping too much data, and there's still sufficient data for fitting
    if len(cow_data_cleaned) == 0:
        raise ValueError("Insufficient data after adding lagged variables. Check for missing data.")

    return cow_data_cleaned


# Apply lagged variables to all cases, all lactations
for se_number in data_cleaned['SE_Number'].unique():
    for lactation_number in data_cleaned[data_cleaned['SE_Number'] == se_number]['LactationNumber'].unique():
        
        cow_data = data_cleaned[(data_cleaned['SE_Number'] == se_number) & 
                                (data_cleaned['LactationNumber'] == lactation_number)].copy()
        
        # Apply lagged variables for all cases, regardless of autocorrelation
        cow_data = add_lagged_variables(cow_data, max_lag=3)
        cow_data_refitted = fit_robust_wilmink(cow_data, lags=3)
        
        data_cleaned.update(cow_data_refitted)

# Remove rows where ExpectedYield is NaN
data_cleaned = data_cleaned.dropna(subset=['ExpectedYield']).reset_index(drop=True)

# Normalize yields
data_cleaned['NormalizedDailyYield'] = data_cleaned['DailyYield'] / data_cleaned['ExpectedYield']
data_cleaned['NormalizedDailyYieldChange'] = data_cleaned['DailyYieldChange'] / data_cleaned['ExpectedYield']


data_cleaned

Unnamed: 0,SE_Number,FarmName_Pseudo,Date,DailyYield,PreviousDailyYield,DailyYieldChange,HW,MeanTemperature,MeanTHI_adj,DaysInMilk,...,Age,BreedName,LactationNumber,ExpectedYield,NormalizedDailyYield,NormalizedDailyYieldChange,HeatLoad,CumulativeHeatLoad,HeatStress,Residuals
0,SE-27c3257a-1492,5b581702,2022-11-25,39.60,,,0,1.500000,37.235887,32,...,773,DairyCross,1,32.969051,1.201126,,-23.764113,0.000000,0,6.630949
1,SE-27c3257a-1492,5b581702,2022-11-26,22.19,39.60,-17.41,0,1.450000,36.869260,33,...,774,DairyCross,1,32.956278,0.673316,-0.528276,-24.130740,0.000000,0,-10.766278
2,SE-27c3257a-1492,5b581702,2022-11-27,29.40,22.19,7.21,0,-0.150000,30.678900,34,...,775,DairyCross,1,32.943505,0.892437,0.21886,-30.321100,0.000000,0,-3.543505
3,SE-27c3257a-1492,5b581702,2022-11-28,27.03,29.40,-2.37,0,2.700000,37.007500,35,...,776,DairyCross,1,32.930733,0.820814,-0.071969,-23.992500,0.000000,0,-5.900733
4,SE-27c3257a-1492,5b581702,2022-11-29,26.88,27.03,-0.15,0,2.850000,38.536020,36,...,777,DairyCross,1,32.91796,0.816576,-0.004557,-22.463980,0.000000,0,-6.03796
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
291958,SE-f454e660-0829,f454e660,2024-08-14,31.69,30.01,1.68,0,17.325000,64.076020,284,...,939,SLB,1,27.38798,1.157077,0.061341,3.076020,66.740018,1,4.30202
291959,SE-f454e660-0829,f454e660,2024-08-15,26.34,31.69,-5.35,0,21.066667,65.716753,285,...,940,SLB,1,26.817713,0.982187,-0.199495,4.716753,71.456772,1,-0.477713
291960,SE-f454e660-0829,f454e660,2024-08-16,24.10,26.34,-2.24,0,19.966667,64.264293,286,...,941,SLB,1,28.94124,0.832722,-0.077398,3.264293,74.721065,1,-4.84124
291961,SE-f454e660-0829,f454e660,2024-08-17,25.73,24.10,1.63,0,16.333333,60.634800,287,...,942,SLB,1,29.389107,0.875494,0.055463,-0.365200,73.990665,1,-3.659107


In [19]:
"""OLD CODE
# Define the Wilmink Lactation Curve function
def wilmink_lactation_curve(dim, a, b, c, d):
    dim = np.array(dim, dtype=float)
    return a + b * dim + c * np.exp(-d * dim)

# Function to directly refit the Wilmink Lactation Curve (Standard Process)
def refit_wilmink(cow_data):
    x_data = cow_data['DaysInMilk'].values
    y_data = cow_data['DailyYield'].values

    # Use initial guesses and bounds from the original fitting process
    initial_guesses = [np.mean(y_data), 0, np.mean(y_data) / 2, 0.1]
    bounds = ([-np.inf, -np.inf, -np.inf, 0], [np.inf, np.inf, np.inf, np.inf])

    popt, _ = curve_fit(wilmink_lactation_curve, x_data, y_data, p0=initial_guesses, bounds=bounds, maxfev=30000)
    
    # Calculate the expected yield with the refitted parameters
    cow_data['ExpectedYield'] = wilmink_lactation_curve(cow_data['DaysInMilk'], *popt)
    
    # Calculate new residuals
    cow_data['Residuals'] = cow_data['DailyYield'] - cow_data['ExpectedYield']
    
    return cow_data

# Function to add lagged variables for addressing autocorrelation
def add_lagged_variables(cow_data, max_lag=3):
    for lag in range(1, max_lag + 1):
        cow_data[f'lag_{lag}'] = cow_data['DailyYield'].shift(lag)
    return cow_data.dropna()

# Define the Robust Wilmink Lactation Curve function
def robust_wilmink_lactation_curve(dim, a, b, c, d, lag1, lag2, lag3):
    dim = np.array(dim, dtype=np.float64)
    days_in_milk = dim[0]
    lag_1 = dim[1]
    lag_2 = dim[2]
    lag_3 = dim[3]
    
    return a + b * days_in_milk + c * np.exp(-d * days_in_milk) + lag1 * lag_1 + lag2 * lag_2 + lag3 * lag_3

# Function to fit the robust Wilmink model
def fit_robust_wilmink(cow_data, lags=3):
    cow_data = add_lagged_variables(cow_data, max_lag=lags)
    
    x_data = cow_data[['DaysInMilk', 'lag_1', 'lag_2', 'lag_3']].values.T
    y_data = cow_data['DailyYield'].values
    
    initial_guesses = [np.mean(y_data), 0, np.mean(y_data) / 2, 0.1, 0, 0, 0]
    bounds = ([-np.inf, -np.inf, -np.inf, 0, -np.inf, -np.inf, -np.inf], 
              [np.inf, np.inf, np.inf, np.inf, np.inf, np.inf, np.inf])
    
    try:
        popt, _ = curve_fit(robust_wilmink_lactation_curve, x_data, y_data, p0=initial_guesses, bounds=bounds, maxfev=50000)
        cow_data.loc[:, 'ExpectedYield'] = robust_wilmink_lactation_curve(x_data, *popt)
        cow_data.loc[:, 'Residuals'] = cow_data['DailyYield'] - cow_data['ExpectedYield']
    except RuntimeError as e:
        print(f"Curve fitting failed: {e}")
        cow_data.loc[:, 'ExpectedYield'] = np.nan
        cow_data.loc[:, 'Residuals'] = np.nan
    
    return cow_data

# Function to add lagged variables for addressing autocorrelation
def add_lagged_variables(cow_data, max_lag=3):
    for lag in range(1, max_lag + 1):
        cow_data[f'lag_{lag}'] = cow_data['DailyYield'].shift(lag)
    return cow_data.dropna()

# Example usage: Applying the robust model to flagged cases
for index, row in flagged_df.iterrows():
    se_number = row['SE_Number']
    lactation_number = row['LactationNumber']
    
    cow_data = data_cleaned[(data_cleaned['SE_Number'] == se_number) & 
                            (data_cleaned['LactationNumber'] == lactation_number)].copy()
    
    if abs(row['ACF[1]']) > 0.2:  # Significant autocorrelation
        cow_data = add_lagged_variables(cow_data, max_lag=3)
        cow_data_refitted = fit_robust_wilmink(cow_data, lags=3)
        data_cleaned.update(cow_data_refitted)
    else:
        cow_data_refitted = refit_wilmink(cow_data)
        data_cleaned.update(cow_data_refitted)

# Erase all rows where ExpectedYield is NaN
data_cleaned = data_cleaned.dropna(subset=['ExpectedYield']).reset_index(drop=True)

data_cleaned['NormalizedDailyYield'] = data_cleaned['DailyYield'] / data_cleaned['ExpectedYield']
data_cleaned['NormalizedDailyYieldChange'] = data_cleaned['DailyYieldChange'] / data_cleaned['ExpectedYield']

data_cleaned
"""

  data_cleaned.update(cow_data_refitted)
  popt, _ = curve_fit(robust_wilmink_lactation_curve, x_data, y_data, p0=initial_guesses, bounds=bounds, maxfev=50000)


Curve fitting failed: Optimal parameters not found: The maximum number of function evaluations is exceeded.


  popt, _ = curve_fit(robust_wilmink_lactation_curve, x_data, y_data, p0=initial_guesses, bounds=bounds, maxfev=50000)
  popt, _ = curve_fit(robust_wilmink_lactation_curve, x_data, y_data, p0=initial_guesses, bounds=bounds, maxfev=50000)
  popt, _ = curve_fit(robust_wilmink_lactation_curve, x_data, y_data, p0=initial_guesses, bounds=bounds, maxfev=50000)
  popt, _ = curve_fit(robust_wilmink_lactation_curve, x_data, y_data, p0=initial_guesses, bounds=bounds, maxfev=50000)
  popt, _ = curve_fit(robust_wilmink_lactation_curve, x_data, y_data, p0=initial_guesses, bounds=bounds, maxfev=50000)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


ValueError: `ydata` must not be empty!

In [21]:
# Define the thresholds
mean_residual_threshold = 0.075
std_residual_threshold = 7.5
acf_threshold = 0.25
pacf_threshold = 0.25

# List to collect flagged combinations
flagged_combinations = []

for farm_name, farm_group in data_cleaned.groupby('FarmName_Pseudo'):
    for (se_number, lactation_number), cow_group in farm_group.groupby(['SE_Number', 'LactationNumber']):
        residuals = cow_group['Residuals'].dropna()
        
        if len(residuals) > 1:  # Ensure there are residuals to analyze
            acf_values = acf(residuals, nlags=30, fft=False)
            pacf_values = pacf(residuals, nlags=min(30, len(residuals)//2))

            # Check against thresholds
            if (abs(acf_values[1]) > acf_threshold or 
                abs(pacf_values[1]) > pacf_threshold):
                
                # Collect the combination if it exceeds any threshold
                flagged_combinations.append({
                    'Farm': farm_name,
                    'SE_Number': se_number,
                    'LactationNumber': lactation_number,
                    'ACF[1]': acf_values[1],
                    'PACF[1]': pacf_values[1]
                })

# Convert to a DataFrame for easier inspection
flagged_df = pd.DataFrame(flagged_combinations)
flagged_df

  xo = x - x.mean()
  x -= x.mean()
  xo = x - x.mean()
  x -= x.mean()
  xo = x - x.mean()
  x -= x.mean()


Unnamed: 0,Farm,SE_Number,LactationNumber,ACF[1],PACF[1]
0,5b581702,SE-5b581702-1802,4,0.261488,0.262864
1,5b581702,SE-5b581702-1856,3,0.298499,0.299775
2,5b581702,SE-5b581702-1902,3,0.341334,0.342545
3,5b581702,SE-5b581702-2104,1,0.343146,0.344664
4,5b581702,SE-5b581702-2151,1,0.293114,0.294633
...,...,...,...,...,...
394,ad0a39f5,SE-ad0a39f5-2773,1,0.270531,0.271933
395,ad0a39f5,SE-ad0a39f5-2778,1,0.320956,0.322937
396,ad0a39f5,SE-ad0a39f5-2785,1,0.309062,0.311136
397,ad0a39f5,SE-ad0a39f5-2801,1,0.343519,0.345588


In [22]:
def remove_outliers(data, threshold=3.5):
    # Calculate z-scores of residuals
    data = data.copy()  # Create a copy to avoid the SettingWithCopyWarning
    data['z_score'] = (data['Residuals'] - data['Residuals'].mean()) / data['Residuals'].std()
    
    # Identify the number of outliers
    num_outliers = (data['z_score'].abs() >= threshold).sum()
    print(f"Number of outliers detected: {num_outliers}")
    
    # Remove rows where the z-score of the residual is greater than the threshold
    cleaned_data = data.loc[(data['z_score'].abs() < threshold)].drop(columns=['z_score'])
    
    # Print the number of rows before and after
    print(f"Number of rows before outlier removal: {len(data)}")
    print(f"Number of rows after outlier removal: {len(cleaned_data)}")
    
    return cleaned_data

# Apply to flagged cases
for index, row in flagged_df.iterrows():
    se_number = row['SE_Number']
    lactation_number = row['LactationNumber']
    
    # Select the cow data for the specific SE_Number and LactationNumber
    cow_data = data_cleaned.loc[(data_cleaned['SE_Number'] == se_number) & 
                                (data_cleaned['LactationNumber'] == lactation_number)]
    
    # Remove outliers
    cow_data_trimmed = remove_outliers(cow_data, threshold=3.5)
    
    # Recalculate the residuals and update the dataset
    cow_data_trimmed['Residuals'] = cow_data_trimmed['DailyYield'] - cow_data_trimmed['ExpectedYield']
    
    # Remove the old data for this cow from data_cleaned
    data_cleaned = data_cleaned.loc[~((data_cleaned['SE_Number'] == se_number) & 
                                      (data_cleaned['LactationNumber'] == lactation_number))]
    
    # Append the cleaned data back to data_cleaned
    data_cleaned = pd.concat([data_cleaned, cow_data_trimmed], ignore_index=True)

Number of outliers detected: 2
Number of rows before outlier removal: 191
Number of rows after outlier removal: 189
Number of outliers detected: 4
Number of rows before outlier removal: 235
Number of rows after outlier removal: 231
Number of outliers detected: 5
Number of rows before outlier removal: 283
Number of rows after outlier removal: 278
Number of outliers detected: 6
Number of rows before outlier removal: 227
Number of rows after outlier removal: 221
Number of outliers detected: 3
Number of rows before outlier removal: 194
Number of rows after outlier removal: 191
Number of outliers detected: 3
Number of rows before outlier removal: 256
Number of rows after outlier removal: 253
Number of outliers detected: 4
Number of rows before outlier removal: 273
Number of rows after outlier removal: 269
Number of outliers detected: 2
Number of rows before outlier removal: 177
Number of rows after outlier removal: 175
Number of outliers detected: 7
Number of rows before outlier removal: 36

In [23]:
data_cleaned

Unnamed: 0,SE_Number,FarmName_Pseudo,Date,DailyYield,PreviousDailyYield,DailyYieldChange,HW,MeanTemperature,MeanTHI_adj,DaysInMilk,...,Age,BreedName,LactationNumber,ExpectedYield,NormalizedDailyYield,NormalizedDailyYieldChange,HeatLoad,CumulativeHeatLoad,HeatStress,Residuals
0,SE-27c3257a-1492,5b581702,2022-11-25,39.60,,,0,1.500000,37.235887,32,...,773,DairyCross,1,32.969051,1.201126,,-23.764113,0.0,0,6.630949
1,SE-27c3257a-1492,5b581702,2022-11-26,22.19,39.60,-17.41,0,1.450000,36.869260,33,...,774,DairyCross,1,32.956278,0.673316,-0.528276,-24.130740,0.0,0,-10.766278
2,SE-27c3257a-1492,5b581702,2022-11-27,29.40,22.19,7.21,0,-0.150000,30.678900,34,...,775,DairyCross,1,32.943505,0.892437,0.21886,-30.321100,0.0,0,-3.543505
3,SE-27c3257a-1492,5b581702,2022-11-28,27.03,29.40,-2.37,0,2.700000,37.007500,35,...,776,DairyCross,1,32.930733,0.820814,-0.071969,-23.992500,0.0,0,-5.900733
4,SE-27c3257a-1492,5b581702,2022-11-29,26.88,27.03,-0.15,0,2.850000,38.536020,36,...,777,DairyCross,1,32.91796,0.816576,-0.004557,-22.463980,0.0,0,-6.03796
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
289903,SE-ad0a39f5-2805,ad0a39f5,2023-04-12,47.59,54.39,-6.80,0,6.733333,47.223720,161,...,874,SLB,1,45.743231,1.040373,-0.148656,-13.776280,0.0,0,1.846769
289904,SE-ad0a39f5-2805,ad0a39f5,2023-04-13,47.12,47.59,-0.47,0,7.800000,45.869080,162,...,875,SLB,1,47.556218,0.990827,-0.009883,-15.130920,0.0,0,-0.436218
289905,SE-ad0a39f5-2805,ad0a39f5,2023-04-14,48.68,47.12,1.56,0,7.766667,46.105007,163,...,876,SLB,1,46.085059,1.056308,0.03385,-14.894993,0.0,0,2.594941
289906,SE-ad0a39f5-2805,ad0a39f5,2023-04-15,47.95,48.68,-0.73,0,9.300000,47.896260,164,...,877,SLB,1,46.482312,1.031575,-0.015705,-13.103740,0.0,0,1.467688


In [24]:
# Define the thresholds
mean_residual_threshold = 0.075
std_residual_threshold = 7.5
acf_threshold = 0.25
pacf_threshold = 0.25

# List to collect flagged combinations
flagged_combinations = []

for farm_name, farm_group in data_cleaned.groupby('FarmName_Pseudo'):
    for (se_number, lactation_number), cow_group in farm_group.groupby(['SE_Number', 'LactationNumber']):
        residuals = cow_group['Residuals'].dropna()
        
        if len(residuals) > 1:  # Ensure there are residuals to analyze
            acf_values = acf(residuals, nlags=30, fft=False)
            pacf_values = pacf(residuals, nlags=min(30, len(residuals)//2))

            # Check against thresholds
            if (abs(acf_values[1]) > acf_threshold or 
                abs(pacf_values[1]) > pacf_threshold):
                
                # Collect the combination if it exceeds any threshold
                flagged_combinations.append({
                    'Farm': farm_name,
                    'SE_Number': se_number,
                    'LactationNumber': lactation_number,
                    'ACF[1]': acf_values[1],
                    'PACF[1]': pacf_values[1]
                })

# Convert to a DataFrame for easier inspection
flagged_df = pd.DataFrame(flagged_combinations)
flagged_df

  xo = x - x.mean()
  x -= x.mean()
  xo = x - x.mean()
  x -= x.mean()
  xo = x - x.mean()
  x -= x.mean()


Unnamed: 0,Farm,SE_Number,LactationNumber,ACF[1],PACF[1]
0,5c06d92d,SE-5c06d92d-2797,5,0.389987,0.392587
1,5c06d92d,SE-5c06d92d-3196,2,0.261058,0.262128
2,5c06d92d,SE-5c06d92d-3281,2,0.338292,0.340293
3,ad0a39f5,SE-a756bc39-1208,2,0.28223,0.283983
4,ad0a39f5,SE-a756bc39-1255,1,0.299475,0.301499
5,ad0a39f5,SE-ad0a39f5-2469,3,0.326158,0.328134


In [25]:
# Reorder columns
new_order = [
    "Date", "FarmName_Pseudo", "SE_Number", "Age", "BreedName", "LactationNumber", "DaysInMilk",'YearSeason', "DailyYield", "PreviousDailyYield", 
    "DailyYieldChange", "ExpectedYield", "NormalizedDailyYield", 
    "NormalizedDailyYieldChange", "Residuals", "HeatStress", "Temp15Threshold", "HW", 
    "cum_HW", "MeanTemperature", "MeanTHI_adj", "HeatLoad", "CumulativeHeatLoad"
]
data_cleaned = data_cleaned[new_order]
data_cleaned

Unnamed: 0,Date,FarmName_Pseudo,SE_Number,Age,BreedName,LactationNumber,DaysInMilk,YearSeason,DailyYield,PreviousDailyYield,...,NormalizedDailyYieldChange,Residuals,HeatStress,Temp15Threshold,HW,cum_HW,MeanTemperature,MeanTHI_adj,HeatLoad,CumulativeHeatLoad
0,2022-11-25,5b581702,SE-27c3257a-1492,773,DairyCross,1,32,202204,39.60,,...,,6.630949,0,0,0,0,1.500000,37.235887,-23.764113,0.0
1,2022-11-26,5b581702,SE-27c3257a-1492,774,DairyCross,1,33,202204,22.19,39.60,...,-0.528276,-10.766278,0,0,0,0,1.450000,36.869260,-24.130740,0.0
2,2022-11-27,5b581702,SE-27c3257a-1492,775,DairyCross,1,34,202204,29.40,22.19,...,0.21886,-3.543505,0,0,0,0,-0.150000,30.678900,-30.321100,0.0
3,2022-11-28,5b581702,SE-27c3257a-1492,776,DairyCross,1,35,202204,27.03,29.40,...,-0.071969,-5.900733,0,0,0,0,2.700000,37.007500,-23.992500,0.0
4,2022-11-29,5b581702,SE-27c3257a-1492,777,DairyCross,1,36,202204,26.88,27.03,...,-0.004557,-6.03796,0,0,0,0,2.850000,38.536020,-22.463980,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
289903,2023-04-12,ad0a39f5,SE-ad0a39f5-2805,874,SLB,1,161,202302,47.59,54.39,...,-0.148656,1.846769,0,0,0,0,6.733333,47.223720,-13.776280,0.0
289904,2023-04-13,ad0a39f5,SE-ad0a39f5-2805,875,SLB,1,162,202302,47.12,47.59,...,-0.009883,-0.436218,0,0,0,0,7.800000,45.869080,-15.130920,0.0
289905,2023-04-14,ad0a39f5,SE-ad0a39f5-2805,876,SLB,1,163,202302,48.68,47.12,...,0.03385,2.594941,0,0,0,0,7.766667,46.105007,-14.894993,0.0
289906,2023-04-15,ad0a39f5,SE-ad0a39f5-2805,877,SLB,1,164,202302,47.95,48.68,...,-0.015705,1.467688,0,0,0,0,9.300000,47.896260,-13.103740,0.0


In [26]:
# Check if NormalizedDailyYield is centered around 1 for each unique farm
print("Mean of NormalizedDailyYield:", data_cleaned.groupby('FarmName_Pseudo')['NormalizedDailyYield'].mean())
print("Standard Deviation of NormalizedDailyYield:", data_cleaned.groupby('FarmName_Pseudo')['NormalizedDailyYield'].std())

Mean of NormalizedDailyYield: FarmName_Pseudo
5b581702    0.991666
5c06d92d    0.995968
a624fb9a    0.992171
ad0a39f5    0.997127
f454e660    0.992315
Name: NormalizedDailyYield, dtype: Float64
Standard Deviation of NormalizedDailyYield: FarmName_Pseudo
5b581702    0.183961
5c06d92d    0.118919
a624fb9a    0.172482
ad0a39f5    0.087399
f454e660    0.214883
Name: NormalizedDailyYield, dtype: Float64


In [27]:
# Count the number of HeatStress occurrences in each farm
heat_stress_counts = data_cleaned.groupby('FarmName_Pseudo')['HeatStress'].sum()
heat_stress_counts

FarmName_Pseudo
5b581702     5663
5c06d92d    38488
a624fb9a    14721
ad0a39f5    11076
f454e660     5407
Name: HeatStress, dtype: int64

In [28]:
# Save the reordered DataFrame to a CSV file
data_cleaned.to_csv('../Data/MergedData/HeatApproachCleanedYieldDataTestQuantile.csv', index=False)

# Descriptive statistics

In [29]:
# Make Parity 1-3
df_lact = data_cleaned.copy()
df_lact["Parity"] = df_lact["LactationNumber"]
df_lact.loc[(df_lact['LactationNumber'] >= 3) & (df_lact['LactationNumber'] <= 7), 'Parity'] = 3

In [30]:
# By parity
for_my_rec5 = df_lact.drop_duplicates(subset=["SE_Number", "LactationNumber"])
print(f"No. of parities in milking file: {for_my_rec5.shape}")  # 1,207

count_my_rec = for_my_rec5.groupby(["Parity", "BreedName"])["SE_Number"].count().reset_index()
print(f"No. of parities from SRB, SH, SJB and dairy crosses: \n", count_my_rec.to_string(index=False))

# By cows
for_my_rec4 = df_lact.drop_duplicates(subset=["SE_Number"])
print(f"No. of cows in milking file: {for_my_rec4.shape}")  # 921

for_my_rec5 = for_my_rec4.drop_duplicates(subset=["SE_Number"])
count_my_rec = for_my_rec5.groupby(["BreedName"])["SE_Number"].count().reset_index()
print(f"No. of cows from SRB, SH, SJB and dairy crosses: \n", count_my_rec.to_string(index=False))

# Herd info
# df_lact = pd.read_csv("../Data/MY_weather_filtered.csv", low_memory=False)
df_lact = df_lact.drop_duplicates(subset=["FarmName_Pseudo"])
col_keep = ["FarmName_Pseudo"]
df_lact = df_lact[col_keep]
print(df_lact.shape)
print(f"Herds in filtered data: \n", df_lact.to_string(index=False))

No. of parities in milking file: (1051, 24)
No. of parities from SRB, SH, SJB and dairy crosses: 
  Parity  BreedName  SE_Number
      1 DairyCross         58
      1       NRDC         92
      1        SJB         11
      1        SLB        224
      2 DairyCross         50
      2       NRDC         65
      2        SJB          7
      2        SLB        152
      3 DairyCross         95
      3       NRDC         93
      3        SJB         14
      3        SLB        190
No. of cows in milking file: (809, 24)
No. of cows from SRB, SH, SJB and dairy crosses: 
  BreedName  SE_Number
DairyCross        142
      NRDC        204
       SJB         21
       SLB        442
(5, 1)
Herds in filtered data: 
 FarmName_Pseudo
       5b581702
       a624fb9a
       5c06d92d
       ad0a39f5
       f454e660
