In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from scipy.optimize import curve_fit, OptimizeWarning
from tqdm import tqdm
import warnings
from scipy.stats import zscore
from statsmodels.tsa.stattools import acf, pacf
from scipy.optimize import minimize
from vqr import VectorQuantileRegressor
from vqr.solvers.regularized_lse import RegularizedDualVQRSolver
import statsmodels.api as sm


sns.set_theme()
sns.set_context("notebook")
%load_ext autoreload
%autoreload 2

In [30]:
dtype_dict = {
    'FarmName_Pseudo': 'str',
    'SE_Number': 'str',
    'AnimalNumber': 'Int64',          
    'StartDate': 'str',
    'StartTime': 'str',
    'DateTime': 'str',
    'LactationNumber': 'Int64',       
    'DaysInMilk': 'Int64', 
    'YearSeason': 'str',           
    'TotalYield': 'float',
    'DateTime': 'str',
    'BreedName': 'str',
    'Age': 'Int64',
    'Mother': 'str',
    'Father': 'str',
    'CullDecisionDate': 'str',
    'Temperature': 'float',
    'RelativeHumidity': 'float',      
    'THI_adj': 'float',
    'HW': 'Int64',                    
    'cum_HW': 'Int64',                
    'Temp15Threshold': 'Int64'        
}


# Load the CSV with specified dtypes
data = pd.read_csv('../Data/MergedData/CleanedYieldData.csv', dtype=dtype_dict)

# Convert date and time columns back to datetime and time objects
data['DateTime'] = pd.to_datetime(data['DateTime'], errors='coerce')
data['StartTime'] = pd.to_datetime(data['StartTime'], format='%H:%M:%S', errors='coerce').dt.time
data['StartDate'] = pd.to_datetime(data['StartDate'], errors='coerce')
data['CullDecisionDate'] = pd.to_datetime(data['CullDecisionDate'], errors='coerce')
data['DateTime'] = pd.to_datetime(data['DateTime'], errors='coerce')
data.head()

Unnamed: 0,FarmName_Pseudo,SE_Number,AnimalNumber,StartDate,StartTime,DateTime,LactationNumber,DaysInMilk,YearSeason,TotalYield,...,Age,Mother,Father,CullDecisionDate,Temperature2,RelativeHumidity,THI_adj2,HW,cum_HW,Temp15Threshold
0,5b581702,SE-27c3257a-1492,1492,2022-11-25,07:13:00,2022-11-25 07:13:00,1,32,202204,18.57,...,773,SE-27c3257a-1355,SE-458cc45c-7488,NaT,1.2,0.968,35.706,0,0,0
1,5b581702,SE-27c3257a-1492,1492,2022-11-25,14:13:00,2022-11-25 14:13:00,1,32,202204,9.73,...,773,SE-27c3257a-1355,SE-458cc45c-7488,NaT,1.7,0.971,38.17406,0,0,0
2,5b581702,SE-27c3257a-1492,1492,2022-11-25,23:02:00,2022-11-25 23:02:00,1,32,202204,11.3,...,773,SE-27c3257a-1355,SE-458cc45c-7488,NaT,1.6,0.982,37.8276,0,0,0
3,5b581702,SE-27c3257a-1492,1492,2022-11-26,08:01:00,2022-11-26 08:01:00,1,33,202204,12.78,...,774,SE-27c3257a-1355,SE-458cc45c-7488,NaT,1.6,0.979,36.56612,0,0,0
4,5b581702,SE-27c3257a-1492,1492,2022-11-26,14:53:00,2022-11-26 14:53:00,1,33,202204,9.41,...,774,SE-27c3257a-1355,SE-458cc45c-7488,NaT,1.3,0.976,37.1724,0,0,0


In [31]:
# Calculate the DailyYield for each cow each day
data['DailyYield'] = data.groupby(['SE_Number', 'StartDate'])['TotalYield'].transform('sum')

# Sort the data by AnimalNumber and StartDate
data.sort_values(['AnimalNumber', 'StartDate'], inplace=True)

# Calculate the previous day's total yield for each cow
data['PreviousDailyYield'] = data.groupby('AnimalNumber')['DailyYield'].shift(1)

# Calculate the daily yield change for each cow
data['DailyYieldChange'] = data['DailyYield'] - data['PreviousDailyYield']

# Group and aggregate data ======================================================================>>> if running with filtered data: change Temperature to Temperature2 and THI to THI_adj2
data = data.groupby(['SE_Number', 'FarmName_Pseudo', 'StartDate']).agg({
    'DailyYield': 'first',
    'PreviousDailyYield': 'first',
    'DailyYieldChange': 'first',
    'HW': 'max',
    'Temperature2': 'mean',
    'THI_adj2': 'mean',
    'DaysInMilk': 'first',
    'YearSeason': 'first',
    'cum_HW': 'max',
    'Temp15Threshold': 'max',
    'Age': 'first',
    'BreedName': 'first',
    'LactationNumber': 'first'
}).reset_index()

# Renaming and formatting ======================================================================>>> if running with filtered data: change Temperature to Temperature2 and THI to THI_adj2
data.rename(columns={
    'Temperature2': 'MeanTemperature',
    'THI_adj2': 'MeanTHI_adj',
    'StartDate': 'Date'
}, inplace=True)
data['Date'] = pd.to_datetime(data['Date'])

# Display the first few rows of the transformed data
data.head()

Unnamed: 0,SE_Number,FarmName_Pseudo,Date,DailyYield,PreviousDailyYield,DailyYieldChange,HW,MeanTemperature,MeanTHI_adj,DaysInMilk,YearSeason,cum_HW,Temp15Threshold,Age,BreedName,LactationNumber
0,SE-064c0cec-1189,a624fb9a,2022-05-29,18.96,18.96,0.0,0,9.766667,52.909813,5,202202,0,0,3243,SLB,8
1,SE-064c0cec-1189,a624fb9a,2022-05-30,22.64,18.96,3.68,0,9.833333,52.409547,6,202202,0,1,3244,SLB,8
2,SE-064c0cec-1189,a624fb9a,2022-05-31,26.49,22.64,3.85,0,11.2,52.27307,7,202202,0,0,3245,SLB,8
3,SE-064c0cec-1189,a624fb9a,2022-06-01,33.61,26.49,7.12,0,12.433333,55.124493,8,202203,0,1,3246,SLB,8
4,SE-064c0cec-1189,a624fb9a,2022-06-02,35.02,33.61,1.41,0,11.766667,54.947733,9,202203,0,1,3247,SLB,8


In [32]:
# Check if DailyYield is centered around approx the same for each farm
print("Mean of DailyYield:", data.groupby('FarmName_Pseudo')['DailyYield'].mean())
print("Standard Deviation of DailyYield:", data.groupby('FarmName_Pseudo')['DailyYield'].std())

Mean of DailyYield: FarmName_Pseudo
5b581702    36.400683
5c06d92d    38.001519
5f7f33d6    38.520080
752efd72    31.068550
a624fb9a    34.178373
ab18b151    42.075966
ad0a39f5    39.776830
afdd9a78    28.351850
f454e660    31.420576
Name: DailyYield, dtype: float64
Standard Deviation of DailyYield: FarmName_Pseudo
5b581702    11.830505
5c06d92d     9.364701
5f7f33d6    12.236713
752efd72     7.594911
a624fb9a    10.819056
ab18b151    10.654841
ad0a39f5     9.448257
afdd9a78     8.264598
f454e660    11.598719
Name: DailyYield, dtype: float64


In [33]:
# Define the Wilmink Lactation Curve function
def wilmink_lactation_curve(dim, a, b, c, d):
    return a + b * dim + c * np.exp(-d * dim)

# Function to remove outliers
def remove_outliers(group, threshold=3.5):
    mean = np.mean(group['DailyYield'])
    std_dev = np.std(group['DailyYield'])
    return group[(group['DailyYield'] > mean - threshold * std_dev) & (group['DailyYield'] < mean + threshold * std_dev)]

# Function to smooth the data using .loc to avoid SettingWithCopyWarning
def smooth_data(group, window=5):
    group.loc[:, 'DailyYield'] = group['DailyYield'].rolling(window, min_periods=1).mean()
    return group

# Function to fit curve_fit before applying Quantile Regression
def fit_with_curve_fit_before_quantreg(dataset, quantile=0.7, max_iter=100000):
    params_dict = {}
    valid_indices = []

    for (animal_number, lactation_number), group in tqdm(dataset.groupby(['SE_Number', 'LactationNumber']), unit=" Segments"):
        try:
            group = remove_outliers(group)
            group = smooth_data(group)
            x_data = group['DaysInMilk'].values.astype(float)
            y_data = group['DailyYield'].values.astype(float)

            # Ensure there are enough data points to fit the curve
            if (len(x_data) < 150) or (len(y_data) < 150):
                print(f"Insufficient data points for cow {animal_number}, lactation {lactation_number}, skipping.")
                continue

            valid_indices.extend(group.index)

            # Fit the model using curve_fit
            try:
                # Initial parameter guesses
                initial_guesses = [np.mean(y_data), 0, np.mean(y_data) / 2, 0.1]
                # Bounds on the parameters to prevent overflow
                bounds = ([-np.inf, -np.inf, -np.inf, 0], [np.inf, np.inf, np.inf, np.inf])

                with warnings.catch_warnings():
                    warnings.filterwarnings('error', category=OptimizeWarning)
                    popt, _ = curve_fit(
                        wilmink_lactation_curve, x_data, y_data,
                        p0=initial_guesses, bounds=bounds, maxfev=30000
                    )

                # Store the parameters in the dictionary
                params_dict[(animal_number, lactation_number)] = {'a': popt[0], 'b': popt[1], 'c': popt[2], 'd': popt[3]}

            except Exception as e:
                print(f"Curve fitting failed for cow {animal_number}, lactation {lactation_number}: {e}")
                continue

            # Now use the parameters from curve_fit for quantile regression
            X = np.column_stack([np.ones_like(x_data), x_data, np.exp(-x_data), -x_data * np.exp(-x_data)])
            quantreg_model = sm.QuantReg(y_data, X)
            quantreg_fit = quantreg_model.fit(q=quantile, max_iter=max_iter, start_params=popt)

            # Update parameters after quantile regression
            a, b, c, d = quantreg_fit.params
            dataset.loc[group.index, 'ExpectedYield'] = wilmink_lactation_curve(group['DaysInMilk'], a, b, c, d)
            params_dict[(animal_number, lactation_number)] = {'a': a, 'b': b, 'c': c, 'd': d}

        except Exception as e:
            print(f"Error processing cow {animal_number}, lactation {lactation_number}: {e}")

    return dataset, params_dict

# Apply the curve fitting before quantile regression
data, params_dict = fit_with_curve_fit_before_quantreg(data, quantile=0.7, max_iter=100000)

# Remove rows where ExpectedYield is NaN
data = data.dropna(subset=['ExpectedYield'])

# Calculate NormalizedDailyYield, PreviousDailyYield, DailyYieldChange, and NormalizedDailyYieldChange
data.loc[:, 'NormalizedDailyYield'] = data['DailyYield'] / data['ExpectedYield']
data.loc[:, 'PreviousDailyYield'] = data.groupby('SE_Number')['DailyYield'].shift(1)
data.loc[:, 'DailyYieldChange'] = data['DailyYield'] - data['PreviousDailyYield']
data.loc[:, 'NormalizedDailyYieldChange'] = data['DailyYieldChange'] / data['ExpectedYield']
data

  1%|          | 25/2176 [00:17<14:56,  2.40 Segments/s]  

Insufficient data points for cow SE-5b581702-1820, lactation 5, skipping.
Insufficient data points for cow SE-5b581702-1851, lactation 4, skipping.
Insufficient data points for cow SE-5b581702-1860, lactation 4, skipping.


  2%|▏         | 40/2176 [00:17<05:04,  7.01 Segments/s]

Insufficient data points for cow SE-5b581702-1912, lactation 3, skipping.


  2%|▏         | 50/2176 [00:52<52:52,  1.49s/ Segments]  

Insufficient data points for cow SE-5b581702-2002, lactation 3, skipping.


  3%|▎         | 69/2176 [00:54<13:06,  2.68 Segments/s]

Insufficient data points for cow SE-5b581702-2127, lactation 2, skipping.


  4%|▍         | 83/2176 [00:55<05:13,  6.68 Segments/s]

Insufficient data points for cow SE-5b581702-2206, lactation 1, skipping.
Insufficient data points for cow SE-5b581702-2207, lactation 1, skipping.
Insufficient data points for cow SE-5b581702-2209, lactation 1, skipping.


  4%|▍         | 96/2176 [01:05<15:50,  2.19 Segments/s]

Insufficient data points for cow SE-5c06d92d-2268, lactation 8, skipping.
Insufficient data points for cow SE-5c06d92d-2408, lactation 7, skipping.


  7%|▋         | 147/2176 [01:09<02:13, 15.24 Segments/s]

Insufficient data points for cow SE-5c06d92d-2744, lactation 4, skipping.
Insufficient data points for cow SE-5c06d92d-2762, lactation 5, skipping.


  7%|▋         | 161/2176 [01:09<01:21, 24.66 Segments/s]

Insufficient data points for cow SE-5c06d92d-2782, lactation 5, skipping.


  9%|▉         | 204/2176 [01:16<02:58, 11.06 Segments/s]

Insufficient data points for cow SE-5c06d92d-2877, lactation 5, skipping.


 10%|█         | 226/2176 [01:16<01:11, 27.12 Segments/s]

Insufficient data points for cow SE-5c06d92d-2941, lactation 3, skipping.
Insufficient data points for cow SE-5c06d92d-2950, lactation 5, skipping.


 12%|█▏        | 253/2176 [01:21<03:18,  9.71 Segments/s]

Insufficient data points for cow SE-5c06d92d-3017, lactation 4, skipping.


 13%|█▎        | 289/2176 [01:25<01:35, 19.78 Segments/s]

Insufficient data points for cow SE-5c06d92d-3049, lactation 4, skipping.
Insufficient data points for cow SE-5c06d92d-3054, lactation 3, skipping.


 14%|█▍        | 307/2176 [01:31<05:03,  6.16 Segments/s]

Insufficient data points for cow SE-5c06d92d-3076, lactation 4, skipping.


 15%|█▍        | 325/2176 [01:31<02:15, 13.62 Segments/s]

Insufficient data points for cow SE-5c06d92d-3106, lactation 4, skipping.


 16%|█▌        | 343/2176 [01:33<02:03, 14.87 Segments/s]

Insufficient data points for cow SE-5c06d92d-3124, lactation 4, skipping.


 18%|█▊        | 385/2176 [01:51<06:04,  4.92 Segments/s]

Insufficient data points for cow SE-5c06d92d-3176, lactation 3, skipping.


 21%|██        | 447/2176 [02:08<06:57,  4.14 Segments/s]

Insufficient data points for cow SE-5c06d92d-3225, lactation 3, skipping.
Insufficient data points for cow SE-5c06d92d-3226, lactation 3, skipping.


 21%|██        | 457/2176 [02:08<03:48,  7.53 Segments/s]

Insufficient data points for cow SE-5c06d92d-3251, lactation 3, skipping.


 22%|██▏       | 479/2176 [02:09<01:28, 19.14 Segments/s]

Insufficient data points for cow SE-5c06d92d-3266, lactation 2, skipping.


 24%|██▍       | 517/2176 [02:20<12:43,  2.17 Segments/s]

Insufficient data points for cow SE-5c06d92d-3292, lactation 1, skipping.


 24%|██▍       | 531/2176 [02:21<05:03,  5.42 Segments/s]

Insufficient data points for cow SE-5c06d92d-3323, lactation 2, skipping.


 25%|██▌       | 547/2176 [02:34<11:52,  2.29 Segments/s]

Insufficient data points for cow SE-5c06d92d-3330, lactation 2, skipping.


 27%|██▋       | 579/2176 [02:35<02:02, 13.03 Segments/s]

Insufficient data points for cow SE-5c06d92d-3374, lactation 2, skipping.
Insufficient data points for cow SE-5c06d92d-3377, lactation 2, skipping.
Insufficient data points for cow SE-5c06d92d-3384, lactation 2, skipping.
Insufficient data points for cow SE-5c06d92d-3390, lactation 2, skipping.


 27%|██▋       | 597/2176 [02:35<01:01, 25.76 Segments/s]

Insufficient data points for cow SE-5c06d92d-3401, lactation 2, skipping.
Insufficient data points for cow SE-5c06d92d-3403, lactation 2, skipping.
Insufficient data points for cow SE-5c06d92d-3404, lactation 2, skipping.
Insufficient data points for cow SE-5c06d92d-3406, lactation 2, skipping.
Insufficient data points for cow SE-5c06d92d-3407, lactation 2, skipping.


 28%|██▊       | 611/2176 [02:35<00:45, 34.61 Segments/s]

Insufficient data points for cow SE-5c06d92d-3418, lactation 2, skipping.
Insufficient data points for cow SE-5c06d92d-3424, lactation 2, skipping.
Insufficient data points for cow SE-5c06d92d-3427, lactation 2, skipping.


 29%|██▉       | 627/2176 [02:36<00:38, 40.74 Segments/s]

Insufficient data points for cow SE-5c06d92d-3438, lactation 2, skipping.
Insufficient data points for cow SE-5c06d92d-3441, lactation 2, skipping.
Insufficient data points for cow SE-5c06d92d-3443, lactation 2, skipping.


  result = getattr(ufunc, method)(*inputs2, **kwargs)
 32%|███▏      | 692/2176 [02:51<03:21,  7.37 Segments/s]

Insufficient data points for cow SE-5c06d92d-3589, lactation 1, skipping.
Insufficient data points for cow SE-5c06d92d-3593, lactation 1, skipping.
Insufficient data points for cow SE-5c06d92d-3594, lactation 1, skipping.
Insufficient data points for cow SE-5c06d92d-3595, lactation 1, skipping.
Insufficient data points for cow SE-5c06d92d-3599, lactation 1, skipping.
Insufficient data points for cow SE-5c06d92d-3601, lactation 1, skipping.
Insufficient data points for cow SE-5c06d92d-3602, lactation 1, skipping.
Insufficient data points for cow SE-5c06d92d-3603, lactation 1, skipping.
Insufficient data points for cow SE-5c06d92d-3605, lactation 1, skipping.
Insufficient data points for cow SE-5c06d92d-3607, lactation 1, skipping.
Insufficient data points for cow SE-5c06d92d-3618, lactation 1, skipping.
Insufficient data points for cow SE-5c06d92d-3624, lactation 1, skipping.
Insufficient data points for cow SE-5c06d92d-3626, lactation 1, skipping.
Insufficient data points for cow SE-5c

 33%|███▎      | 716/2176 [03:04<11:31,  2.11 Segments/s]

Insufficient data points for cow SE-5f7f33d6-0630, lactation 6, skipping.


 35%|███▌      | 765/2176 [04:00<25:07,  1.07s/ Segments]

Insufficient data points for cow SE-5f7f33d6-0870, lactation 5, skipping.


  result = getattr(ufunc, method)(*inputs2, **kwargs)
 38%|███▊      | 822/2176 [04:46<14:57,  1.51 Segments/s]

Insufficient data points for cow SE-5f7f33d6-0995, lactation 4, skipping.
Insufficient data points for cow SE-5f7f33d6-1004, lactation 4, skipping.


 40%|████      | 876/2176 [05:08<02:12,  9.83 Segments/s]

Insufficient data points for cow SE-5f7f33d6-1099, lactation 3, skipping.
Insufficient data points for cow SE-5f7f33d6-1101, lactation 3, skipping.


  result = getattr(ufunc, method)(*inputs2, **kwargs)
 41%|████      | 888/2176 [05:13<04:50,  4.43 Segments/s]

Insufficient data points for cow SE-5f7f33d6-1136, lactation 3, skipping.
Insufficient data points for cow SE-5f7f33d6-1138, lactation 3, skipping.


 41%|████      | 895/2176 [05:15<04:46,  4.48 Segments/s]

Insufficient data points for cow SE-5f7f33d6-1146, lactation 3, skipping.


 41%|████▏     | 902/2176 [05:16<03:47,  5.59 Segments/s]

Insufficient data points for cow SE-5f7f33d6-1151, lactation 3, skipping.
Insufficient data points for cow SE-5f7f33d6-1156, lactation 3, skipping.


 42%|████▏     | 909/2176 [05:31<22:08,  1.05s/ Segments]

Insufficient data points for cow SE-5f7f33d6-1164, lactation 3, skipping.


 47%|████▋     | 1027/2176 [06:42<17:08,  1.12 Segments/s]

Insufficient data points for cow SE-5f7f33d6-1266, lactation 2, skipping.
Insufficient data points for cow SE-5f7f33d6-1267, lactation 2, skipping.


 48%|████▊     | 1039/2176 [06:43<05:41,  3.33 Segments/s]

Insufficient data points for cow SE-5f7f33d6-1277, lactation 2, skipping.


 48%|████▊     | 1043/2176 [06:45<07:00,  2.70 Segments/s]

Insufficient data points for cow SE-5f7f33d6-1288, lactation 2, skipping.
Insufficient data points for cow SE-5f7f33d6-1289, lactation 2, skipping.
Insufficient data points for cow SE-5f7f33d6-1294, lactation 2, skipping.


 48%|████▊     | 1055/2176 [06:45<03:03,  6.12 Segments/s]

Insufficient data points for cow SE-5f7f33d6-1295, lactation 2, skipping.
Insufficient data points for cow SE-5f7f33d6-1296, lactation 2, skipping.


 49%|████▉     | 1066/2176 [06:48<03:58,  4.65 Segments/s]

Insufficient data points for cow SE-5f7f33d6-1313, lactation 2, skipping.
Insufficient data points for cow SE-5f7f33d6-1319, lactation 2, skipping.


 52%|█████▏    | 1129/2176 [06:52<00:31, 33.15 Segments/s]

Insufficient data points for cow SE-5f7f33d6-1414, lactation 1, skipping.


 53%|█████▎    | 1152/2176 [06:54<00:57, 17.92 Segments/s]

Insufficient data points for cow SE-5f7f33d6-1419, lactation 1, skipping.
Insufficient data points for cow SE-5f7f33d6-1421, lactation 1, skipping.
Insufficient data points for cow SE-5f7f33d6-1426, lactation 1, skipping.
Insufficient data points for cow SE-5f7f33d6-1427, lactation 1, skipping.
Insufficient data points for cow SE-5f7f33d6-1428, lactation 1, skipping.
Insufficient data points for cow SE-5f7f33d6-1431, lactation 1, skipping.
Insufficient data points for cow SE-5f7f33d6-1433, lactation 1, skipping.
Insufficient data points for cow SE-5f7f33d6-1437, lactation 1, skipping.
Insufficient data points for cow SE-5f7f33d6-1441, lactation 1, skipping.
Insufficient data points for cow SE-5f7f33d6-1442, lactation 1, skipping.
Insufficient data points for cow SE-5f7f33d6-1443, lactation 1, skipping.
Insufficient data points for cow SE-5f7f33d6-1445, lactation 1, skipping.
Insufficient data points for cow SE-5f7f33d6-1450, lactation 1, skipping.
Insufficient data points for cow SE-5f

 53%|█████▎    | 1159/2176 [06:55<01:01, 16.48 Segments/s]

Insufficient data points for cow SE-6d38bc90-2590, lactation 4, skipping.
Insufficient data points for cow SE-6d38bc90-2592, lactation 3, skipping.


 56%|█████▋    | 1226/2176 [07:18<05:05,  3.11 Segments/s]

Insufficient data points for cow SE-752efd72-0190, lactation 3, skipping.


 58%|█████▊    | 1252/2176 [07:27<03:21,  4.58 Segments/s]

Insufficient data points for cow SE-752efd72-0230, lactation 2, skipping.


 60%|█████▉    | 1299/2176 [07:43<03:28,  4.21 Segments/s]

Insufficient data points for cow SE-752efd72-0314, lactation 1, skipping.


  result = getattr(ufunc, method)(*inputs2, **kwargs)
 64%|██████▍   | 1401/2176 [08:15<01:19,  9.70 Segments/s]

Insufficient data points for cow SE-752efd72-0403, lactation 1, skipping.


 68%|██████▊   | 1474/2176 [08:37<00:43, 16.30 Segments/s]

Insufficient data points for cow SE-752efd72-2797, lactation 4, skipping.
Insufficient data points for cow SE-7b463eec-1624, lactation 2, skipping.
Insufficient data points for cow SE-a624fb9a-1187, lactation 6, skipping.


 69%|██████▉   | 1498/2176 [08:42<01:21,  8.35 Segments/s]

Insufficient data points for cow SE-a624fb9a-1275, lactation 5, skipping.
Insufficient data points for cow SE-a624fb9a-1287, lactation 5, skipping.


 70%|██████▉   | 1519/2176 [08:51<03:02,  3.61 Segments/s]

Insufficient data points for cow SE-a624fb9a-1322, lactation 4, skipping.
Insufficient data points for cow SE-a624fb9a-1330, lactation 5, skipping.


 70%|███████   | 1528/2176 [08:56<04:33,  2.37 Segments/s]

Insufficient data points for cow SE-a624fb9a-1342, lactation 4, skipping.
Insufficient data points for cow SE-a624fb9a-1348, lactation 3, skipping.


 77%|███████▋  | 1669/2176 [09:44<04:54,  1.72 Segments/s]

Insufficient data points for cow SE-a624fb9a-1521, lactation 1, skipping.


 78%|███████▊  | 1694/2176 [09:55<03:07,  2.57 Segments/s]

Insufficient data points for cow SE-a756bc39-1070, lactation 4, skipping.


 78%|███████▊  | 1703/2176 [09:59<03:43,  2.11 Segments/s]

Insufficient data points for cow SE-a756bc39-1157, lactation 3, skipping.


 78%|███████▊  | 1706/2176 [10:01<04:05,  1.91 Segments/s]

Insufficient data points for cow SE-a756bc39-1159, lactation 3, skipping.
Insufficient data points for cow SE-a756bc39-1162, lactation 3, skipping.


 79%|███████▉  | 1729/2176 [10:04<00:59,  7.57 Segments/s]

Insufficient data points for cow SE-a756bc39-1231, lactation 2, skipping.


 81%|████████  | 1752/2176 [10:04<00:18, 22.64 Segments/s]

Insufficient data points for cow SE-a756bc39-1268, lactation 1, skipping.
Insufficient data points for cow SE-a756bc39-1275, lactation 1, skipping.
Insufficient data points for cow SE-a756bc39-1276, lactation 1, skipping.
Insufficient data points for cow SE-a756bc39-1284, lactation 1, skipping.
Insufficient data points for cow SE-a756bc39-1288, lactation 1, skipping.


 81%|████████  | 1766/2176 [10:10<01:12,  5.62 Segments/s]

Insufficient data points for cow SE-ab18b151-1240, lactation 3, skipping.
Insufficient data points for cow SE-ab18b151-1244, lactation 3, skipping.
Insufficient data points for cow SE-ab18b151-1263, lactation 3, skipping.
Insufficient data points for cow SE-ab18b151-1330, lactation 2, skipping.
Insufficient data points for cow SE-ab18b151-1338, lactation 2, skipping.
Insufficient data points for cow SE-ab18b151-1341, lactation 2, skipping.


 81%|████████▏ | 1773/2176 [10:11<01:19,  5.06 Segments/s]

Insufficient data points for cow SE-ab18b151-1397, lactation 1, skipping.
Insufficient data points for cow SE-ab18b151-1402, lactation 1, skipping.
Insufficient data points for cow SE-ab18b151-1419, lactation 1, skipping.
Insufficient data points for cow SE-ab18b151-1430, lactation 1, skipping.


 83%|████████▎ | 1798/2176 [10:13<00:31, 12.13 Segments/s]

Insufficient data points for cow SE-ad0a39f5-2139, lactation 5, skipping.
Insufficient data points for cow SE-ad0a39f5-2176, lactation 4, skipping.
Insufficient data points for cow SE-ad0a39f5-2205, lactation 4, skipping.
Insufficient data points for cow SE-ad0a39f5-2230, lactation 4, skipping.
Insufficient data points for cow SE-ad0a39f5-2280, lactation 4, skipping.
Insufficient data points for cow SE-ad0a39f5-2282, lactation 3, skipping.


 83%|████████▎ | 1811/2176 [10:14<00:17, 20.28 Segments/s]

Insufficient data points for cow SE-ad0a39f5-2288, lactation 4, skipping.
Insufficient data points for cow SE-ad0a39f5-2295, lactation 4, skipping.
Insufficient data points for cow SE-ad0a39f5-2321, lactation 4, skipping.


 84%|████████▎ | 1817/2176 [10:16<00:50,  7.18 Segments/s]

Insufficient data points for cow SE-ad0a39f5-2346, lactation 3, skipping.
Insufficient data points for cow SE-ad0a39f5-2349, lactation 4, skipping.


 85%|████████▍ | 1839/2176 [10:19<00:29, 11.46 Segments/s]

Insufficient data points for cow SE-ad0a39f5-2420, lactation 3, skipping.


 85%|████████▍ | 1843/2176 [10:20<00:36,  9.02 Segments/s]

Insufficient data points for cow SE-ad0a39f5-2488, lactation 3, skipping.


  result = getattr(ufunc, method)(*inputs2, **kwargs)
 86%|████████▋ | 1881/2176 [10:27<00:16, 17.50 Segments/s]

Insufficient data points for cow SE-ad0a39f5-2599, lactation 2, skipping.
Insufficient data points for cow SE-ad0a39f5-2625, lactation 2, skipping.
Insufficient data points for cow SE-ad0a39f5-2628, lactation 2, skipping.


 87%|████████▋ | 1889/2176 [10:27<00:11, 25.12 Segments/s]

Insufficient data points for cow SE-ad0a39f5-2642, lactation 2, skipping.
Insufficient data points for cow SE-ad0a39f5-2644, lactation 2, skipping.
Insufficient data points for cow SE-ad0a39f5-2653, lactation 2, skipping.
Insufficient data points for cow SE-ad0a39f5-2654, lactation 2, skipping.


 88%|████████▊ | 1904/2176 [10:29<00:23, 11.71 Segments/s]

Insufficient data points for cow SE-ad0a39f5-2664, lactation 2, skipping.


 89%|████████▉ | 1938/2176 [10:31<00:07, 31.94 Segments/s]

Insufficient data points for cow SE-ad0a39f5-2734, lactation 1, skipping.


 90%|████████▉ | 1949/2176 [10:31<00:08, 27.68 Segments/s]

Insufficient data points for cow SE-ad0a39f5-2765, lactation 1, skipping.


 90%|█████████ | 1963/2176 [10:32<00:08, 23.79 Segments/s]

Insufficient data points for cow SE-ad0a39f5-2781, lactation 1, skipping.
Insufficient data points for cow SE-ad0a39f5-2790, lactation 1, skipping.
Insufficient data points for cow SE-ad0a39f5-2792, lactation 1, skipping.
Insufficient data points for cow SE-ad0a39f5-2797, lactation 1, skipping.
Insufficient data points for cow SE-ad0a39f5-2798, lactation 1, skipping.
Insufficient data points for cow SE-ad0a39f5-2806, lactation 1, skipping.
Insufficient data points for cow SE-ad0a39f5-2807, lactation 1, skipping.
Insufficient data points for cow SE-ad0a39f5-2810, lactation 1, skipping.
Insufficient data points for cow SE-ad0a39f5-2813, lactation 1, skipping.
Insufficient data points for cow SE-ad0a39f5-2815, lactation 1, skipping.
Insufficient data points for cow SE-ad0a39f5-2816, lactation 1, skipping.
Insufficient data points for cow SE-ad0a39f5-2817, lactation 1, skipping.


  result = getattr(ufunc, method)(*inputs2, **kwargs)
 91%|█████████ | 1985/2176 [10:55<01:45,  1.82 Segments/s]

Insufficient data points for cow SE-afdd9a78-1093, lactation 5, skipping.


 92%|█████████▏| 1993/2176 [11:00<01:43,  1.77 Segments/s]

Insufficient data points for cow SE-afdd9a78-1142, lactation 3, skipping.


  result = getattr(ufunc, method)(*inputs2, **kwargs)
 92%|█████████▏| 2000/2176 [11:02<01:08,  2.58 Segments/s]

Insufficient data points for cow SE-afdd9a78-1154, lactation 3, skipping.


  result = getattr(ufunc, method)(*inputs2, **kwargs)
 92%|█████████▏| 2010/2176 [11:14<02:14,  1.23 Segments/s]

Insufficient data points for cow SE-afdd9a78-1197, lactation 2, skipping.


 93%|█████████▎| 2016/2176 [11:16<01:18,  2.04 Segments/s]

Insufficient data points for cow SE-afdd9a78-1207, lactation 1, skipping.
Insufficient data points for cow SE-afdd9a78-1215, lactation 1, skipping.
Insufficient data points for cow SE-afdd9a78-1218, lactation 1, skipping.


 96%|█████████▋| 2096/2176 [11:57<00:04, 17.18 Segments/s]

Insufficient data points for cow SE-f454e660-0760, lactation 2, skipping.


 97%|█████████▋| 2107/2176 [12:24<01:45,  1.53s/ Segments]

Insufficient data points for cow SE-f454e660-0807, lactation 1, skipping.


100%|██████████| 2176/2176 [12:52<00:00,  2.82 Segments/s]


Unnamed: 0,SE_Number,FarmName_Pseudo,Date,DailyYield,PreviousDailyYield,DailyYieldChange,HW,MeanTemperature,MeanTHI_adj,DaysInMilk,YearSeason,cum_HW,Temp15Threshold,Age,BreedName,LactationNumber,ExpectedYield,NormalizedDailyYield,NormalizedDailyYieldChange
0,SE-064c0cec-1189,a624fb9a,2022-05-29,18.96,,,0,9.766667,52.909813,5,202202,0,0,3243,SLB,8,41.739904,0.454242,
1,SE-064c0cec-1189,a624fb9a,2022-05-30,22.64,18.96,3.68,0,9.833333,52.409547,6,202202,0,1,3244,SLB,8,41.659523,0.543453,0.088335
2,SE-064c0cec-1189,a624fb9a,2022-05-31,26.49,22.64,3.85,0,11.200000,52.273070,7,202202,0,0,3245,SLB,8,41.579142,0.637098,0.092595
3,SE-064c0cec-1189,a624fb9a,2022-06-01,33.61,26.49,7.12,0,12.433333,55.124493,8,202203,0,1,3246,SLB,8,41.498761,0.809904,0.171571
4,SE-064c0cec-1189,a624fb9a,2022-06-02,35.02,33.61,1.41,0,11.766667,54.947733,9,202203,0,1,3247,SLB,8,41.41838,0.845518,0.034043
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
580238,SE-f454e660-799,f454e660,2024-03-29,22.41,26.74,-4.33,0,6.600000,41.551190,308,202402,0,0,,SLB,1,36.277424,0.61774,-0.119358
580239,SE-f454e660-799,f454e660,2024-03-30,22.00,22.41,-0.41,0,7.500000,49.420200,309,202402,0,0,,SLB,1,36.276903,0.606446,-0.011302
580240,SE-f454e660-799,f454e660,2024-03-31,25.11,22.00,3.11,0,6.400000,42.486960,310,202402,0,0,,SLB,1,36.276383,0.692186,0.085731
580241,SE-f454e660-799,f454e660,2024-04-01,20.84,25.11,-4.27,0,4.350000,39.095100,311,202402,0,0,,SLB,1,36.275862,0.574487,-0.117709


In [34]:
# Check if NormalizedDailyYield is centered around 1 for each unique farm
print("Mean of NormalizedDailyYield:", data.groupby('FarmName_Pseudo')['NormalizedDailyYield'].mean())
print("Standard Deviation of NormalizedDailyYield:", data.groupby('FarmName_Pseudo')['NormalizedDailyYield'].std())

Mean of NormalizedDailyYield: FarmName_Pseudo
5b581702     0.94171
5c06d92d    0.949423
5f7f33d6    0.939261
752efd72    0.955875
a624fb9a    0.933669
ab18b151    0.951866
ad0a39f5    0.952461
afdd9a78    0.870425
f454e660    0.931667
Name: NormalizedDailyYield, dtype: Float64
Standard Deviation of NormalizedDailyYield: FarmName_Pseudo
5b581702    0.182345
5c06d92d    0.127297
5f7f33d6     0.19153
752efd72    0.163624
a624fb9a    0.177036
ab18b151     0.15418
ad0a39f5    0.121083
afdd9a78    0.306046
f454e660    0.219041
Name: NormalizedDailyYield, dtype: Float64


In [35]:
# Define the THI threshold ================================================================================>>> Change threshold here to 61 alt 67
THI_THRESHOLD = 67

# Calculate the daily heat load based on the THI threshold
data['HeatLoad'] = data['MeanTHI_adj'].apply(lambda x: x - THI_THRESHOLD if x > THI_THRESHOLD else -(THI_THRESHOLD - x))

# Initialize the cumulative heat load column with float type
data['CumulativeHeatLoad'] = 0.0  # Explicitly set as float

data = data.reset_index(drop=True)

# Iterate through the data to calculate cumulative heat load correctly
for i in range(1, len(data)):
    previous_cumulative = data.at[i-1, 'CumulativeHeatLoad']
    current_heat_load = data.at[i, 'HeatLoad']
    
    if current_heat_load < 0:  # If current heat load is negative
        new_cumulative = previous_cumulative + 2 * current_heat_load
    else:
        new_cumulative = previous_cumulative + current_heat_load
    
    # Ensure the cumulative heat load never goes below zero
    if new_cumulative > 0:
        data.at[i, 'CumulativeHeatLoad'] = new_cumulative
    else:
        data.at[i, 'CumulativeHeatLoad'] = 0.0  # Ensure float is maintained

# Drop rows where the 'DailyYield' column has NaN values
data = data.dropna(subset=['DailyYield'])

data.head(-5)

Unnamed: 0,SE_Number,FarmName_Pseudo,Date,DailyYield,PreviousDailyYield,DailyYieldChange,HW,MeanTemperature,MeanTHI_adj,DaysInMilk,...,cum_HW,Temp15Threshold,Age,BreedName,LactationNumber,ExpectedYield,NormalizedDailyYield,NormalizedDailyYieldChange,HeatLoad,CumulativeHeatLoad
0,SE-064c0cec-1189,a624fb9a,2022-05-29,18.96,,,0,9.766667,52.909813,5,...,0,0,3243,SLB,8,41.739904,0.454242,,-14.090187,0.0
1,SE-064c0cec-1189,a624fb9a,2022-05-30,22.64,18.96,3.68,0,9.833333,52.409547,6,...,0,1,3244,SLB,8,41.659523,0.543453,0.088335,-14.590453,0.0
2,SE-064c0cec-1189,a624fb9a,2022-05-31,26.49,22.64,3.85,0,11.200000,52.273070,7,...,0,0,3245,SLB,8,41.579142,0.637098,0.092595,-14.726930,0.0
3,SE-064c0cec-1189,a624fb9a,2022-06-01,33.61,26.49,7.12,0,12.433333,55.124493,8,...,0,1,3246,SLB,8,41.498761,0.809904,0.171571,-11.875507,0.0
4,SE-064c0cec-1189,a624fb9a,2022-06-02,35.02,33.61,1.41,0,11.766667,54.947733,9,...,0,1,3247,SLB,8,41.41838,0.845518,0.034043,-12.052267,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
555141,SE-f454e660-799,f454e660,2024-03-24,23.66,27.18,-3.52,0,3.450000,39.685770,303,...,0,0,,SLB,1,36.280026,0.652149,-0.097023,-27.314230,0.0
555142,SE-f454e660-799,f454e660,2024-03-25,22.73,23.66,-0.93,0,2.350000,36.393390,304,...,0,0,,SLB,1,36.279506,0.626525,-0.025634,-30.606610,0.0
555143,SE-f454e660-799,f454e660,2024-03-26,11.51,22.73,-11.22,0,5.900000,44.226420,305,...,0,0,,SLB,1,36.278985,0.317264,-0.30927,-22.773580,0.0
555144,SE-f454e660-799,f454e660,2024-03-27,24.73,11.51,13.22,0,5.800000,40.532550,306,...,0,0,,SLB,1,36.278465,0.681672,0.364404,-26.467450,0.0


In [36]:
# When CumulativeHeatLoad is greater than 3, it indicates that the cow is under heat stress
data['HeatStress'] = (data['CumulativeHeatLoad'] > 3).astype(int)
data.head(-5)

Unnamed: 0,SE_Number,FarmName_Pseudo,Date,DailyYield,PreviousDailyYield,DailyYieldChange,HW,MeanTemperature,MeanTHI_adj,DaysInMilk,...,Temp15Threshold,Age,BreedName,LactationNumber,ExpectedYield,NormalizedDailyYield,NormalizedDailyYieldChange,HeatLoad,CumulativeHeatLoad,HeatStress
0,SE-064c0cec-1189,a624fb9a,2022-05-29,18.96,,,0,9.766667,52.909813,5,...,0,3243,SLB,8,41.739904,0.454242,,-14.090187,0.0,0
1,SE-064c0cec-1189,a624fb9a,2022-05-30,22.64,18.96,3.68,0,9.833333,52.409547,6,...,1,3244,SLB,8,41.659523,0.543453,0.088335,-14.590453,0.0,0
2,SE-064c0cec-1189,a624fb9a,2022-05-31,26.49,22.64,3.85,0,11.200000,52.273070,7,...,0,3245,SLB,8,41.579142,0.637098,0.092595,-14.726930,0.0,0
3,SE-064c0cec-1189,a624fb9a,2022-06-01,33.61,26.49,7.12,0,12.433333,55.124493,8,...,1,3246,SLB,8,41.498761,0.809904,0.171571,-11.875507,0.0,0
4,SE-064c0cec-1189,a624fb9a,2022-06-02,35.02,33.61,1.41,0,11.766667,54.947733,9,...,1,3247,SLB,8,41.41838,0.845518,0.034043,-12.052267,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
555141,SE-f454e660-799,f454e660,2024-03-24,23.66,27.18,-3.52,0,3.450000,39.685770,303,...,0,,SLB,1,36.280026,0.652149,-0.097023,-27.314230,0.0,0
555142,SE-f454e660-799,f454e660,2024-03-25,22.73,23.66,-0.93,0,2.350000,36.393390,304,...,0,,SLB,1,36.279506,0.626525,-0.025634,-30.606610,0.0,0
555143,SE-f454e660-799,f454e660,2024-03-26,11.51,22.73,-11.22,0,5.900000,44.226420,305,...,0,,SLB,1,36.278985,0.317264,-0.30927,-22.773580,0.0,0
555144,SE-f454e660-799,f454e660,2024-03-27,24.73,11.51,13.22,0,5.800000,40.532550,306,...,0,,SLB,1,36.278465,0.681672,0.364404,-26.467450,0.0,0


In [37]:
# Make a dataframe from the parameters dictionary, it should contain Se_Number, LactationNumber, a, b, c, d
params_df = pd.DataFrame(params_dict).T.reset_index()
params_df.columns = ['SE_Number', 'LactationNumber', 'a', 'b', 'c', 'd']
params_df.head(-5)

Unnamed: 0,SE_Number,LactationNumber,a,b,c,d
0,SE-064c0cec-1189,8,42.141809,-0.080381,2.179164e+04,5.034495e+03
1,SE-27c3257a-1492,1,33.377780,-0.012773,-3.883773e-26,1.266609e-24
2,SE-27c3257a-1492,2,52.223603,-0.086165,1.962517e+04,4.178013e+03
3,SE-30dc5787-1389,6,53.606586,-0.078718,2.279885e+04,5.355650e+03
4,SE-4b8091ac-1472,1,34.455809,-0.022543,3.570575e-14,-7.041866e-13
...,...,...,...,...,...,...
1988,SE-f454e660-731,1,40.770711,-0.079710,-2.954020e+02,1.772412e+03
1989,SE-f454e660-735,1,26.431838,0.021503,-1.835149e+02,1.101089e+03
1990,SE-f454e660-737,1,27.996059,-0.038187,-4.488891e+01,2.693335e+02
1991,SE-f454e660-748,1,36.408805,-0.007535,-1.617378e+02,9.704269e+02


In [39]:
# Calculate Z-scores for each parameter
params_df['z_a'] = zscore(params_df['a'])
params_df['z_b'] = zscore(params_df['b'])
params_df['z_c'] = zscore(params_df['c'])
params_df['z_d'] = zscore(params_df['d'])

params_df.head(-5)

Unnamed: 0,SE_Number,LactationNumber,a,b,c,d,z_a,z_b,z_c,z_d
0,SE-064c0cec-1189,8,42.141809,-0.080381,2.179164e+04,5.034495e+03,-0.370900,-0.747926,0.784258,0.224763
1,SE-27c3257a-1492,1,33.377780,-0.012773,-3.883773e-26,1.266609e-24,-1.177486,1.126624,-1.121185,-0.575798
2,SE-27c3257a-1492,2,52.223603,-0.086165,1.962517e+04,4.178013e+03,0.556964,-0.908307,0.594824,0.088570
3,SE-30dc5787-1389,6,53.606586,-0.078718,2.279885e+04,5.355650e+03,0.684245,-0.701825,0.872328,0.275832
4,SE-4b8091ac-1472,1,34.455809,-0.022543,3.570575e-14,-7.041866e-13,-1.078271,0.855728,-1.121185,-0.575798
...,...,...,...,...,...,...,...,...,...,...
1988,SE-f454e660-731,1,40.770711,-0.079710,-2.954020e+02,1.772412e+03,-0.497088,-0.729331,-1.147015,-0.293957
1989,SE-f454e660-735,1,26.431838,0.021503,-1.835149e+02,1.101089e+03,-1.816747,2.076974,-1.137232,-0.400708
1990,SE-f454e660-737,1,27.996059,-0.038187,-4.488891e+01,2.693335e+02,-1.672786,0.421958,-1.125110,-0.532970
1991,SE-f454e660-748,1,36.408805,-0.007535,-1.617378e+02,9.704269e+02,-0.898530,1.271848,-1.135328,-0.421485


In [40]:
# Identify outliers (using Z-score > 3.5 or < -3.5 as threshold)
outliers = params_df[(np.abs(params_df[['z_a', 'z_b', 'z_c', 'z_d']]) > 3.5).any(axis=1)]

x = outliers.count()
print("Number of outliers:", x)

# Optionally, drop the outliers
params_df_cleaned = params_df.drop(outliers.index)
params_df_cleaned.head(-5)

Number of outliers: SE_Number          27
LactationNumber    27
a                  27
b                  27
c                  27
d                  27
z_a                27
z_b                27
z_c                27
z_d                27
dtype: int64


Unnamed: 0,SE_Number,LactationNumber,a,b,c,d,z_a,z_b,z_c,z_d
0,SE-064c0cec-1189,8,42.141809,-0.080381,2.179164e+04,5.034495e+03,-0.370900,-0.747926,0.784258,0.224763
1,SE-27c3257a-1492,1,33.377780,-0.012773,-3.883773e-26,1.266609e-24,-1.177486,1.126624,-1.121185,-0.575798
2,SE-27c3257a-1492,2,52.223603,-0.086165,1.962517e+04,4.178013e+03,0.556964,-0.908307,0.594824,0.088570
3,SE-30dc5787-1389,6,53.606586,-0.078718,2.279885e+04,5.355650e+03,0.684245,-0.701825,0.872328,0.275832
4,SE-4b8091ac-1472,1,34.455809,-0.022543,3.570575e-14,-7.041866e-13,-1.078271,0.855728,-1.121185,-0.575798
...,...,...,...,...,...,...,...,...,...,...
1988,SE-f454e660-731,1,40.770711,-0.079710,-2.954020e+02,1.772412e+03,-0.497088,-0.729331,-1.147015,-0.293957
1989,SE-f454e660-735,1,26.431838,0.021503,-1.835149e+02,1.101089e+03,-1.816747,2.076974,-1.137232,-0.400708
1990,SE-f454e660-737,1,27.996059,-0.038187,-4.488891e+01,2.693335e+02,-1.672786,0.421958,-1.125110,-0.532970
1991,SE-f454e660-748,1,36.408805,-0.007535,-1.617378e+02,9.704269e+02,-0.898530,1.271848,-1.135328,-0.421485


In [41]:
# Identify unique SE_Number and LactationNumber combinations from the outliers
outlier_combinations = outliers[['SE_Number', 'LactationNumber']].drop_duplicates()

# Merge with the original data to find rows that match these outlier combinations
data_cleaned = data.merge(outlier_combinations, on=['SE_Number', 'LactationNumber'], how='left', indicator=True)

# Keep only the rows that do not match the outlier combinations
data_cleaned = data_cleaned[data_cleaned['_merge'] == 'left_only'].drop(columns=['_merge'])

# Now data_cleaned contains the original data with the outlier combinations removed
print("Number of rows removed:", len(data) - len(data_cleaned))
data_cleaned.head(-5)

Number of rows removed: 6425


Unnamed: 0,SE_Number,FarmName_Pseudo,Date,DailyYield,PreviousDailyYield,DailyYieldChange,HW,MeanTemperature,MeanTHI_adj,DaysInMilk,...,Temp15Threshold,Age,BreedName,LactationNumber,ExpectedYield,NormalizedDailyYield,NormalizedDailyYieldChange,HeatLoad,CumulativeHeatLoad,HeatStress
0,SE-064c0cec-1189,a624fb9a,2022-05-29,18.96,,,0,9.766667,52.909813,5,...,0,3243,SLB,8,41.739904,0.454242,,-14.090187,0.0,0
1,SE-064c0cec-1189,a624fb9a,2022-05-30,22.64,18.96,3.68,0,9.833333,52.409547,6,...,1,3244,SLB,8,41.659523,0.543453,0.088335,-14.590453,0.0,0
2,SE-064c0cec-1189,a624fb9a,2022-05-31,26.49,22.64,3.85,0,11.200000,52.273070,7,...,0,3245,SLB,8,41.579142,0.637098,0.092595,-14.726930,0.0,0
3,SE-064c0cec-1189,a624fb9a,2022-06-01,33.61,26.49,7.12,0,12.433333,55.124493,8,...,1,3246,SLB,8,41.498761,0.809904,0.171571,-11.875507,0.0,0
4,SE-064c0cec-1189,a624fb9a,2022-06-02,35.02,33.61,1.41,0,11.766667,54.947733,9,...,1,3247,SLB,8,41.41838,0.845518,0.034043,-12.052267,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
555141,SE-f454e660-799,f454e660,2024-03-24,23.66,27.18,-3.52,0,3.450000,39.685770,303,...,0,,SLB,1,36.280026,0.652149,-0.097023,-27.314230,0.0,0
555142,SE-f454e660-799,f454e660,2024-03-25,22.73,23.66,-0.93,0,2.350000,36.393390,304,...,0,,SLB,1,36.279506,0.626525,-0.025634,-30.606610,0.0,0
555143,SE-f454e660-799,f454e660,2024-03-26,11.51,22.73,-11.22,0,5.900000,44.226420,305,...,0,,SLB,1,36.278985,0.317264,-0.30927,-22.773580,0.0,0
555144,SE-f454e660-799,f454e660,2024-03-27,24.73,11.51,13.22,0,5.800000,40.532550,306,...,0,,SLB,1,36.278465,0.681672,0.364404,-26.467450,0.0,0


In [42]:
# Check if NormalizedDailyYield is centered around 1 for each unique farm
print("Mean of NormalizedDailyYield:", data_cleaned.groupby('FarmName_Pseudo')['NormalizedDailyYield'].mean())
print("Standard Deviation of NormalizedDailyYield:", data_cleaned.groupby('FarmName_Pseudo')['NormalizedDailyYield'].std())

Mean of NormalizedDailyYield: FarmName_Pseudo
5b581702    0.941604
5c06d92d    0.949592
5f7f33d6    0.939203
752efd72    0.956095
a624fb9a    0.934005
ab18b151    0.953157
ad0a39f5     0.95253
afdd9a78    0.870425
f454e660    0.932097
Name: NormalizedDailyYield, dtype: Float64
Standard Deviation of NormalizedDailyYield: FarmName_Pseudo
5b581702    0.182503
5c06d92d    0.126901
5f7f33d6    0.191713
752efd72    0.163165
a624fb9a    0.176652
ab18b151    0.151519
ad0a39f5    0.121162
afdd9a78    0.306046
f454e660    0.219249
Name: NormalizedDailyYield, dtype: Float64


In [43]:
data_cleaned['Residuals'] = data_cleaned['DailyYield'] - data_cleaned['ExpectedYield']
data_cleaned

Unnamed: 0,SE_Number,FarmName_Pseudo,Date,DailyYield,PreviousDailyYield,DailyYieldChange,HW,MeanTemperature,MeanTHI_adj,DaysInMilk,...,Age,BreedName,LactationNumber,ExpectedYield,NormalizedDailyYield,NormalizedDailyYieldChange,HeatLoad,CumulativeHeatLoad,HeatStress,Residuals
0,SE-064c0cec-1189,a624fb9a,2022-05-29,18.96,,,0,9.766667,52.909813,5,...,3243,SLB,8,41.739904,0.454242,,-14.090187,0.0,0,-22.779904
1,SE-064c0cec-1189,a624fb9a,2022-05-30,22.64,18.96,3.68,0,9.833333,52.409547,6,...,3244,SLB,8,41.659523,0.543453,0.088335,-14.590453,0.0,0,-19.019523
2,SE-064c0cec-1189,a624fb9a,2022-05-31,26.49,22.64,3.85,0,11.200000,52.273070,7,...,3245,SLB,8,41.579142,0.637098,0.092595,-14.726930,0.0,0,-15.089142
3,SE-064c0cec-1189,a624fb9a,2022-06-01,33.61,26.49,7.12,0,12.433333,55.124493,8,...,3246,SLB,8,41.498761,0.809904,0.171571,-11.875507,0.0,0,-7.888761
4,SE-064c0cec-1189,a624fb9a,2022-06-02,35.02,33.61,1.41,0,11.766667,54.947733,9,...,3247,SLB,8,41.41838,0.845518,0.034043,-12.052267,0.0,0,-6.39838
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
555146,SE-f454e660-799,f454e660,2024-03-29,22.41,26.74,-4.33,0,6.600000,41.551190,308,...,,SLB,1,36.277424,0.61774,-0.119358,-25.448810,0.0,0,-13.867424
555147,SE-f454e660-799,f454e660,2024-03-30,22.00,22.41,-0.41,0,7.500000,49.420200,309,...,,SLB,1,36.276903,0.606446,-0.011302,-17.579800,0.0,0,-14.276903
555148,SE-f454e660-799,f454e660,2024-03-31,25.11,22.00,3.11,0,6.400000,42.486960,310,...,,SLB,1,36.276383,0.692186,0.085731,-24.513040,0.0,0,-11.166383
555149,SE-f454e660-799,f454e660,2024-04-01,20.84,25.11,-4.27,0,4.350000,39.095100,311,...,,SLB,1,36.275862,0.574487,-0.117709,-27.904900,0.0,0,-15.435862


In [44]:
farm_results = []

for farm_name, farm_group in data_cleaned.groupby('FarmName_Pseudo'):
    farm_residuals = []
    
    for se_number, cow_group in farm_group.groupby('SE_Number'):
        residuals = cow_group['Residuals'].dropna()  # Drop NaN values
        
        if len(residuals) > 1:  # Ensure there are residuals to analyze
            farm_residuals.append(residuals)
    
    if len(farm_residuals) > 0:
        # Combine residuals from all cows in the farm
        combined_residuals = np.concatenate(farm_residuals)
        
        if len(combined_residuals) > 1:  # Ensure enough data to perform calculations
            # Calculate farm-level statistics
            acf_values = acf(combined_residuals, nlags=30, fft=False)
            pacf_values = pacf(combined_residuals, nlags=min(30, len(combined_residuals)//2))

            # Print the farm-level statistics
            print(f"Farm: {farm_name}")
            print(f"ACF (first 5 lags): {acf_values[:5]}")
            print(f"PACF (first 5 lags): {pacf_values[:5]}")
        else:
            print(f"Farm: {farm_name} does not have enough data for reliable calculations.")
        
        print("=" * 50)

Farm: 5b581702
ACF (first 5 lags): [1.         0.08101558 0.32072019 0.2870013  0.26956239]
PACF (first 5 lags): [1.         0.08101894 0.31625869 0.27251247 0.19303314]


  xo = x - x.mean()
  x -= x.mean()


Farm: 5c06d92d
ACF (first 5 lags): [nan nan nan nan nan]
PACF (first 5 lags): [ 1. nan nan nan nan]


  xo = x - x.mean()
  x -= x.mean()
  xo = x - x.mean()


Farm: 5f7f33d6
ACF (first 5 lags): [nan nan nan nan nan]
PACF (first 5 lags): [ 1. nan nan nan nan]


  x -= x.mean()


Farm: 752efd72
ACF (first 5 lags): [nan nan nan nan nan]
PACF (first 5 lags): [ 1. nan nan nan nan]
Farm: a624fb9a
ACF (first 5 lags): [1.         0.15701826 0.39166732 0.35844783 0.33915495]
PACF (first 5 lags): [1.         0.15702105 0.37630359 0.31176642 0.21398586]
Farm: ab18b151
ACF (first 5 lags): [1.         0.0601037  0.28605826 0.26924812 0.22866758]
PACF (first 5 lags): [1.         0.06012607 0.28368167 0.26212904 0.16842051]


  xo = x - x.mean()
  x -= x.mean()
  ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)
  ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)


Farm: ad0a39f5
ACF (first 5 lags): [nan nan nan nan nan]
PACF (first 5 lags): [ 1. nan nan nan nan]
Farm: afdd9a78
ACF (first 5 lags): [nan nan nan nan nan]
PACF (first 5 lags): [ 1. nan nan nan nan]
Farm: f454e660
ACF (first 5 lags): [1.         0.01635437 0.28461615 0.2706611  0.25362883]
PACF (first 5 lags): [1.         0.01635472 0.28443693 0.28589615 0.22259302]


In [45]:
# Group by 'FarmName_Pseudo', 'SE_Number', and 'LactationNumber' to perform individual calculations
farm_results = []

for farm_name, farm_group in data_cleaned.groupby('FarmName_Pseudo'):
    print(f"Farm: {farm_name}")
    
    for (se_number, lactation_number), cow_group in farm_group.groupby(['SE_Number', 'LactationNumber']):
        residuals = cow_group['Residuals']
        residuals = cow_group['Residuals'].dropna()  # Drop NaN values
        
        if len(residuals) > 1:  # Ensure there are residuals to analyze
            acf_values = acf(residuals, nlags=30, fft=False)
            pacf_values = pacf(residuals, nlags=min(30, len(residuals)//2))

            # Print the statistics
            print(f"\nCow: {se_number}, Lactation Number: {lactation_number}")
            print(f"ACF (first 5 lags): {acf_values[:5]}")
            print(f"PACF (first 5 lags): {pacf_values[:5]}")
            print("-" * 50)
            
    print("=" * 50)

Farm: 5b581702

Cow: SE-27c3257a-1492, Lactation Number: 1
ACF (first 5 lags): [ 1.         -0.02097328  0.19864561  0.22198977  0.21537846]
PACF (first 5 lags): [ 1.         -0.02104983  0.1997463   0.24176238  0.21855336]
--------------------------------------------------

Cow: SE-27c3257a-1492, Lactation Number: 2
ACF (first 5 lags): [ 1.         -0.0257726   0.15859277  0.32194023  0.10841912]
PACF (first 5 lags): [ 1.         -0.02587569  0.15930372  0.34234577  0.1363466 ]
--------------------------------------------------

Cow: SE-4b8091ac-1472, Lactation Number: 1
ACF (first 5 lags): [1.         0.11608628 0.34493712 0.29705093 0.14983035]
PACF (first 5 lags): [1.         0.11648114 0.33831398 0.26779714 0.01779562]
--------------------------------------------------

Cow: SE-5b581702-1742, Lactation Number: 3
ACF (first 5 lags): [ 1.         -0.33950868  0.08122775  0.00108425  0.06716684]
PACF (first 5 lags): [ 1.         -0.34061457 -0.03875604  0.01905789  0.08788735]
------

  xo = x - x.mean()
  x -= x.mean()



Cow: SE-5f7f33d6-0296, Lactation Number: 8
ACF (first 5 lags): [ 1.         -0.05514268  0.21027559  0.1848198   0.13842945]
PACF (first 5 lags): [ 1.         -0.05534767  0.20942298  0.21795455  0.13536003]
--------------------------------------------------

Cow: SE-5f7f33d6-0425, Lactation Number: 6
ACF (first 5 lags): [ 1.         -0.11898196  0.19363495  0.22693795  0.16428835]
PACF (first 5 lags): [ 1.         -0.11934806  0.1831957   0.28331074  0.22059274]
--------------------------------------------------

Cow: SE-5f7f33d6-0577, Lactation Number: 5
ACF (first 5 lags): [1.         0.01186735 0.30042293 0.21196032 0.29177069]
PACF (first 5 lags): [1.         0.01191147 0.30256594 0.22923901 0.25042535]
--------------------------------------------------

Cow: SE-5f7f33d6-0593, Lactation Number: 4
ACF (first 5 lags): [ 1.         -0.06569188  0.07546237  0.27091735  0.09792719]
PACF (first 5 lags): [ 1.         -0.06591381  0.07194191  0.28578557  0.14696047]
---------------------

  xo = x - x.mean()
  x -= x.mean()



Cow: SE-5f7f33d6-0985, Lactation Number: 4
ACF (first 5 lags): [1.         0.04184509 0.2723597  0.21676754 0.16778146]
PACF (first 5 lags): [1.         0.04208151 0.27416936 0.21721623 0.10822706]
--------------------------------------------------

Cow: SE-5f7f33d6-0986, Lactation Number: 3
ACF (first 5 lags): [1.         0.12763647 0.27373763 0.27455751 0.35500545]
PACF (first 5 lags): [1.         0.12804953 0.26343798 0.23763997 0.29412186]
--------------------------------------------------

Cow: SE-5f7f33d6-0988, Lactation Number: 2
ACF (first 5 lags): [1.         0.16508969 0.31879829 0.35981494 0.35644644]
PACF (first 5 lags): [1.         0.16555083 0.30143876 0.31199199 0.27050695]
--------------------------------------------------

Cow: SE-5f7f33d6-0989, Lactation Number: 2
ACF (first 5 lags): [ 1.         -0.27787933  0.14173436  0.21196729  0.0236875 ]
PACF (first 5 lags): [ 1.         -0.2788087   0.07042581  0.29500548  0.1733522 ]
-----------------------------------------

  xo = x - x.mean()
  x -= x.mean()



Cow: SE-5f7f33d6-1126, Lactation Number: 3
ACF (first 5 lags): [nan nan nan nan nan]
PACF (first 5 lags): [ 1. nan nan nan nan]
--------------------------------------------------

Cow: SE-5f7f33d6-1128, Lactation Number: 2
ACF (first 5 lags): [ 1.         -0.11876327  0.21986513  0.19882037  0.20714426]
PACF (first 5 lags): [ 1.         -0.11931057  0.21066473  0.26286878  0.25120565]
--------------------------------------------------

Cow: SE-5f7f33d6-1132, Lactation Number: 1
ACF (first 5 lags): [ 1.         -0.22515996  0.11487186  0.07282656  0.09365412]
PACF (first 5 lags): [ 1.         -0.22611402  0.06820932  0.12027192  0.13726439]
--------------------------------------------------

Cow: SE-5f7f33d6-1132, Lactation Number: 2
ACF (first 5 lags): [ 1.         -0.16578453  0.30170484  0.18443995  0.25866624]
PACF (first 5 lags): [ 1.         -0.16632279  0.28385955  0.30160401  0.31161908]
--------------------------------------------------

Cow: SE-5f7f33d6-1133, Lactation Number

  xo = x - x.mean()
  x -= x.mean()



Cow: SE-752efd72-0367, Lactation Number: 2
ACF (first 5 lags): [1.         0.56953367 0.51834164 0.49333009 0.49422935]
PACF (first 5 lags): [1.         0.57120877 0.289616   0.19556575 0.1768651 ]
--------------------------------------------------

Cow: SE-752efd72-0370, Lactation Number: 1
ACF (first 5 lags): [1.         0.37631804 0.34617261 0.30210908 0.2244941 ]
PACF (first 5 lags): [1.         0.37782331 0.24053983 0.14208247 0.03468855]
--------------------------------------------------

Cow: SE-752efd72-0370, Lactation Number: 2
ACF (first 5 lags): [1.         0.71649657 0.66313169 0.59614273 0.52332707]
PACF (first 5 lags): [1.         0.71902836 0.31228474 0.10949921 0.00544939]
--------------------------------------------------

Cow: SE-752efd72-0371, Lactation Number: 1
ACF (first 5 lags): [1.         0.5985017  0.52034693 0.4958609  0.45809141]
PACF (first 5 lags): [1.         0.6002166  0.25491208 0.18907935 0.10607041]
--------------------------------------------------


  xo = x - x.mean()
  x -= x.mean()



Cow: SE-ad0a39f5-2602, Lactation Number: 2
ACF (first 5 lags): [1.         0.26112723 0.26304171 0.24470218 0.28342573]
PACF (first 5 lags): [1.         0.26224316 0.21104176 0.15468563 0.18086899]
--------------------------------------------------

Cow: SE-ad0a39f5-2612, Lactation Number: 2
ACF (first 5 lags): [1.         0.32071087 0.40825879 0.38893979 0.2456875 ]
PACF (first 5 lags): [1.00000000e+00 3.22586376e-01 3.44889773e-01 2.49940852e-01
 9.94200043e-04]
--------------------------------------------------

Cow: SE-ad0a39f5-2619, Lactation Number: 1
ACF (first 5 lags): [1.         0.54952178 0.47313769 0.405049   0.37520383]
PACF (first 5 lags): [1.         0.55094911 0.24704684 0.11440236 0.09818781]
--------------------------------------------------

Cow: SE-ad0a39f5-2622, Lactation Number: 2
ACF (first 5 lags): [1.         0.24423069 0.26412251 0.21897035 0.23400942]
PACF (first 5 lags): [1.         0.24558753 0.22003116 0.13136519 0.13516406]
------------------------------

  xo = x - x.mean()
  x -= x.mean()
  xo = x - x.mean()
  x -= x.mean()
  xo = x - x.mean()
  x -= x.mean()



Cow: SE-afdd9a78-1144, Lactation Number: 3
ACF (first 5 lags): [ 1.         -0.13673154  0.30772295  0.17278306  0.1913904 ]
PACF (first 5 lags): [ 1.         -0.13740179  0.29749188  0.27532554  0.19795763]
--------------------------------------------------

Cow: SE-afdd9a78-1147, Lactation Number: 3
ACF (first 5 lags): [nan nan nan nan nan]
PACF (first 5 lags): [ 1. nan nan nan nan]
--------------------------------------------------

Cow: SE-afdd9a78-1152, Lactation Number: 3
ACF (first 5 lags): [ 1.         -0.50826035  0.13448318 -0.02632332  0.00446561]
PACF (first 5 lags): [ 1.         -0.51153945 -0.16990144 -0.04494188 -0.0109897 ]
--------------------------------------------------

Cow: SE-afdd9a78-1156, Lactation Number: 3
ACF (first 5 lags): [ 1.         -0.15105348  0.11519684  0.2240444  -0.13010063]
PACF (first 5 lags): [ 1.         -0.15206726  0.09584548  0.26784169 -0.07916056]
--------------------------------------------------

Cow: SE-afdd9a78-1163, Lactation Number

In [46]:
# Define the thresholds
mean_residual_threshold = 0.075
std_residual_threshold = 7.5
acf_threshold = 0.25
pacf_threshold = 0.25

# List to collect flagged combinations
flagged_combinations = []

for farm_name, farm_group in data_cleaned.groupby('FarmName_Pseudo'):
    for (se_number, lactation_number), cow_group in farm_group.groupby(['SE_Number', 'LactationNumber']):
        residuals = cow_group['Residuals'].dropna()
        
        if len(residuals) > 1:  # Ensure there are residuals to analyze
            acf_values = acf(residuals, nlags=30, fft=False)
            pacf_values = pacf(residuals, nlags=min(30, len(residuals)//2))

            # Check against thresholds
            if (abs(acf_values[1]) > acf_threshold or 
                abs(pacf_values[1]) > pacf_threshold):
                
                # Collect the combination if it exceeds any threshold
                flagged_combinations.append({
                    'Farm': farm_name,
                    'SE_Number': se_number,
                    'LactationNumber': lactation_number,
                    'ACF[1]': acf_values[1],
                    'PACF[1]': pacf_values[1]
                })

# Convert to a DataFrame for easier inspection
flagged_df = pd.DataFrame(flagged_combinations)
flagged_df

  xo = x - x.mean()
  x -= x.mean()
  xo = x - x.mean()
  x -= x.mean()
  xo = x - x.mean()
  x -= x.mean()
  xo = x - x.mean()
  x -= x.mean()
  xo = x - x.mean()
  x -= x.mean()
  xo = x - x.mean()
  x -= x.mean()
  xo = x - x.mean()
  x -= x.mean()
  xo = x - x.mean()
  x -= x.mean()


Unnamed: 0,Farm,SE_Number,LactationNumber,ACF[1],PACF[1]
0,5b581702,SE-5b581702-1742,3,-0.339509,-0.340615
1,5b581702,SE-5b581702-1755,4,-0.256556,-0.257517
2,5b581702,SE-5b581702-1816,3,-0.256527,-0.257829
3,5b581702,SE-5b581702-1829,3,0.348783,0.349774
4,5b581702,SE-5b581702-1855,4,-0.378438,-0.380008
...,...,...,...,...,...
1109,f454e660,SE-f454e660-634,2,0.347706,0.348708
1110,f454e660,SE-f454e660-685,2,-0.337085,-0.338142
1111,f454e660,SE-f454e660-688,1,-0.343149,-0.344271
1112,f454e660,SE-f454e660-731,1,0.255400,0.256186


In [47]:
# JOAKIM'S EDITS
# Define the Wilmink Lactation Curve function
def wilmink_lactation_curve(dim, a, b, c, d):
    dim = np.array(dim, dtype=float)
    return a + b * dim + c * np.exp(-d * dim)

# Function to directly refit the Wilmink Lactation Curve (Standard Process)
def refit_wilmink(cow_data):
    x_data = cow_data['DaysInMilk'].values
    y_data = cow_data['DailyYield'].values

    # Use initial guesses and bounds from the original fitting process
    initial_guesses = [np.mean(y_data), 0, np.mean(y_data) / 2, 0.1]
    bounds = ([-np.inf, -np.inf, -np.inf, 0], [np.inf, np.inf, np.inf, np.inf])

    popt, _ = curve_fit(wilmink_lactation_curve, x_data, y_data, p0=initial_guesses, bounds=bounds, maxfev=30000)
    
    # Calculate the expected yield with the refitted parameters
    cow_data['ExpectedYield'] = wilmink_lactation_curve(cow_data['DaysInMilk'], *popt)
    
    # Calculate new residuals
    cow_data['Residuals'] = cow_data['DailyYield'] - cow_data['ExpectedYield']
    
    return cow_data

# Function to add lagged variables for addressing autocorrelation
def add_lagged_variables(cow_data, max_lag=3):
    for lag in range(1, max_lag + 1):
        cow_data[f'lag_{lag}'] = cow_data['DailyYield'].shift(lag)
    return cow_data.dropna()

# Define the Robust Wilmink Lactation Curve function
def robust_wilmink_lactation_curve(dim, a, b, c, d, lag1, lag2, lag3):
    dim = np.array(dim, dtype=np.float64)
    days_in_milk = dim[0]
    lag_1 = dim[1]
    lag_2 = dim[2]
    lag_3 = dim[3]
    
    return a + b * days_in_milk + c * np.exp(-d * days_in_milk) + lag1 * lag_1 + lag2 * lag_2 + lag3 * lag_3

def fit_robust_wilmink(cow_data, lags=3):
    cow_data = add_lagged_variables(cow_data, max_lag=lags)

    # Extract individual columns from cow_data as separate arrays
    days_in_milk = cow_data['DaysInMilk'].values
    lag_1 = cow_data['lag_1'].values
    lag_2 = cow_data['lag_2'].values
    lag_3 = cow_data['lag_3'].values
    y_data = cow_data['DailyYield'].values

    # Ensure all arrays have the same shape
    assert len(days_in_milk) == len(lag_1) == len(lag_2) == len(lag_3) == len(y_data), "Mismatch in data lengths"

    # Prepare initial guesses and bounds
    initial_guesses = [np.mean(y_data), 0, np.mean(y_data) / 2, 0.1, 0, 0, 0]
    bounds = ([-np.inf, -np.inf, -np.inf, 0, -np.inf, -np.inf, -np.inf], 
              [np.inf, np.inf, np.inf, np.inf, np.inf, np.inf, np.inf])
    
    try:
        # Pass individual components of x_data to curve_fit
        popt, _ = curve_fit(
            lambda dim, a, b, c, d, lag1, lag2, lag3: robust_wilmink_lactation_curve(dim, a, b, c, d, lag1, lag2, lag3), 
            (days_in_milk, lag_1, lag_2, lag_3), 
            y_data, 
            p0=initial_guesses, 
            bounds=bounds, 
            maxfev=50000
        )

        cow_data.loc[:, 'ExpectedYield'] = robust_wilmink_lactation_curve(
            (days_in_milk, lag_1, lag_2, lag_3), *popt
        )
        cow_data.loc[:, 'Residuals'] = cow_data['DailyYield'] - cow_data['ExpectedYield']

    
    except RuntimeError as e:
        print(f"Curve fitting failed: {e}")
        cow_data['ExpectedYield'] = np.nan
        cow_data['Residuals'] = np.nan
    
    return cow_data



# Function to add lagged variables for addressing autocorrelation
def add_lagged_variables(cow_data, max_lag=3):
    for lag in range(1, max_lag + 1):
        cow_data[f'lag_{lag}'] = cow_data['DailyYield'].shift(lag)
    
    # Check for missing values and drop rows with NaNs in the lagged columns or DailyYield
    cow_data_cleaned = cow_data.dropna(subset=['DailyYield'] + [f'lag_{lag}' for lag in range(1, max_lag + 1)])
    
    # Ensure we're not dropping too much data, and there's still sufficient data for fitting
    if len(cow_data_cleaned) == 0:
        raise ValueError("Insufficient data after adding lagged variables. Check for missing data.")

    return cow_data_cleaned


# Apply lagged variables to all cases, all lactations
for se_number in data_cleaned['SE_Number'].unique():
    for lactation_number in data_cleaned[data_cleaned['SE_Number'] == se_number]['LactationNumber'].unique():
        
        cow_data = data_cleaned[(data_cleaned['SE_Number'] == se_number) & 
                                (data_cleaned['LactationNumber'] == lactation_number)].copy()
        
        # Apply lagged variables for all cases, regardless of autocorrelation
        cow_data = add_lagged_variables(cow_data, max_lag=3)
        cow_data_refitted = fit_robust_wilmink(cow_data, lags=3)
        
        data_cleaned.update(cow_data_refitted)

# Remove rows where ExpectedYield is NaN
data_cleaned = data_cleaned.dropna(subset=['ExpectedYield']).reset_index(drop=True)

# Normalize yields
data_cleaned['NormalizedDailyYield'] = data_cleaned['DailyYield'] / data_cleaned['ExpectedYield']
data_cleaned['NormalizedDailyYieldChange'] = data_cleaned['DailyYieldChange'] / data_cleaned['ExpectedYield']


data_cleaned

  data_cleaned.update(cow_data_refitted)


Unnamed: 0,SE_Number,FarmName_Pseudo,Date,DailyYield,PreviousDailyYield,DailyYieldChange,HW,MeanTemperature,MeanTHI_adj,DaysInMilk,...,Age,BreedName,LactationNumber,ExpectedYield,NormalizedDailyYield,NormalizedDailyYieldChange,HeatLoad,CumulativeHeatLoad,HeatStress,Residuals
0,SE-064c0cec-1189,a624fb9a,2022-05-29,18.96,,,0,9.766667,52.909813,5,...,3243,SLB,8,41.739904,0.454242,,-14.090187,0.0,0,-22.779904
1,SE-064c0cec-1189,a624fb9a,2022-05-30,22.64,18.96,3.68,0,9.833333,52.409547,6,...,3244,SLB,8,41.659523,0.543453,0.088335,-14.590453,0.0,0,-19.019523
2,SE-064c0cec-1189,a624fb9a,2022-05-31,26.49,22.64,3.85,0,11.200000,52.273070,7,...,3245,SLB,8,41.579142,0.637098,0.092595,-14.726930,0.0,0,-15.089142
3,SE-064c0cec-1189,a624fb9a,2022-06-01,33.61,26.49,7.12,0,12.433333,55.124493,8,...,3246,SLB,8,41.498761,0.809904,0.171571,-11.875507,0.0,0,-7.888761
4,SE-064c0cec-1189,a624fb9a,2022-06-02,35.02,33.61,1.41,0,11.766667,54.947733,9,...,3247,SLB,8,41.41838,0.845518,0.034043,-12.052267,0.0,0,-6.39838
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
548721,SE-f454e660-799,f454e660,2024-03-29,22.41,26.74,-4.33,0,6.600000,41.551190,308,...,,SLB,1,24.489656,0.91508,-0.176809,-25.448810,0.0,0,-2.079656
548722,SE-f454e660-799,f454e660,2024-03-30,22.00,22.41,-0.41,0,7.500000,49.420200,309,...,,SLB,1,28.323794,0.776732,-0.014475,-17.579800,0.0,0,-6.323794
548723,SE-f454e660-799,f454e660,2024-03-31,25.11,22.00,3.11,0,6.400000,42.486960,310,...,,SLB,1,27.015507,0.929466,0.115119,-24.513040,0.0,0,-1.905507
548724,SE-f454e660-799,f454e660,2024-04-01,20.84,25.11,-4.27,0,4.350000,39.095100,311,...,,SLB,1,25.808584,0.807483,-0.165449,-27.904900,0.0,0,-4.968584


In [None]:
"""OLD CODE
# Define the Wilmink Lactation Curve function
def wilmink_lactation_curve(dim, a, b, c, d):
    dim = np.array(dim, dtype=float)
    return a + b * dim + c * np.exp(-d * dim)

# Function to directly refit the Wilmink Lactation Curve (Standard Process)
def refit_wilmink(cow_data):
    x_data = cow_data['DaysInMilk'].values
    y_data = cow_data['DailyYield'].values

    # Use initial guesses and bounds from the original fitting process
    initial_guesses = [np.mean(y_data), 0, np.mean(y_data) / 2, 0.1]
    bounds = ([-np.inf, -np.inf, -np.inf, 0], [np.inf, np.inf, np.inf, np.inf])

    popt, _ = curve_fit(wilmink_lactation_curve, x_data, y_data, p0=initial_guesses, bounds=bounds, maxfev=30000)
    
    # Calculate the expected yield with the refitted parameters
    cow_data['ExpectedYield'] = wilmink_lactation_curve(cow_data['DaysInMilk'], *popt)
    
    # Calculate new residuals
    cow_data['Residuals'] = cow_data['DailyYield'] - cow_data['ExpectedYield']
    
    return cow_data

# Function to add lagged variables for addressing autocorrelation
def add_lagged_variables(cow_data, max_lag=3):
    for lag in range(1, max_lag + 1):
        cow_data[f'lag_{lag}'] = cow_data['DailyYield'].shift(lag)
    return cow_data.dropna()

# Define the Robust Wilmink Lactation Curve function
def robust_wilmink_lactation_curve(dim, a, b, c, d, lag1, lag2, lag3):
    dim = np.array(dim, dtype=np.float64)
    days_in_milk = dim[0]
    lag_1 = dim[1]
    lag_2 = dim[2]
    lag_3 = dim[3]
    
    return a + b * days_in_milk + c * np.exp(-d * days_in_milk) + lag1 * lag_1 + lag2 * lag_2 + lag3 * lag_3

# Function to fit the robust Wilmink model
def fit_robust_wilmink(cow_data, lags=3):
    cow_data = add_lagged_variables(cow_data, max_lag=lags)
    
    x_data = cow_data[['DaysInMilk', 'lag_1', 'lag_2', 'lag_3']].values.T
    y_data = cow_data['DailyYield'].values
    
    initial_guesses = [np.mean(y_data), 0, np.mean(y_data) / 2, 0.1, 0, 0, 0]
    bounds = ([-np.inf, -np.inf, -np.inf, 0, -np.inf, -np.inf, -np.inf], 
              [np.inf, np.inf, np.inf, np.inf, np.inf, np.inf, np.inf])
    
    try:
        popt, _ = curve_fit(robust_wilmink_lactation_curve, x_data, y_data, p0=initial_guesses, bounds=bounds, maxfev=50000)
        cow_data.loc[:, 'ExpectedYield'] = robust_wilmink_lactation_curve(x_data, *popt)
        cow_data.loc[:, 'Residuals'] = cow_data['DailyYield'] - cow_data['ExpectedYield']
    except RuntimeError as e:
        print(f"Curve fitting failed: {e}")
        cow_data.loc[:, 'ExpectedYield'] = np.nan
        cow_data.loc[:, 'Residuals'] = np.nan
    
    return cow_data

# Function to add lagged variables for addressing autocorrelation
def add_lagged_variables(cow_data, max_lag=3):
    for lag in range(1, max_lag + 1):
        cow_data[f'lag_{lag}'] = cow_data['DailyYield'].shift(lag)
    return cow_data.dropna()

# Example usage: Applying the robust model to flagged cases
for index, row in flagged_df.iterrows():
    se_number = row['SE_Number']
    lactation_number = row['LactationNumber']
    
    cow_data = data_cleaned[(data_cleaned['SE_Number'] == se_number) & 
                            (data_cleaned['LactationNumber'] == lactation_number)].copy()
    
    if abs(row['ACF[1]']) > 0.2:  # Significant autocorrelation
        cow_data = add_lagged_variables(cow_data, max_lag=3)
        cow_data_refitted = fit_robust_wilmink(cow_data, lags=3)
        data_cleaned.update(cow_data_refitted)
    else:
        cow_data_refitted = refit_wilmink(cow_data)
        data_cleaned.update(cow_data_refitted)

# Erase all rows where ExpectedYield is NaN
data_cleaned = data_cleaned.dropna(subset=['ExpectedYield']).reset_index(drop=True)

data_cleaned['NormalizedDailyYield'] = data_cleaned['DailyYield'] / data_cleaned['ExpectedYield']
data_cleaned['NormalizedDailyYieldChange'] = data_cleaned['DailyYieldChange'] / data_cleaned['ExpectedYield']

data_cleaned
"""

In [48]:
# Define the thresholds
mean_residual_threshold = 0.075
std_residual_threshold = 7.5
acf_threshold = 0.25
pacf_threshold = 0.25

# List to collect flagged combinations
flagged_combinations = []

for farm_name, farm_group in data_cleaned.groupby('FarmName_Pseudo'):
    for (se_number, lactation_number), cow_group in farm_group.groupby(['SE_Number', 'LactationNumber']):
        residuals = cow_group['Residuals'].dropna()
        
        if len(residuals) > 1:  # Ensure there are residuals to analyze
            acf_values = acf(residuals, nlags=30, fft=False)
            pacf_values = pacf(residuals, nlags=min(30, len(residuals)//2))

            # Check against thresholds
            if (abs(acf_values[1]) > acf_threshold or 
                abs(pacf_values[1]) > pacf_threshold):
                
                # Collect the combination if it exceeds any threshold
                flagged_combinations.append({
                    'Farm': farm_name,
                    'SE_Number': se_number,
                    'LactationNumber': lactation_number,
                    'ACF[1]': acf_values[1],
                    'PACF[1]': pacf_values[1]
                })

# Convert to a DataFrame for easier inspection
flagged_df = pd.DataFrame(flagged_combinations)
flagged_df

  xo = x - x.mean()
  x -= x.mean()
  xo = x - x.mean()
  x -= x.mean()
  xo = x - x.mean()
  x -= x.mean()
  xo = x - x.mean()
  x -= x.mean()
  xo = x - x.mean()
  x -= x.mean()
  xo = x - x.mean()
  x -= x.mean()
  xo = x - x.mean()
  x -= x.mean()
  xo = x - x.mean()
  x -= x.mean()


Unnamed: 0,Farm,SE_Number,LactationNumber,ACF[1],PACF[1]
0,5b581702,SE-5b581702-2104,1,0.300572,0.301914
1,5c06d92d,SE-5c06d92d-2333,7,0.298203,0.299316
2,5c06d92d,SE-5c06d92d-2333,8,0.260937,0.262403
3,5c06d92d,SE-5c06d92d-2621,6,0.317620,0.319511
4,5c06d92d,SE-5c06d92d-2638,5,0.259595,0.260379
...,...,...,...,...,...
282,ad0a39f5,SE-ad0a39f5-2756,1,0.311762,0.313040
283,ad0a39f5,SE-ad0a39f5-2778,1,0.270706,0.272387
284,ad0a39f5,SE-ad0a39f5-2785,1,0.309062,0.311136
285,ad0a39f5,SE-ad0a39f5-2801,1,0.318474,0.320404


In [49]:
def remove_outliers(data, threshold=3.5):
    # Calculate z-scores of residuals
    data = data.copy()  # Create a copy to avoid the SettingWithCopyWarning
    data['z_score'] = (data['Residuals'] - data['Residuals'].mean()) / data['Residuals'].std()
    
    # Identify the number of outliers
    num_outliers = (data['z_score'].abs() >= threshold).sum()
    print(f"Number of outliers detected: {num_outliers}")
    
    # Remove rows where the z-score of the residual is greater than the threshold
    cleaned_data = data.loc[(data['z_score'].abs() < threshold)].drop(columns=['z_score'])
    
    # Print the number of rows before and after
    print(f"Number of rows before outlier removal: {len(data)}")
    print(f"Number of rows after outlier removal: {len(cleaned_data)}")
    
    return cleaned_data

# Apply to flagged cases
for index, row in flagged_df.iterrows():
    se_number = row['SE_Number']
    lactation_number = row['LactationNumber']
    
    # Select the cow data for the specific SE_Number and LactationNumber
    cow_data = data_cleaned.loc[(data_cleaned['SE_Number'] == se_number) & 
                                (data_cleaned['LactationNumber'] == lactation_number)]
    
    # Remove outliers
    cow_data_trimmed = remove_outliers(cow_data, threshold=3.5)
    
    # Recalculate the residuals and update the dataset
    cow_data_trimmed['Residuals'] = cow_data_trimmed['DailyYield'] - cow_data_trimmed['ExpectedYield']
    
    # Remove the old data for this cow from data_cleaned
    data_cleaned = data_cleaned.loc[~((data_cleaned['SE_Number'] == se_number) & 
                                      (data_cleaned['LactationNumber'] == lactation_number))]
    
    # Append the cleaned data back to data_cleaned
    data_cleaned = pd.concat([data_cleaned, cow_data_trimmed], ignore_index=True)

Number of outliers detected: 5
Number of rows before outlier removal: 225
Number of rows after outlier removal: 220
Number of outliers detected: 5
Number of rows before outlier removal: 269
Number of rows after outlier removal: 264
Number of outliers detected: 2
Number of rows before outlier removal: 179
Number of rows after outlier removal: 177
Number of outliers detected: 3
Number of rows before outlier removal: 169
Number of rows after outlier removal: 166
Number of outliers detected: 7
Number of rows before outlier removal: 332
Number of rows after outlier removal: 325
Number of outliers detected: 4
Number of rows before outlier removal: 274
Number of rows after outlier removal: 270
Number of outliers detected: 4
Number of rows before outlier removal: 278
Number of rows after outlier removal: 274
Number of outliers detected: 6
Number of rows before outlier removal: 313
Number of rows after outlier removal: 307
Number of outliers detected: 6
Number of rows before outlier removal: 28

In [50]:
data_cleaned

Unnamed: 0,SE_Number,FarmName_Pseudo,Date,DailyYield,PreviousDailyYield,DailyYieldChange,HW,MeanTemperature,MeanTHI_adj,DaysInMilk,...,Age,BreedName,LactationNumber,ExpectedYield,NormalizedDailyYield,NormalizedDailyYieldChange,HeatLoad,CumulativeHeatLoad,HeatStress,Residuals
0,SE-064c0cec-1189,a624fb9a,2022-05-29,18.96,,,0,9.766667,52.909813,5,...,3243,SLB,8,41.739904,0.454242,,-14.090187,0.0,0,-22.779904
1,SE-064c0cec-1189,a624fb9a,2022-05-30,22.64,18.96,3.68,0,9.833333,52.409547,6,...,3244,SLB,8,41.659523,0.543453,0.088335,-14.590453,0.0,0,-19.019523
2,SE-064c0cec-1189,a624fb9a,2022-05-31,26.49,22.64,3.85,0,11.200000,52.273070,7,...,3245,SLB,8,41.579142,0.637098,0.092595,-14.726930,0.0,0,-15.089142
3,SE-064c0cec-1189,a624fb9a,2022-06-01,33.61,26.49,7.12,0,12.433333,55.124493,8,...,3246,SLB,8,41.498761,0.809904,0.171571,-11.875507,0.0,0,-7.888761
4,SE-064c0cec-1189,a624fb9a,2022-06-02,35.02,33.61,1.41,0,11.766667,54.947733,9,...,3247,SLB,8,41.41838,0.845518,0.034043,-12.052267,0.0,0,-6.39838
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
547155,SE-f454e660-268,f454e660,2023-03-10,13.77,17.23,-3.46,0,-1.750000,31.399280,263,...,,NRDC,5,15.653922,0.879652,-0.221031,-35.600720,0.0,0,-1.883922
547156,SE-f454e660-268,f454e660,2023-03-11,16.01,13.77,2.24,0,-0.933333,31.383507,264,...,,NRDC,5,16.721973,0.957423,0.133955,-35.616493,0.0,0,-0.711973
547157,SE-f454e660-268,f454e660,2023-03-12,18.19,16.01,2.18,0,-0.600000,30.213073,265,...,,NRDC,5,16.071582,1.131811,0.135643,-36.786927,0.0,0,2.118418
547158,SE-f454e660-268,f454e660,2023-03-13,12.81,18.19,-5.38,0,1.100000,28.094420,266,...,,NRDC,5,14.4909,0.884003,-0.371267,-38.905580,0.0,0,-1.6809


In [51]:
# Define the thresholds
mean_residual_threshold = 0.075
std_residual_threshold = 7.5
acf_threshold = 0.25
pacf_threshold = 0.25

# List to collect flagged combinations
flagged_combinations = []

for farm_name, farm_group in data_cleaned.groupby('FarmName_Pseudo'):
    for (se_number, lactation_number), cow_group in farm_group.groupby(['SE_Number', 'LactationNumber']):
        residuals = cow_group['Residuals'].dropna()
        
        if len(residuals) > 1:  # Ensure there are residuals to analyze
            acf_values = acf(residuals, nlags=30, fft=False)
            pacf_values = pacf(residuals, nlags=min(30, len(residuals)//2))

            # Check against thresholds
            if (abs(acf_values[1]) > acf_threshold or 
                abs(pacf_values[1]) > pacf_threshold):
                
                # Collect the combination if it exceeds any threshold
                flagged_combinations.append({
                    'Farm': farm_name,
                    'SE_Number': se_number,
                    'LactationNumber': lactation_number,
                    'ACF[1]': acf_values[1],
                    'PACF[1]': pacf_values[1]
                })

# Convert to a DataFrame for easier inspection
flagged_df = pd.DataFrame(flagged_combinations)
flagged_df

  xo = x - x.mean()
  x -= x.mean()
  xo = x - x.mean()
  x -= x.mean()
  xo = x - x.mean()
  x -= x.mean()
  xo = x - x.mean()
  x -= x.mean()
  xo = x - x.mean()
  x -= x.mean()
  xo = x - x.mean()
  x -= x.mean()
  xo = x - x.mean()
  x -= x.mean()
  xo = x - x.mean()
  x -= x.mean()


Unnamed: 0,Farm,SE_Number,LactationNumber,ACF[1],PACF[1]
0,5c06d92d,SE-5c06d92d-2333,8,0.297954,0.299647
1,5c06d92d,SE-5c06d92d-2797,5,0.331905,0.334132
2,5c06d92d,SE-5c06d92d-3171,3,0.257152,0.258589
3,5c06d92d,SE-5c06d92d-3186,3,0.281275,0.283138
4,5c06d92d,SE-5c06d92d-3573,1,0.289456,0.290861
5,5c06d92d,SE-5c06d92d-3575,1,0.398158,0.400358
6,5c06d92d,SE-5c06d92d-3584,1,0.293768,0.295517
7,752efd72,SE-752efd72-0239,2,0.267739,0.268836
8,ad0a39f5,SE-a756bc39-1208,2,0.28223,0.283983
9,ad0a39f5,SE-a756bc39-1255,1,0.274536,0.276403


In [52]:
# Reorder columns
new_order = [
    "Date", "FarmName_Pseudo", "SE_Number", "Age", "BreedName", "LactationNumber", "DaysInMilk",'YearSeason', "DailyYield", "PreviousDailyYield", 
    "DailyYieldChange", "ExpectedYield", "NormalizedDailyYield", 
    "NormalizedDailyYieldChange", "Residuals", "HeatStress", "Temp15Threshold", "HW", 
    "cum_HW", "MeanTemperature", "MeanTHI_adj", "HeatLoad", "CumulativeHeatLoad"
]
data_cleaned = data_cleaned[new_order]
data_cleaned

Unnamed: 0,Date,FarmName_Pseudo,SE_Number,Age,BreedName,LactationNumber,DaysInMilk,YearSeason,DailyYield,PreviousDailyYield,...,NormalizedDailyYieldChange,Residuals,HeatStress,Temp15Threshold,HW,cum_HW,MeanTemperature,MeanTHI_adj,HeatLoad,CumulativeHeatLoad
0,2022-05-29,a624fb9a,SE-064c0cec-1189,3243,SLB,8,5,202202,18.96,,...,,-22.779904,0,0,0,0,9.766667,52.909813,-14.090187,0.0
1,2022-05-30,a624fb9a,SE-064c0cec-1189,3244,SLB,8,6,202202,22.64,18.96,...,0.088335,-19.019523,0,1,0,0,9.833333,52.409547,-14.590453,0.0
2,2022-05-31,a624fb9a,SE-064c0cec-1189,3245,SLB,8,7,202202,26.49,22.64,...,0.092595,-15.089142,0,0,0,0,11.200000,52.273070,-14.726930,0.0
3,2022-06-01,a624fb9a,SE-064c0cec-1189,3246,SLB,8,8,202203,33.61,26.49,...,0.171571,-7.888761,0,1,0,0,12.433333,55.124493,-11.875507,0.0
4,2022-06-02,a624fb9a,SE-064c0cec-1189,3247,SLB,8,9,202203,35.02,33.61,...,0.034043,-6.39838,0,1,0,0,11.766667,54.947733,-12.052267,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
547155,2023-03-10,f454e660,SE-f454e660-268,,NRDC,5,263,202302,13.77,17.23,...,-0.221031,-1.883922,0,0,0,0,-1.750000,31.399280,-35.600720,0.0
547156,2023-03-11,f454e660,SE-f454e660-268,,NRDC,5,264,202302,16.01,13.77,...,0.133955,-0.711973,0,0,0,0,-0.933333,31.383507,-35.616493,0.0
547157,2023-03-12,f454e660,SE-f454e660-268,,NRDC,5,265,202302,18.19,16.01,...,0.135643,2.118418,0,0,0,0,-0.600000,30.213073,-36.786927,0.0
547158,2023-03-13,f454e660,SE-f454e660-268,,NRDC,5,266,202302,12.81,18.19,...,-0.371267,-1.6809,0,0,0,0,1.100000,28.094420,-38.905580,0.0


In [53]:
# Check if NormalizedDailyYield is centered around 1 for each unique farm
print("Mean of NormalizedDailyYield:", data_cleaned.groupby('FarmName_Pseudo')['NormalizedDailyYield'].mean())
print("Standard Deviation of NormalizedDailyYield:", data_cleaned.groupby('FarmName_Pseudo')['NormalizedDailyYield'].std())

Mean of NormalizedDailyYield: FarmName_Pseudo
5b581702    0.993346
5c06d92d    0.995186
5f7f33d6    0.993357
752efd72     0.99608
a624fb9a    0.992725
ab18b151    0.991595
ad0a39f5    0.997088
afdd9a78    0.991919
f454e660    0.992839
Name: NormalizedDailyYield, dtype: Float64
Standard Deviation of NormalizedDailyYield: FarmName_Pseudo
5b581702     0.17715
5c06d92d    0.117312
5f7f33d6      0.1757
752efd72    0.107645
a624fb9a    0.167264
ab18b151    0.143964
ad0a39f5    0.086675
afdd9a78    0.191227
f454e660    0.215719
Name: NormalizedDailyYield, dtype: Float64


In [54]:
# Count the number of HeatStress occurrences in each farm
heat_stress_counts = data_cleaned.groupby('FarmName_Pseudo')['HeatStress'].sum()
heat_stress_counts

FarmName_Pseudo
5b581702     806
5c06d92d    6854
5f7f33d6    2591
752efd72    2809
a624fb9a    2214
ab18b151     120
ad0a39f5    2841
afdd9a78     374
f454e660    2063
Name: HeatStress, dtype: int64

In [55]:
# Count number of observations within each farm
no_obs = data_cleaned.groupby('FarmName_Pseudo').size().reset_index(name='count')
no_obs

Unnamed: 0,FarmName_Pseudo,count
0,5b581702,24085
1,5c06d92d,156156
2,5f7f33d6,111728
3,752efd72,84098
4,a624fb9a,56372
5,ab18b151,2682
6,ad0a39f5,56253
7,afdd9a78,9033
8,f454e660,46753


In [56]:
# Save the reordered DataFrame to a CSV file - 61 degrees threshold
# data_cleaned.to_csv('../Data/MergedData/HeatApproachCleanedYieldDataTestQuantile61.csv', index=False)
# print(data_cleaned.shape)

# 67 degrees threshold
data_cleaned.to_csv('../Data/MergedData/HeatApproachCleanedYieldDataTestQuantile67.csv', index=False)
print(data_cleaned.shape)

(547160, 23)


Desk stat for article

In [None]:
print(f"No. of milk days in file after Quantile regression program: {data_cleaned.shape}")

data_cleaned2 = data_cleaned.drop_duplicates(subset=["SE_Number", "LactationNumber"])
print(f"No. of lactations in file after Quantile regression program: {data_cleaned2.shape}")

data_cleaned2 = data_cleaned.drop_duplicates(subset=["SE_Number"])
print(f"No. of lactations in file after Quantile regression program: {data_cleaned2.shape}")

No. of milk days in file after Quantile regression program: (547160, 23)
No. of lactations in file after Quantile regression program: (1971, 23)
No. of lactations in file after Quantile regression program: (1480, 23)
