In [18]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from Bayesian import BayesianLinearRegression
from sklearn.model_selection import train_test_split

sns.set_theme()
sns.set_context("notebook")
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [19]:
dtype_dict = {
    'Date': 'str',
    'FarmName_Pseudo': 'str',
    'SE_Number': 'str',
    'Age': 'Int64',
    'BreedName': 'str',
    'DailyYield': 'float',
    'PreviousDailyYield': 'float',
    'DailyYieldChange': 'float',
    'DaysInMilk': 'Int64',
    'YearSeason': 'str',
    'LactationNumber': 'Int64',
    'ExpectedYield': 'float',
    'NormalizedDailyYield': 'float',
    'NormalizedDailyYieldChange': 'float',
    'HeatStress': 'Int64',
    'Temp15Threshold': 'Int64',
    'HW': 'Int64',
    'cum_HW': 'Int64',
    'MeanTemperature': 'float',
    'MeanTHI_adj': 'float',
    'HeatLoad': 'float',
    'CumulativeHeatLoad': 'float',
}

milk_data = pd.read_csv('../Data/MergedData/HeatApproachYieldDataTest.csv', dtype=dtype_dict)
milk_data['Date'] = pd.to_datetime(milk_data['Date'], format='%Y-%m-%d')
milk_data.head(-5)

Unnamed: 0,Date,FarmName_Pseudo,SE_Number,Age,BreedName,LactationNumber,DaysInMilk,YearSeason,DailyYield,PreviousDailyYield,...,NormalizedDailyYieldChange,Residuals,HeatStress,Temp15Threshold,HW,cum_HW,MeanTemperature,MeanTHI_adj,HeatLoad,CumulativeHeatLoad
0,2022-05-28,a624fb9a,SE-064c0cec-1189,3242,02 SLB,8,3,2022-2,15.22,0.0000,...,0.000000,1.820438,0,0,0,0,9.912500,50.478673,-10.521327,0.0
1,2022-05-29,a624fb9a,SE-064c0cec-1189,3243,02 SLB,8,4,2022-2,18.96,15.2200,...,0.107655,1.589745,0,0,0,0,10.066667,53.841648,-7.158352,0.0
2,2022-05-30,a624fb9a,SE-064c0cec-1189,3244,02 SLB,8,5,2022-2,22.64,17.0900,...,0.089176,1.894598,0,1,0,0,10.466667,52.935959,-8.064041,0.0
3,2022-05-31,a624fb9a,SE-064c0cec-1189,3245,02 SLB,8,6,2022-2,26.49,18.9400,...,0.079936,2.877443,0,0,0,0,11.183333,52.872112,-8.127888,0.0
4,2022-06-01,a624fb9a,SE-064c0cec-1189,3246,02 SLB,8,7,2022-3,33.61,20.8275,...,0.098152,7.563598,0,1,0,0,12.704167,56.056547,-4.943453,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
466533,2022-03-19,f454e660,SE-fcdf259d-0044-0,3709,41 Fjällko,9,274,2022-2,20.41,12.4860,...,0.020560,4.456951,0,0,0,0,3.866667,43.381437,-17.618563,0.0
466534,2022-03-20,f454e660,SE-fcdf259d-0044-0,3710,41 Fjällko,9,275,2022-2,12.17,12.8140,...,0.051173,0.523267,0,0,0,0,2.450000,40.920659,-20.079341,0.0
466535,2022-03-21,f454e660,SE-fcdf259d-0044-0,3711,41 Fjällko,9,276,2022-2,13.59,13.4100,...,0.051860,1.133275,0,0,0,0,3.170833,42.103862,-18.896137,0.0
466536,2022-03-22,f454e660,SE-fcdf259d-0044-0,3712,41 Fjällko,9,277,2022-2,13.20,14.0560,...,-0.062114,-1.031806,0,0,0,0,4.691667,43.334997,-17.665003,0.0


In [20]:
from sklearn.preprocessing import StandardScaler

# Initialize an empty DataFrame to store results
results_df = pd.DataFrame(columns=['FarmName_Pseudo', 'FarmCumulativeHeatLoadMilkProduction'])

# Set subject type to 'farm'
subject_type = 'farm'

# Specify the farm ID for analysis
farm_id = 'a624fb9a'

# Filter data for the specific farm
farm_data = milk_data[milk_data['FarmName_Pseudo'] == farm_id]

# Check if NormalizedDailyYield is centered around 1
normalized_mean = farm_data['NormalizedDailyYield'].mean()
normalized_variance = farm_data['NormalizedDailyYield'].var()
print("Mean of NormalizedDailyYield:", normalized_mean)
print("Standard Deviation of NormalizedDailyYield:", farm_data['NormalizedDailyYield'].std())
print("Variance of NormalizedDailyYield:", normalized_variance)

# Define the target variable
target = 'NormalizedDailyYield'

# Split the data into train and validation sets
train_data, val_data = train_test_split(farm_data, test_size=0.3, random_state=42)

# Scale the CumulativeHeatLoad feature
scaler = StandardScaler()
train_data['CumulativeHeatLoad'] = scaler.fit_transform(train_data[['CumulativeHeatLoad']])
val_data['CumulativeHeatLoad'] = scaler.transform(val_data[['CumulativeHeatLoad']])

# Define the single feature
features = ['CumulativeHeatLoad']

# Calculate the correlation between CumulativeHeatLoad and NormalizedDailyYield
cumulative_heatload_correlation = train_data['CumulativeHeatLoad'].corr(train_data[target])
print("Correlation between CumulativeHeatLoad and NormalizedDailyYield:", cumulative_heatload_correlation)

# Function to fit and get the posterior mean for CumulativeHeatLoad
def fit_and_get_cumulative_heatload_posterior(farm_id, train_data, features, cumulative_heatload_correlation):
    print(f"\nSelected features: {features}")

    # Calculate prior means for the features
    prior_mean_values = [cumulative_heatload_correlation]  # Use correlation as prior mean for CumulativeHeatLoad
    prior_std_values = [train_data[feature].std() for feature in features]
    
    # Define priors
    prior_mean = np.array([normalized_mean] + prior_mean_values).reshape(-1, 1)
    prior_std = np.array([np.sqrt(normalized_variance)] + prior_std_values).reshape(-1, 1)
    prior_cov = np.eye(len(features) + 1) * 0.01  # Smaller value for stronger prior confidence
    beta = 1 / normalized_variance

    # Print prior means and standard deviations
    print(f"\nPrior Mean and Standard Deviation for each feature:\n")
    for feature, mean, std in zip(['Off-set'] + features, prior_mean.flatten(), prior_std.flatten()):
        print(f"{feature}: mean = {mean}, std_dev = {std}")

    # Initialize and fit the model
    model = BayesianLinearRegression(
        dataframe=train_data,
        subject_name=farm_id,
        selected_features=features,
        target=target,
        subject_type=subject_type,
        prior_mean=prior_mean,
        prior_cov=prior_cov,
        beta=beta
    )
    result = model.fit_model()

    # Print posterior mean and standard deviation for each feature
    print(f"\nPosterior Mean and Standard Deviation for each feature:\n")
    for feature, stats in result.items():
        print(f"{feature}: mean = {stats['mu']}, std_dev = {stats['sigma']}")

    # Extract the posterior mean for CumulativeHeatLoad
    cumulative_heatload_posterior_mean = result['CumulativeHeatLoad']['mu']

    return cumulative_heatload_posterior_mean

# Train the model with the CumulativeHeatLoad feature on the training set and print results
cumulative_heatload_posterior_mean = fit_and_get_cumulative_heatload_posterior(farm_id, train_data, features, cumulative_heatload_correlation)

# Create a new DataFrame for the current farm's result
new_result = pd.DataFrame([{
    'FarmName_Pseudo': farm_id,
    'FarmCumulativeHeatLoadMilkProduction': cumulative_heatload_posterior_mean
}])

# Check if results_df is empty before concatenation
if results_df.empty:
    results_df = new_result
else:
    results_df = pd.concat([results_df, new_result], ignore_index=True)

results_df

Mean of NormalizedDailyYield: 1.0001518293047704
Standard Deviation of NormalizedDailyYield: 0.1730768975053524
Variance of NormalizedDailyYield: 0.02995561245007826
Correlation between CumulativeHeatLoad and NormalizedDailyYield: -0.12259351110412624

Selected features: ['CumulativeHeatLoad']

Prior Mean and Standard Deviation for each feature:

Off-set: mean = 1.0001518293047704, std_dev = 0.1730768975053524
CumulativeHeatLoad: mean = -0.12259351110412624, std_dev = 1.0000123246054589

Posterior Mean and Standard Deviation for each feature:

Off-set: mean = 1.000042055722087, std_dev = 0.0008651448044649453
CumulativeHeatLoad: mean = -0.021362750390445277, std_dev = 0.0008651448044649504


Unnamed: 0,FarmName_Pseudo,FarmCumulativeHeatLoadMilkProduction
0,a624fb9a,-0.021363


In [21]:
# Initialize an empty DataFrame to store results
results_df = pd.DataFrame(columns=['FarmName_Pseudo', 'FarmCumulativeHeatLoadMilkProduction'])

# Set subject type to 'farm'
subject_type = 'farm'

# Specify the farm ID for analysis
farm_id = '5c06d92d'

# Filter data for the specific farm
farm_data = milk_data[milk_data['FarmName_Pseudo'] == farm_id]

# Check if NormalizedDailyYield is centered around 1
normalized_mean = farm_data['NormalizedDailyYield'].mean()
normalized_variance = farm_data['NormalizedDailyYield'].var()
print("Mean of NormalizedDailyYield:", normalized_mean)
print("Standard Deviation of NormalizedDailyYield:", farm_data['NormalizedDailyYield'].std())
print("Variance of NormalizedDailyYield:", normalized_variance)

# Define the target variable
target = 'NormalizedDailyYield'

# Split the data into train and validation sets
train_data, val_data = train_test_split(farm_data, test_size=0.3, random_state=42)

# Scale the CumulativeHeatLoad feature
scaler = StandardScaler()
train_data['CumulativeHeatLoad'] = scaler.fit_transform(train_data[['CumulativeHeatLoad']])
val_data['CumulativeHeatLoad'] = scaler.transform(val_data[['CumulativeHeatLoad']])

# Define the single feature
features = ['CumulativeHeatLoad']

# Calculate the correlation between CumulativeHeatLoad and NormalizedDailyYield
cumulative_heatload_correlation = train_data['CumulativeHeatLoad'].corr(train_data[target])
print("Correlation between CumulativeHeatLoad and NormalizedDailyYield:", cumulative_heatload_correlation)

# Function to fit and get the posterior mean for CumulativeHeatLoad
def fit_and_get_cumulative_heatload_posterior(farm_id, train_data, features, cumulative_heatload_correlation):
    print(f"\nSelected features: {features}")

    # Calculate prior means for the features
    prior_mean_values = [cumulative_heatload_correlation]  # Use correlation as prior mean for CumulativeHeatLoad
    prior_std_values = [train_data[feature].std() for feature in features]
    
    # Define priors
    prior_mean = np.array([normalized_mean] + prior_mean_values).reshape(-1, 1)
    prior_std = np.array([np.sqrt(normalized_variance)] + prior_std_values).reshape(-1, 1)
    prior_cov = np.eye(len(features) + 1) * 0.01  # Smaller value for stronger prior confidence
    beta = 1 / normalized_variance

    # Print prior means and standard deviations
    print(f"\nPrior Mean and Standard Deviation for each feature:\n")
    for feature, mean, std in zip(['Off-set'] + features, prior_mean.flatten(), prior_std.flatten()):
        print(f"{feature}: mean = {mean}, std_dev = {std}")

    # Initialize and fit the model
    model = BayesianLinearRegression(
        dataframe=train_data,
        subject_name=farm_id,
        selected_features=features,
        target=target,
        subject_type=subject_type,
        prior_mean=prior_mean,
        prior_cov=prior_cov,
        beta=beta
    )
    result = model.fit_model()

    # Print posterior mean and standard deviation for each feature
    print(f"\nPosterior Mean and Standard Deviation for each feature:\n")
    for feature, stats in result.items():
        print(f"{feature}: mean = {stats['mu']}, std_dev = {stats['sigma']}")

    # Extract the posterior mean for CumulativeHeatLoad
    cumulative_heatload_posterior_mean = result['CumulativeHeatLoad']['mu']

    return cumulative_heatload_posterior_mean

# Train the model with the CumulativeHeatLoad feature on the training set and print results
cumulative_heatload_posterior_mean = fit_and_get_cumulative_heatload_posterior(farm_id, train_data, features, cumulative_heatload_correlation)

# Create a new DataFrame for the current farm's result
new_result = pd.DataFrame([{
    'FarmName_Pseudo': farm_id,
    'FarmCumulativeHeatLoadMilkProduction': cumulative_heatload_posterior_mean
}])

# Check if results_df is empty before concatenation
if results_df.empty:
    results_df = new_result
else:
    results_df = pd.concat([results_df, new_result], ignore_index=True)

results_df

Mean of NormalizedDailyYield: 1.000901513816025
Standard Deviation of NormalizedDailyYield: 0.13308120943480695
Variance of NormalizedDailyYield: 0.01771060830463095
Correlation between CumulativeHeatLoad and NormalizedDailyYield: -0.024140673374584302

Selected features: ['CumulativeHeatLoad']

Prior Mean and Standard Deviation for each feature:

Off-set: mean = 1.000901513816025, std_dev = 0.13308120943480695
CumulativeHeatLoad: mean = -0.024140673374584302, std_dev = 1.0000038098358535

Posterior Mean and Standard Deviation for each feature:

Off-set: mean = 1.0007787162849593, std_dev = 0.0003690052566915841
CumulativeHeatLoad: mean = -0.0032271172752825882, std_dev = 0.000369005256691586


Unnamed: 0,FarmName_Pseudo,FarmCumulativeHeatLoadMilkProduction
0,5c06d92d,-0.003227


In [22]:
# Initialize an empty DataFrame to store results
results_df = pd.DataFrame(columns=['FarmName_Pseudo', 'FarmCumulativeHeatLoadMilkProduction'])

# Set subject type to 'farm'
subject_type = 'farm'

# Specify the farm ID for analysis
farm_id = '752efd72'

# Filter data for the specific farm
farm_data = milk_data[milk_data['FarmName_Pseudo'] == farm_id]

# Check if NormalizedDailyYield is centered around 1
normalized_mean = farm_data['NormalizedDailyYield'].mean()
normalized_variance = farm_data['NormalizedDailyYield'].var()
print("Mean of NormalizedDailyYield:", normalized_mean)
print("Standard Deviation of NormalizedDailyYield:", farm_data['NormalizedDailyYield'].std())
print("Variance of NormalizedDailyYield:", normalized_variance)

# Define the target variable
target = 'NormalizedDailyYield'

# Split the data into train and validation sets
train_data, val_data = train_test_split(farm_data, test_size=0.3, random_state=42)

# Scale the CumulativeHeatLoad feature
scaler = StandardScaler()
train_data['CumulativeHeatLoad'] = scaler.fit_transform(train_data[['CumulativeHeatLoad']])
val_data['CumulativeHeatLoad'] = scaler.transform(val_data[['CumulativeHeatLoad']])

# Define the single feature
features = ['CumulativeHeatLoad']

# Calculate the correlation between CumulativeHeatLoad and NormalizedDailyYield
cumulative_heatload_correlation = train_data['CumulativeHeatLoad'].corr(train_data[target])
print("Correlation between CumulativeHeatLoad and NormalizedDailyYield:", cumulative_heatload_correlation)

# Function to fit and get the posterior mean for CumulativeHeatLoad
def fit_and_get_cumulative_heatload_posterior(farm_id, train_data, features, cumulative_heatload_correlation):
    print(f"\nSelected features: {features}")

    # Calculate prior means for the features
    prior_mean_values = [cumulative_heatload_correlation]  # Use correlation as prior mean for CumulativeHeatLoad
    prior_std_values = [train_data[feature].std() for feature in features]
    
    # Define priors
    prior_mean = np.array([normalized_mean] + prior_mean_values).reshape(-1, 1)
    prior_std = np.array([np.sqrt(normalized_variance)] + prior_std_values).reshape(-1, 1)
    prior_cov = np.eye(len(features) + 1) * 0.01  # Smaller value for stronger prior confidence
    beta = 1 / normalized_variance

    # Print prior means and standard deviations
    print(f"\nPrior Mean and Standard Deviation for each feature:\n")
    for feature, mean, std in zip(['Off-set'] + features, prior_mean.flatten(), prior_std.flatten()):
        print(f"{feature}: mean = {mean}, std_dev = {std}")

    # Initialize and fit the model
    model = BayesianLinearRegression(
        dataframe=train_data,
        subject_name=farm_id,
        selected_features=features,
        target=target,
        subject_type=subject_type,
        prior_mean=prior_mean,
        prior_cov=prior_cov,
        beta=beta
    )
    result = model.fit_model()

    # Print posterior mean and standard deviation for each feature
    print(f"\nPosterior Mean and Standard Deviation for each feature:\n")
    for feature, stats in result.items():
        print(f"{feature}: mean = {stats['mu']}, std_dev = {stats['sigma']}")

    # Extract the posterior mean for CumulativeHeatLoad
    cumulative_heatload_posterior_mean = result['CumulativeHeatLoad']['mu']

    return cumulative_heatload_posterior_mean

# Train the model with the CumulativeHeatLoad feature on the training set and print results
cumulative_heatload_posterior_mean = fit_and_get_cumulative_heatload_posterior(farm_id, train_data, features, cumulative_heatload_correlation)

# Create a new DataFrame for the current farm's result
new_result = pd.DataFrame([{
    'FarmName_Pseudo': farm_id,
    'FarmCumulativeHeatLoadMilkProduction': cumulative_heatload_posterior_mean
}])

# Check if results_df is empty before concatenation
if results_df.empty:
    results_df = new_result
else:
    results_df = pd.concat([results_df, new_result], ignore_index=True)

results_df

Mean of NormalizedDailyYield: 1.001943739804758
Standard Deviation of NormalizedDailyYield: 0.11309283247537466
Variance of NormalizedDailyYield: 0.012789988757303158
Correlation between CumulativeHeatLoad and NormalizedDailyYield: -0.056771652695918655

Selected features: ['CumulativeHeatLoad']

Prior Mean and Standard Deviation for each feature:

Off-set: mean = 1.001943739804758, std_dev = 0.11309283247537466
CumulativeHeatLoad: mean = -0.056771652695918655, std_dev = 1.0000050018882127

Posterior Mean and Standard Deviation for each feature:

Off-set: mean = 1.0020886444324628, std_dev = 0.0003660068248299986
CumulativeHeatLoad: mean = -0.006569617290143355, std_dev = 0.00036600682482999846


Unnamed: 0,FarmName_Pseudo,FarmCumulativeHeatLoadMilkProduction
0,752efd72,-0.00657


In [23]:
# Initialize an empty DataFrame to store results
results_df = pd.DataFrame(columns=['FarmName_Pseudo', 'FarmCumulativeHeatLoadMilkProduction'])

# Set subject type to 'farm'
subject_type = 'farm'

# Specify the farm ID for analysis
farm_id = 'f454e660'

# Filter data for the specific farm
farm_data = milk_data[milk_data['FarmName_Pseudo'] == farm_id]

# Check if NormalizedDailyYield is centered around 1
normalized_mean = farm_data['NormalizedDailyYield'].mean()
normalized_variance = farm_data['NormalizedDailyYield'].var()
print("Mean of NormalizedDailyYield:", normalized_mean)
print("Standard Deviation of NormalizedDailyYield:", farm_data['NormalizedDailyYield'].std())
print("Variance of NormalizedDailyYield:", normalized_variance)

# Define the target variable
target = 'NormalizedDailyYield'

# Split the data into train and validation sets
train_data, val_data = train_test_split(farm_data, test_size=0.3, random_state=42)

# Scale the CumulativeHeatLoad feature
scaler = StandardScaler()
train_data['CumulativeHeatLoad'] = scaler.fit_transform(train_data[['CumulativeHeatLoad']])
val_data['CumulativeHeatLoad'] = scaler.transform(val_data[['CumulativeHeatLoad']])

# Define the single feature
features = ['CumulativeHeatLoad']

# Calculate the correlation between CumulativeHeatLoad and NormalizedDailyYield
cumulative_heatload_correlation = train_data['CumulativeHeatLoad'].corr(train_data[target])
print("Correlation between CumulativeHeatLoad and NormalizedDailyYield:", cumulative_heatload_correlation)

# Function to fit and get the posterior mean for CumulativeHeatLoad
def fit_and_get_cumulative_heatload_posterior(farm_id, train_data, features, cumulative_heatload_correlation):
    print(f"\nSelected features: {features}")

    # Calculate prior means for the features
    prior_mean_values = [cumulative_heatload_correlation]  # Use correlation as prior mean for CumulativeHeatLoad
    prior_std_values = [train_data[feature].std() for feature in features]
    
    # Define priors
    prior_mean = np.array([normalized_mean] + prior_mean_values).reshape(-1, 1)
    prior_std = np.array([np.sqrt(normalized_variance)] + prior_std_values).reshape(-1, 1)
    prior_cov = np.eye(len(features) + 1) * 0.01  # Smaller value for stronger prior confidence
    beta = 1 / normalized_variance

    # Print prior means and standard deviations
    print(f"\nPrior Mean and Standard Deviation for each feature:\n")
    for feature, mean, std in zip(['Off-set'] + features, prior_mean.flatten(), prior_std.flatten()):
        print(f"{feature}: mean = {mean}, std_dev = {std}")

    # Initialize and fit the model
    model = BayesianLinearRegression(
        dataframe=train_data,
        subject_name=farm_id,
        selected_features=features,
        target=target,
        subject_type=subject_type,
        prior_mean=prior_mean,
        prior_cov=prior_cov,
        beta=beta
    )
    result = model.fit_model()

    # Print posterior mean and standard deviation for each feature
    print(f"\nPosterior Mean and Standard Deviation for each feature:\n")
    for feature, stats in result.items():
        print(f"{feature}: mean = {stats['mu']}, std_dev = {stats['sigma']}")

    # Extract the posterior mean for CumulativeHeatLoad
    cumulative_heatload_posterior_mean = result['CumulativeHeatLoad']['mu']

    return cumulative_heatload_posterior_mean

# Train the model with the CumulativeHeatLoad feature on the training set and print results
cumulative_heatload_posterior_mean = fit_and_get_cumulative_heatload_posterior(farm_id, train_data, features, cumulative_heatload_correlation)

# Create a new DataFrame for the current farm's result
new_result = pd.DataFrame([{
    'FarmName_Pseudo': farm_id,
    'FarmCumulativeHeatLoadMilkProduction': cumulative_heatload_posterior_mean
}])

# Check if results_df is empty before concatenation
if results_df.empty:
    results_df = new_result
else:
    results_df = pd.concat([results_df, new_result], ignore_index=True)

results_df

Mean of NormalizedDailyYield: 1.0003233619647958
Standard Deviation of NormalizedDailyYield: 0.23075332989194564
Variance of NormalizedDailyYield: 0.05324709925622109
Correlation between CumulativeHeatLoad and NormalizedDailyYield: 0.004239590600669385

Selected features: ['CumulativeHeatLoad']

Prior Mean and Standard Deviation for each feature:

Off-set: mean = 1.0003233619647958, std_dev = 0.23075332989194564
CumulativeHeatLoad: mean = 0.004239590600669385, std_dev = 1.0000091233800301

Posterior Mean and Standard Deviation for each feature:

Off-set: mean = 0.9999345615474685, std_dev = 0.0009799657384188388
CumulativeHeatLoad: mean = 0.0009726191551353394, std_dev = 0.0009799657384188434


Unnamed: 0,FarmName_Pseudo,FarmCumulativeHeatLoadMilkProduction
0,f454e660,0.000973
