In [25]:
import pandas as pd
%matplotlib inline   
import seaborn as sns
import random as random


sns.set_theme()
sns.set_context("notebook")
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [26]:
dtype_dict = {
    'SE_Number': 'str',
    'FarmName_Pseudo': 'str',
    'InseminationDate': 'str',
    'CalvingDate': 'str',
    'Breeder': 'Int64',
    'BreedName': 'str',
    'BirthDate': 'str',
    'YearSeason': 'str',
    'Mother': 'str',
    'Father': 'str',
    'CalvingSireBullID': 'str',
    'CalvingEase': 'str',
    'LactationNumber': 'Int64',
    'PrevInsemination': 'str',
    'NextInsemination': 'str',
    'NINS': 'Int64',
    'NextCalving': 'str',
    'FirstInsemination': 'str',
    'LastInsemination': 'str',
    'FLI': 'Int64',
    'NextFirstInsemination': 'str',
    'NextLastInsemination': 'str',
    'CFI': 'Int64',
    'CLI': 'Int64',
    'GL': 'Int64',
    'CI': 'Int64',
    'MeanTemperature': 'float',
    'MeanRelativeHumidity': 'float',
    'MeanTHI_adj': 'float',
    'HW': 'Int64',
    'Cum_HW': 'Int64',
    'MaxTemp15Threshold': 'Int64'
}

# Load the data using the dtype_dict
data = pd.read_csv('../Data/MergedData/CleanedFertilityData.csv', dtype=dtype_dict)

# Convert the date columns to datetime
date_columns = [
    'InseminationDate',
    'CalvingDate',
    'BirthDate',
    'PrevInsemination',
    'NextInsemination',
    'NextCalving',
    'FirstInsemination',
    'LastInsemination',
    'NextFirstInsemination',
    'NextLastInsemination'
]

for column in date_columns:
    data[column] = pd.to_datetime(data[column], errors='coerce')

# Sort on the insemination date and SE number
data = data.sort_values(by=['InseminationDate', 'SE_Number'])

data.head(-5)

Unnamed: 0,SE_Number,FarmName_Pseudo,InseminationDate,CalvingDate,Breeder,YearSeason,BreedName,BirthDate,Mother,Father,...,CFI,CLI,GL,CI,MeanTemperature,MeanRelativeHumidity,MeanTHI_adj,HW,cum_HW,MaxTemp15Threshold
71,SE-169e580a-3766,169e580a,2022-01-01,NaT,8531,2022-1,02 SLB,2017-01-17,SE-169e580a-3083,9-7543 ColludeX,...,,,,,4.541667,0.984417,36.238206,0,0,0
122,SE-169e580a-3948,169e580a,2022-01-01,2022-11-05,1412,2022-1,02 SLB,2017-09-02,SE-169e580a-2919,9-7603 Fransisc,...,,,281,,4.541667,0.984417,36.238206,0,0,0
473,SE-169e580a-4555,169e580a,2022-01-01,2022-10-08,8531,2022-1,02 SLB,2018-10-22,SE-169e580a-3610,9-3891Dragonhea,...,53,53,280,,4.541667,0.984417,36.238206,0,0,0
560,SE-169e580a-4628,169e580a,2022-01-01,2022-10-02,1412,2022-1,02 SLB,2018-12-12,SE-169e580a-3420,9-7749 Foul,...,99,121,274,,4.541667,0.984417,36.238206,0,0,0
8926,SE-f454e660-729,f454e660,2022-01-01,2023-04-19,1423,2022-1,04 SJB,2020-09-25,SE-f454e660-433,Luxi 9-4471,...,,,283,,1.479167,0.909500,33.479002,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1009,SE-169e580a-4918,169e580a,2023-11-13,NaT,1412,2023-4,02 SLB,2019-08-02,SE-169e580a-3991,9-4538 Devine,...,,,,,4.100000,0.970000,38.820600,0,0,0
1898,SE-169e580a-5366,169e580a,2023-11-13,NaT,8531,2023-4,02 SLB,2020-08-01,SE-169e580a-4382,9-7909 Husky,...,,,,,4.100000,0.970000,38.820600,0,0,0
3204,SE-169e580a-5802,169e580a,2023-11-13,NaT,8531,2023-4,02 SLB,2021-07-14,SE-169e580a-3616,9-8355 Sparta,...,,,,,4.100000,0.970000,38.820600,0,0,0
4130,SE-169e580a-6380,169e580a,2023-11-13,NaT,8531,2023-4,02 SLB,2022-09-13,SE-169e580a-5065,9-8355 Sparta,...,,,,,4.100000,0.970000,38.820600,0,0,0


In [27]:
dtype_dict = {
    'Date': 'str',
    'FarmName_Pseudo': 'str',
    'SE_Number': 'str',
    'Age': 'Int64',
    'DailyYield': 'float',
    'PreviousDailyYield': 'float',
    'DailyYieldChange': 'float',
    'DaysInMilk': 'Int64',
    'YearSeason': 'str',
    'LactationNumber': 'Int64',
    'ExpectedYield': 'float',
    'NormalizedDailyYield': 'float',
    'NormalizedDailyYieldChange': 'float',
    'HeatStress': 'Int64',
    'Temp15Threshold': 'Int64',
    'HW': 'Int64',
    'cum_HW': 'Int64',
    'MeanTemperature': 'float',
    'MeanTHI_adj': 'float'
}

milk_data = pd.read_csv('../Data/MergedData/MilkApproachYieldData.csv', dtype=dtype_dict)
milk_data['Date'] = pd.to_datetime(milk_data['Date'], format='%Y-%m-%d')
milk_data.head(-5)

Unnamed: 0,Date,FarmName_Pseudo,SE_Number,Age,LactationNumber,DaysInMilk,YearSeason,DailyYield,PreviousDailyYield,DailyYieldChange,ExpectedYield,NormalizedDailyYield,NormalizedDailyYieldChange,HeatStress,Temp15Threshold,HW,cum_HW,MeanTemperature,MeanTHI_adj
0,2022-01-01,a624fb9a,SE-064c0cec-1189,3095,7,191,2022-1,30.77,0.00,0.00,29.739372,1.034655,0.000000,0,0,0,0,-3.025000,28.012944
1,2022-01-02,a624fb9a,SE-064c0cec-1189,3096,7,192,2022-1,48.22,30.77,17.45,29.692059,1.624003,0.587699,0,0,0,0,-0.279167,32.898193
2,2022-01-03,a624fb9a,SE-064c0cec-1189,3097,7,193,2022-1,30.53,48.22,-17.69,29.644756,1.029862,-0.596733,0,0,0,0,2.033333,36.760487
3,2022-01-04,a624fb9a,SE-064c0cec-1189,3098,7,194,2022-1,42.26,30.53,11.73,29.597463,1.427825,0.396318,0,0,0,0,0.066667,31.939524
4,2022-01-05,a624fb9a,SE-064c0cec-1189,3099,7,195,2022-1,38.49,42.26,-3.77,29.550181,1.302530,-0.127580,0,0,0,0,-3.700000,26.498206
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
487083,2023-06-03,f454e660,SE-fcdf259d-0044-0,4150,10,347,2023-3,12.67,15.75,-3.08,11.811455,1.072687,-0.260764,0,1,0,0,12.666667,53.132530
487084,2023-06-04,f454e660,SE-fcdf259d-0044-0,4151,10,348,2023-3,22.31,12.67,9.64,11.750492,1.898644,0.820391,0,1,0,0,13.079167,56.726870
487085,2023-06-05,f454e660,SE-fcdf259d-0044-0,4152,10,349,2023-3,12.84,22.31,-9.47,11.689810,1.098393,-0.810107,0,1,0,0,14.237500,58.482418
487086,2023-06-06,f454e660,SE-fcdf259d-0044-0,4153,10,350,2023-3,9.47,12.84,-3.37,11.629408,0.814315,-0.289783,0,1,0,0,15.345833,60.546358


In [28]:
# Merge the data based on Date and FarmName_Pseudo
merged_data = pd.merge(data, milk_data[['Date', 'FarmName_Pseudo', 'HeatStress']],
                       left_on=['InseminationDate', 'FarmName_Pseudo'],
                       right_on=['Date', 'FarmName_Pseudo'],
                       how='left')

# Drop the extra 'Date' column from the merge
merged_data = merged_data.drop(columns=['Date'])

merged_data.head(-5)

Unnamed: 0,SE_Number,FarmName_Pseudo,InseminationDate,CalvingDate,Breeder,YearSeason,BreedName,BirthDate,Mother,Father,...,CLI,GL,CI,MeanTemperature,MeanRelativeHumidity,MeanTHI_adj,HW,cum_HW,MaxTemp15Threshold,HeatStress
0,SE-169e580a-3766,169e580a,2022-01-01,NaT,8531,2022-1,02 SLB,2017-01-17,SE-169e580a-3083,9-7543 ColludeX,...,,,,4.541667,0.984417,36.238206,0,0,0,
1,SE-169e580a-3948,169e580a,2022-01-01,2022-11-05,1412,2022-1,02 SLB,2017-09-02,SE-169e580a-2919,9-7603 Fransisc,...,,281,,4.541667,0.984417,36.238206,0,0,0,
2,SE-169e580a-4555,169e580a,2022-01-01,2022-10-08,8531,2022-1,02 SLB,2018-10-22,SE-169e580a-3610,9-3891Dragonhea,...,53,280,,4.541667,0.984417,36.238206,0,0,0,
3,SE-169e580a-4628,169e580a,2022-01-01,2022-10-02,1412,2022-1,02 SLB,2018-12-12,SE-169e580a-3420,9-7749 Foul,...,121,274,,4.541667,0.984417,36.238206,0,0,0,
4,SE-f454e660-729,f454e660,2022-01-01,2023-04-19,1423,2022-1,04 SJB,2020-09-25,SE-f454e660-433,Luxi 9-4471,...,,283,,1.479167,0.909500,33.479002,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
468605,SE-f454e660-0699,f454e660,2023-11-13,NaT,1423,2023-4,99 Korsning/obestämbar ras,2020-06-07,SE-f454e660-507,Hays 9-4443 x-vik,...,,,,4.600000,0.932000,35.695600,0,0,0,0
468606,SE-f454e660-0699,f454e660,2023-11-13,NaT,1423,2023-4,99 Korsning/obestämbar ras,2020-06-07,SE-f454e660-507,Hays 9-4443 x-vik,...,,,,4.600000,0.932000,35.695600,0,0,0,0
468607,SE-f454e660-0699,f454e660,2023-11-13,NaT,1423,2023-4,99 Korsning/obestämbar ras,2020-06-07,SE-f454e660-507,Hays 9-4443 x-vik,...,,,,4.600000,0.932000,35.695600,0,0,0,0
468608,SE-f454e660-0699,f454e660,2023-11-13,NaT,1423,2023-4,99 Korsning/obestämbar ras,2020-06-07,SE-f454e660-507,Hays 9-4443 x-vik,...,,,,4.600000,0.932000,35.695600,0,0,0,0


In [29]:
# Define the function to identify heat stress based on weather conditions
def identify_weather_based_heat_stress(row):
    if (row['HW'] == 1 or 
        row['cum_HW'] > 0 or 
        row['MeanTHI_adj'] > 61 or
        row['MeanTemperature'] > 15):
        return 1
    return 0

# Apply the function to fill NaN values in the HeatStress column
merged_data['HeatStress'] = merged_data.apply(
    lambda row: identify_weather_based_heat_stress(row) if pd.isna(row['HeatStress']) else row['HeatStress'], axis=1)

merged_data.head(-5)

Unnamed: 0,SE_Number,FarmName_Pseudo,InseminationDate,CalvingDate,Breeder,YearSeason,BreedName,BirthDate,Mother,Father,...,CLI,GL,CI,MeanTemperature,MeanRelativeHumidity,MeanTHI_adj,HW,cum_HW,MaxTemp15Threshold,HeatStress
0,SE-169e580a-3766,169e580a,2022-01-01,NaT,8531,2022-1,02 SLB,2017-01-17,SE-169e580a-3083,9-7543 ColludeX,...,,,,4.541667,0.984417,36.238206,0,0,0,0
1,SE-169e580a-3948,169e580a,2022-01-01,2022-11-05,1412,2022-1,02 SLB,2017-09-02,SE-169e580a-2919,9-7603 Fransisc,...,,281,,4.541667,0.984417,36.238206,0,0,0,0
2,SE-169e580a-4555,169e580a,2022-01-01,2022-10-08,8531,2022-1,02 SLB,2018-10-22,SE-169e580a-3610,9-3891Dragonhea,...,53,280,,4.541667,0.984417,36.238206,0,0,0,0
3,SE-169e580a-4628,169e580a,2022-01-01,2022-10-02,1412,2022-1,02 SLB,2018-12-12,SE-169e580a-3420,9-7749 Foul,...,121,274,,4.541667,0.984417,36.238206,0,0,0,0
4,SE-f454e660-729,f454e660,2022-01-01,2023-04-19,1423,2022-1,04 SJB,2020-09-25,SE-f454e660-433,Luxi 9-4471,...,,283,,1.479167,0.909500,33.479002,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
468605,SE-f454e660-0699,f454e660,2023-11-13,NaT,1423,2023-4,99 Korsning/obestämbar ras,2020-06-07,SE-f454e660-507,Hays 9-4443 x-vik,...,,,,4.600000,0.932000,35.695600,0,0,0,0
468606,SE-f454e660-0699,f454e660,2023-11-13,NaT,1423,2023-4,99 Korsning/obestämbar ras,2020-06-07,SE-f454e660-507,Hays 9-4443 x-vik,...,,,,4.600000,0.932000,35.695600,0,0,0,0
468607,SE-f454e660-0699,f454e660,2023-11-13,NaT,1423,2023-4,99 Korsning/obestämbar ras,2020-06-07,SE-f454e660-507,Hays 9-4443 x-vik,...,,,,4.600000,0.932000,35.695600,0,0,0,0
468608,SE-f454e660-0699,f454e660,2023-11-13,NaT,1423,2023-4,99 Korsning/obestämbar ras,2020-06-07,SE-f454e660-507,Hays 9-4443 x-vik,...,,,,4.600000,0.932000,35.695600,0,0,0,0


In [30]:
# Save the reordered DataFrame to a CSV file
merged_data.to_csv('../Data/MergedData/HeatStressFertilityData.csv', index=False)

### Variables Explanation for `HeatStressFertilityData.csv`

1. **SE_Number**:
   - Description: A unique identifier for each cow.
   - Datatype: `str`
   - Example: `SE-169e580a-3766`

2. **FarmName_Pseudo**:
   - Description: A pseudonym for the farm where the data was collected.
   - Datatype: `str`
   - Example: `169e580a`

3. **InseminationDate**:
   - Description: The date when the insemination was performed.
   - Datatype: `str` (should be converted to `datetime` for operations)
   - Format: `YYYY-MM-DD`
   - Example: `2022-01-01`

4. **CalvingDate**:
   - Description: The date when the cow calved.
   - Datatype: `str` (should be converted to `datetime` for operations)
   - Format: `YYYY-MM-DD`
   - Example: `2022-11-05`

5. **Breeder**:
   - Description: An identifier for the breeder involved in the process.
   - Datatype: `Int64`
   - Example: `8531`

6. **YearSeason**:
   - Description: A categorical variable representing the season based on the month and year.
   - Datatype: `str`
   - Example: `2022-1`
   - YearSeason parameters:
     - 1: Dec-Feb
     - 2: Mar-May
     - 3: Jun-Aug
     - 4: Sep-Nov

7. **BreedName**:
   - Description: The breed of the cow.
   - Datatype: `str`
   - Example: `02 SLB`

8. **BirthDate**:
   - Description: The birth date of the cow.
   - Datatype: `str` (should be converted to `datetime` for operations)
   - Format: `YYYY-MM-DD`
   - Example: `2017-01-17`

9. **Mother**:
   - Description: The identifier for the cow's mother.
   - Datatype: `str`
   - Example: `SE-169e580a-3083`

10. **Father**:
    - Description: The identifier for the cow's father.
    - Datatype: `str`
    - Example: `9-7543 ColludeX`

11. **CalvingSireBullID**:
    - Description: The identifier for the bull used in the calving.
    - Datatype: `str`
    - Example: `Bull_001`

12. **CalvingEase**:
    - Description: A categorical variable indicating the ease of calving.
    - Datatype: `str`
    - Example: `11` (Easy, without assistance)

13. **LactationNumber**:
    - Description: The number indicating the lactation cycle of the cow.
    - Datatype: `Int64`
    - Example: `2`

14. **PrevInsemination**:
    - Description: The date of the previous insemination.
    - Datatype: `str` (should be converted to `datetime` for operations)
    - Format: `YYYY-MM-DD`
    - Example: `2021-09-15`

15. **NextInsemination**:
    - Description: The date of the next insemination after the current record.
    - Datatype: `str` (should be converted to `datetime` for operations)
    - Format: `YYYY-MM-DD`
    - Example: `2023-08-01`

16. **NINS**:
    - Description: Number of unique inseminations within the lactation cycle.
    - Datatype: `Int64`
    - Example: `3`

17. **NextCalving**:
    - Description: The expected date for the next calving.
    - Datatype: `str` (should be converted to `datetime` for operations)
    - Format: `YYYY-MM-DD`
    - Example: `2024-02-10`

18. **FirstInsemination**:
    - Description: The date of the first insemination in the current lactation.
    - Datatype: `str` (should be converted to `datetime` for operations)
    - Format: `YYYY-MM-DD`
    - Example: `2022-01-01`

19. **LastInsemination**:
    - Description: The date of the last insemination in the current lactation.
    - Datatype: `str` (should be converted to `datetime` for operations)
    - Format: `YYYY-MM-DD`
    - Example: `2022-06-01`

20. **FLI**:
    - Description: The number of days between the first and last insemination in the lactation cycle.
    - Datatype: `Int64`
    - Example: `150`

21. **NextFirstInsemination**:
    - Description: The date of the first insemination in the next lactation cycle.
    - Datatype: `str` (should be converted to `datetime` for operations)
    - Format: `YYYY-MM-DD`
    - Example: `2023-05-01`

22. **NextLastInsemination**:
    - Description: The date of the last insemination in the next lactation cycle.
    - Datatype: `str` (should be converted to `datetime` for operations)
    - Format: `YYYY-MM-DD`
    - Example: `2023-11-01`

23. **CFI**:
    - Description: Calving to first insemination interval, in days.
    - Datatype: `Int64`
    - Example: `60`

24. **CLI**:
    - Description: Calving to last insemination interval, in days.
    - Datatype: `Int64`
    - Example: `120`

25. **GL**:
    - Description: Gestation length, in days.
    - Datatype: `Int64`
    - Example: `280`

26. **CI**:
    - Description: Calving interval, in days.
    - Datatype: `Int64`
    - Example: `365`

27. **MeanTemperature**:
    - Description: The mean temperature recorded on the calving day.
    - Datatype: `float`
    - Example: `20.5`

28. **MeanRelativeHumidity**:
    - Description: The mean relative humidity recorded on the calving day.
    - Datatype: `float`
    - Example: `65.5`

29. **MeanTHI_adj**:
    - Description: The mean adjusted Temperature-Humidity Index for the calving day.
    - Datatype: `float`
    - Example: `72.3`

30. **HW**:
    - Description: The maximum heatwave indicator on the calving day (0: no heatwave, 1: heatwave).
    - Datatype: `Int64`
    - Example: `0`

31. **Cum_HW**:
    - Description: Cumulative heatwave days up to the calving day.
    - Datatype: `Int64`
    - Example: `2`

32. **MaxTemp15Threshold**:
    - Description: Indicator if the temperature exceeded 15°C on the calving day (0: no, 1: yes).
    - Datatype: `Int64`
    - Example: `1`

33. **HeatStress**:
    - Description: A binary variable indicating the presence of heat stress on the cow, based on specific weather conditions.
    - Datatype: `Int64`
    - Example: `0`