In [None]:
# Harshadeep Kambhampati (hk24873), Ian Wang, Arkady Marchenko, Andy Jiang

# ML Course Project

In [2]:
import pandas as pd
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
import time

%matplotlib inline

In [3]:
# Load the raw training set and render summary statistics as this cell's output
TRAIN_CSV_PATH = "cattle_data_train.csv"
df = pd.read_csv(TRAIN_CSV_PATH)

# With default arguments describe() summarizes the numeric columns only
df.describe()

# OBSERVATIONS from describe():
# - Among the displayed numeric columns, only Feed_Quantity_kg and Feed_Quantity_lb are incomplete:
#   199519 non-null values each out of 210000 rows (~10.5k missing).
#   Non-numeric columns are not part of this table, so it says nothing about NaNs in them.
# - The min of Milk_Yield_L is negative (about -5.70), which doesn't make sense for a yield.
# - Rumination_Time_hrs also goes negative (min about -8.81), which is equally suspicious for a duration.
# - Parity is the number of times a cow has given birth.






Unnamed: 0,Age_Months,Weight_kg,Parity,Days_in_Milk,Feed_Quantity_kg,Feeding_Frequency,Water_Intake_L,Walking_Distance_km,Grazing_Duration_hrs,Rumination_Time_hrs,...,Anthrax_Vaccine,IBR_Vaccine,BVD_Vaccine,Rabies_Vaccine,Previous_Week_Avg_Yield,Body_Condition_Score,Milking_Interval_hrs,Feed_Quantity_lb,Mastitis,Milk_Yield_L
count,210000.0,210000.0,210000.0,210000.0,199519.0,210000.0,210000.0,210000.0,210000.0,210000.0,...,210000.0,210000.0,210000.0,210000.0,210000.0,210000.0,210000.0,199519.0,210000.0,210000.0
mean,83.483905,499.93043,3.500395,182.112967,12.014793,2.999119,80.03685,4.034754,6.05671,0.256557,...,0.600381,0.598814,0.599824,0.600824,8.747584,3.394726,12.302438,26.49272,0.099976,15.589156
std,34.648982,144.659172,1.707383,105.051486,3.969247,1.413147,14.987677,1.928529,2.867575,6.115351,...,0.489821,0.49014,0.489935,0.48973,5.901473,0.632831,4.298998,8.741282,0.299969,5.352079
min,24.0,250.0,1.0,1.0,2.370284,1.0,14.207737,0.5,1.0,-8.808053,...,0.0,0.0,0.0,0.0,0.0,2.0,6.0,6.615,0.0,-5.700324
25%,54.0,374.2,2.0,91.0,9.283265,2.0,69.919162,2.65,4.0,-4.383302,...,0.0,0.0,0.0,0.0,4.27,3.0,12.0,20.5065,0.0,11.822207
50%,83.0,500.2,3.0,182.0,12.002254,3.0,80.016973,4.0,6.0,-0.818631,...,1.0,1.0,1.0,1.0,7.71,3.5,12.0,26.46,0.0,15.145871
75%,114.0,625.7,5.0,273.0,14.70892,4.0,90.119812,5.35,8.0,4.051704,...,1.0,1.0,1.0,1.0,12.41,4.0,12.0,32.4135,0.0,18.884708
max,143.0,750.0,6.0,364.0,25.454207,5.0,149.96021,12.0,14.0,31.263406,...,1.0,1.0,1.0,1.0,38.67,5.0,24.0,55.125,1.0,44.555285


In [4]:
# DATA CLEANING (outside of pipeline)
# Every step is written so that the cell can be re-run on an already-cleaned `df`
# without raising or changing the result.

# Dropping Cattle_ID as it is just an identifier, not useful for prediction.
# errors='ignore' keeps this a no-op once the column is already gone.
df = df.drop(columns=['Cattle_ID'], errors='ignore')

# Handling NaN values for Feed_Quantity: keep only KG, fill missing KG entries from the
# LBS column where that one is present, then drop LBS.
FEED_QTY_LBS_TO_KG = 0.453592
if 'Feed_Quantity_lb' in df.columns:  # absent on a re-run, because it is dropped below
    df['Feed_Quantity_kg'] = df['Feed_Quantity_kg'].fillna(df['Feed_Quantity_lb'] * FEED_QTY_LBS_TO_KG)
    df = df.drop(columns=['Feed_Quantity_lb'])

# NOTE: this is the NaN count over *all* remaining columns, not only Feed_Quantity_kg,
# so it also includes missing values in columns the conversion never touches.
print("Number of missing values after lb to kg conversion:", df.isna().sum().sum())

# We should drop our "negative" milk yields as those entries don't really make sense.
# (A NaN label would fail the >= comparison and be dropped here as well.)
# .copy() makes df an independent frame, so later column assignments on it do not
# trigger SettingWithCopyWarning or write into a temporary slice.
df = df[df['Milk_Yield_L'] >= 0].copy()

# Now we split our data into features and labels
features = df.drop(columns=['Milk_Yield_L'])
labels = df['Milk_Yield_L']


Number of missing values after lb to kg conversion: 16760


In [None]:
# DATA EXPLORATION
#
# Rank every column by its correlation with the label Milk_Yield_L.
# Numeric-only alternative that skips the encoding step:
#   df.corr(numeric_only=True)['Milk_Yield_L'].sort_values(ascending=False)

# Object-dtype (text) columns are replaced by their integer category codes so that they
# can take part in corr(). Missing values receive code -1. For nominal columns the
# codes are just labels, so those correlations are only a rough signal.
text_columns = df.select_dtypes(include=['object']).columns
df_encoded = df.assign(
    **{name: df[name].astype('category').cat.codes for name in text_columns}
)

# Correlation of each column with the label, strongest positive first (cell output)
label_correlations = df_encoded.corr()['Milk_Yield_L']
label_correlations.sort_values(ascending=False)

# Observations (as recorded by the authors):
# - Interestingly, Weight_kg, Feed_Quantity_kg, Age_Months, Parity all have the highest correlation over 0.2
# - We have a lot of very low correlation features that may be best to drop
# - Since Feed_Quantity_kg actually has a high correlation, we should have good imputation for it
#   to avoid losing that info for missing entries

Milk_Yield_L               1.000000
Weight_kg                  0.300240
Feed_Quantity_kg           0.222856
Water_Intake_L             0.125022
Rumination_Time_hrs        0.089412
Previous_Week_Avg_Yield    0.089412
IBR_Vaccine                0.072186
Anthrax_Vaccine            0.069584
Rabies_Vaccine             0.068090
Milking_Interval_hrs       0.014635
Grazing_Duration_hrs       0.004278
Housing_Score              0.004054
Humidity_percent           0.002337
Brucellosis_Vaccine        0.002065
BVD_Vaccine                0.000692
Breed                      0.000515
Feeding_Frequency          0.000465
Farm_ID                    0.000105
HS_Vaccine                -0.000038
Climate_Zone              -0.000451
Feed_Type                 -0.000488
BQ_Vaccine                -0.000576
Walking_Distance_km       -0.001608
Resting_Hours             -0.001703
Body_Condition_Score      -0.001951
Management_System         -0.002054
FMD_Vaccine               -0.002629
Date                      -0

In [14]:
# Per-feature variance on the encoded frame, smallest first, to spot candidates for a
# low-variance filter.
variance_by_feature = df_encoded.var()
feature_variances = variance_by_feature.sort_values()
print(feature_variances)

# Seems like all of the features demonstrate some variance, looks significant enough not to drop any based on variance alone
# NOTE(review): these variances are computed on unscaled columns, so values are not directly
# comparable between features measured in different units.



Housing_Score                  0.041013
Mastitis                       0.089968
Rabies_Vaccine                 0.239815
BQ_Vaccine                     0.239857
Anthrax_Vaccine                0.239919
Brucellosis_Vaccine            0.239986
BVD_Vaccine                    0.240043
HS_Vaccine                     0.240069
FMD_Vaccine                    0.240142
IBR_Vaccine                    0.240230
Body_Condition_Score           0.400456
Lactation_Stage                0.690784
Feeding_Frequency              1.997108
Management_System              1.999275
Breed                          2.256884
Parity                         2.914924
Climate_Zone                   2.917710
Walking_Distance_km            3.719208
Feed_Type                      5.246443
Resting_Hours                  8.211616
Grazing_Duration_hrs           8.223133
Feed_Quantity_kg              15.753011
Milking_Interval_hrs          18.482684
Milk_Yield_L                  28.551332
Previous_Week_Avg_Yield       34.828967


In [None]:
# DATA PREPROCESSING PIPELINE (convert into a pipeline to pass into models differently later)
# We should have different pipelines for NaN imputation, low variance filter, correlation filter, scaling

# Using ordinal encoding for Lactation_Stage because it has a natural order.
# Encoded values are 1..len(order), following the list below; 0 marks a value that is
# missing or not listed in `order`.
order = ['Early', 'Mid', 'Late', 'Dry']

# Only encode while the column still holds the raw labels. Re-running the encoding on the
# already-encoded integers would match none of the categories and overwrite the whole
# column with 0.
if not pd.api.types.is_numeric_dtype(df['Lactation_Stage']):
    # Categorical assigns code -1 to anything that is NaN or not in `categories`
    stage_codes = pd.Categorical(df['Lactation_Stage'], categories=order, ordered=True).codes
    unmapped_count = int((stage_codes == -1).sum())
    if unmapped_count:
        # Surface the fallback instead of silently encoding unknown stages as 0
        print(f"WARNING: {unmapped_count} Lactation_Stage value(s) are missing or not in {order}; encoded as 0")
    # assign() builds a new frame, so this is safe even when df is a filtered slice
    df = df.assign(Lactation_Stage=stage_codes + 1)

# NOTE(review): `features` was split off from df in the data cleaning cell, before this
# encoding, so features['Lactation_Stage'] still holds the raw labels. Re-derive
# features/labels after preprocessing if the encoded column is needed there.



# TODO: one-hot encode the nominal categoricals (draft, not yet enabled):
# df = pd.get_dummies(df, columns=['Management_System'])
# df = pd.get_dummies(df, columns=['Breed', 'Climate_Zone'], drop_first=False)



# df.head()

# DATA EXPLORATION NOTES
# Immediately it looks like we have some categorical data that may need to be encoded as numerical
# Data may need to be scaled in our CV-loop later based on the model we choose to use
# Categorical data: Breed, Climate_Zone, Management_System, Lactation_Stage, Feed_Type, Date, Farm_ID
# Feed_Quantity_kg has some NaN entries that need to be replaced
# Check for NaNs or blank entries
# Some features seem less relevant: parity?, date, farm_id
# more relevant features: breed, age, weight, lactation stage, Previous_Week_Avg_Yield,Body_Condition_Score, Mastitis
# PREVIOUS WEEK AVG YIELD SHOULD BE WEIGHTED HEAVILY


# TODO:
# visualize distributions of each feature, see if there is anything interesting
# test correlation across each feature with label
# imputation using mean/median (or ffill if we're lazy) for the remaining NaNs: about 16.8k across all columns after the lb -> kg fill, of which at most ~10.5k are in Feed_Quantity_kg
# We have a LOT of low variance features, maybe filter them out if they seem irrelevant
