<a href="https://colab.research.google.com/github/jayagopalllm/s1/blob/main/BovineMetrics_Pilot_Model_v0_5_(Masked_Multi_View).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
#
# Project BovineMetrics - Pilot Model v0.5
# Google Colab Notebook
#
# This version upgrades the multi-view architecture to use segmentation masks
# for BOTH the side-view and top-view images, leveraging the new top-view
# annotations for superior noise reduction and accuracy.
#

# ==============================================================================
# STEP 1: SETUP AND IMPORTS
# ==============================================================================
!pip install tensorflow pandas scikit-learn pycocotools matplotlib

import os
import numpy as np
import pandas as pd
import cv2
from google.colab import drive
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import tensorflow as tf
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, GlobalAveragePooling2D, concatenate, Dropout
from pycocotools.coco import COCO
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import matplotlib.pyplot as plt

print("TensorFlow Version:", tf.__version__)

# ==============================================================================
# STEP 2: MOUNT GOOGLE DRIVE AND DEFINE PATHS
# ==============================================================================
# The dataset lives on Google Drive, so the drive has to be mounted into the
# Colab filesystem before any of the paths below can be read.
print("Mounting Google Drive...")
drive.mount('/content/drive')
print("Drive mounted successfully.")

# --- IMPORTANT: UPDATE THESE PATHS ---
# Everything is resolved relative to BASE_DRIVE_PATH; change that one value
# (and, if needed, the entry names) to point at a different dataset copy.
BASE_DRIVE_PATH = '/content/drive/MyDrive/BovineMetrics_Dataset_v1/'
(SIDE_IMAGE_DIR,
 TOP_IMAGE_DIR,
 SIDE_ANNOTATIONS_PATH,
 TOP_ANNOTATIONS_PATH,
 METADATA_CSV_PATH) = (
    os.path.join(BASE_DRIVE_PATH, entry_name)
    for entry_name in (
        'side_view_images',
        'top_view_images',
        'side_view_annotations.json',
        'top_view_annotations.json',
        'cattle_data.csv',
    )
)
# ------------------------------------

# ==============================================================================
# STEP 3: LOAD AND PREPROCESS THE DATA (UPGRADED FOR DUAL ANNOTATIONS)
# ==============================================================================
print("\nLoading metadata from CSV...")
metadata_df = pd.read_csv(METADATA_CSV_PATH)
# Column headers are stripped and lower-cased so later lookups such as
# 'side_view_filename' or 'weight' do not depend on the CSV's exact casing.
stripped_columns = metadata_df.columns.str.strip()
metadata_df.columns = stripped_columns.str.lower()
record_count = len(metadata_df)
print(f"Loaded {record_count} records from CSV.")

# Each COCO file is indexed by image file name so a CSV row can be matched to
# the image id that owns its segmentation annotations.
print("Loading COCO annotations for side-views...")
coco_side = COCO(SIDE_ANNOTATIONS_PATH)
side_filename_to_id = {
    image_record['file_name']: image_record['id']
    for image_record in coco_side.dataset['images']
}
side_image_count = len(coco_side.getImgIds())
print(f"Loaded {side_image_count} side-view annotations.")

print("Loading COCO annotations for top-views...")
coco_top = COCO(TOP_ANNOTATIONS_PATH)
top_filename_to_id = {
    image_record['file_name']: image_record['id']
    for image_record in coco_top.dataset['images']
}
top_image_count = len(coco_top.getImgIds())
print(f"Loaded {top_image_count} top-view annotations.")

def _index_coco_images_by_filename(coco):
    """Return a ``{file_name: image_id}`` lookup built from ``coco.dataset['images']``."""
    return {img['file_name']: img['id'] for img in coco.dataset.get('images', [])}


def _load_masked_view(filename, image_dir, coco, filename_to_id, image_size):
    """Load one view, black out everything outside its annotations, and resize it.

    Args:
        filename: Value of the filename column for this row (may be NaN/None).
        image_dir: Directory that contains the image file.
        coco: COCO object holding the annotations for this view.
        filename_to_id: ``{file_name: image_id}`` lookup for ``coco``.
        image_size: ``(height, width)`` of the returned image.

    Returns:
        The masked RGB image as a ``uint8`` array of shape
        ``(height, width, 3)``, or ``None`` when the row has no usable image
        (missing filename, no COCO entry, file not on disk, or unreadable file).
    """
    # NaN from pandas is a float, so the isinstance check also rejects it.
    if not isinstance(filename, str):
        return None

    image_id = filename_to_id.get(filename)
    # Compare against None explicitly: 0 is a valid COCO image id.
    if image_id is None:
        return None

    image_path = os.path.join(image_dir, filename)
    if not os.path.exists(image_path):
        return None

    image = cv2.imread(image_path)
    # cv2.imread signals a corrupt/unsupported file by returning None.
    if image is None:
        return None
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    mask = np.zeros(image.shape[:2], dtype="uint8")
    for ann in coco.loadAnns(coco.getAnnIds(imgIds=image_id)):
        segmentation = ann.get('segmentation')
        if isinstance(segmentation, list):
            # Polygon format: each entry is a flat [x1, y1, x2, y2, ...] list.
            for seg in segmentation:
                if len(seg) < 6 or len(seg) % 2:
                    continue  # not a valid polygon (needs >= 3 complete points)
                poly = np.array(seg).reshape((-1, 2)).astype(np.int32)
                cv2.fillPoly(mask, [poly], 255)
        elif isinstance(segmentation, dict):
            # RLE format: let pycocotools decode it into a binary mask.
            rle_mask = coco.annToMask(ann)
            if rle_mask.shape == mask.shape:
                mask[rle_mask > 0] = 255

    masked_image = cv2.bitwise_and(image, image, mask=mask)
    height, width = image_size
    # cv2.resize takes the target size as (width, height).
    return cv2.resize(masked_image, (width, height))


def load_and_preprocess_multi_view_data(metadata_df, coco_side, coco_top, side_dir, top_dir,
                                        image_size=(224, 224)):
    """Build aligned side-view, top-view, tabular and target arrays from the metadata.

    Every metadata row yields exactly one entry in each output. A view that
    cannot be loaded is replaced by an all-zero placeholder image so the
    outputs stay row-aligned.

    Args:
        metadata_df: DataFrame with columns ``sex``, ``age``, ``breed`` and
            ``weight``; ``side_view_filename`` / ``top_view_filename`` are
            read when present.
        coco_side: COCO object with the side-view annotations.
        coco_top: COCO object with the top-view annotations.
        side_dir: Directory containing the side-view images.
        top_dir: Directory containing the top-view images.
        image_size: ``(height, width)`` every image is resized to.

    Returns:
        Tuple ``(side_images, top_images, tabular_data, weights, filenames)``:
        two ``float32`` image arrays, a DataFrame with the ``sex``/``age``/
        ``breed`` columns, a ``float32`` weight array, and a list holding the
        side-view filename of each row (top-view filename when the side one
        is missing).
    """
    side_images = []
    top_images = []
    tabular_data = []
    weights = []
    filenames_for_analysis = []

    img_height, img_width = image_size
    placeholder_image = np.zeros((img_height, img_width, 3), dtype="uint8")

    # Derive the lookups from the COCO objects that were passed in rather than
    # from module-level globals, so the function works with any pair of datasets.
    side_filename_to_id = _index_coco_images_by_filename(coco_side)
    top_filename_to_id = _index_coco_images_by_filename(coco_top)

    for _, row in metadata_df.iterrows():
        side_filename = row.get('side_view_filename')
        top_filename = row.get('top_view_filename')

        side_image = _load_masked_view(
            side_filename, side_dir, coco_side, side_filename_to_id, image_size)
        top_image = _load_masked_view(
            top_filename, top_dir, coco_top, top_filename_to_id, image_size)

        side_images.append(placeholder_image if side_image is None else side_image)
        top_images.append(placeholder_image if top_image is None else top_image)

        # --- Append Tabular Data and Weight ---
        tabular_data.append(row[['sex', 'age', 'breed']])
        weights.append(row['weight'])
        filenames_for_analysis.append(side_filename if pd.notna(side_filename) else top_filename)

    return (np.array(side_images, dtype="float32"),
            np.array(top_images, dtype="float32"),
            pd.DataFrame(tabular_data).reset_index(drop=True),
            np.array(weights, dtype="float32"),
            filenames_for_analysis)

side_images, top_images, tabular_data, weights, image_filenames = \
    load_and_preprocess_multi_view_data(metadata_df, coco_side, coco_top, SIDE_IMAGE_DIR, TOP_IMAGE_DIR)

print(f"\nSuccessfully processed {len(weights)} records.")

# ==============================================================================
# STEP 4: DATA SPLITTING
# ==============================================================================
# Split row indices first so that every statistic below can be fitted on the
# training rows only. Fitting the scalers on the full dataset would leak
# validation information into training and make the validation score optimistic.
sample_indices = np.arange(len(weights))
train_indices, val_indices = train_test_split(sample_indices, test_size=0.2, random_state=42)

# --- Normalize and Preprocess (statistics come from the training rows only) ---
weight_mean = weights[train_indices].mean()
weight_std = weights[train_indices].std()
normalized_weights = (weights - weight_mean) / weight_std

categorical_features = ['sex', 'breed']
numerical_features = ['age']
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        # handle_unknown='ignore' encodes categories that only occur in the
        # validation rows as all-zero vectors instead of raising.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    # Always return a dense array: with many one-hot columns the default
    # threshold would yield a scipy sparse matrix, which is then fed to Keras.
    sparse_threshold=0)
preprocessor.fit(tabular_data.iloc[train_indices])
processed_tabular = preprocessor.transform(tabular_data)

train_side, val_side = side_images[train_indices], side_images[val_indices]
train_top, val_top = top_images[train_indices], top_images[val_indices]
train_tabular, val_tabular = processed_tabular[train_indices], processed_tabular[val_indices]
train_weights, val_weights = normalized_weights[train_indices], normalized_weights[val_indices]

# ==============================================================================
# STEP 5: BUILD THE MULTI-VIEW MODEL (v0.5)
# ==============================================================================
print("\nBuilding Masked Multi-View model with dual EfficientNetB0 inputs...")

def create_multi_view_model(image_shape, tabular_shape):
    """Assemble the three-input regression model (side image, top image, tabular).

    Args:
        image_shape: Shape of one input image, e.g. ``(224, 224, 3)``; used for
            both image inputs and for the EfficientNetB0 backbone.
        tabular_shape: Number of features in one preprocessed tabular row.

    Returns:
        An uncompiled Keras ``Model`` whose inputs are named
        ``side_view_input``, ``top_view_input`` and ``tabular_input`` and whose
        single-unit output layer is named ``output``.
    """
    # A single frozen ImageNet backbone is instantiated once and applied to
    # both views, which avoids layer-name collisions and repeated downloads.
    backbone = EfficientNetB0(weights='imagenet', include_top=False, input_shape=image_shape)
    backbone.trainable = False

    def image_branch(input_name):
        """Return ``(input_tensor, 64-d feature tensor)`` for one camera view."""
        view_input = Input(shape=image_shape, name=input_name)
        feature_map = backbone(view_input, training=False)
        pooled = GlobalAveragePooling2D()(feature_map)
        return view_input, Dense(64, activation='relu')(pooled)

    # Branch order (side, then top) is kept so auto-generated layer names match.
    side_input, side_features = image_branch("side_view_input")
    top_input, top_features = image_branch("top_view_input")

    # Tabular branch: small MLP with dropout between its two dense layers.
    tabular_input = Input(shape=(tabular_shape,), name="tabular_input")
    hidden = Dense(64, activation='relu')(tabular_input)
    hidden = Dropout(0.3)(hidden)
    tabular_features = Dense(32, activation='relu')(hidden)

    # Fusion head: concatenate the three feature vectors and regress one value.
    fused = concatenate([side_features, top_features, tabular_features])
    head = Dense(128, activation='relu')(fused)
    head = Dropout(0.5)(head)
    head = Dense(64, activation='relu')(head)
    prediction = Dense(1, name="output")(head)

    return Model(inputs=[side_input, top_input, tabular_input], outputs=prediction)

# Seed the Python, NumPy and TensorFlow random generators before any layer is
# created, so weight initialisation, dropout masks and batch shuffling are as
# repeatable between runs as the hardware allows.
RANDOM_SEED = 42
tf.keras.utils.set_random_seed(RANDOM_SEED)

image_shape = (224, 224, 3)
# Width of the preprocessed tabular matrix (scaled age + one-hot columns).
tabular_shape = processed_tabular.shape[1]
model = create_multi_view_model(image_shape, tabular_shape)

# Halve the learning rate after 5 epochs without val_loss improvement,
# never going below 1e-5.
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=0.00001)
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              loss='mean_squared_error',
              metrics=['mean_absolute_error'])
model.summary()

# ==============================================================================
# STEP 6: TRAIN THE MODEL
# ==============================================================================
print("\nStarting model training...")
EPOCHS = 100
BATCH_SIZE = 16

# No ImageDataGenerator is used in this version to keep things simple; it can
# be re-introduced through a more complex generator once the dataset grows.
# Inputs are passed as dicts keyed by the model's input-layer names.
training_inputs = {
    "side_view_input": train_side,
    "top_view_input": train_top,
    "tabular_input": train_tabular,
}
validation_inputs = {
    "side_view_input": val_side,
    "top_view_input": val_top,
    "tabular_input": val_tabular,
}

# Stop once val_loss has not improved for 10 epochs and roll back to the best
# weights seen; the LR scheduler defined at compile time runs alongside it.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

history = model.fit(
    training_inputs,
    train_weights,
    validation_data=(validation_inputs, val_weights),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=[early_stopping, lr_scheduler],
)

# ==============================================================================
# STEP 7: EVALUATE THE MODEL
# ==============================================================================
print("\nEvaluating final model on validation data...")

val_input = dict(side_view_input=val_side, top_view_input=val_top, tabular_input=val_tabular)
predictions_norm = model.predict(val_input).flatten()

# The model was trained on standardised targets, so predictions and ground
# truth are both mapped back to kilograms before any error is measured.
predictions_kg = (predictions_norm * weight_std) + weight_mean
val_weights_kg = (val_weights * weight_std) + weight_mean

residuals_kg = val_weights_kg - predictions_kg
final_mae = np.mean(np.abs(residuals_kg))
final_mape = np.mean(np.abs(residuals_kg / val_weights_kg)) * 100

print("\n--- PILOT MODEL v0.5 RESULTS ---")
print(f"Final Validation Mean Absolute Error (MAE): {final_mae:.2f} kg")
print(f"Final Validation Mean Absolute Percentage Error (MAPE): {final_mape:.2f}%")
print("---------------------------------")

# ==============================================================================
# STEP 8: ERROR ANALYSIS & VISUALIZATION
# ==============================================================================
print("\nStarting error analysis...")

# Predictions are made for every record (training and validation rows alike)
# so each row of the metadata gets an error figure.
full_dataset_input = dict(
    side_view_input=side_images,
    top_view_input=top_images,
    tabular_input=processed_tabular,
)
full_dataset_predictions_norm = model.predict(full_dataset_input).flatten()
full_dataset_predictions_kg = (full_dataset_predictions_norm * weight_std) + weight_mean

# One row per record: tabular features, filename, actual/predicted weight and
# the signed and absolute error in kilograms.
error_df = tabular_data.assign(
    image_filename=image_filenames,
    actual_weight_kg=weights,
    predicted_weight_kg=full_dataset_predictions_kg,
    error_kg=lambda frame: frame['predicted_weight_kg'] - frame['actual_weight_kg'],
    abs_error_kg=lambda frame: np.abs(frame['error_kg']),
)

error_df_sorted = error_df.sort_values('abs_error_kg', ascending=False)

print("\n--- TOP 20 WORST PREDICTIONS ---")
report_columns = ['image_filename', 'actual_weight_kg', 'predicted_weight_kg', 'error_kg']
print(error_df_sorted[report_columns].head(20))
print("---------------------------------")

# Actual-vs-predicted scatter with the y = x diagonal as the ideal reference.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(error_df['actual_weight_kg'], error_df['predicted_weight_kg'], alpha=0.6, edgecolors='k')
axis_limits = [ax.get_xlim(), ax.get_ylim()]
lims = [np.min(axis_limits), np.max(axis_limits)]
ax.plot(lims, lims, 'r--', alpha=0.75, zorder=0, label="Perfect Prediction")
ax.set_xlabel("Actual Weight (kg)")
ax.set_ylabel("Predicted Weight (kg)")
ax.set_title("Model Performance: Actual vs. Predicted Weight")
ax.legend()
ax.grid(True)
plt.show()



TensorFlow Version: 2.19.0
Mounting Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Drive mounted successfully.

Loading metadata from CSV...
Loaded 696 records from CSV.
Loading COCO annotations for side-views...
loading annotations into memory...
Done (t=0.04s)
creating index...
index created!
Loaded 608 side-view annotations.
Loading COCO annotations for top-views...
loading annotations into memory...
Done (t=0.01s)
creating index...
index created!
Loaded 106 top-view annotations.

Successfully processed 696 records.

Building Masked Multi-View model with dual EfficientNetB0 inputs...
Downloading data from https://storage.googleapis.com/keras-applications/efficientnet_side_notop.h5


Exception: URL fetch failure on https://storage.googleapis.com/keras-applications/efficientnet_side_notop.h5: 403 -- Forbidden