# In [None]:
"""

Author: Annam.ai IIT Ropar
Team Name: SoilMate
Team Members: Kshitiz Jangra, Harshal Chaudhari
Leaderboard Rank: 62

"""

# This is the notebook used for training the model.

# Load and Explore Data
import os
import pandas as pd

# Root folder of the competition data; the train/test image folders and the
# label file are all resolved relative to it.
dataset_path = '../data/soil-classification-part-2/soil_competition-2025'
train_dir, test_dir = (
    os.path.join(dataset_path, subset) for subset in ('train', 'test')
)

# Read the training label file and set the 'label' column to 1 for every row.
labels_csv = os.path.join(dataset_path, 'train_labels.csv')
train_labels_df = pd.read_csv(labels_csv).assign(label=1)

# Prepare Data
import numpy as np
from src.preprocessing import prepare_datasets
from sklearn.model_selection import train_test_split

# prepare_datasets returns two image arrays (one fed to the autoencoder, one to
# the classifier) and the labels.
X_raw, X_cls, y = prepare_datasets(train_dir, train_labels_df)

# train_test_split shuffles before splitting, so the labels have to go through
# the same call as the images; otherwise rows and labels stop lining up.
# Adding `y` does not change how X_raw / X_cls are split (same sample count and
# random_state give the same indices).
(X_raw_train, X_raw_val,
 X_cls_train, X_cls_val,
 y_train, y_val) = train_test_split(X_raw, X_cls, y, test_size=0.1, random_state=42)

# The classifier training step slices `y` by position (first
# len(X_cls_train) entries for training, the remainder for validation).
# Re-order `y` train-first so those slices pick up the labels that belong to
# the shuffled images.
y = np.concatenate([y_train, y_val])

# Autoencoder Model
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout, Input, Conv2D, MaxPooling2D, UpSampling2D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# Options shared by every hidden convolution of the autoencoder.
hidden_conv = dict(activation='relu', padding='same')

input_img = Input(shape=(224, 224, 3))

# Encoder: two stages of 3x3 convolution followed by 2x2 max-pooling
# (32 filters, then 64).
x = input_img
for n_filters in (32, 64):
    x = Conv2D(n_filters, (3, 3), **hidden_conv)(x)
    x = MaxPooling2D((2, 2), padding='same')(x)
encoded = x

# Decoder: two stages of 3x3 convolution followed by 2x2 upsampling
# (64 filters, then 32).
for n_filters in (64, 32):
    x = Conv2D(n_filters, (3, 3), **hidden_conv)(x)
    x = UpSampling2D((2, 2))(x)

# Output layer: 3 channels with a sigmoid activation.
decoded = Conv2D(3, (3, 3), activation='sigmoid', padding='same')(x)

autoencoder = Model(input_img, decoded)
autoencoder.compile(optimizer=Adam(1e-4), loss='mse')

# Stop once validation loss has not improved for 10 epochs and roll back to
# the best weights seen.
ae_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# The autoencoder is trained to reproduce its input, so the same array is
# passed as both input and target (for training and for validation).
ae_fit_options = dict(
    epochs=50,
    batch_size=32,
    shuffle=True,
    callbacks=[ae_stop],
    verbose=1,
)
autoencoder.fit(
    X_raw_train,
    X_raw_train,
    validation_data=(X_raw_val, X_raw_val),
    **ae_fit_options,
)

# Calculate Threshold
# numpy is required for the error statistics below; imported here so this
# section runs on its own.
import numpy as np

# Reconstruct the validation images and compute one error value per image:
# the squared difference averaged over axes 1-3.
reconstructed_val = autoencoder.predict(X_raw_val)
mse_val = np.mean(np.square(X_raw_val - reconstructed_val), axis=(1, 2, 3))

# The autoencoder threshold is the 95th percentile of the validation errors.
ae_threshold = np.percentile(mse_val, 95)

# Classifier
from tensorflow.keras.applications import MobileNetV2

# Input shape shared by the pretrained backbone and the classifier input.
cls_input_shape = (224, 224, 3)

# ImageNet-pretrained MobileNetV2 without its top layers, with weights frozen.
base_model = MobileNetV2(
    weights='imagenet',
    include_top=False,
    input_shape=cls_input_shape,
)
base_model.trainable = False

# Head: backbone (called with training=False) -> global average pooling ->
# dropout -> single sigmoid unit.
inputs = Input(shape=cls_input_shape)
x = base_model(inputs, training=False)
x = GlobalAveragePooling2D()(x)
x = Dropout(0.3)(x)
outputs = Dense(1, activation='sigmoid')(x)
classifier = Model(inputs=inputs, outputs=outputs)

classifier.compile(
    optimizer=Adam(1e-4),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Stop once validation loss has not improved for 5 epochs and roll back to
# the best weights seen.
cls_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Labels are taken from `y` by position: the first len(X_cls_train) entries
# go with the training images, the remainder with the validation images.
# NOTE(review): this assumes `y` is ordered train-first, matching how the
# images were split — confirm.
n_cls_train = len(X_cls_train)
y_cls_train, y_cls_val = y[:n_cls_train], y[n_cls_train:]

classifier.fit(
    x=X_cls_train,
    y=y_cls_train,
    validation_data=(X_cls_val, y_cls_val),
    epochs=50,
    batch_size=32,
    callbacks=[cls_stop],
    verbose=1,
)

# Threshold and Evaluation
# numpy is required for the percentile below; imported here so this section
# runs on its own.
import numpy as np

from src.postprocessing import evaluate_combined, save_metrics

# Classifier outputs on the validation images, flattened to one value per image.
val_probs = classifier.predict(X_cls_val).flatten()

# The classifier threshold is the 5th percentile of the validation outputs.
cls_threshold = np.percentile(val_probs, 5)

# Combine the autoencoder errors and classifier outputs with their thresholds;
# the helper returns an F1 score and the combined predictions.
f1_combined, y_pred_combined = evaluate_combined(mse_val, val_probs, ae_threshold, cls_threshold)
print(f"✅ Combined F1 Score: {f1_combined:.4f}")
save_metrics(f1_combined)
