In [0]:
import os

# Location of the training data. Both values can be overridden through the
# CLASSIFICATION_DIR / TRAINING_FILE environment variables.
# For the Spark DataFrame API and dbutils, use 'dbfs:/...' (no extra '/dbfs' prefix).
pathC = os.environ.get('CLASSIFICATION_DIR', 'dbfs:/mnt/lab/unrestricted/KritiM/classification/')
training_file = os.environ.get('TRAINING_FILE', f"{pathC.rstrip('/')}/trainingSample.csv")

# Fail fast with a readable message when the file cannot be listed in DBFS.
# The original exception is chained explicitly (`from e`) so the traceback
# shows the underlying error as the direct cause of the FileNotFoundError.
try:
    dbutils.fs.ls(training_file)
except Exception as e:
    raise FileNotFoundError(
        f"File not found at {training_file}. Please check the path and ensure the file exists."
    ) from e

# Read the CSV using Spark: first row is the header, column types are inferred.
# Note: For pandas or local file APIs, use '/dbfs/mnt/...' instead
df = spark.read.csv(training_file, header=True, inferSchema=True)
display(df)

In [0]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
import numpy as np

# Convert the Spark DataFrame to a pandas DataFrame for scikit-learn preprocessing.
pdf = df.toPandas()

# 1) Display schema and check for missing values.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" to the output.
pdf.info()
print("Missing values per column:\n", pdf.isnull().sum())

# 2) Handle missing values (drop rows with any missing values for simplicity)
pdf = pdf.dropna()

# 3) Decide which columns are categorical, which one is the label, and which are numerical.

# Candidate categorical features; only those actually present in the data are kept.
categorical_cols = [
    col
    for col in (
        'Landcover_LE', 'Profile_depth', 'CaCO3_rank', 'Texture_group',
        'Aggregate_texture', 'Aquifers', 'bedrock_raster_50m', 'ALC_old',
    )
    if col in pdf.columns
]

# --- Label column detection ---
# The first conventional label name found in the data wins; if none is present
# the last column is used and the user is told how to override the choice.
possible_label_names = ['label', 'target', 'class', 'Label', 'Target', 'Class']
for col in possible_label_names:
    if col in pdf.columns:
        label_col = col
        print(f"Using '{label_col}' as the label column.")
        break
else:
    # Reached only when the loop finished without a match.
    label_col = pdf.columns[-1]
    print(f"No standard label column found. Using last column '{label_col}' as the label column.")
    print(f"All columns: {list(pdf.columns)}")
    print("If this is not correct, please update 'label_col' in the code.")

# Numerical features: numeric-dtype columns that are neither categorical nor the label.
numeric_dtype_cols = pdf.select_dtypes(include=[np.number]).columns
numerical_cols = [
    col
    for col in numeric_dtype_cols
    if not (col in categorical_cols or col == label_col)
]

# 4) Separate the label column from the feature columns.
y = pdf[label_col]
X = pdf.drop(label_col, axis=1)

# 5) Stratified split: hold out 30% of the rows, then divide that hold-out
#    evenly into validation and test sets.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y,
    test_size=0.3, random_state=42, stratify=y,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.5, random_state=42, stratify=y_temp,
)

# --- Label encoding ---
# Fit on the labels of all three splits, then replace each split's labels
# with their integer codes.
le = LabelEncoder()
le.fit(pd.concat([y_train, y_val, y_test]).tolist())
y_train, y_val, y_test = (le.transform(part) for part in (y_train, y_val, y_test))

print("Label encoding mapping:")
for idx, class_ in enumerate(le.classes_):
    print(f"{class_} -> {idx}")

# 6) Scale numerical features and one-hot encode categorical features AFTER splitting.
# Both transformers are fitted on the training split only and then applied to
# the validation and test splits.
scaler = StandardScaler()
if numerical_cols:
    X_train_num = scaler.fit_transform(X_train[numerical_cols])
    X_val_num = scaler.transform(X_val[numerical_cols])
    X_test_num = scaler.transform(X_test[numerical_cols])
else:
    # No numerical features: zero-width placeholders keep the hstack below valid.
    X_train_num = np.empty((X_train.shape[0], 0))
    X_val_num = np.empty((X_val.shape[0], 0))
    X_test_num = np.empty((X_test.shape[0], 0))

if categorical_cols:
    # scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to
    # `sparse_output`, and 1.4 removed the old name. Try the current name
    # first and fall back to the legacy one so the cell runs on both.
    try:
        encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    except TypeError:
        encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
    X_train_cat = encoder.fit_transform(X_train[categorical_cols])
    X_val_cat = encoder.transform(X_val[categorical_cols])
    X_test_cat = encoder.transform(X_test[categorical_cols])
else:
    # No categorical features: zero-width placeholders, as above.
    X_train_cat = np.empty((X_train.shape[0], 0))
    X_val_cat = np.empty((X_val.shape[0], 0))
    X_test_cat = np.empty((X_test.shape[0], 0))

# Concatenate numerical and categorical features (numerical columns first).
X_train_final = np.hstack([X_train_num, X_train_cat])
X_val_final = np.hstack([X_val_num, X_val_cat])
X_test_final = np.hstack([X_test_num, X_test_cat])

# 7) Show basic stats and confirm shapes match
def print_shapes_and_counts(X, y, name):
    """Print the shapes of a split and how often each label occurs in it.

    Args:
        X: Feature matrix of the split; must expose ``.shape``.
        y: Labels of the split; must expose ``.shape`` and be accepted by ``pd.Series``.
        name: Human-readable split name used in the printed lines.

    Returns:
        None. Everything is written to stdout.
    """
    print(f"{name} shape: {X.shape}, y shape: {y.shape}")
    print(f"Label distribution in {name}:")
    label_counts = pd.Series(y).value_counts()
    print(label_counts)

# Report shapes and label distribution for every split, in train/val/test order.
for split_name, split_X, split_y in (
    ("Train", X_train_final, y_train),
    ("Validation", X_val_final, y_val),
    ("Test", X_test_final, y_test),
):
    print_shapes_and_counts(split_X, split_y, split_name)

# Guard against row-count mismatches between features and labels before training.
assert len(y_train) == X_train_final.shape[0], (
    f"Train features/labels mismatch: {X_train_final.shape[0]} vs {len(y_train)}"
)
assert len(y_val) == X_val_final.shape[0], (
    f"Validation features/labels mismatch: {X_val_final.shape[0]} vs {len(y_val)}"
)
assert len(y_test) == X_test_final.shape[0], (
    f"Test features/labels mismatch: {X_test_final.shape[0]} vs {len(y_test)}"
)

In [0]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, f1_score
from sklearn.utils.class_weight import compute_class_weight
from tensorflow import keras
from tensorflow.keras import layers, regularizers, callbacks

# Use preprocessed data from Cell 2
# X_train_final, X_val_final, X_test_final, y_train, y_val, y_test must be defined in previous cell

# Determine number of features and classes
def get_num_classes(y):
    """Return how many distinct values occur in ``y``.

    Args:
        y: Array-like of labels (anything ``np.unique`` accepts).

    Returns:
        int: Number of unique values found.
    """
    return np.unique(y).size

# Input width and number of target classes, both taken from the training split.
num_features = X_train_final.shape[1]
num_classes = get_num_classes(y_train)

# Print class distribution
print("Class distribution in training set:")
unique, counts = np.unique(y_train, return_counts=True)
for u, c in zip(unique, counts):
    print(f"Class {u}: {c} samples")

# Use class weights only when the largest class is more than 1.5x the smallest.
if np.max(counts) / np.min(counts) > 1.5:
    class_weights = compute_class_weight('balanced', classes=unique, y=y_train)
    # Key each weight by the class label it was computed for (weights come back
    # in the order of `classes`). Keying by enumerate() position is only
    # correct when the training labels happen to be exactly 0..k-1.
    # Plain int/float values also keep the printed dict readable.
    class_weight_dict = {int(cls): float(w) for cls, w in zip(unique, class_weights)}
    print("Using class weights:", class_weight_dict)
else:
    class_weight_dict = None
    print("Class distribution is reasonably balanced; not using class weights.")

# Output layer configuration.
#   * More than two classes: one softmax unit per class, matching the
#     sparse_categorical_crossentropy loss and argmax decoding used in this cell.
#   * Otherwise (binary): a single sigmoid unit. The binary_crossentropy loss
#     and the `(pred > 0.5).flatten()` decoding used in this cell expect one
#     probability per sample; a num_classes-wide sigmoid layer would not match
#     the (n,) integer labels and would yield num_classes predictions per sample.
is_multiclass = num_classes > 2
output_units = num_classes if is_multiclass else 1
output_activation = 'softmax' if is_multiclass else 'sigmoid'

# Fully connected network: 128 -> 64 -> 32 hidden ReLU units, each with
# L2(0.001) kernel regularization and followed by 30% dropout.
model = keras.Sequential([
    layers.Dense(128, activation='relu', input_shape=(num_features,),
                 kernel_regularizer=regularizers.l2(0.001)),
    layers.Dropout(0.3),
    layers.Dense(64, activation='relu', kernel_regularizer=regularizers.l2(0.001)),
    layers.Dropout(0.3),
    layers.Dense(32, activation='relu', kernel_regularizer=regularizers.l2(0.001)),
    layers.Dropout(0.3),
    layers.Dense(output_units, activation=output_activation)
])

# Print model summary
model.summary()

# Adam optimizer with an explicit learning rate of 0.001.
optimizer = keras.optimizers.Adam(learning_rate=0.001)

# Loss depends on the task: integer-label cross-entropy for more than two
# classes, binary cross-entropy otherwise.
if num_classes > 2:
    loss_name = 'sparse_categorical_crossentropy'
else:
    loss_name = 'binary_crossentropy'

model.compile(optimizer=optimizer, loss=loss_name, metrics=['accuracy'])

# Stop after 10 epochs without val_loss improvement and restore the best weights.
early_stop = callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)
# Halve the learning rate after 5 stagnant epochs, never going below 1e-5.
lr_scheduler = callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=5,
    min_lr=1e-5,
    verbose=1,
)

# Train for up to 200 epochs, validating on the validation split each epoch.
fit_options = dict(
    epochs=200,
    batch_size=32,
    validation_data=(X_val_final, y_val),
    callbacks=[early_stop, lr_scheduler],
    verbose=2,
    class_weight=class_weight_dict,
)
history = model.fit(X_train_final, y_train, **fit_options)

# Loss and accuracy on each split.
train_loss, train_accuracy = model.evaluate(X_train_final, y_train, verbose=0)
val_loss, val_accuracy = model.evaluate(X_val_final, y_val, verbose=0)
test_loss, test_accuracy = model.evaluate(X_test_final, y_test, verbose=0)

# Raw model outputs for each split (needed for the F1 scores below).
train_pred = model.predict(X_train_final)
val_pred = model.predict(X_val_final)
test_pred = model.predict(X_test_final)


def _predicted_classes(model_output):
    """Turn raw model outputs into class indices.

    More than two classes: index of the largest output per row.
    Otherwise: threshold the outputs at 0.5 and flatten to 1-D.
    """
    if num_classes > 2:
        return np.argmax(model_output, axis=1)
    return (model_output > 0.5).astype(int).flatten()


train_pred_classes = _predicted_classes(train_pred)
val_pred_classes = _predicted_classes(val_pred)
test_pred_classes = _predicted_classes(test_pred)

# Support-weighted F1 per split.
train_f1 = f1_score(y_train, train_pred_classes, average='weighted')
val_f1 = f1_score(y_val, val_pred_classes, average='weighted')
test_f1 = f1_score(y_test, test_pred_classes, average='weighted')

# Summary of accuracy and F1 for every split.
print(f'Train Accuracy: {train_accuracy:.4f}, Train F1 Score: {train_f1:.4f}')
print(f'Validation Accuracy: {val_accuracy:.4f}, Validation F1 Score: {val_f1:.4f}')
print(f'Test Accuracy: {test_accuracy:.4f}, Test F1 Score: {test_f1:.4f}')

# Training curves: loss in the left panel, accuracy in the right panel.
# Each entry: (subplot position, train history key, validation history key,
#              train legend label, validation legend label, title, y-axis label)
curve_specs = [
    (1, 'loss', 'val_loss', 'Train Loss', 'Validation Loss', 'Loss Curve', 'Loss'),
    (2, 'accuracy', 'val_accuracy', 'Train Accuracy', 'Validation Accuracy', 'Accuracy Curve', 'Accuracy'),
]

plt.figure(figsize=(12, 5))

for position, train_key, val_key, train_label, val_label, title, ylabel in curve_specs:
    plt.subplot(1, 2, position)
    plt.plot(history.history[train_key], label=train_label)
    plt.plot(history.history[val_key], label=val_label)
    plt.title(title)
    plt.xlabel('Epochs')
    plt.ylabel(ylabel)
    plt.legend()

plt.show()