In [0]:
# Cell 1: Install required packages
%pip install --upgrade lightgbm xgboost scikit-learn joblib matplotlib

In [0]:
# Cell 2: Import libraries and setup
import os
import time
import logging
import joblib
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.inspection import permutation_importance
from sklearn.metrics import accuracy_score, f1_score
import matplotlib.pyplot as plt
import xgboost as xgb
import lightgbm as lgb

In [0]:
# Cell 3: Utility functions
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score
import lightgbm as lgb
import joblib
import os
import numpy as np
from sklearn.model_selection import train_test_split
import logging

def setup_logging(pathC):
    """Send root-logger output (INFO and above) to ``training.log`` in ``pathC``.

    Args:
        pathC: Directory that will hold the log file. It is created if missing.

    Returns:
        str: Full path of the log file.
    """
    # The file handler cannot create missing parent directories on its own.
    os.makedirs(pathC, exist_ok=True)
    log_file = os.path.join(pathC, 'training.log')
    # force=True: without it basicConfig() is a silent no-op whenever the root
    # logger already has handlers (e.g. an earlier run of this cell, or a host
    # environment that pre-configured logging), so nothing would reach
    # training.log and a changed pathC would be ignored.
    logging.basicConfig(
        filename=log_file,
        level=logging.INFO,
        format='%(asctime)s %(levelname)s:%(message)s',
        force=True,
    )
    return log_file

def get_memory_usage():
    """Return the currently used memory as text, e.g. ``"3.42 GB"``.

    The figure is ``psutil.virtual_memory().used`` divided by 1e9 (decimal
    gigabytes) and formatted with two decimals. psutil is imported lazily, so
    it is only required when this helper is actually called.
    """
    from psutil import virtual_memory

    used_gb = virtual_memory().used / 1e9
    return "{:.2f} GB".format(used_gb)

def convert(seconds):
    """Format a duration in seconds as ``"<hours>h <minutes>m <seconds>s"``.

    Args:
        seconds: Duration as an int or float. Fractional parts are truncated
            by ``int()`` when the components are rendered.

    Returns:
        str: For example ``"1h 1m 1s"`` for 3661.
    """
    total_minutes, secs = divmod(seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    components = (hours, minutes, secs)
    return "{}h {}m {}s".format(*(int(part) for part in components))

def compute_class_weights(y):
    """Return one 'balanced' weight per sample in ``y``.

    Despite the name, the result is per-sample (same length as ``y``), as
    produced by scikit-learn's ``compute_sample_weight('balanced', y)``.

    Args:
        y: Array-like of class labels.

    Returns:
        numpy.ndarray: Weight for each entry of ``y``.
    """
    from sklearn.utils import class_weight as _class_weight

    per_sample_weights = _class_weight.compute_sample_weight(class_weight='balanced', y=y)
    return per_sample_weights

def split_data(df):
    """Split ``df`` into train / validation / test parts, stratified on 'target'.

    30% of the rows are held out first, then that hold-out is halved, which
    gives roughly 70% / 15% / 15%. Both splits use random_state=42.

    Args:
        df: DataFrame containing a 'target' column.

    Returns:
        tuple: ``(train, validate, test)`` DataFrames.
    """
    seed = 42
    train_part, holdout = train_test_split(
        df,
        test_size=0.3,
        stratify=df['target'],
        random_state=seed,
    )
    validate_part, test_part = train_test_split(
        holdout,
        test_size=0.5,
        stratify=holdout['target'],
        random_state=seed,
    )
    return train_part, validate_part, test_part

def train_and_evaluate(X_train, y_train, X_valid, y_valid, X_test, y_test, pathC, model_name, report_name, selected_features, selected_importances):
    """Cross-validate, fit, evaluate and persist a LightGBM classifier.

    Steps:
      1. 5-fold stratified cross-validation on the training data (accuracy and
         weighted F1) with a 300-tree model.
      2. Fit of a final model (up to 1000 trees) with early stopping monitored
         on the validation set.
      3. Confusion matrix and classification report for train / validation /
         test, printed and written to the report file.
      4. The fitted model is dumped with joblib to ``pathC/model_name``.

    Args:
        X_train, y_train: Training features and labels.
        X_valid, y_valid: Validation features and labels (early stopping + evaluation).
        X_test, y_test: Test features and labels (evaluation only).
        pathC: Output directory for the model and report files.
        model_name: File name for the joblib-dumped model.
        report_name: File name for the plain-text report.
        selected_features: Feature names, recorded in the report.
        selected_importances: Importance value per feature, recorded in the report.

    Returns:
        dict: ``'features'`` and ``'importances'`` (plain Python lists) followed
        by the CV mean/std metrics and the train/valid/test accuracy and
        weighted-F1 scores.
    """
    from sklearn.model_selection import cross_validate

    # Cross-validation for accuracy and f1 (no callbacks, fresh model each fold).
    # A single multi-metric pass scores both metrics on the same fitted fold
    # models, instead of refitting all five folds once per metric.
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    base_estimator = lgb.LGBMClassifier(
        random_state=42,
        reg_alpha=1.0,
        reg_lambda=1.0,
        n_estimators=300
    )
    cv_results = cross_validate(
        base_estimator, X_train, y_train, cv=skf,
        scoring=['accuracy', 'f1_weighted'], n_jobs=1
    )
    acc_scores = cv_results['test_accuracy']
    f1_scores = cv_results['test_f1_weighted']

    # Final model fit with early stopping on the validation set
    model = lgb.LGBMClassifier(
        random_state=42,
        reg_alpha=1.0,
        reg_lambda=1.0,
        n_estimators=1000
    )
    model.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        eval_metric='multi_logloss',
        callbacks=[lgb.early_stopping(10)]
    )

    # Evaluate on train, valid, test: (report label, metric key prefix, truth, prediction)
    splits = [
        ('Training', 'train', y_train, model.predict(X_train)),
        ('Validation', 'valid', y_valid, model.predict(X_valid)),
        ('Testing', 'test', y_test, model.predict(X_test)),
    ]
    sections = []   # (label, confusion matrix, classification report) for the report file
    accuracies = {}
    f1_values = {}
    for label, key, y_true, y_pred in splits:
        cm = confusion_matrix(y_true, y_pred)
        cr = classification_report(y_true, y_pred)
        print(f"\n{label} Confusion Matrix:")
        print(cm)
        print(f"{label} Classification Report:")
        print(cr)
        sections.append((label, cm, cr))
        accuracies[f'{key}_accuracy'] = accuracy_score(y_true, y_pred)
        f1_values[f'{key}_f1'] = f1_score(y_true, y_pred, average='weighted')

    # Key order matters: it defines the column order of the results table
    # built by the caller from these dicts.
    metrics = {
        'cv_train_accuracy_mean': np.mean(acc_scores),
        'cv_train_accuracy_std': np.std(acc_scores),
        'cv_train_f1_mean': np.mean(f1_scores),
        'cv_train_f1_std': np.std(f1_scores),
        **accuracies,
        **f1_values
    }
    report = {
        # .tolist() yields plain str/float elements; list(ndarray) keeps NumPy
        # scalars, which NumPy >= 2 renders as "np.str_('x')" / "np.float64(...)"
        # when the list is written to the report or a CSV.
        'features': np.asarray(selected_features).tolist(),
        'importances': np.asarray(selected_importances).tolist(),
        **metrics
    }
    joblib.dump(model, os.path.join(pathC, model_name))
    # Write metrics, confusion matrices, and classification reports to the report file
    with open(os.path.join(pathC, report_name), 'w') as f:
        f.write("Model Report\n")
        f.write("==============\n")
        for k, v in report.items():
            f.write(f"{k}: {v}\n")
        for label, cm, cr in sections:
            f.write(f"\n{label} Confusion Matrix:\n")
            f.write(str(cm) + "\n")
            f.write(f"{label} Classification Report:\n")
            f.write(cr + "\n")
    return report

In [0]:
# Cell 4: Main pipeline
if __name__ == "__main__":
    # End-to-end pipeline: load and clean the labelled CSV, split and scale it,
    # rank features by permutation importance of a tuned XGBoost model, then
    # train/evaluate one LightGBM model per "top-n features" subset and save
    # the resulting tables and accuracy plot into pathC.
    start = time.time()
    # Both locations can be overridden through environment variables.
    pathC = os.environ.get('CLASSIFICATION_DIR', '/dbfs/mnt/lab/unrestricted/KritiM/classification/')
    training_file = os.environ.get('TRAINING_FILE', os.path.join(pathC, 'trainingSample.csv'))
    os.makedirs(pathC, exist_ok=True)
    log_file = setup_logging(pathC)
    logging.info(f"Starting training script - Output directory: {pathC}")
    import sklearn
    logging.info(f"Python packages: lightgbm {lgb.__version__}, sklearn {sklearn.__version__}, pandas {pd.__version__}")
    logging.info(f"Initial memory usage: {get_memory_usage()}")
    logging.info('Loading the labelled data...')
    try:
        df = pd.read_csv(training_file)
        logging.info(f"Successfully loaded data from {training_file}")
        logging.info(f"Initial dataframe shape: {df.shape}")
        logging.info(f"Memory usage after load: {get_memory_usage()}")
        df = df.drop_duplicates()
        df = df.dropna()
        logging.info(f"Shape after cleaning: {df.shape}")
    except Exception as e:
        # Record the failure in the log file, then let it propagate.
        logging.error(f"Error loading/cleaning data: {str(e)}")
        raise
    # Shift labels down by one (presumably the CSV uses 1-based class ids,
    # while the classifiers below expect 0-based ones - TODO confirm).
    df['target'] = df['target'].astype(int) - 1
    print("Class distribution:")
    print(df['target'].value_counts(normalize=True))
    print('assign categorical and numerical columns...')
    categorical_cols = ['Landcover_LE', 'Profile_depth', 'CaCO3_rank', 'Texture_group',
                        'Aggregate_texture', 'Aquifers', 'bedrock_raster_50m', 'ALC_old']
    for col in categorical_cols:
        if col in df.columns:
            df[col] = df[col].astype('category')
    # Downcast the remaining numeric feature columns to the smallest float dtype.
    for col in df.select_dtypes(include='number').columns:
        if col != 'target':
            df[col] = pd.to_numeric(df[col], downcast='float')
    print(f"Shape of training data: {df.shape}")
    print(f"Unique target values: {np.unique(df['target'])}")
    train, validate, test = split_data(df)
    X_train = train.drop('target', axis=1)
    y_train = train['target']
    X_valid = validate.drop('target', axis=1)
    y_validate = validate['target']
    X_test = test.drop('target', axis=1)
    y_test = test['target']
    # Standardise every non-categorical column; the scaler is fitted on the
    # training split only and then applied to validation and test.
    num_cols = [col for col in X_train.columns if col not in categorical_cols]
    scaler = StandardScaler()
    X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
    X_valid[num_cols] = scaler.transform(X_valid[num_cols])
    X_test[num_cols] = scaler.transform(X_test[num_cols])
    joblib.dump(scaler, os.path.join(pathC, 'scaler.joblib'))
    # --- PERMUTATION FEATURE IMPORTANCE FEATURE SELECTION ---
    initial_model = xgb.XGBClassifier(
        random_state=42, enable_categorical=True, objective='multi:softmax',
        num_class=len(np.unique(y_train)), eval_metric='mlogloss', tree_method='hist'
    )
    param_grid = {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 4, 5]
    }
    # Set n_jobs=1 to avoid BrokenProcessPool error on Databricks/Azure
    random_search = RandomizedSearchCV(
        initial_model, param_grid, n_iter=20, cv=3, scoring='f1_weighted',
        n_jobs=1, random_state=42
    )
    random_search.fit(X_train, y_train, sample_weight=compute_class_weights(y_train))
    initial_model = random_search.best_estimator_
    # Importance is measured on the validation split, not on the training data.
    perm_importance = permutation_importance(initial_model, X_valid, y_validate, n_repeats=3, random_state=42, n_jobs=1)
    feature_importance_df = pd.DataFrame({
        'Feature': X_train.columns,
        'Importance': perm_importance.importances_mean
    }).sort_values(by='Importance', ascending=False)
    # Evaluate the top-n features for n = 3..14, capped at the number of features
    # that actually exist. head(n) silently returns fewer rows when n is too
    # large, which would retrain identical models and store them under a
    # misleading "n features" name; with fewer than 3 features the loop would
    # not run at all and the plotting code below would fail on an empty table.
    n_available = len(feature_importance_df)
    max_features = min(14, n_available)
    min_features = min(3, max_features)
    if max_features < 14:
        logging.warning(f"Only {n_available} features available; evaluating top {min_features} to {max_features} features")
    feature_counts = range(min_features, max_features + 1)
    results = []
    metrics_results = []  # For metrics_vs_features.csv
    for n in feature_counts:
        print(f"\nTraining with top {n} features...")
        selected_features = feature_importance_df['Feature'].head(n).values
        selected_importances = feature_importance_df['Importance'].head(n).values
        X_train_n = X_train[selected_features]
        X_valid_n = X_valid[selected_features]
        X_test_n = X_test[selected_features]
        model_name = f'model_{n}_features.joblib'
        report_name = f'report_{n}_features.txt'
        print(f"\nTraining and evaluating with top {n} features: {list(selected_features)}")
        res = train_and_evaluate(
            X_train_n, y_train, X_valid_n, y_validate, X_test_n, y_test, pathC,
            model_name, report_name, selected_features, selected_importances
        )
        results.append(res)
        metrics_results.append({
            'num_features': n,
            'train_accuracy': res['train_accuracy'],
            'valid_accuracy': res['valid_accuracy'],
            'test_accuracy': res['test_accuracy'],
            'train_f1': res['train_f1'],
            'valid_f1': res['valid_f1'],
            'test_f1': res['test_f1']
        })
    results_df = pd.DataFrame(results)
    results_df.to_csv(os.path.join(pathC, 'feature_selection_results.csv'), index=False)
    print(f"\nResults table saved to {os.path.join(pathC, 'feature_selection_results.csv')}")
    # Save metrics_vs_features.csv
    metrics_df = pd.DataFrame(metrics_results)
    metrics_csv_path = os.path.join(pathC, 'metrics_vs_features.csv')
    metrics_df.to_csv(metrics_csv_path, index=False)
    print(f"Metrics table saved to {metrics_csv_path}")
    # Plot accuracy vs. number of features
    plt.figure(figsize=(8, 5))
    plt.plot(metrics_df['num_features'], metrics_df['train_accuracy'], label='Train Accuracy')
    plt.plot(metrics_df['num_features'], metrics_df['valid_accuracy'], label='Validation Accuracy')
    plt.plot(metrics_df['num_features'], metrics_df['test_accuracy'], label='Test Accuracy')
    plt.xlabel('Number of Features')
    plt.ylabel('Accuracy')
    plt.title('Accuracy vs. Number of Features')
    plt.legend()
    plt.grid(True)
    plot_path = os.path.join(pathC, 'accuracy_vs_features.png')
    plt.savefig(plot_path, dpi=150, bbox_inches='tight')
    # The figure is only saved to disk, so release it right away.
    plt.close()
    print(f"Accuracy plot saved to {plot_path}")
    end = time.time()
    time_taken = convert(end-start)
    logging.info(f"Total processing time: {time_taken}")
    logging.info(f"Final memory usage: {get_memory_usage()}")
    logging.info(f"Log file saved to: {log_file}")
    print(f"\nScript completed successfully. Check the log file at: {log_file}")