In [4]:
import pandas as pd
import numpy as np

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score

# ========== 1. Load Data ==========
# Read the measurements and discard every row that is missing either isotope
# value (d18O, d13C) or its MARBLE GROUP label, so that no NaN reaches the
# feature matrix or the label encoder.
df = pd.read_csv('dataset.csv').dropna(subset=['d18O', 'd13C', 'MARBLE GROUP'])

# ========== 2. Encode Labels ==========
# Learn the mapping from MARBLE GROUP names to integer class ids, then store
# the ids in a new 'Label' column. The fitted encoder is kept so ids can be
# translated back into group names later.
label_encoder = LabelEncoder().fit(df['MARBLE GROUP'])
df['Label'] = label_encoder.transform(df['MARBLE GROUP'])

# ========== 3. Prepare Features & Targets ==========
# X holds one row per sample with the columns listed in `features`;
# y holds the matching integer class ids.
features = ['d18O', 'd13C']
X = df[features].to_numpy()
y = df['Label'].to_numpy()

# ========== 4. Train–Test Split & Scaling ==========
# Hold out 20% of the samples for testing. Stratifying on y keeps the class
# proportions comparable in both partitions, and the fixed seed makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Learn the standardisation from the training rows only, then apply that
# same transform to both partitions so the test rows never influence it.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ========== 5. Hyperparameter Tuning with GridSearchCV ==========

# Candidate settings for the RBF-kernel SVM. Adjust the ranges for C and
# gamma (and possibly other parameters) based on domain knowledge or
# empirical testing.
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [0.01, 0.1, 1],
    'kernel': ['rbf'],
    'class_weight': [None, 'balanced'],
}

# The search below scores candidates with accuracy, which only needs
# `predict`. Probability estimates are therefore left off during the search:
# with probability=True every cross-validation fit would additionally run
# SVC's internal 5-fold calibration, multiplying the search time without
# changing any score.
svc = SVC(random_state=42)

# NOTE(review): X_train_scaled was standardised with statistics computed on
# the whole training split, so the validation folds used here are not fully
# unseen by the scaler. Wrapping the scaler and the SVC in a Pipeline would
# remove this (usually small) optimistic bias from the CV score.
grid_search = GridSearchCV(
    svc,
    param_grid,
    cv=5,               # 5-fold cross-validation
    scoring='accuracy', # or 'f1_macro', 'precision_macro', etc.
    n_jobs=-1,          # use all available CPU cores
    verbose=1,          # to see progress logs
)

grid_search.fit(X_train_scaled, y_train)

# Fit the winning configuration once more with probability=True so that
# predict_proba is available for the top-3 evaluation. The constructor
# arguments and training data match what the search's own refit would have
# used with probability enabled.
best_svc = SVC(probability=True, random_state=42, **grid_search.best_params_)
best_svc.fit(X_train_scaled, y_train)
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best CV Score: {grid_search.best_score_:.4f}")

# ========== 6. Evaluate on Test Set (Top-1 & Top-3) ==========

# --- Top-1 Accuracy ---
y_pred = best_svc.predict(X_test_scaled)
top1_accuracy = accuracy_score(y_test, y_pred)

# --- Top-3 Accuracy ---
proba = best_svc.predict_proba(X_test_scaled)  # shape: [n_samples, n_classes]

# Column j of `proba` belongs to class best_svc.classes_[j], so column
# positions are translated to class labels before comparing with y_test.
# Comparing raw column indices is only correct when classes_ happens to be
# exactly 0..n_classes-1.
top3_columns = np.argsort(proba, axis=1)[:, -3:]
top3_classes = best_svc.classes_[top3_columns]

# A sample counts as a hit when its true label is among its three most
# probable classes.
top3_hits = (top3_classes == np.asarray(y_test)[:, None]).any(axis=1)

num_samples = len(y_test)
correct_top3 = int(top3_hits.sum())
top3_accuracy = correct_top3 / num_samples

print(f"\nOptimized SVM Test Top-1 Accuracy: {top1_accuracy * 100:.2f}%")
print(f"Optimized SVM Test Top-3 Accuracy: {top3_accuracy * 100:.2f}%")

# ========== 7. Predict on New Samples ==========

# Example new samples, one row per sample: (d18O, d13C)
new_samples = np.array([
    [-2.63, 3.44],
    [-1.53, 3.83],
])

# New data must go through the scaler that was fitted on the training rows.
new_samples_scaled = scaler.transform(new_samples)

predicted_proba = best_svc.predict_proba(new_samples_scaled)
predicted_labels_top1 = best_svc.predict(new_samples_scaled)
predicted_marbles_top1 = label_encoder.inverse_transform(predicted_labels_top1)

print("\nPredictions for New Samples (Top-1):")
for sample, label in zip(new_samples, predicted_marbles_top1):
    print(f"d18O={sample[0]}, d13C={sample[1]} => {label}")

# Show the three most probable marble groups for each new sample, most
# probable first.
# NOTE: scikit-learn documents that SVC.predict and SVC.predict_proba may
# disagree, so the Top-1 label printed above is not guaranteed to be the
# first entry of the list below.
for sample, sample_proba in zip(new_samples, predicted_proba):
    top3_indices = np.argsort(sample_proba)[::-1][:3]
    top3_probs = sample_proba[top3_indices]
    # Probability columns follow best_svc.classes_, so column positions are
    # mapped to encoded labels before decoding them into group names.
    top3_labels = label_encoder.inverse_transform(best_svc.classes_[top3_indices])
    print(f"\nFor sample {sample}:")
    for lbl, prob in zip(top3_labels, top3_probs):
        print(f"  {lbl}: {prob * 100:.2f}%")

Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best Parameters: {'C': 10, 'class_weight': None, 'gamma': 1, 'kernel': 'rbf'}
Best CV Score: 0.5041

Optimized SVM Test Top-1 Accuracy: 49.59%
Optimized SVM Test Top-3 Accuracy: 73.44%

Predictions for New Samples (Top-1):
d18O=-2.63, d13C=3.44 => Pro
d18O=-1.53, d13C=3.83 => Pro

For sample [-2.63  3.44]:
  Pro: 51.06%
  Gok3: 10.61%
  MarathiL: 8.93%

For sample [-1.53  3.83]:
  Pro: 46.99%
  Thasos: 10.21%
  Eph2: 5.09%
