In [3]:
import numpy as np
import pandas as pd
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.impute import SimpleImputer  # Only needed if imputing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# ---------------------------------------------
# Data loading
# Point read_csv at your own file. The script below reads the columns
# 'd18O', 'd13C' (features) and 'MARBLE GROUP' (class label) from it.
# ---------------------------------------------
db = pd.read_csv('dataset.csv')

# 1. Handling Missing Values

# Option 1 (active): discard every row that lacks either isotope value or the
# class label. Other columns are allowed to keep their NaNs.
required_columns = ['d18O', 'd13C', 'MARBLE GROUP']
db_clean = db.dropna(subset=required_columns)

# Option 2 (alternative, uncomment to use): fill missing isotope values instead
# of dropping those rows; rows without a class label are still removed.
# NOTE: as written, the imputer is fitted on the whole table before the
# train/test split further down, so test-set statistics leak into the fill value.
# imputer = SimpleImputer(strategy='mean')  # 'median' or 'most_frequent' also work
# features_to_impute = ['d18O', 'd13C']
# db[features_to_impute] = imputer.fit_transform(db[features_to_impute])
# db_clean = db.dropna(subset=['MARBLE GROUP'])

# Report the remaining NaN count of every column. The three required columns
# must read 0; columns outside the subset may still show missing values.
print("After cleaning:")
print(db_clean.isna().sum())

# 2. Prepare the Data for LDA

# Single query point to classify at the end of the script. Its values follow
# the column order of X: i1 is the d18O value, i2 the d13C value.
i1, i2 = -3.77, 3.55  # Replace these with your actual test input values
test_values = [[i1, i2]]

# Feature matrix X (one row per sample; columns d18O, d13C) and label vector y.
# to_numpy() is used instead of .values because .values may hand back a pandas
# ExtensionArray for some column dtypes, whereas to_numpy() always yields a
# NumPy ndarray. The explicit float dtype makes a non-numeric isotope value
# fail here, at the conversion, rather than later inside the model.
X = db_clean[["d18O", "d13C"]].to_numpy(dtype=float)
y = db_clean["MARBLE GROUP"].to_numpy()

# 3. Hold out part of the data for evaluation
split_options = {
    "test_size": 0.2,    # fraction of the samples placed in the test set; adjust as needed
    "random_state": 42,  # fixed seed so every run produces the same partition
}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# 4. Train the LDA classifier on the training partition
#    (fit() returns the fitted estimator, so construction and fitting are chained).
clf = LinearDiscriminantAnalysis().fit(X_train, y_train)

# 5. Standard Accuracy (Exact Match)
#    Share of held-out samples whose predicted class equals the true class.
y_pred = clf.predict(X_test)
exact_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Exact Match Accuracy on Test Set: {exact_accuracy:.4f}")

# 6. Top-3 Accuracy
#    A test sample is a "hit" when its true class is among the three classes
#    that receive the highest predicted probability for that sample.
pred_probs = clf.predict_proba(X_test)   # Shape: [n_samples, n_classes]
classes = clf.classes_                   # Labels used to name the columns of pred_probs

# Rank all rows in one call: argsort is ascending, so each row is reversed and
# its first three entries kept (fewer if the model has fewer than three classes).
top_3_indices = np.argsort(pred_probs, axis=1)[:, ::-1][:, :3]
top_3_classes = classes[top_3_indices]   # Shape: [n_samples, min(3, n_classes)]

# Broadcast each sample's true label against its row of candidate labels.
# np.asarray guards against y_test not being a plain ndarray.
true_labels = np.asarray(y_test)[:, np.newaxis]
is_hit = (top_3_classes == true_labels).any(axis=1)
top_3_hits = int(is_hit.sum())

top_3_accuracy = top_3_hits / len(X_test)
print(f"Top-3 Accuracy on Test Set: {top_3_accuracy:.4f}")

# 7. Classify the single query point (i1, i2)
predicted_class = clf.predict(test_values)[0]
probabilities = clf.predict_proba(test_values)[0]

# Order the classes from most to least probable and keep the leading three
ranking = np.flip(np.argsort(probabilities))
top_indices = ranking[:3]
top_classes = clf.classes_[top_indices]
top_probabilities = probabilities[top_indices]

# Report the winning class, then the three best candidates as percentages
print("\nSingle Prediction for [i1, i2]:")
print("--------------------------------")
print("Predicted class:", predicted_class, "\n")
print("Top Three Probabilities:")
print("------------------------")
for label, share in zip(top_classes, top_probabilities):
    print(f"{label} : {share * 100:.2f}%")


After cleaning:
Sample             0
SOURCE            51
SITE               0
MARBLE GROUP       0
SELECT          1841
MGS                5
d18O               0
d13C               0
dtype: int64
Exact Match Accuracy on Test Set: 0.4146
Top-3 Accuracy on Test Set: 0.6721

Single Prediction for [i1, i2]:
--------------------------------
Predicted class: Eph2 

Top Three Probabilities:
------------------------
Eph2 : 40.46%
Pro : 15.91%
MarathiL : 15.85%
