### SVM_Classifier

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
df = pd.read_csv("../genome_sequences_cleaned.csv")

# Separate features and target label
X = df.drop(columns=['label_encoded'])
y = df['label_encoded']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize SVM model
svm = SVC(kernel='rbf', C=1.0, gamma='scale')  # You can also try 'linear' or 'poly' kernels

# Train the model
svm.fit(X_train_scaled, y_train)

# Make predictions
y_pred = svm.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.96

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.98      0.96      1526
           1       0.92      0.96      0.94      1134
           2       0.98      1.00      0.99      1961
           3       0.96      0.84      0.90      1020

    accuracy                           0.96      5641
   macro avg       0.95      0.94      0.95      5641
weighted avg       0.96      0.96      0.96      5641



### Confusion Matrix with Plotly

In [2]:
from sklearn.metrics import confusion_matrix
import plotly.figure_factory as ff
import pandas as pd

cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, index=sorted(y.unique()), columns=sorted(y.unique()))

fig = ff.create_annotated_heatmap(
    z=cm_df.values,
    x=cm_df.columns.tolist(),
    y=cm_df.index.tolist(),
    colorscale='Blues',
    annotation_text=cm_df.values.astype(str),
    showscale=True
)

fig.update_layout(
    title='Confusion Matrix - SVM Classifier',
    xaxis_title='Predicted Label',
    yaxis_title='True Label'
)
fig.show()


###  Feature Importance (Permutation)

In [3]:
from sklearn.inspection import permutation_importance
import plotly.express as px

result = permutation_importance(svm, X_test_scaled, y_test, n_repeats=10, random_state=42)

importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': result.importances_mean
}).sort_values(by='Importance', ascending=False)

fig = px.bar(
    importance_df,
    x='Feature',
    y='Importance',
    title='Feature Importance (Permutation) - SVM Classifier'
)
fig.update_layout(xaxis_tickangle=45)
fig.show()


### KDE Plots

In [5]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from scipy.stats import gaussian_kde

# Rebuild X_test dataframe with column names and true labels
X_test_df = pd.DataFrame(X_test_scaled, columns=X.columns)
X_test_df['label'] = y_test.values

# Get top 3 features from permutation importance
top_features = importance_df.head(3)['Feature'].tolist()

# Plot KDE for each top feature
for feature in top_features:
    fig = go.Figure()

    # Plot KDE for each class
    for label in sorted(X_test_df['label'].unique()):
        # Extract values for this label/class
        data = X_test_df[X_test_df['label'] == label][feature].values

        # Compute KDE
        kde = gaussian_kde(data)
        x_vals = np.linspace(data.min(), data.max(), 200)
        y_vals = kde(x_vals)

        # Add curve to figure
        fig.add_trace(go.Scatter(
            x=x_vals,
            y=y_vals,
            mode='lines',
            name=f'Label {label}'
        ))

    # Update layout
    fig.update_layout(
        title=f'KDE Plot for Feature "{feature}" by Label',
        xaxis_title=feature,
        yaxis_title='Density'
    )

    fig.show()


###  Training vs Validation Accuracy 

In [7]:
import numpy as np
import plotly.graph_objects as go
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

train_sizes = np.linspace(0.1, 0.9, 9)  # Avoid train_size=1.0
train_scores = []
val_scores = []

for size in train_sizes:
    X_train_part, _, y_train_part, _ = train_test_split(
        X_train_scaled, y_train, train_size=size, random_state=42
    )
    model = SVC(kernel='rbf', C=1.0, gamma='scale')
    model.fit(X_train_part, y_train_part)
    train_scores.append(model.score(X_train_part, y_train_part))
    val_scores.append(model.score(X_test_scaled, y_test))

fig = go.Figure()
fig.add_trace(go.Scatter(x=train_sizes, y=train_scores, mode='lines+markers', name='Training Accuracy'))
fig.add_trace(go.Scatter(x=train_sizes, y=val_scores, mode='lines+markers', name='Validation Accuracy'))

fig.update_layout(
    title='Learning Curve - SVM Classifier',
    xaxis_title='Training Set Size Proportion',
    yaxis_title='Accuracy',
    yaxis=dict(range=[0, 1])
)
fig.show()
