### KNN Classifier

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

# Read the cleaned dataset from the CSV one directory up
df = pd.read_csv("../genome_sequences_cleaned.csv")

# The encoded label column is the target; every remaining column is a feature
y = df['label_encoded']
X = df.drop(columns=['label_encoded'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply them
# to both splits so no test-set information leaks into the scaler
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Build and fit a 3-nearest-neighbour classifier (k=3 is a tunable choice)
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train_scaled, y_train)

# Predict labels for the held-out rows
y_pred = knn.predict(X_test_scaled)

# Report overall accuracy followed by the per-class breakdown
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:\n", report)


Accuracy: 0.98

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      1526
           1       0.94      0.96      0.95      1134
           2       1.00      0.99      1.00      1961
           3       0.96      0.94      0.95      1020

    accuracy                           0.98      5641
   macro avg       0.97      0.97      0.97      5641
weighted avg       0.98      0.98      0.98      5641



### Confusion Matrix with Plotly

In [3]:
import plotly.figure_factory as ff
from sklearn.metrics import confusion_matrix
import pandas as pd

# Fix the class order once and hand it to confusion_matrix explicitly.
# Without `labels=`, the matrix only contains classes that occur in
# y_test / y_pred, so its shape can disagree with the index/columns built
# from the full label column whenever a class is absent from the test split.
class_labels = sorted(y.unique())

cm = confusion_matrix(y_test, y_pred, labels=class_labels)
cm_df = pd.DataFrame(cm, index=class_labels, columns=class_labels)

# Rows are true labels, columns are predicted labels; each cell is annotated
# with its raw count.
fig = ff.create_annotated_heatmap(
    z=cm_df.values,
    x=cm_df.columns.tolist(),
    y=cm_df.index.tolist(),
    colorscale='Blues',
    annotation_text=cm_df.values.astype(str),
    showscale=True
)

fig.update_layout(
    title='Confusion Matrix - KNN Classifier',
    xaxis_title='Predicted Label',
    yaxis_title='True Label'
)
# A heatmap draws the first row at the bottom by default; reverse the y-axis
# so the first class is on the top row, as confusion matrices are usually read.
fig.update_yaxes(autorange='reversed')
fig.show()


### Feature Importance (Permutation Method)

In [4]:
from sklearn.inspection import permutation_importance
import plotly.express as px

# Shuffle each feature column of the scaled test set (10 repeats, fixed seed)
# and record how much the fitted model's score changes
result = permutation_importance(
    knn,
    X_test_scaled,
    y_test,
    n_repeats=10,
    random_state=42,
)

# Pair each feature name with its mean importance, largest first
importance_df = pd.DataFrame(
    {'Feature': X.columns, 'Importance': result.importances_mean}
)
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# One bar per feature, in ranked order, with slanted tick labels for legibility
fig = px.bar(
    data_frame=importance_df,
    x='Feature',
    y='Importance',
    title='Feature Importance (Permutation) - KNN Classifier',
)
fig.update_layout(xaxis=dict(tickangle=45))
fig.show()


### KDE (Per Feature by Class Label)

In [9]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from scipy.stats import gaussian_kde

# Rebuild a labelled frame from the scaled test matrix so rows can be sliced
# by class. NOTE(review): assumes no feature column is itself named 'label',
# otherwise the assignment below would overwrite it -- confirm.
X_test_df = pd.DataFrame(X_test_scaled, columns=X.columns)
X_test_df['label'] = y_test.values

# Plot only the three features ranked highest by permutation importance
top_features = importance_df.head(3)['Feature'].tolist()

for feature in top_features:
    fig = go.Figure()

    for label in sorted(X_test_df['label'].unique()):
        # Single .loc lookup instead of chained indexing
        data = X_test_df.loc[X_test_df['label'] == label, feature]

        # gaussian_kde cannot estimate a density from a single sample or from
        # values with zero variance (it raises); skip such a class instead of
        # aborting every remaining plot.
        if data.nunique() < 2:
            print(
                f"Skipping KDE for feature '{feature}', label {label}: "
                "fewer than two distinct values"
            )
            continue

        # Compute KDE on a 200-point grid spanning this class's observed range
        kde = gaussian_kde(data)
        x_range = np.linspace(data.min(), data.max(), 200)
        y_kde = kde(x_range)

        fig.add_trace(go.Scatter(
            x=x_range,
            y=y_kde,
            mode='lines',
            name=f'Label {label}'
        ))

    fig.update_layout(
        title=f"KDE Plot for Feature '{feature}' by Label",
        xaxis_title=feature,
        yaxis_title='Density'
    )

    fig.show()


###  Learning Curve (Train Size vs Accuracy)

In [7]:
import numpy as np
import plotly.graph_objects as go
from sklearn.base import clone
from sklearn.model_selection import train_test_split

# Fractions of the (already scaled) training set to fit on.
# Avoid 1.0: train_test_split needs a non-empty remainder for a float train_size.
train_sizes = np.linspace(0.1, 0.9, 9)
train_scores = []
val_scores = []

for size in train_sizes:
    X_train_part, _, y_train_part, _ = train_test_split(
        X_train_scaled, y_train, train_size=float(size), random_state=42
    )
    # Fit an unfitted copy with the same hyperparameters. Refitting `knn`
    # itself would leave the shared model trained on only the last subset,
    # silently changing the results of any cell that uses `knn` afterwards.
    knn_part = clone(knn)
    knn_part.fit(X_train_part, y_train_part)
    train_scores.append(knn_part.score(X_train_part, y_train_part))
    # Scored on the held-out test split (plotted below as "Validation Accuracy")
    val_scores.append(knn_part.score(X_test_scaled, y_test))

fig = go.Figure()
fig.add_trace(go.Scatter(x=train_sizes, y=train_scores, mode='lines+markers', name='Training Accuracy'))
fig.add_trace(go.Scatter(x=train_sizes, y=val_scores, mode='lines+markers', name='Validation Accuracy'))

fig.update_layout(
    title='Learning Curve - KNN Classifier',
    xaxis_title='Training Set Size Proportion',
    yaxis_title='Accuracy',
    yaxis=dict(range=[0, 1])
)
fig.show()
