## Decision Tree

## Import Required Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.inspection import permutation_importance

# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense
# from tensorflow.keras.utils import to_categorical

from sklearn.base import BaseEstimator, ClassifierMixin

## Load and Preprocess the Dataset

In [17]:
csv_path = r"../genome_sequences_cleaned.csv"
df = pd.read_csv(csv_path)

# Display first few rows to confirm loading
df.head()

Unnamed: 0,A,C,D,E,F,G,H,I,K,L,...,P,Q,R,S,T,V,W,Y,Sequence_Length,label_encoded
0,3,0,0,1,3,1,0,2,0,9,...,0,0,1,1,4,5,0,0,31,0
1,3,0,0,1,3,1,0,2,0,9,...,0,0,1,1,4,5,0,0,31,0
2,3,0,0,1,3,1,0,2,0,9,...,0,0,1,1,4,5,0,0,31,0
3,3,0,0,1,3,1,0,2,0,9,...,0,0,1,1,4,5,0,0,31,0
4,3,0,0,1,3,1,0,2,0,9,...,0,0,1,1,4,5,0,0,31,0


## Split Data into Features and Target

In [None]:
# Features: A-Z + Sequence_Length
X = df.drop(columns=['label_encoded'])
y = df['label_encoded']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


## Train the Decision Tree Classifier

In [6]:
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# Predictions
y_pred = clf.predict(X_test)


## Evaluation Metrics

In [None]:
# Accuracy
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc:.4f}")

# Classification Report
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))


Accuracy: 0.9743

Classification Report:

              precision    recall  f1-score   support

           0       0.98      0.99      0.99      1550
           1       0.93      0.95      0.94      1105
           2       1.00      0.99      1.00      1970
           3       0.97      0.93      0.95      1016

    accuracy                           0.97      5641
   macro avg       0.97      0.97      0.97      5641
weighted avg       0.97      0.97      0.97      5641



## Confusion Matrix Plot

In [8]:
# Create a confusion matrix
cm = confusion_matrix(y_test, y_pred)
labels = np.unique(y)

fig = px.imshow(
    cm,
    labels=dict(x="Predicted", y="Actual", color="Count"),
    x=labels,
    y=labels,
    color_continuous_scale='Blues',
    text_auto=True
)

fig.update_layout(title="Confusion Matrix (Plotly)")
fig.show()


## Feature Importance Plot

In [9]:
# Use the existing feature importance data
feat_imp_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': clf.feature_importances_
}).sort_values(by='Importance', ascending=False).head(10)

fig = px.bar(
    feat_imp_df,
    x='Importance',
    y='Feature',
    orientation='h',
    color='Importance',
    title='Top 10 Important Features (Plotly)',
    color_continuous_scale='viridis'
)

fig.update_layout(yaxis={'categoryorder':'total ascending'})
fig.show()



## KDE

In [10]:
import pandas as pd
import plotly.express as px
from sklearn.preprocessing import LabelEncoder

kde_df = X_test[['A', 'G']].copy()
kde_df['Predicted_Label'] = y_pred

fig = px.density_contour(
    kde_df,
    x='A',
    y='G',
    color='Predicted_Label',
    title='KDE-like Density Contour: Feature A vs G by Predicted Class',
    color_discrete_sequence=px.colors.qualitative.Set1
)

fig.update_traces(contours_coloring="fill", contours_showlabels=True)
fig.update_layout(
    xaxis_title='Frequency of Amino Acid A',
    yaxis_title='Frequency of Amino Acid G',
    legend_title='Predicted Class'
)

fig.show()


##  Random Forest

## Prepare Data and Train

In [14]:
# Load data
csv_path = r"../genome_sequences_cleaned.csv"
df = pd.read_csv(csv_path)

# Features and label
X = df.drop(columns=["label_encoded"])  # Drop the target
y = df["label_encoded"]  # Target variable

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predict
y_pred = rf_model.predict(X_test)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.9794362701648643
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1526
           1       0.93      0.97      0.95      1134
           2       1.00      1.00      1.00      1961
           3       0.97      0.94      0.95      1020

    accuracy                           0.98      5641
   macro avg       0.97      0.97      0.97      5641
weighted avg       0.98      0.98      0.98      5641



## Confusion Matrix Plot

In [13]:


# Compute confusion matrix
cm = confusion_matrix(y_test, y_pred)
labels = rf_model.classes_  # Unique class labels

# Convert to string for axis labels
labels_str = [str(label) for label in labels]

# Create annotated heatmap
fig = ff.create_annotated_heatmap(
    z=cm,
    x=labels_str,
    y=labels_str,
    colorscale='Blues',
    showscale=True,
    hoverinfo="z",
    xgap=3,
    ygap=3
)

fig.update_layout(
    title="Confusion Matrix - Random Forest",
    xaxis=dict(title='Predicted Label'),
    yaxis=dict(title='True Label')
)

fig.show()


NameError: name 'ff' is not defined

## Feature Importance Plot

In [15]:

# Get feature importances
importances = rf_model.feature_importances_
feature_names = X.columns

# Create DataFrame for plotting
importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

# Plotly bar chart
fig = px.bar(importance_df, x='Feature', y='Importance',
             title="Feature Importances - Random Forest",
             labels={'Importance': 'Importance Score'},
             color='Importance')
fig.show()


## KDE Plots

In [16]:
import plotly.figure_factory as ff

# Example: Select top 3 features for KDE
top_features = importance_df['Feature'].head(3).tolist()

# Prepare KDEs for each class
for feature in top_features:
    class_values = []
    class_names = []

    for label in y.unique():
        class_values.append(df[df['label_encoded'] == label][feature])
        class_names.append(str(label))
    
    fig = ff.create_distplot(class_values, class_names, show_hist=False, show_rug=False)
    fig.update_layout(title=f"KDE Plot for Feature: {feature}")
    fig.show()


# Logistic Regression

## Load and Preprocess the Dataset

In [None]:
csv_path = r"../genome_sequences_cleaned.csv"
df = pd.read_csv(csv_path)

# Display first few rows to confirm loading
df.head()


Unnamed: 0,A,C,D,E,F,G,H,I,K,L,...,P,Q,R,S,T,V,W,Y,Sequence_Length,label_encoded
0,3,0,0,1,3,1,0,2,0,9,...,0,0,1,1,4,5,0,0,31,0
1,3,0,0,1,3,1,0,2,0,9,...,0,0,1,1,4,5,0,0,31,0
2,3,0,0,1,3,1,0,2,0,9,...,0,0,1,1,4,5,0,0,31,0
3,3,0,0,1,3,1,0,2,0,9,...,0,0,1,1,4,5,0,0,31,0
4,3,0,0,1,3,1,0,2,0,9,...,0,0,1,1,4,5,0,0,31,0


## Split Data into Features and Target

In [None]:
# Features: A-Z + Sequence_Length
X = df.drop(columns=['label_encoded'])
y = df['label_encoded']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

## Train Logistic regression

In [34]:
logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(X_train, y_train)
y_pred = logistic_model.predict(X_test)


lbfgs failed to converge (status=1):
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



## Evaluation metrics

In [35]:
# Accuracy
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc:.4f}")

# Classification Report
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))

Accuracy: 0.9236

Classification Report:

              precision    recall  f1-score   support

           0       0.90      0.94      0.92      1550
           1       0.88      0.84      0.86      1105
           2       0.99      0.99      0.99      1970
           3       0.87      0.86      0.86      1016

    accuracy                           0.92      5641
   macro avg       0.91      0.91      0.91      5641
weighted avg       0.92      0.92      0.92      5641



## Confusion Matrix Plot

In [36]:
cm = confusion_matrix(y_test, y_pred)

# Get class labels dynamically from unique y values
labels = [f"Class {i}" for i in np.unique(y_test)]

# Annotated heatmap
cm_fig = ff.create_annotated_heatmap(
    z=cm,
    x=[f"Predicted {i}" for i in labels],
    y=[f"Actual {i}" for i in labels],
    colorscale='Blues',
    showscale=True
)
cm_fig.update_layout(title="Confusion Matrix", margin=dict(t=50, l=50))
cm_fig.show()

## Feature Importance Plot

In [37]:
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': np.abs(logistic_model.coef_[0])
}).sort_values(by='Importance', ascending=False)

importance_fig = px.bar(feature_importance, x='Feature', y='Importance',
                        title="Feature Importance (Logistic Regression)",
                        labels={'Importance': 'Coefficient Magnitude'})
importance_fig.show()

## KDE Plots

In [38]:
top_features = feature_importance['Feature'].head(3).tolist()

# Prepare KDEs for each class
for feature in top_features:
    class_values = []
    class_names = []

    for label in y.unique():
        class_values.append(df[df['label_encoded'] == label][feature])
        class_names.append(str(label))
    
    fig = ff.create_distplot(class_values, class_names, show_hist=False, show_rug=False)
    fig.update_layout(title=f"KDE Plot for Feature: {feature}")
    fig.show()

## Artificial Neural Network

## Load and Preprocess the Dataset

In [40]:
csv_path = r"../genome_sequences_cleaned.csv"
df = pd.read_csv(csv_path)

# Display first few rows to confirm loading
df.head()

Unnamed: 0,A,C,D,E,F,G,H,I,K,L,...,P,Q,R,S,T,V,W,Y,Sequence_Length,label_encoded
0,3,0,0,1,3,1,0,2,0,9,...,0,0,1,1,4,5,0,0,31,0
1,3,0,0,1,3,1,0,2,0,9,...,0,0,1,1,4,5,0,0,31,0
2,3,0,0,1,3,1,0,2,0,9,...,0,0,1,1,4,5,0,0,31,0
3,3,0,0,1,3,1,0,2,0,9,...,0,0,1,1,4,5,0,0,31,0
4,3,0,0,1,3,1,0,2,0,9,...,0,0,1,1,4,5,0,0,31,0


## Split Data into Features and Target

In [42]:
# Features: A-Z + Sequence_Length
X = df.drop(columns=['label_encoded'])
y = df['label_encoded']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

## Train ANN

In [None]:
ANN_model = Sequential([
    Dense(10, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(10, activation='relu'),
    Dense(3, activation='softmax')  # 3 classes in Iris
])

# Compile model
ANN_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train model
history = ANN_model.fit(X_train, y_train, epochs=100, batch_size=8, verbose=0)

## Evaluation Metrics

In [None]:
# Evaluate model
loss, accuracy = ANN_model.evaluate(X_test, y_test, verbose=0)
print(f"Test Accuracy: {accuracy:.4f}")

# Predict on test set
y_pred_probs = ANN_model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)
y_true = np.argmax(y_test, axis=1)

# Classification Report
print("\nClassification Report:")
print(classification_report(y_true, y_pred, target_names=df.target_names))

## Confusion Matrix Plot

In [None]:
cm = confusion_matrix(y_true, y_pred)
labels = [f"Class {i}" for i in np.unique(y_true)]

fig_cm = ff.create_annotated_heatmap(
    z=cm,
    x=[f"Predicted {l}" for l in labels],
    y=[f"Actual {l}" for l in labels],
    colorscale='Viridis'
)
fig_cm.update_layout(title_text='Confusion Matrix', margin=dict(t=50, l=100))
fig_cm.show()

## Feature Importance Plot

In [None]:
class ANNWrapper(BaseEstimator, ClassifierMixin):
    def __init__(self, model):
        self.model = model
    
    def fit(self, X, y): return self
    def predict(self, X):
        return np.argmax(self.model.predict(X), axis=1)

# Wrap your model
wrapped_model = ANNWrapper(ANN_model)
result = permutation_importance(wrapped_model, X_test, y_true, n_repeats=10, random_state=42)

# Plot
feat_names = [f"Feature {i}" for i in range(X_test.shape[1])]
fig_imp = px.bar(
    x=result.importances_mean,
    y=feat_names,
    orientation='h',
    labels={'x': 'Importance', 'y': 'Feature'},
    title='Feature Importance (Permutation)'
)
fig_imp.update_layout(yaxis=dict(autorange="reversed"))
fig_imp.show()


## KDE Plots

In [None]:
# Convert to DataFrame for Plotly
df_probs = pd.DataFrame(y_pred_probs, columns=[f"Class {i}" for i in range(y_pred_probs.shape[1])])
df_probs['True Class'] = y_true

# Melt to long format for KDE
df_melted = df_probs.melt(id_vars='True Class', var_name='Predicted Class', value_name='Probability')

# KDE plot
fig_kde = px.violin(df_melted, x="Predicted Class", y="Probability", color="True Class", box=True, points="all",
                    title="KDE of Prediction Probabilities by True Class")
fig_kde.show()

### KNN Classifier

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
df = pd.read_csv("../genome_sequences_cleaned.csv")

# Separate features and label
X = df.drop(columns=['label_encoded'])
y = df['label_encoded']

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize the features using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize the KNN model (k=3, can be tuned)
knn = KNeighborsClassifier(n_neighbors=3)

# Train the KNN model
knn.fit(X_train_scaled, y_train)

# Predict on the test set
y_pred = knn.predict(X_test_scaled)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.98

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      1526
           1       0.94      0.96      0.95      1134
           2       1.00      0.99      1.00      1961
           3       0.96      0.94      0.95      1020

    accuracy                           0.98      5641
   macro avg       0.97      0.97      0.97      5641
weighted avg       0.98      0.98      0.98      5641



### Confusion Matrix with Plotly

In [2]:
import plotly.figure_factory as ff
from sklearn.metrics import confusion_matrix
import pandas as pd

cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, index=sorted(y.unique()), columns=sorted(y.unique()))

fig = ff.create_annotated_heatmap(
    z=cm_df.values,
    x=cm_df.columns.tolist(),
    y=cm_df.index.tolist(),
    colorscale='Blues',
    annotation_text=cm_df.values.astype(str),
    showscale=True
)

fig.update_layout(
    title='Confusion Matrix - KNN Classifier',
    xaxis_title='Predicted Label',
    yaxis_title='True Label'
)
fig.show()


### Feature Importance (Permutation Method)

In [3]:
from sklearn.inspection import permutation_importance
import plotly.express as px

result = permutation_importance(knn, X_test_scaled, y_test, n_repeats=10, random_state=42)

importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': result.importances_mean
}).sort_values(by='Importance', ascending=False)

fig = px.bar(
    importance_df,
    x='Feature',
    y='Importance',
    title='Feature Importance (Permutation) - KNN Classifier'
)
fig.update_layout(xaxis_tickangle=45)
fig.show()


### KDE plot 

In [4]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from scipy.stats import gaussian_kde

X_test_df = pd.DataFrame(X_test_scaled, columns=X.columns)
X_test_df['label'] = y_test.values

top_features = importance_df.head(3)['Feature'].tolist()

for feature in top_features:
    fig = go.Figure()
    
    for label in sorted(X_test_df['label'].unique()):
        data = X_test_df[X_test_df['label'] == label][feature]
        
        # Compute KDE
        kde = gaussian_kde(data)
        x_range = np.linspace(data.min(), data.max(), 200)
        y_kde = kde(x_range)
        
        fig.add_trace(go.Scatter(
            x=x_range,
            y=y_kde,
            mode='lines',
            name=f'Label {label}'
        ))
    
    fig.update_layout(
        title=f"KDE Plot for Feature '{feature}' by Label",
        xaxis_title=feature,
        yaxis_title='Density'
    )
    
    fig.show()


###  Learning Curve (Train Size vs Accuracy)

In [5]:
import numpy as np
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split

train_sizes = np.linspace(0.1, 0.9, 9)  # Avoid 1.0
train_scores = []
val_scores = []

for size in train_sizes:
    X_train_part, _, y_train_part, _ = train_test_split(
        X_train_scaled, y_train, train_size=float(size), random_state=42
    )
    knn.fit(X_train_part, y_train_part)
    train_scores.append(knn.score(X_train_part, y_train_part))
    val_scores.append(knn.score(X_test_scaled, y_test))

fig = go.Figure()
fig.add_trace(go.Scatter(x=train_sizes, y=train_scores, mode='lines+markers', name='Training Accuracy'))
fig.add_trace(go.Scatter(x=train_sizes, y=val_scores, mode='lines+markers', name='Validation Accuracy'))

fig.update_layout(
    title='Learning Curve - KNN Classifier',
    xaxis_title='Training Set Size Proportion',
    yaxis_title='Accuracy',
    yaxis=dict(range=[0, 1])
)
fig.show()


### SVM_Classifier

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
df = pd.read_csv("../genome_sequences_cleaned.csv")

# Separate features and target label
X = df.drop(columns=['label_encoded'])
y = df['label_encoded']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize SVM model
svm = SVC(kernel='rbf', C=1.0, gamma='scale')  # You can also try 'linear' or 'poly' kernels

# Train the model
svm.fit(X_train_scaled, y_train)

# Make predictions
y_pred = svm.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.96

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.98      0.96      1526
           1       0.92      0.96      0.94      1134
           2       0.98      1.00      0.99      1961
           3       0.96      0.84      0.90      1020

    accuracy                           0.96      5641
   macro avg       0.95      0.94      0.95      5641
weighted avg       0.96      0.96      0.96      5641



### Confusion Matrix with Plotly

In [7]:
from sklearn.metrics import confusion_matrix
import plotly.figure_factory as ff
import pandas as pd

cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, index=sorted(y.unique()), columns=sorted(y.unique()))

fig = ff.create_annotated_heatmap(
    z=cm_df.values,
    x=cm_df.columns.tolist(),
    y=cm_df.index.tolist(),
    colorscale='Blues',
    annotation_text=cm_df.values.astype(str),
    showscale=True
)

fig.update_layout(
    title='Confusion Matrix - SVM Classifier',
    xaxis_title='Predicted Label',
    yaxis_title='True Label'
)
fig.show()


###  Feature Importance (Permutation)

In [8]:
from sklearn.inspection import permutation_importance
import plotly.express as px

result = permutation_importance(svm, X_test_scaled, y_test, n_repeats=10, random_state=42)

importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': result.importances_mean
}).sort_values(by='Importance', ascending=False)

fig = px.bar(
    importance_df,
    x='Feature',
    y='Importance',
    title='Feature Importance (Permutation) - SVM Classifier'
)
fig.update_layout(xaxis_tickangle=45)
fig.show()


### KDE Plots

In [9]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from scipy.stats import gaussian_kde

# Rebuild X_test dataframe with column names and true labels
X_test_df = pd.DataFrame(X_test_scaled, columns=X.columns)
X_test_df['label'] = y_test.values

# Get top 3 features from permutation importance
top_features = importance_df.head(3)['Feature'].tolist()

# Plot KDE for each top feature
for feature in top_features:
    fig = go.Figure()

    # Plot KDE for each class
    for label in sorted(X_test_df['label'].unique()):
        # Extract values for this label/class
        data = X_test_df[X_test_df['label'] == label][feature].values

        # Compute KDE
        kde = gaussian_kde(data)
        x_vals = np.linspace(data.min(), data.max(), 200)
        y_vals = kde(x_vals)

        # Add curve to figure
        fig.add_trace(go.Scatter(
            x=x_vals,
            y=y_vals,
            mode='lines',
            name=f'Label {label}'
        ))

    # Update layout
    fig.update_layout(
        title=f'KDE Plot for Feature "{feature}" by Label',
        xaxis_title=feature,
        yaxis_title='Density'
    )

    fig.show()


###  Training vs Validation Accuracy 

In [10]:
import numpy as np
import plotly.graph_objects as go
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

train_sizes = np.linspace(0.1, 0.9, 9)  # Avoid train_size=1.0
train_scores = []
val_scores = []

for size in train_sizes:
    X_train_part, _, y_train_part, _ = train_test_split(
        X_train_scaled, y_train, train_size=size, random_state=42
    )
    model = SVC(kernel='rbf', C=1.0, gamma='scale')
    model.fit(X_train_part, y_train_part)
    train_scores.append(model.score(X_train_part, y_train_part))
    val_scores.append(model.score(X_test_scaled, y_test))

fig = go.Figure()
fig.add_trace(go.Scatter(x=train_sizes, y=train_scores, mode='lines+markers', name='Training Accuracy'))
fig.add_trace(go.Scatter(x=train_sizes, y=val_scores, mode='lines+markers', name='Validation Accuracy'))

fig.update_layout(
    title='Learning Curve - SVM Classifier',
    xaxis_title='Training Set Size Proportion',
    yaxis_title='Accuracy',
    yaxis=dict(range=[0, 1])
)
fig.show()
