## Decision Tree

## Import Required Libraries

In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import plotly.express as px
import plotly.graph_objects as go
from sklearn.ensemble import RandomForestClassifier


## Load and Preprocess the Dataset

In [7]:
csv_path = r"C:\Users\Blue Sky\Desktop\Dav\COVID-Protein-Analysis\genome_sequences_cleaned.csv"
df = pd.read_csv(csv_path)

# Display first few rows to confirm loading
df.head()

Unnamed: 0,A,C,D,E,F,G,H,I,K,L,...,P,Q,R,S,T,V,W,Y,Sequence_Length,label_encoded
0,3,0,0,1,3,1,0,2,0,9,...,0,0,1,1,4,5,0,0,31,0
1,3,0,0,1,3,1,0,2,0,9,...,0,0,1,1,4,5,0,0,31,0
2,3,0,0,1,3,1,0,2,0,9,...,0,0,1,1,4,5,0,0,31,0
3,3,0,0,1,3,1,0,2,0,9,...,0,0,1,1,4,5,0,0,31,0
4,3,0,0,1,3,1,0,2,0,9,...,0,0,1,1,4,5,0,0,31,0


## Split Data into Features and Target

In [29]:
# Features: A-Z + Sequence_Length
X = df.drop(columns=['label_encoded'])
y = df['label_encoded']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


## Train the Decision Tree Classifier

In [10]:
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# Predictions
y_pred = clf.predict(X_test)


## Evaluation Metrics

In [11]:
# Accuracy
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc:.4f}")

# Classification Report
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))


Accuracy: 0.9743

Classification Report:

              precision    recall  f1-score   support

           0       0.98      0.99      0.99      1550
           1       0.93      0.95      0.94      1105
           2       1.00      0.99      1.00      1970
           3       0.97      0.93      0.95      1016

    accuracy                           0.97      5641
   macro avg       0.97      0.97      0.97      5641
weighted avg       0.97      0.97      0.97      5641



## Confusion Matrix Plot

In [12]:
# Create a confusion matrix
cm = confusion_matrix(y_test, y_pred)
labels = np.unique(y)

fig = px.imshow(
    cm,
    labels=dict(x="Predicted", y="Actual", color="Count"),
    x=labels,
    y=labels,
    color_continuous_scale='Blues',
    text_auto=True
)

fig.update_layout(title="Confusion Matrix (Plotly)")
fig.show()


## Feature Importance Plot

In [14]:
# Use the existing feature importance data
feat_imp_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': clf.feature_importances_
}).sort_values(by='Importance', ascending=False).head(10)

fig = px.bar(
    feat_imp_df,
    x='Importance',
    y='Feature',
    orientation='h',
    color='Importance',
    title='Top 10 Important Features (Plotly)',
    color_continuous_scale='viridis'
)

fig.update_layout(yaxis={'categoryorder':'total ascending'})
fig.show()



## KDE

In [36]:
import pandas as pd
import plotly.express as px
from sklearn.preprocessing import LabelEncoder

kde_df = X_test[['A', 'G']].copy()
kde_df['Predicted_Label'] = y_pred

fig = px.density_contour(
    kde_df,
    x='A',
    y='G',
    color='Predicted_Label',
    title='KDE-like Density Contour: Feature A vs G by Predicted Class',
    color_discrete_sequence=px.colors.qualitative.Set1
)

fig.update_traces(contours_coloring="fill", contours_showlabels=True)
fig.update_layout(
    xaxis_title='Frequency of Amino Acid A',
    yaxis_title='Frequency of Amino Acid G',
    legend_title='Predicted Class'
)

fig.show()


##  Random Forest

## Prepare Data and Train

In [18]:
# Load data
csv_path = r"C:\Users\Blue Sky\Desktop\Dav\COVID-Protein-Analysis\genome_sequences_cleaned.csv"
df = pd.read_csv(csv_path)

# Features and label
X = df.drop(columns=["label_encoded"])  # Drop the target
y = df["label_encoded"]  # Target variable

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predict
y_pred = rf_model.predict(X_test)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.9794362701648643
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1526
           1       0.93      0.97      0.95      1134
           2       1.00      1.00      1.00      1961
           3       0.97      0.94      0.95      1020

    accuracy                           0.98      5641
   macro avg       0.97      0.97      0.97      5641
weighted avg       0.98      0.98      0.98      5641



## Confusion Matrix Plot

In [22]:


# Compute confusion matrix
cm = confusion_matrix(y_test, y_pred)
labels = rf_model.classes_  # Unique class labels

# Convert to string for axis labels
labels_str = [str(label) for label in labels]

# Create annotated heatmap
fig = ff.create_annotated_heatmap(
    z=cm,
    x=labels_str,
    y=labels_str,
    colorscale='Blues',
    showscale=True,
    hoverinfo="z",
    xgap=3,
    ygap=3
)

fig.update_layout(
    title="Confusion Matrix - Random Forest",
    xaxis=dict(title='Predicted Label'),
    yaxis=dict(title='True Label')
)

fig.show()


## Feature Importance Plot

In [19]:

# Get feature importances
importances = rf_model.feature_importances_
feature_names = X.columns

# Create DataFrame for plotting
importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

# Plotly bar chart
fig = px.bar(importance_df, x='Feature', y='Importance',
             title="Feature Importances - Random Forest",
             labels={'Importance': 'Importance Score'},
             color='Importance')
fig.show()


## KDE Plots

In [21]:
import plotly.figure_factory as ff

# Example: Select top 3 features for KDE
top_features = importance_df['Feature'].head(3).tolist()

# Prepare KDEs for each class
for feature in top_features:
    class_values = []
    class_names = []

    for label in y.unique():
        class_values.append(df[df['label_encoded'] == label][feature])
        class_names.append(str(label))
    
    fig = ff.create_distplot(class_values, class_names, show_hist=False, show_rug=False)
    fig.update_layout(title=f"KDE Plot for Feature: {feature}")
    fig.show()
