In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw dataset from its CSV file (path is relative to the notebook's working directory)
csv_path = 'data/HCV-Egy-Data.csv'
data = pd.read_csv(csv_path)

In [3]:
# Count numerical vs. categorical columns; "categorical" here means object dtype
cat_features = data.select_dtypes(include='object').columns
print("Number of numerical features", data.select_dtypes(exclude='object').shape[1])
print("Number of categorical features", cat_features.size)
print(cat_features)

Number of numerical features 29
Number of categorical features 0
Index([], dtype='object')


In [4]:
# List the column names exactly as read from the CSV.
# NOTE: in the recorded output several names carry trailing whitespace
# (e.g. 'Age ', 'Headache '), so they must be referenced verbatim.
data.columns

Index(['Age ', 'Gender', 'BMI', 'Fever', 'Nausea/Vomting', 'Headache ',
       'Diarrhea ', 'Fatigue & generalized bone ache ', 'Jaundice ',
       'Epigastric pain ', 'WBC', 'RBC', 'HGB', 'Plat', 'AST 1', 'ALT 1',
       'ALT4', 'ALT 12', 'ALT 24', 'ALT 36', 'ALT 48', 'ALT after 24 w',
       'RNA Base', 'RNA 4', 'RNA 12', 'RNA EOT', 'RNA EF',
       'Baseline histological Grading', 'Baselinehistological staging'],
      dtype='object')

In [5]:
# Split the frame into the target column (y) and all remaining columns (X)
target_col = 'Baselinehistological staging'
y = data[target_col]
X = data.drop(columns=[target_col])

In [6]:
# Check distribution of target classes: number of rows per distinct value of y,
# to see whether the classes are balanced before choosing evaluation metrics
y.value_counts()

Baselinehistological staging
4    362
3    355
1    336
2    332
Name: count, dtype: int64

In [7]:
# Initialize decision tree classifier
from sklearn.tree import DecisionTreeClassifier
# Seed the estimator: scikit-learn permutes candidate features at every split,
# so an unseeded tree can differ between fits and the reported scores would not
# be reproducible on Restart & Run All. 42 matches the seed used for KFold.
clf = DecisionTreeClassifier(random_state=42)

In [8]:
# Shuffled 5-fold cross-validation with a fixed seed; collect the out-of-fold
# prediction for every row (each row is predicted by a model that did not train on it)
from sklearn.model_selection import (
    cross_val_score,
    cross_val_predict,
    KFold,
)
kf = KFold(n_splits=5, shuffle=True, random_state=42)
y_pred_cv = cross_val_predict(estimator=clf, X=X, y=y, cv=kf)

In [9]:
# Compute evaluation metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import cross_validate

# Score all four metrics in a single cross-validation pass. Calling
# cross_val_score once per metric refits the estimator for every metric
# (4 x 5 fits), and with an unseeded estimator each metric would be measured
# on a different set of fitted models. cross_validate fits once per fold and
# evaluates every scorer on that same fitted model.
cv_scores = cross_validate(
    clf, X, y, cv=kf,
    scoring=['accuracy', 'precision_macro', 'recall_macro', 'f1_macro'],
)

# Per-fold score arrays, one entry per fold (keys follow the 'test_<scorer>' convention)
accuracy = cv_scores['test_accuracy']
precision = cv_scores['test_precision_macro']
recall = cv_scores['test_recall_macro']
f1 = cv_scores['test_f1_macro']

In [10]:
# Confusion matrix of the true labels against the out-of-fold predictions
# (rows: true class, columns: predicted class)
conf_matrix = confusion_matrix(y_true=y, y_pred=y_pred_cv)

In [11]:
# Report the mean of each per-fold metric, then the confusion matrix.
# NOTE(review): the recorded run shows ~0.24 accuracy on four roughly balanced
# classes, i.e. about chance level - worth stating explicitly in the write-up.
for label, fold_scores in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
):
    print(label, fold_scores.mean())
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.23898916967509026
Precision: 0.2472839227777667
Recall: 0.2399124771988844
F1 Score: 0.23844531803848862
Confusion Matrix:
 [[ 85  73 102  76]
 [ 77  91  81  83]
 [ 82  90  85  98]
 [ 99 105  88  70]]
