In [None]:
!pip install torch numpy scikit-learn iterative-stratification

Collecting iterative-stratification
  Downloading iterative_stratification-0.1.9-py3-none-any.whl.metadata (1.3 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.

In [None]:
# Mount Google Drive at /content/drive so the dataset stored there can be read
# from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import torch
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

In [None]:
# Read the dataset CSV from the mounted Drive into a DataFrame.
# NOTE(review): this is an absolute Colab/Drive path, so the notebook only runs
# where this exact Drive layout exists.
file_path = "/content/drive/MyDrive/diseasedataset/finaldataset.csv"
df = pd.read_csv(file_path)

In [None]:
# One-hot encode the target column: one output column per distinct disease.
encoder = OneHotEncoder(sparse_output=False)
y_encoded = encoder.fit_transform(df[["diseases"]])

# Split into features & labels.
# "Unnamed: 0" is the name pandas gives an unlabeled first CSV column (typically
# a row index written out when the file was saved); it is not a symptom, so it
# is excluded from the feature matrix together with the target column.
# errors="ignore" keeps this cell working if the CSV is later re-exported
# without that column.
X = df.drop(columns=["diseases", "Unnamed: 0"], errors="ignore").values
y = y_encoded

In [None]:
# Summarize the frame: row count, column count, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 168499 entries, 0 to 168498
Columns: 379 entries, Unnamed: 0 to neck weakness
dtypes: int64(378), object(1)
memory usage: 487.2+ MB


In [None]:
# Stratified 5-fold splitter with a fixed seed; only its first fold is used,
# giving a single train/test partition with about one fifth of the rows held out.
mskf = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Take just the first (train, test) index pair instead of looping over all folds.
train_index, test_index = next(iter(mskf.split(X, y)))

X_train, y_train = X[train_index], y[train_index]
X_test, y_test = X[test_index], y[test_index]

In [None]:
class HDCClassifier:
    """Nearest-prototype classifier over sign-encoded feature vectors.

    Training encodes every sample with an element-wise sign and stores, for
    each class, the mean of that class's encoded samples (its prototype).
    Prediction returns the class whose prototype has the largest dot product
    with the encoded query.

    Attributes:
        dim: Nominal hypervector size. It is stored but never read by the
            encoding, training or prediction code, which all operate at the
            input's own feature width.
        prototypes: Mapping ``label -> 1-D float32 tensor`` built by ``fit``,
            in ascending label order.
    """

    def __init__(self, dim=1000):
        self.dim = dim  # Kept for interface compatibility; not used by the methods below
        self.prototypes = {}  # label -> prototype tensor, (re)built by fit()

    def _encode(self, x):
        """Encode features as float32 values in {-1, 0, +1} via element-wise sign.

        Args:
            x: A single sample or a batch of samples (array-like or tensor).

        Returns:
            A float32 tensor with the same shape as ``x``.
        """
        # as_tensor accepts arrays, lists and existing tensors alike without
        # the copy-construct warning torch.tensor() emits for tensor inputs.
        return torch.sign(torch.as_tensor(x, dtype=torch.float32))

    def fit(self, X_train, y_train):
        """Build one prototype per class as the mean of its encoded samples.

        Any prototypes from a previous ``fit`` call are discarded, so refitting
        on different labels cannot leave stale classes behind.

        Args:
            X_train: 2-D array-like of shape (n_samples, n_features).
            y_train: 1-D array-like of n_samples class labels.

        Raises:
            ValueError: If ``X_train`` and ``y_train`` differ in length.
        """
        labels = np.asarray(y_train)
        encoded = self._encode(X_train)  # encode the whole batch once
        if len(encoded) != len(labels):
            raise ValueError(
                f"X_train has {len(encoded)} samples but y_train has {len(labels)} labels"
            )

        prototypes = {}
        for label in np.unique(labels):  # sorted, so insertion order is ascending
            members = torch.as_tensor(labels == label, device=encoded.device)
            prototypes[label] = encoded[members].mean(dim=0)
        # Swap in the new mapping only once it is fully built.
        self.prototypes = prototypes

    def predict(self, X_test):
        """Predict the label of the most similar prototype for each sample.

        Similarity is the raw (unnormalized) dot product between the encoded
        sample and each prototype. Ties go to the earliest-stored label.

        Args:
            X_test: 2-D array-like of shape (n_samples, n_features).

        Returns:
            A NumPy array of n_samples predicted labels (empty for empty input).

        Raises:
            ValueError: If called with samples before any prototypes exist.
        """
        encoded = self._encode(X_test)
        if encoded.shape[0] == 0:
            return np.array([])
        if not self.prototypes:
            raise ValueError(
                "HDCClassifier has no class prototypes; call fit() before predict()"
            )

        labels = list(self.prototypes)
        proto_matrix = torch.stack([self.prototypes[label] for label in labels])
        # One matrix product scores every sample against every prototype:
        # (n_samples, n_features) @ (n_features, n_classes).
        scores = encoded @ proto_matrix.t()
        # argmax returns the first maximal index, so ties resolve to the
        # earliest-stored label.
        best = torch.argmax(scores, dim=1).tolist()
        return np.array([labels[i] for i in best])

In [None]:
# Collapse each one-hot row to the index of its largest entry, giving one
# integer class id per sample.
y_train_labels = np.argmax(y_train, axis=1)
y_test_labels = np.argmax(y_test, axis=1)

In [None]:
# Initialize & train
# The classifier is built with its default arguments and fitted on the integer
# class ids (y_train_labels), not on the one-hot matrix.
model = HDCClassifier()
model.fit(X_train, y_train_labels)

In [None]:
# Serialize the trained model object to a local file, then hand that file to
# the Colab download helper.
# NOTE(review): torch.save(model, ...) stores the whole HDCClassifier instance,
# so loading it presumably requires the same class definition to be available
# to the loader — confirm this works for whoever consumes the file.
torch.save(model, "hdc_model.pth")

from google.colab import files
files.download("hdc_model.pth")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Predict a class label for every row of the held-out split.
y_pred = model.predict(X_test)

In [None]:
# Accuracy of the predicted class ids against the true class ids of the test split.
acc = accuracy_score(y_test_labels, y_pred)
print(acc)

0.871572700296736


In [None]:
# Macro-averaged precision over the test split.
# NOTE(review): zero_division=1 presumably scores a class that was never
# predicted as precision 1.0, which would inflate the macro average — confirm
# this is intended (0 would be the conservative choice).
precision = precision_score(y_test_labels, y_pred, average='macro', zero_division=1)
print("Precision:", precision)

Precision: 0.8763759661465015


In [None]:
# Macro-averaged recall over the test split.
# NOTE(review): zero_division=1 presumably scores a class with no true samples
# as recall 1.0 — confirm this is intended (0 would be the conservative choice).
recall = recall_score(y_test_labels, y_pred, average='macro', zero_division=1)
print("Recall:", recall)

Recall: 0.8781113494952544


In [None]:
# Per-class precision / recall / f1-score / support table for the test split.
clssf = classification_report(y_test_labels, y_pred)
print(clssf)

              precision    recall  f1-score   support

           0       0.94      0.74      0.83       182
           1       0.99      0.93      0.95       240
           2       0.92      0.67      0.77       242
           3       0.69      0.82      0.75       181
           4       0.96      0.95      0.95       182
           5       0.71      0.83      0.76       136
           6       0.97      0.86      0.91       241
           7       0.82      0.86      0.84       181
           8       0.92      0.93      0.93       182
           9       0.66      0.90      0.76       100
          10       0.91      0.87      0.89       102
          11       0.96      0.99      0.98       133
          12       0.93      0.97      0.95       181
          13       0.97      0.93      0.95       182
          14       0.92      0.96      0.94       121
          15       0.95      0.92      0.94       240
          16       0.82      0.93      0.88       181
          17       1.00    