In [None]:
import pandas as pd
import numpy as np
from google.colab import drive
# Mount Google Drive under /content/drive so the dataset CSV stored there can be read.
drive.mount('/content/drive')
# Colab helper used by the export cell to download the generated CSV.
from google.colab import files

Mounted at /content/drive


In [None]:
# Location of the dataset CSV inside the mounted Google Drive.
file_path = "/content/drive/MyDrive/diseasedataset/dataset.csv"
# Load the whole CSV into memory; later cells read the label from its 'diseases' column.
df = pd.read_csv(file_path)

In [None]:
# Number of records for each distinct value of the 'diseases' column.
disease_counts = df['diseases'].value_counts()

# Keep only diseases with at least 500 records (to ensure proper training).

common_diseases = disease_counts[disease_counts >= 500].index
filtered_df = df[df['diseases'].isin(common_diseases)]

In [None]:
# (rows, columns) of the full dataset, before any filtering.
df.shape

(246945, 378)

In [None]:
# (rows, columns) of the dataset restricted to the frequently occurring diseases.
filtered_df.shape

(168499, 378)

In [None]:
# Add noise (since it is a synthetic dataset): append copies of randomly sampled rows with a few feature values flipped.

def augment_with_partial_noise(filtered_df, target_col='disease', flip_prob=0.01, percent=0.25, seed=None):
    """Return the input rows plus noisy copies of a random subset of them.

    ``int(len(filtered_df) * percent)`` rows are sampled without replacement.
    In the copies, every feature value ``x`` is replaced by ``abs(x - 1)``
    with probability ``flip_prob`` (which turns 0 into 1 and 1 into 0). The
    noisy copies are appended below the untouched original rows; labels are
    never altered.

    Parameters
    ----------
    filtered_df : pandas.DataFrame
        Feature columns plus the label column ``target_col``.
        NOTE(review): the flip only makes sense for numeric 0/1 indicator
        features -- confirm that this holds for every feature column.
    target_col : str
        Name of the label column.
    flip_prob : float
        Per-cell probability of flipping a feature value in the copies.
    percent : float
        Fraction of the rows to copy.
    seed : int or None
        When given, seeds NumPy's global random state before sampling so the
        result is reproducible.

    Returns
    -------
    pandas.DataFrame
        Original rows followed by the noisy copies, with a fresh 0..n-1 index
        and ``target_col`` as the last column.

    Raises
    ------
    KeyError
        If ``target_col`` is not a column of ``filtered_df``.
    ValueError
        If ``percent`` asks for more rows than exist (sampling is done
        without replacement) or for a negative number of rows.
    """
    if seed is not None:
        np.random.seed(seed)

    # Split features and target
    X = filtered_df.drop(columns=[target_col])
    y = filtered_df[target_col]

    # Sample row *positions* rather than index labels: label-based lookup
    # returns every row sharing a label, so a frame with duplicate index
    # labels would get more copies than requested. For a unique index this
    # picks exactly the same rows for a given seed.
    num_to_augment = int(len(filtered_df) * percent)
    sampled_positions = np.random.choice(len(filtered_df), size=num_to_augment, replace=False)
    X_sample = X.iloc[sampled_positions].reset_index(drop=True)
    y_sample = y.iloc[sampled_positions].reset_index(drop=True)

    # Flip feature values with probability flip_prob: |x - 1| where the mask
    # is set, |x - 0| (unchanged for non-negative x) elsewhere.
    flip_mask = np.random.rand(*X_sample.shape) < flip_prob
    X_flipped = np.abs(X_sample.values - flip_mask.astype(int))
    X_noisy = pd.DataFrame(X_flipped, columns=X_sample.columns)

    # Combine with original; ignore_index keeps features and labels aligned
    # on the same fresh 0..n-1 index.
    X_aug = pd.concat([X, X_noisy], ignore_index=True)
    y_aug = pd.concat([y, y_sample], ignore_index=True)

    filtered_df_augmented = X_aug.copy()
    filtered_df_augmented[target_col] = y_aug

    return filtered_df_augmented

In [None]:
# Append noisy copies of a quarter of the filtered rows, flipping each
# feature value in the copies with 2% probability; fixed seed for reproducibility.
df_augmented = augment_with_partial_noise(
    filtered_df,
    target_col='diseases',
    flip_prob=0.02,
    percent=0.25,
    seed=42,
)

# Report the row/column counts before and after augmentation.
print("Original shape:", filtered_df.shape)
print("Augmented shape:", df_augmented.shape)

Original shape: (168499, 378)
Augmented shape: (210623, 378)


In [None]:
# Records per disease in the full dataset. Counting on the raw column is
# required: value_counts() over the already-deduplicated labels reports 1 for
# every disease and carries no information.
unique_diseases = df['diseases'].unique()
dis_count = df['diseases'].value_counts()
print(dis_count)

open wound of the nose            1
panic disorder                    1
vocal cord polyp                  1
turner syndrome                   1
cryptorchidism                    1
                                 ..
headache after lumbar puncture    1
eye alignment disorder            1
cellulitis or abscess of mouth    1
fracture of the hand              1
atrophic vaginitis                1
Name: count, Length: 773, dtype: int64


In [None]:
# Records per disease after filtering. Counting on the raw column is required:
# value_counts() over the already-deduplicated labels reports 1 for every
# disease and carries no information.
unique_diseases = filtered_df['diseases'].unique()
dis_count = filtered_df['diseases'].value_counts()
print(dis_count)

panic disorder                 1
vaginitis                      1
transient ischemic attack      1
pyelonephritis                 1
chronic pain disorder          1
                              ..
gastroduodenal ulcer           1
eczema                         1
white blood cell disease       1
spondylolisthesis              1
conjunctivitis due to virus    1
Name: count, Length: 201, dtype: int64


In [None]:
# export

# index=False: without it the row index is written as an unnamed first column,
# which comes back as an extra 'Unnamed: 0' column when the file is reloaded
# with pd.read_csv and would be mistaken for a feature.
df_augmented.to_csv('properfinal.csv', index=False)
# Trigger a browser download of the exported file from the Colab runtime.
files.download('properfinal.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>