In [None]:
import pandas as pd
import numpy as np

# -----------------------------
# Read the cleaned thyroid data and report the starting class balance
# -----------------------------
df = pd.read_csv("thyroid_cleaned.csv")

class_counts = df['target'].value_counts()
print("Original class distribution:\n", class_counts)

# -----------------------------
# Clamp every numeric feature into a fixed [lower, upper] window
# -----------------------------
# Per-column (lower, upper) bounds; values outside are pulled to the nearest bound.
caps = {
    'age': (0, 100),
    'TSH': (0.01, 100),
    'T3': (0.5, 6.0),
    'TT4': (45, 200),
    'T4U': (0.3, 1.7),
    'FTI': (50, 200),
}

# Later steps iterate over this list; every entry needs a matching key in `caps`.
numeric_cols = ['age', 'TSH', 'T3', 'TT4', 'T4U', 'FTI']

for col in numeric_cols:
    df[col] = df[col].clip(*caps[col])

# -----------------------------
# Introduce small realistic noise to numeric features of minority classes
# -----------------------------
minority_classes = [0, 1, 3]  # Normal, Hypothyroid, Compensated
np.random.seed(42)  # global NumPy seed; later np.random draws continue this stream

# Work in float so the fractional noise can be stored: writing floats into an
# integer column through .loc is a deprecated incompatible-dtype assignment
# in recent pandas releases.
df = df.astype({col: float for col in numeric_cols})

for cls in minority_classes:
    idx = df.index[df['target'] == cls]
    for col in numeric_cols:
        values = df.loc[idx, col]
        # Gaussian noise whose standard deviation is 5% of each individual value.
        noise = np.random.normal(0, 0.05 * values.to_numpy(), size=len(idx))
        # Re-apply the caps: noise added to a value already sitting on a bound
        # would otherwise push it back outside the range enforced earlier.
        min_val, max_val = caps[col]
        df.loc[idx, col] = (values + noise).clip(lower=min_val, upper=max_val)

# -----------------------------
# Fix categorical features realistically
# -----------------------------
# Columns: sex, on thyroxine, query on thyroxine, on antithyroid medication, sick, pregnant,
# thyroid surgery, I131 treatment, query hypothyroid, query hyperthyroid, lithium, goitre,
# tumor, hypopituitary, psych

categorical_cols = ['sex', 'on thyroxine', 'query on thyroxine', 'on antithyroid medication',
                    'sick', 'pregnant', 'thyroid surgery', 'I131 treatment', 'query hypothyroid',
                    'query hyperthyroid', 'lithium', 'goitre', 'tumor', 'hypopituitary', 'psych']

# One shared generator so every (column, class) pair draws its own 30% sample.
# Passing the same integer seed to each .sample() call would select the very
# same rows every time, turning all touched columns into copies of each other.
# A dedicated RandomState leaves the global np.random stream untouched.
flip_rng = np.random.RandomState(42)

for col in categorical_cols:
    # Only columns holding a single distinct (non-null) value are modified.
    if df[col].nunique() != 1:
        continue

    constant = df[col].dropna().iloc[0]
    # Write the opposite flag: 0 when the column is all 1, 1 when it is all 0.
    # Any other constant (e.g. a string label) falls back to 1.
    flipped = 1 - constant if constant in (0, 1) else 1

    for cls in minority_classes:
        idx = df[df['target'] == cls].sample(frac=0.3, random_state=flip_rng).index
        df.loc[idx, col] = flipped

# NOTE(review): only minority-class rows are changed, so these columns become
# correlated with `target` — confirm this is intended before training on them.

# -----------------------------
# Optional: Slight upsampling of minority classes using small perturbations
# -----------------------------
def realistic_upsample(df, target_class, factor=3, columns=None, noise_frac=0.05):
    """Create noisy copies of the rows belonging to one class.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; must contain a 'target' column. It is not modified.
    target_class
        Value of 'target' selecting the rows to replicate.
    factor : int, default 3
        Number of perturbed copies of the selected rows to generate.
    columns : sequence of str, optional
        Numeric columns to perturb. Defaults to the notebook-level
        ``numeric_cols`` list.
    noise_frac : float, default 0.05
        Standard deviation of the Gaussian noise, as a fraction of the
        absolute value of each cell.

    Returns
    -------
    pandas.DataFrame
        ``factor`` perturbed copies stacked vertically. The original index
        labels are kept, so labels repeat across copies. When ``factor`` is
        zero or negative an empty frame with the same columns is returned.
    """
    cols = numeric_cols if columns is None else list(columns)
    df_min = df[df['target'] == target_class]

    # pd.concat raises on an empty list, so short-circuit a non-positive factor.
    if factor <= 0:
        return df_min.iloc[0:0].copy()

    new_rows = []
    for _ in range(factor):
        tmp = df_min.copy()
        for col in cols:
            # abs(): np.random.normal rejects a negative scale, which a
            # negative feature value would otherwise produce.
            scale = noise_frac * tmp[col].abs().to_numpy()
            tmp[col] = tmp[col] + np.random.normal(0, scale, size=len(tmp))
        new_rows.append(tmp)
    return pd.concat(new_rows, axis=0)

# Generate the extra rows for each minority class, then append them all at once
extra_frames = [realistic_upsample(df, cls, factor=3) for cls in minority_classes]
df = pd.concat([df] + extra_frames, axis=0)

# Randomise the row order, then renumber the index from zero
df = df.sample(frac=1, random_state=42)
df = df.reset_index(drop=True)

# -----------------------------
# Report the class balance after upsampling
# -----------------------------
new_counts = df['target'].value_counts()
print("New class distribution:\n", new_counts)
print("Dataset ready for training with realistic patterns.")

# -----------------------------
# Write the processed frame to disk (index column omitted)
# -----------------------------
df.to_csv("thyroid_realistic_balanced.csv", index=False)
print("Saved as 'thyroid_realistic_balanced.csv'")


In [None]:
import pandas as pd
import numpy as np

# -------------------------------
# Step 1: Read the upsampled dataset back from disk
# -------------------------------
df = pd.read_csv("thyroid_realistic_balanced.csv")
print("Loaded dataset:", df.shape)
print(df['target'].value_counts())

# -------------------------------
# Step 2: Per-class multiplier ranges for the numeric features
# -------------------------------
numeric_features = ['age', 'TSH', 'T3', 'TT4', 'T4U', 'FTI']

# adjustments[class label][feature] -> (smallest multiplier, largest multiplier)
adjustments = {
    0: {
        'age': (0.95, 1.05),
        'TSH': (0.9, 1.1),
        'T3': (0.9, 1.1),
        'TT4': (0.95, 1.05),
        'T4U': (0.95, 1.05),
        'FTI': (0.95, 1.05),
    },
    1: {
        'age': (0.95, 1.05),
        'TSH': (1.0, 1.3),
        'T3': (0.7, 1.0),
        'TT4': (0.7, 1.0),
        'T4U': (0.95, 1.05),
        'FTI': (0.7, 1.0),
    },
    2: {
        'age': (0.95, 1.05),
        'TSH': (0.05, 0.8),
        'T3': (1.5, 2.5),
        'TT4': (1.2, 1.5),
        'T4U': (1.1, 1.5),
        'FTI': (1.2, 1.5),
    },
    3: {
        'age': (0.95, 1.05),
        'TSH': (0.8, 1.5),
        'T3': (0.8, 1.2),
        'TT4': (0.8, 1.2),
        'T4U': (0.9, 1.1),
        'FTI': (0.8, 1.2),
    },
}

# -------------------------------
# Step 3: Scale each numeric feature by a per-row random multiplier
# -------------------------------
df_adjusted = df.copy()
np.random.seed(42)

# Classes are visited in order of first appearance; for each one, every numeric
# feature gets its own batch of uniform multipliers drawn from that class's range.
for label in df['target'].unique():
    in_class = df['target'] == label
    n_rows = in_class.sum()
    for column in numeric_features:
        low, high = adjustments[label][column]
        multipliers = np.random.uniform(low, high, size=n_rows)
        df_adjusted.loc[in_class, column] = df_adjusted.loc[in_class, column] * multipliers

# Clamp every scaled feature to its allowed (lower, upper) window
clip_bounds = {
    'TSH': (0.01, 100),
    'T3': (0.05, 10.0),
    'TT4': (2.0, 430.0),
    'T4U': (0.3, 2.2),
    'FTI': (2.0, 400.0),
    'age': (1, 100),
}
for column, (low, high) in clip_bounds.items():
    df_adjusted[column] = df_adjusted[column].clip(low, high)

# -------------------------------
# Step 4: Write the adjusted frame to disk (index column omitted)
# -------------------------------
df_adjusted.to_csv("thyroid_clinical_balanced.csv", index=False)
print("Saved adjusted dataset as 'thyroid_clinical_balanced.csv'")
