In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
import joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.utils import class_weight

# --- PART 1: LOAD AND CLEAN THE RAW DATA ---

# Names assigned, in order, to the columns of the header-less raw .data files.
_PATIENT_COLUMNS = ['age', 'sex']
_HISTORY_FLAG_COLUMNS = [
    'on_thyroxine',
    'query_on_thyroxine',
    'on_antithyroid_medication',
    'sick',
    'pregnant',
    'thyroid_surgery',
    'I131_treatment',
    'query_hypothyroid',
    'query_hyperthyroid',
    'lithium',
    'goitre',
    'tumor',
    'hypopituitary',
    'psych',
]
# Each lab value is preceded by its "<name>_measured" flag column.
_LAB_COLUMNS = [
    column
    for lab in ('TSH', 'T3', 'TT4', 'T4U', 'FTI', 'TBG')
    for column in (f'{lab}_measured', lab)
]
COLUMN_NAMES = [
    *_PATIENT_COLUMNS,
    *_HISTORY_FLAG_COLUMNS,
    *_LAB_COLUMNS,
    'referral_source',
    'diagnosis',
]

def clean_thyroid_data(filename):
    """Read a raw thyroid ``.data`` file into a cleaned DataFrame.

    The file is parsed as header-less CSV whose columns are named by
    ``COLUMN_NAMES``; ``?`` entries are read as missing values. The
    ``diagnosis`` column keeps only the text before its first ``|``, and
    the age/lab columns are coerced to numeric, with unparseable entries
    becoming NaN.

    Args:
        filename: Path of the raw file, passed straight to ``pd.read_csv``.

    Returns:
        The cleaned DataFrame, or ``None`` when the file does not exist
        (an error message is printed instead of raising).
    """
    try:
        frame = pd.read_csv(filename, header=None, names=COLUMN_NAMES, na_values='?')
        # Discard everything from the first '|' onwards in each diagnosis.
        frame['diagnosis'] = frame['diagnosis'].str.split('|').str.get(0)
        numeric_cols = ['age', 'TSH', 'T3', 'TT4', 'T4U', 'FTI', 'TBG']
        frame[numeric_cols] = frame[numeric_cols].apply(pd.to_numeric, errors='coerce')
    except FileNotFoundError:
        print(f"Error: The file '{filename}' was not found.")
        return None
    else:
        print(f"Successfully loaded and cleaned '{filename}'.")
        return frame

# Load and clean both files
hypo_df = clean_thyroid_data('allhypo.data')
hyper_df = clean_thyroid_data('allhyper.data')

# --- NEW: Save the individually cleaned files ---
if hypo_df is not None:
    hypo_df.to_csv('allhypo_cleaned.csv', index=False)
    print("Saved cleaned hypo data to 'allhypo_cleaned.csv'")

if hyper_df is not None:
    hyper_df.to_csv('allhyper_cleaned.csv', index=False)
    print("Saved cleaned hyper data to 'allhyper_cleaned.csv'")

# clean_thyroid_data returns None for a missing file. Both inputs are needed
# from here on, so stop with an explicit error rather than failing below with
# a TypeError when a column is assigned on None.
missing_inputs = [
    source
    for source, frame in (('allhypo.data', hypo_df), ('allhyper.data', hyper_df))
    if frame is None
]
if missing_inputs:
    raise FileNotFoundError(
        "Cannot build the combined dataset; missing input file(s): "
        + ", ".join(missing_inputs)
    )

# Combine them into a single DataFrame, tagging each row with its source file.
# NOTE(review): if both files describe the same patients, a row that is
# 'negative.' in one file may be a positive case in the other; confirm that
# simple concatenation does not introduce conflicting labels.
hypo_df['condition'] = 'hypothyroid'
hyper_df['condition'] = 'hyperthyroid'
combined_df = pd.concat([hypo_df, hyper_df], ignore_index=True)


# --- PART 2: CREATE THE 3-CLASS TARGET AND FINALIZE CLEANING ---
# Work on an independent deep copy so combined_df itself is left unmodified.
df_3class = combined_df.copy(deep=True)

def classify_diagnosis(diagnosis):
    """Map a raw diagnosis label to one of the coarse target categories.

    The value is converted to ``str`` and stripped of surrounding whitespace
    before matching, so non-string inputs (e.g. NaN) are handled too.

    Returns:
        ``'Normal'`` for exactly ``'negative.'``; ``'Hypothyroid'`` when the
        label contains ``'hypothyroid'``; ``'Hyperthyroid'`` when it contains
        ``'hyperthyroid'`` or ``'toxic'``; otherwise ``'Other'``.
    """
    label = str(diagnosis).strip()
    if label == 'negative.':
        return 'Normal'
    if 'hypothyroid' in label:
        return 'Hypothyroid'
    if any(keyword in label for keyword in ('hyperthyroid', 'toxic')):
        return 'Hyperthyroid'
    return 'Other'

# Derive the coarse category for every row, then keep only the three classes
# that have a numeric code in target_map.
df_3class['target_category'] = df_3class['diagnosis'].apply(classify_diagnosis)
# .copy() makes the filtered frame independent of the one it was selected
# from, so adding the 'target' column below is an unambiguous write and does
# not raise SettingWithCopyWarning.
df_3class = df_3class[df_3class['target_category'] != 'Other'].copy()
target_map = {'Normal': 0, 'Hypothyroid': 1, 'Hyperthyroid': 2}
df_3class['target'] = df_3class['target_category'].map(target_map)

# Encode categorical columns
# Strip surrounding whitespace from every object-dtype column first, so the
# exact-match lookups below are not defeated by padded values.
object_cols = [name for name in df_3class.columns if df_3class[name].dtype == 'object']
for col in object_cols:
    df_3class[col] = df_3class[col].str.strip()

binary_tf_cols = [
    'on_thyroxine',
    'query_on_thyroxine',
    'on_antithyroid_medication',
    'sick',
    'pregnant',
    'thyroid_surgery',
    'I131_treatment',
    'query_hypothyroid',
    'query_hyperthyroid',
    'lithium',
    'goitre',
    'tumor',
    'hypopituitary',
    'psych',
]
# Values that are not a key of the mapping become NaN.
tf_codes = {'f': 0, 't': 1}
for col in binary_tf_cols:
    df_3class[col] = df_3class[col].map(tf_codes)

sex_codes = {'F': 0, 'M': 1}
df_3class['sex'] = df_3class['sex'].map(sex_codes)

# Drop unnecessary columns: the "<lab>_measured" flag columns, the TBG value,
# and the text columns that are not used as model features.
measured_flag_cols = [
    f'{lab}_measured' for lab in ('TSH', 'T3', 'TT4', 'T4U', 'FTI', 'TBG')
]
cols_to_drop = measured_flag_cols + [
    'TBG',
    'referral_source',
    'diagnosis',
    'condition',
    'target_category',
]
df_3class = df_3class.drop(cols_to_drop, axis=1)

# Handle missing numerical values: fill the gaps in each numeric column with
# that column's median.
for col in df_3class.select_dtypes(include=np.number).columns:
    if df_3class[col].isnull().any():
        median_val = df_3class[col].median()
        # Assign the filled Series back to the frame. Calling
        # fillna(..., inplace=True) on df[col] is a chained in-place update:
        # it warns on recent pandas and does not modify the frame at all
        # under Copy-on-Write.
        df_3class[col] = df_3class[col].fillna(median_val)

# --- NEW: Save the final prepared 3-class data ---
# Write the prepared frame without its index, then report how many rows fall
# into each target class.
df_3class.to_csv(path_or_buf='thyroid_final_3class_data.csv', index=False)
print("\nSaved final prepared 3-class data to 'thyroid_final_3class_data.csv'")
class_counts = df_3class['target'].value_counts()
print(class_counts)