## Load Raw Data

In [8]:
print("--- Step 1: Loading Raw Data ---")
# Column names applied to the headerless raw files, in file order.
cols = [
    'age', 'sex', 'on thyroxine', 'query on thyroxine', 'on antithyroid medication', 'sick',
    'pregnant', 'thyroid surgery', 'I131 treatment', 'query hypothyroid', 'query hyperthyroid',
    'lithium', 'goitre', 'tumor', 'hypopituitary', 'psych', 'TSH measured', 'TSH',
    'T3 measured', 'T3', 'TT4 measured', 'TT4', 'T4U measured', 'T4U', 'FTI measured', 'FTI',
    'TBG measured', 'TBG', 'referral source', 'diagnosis'
]
try:
    # na_values='?' makes pandas read every '?' cell as NaN.
    df_hypo = pd.read_csv('allhypo.data', header=None, names=cols, na_values='?')
    df_hyper = pd.read_csv('allhyper.data', header=None, names=cols, na_values='?')
    print("✅ Raw data loaded.")
except FileNotFoundError:
    print("❌ Error: Raw data files ('allhypo.data', 'allhyper.data') not found.")
    # Re-raise rather than calling exit(): inside a notebook exit() asks the
    # kernel to shut down (losing all state), and it would also hide the
    # original exception that names the file that is actually missing.
    raise
    

--- Step 1: Loading and Combining Raw Data ---
✅ Raw data loaded and combined.


## Clean Diagnosis Labels and Combine Files

In [14]:
print("\n--- Step 2: Combining Files Safely ---")


def _diagnosis_label(raw):
    """Return the text before the first '.' of ``str(raw)``, whitespace-stripped."""
    label, _, _ = str(raw).partition('.')
    return label.strip()


# Normalise the diagnosis column of both frames in place.
for frame in (df_hypo, df_hyper):
    frame['diagnosis'] = frame['diagnosis'].map(_diagnosis_label)

# Only the non-negative rows of the hyper file are appended, so 'negative'
# rows come from the hypo file alone.
# NOTE(review): if both files describe the same patients, a row that is
# positive here may also be present in df_hypo labelled 'negative' — confirm
# and de-duplicate if so.
is_positive = df_hyper['diagnosis'].ne('negative')
df_hyper_positive = df_hyper.loc[is_positive]

df_combined = pd.concat([df_hypo, df_hyper_positive], ignore_index=True)
print("✅ Files safely combined.")


--- Step 2: Combining Files Safely ---
✅ Files safely combined.


## Create Target and Drop Useless Columns 


In [15]:
print("\n--- Step 3: Creating Target and Dropping Columns ---")
class_mapping = {
    'negative': 'Negative', 'compensated hypothyroid': 'Compensated Hypothyroid',
    'primary hypothyroid': 'Primary Hypothyroid', 'secondary hypothyroid': 'Primary Hypothyroid',
    'hyperthyroid': 'Hyperthyroid', 'T3 toxic': 'Hyperthyroid',
    'toxic goitre': 'Hyperthyroid', 'secondary toxic': 'Hyperthyroid', 'goitre': 'Hyperthyroid'
}
df_combined['target'] = df_combined['diagnosis'].map(class_mapping)
df_combined = df_combined.dropna(subset=['target'])
measured_cols = ['TSH measured', 'T3 measured', 'TT4 measured', 'T4U measured', 'FTI measured', 'TBG measured']
df_full_processed = df_combined.drop(columns=['diagnosis', 'TBG', 'referral source'] + measured_cols)
print("✅ Target column created and unnecessary columns dropped.")


--- Step 3: Creating Target and Dropping Columns ---
✅ Target column created and unnecessary columns dropped.


## Encode All Categorical & Binary Columns

In [16]:
print("\n--- Step 4: Encoding Categorical Features ---")
binary_cols = ['on thyroxine', 'query on thyroxine', 'on antithyroid medication', 'sick', 'pregnant', 'thyroid surgery', 'I131 treatment', 'query hypothyroid', 'query hyperthyroid', 'lithium', 'goitre', 'tumor', 'hypopituitary', 'psych']
for col in binary_cols:
    flag = df_full_processed[col]
    # Idempotence guard: once a column holds integers it has already been
    # encoded. Mapping it again would turn every value into NaN (0/1 are not
    # keys of the mapping) and the fillna(0) below would silently zero the
    # whole feature when this cell is re-run.
    if pd.api.types.is_integer_dtype(flag):
        continue
    # 'f' -> 0, 't' -> 1; anything else (including missing) falls back to 0.
    df_full_processed[col] = flag.map({'f': 0, 't': 1}).fillna(0).astype(int)

sex_mode = df_full_processed['sex'].mode()[0]
# Same guard for 'sex': only encode while it still holds the raw 'F'/'M' labels.
if not pd.api.types.is_integer_dtype(df_full_processed['sex']):
    # Missing sex is filled with the most frequent value, then 'F' -> 0, 'M' -> 1.
    df_full_processed['sex'] = df_full_processed['sex'].fillna(sex_mode).map({'F': 0, 'M': 1}).fillna(0).astype(int)
print("✅ Categorical features encoded.")


--- Step 4: Encoding Categorical Features ---
✅ Categorical features encoded.


## Impute Missing Numerical Values

In [17]:
print("\n--- Step 5: Imputing Numerical Features ---")
# Every numeric column whose name does not contain 'target' is imputed.
numerical_cols = [
    name
    for name in df_full_processed.select_dtypes(include=np.number).columns
    if 'target' not in name
]
# Fill each column's missing values with that column's own median.
column_medians = df_full_processed[numerical_cols].median()
df_full_processed[numerical_cols] = df_full_processed[numerical_cols].fillna(column_medians)
print("✅ Numerical features imputed.")


--- Step 5: Imputing Numerical Features ---
✅ Numerical features imputed.


## Encode Target 

In [18]:
print("\n--- Step 6: Finalizing and Saving Files ---")
# Fit the encoder on the string class names, then store the integer codes in
# a separate column so the original 'target' text is still available.
label_encoder = LabelEncoder()
label_encoder.fit(df_full_processed['target'])
df_full_processed['target_encoded'] = label_encoder.transform(df_full_processed['target'])
learned_classes = list(label_encoder.classes_)
print(f"Encoder has been fitted. Learned classes: {learned_classes}")


--- Step 6: Finalizing and Saving Files ---
Encoder has been fitted. Learned classes: ['Compensated Hypothyroid', 'Hyperthyroid', 'Negative', 'Primary Hypothyroid']


## Save Final Files

In [19]:
# Persist the fitted encoder so the integer codes can be mapped back to class names.
encoder_filename = 'target_label_encoder.pkl'
with open(encoder_filename, 'wb') as encoder_file:
    encoder_file.write(pickle.dumps(label_encoder))
print(f"✅ Correctly FITTED encoder object saved to '{encoder_filename}'.")

# Replace the text 'target' column with its encoded counterpart under the same name.
df_final = (
    df_full_processed
    .drop(columns='target')
    .rename(columns={'target_encoded': 'target'})
)
csv_filename = 'thyroid_final_cleaned.csv'
df_final.to_csv(csv_filename, index=False)
print(f"✅ Final clean data saved to '{csv_filename}'.")

print("\n\nData preparation complete.")

✅ Correctly FITTED encoder object saved to 'target_label_encoder.pkl'.
✅ Final clean data saved to 'thyroid_final_cleaned.csv'.


Data preparation complete.


## Preview the Final Data

In [20]:
# Show the first five rows of the saved frame as a sanity check.
df_final.head(n=5)

Unnamed: 0,age,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,...,goitre,tumor,hypopituitary,psych,TSH,T3,TT4,T4U,FTI,target
0,41.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1.3,2.5,125.0,1.14,109.0,2
1,23.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,4.1,2.0,102.0,0.98,108.0,2
2,46.0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0.98,2.0,109.0,0.91,120.0,2
3,70.0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0.16,1.9,175.0,0.98,108.0,2
4,70.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0.72,1.2,61.0,0.87,70.0,2
