##Setup and Loading

In [29]:
import pandas as pd
import numpy as np

# Define the column names for the dataset
cols = [
    'age', 'sex', 'on thyroxine', 'query on thyroxine', 'on antithyroid medication',
    'sick', 'pregnant', 'thyroid surgery', 'I131 treatment', 'query hypothyroid',
    'query hyperthyroid', 'lithium', 'goitre', 'tumor', 'hypopituitary', 'psych',
    'TSH measured', 'TSH', 'T3 measured', 'T3', 'TT4 measured', 'TT4',
    'T4U measured', 'T4U', 'FTI measured', 'FTI', 'TBG measured', 'TBG',
    'referral source', 'diagnosis'
]

# Load the datasets
try:
    df_hypo = pd.read_csv('allhypo.data', header=None, names=cols, na_values='?')
    df_hyper = pd.read_csv('allhyper.data', header=None, names=cols, na_values='?')
    print("✅ Datasets loaded successfully!")
except FileNotFoundError:
    print("❌ Error: Make sure 'allhypo.data' and 'allhyper.data' are in the correct directory.")



✅ Datasets loaded successfully!


In [6]:
df_hypo.head()

Unnamed: 0,age,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,...,TT4 measured,TT4,T4U measured,T4U,FTI measured,FTI,TBG measured,TBG,referral source,diagnosis
0,41.0,F,f,f,f,f,f,f,f,f,...,t,125.0,t,1.14,t,109.0,f,,SVHC,negative.|3733
1,23.0,F,f,f,f,f,f,f,f,f,...,t,102.0,f,,f,,f,,other,negative.|1442
2,46.0,M,f,f,f,f,f,f,f,f,...,t,109.0,t,0.91,t,120.0,f,,other,negative.|2965
3,70.0,F,t,f,f,f,f,f,f,f,...,t,175.0,f,,f,,f,,other,negative.|806
4,70.0,F,f,f,f,f,f,f,f,f,...,t,61.0,t,0.87,t,70.0,f,,SVI,negative.|2807


In [8]:
df_hyper.head()

Unnamed: 0,age,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,...,TT4 measured,TT4,T4U measured,T4U,FTI measured,FTI,TBG measured,TBG,referral source,diagnosis
0,41.0,F,f,f,f,f,f,f,f,f,...,t,125.0,t,1.14,t,109.0,f,,SVHC,negative.|3733
1,23.0,F,f,f,f,f,f,f,f,f,...,t,102.0,f,,f,,f,,other,negative.|1442
2,46.0,M,f,f,f,f,f,f,f,f,...,t,109.0,t,0.91,t,120.0,f,,other,negative.|2965
3,70.0,F,t,f,f,f,f,f,f,f,...,t,175.0,f,,f,,f,,other,negative.|806
4,70.0,F,f,f,f,f,f,f,f,f,...,t,61.0,t,0.87,t,70.0,f,,SVI,negative.|2807


##Converting Hidden Strings(Diagnosis) to readable data

In [30]:
# We split the string by '.' and take the first part
df_hypo['diagnosis'] = df_hypo['diagnosis'].apply(lambda x: x.split('.')[0])
df_hyper['diagnosis'] = df_hyper['diagnosis'].apply(lambda x: x.split('.')[0])

print("Classes in allhyper dataset:")
print(df_hyper['diagnosis'].unique())

Classes in allhyper dataset:
['negative' 'hyperthyroid' 'T3 toxic' 'goitre']


In [31]:
df_hypo.head()

Unnamed: 0,age,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,...,TT4 measured,TT4,T4U measured,T4U,FTI measured,FTI,TBG measured,TBG,referral source,diagnosis
0,41.0,F,f,f,f,f,f,f,f,f,...,t,125.0,t,1.14,t,109.0,f,,SVHC,negative
1,23.0,F,f,f,f,f,f,f,f,f,...,t,102.0,f,,f,,f,,other,negative
2,46.0,M,f,f,f,f,f,f,f,f,...,t,109.0,t,0.91,t,120.0,f,,other,negative
3,70.0,F,t,f,f,f,f,f,f,f,...,t,175.0,f,,f,,f,,other,negative
4,70.0,F,f,f,f,f,f,f,f,f,...,t,61.0,t,0.87,t,70.0,f,,SVI,negative


##Creating new with hyper and hypo

In [36]:
# Based on your output, these are the classes we'll group as 'Hyperthyroid'
hyper_classes = ['hyperthyroid', 'T3 toxic', 'goitre']

# Filter the hyperthyroid dataframe to only include these classes
df_hyper_filtered = df_hyper[df_hyper['diagnosis'].isin(hyper_classes)].copy()

# Create the 'target' column for the filtered hyperthyroid cases
df_hyper_filtered['target'] = 'Hyperthyroid'

# Now, let's process the hypo dataframe. We'll map its classes to the final targets.
# We will also include the 'negative' cases from this file.
class_mapping = {
    'negative': 'Negative',
    'compensated hypothyroid': 'Compensated Hypothyroid',
    'primary hypothyroid': 'Primary Hypothyroid',
    'secondary hypothyroid': 'Primary Hypothyroid'  # Grouping secondary with primary
}

# Create the 'target' column in the hypo dataframe
df_hypo['target'] = df_hypo['diagnosis'].map(class_mapping)

# Combine the processed hypo dataframe with the filtered hyperthyroid dataframe
df_combined = pd.concat([df_hypo, df_hyper_filtered], ignore_index=True)

# Clean up by dropping the original diagnosis column and any rows that weren't mapped
df_combined = df_combined.drop(columns=['diagnosis'])
df_combined = df_combined.dropna(subset=['target'])

print("✅ Datasets combined and target column created!")

# Now you can proceed to Step 4 to verify the result
print("\nFinal class distribution:")
print(df_combined['target'].value_counts())

✅ Datasets combined and target column created!

Final class distribution:
target
Negative                   2580
Compensated Hypothyroid     154
Hyperthyroid                 77
Primary Hypothyroid          66
Name: count, dtype: int64


In [38]:
df_combined.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2877 entries, 0 to 2876
Data columns (total 30 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   age                        2876 non-null   float64
 1   sex                        2762 non-null   object 
 2   on thyroxine               2877 non-null   object 
 3   query on thyroxine         2877 non-null   object 
 4   on antithyroid medication  2877 non-null   object 
 5   sick                       2877 non-null   object 
 6   pregnant                   2877 non-null   object 
 7   thyroid surgery            2877 non-null   object 
 8   I131 treatment             2877 non-null   object 
 9   query hypothyroid          2877 non-null   object 
 10  query hyperthyroid         2877 non-null   object 
 11  lithium                    2877 non-null   object 
 12  goitre                     2877 non-null   object 
 13  tumor                      2877 non-null   objec

##Removing Unnecessary field 

In [40]:
# Create a list of all '... measured' columns to drop
measured_cols = [
    'TSH measured', 'T3 measured', 'TT4 measured',
    'T4U measured', 'FTI measured', 'TBG measured'
]

# Drop the specified columns from the original combined dataframe
df_full_processed = df_combined.drop(columns=['TBG', 'referral source'] + measured_cols)

print(" Unnecessary columns dropped from the full dataset.")
print(f"DataFrame shape is now: {df_full_processed.shape}")

 Unnecessary columns dropped from the full dataset.
DataFrame shape is now: (2877, 22)


##Encoding All Categorical & Binary Columns with Numerics

In [51]:
binary_cols = [
    'on thyroxine', 'query on thyroxine', 'on antithyroid medication', 'sick',
    'pregnant', 'thyroid surgery', 'I131 treatment', 'query hypothyroid',
    'query hyperthyroid', 'lithium', 'goitre', 'tumor', 'hypopituitary', 'psych'
]

for col in binary_cols:
    df_full_processed[col] = df_full_processed[col].map({'f': 0, 't': 1})
    df_full_processed[col] = df_full_processed[col].fillna(0) # The fix
    df_full_processed[col] = df_full_processed[col].astype(int)

sex_mode_value = df_full_processed['sex'].mode()
if not sex_mode_value.empty:
    df_full_processed['sex'] = df_full_processed['sex'].fillna(sex_mode_value[0]) # The fix

df_full_processed['sex'] = df_full_processed['sex'].map({'F': 0, 'M': 1})
df_full_processed['sex'] = df_full_processed['sex'].fillna(0) # The fix
df_full_processed['sex'] = df_full_processed['sex'].astype(int)

print(" All binary and 'sex' columns cleaned and encoded without warnings.")



##Imputing missing/NaN values with median

In [53]:
# Create a boolean mask for rows containing any NaN values
nan_rows_mask = df_full_processed.isnull().any(axis=1)

# Filter the DataFrame to see only the rows with NaNs
rows_with_nan_before = df_full_processed[nan_rows_mask]


print(f"Found {len(rows_with_nan_before)} rows with at least one missing value.")
print("Here are some of them:")

# Using display() gives a nicely formatted table in notebooks
display(rows_with_nan_before.head())

Found 786 rows with at least one missing value.
Here are some of them:


Unnamed: 0,age,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,...,goitre,tumor,hypopituitary,psych,TSH,T3,TT4,T4U,FTI,target
1,23.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,4.1,2.0,102.0,,,Negative
2,46.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0.98,,109.0,0.91,120.0,Negative
3,70.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0.16,1.9,175.0,,,Negative
5,18.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0.03,,183.0,1.3,141.0,Negative
6,59.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,,,72.0,0.92,78.0,Negative


In [55]:
numerical_cols = df_full_processed.select_dtypes(include=np.number).columns.tolist()
for col in numerical_cols:
    median_val = df_full_processed[col].median()
    df_full_processed[col] = df_full_processed[col].fillna(median_val) 

print("\n All missing numerical values imputed without warnings.")




In [57]:
nan_rows_mask = df_full_processed.isnull().any(axis=1)

rows_with_nan_before = df_full_processed[nan_rows_mask]


print(f"Found {len(rows_with_nan_before)} rows with at least one missing value.")
print("Here are some of them:")

display(rows_with_nan_before.head())

Found 0 rows with at least one missing value.
Here are some of them:


Unnamed: 0,age,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,...,goitre,tumor,hypopituitary,psych,TSH,T3,TT4,T4U,FTI,target


##Saving the Encoder

In [None]:
from sklearn.preprocessing import LabelEncoder
import joblib

label_encoder = LabelEncoder()
df_full_processed['target_encoded'] = label_encoder.fit_transform(df_full_processed['target'])


df_final = df_full_processed.drop('target', axis=1)
df_final.rename(columns={'target_encoded': 'target'}, inplace=True)
joblib.dump(label_encoder, 'target_label_encoder.pkl')

print(" Target label encoder saved")
print("\nTarget encoded. Final dataset is ready!")
df_final.info()

 Target label encoder saved

Target encoded. Final dataset is ready!
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2877 entries, 0 to 2876
Data columns (total 22 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   age                        2877 non-null   float64
 1   sex                        2877 non-null   int64  
 2   on thyroxine               2877 non-null   int64  
 3   query on thyroxine         2877 non-null   int64  
 4   on antithyroid medication  2877 non-null   int64  
 5   sick                       2877 non-null   int64  
 6   pregnant                   2877 non-null   int64  
 7   thyroid surgery            2877 non-null   int64  
 8   I131 treatment             2877 non-null   int64  
 9   query hypothyroid          2877 non-null   int64  
 10  query hyperthyroid         2877 non-null   int64  
 11  lithium                    2877 non-null   int64  
 12  goitre                     2877 non

##Saving the Cleaned Dataset

In [62]:
df_final.to_csv('thyroid_final_cleaned.csv', index=False)

print(" Successfully saved ")

 Successfully saved 
