In [1]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
!pip install xgboost
!pip install imbalanced-learn



In [2]:
df = pd.read_csv("hypothyroid.csv")

In [3]:
df.head()

Unnamed: 0,age,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,...,TT4 measured,TT4,T4U measured,T4U,FTI measured,FTI,TBG measured,TBG,referral source,binaryClass
0,41,F,f,f,f,f,f,f,f,f,...,t,125,t,1.14,t,109,f,?,SVHC,P
1,23,F,f,f,f,f,f,f,f,f,...,t,102,f,?,f,?,f,?,other,P
2,46,M,f,f,f,f,f,f,f,f,...,t,109,t,0.91,t,120,f,?,other,P
3,70,F,t,f,f,f,f,f,f,f,...,t,175,f,?,f,?,f,?,other,P
4,70,F,f,f,f,f,f,f,f,f,...,t,61,t,0.87,t,70,f,?,SVI,P


In [4]:
df.replace('?', np.nan, inplace=True)

In [5]:
# Dropping TBG column as the entire column has no entries
df.drop('TBG', axis=1, inplace=True)

In [6]:
df.dropna(subset=['sex'], inplace=True)
df.dropna(subset=['age'], inplace=True)

In [7]:
df.drop("referral source", axis=1, inplace = True)

In [8]:
df.head()

Unnamed: 0,age,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,...,T3 measured,T3,TT4 measured,TT4,T4U measured,T4U,FTI measured,FTI,TBG measured,binaryClass
0,41,F,f,f,f,f,f,f,f,f,...,t,2.5,t,125,t,1.14,t,109.0,f,P
1,23,F,f,f,f,f,f,f,f,f,...,t,2.0,t,102,f,,f,,f,P
2,46,M,f,f,f,f,f,f,f,f,...,f,,t,109,t,0.91,t,120.0,f,P
3,70,F,t,f,f,f,f,f,f,f,...,t,1.9,t,175,f,,f,,f,P
4,70,F,f,f,f,f,f,f,f,f,...,t,1.2,t,61,t,0.87,t,70.0,f,P


## Replacing T/F , M/F and P/N with 1 and 0 

In [9]:
columns_with_tf_values = df.columns[df.isin(['t', 'f']).all()]
print(columns_with_tf_values)

Index(['on thyroxine', 'query on thyroxine', 'on antithyroid medication',
       'sick', 'pregnant', 'thyroid surgery', 'I131 treatment',
       'query hypothyroid', 'query hyperthyroid', 'lithium', 'goitre', 'tumor',
       'hypopituitary', 'psych', 'TSH measured', 'T3 measured', 'TT4 measured',
       'T4U measured', 'FTI measured', 'TBG measured'],
      dtype='object')


In [10]:
columns_to_replace = ['on thyroxine', 'query on thyroxine', 'on antithyroid medication',
                      'sick', 'pregnant', 'thyroid surgery', 'I131 treatment',
                      'query hypothyroid', 'query hyperthyroid', 'lithium', 'goitre', 'tumor',
                      'hypopituitary', 'psych', 'TSH measured', 'T3 measured', 'TT4 measured',
                      'T4U measured', 'FTI measured', 'TBG measured']

replace_dict = {'t': 1, 'f': 0}

df[columns_to_replace] = df[columns_to_replace].replace(replace_dict)

In [11]:
columns_to_replace = ['sex']

replace_dict = {'M': 0, 'F': 1}

df[columns_to_replace] = df[columns_to_replace].replace(replace_dict)

In [12]:
columns_to_replace = ['binaryClass']

replace_dict = {'P': 1, 'N': 0}

df[columns_to_replace] = df[columns_to_replace].replace(replace_dict)

print("Unique values after replacement:", df['binaryClass'].unique())
unique_counts = df['binaryClass'].value_counts()
print(unique_counts)

Unique values after replacement: [1 0]
binaryClass
1    3341
0     280
Name: count, dtype: int64


In [13]:
df['age'] = pd.to_numeric(df['age'], errors='coerce')

## Replacing missing attributes with median

In [14]:
missing_values = df.isnull().sum()
print(missing_values)

age                            0
sex                            0
on thyroxine                   0
query on thyroxine             0
on antithyroid medication      0
sick                           0
pregnant                       0
thyroid surgery                0
I131 treatment                 0
query hypothyroid              0
query hyperthyroid             0
lithium                        0
goitre                         0
tumor                          0
hypopituitary                  0
psych                          0
TSH measured                   0
TSH                          352
T3 measured                    0
T3                           745
TT4 measured                   0
TT4                          217
T4U measured                   0
T4U                          367
FTI measured                   0
FTI                          365
TBG measured                   0
binaryClass                    0
dtype: int64


In [15]:
columns_to_fill = ['TSH', 'T3', 'TT4', 'T4U', 'FTI']
for column in columns_to_fill:
    df[column] = pd.to_numeric(df[column], errors='coerce')
    median_value = np.median(df[column].dropna())
    df[column].fillna(median_value, inplace=True)

In [16]:
df.dtypes

age                            int64
sex                            int64
on thyroxine                   int64
query on thyroxine             int64
on antithyroid medication      int64
sick                           int64
pregnant                       int64
thyroid surgery                int64
I131 treatment                 int64
query hypothyroid              int64
query hyperthyroid             int64
lithium                        int64
goitre                         int64
tumor                          int64
hypopituitary                  int64
psych                          int64
TSH measured                   int64
TSH                          float64
T3 measured                    int64
T3                           float64
TT4 measured                   int64
TT4                          float64
T4U measured                   int64
T4U                          float64
FTI measured                   int64
FTI                          float64
TBG measured                   int64
b

## SMOTE Oversampling

In [26]:
from imblearn.over_sampling import SMOTENC
import pandas as pd
import numpy as np

# Define the list of numerical column names
numerical_cols = ['age', 'TSH', 'T3', 'TT4', 'T4U', 'FTI']

# Define the list of categorical column names
categorical_cols = ['sex', 'on thyroxine', 'query on thyroxine', 'on antithyroid medication', 'sick', 'pregnant',
                    'thyroid surgery', 'I131 treatment', 'query hypothyroid', 'query hyperthyroid', 'lithium',
                    'goitre', 'tumor', 'hypopituitary', 'psych', 'TSH measured', 'T3 measured', 'TT4 measured',
                    'T4U measured', 'FTI measured', 'TBG measured', 'referral source']

# Separate the feature columns (X) and the target column (y)
X = df.drop(columns=['binaryClass'])
y = df['binaryClass']

# Apply SMOTE-NC to oversample the minority class
smote_nc = SMOTENC(categorical_features=np.arange(len(categorical_cols)))
X_resampled, y_resampled = smote_nc.fit_resample(X, y)

# Convert the resampled arrays back to a DataFrame
df_resampled = pd.DataFrame(X_resampled, columns=X.columns)
df_resampled['binaryClass'] = y_resampled

# Update the original DataFrame with the resampled data
df = df_resampled.copy()

## Feature Scaling

In [27]:
from sklearn.preprocessing import MinMaxScaler
numerical_cols = ['age', 'TSH', 'T3', 'TT4', 'T4U', 'FTI']
scaler = MinMaxScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

In [28]:
df.head()

Unnamed: 0,age,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,...,T3 measured,T3,TT4 measured,TT4,T4U measured,T4U,FTI measured,FTI,TBG measured,binaryClass
0,0.088106,1,0,0,0,0,0,0,0,0,...,1,0.232227,1,0.287383,1,0.429952,1,0.272265,0,1
1,0.048458,1,0,0,0,0,0,0,0,0,...,1,0.184834,1,0.233645,0,0.352657,0,0.267176,0,1
2,0.099119,0,0,0,0,0,0,0,0,0,...,0,0.184834,1,0.25,1,0.318841,1,0.300254,0,1
3,0.151982,1,1,0,0,0,0,0,0,0,...,1,0.175355,1,0.404206,0,0.352657,0,0.267176,0,1
4,0.151982,1,0,0,0,0,0,0,0,0,...,1,0.109005,1,0.13785,1,0.299517,1,0.173028,0,1


In [29]:
num_entries = df.shape[0]  # Access the number of rows (entries)
print("Number of entries:", num_entries)

Number of entries: 6396


## Chi-Square Test for feature selection

In [30]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Define the numerical columns for feature scaling and selection
numerical_cols = ['age', 'TSH', 'T3', 'TT4', 'T4U', 'FTI']

# Select the relevant columns for feature selection
selected_cols = ['sex', 'on thyroxine', 'query on thyroxine', 'on antithyroid medication', 'sick', 'pregnant',
                 'thyroid surgery', 'I131 treatment', 'query hypothyroid', 'query hyperthyroid', 'lithium',
                 'goitre', 'tumor', 'hypopituitary', 'psych', 'TSH measured', 'T3 measured', 'TT4 measured',
                 'T4U measured', 'FTI measured', 'TBG measured', 'binaryClass'] + numerical_cols

# Subset the DataFrame with the selected columns
df_selected = df[selected_cols]

# Perform feature selection using chi-square test
X = df_selected.drop('binaryClass', axis=1)  # Independent variables
y = df_selected['binaryClass']  # Dependent variable

selector = SelectKBest(score_func=chi2, k=5)  # Select top 5 features
X_selected = selector.fit_transform(X, y)

# Get the selected feature names
selected_feature_names = X.columns[selector.get_support()]

# Print the selected features
print("Selected features:")
print(selected_feature_names)

Selected features:
Index(['on thyroxine', 'query hyperthyroid', 'tumor', 'psych', 'TSH'], dtype='object')


# Model training and testing

# 1.) KNN

In [31]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Select the desired features from the resampled DataFrame
selected_features = ['on thyroxine', 'query hypothyroid', 'TSH', 'TT4', 'FTI']
X_resampled_selected = df_resampled[selected_features]
y_resampled = df_resampled['binaryClass']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled_selected, y_resampled, test_size=0.2, random_state=42)

# Create a KNN classifier
knn = KNeighborsClassifier()

# Fit the classifier to the training data
knn.fit(X_train, y_train)

# Predict the classes for the test set
y_pred = knn.predict(X_test)

# Calculate and print the evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(classification_rep)

Accuracy: 0.96640625
Precision: 0.9750778816199377
Recall: 0.9586523736600306
F1 Score: 0.9667953667953668
Confusion Matrix:
[[611  16]
 [ 27 626]]
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.97      0.97       627
           1       0.98      0.96      0.97       653

    accuracy                           0.97      1280
   macro avg       0.97      0.97      0.97      1280
weighted avg       0.97      0.97      0.97      1280



# 2.) SVM

In [37]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Select the desired features from the resampled DataFrame
selected_features = ['on thyroxine', 'query hypothyroid', 'TSH', 'TT4', 'FTI']
X_resampled_selected = df[selected_features]
y_resampled = df['binaryClass']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled_selected, y_resampled, test_size=0.2, random_state=42)

# Create an SVM classifier
svm = SVC()

# Fit the classifier to the training data
svm.fit(X_train, y_train)

# Predict the classes for the test set
y_pred = svm.predict(X_test)

# Calculate and print the evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(classification_rep)

Accuracy: 0.9578125
Precision: 0.9600614439324117
Recall: 0.9571209800918836
F1 Score: 0.9585889570552149
Confusion Matrix:
[[601  26]
 [ 28 625]]
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.96      0.96       627
           1       0.96      0.96      0.96       653

    accuracy                           0.96      1280
   macro avg       0.96      0.96      0.96      1280
weighted avg       0.96      0.96      0.96      1280



## 3.) Random Forest 

In [42]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Select the desired features from the resampled DataFrame
selected_features = ['on thyroxine', 'query hypothyroid', 'TSH', 'TT4', 'FTI']
X_resampled_selected = df[selected_features]
y_resampled = df['binaryClass']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled_selected, y_resampled, test_size=0.2, random_state=42)

# Create a Random Forest classifier
rf = RandomForestClassifier()

# Fit the classifier to the training data
rf.fit(X_train, y_train)

# Predict the classes for the test set
y_pred = rf.predict(X_test)

# Calculate and print the evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(classification_rep)
# Check the sizes of the training and test sets
print("Training set size:", X_train.shape[0])
print("Test set size:", X_test.shape[0])

Accuracy: 0.99375
Precision: 0.9969183359013868
Recall: 0.9908116385911179
F1 Score: 0.9938556067588324
Confusion Matrix:
[[625   2]
 [  6 647]]
Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       627
           1       1.00      0.99      0.99       653

    accuracy                           0.99      1280
   macro avg       0.99      0.99      0.99      1280
weighted avg       0.99      0.99      0.99      1280

Training set size: 5116
Test set size: 1280


## 3.) Random Forest WITH GRID SEARCH CV

In [25]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Define the desired features from the resampled DataFrame
selected_features = ['on thyroxine', 'query hypothyroid', 'TSH', 'TT4', 'FTI']
X_resampled_selected = df_resampled[selected_features]
y_resampled = df_resampled['binaryClass']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled_selected, y_resampled, test_size=0.2, random_state=42)

# Define the parameter grid to search through
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10]
}

# Create the RandomForestClassifier
rf = RandomForestClassifier()

# Create the GridSearchCV object
grid_search = GridSearchCV(rf, param_grid, scoring='accuracy', cv=5)

# Fit the GridSearchCV object to the training data
grid_search.fit(X_train, y_train)

# Get the best parameters and model performance
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Print the results
print("Best Parameters:", best_params)
print("Best Score:", best_score)

# Use the best model for prediction
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Calculate and print the evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(classification_rep)

Best Parameters: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 200}
Best Score: 0.9979420018709074
Accuracy: 0.9970082273747195
Precision: 1.0
Recall: 0.9941860465116279
F1 Score: 0.9970845481049563
Confusion Matrix:
[[649   0]
 [  4 684]]
Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       649
           1       1.00      0.99      1.00       688

    accuracy                           1.00      1337
   macro avg       1.00      1.00      1.00      1337
weighted avg       1.00      1.00      1.00      1337

