In [1]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
!pip install xgboost



In [2]:
df = pd.read_csv("hypothyroid.csv")

In [3]:
df.head()

Unnamed: 0,age,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,...,TT4 measured,TT4,T4U measured,T4U,FTI measured,FTI,TBG measured,TBG,referral source,binaryClass
0,41,F,f,f,f,f,f,f,f,f,...,t,125,t,1.14,t,109,f,?,SVHC,P
1,23,F,f,f,f,f,f,f,f,f,...,t,102,f,?,f,?,f,?,other,P
2,46,M,f,f,f,f,f,f,f,f,...,t,109,t,0.91,t,120,f,?,other,P
3,70,F,t,f,f,f,f,f,f,f,...,t,175,f,?,f,?,f,?,other,P
4,70,F,f,f,f,f,f,f,f,f,...,t,61,t,0.87,t,70,f,?,SVI,P


In [4]:
df.replace('?', np.nan, inplace=True)

In [5]:
# Dropping TBG column as the entire column has no entries
df.drop('TBG', axis=1, inplace=True)

In [6]:
df.dropna(subset=['sex'], inplace=True)
df.dropna(subset=['age'], inplace=True)

In [7]:
df.drop("referral source", axis=1, inplace = True)

## Replacing T/F , M/F and P/N with 1 and 0 

In [8]:
columns_with_tf_values = df.columns[df.isin(['t', 'f']).all()]
print(columns_with_tf_values)

Index(['on thyroxine', 'query on thyroxine', 'on antithyroid medication',
       'sick', 'pregnant', 'thyroid surgery', 'I131 treatment',
       'query hypothyroid', 'query hyperthyroid', 'lithium', 'goitre', 'tumor',
       'hypopituitary', 'psych', 'TSH measured', 'T3 measured', 'TT4 measured',
       'T4U measured', 'FTI measured', 'TBG measured'],
      dtype='object')


In [9]:
columns_to_replace = ['on thyroxine', 'query on thyroxine', 'on antithyroid medication',
                      'sick', 'pregnant', 'thyroid surgery', 'I131 treatment',
                      'query hypothyroid', 'query hyperthyroid', 'lithium', 'goitre', 'tumor',
                      'hypopituitary', 'psych', 'TSH measured', 'T3 measured', 'TT4 measured',
                      'T4U measured', 'FTI measured', 'TBG measured']

replace_dict = {'t': 1, 'f': 0}

df[columns_to_replace] = df[columns_to_replace].replace(replace_dict)

In [10]:
columns_to_replace = ['sex']

replace_dict = {'M': 0, 'F': 1}

df[columns_to_replace] = df[columns_to_replace].replace(replace_dict)

In [11]:
columns_to_replace = ['binaryClass']

replace_dict = {'P': 1, 'N': 0}

df[columns_to_replace] = df[columns_to_replace].replace(replace_dict)

print("Unique values after replacement:", df['binaryClass'].unique())
unique_counts = df['binaryClass'].value_counts()
print(unique_counts)

Unique values after replacement: [1 0]
binaryClass
1    3341
0     280
Name: count, dtype: int64


In [12]:
df['age'] = pd.to_numeric(df['age'], errors='coerce')

## Finding and removing Outliers

In [13]:
# import pandas as pd
# import numpy as np

# # Define the numerical columns in your dataset
# numerical_cols = ['age', 'TSH', 'T3', 'TT4', 'T4U', 'FTI']

# # Convert the numerical columns to numeric data type
# df[numerical_cols] = df[numerical_cols].apply(pd.to_numeric, errors='coerce')

# # Drop rows with missing values in the numerical columns
# df.dropna(subset=numerical_cols, inplace=True)

# # Calculate the quartiles and IQR for each numerical column
# Q1 = df[numerical_cols].quantile(0.25)
# Q3 = df[numerical_cols].quantile(0.75)
# IQR = Q3 - Q1

# # Define the lower and upper bounds for outlier detection
# lower_bound = Q1 - 1.5 * IQR
# upper_bound = Q3 + 1.5 * IQR

# # Identify the outliers in each numerical column
# outliers = (df[numerical_cols] < lower_bound) | (df[numerical_cols] > upper_bound)

# # Count the number of outliers removed
# num_outliers_removed = outliers.any(axis=1).sum()

# # Remove the rows containing outliers
# df = df[~outliers.any(axis=1)]

# # Print the number of outliers removed
# print("Number of outliers removed:", num_outliers_removed)

## Replacing missing attributes with median

In [14]:
missing_values = df.isnull().sum()
print(missing_values)

age                            0
sex                            0
on thyroxine                   0
query on thyroxine             0
on antithyroid medication      0
sick                           0
pregnant                       0
thyroid surgery                0
I131 treatment                 0
query hypothyroid              0
query hyperthyroid             0
lithium                        0
goitre                         0
tumor                          0
hypopituitary                  0
psych                          0
TSH measured                   0
TSH                          352
T3 measured                    0
T3                           745
TT4 measured                   0
TT4                          217
T4U measured                   0
T4U                          367
FTI measured                   0
FTI                          365
TBG measured                   0
binaryClass                    0
dtype: int64


In [15]:
columns_to_fill = ['TSH', 'T3', 'TT4', 'T4U', 'FTI']
for column in columns_to_fill:
    df[column] = pd.to_numeric(df[column], errors='coerce')
    median_value = np.median(df[column].dropna())
    df[column].fillna(median_value, inplace=True)

In [16]:
df.dtypes

age                            int64
sex                            int64
on thyroxine                   int64
query on thyroxine             int64
on antithyroid medication      int64
sick                           int64
pregnant                       int64
thyroid surgery                int64
I131 treatment                 int64
query hypothyroid              int64
query hyperthyroid             int64
lithium                        int64
goitre                         int64
tumor                          int64
hypopituitary                  int64
psych                          int64
TSH measured                   int64
TSH                          float64
T3 measured                    int64
T3                           float64
TT4 measured                   int64
TT4                          float64
T4U measured                   int64
T4U                          float64
FTI measured                   int64
FTI                          float64
TBG measured                   int64
b

## Feature Scaling

In [17]:
from sklearn.preprocessing import MinMaxScaler
numerical_cols = ['age', 'TSH', 'T3', 'TT4', 'T4U', 'FTI']
scaler = MinMaxScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

In [18]:
df.head()

Unnamed: 0,age,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,...,T3 measured,T3,TT4 measured,TT4,T4U measured,T4U,FTI measured,FTI,TBG measured,binaryClass
0,0.088106,1,0,0,0,0,0,0,0,0,...,1,0.232227,1,0.287383,1,0.429952,1,0.272265,0,1
1,0.048458,1,0,0,0,0,0,0,0,0,...,1,0.184834,1,0.233645,0,0.352657,0,0.267176,0,1
2,0.099119,0,0,0,0,0,0,0,0,0,...,0,0.184834,1,0.25,1,0.318841,1,0.300254,0,1
3,0.151982,1,1,0,0,0,0,0,0,0,...,1,0.175355,1,0.404206,0,0.352657,0,0.267176,0,1
4,0.151982,1,0,0,0,0,0,0,0,0,...,1,0.109005,1,0.13785,1,0.299517,1,0.173028,0,1


## Chi-Square Test for feature selection

In [19]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Define the numerical columns for feature scaling and selection
numerical_cols = ['age', 'TSH', 'T3', 'TT4', 'T4U', 'FTI']

# Select the relevant columns for feature selection
selected_cols = ['sex', 'on thyroxine', 'query on thyroxine', 'on antithyroid medication', 'sick', 'pregnant',
                 'thyroid surgery', 'I131 treatment', 'query hypothyroid', 'query hyperthyroid', 'lithium',
                 'goitre', 'tumor', 'hypopituitary', 'psych', 'TSH measured', 'T3 measured', 'TT4 measured',
                 'T4U measured', 'FTI measured', 'TBG measured', 'binaryClass'] + numerical_cols

# Subset the DataFrame with the selected columns
df_selected = df[selected_cols]

# Perform feature selection using chi-square test
X = df_selected.drop('binaryClass', axis=1)  # Independent variables
y = df_selected['binaryClass']  # Dependent variable

selector = SelectKBest(score_func=chi2, k=5)  # Select top 5 features
X_selected = selector.fit_transform(X, y)

# Get the selected feature names
selected_feature_names = X.columns[selector.get_support()]

# Print the selected features
print("Selected features:")
print(selected_feature_names)

Selected features:
Index(['on thyroxine', 'query hypothyroid', 'TSH', 'TT4', 'FTI'], dtype='object')


# Model training and testing

# 1.) KNN

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

# Create a KNN classifier with k=3
knn = KNeighborsClassifier(n_neighbors=3)

# Fit the classifier to the training data
knn.fit(X_train, y_train)

# Predict the classes for the test set
y_pred = knn.predict(X_test)

# Calculate and print the evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(classification_rep)

Accuracy: 0.9641379310344828
Precision: 0.9647577092511013
Recall: 0.9969650986342944
F1 Score: 0.9805970149253731
Confusion Matrix:
[[ 42  24]
 [  2 657]]
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.64      0.76        66
           1       0.96      1.00      0.98       659

    accuracy                           0.96       725
   macro avg       0.96      0.82      0.87       725
weighted avg       0.96      0.96      0.96       725



# 2.) SVM

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

# Create an SVM classifier
svm = SVC()

# Fit the classifier to the training data
svm.fit(X_train, y_train)

# Predict the classes for the test set
y_pred = svm.predict(X_test)

# Calculate and print the evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(classification_rep)


Accuracy: 0.9337931034482758
Precision: 0.9321074964639321
Recall: 1.0
F1 Score: 0.9648609077598829
Confusion Matrix:
[[ 18  48]
 [  0 659]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.27      0.43        66
           1       0.93      1.00      0.96       659

    accuracy                           0.93       725
   macro avg       0.97      0.64      0.70       725
weighted avg       0.94      0.93      0.92       725



# 3.) Random Forest

In [22]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

# Create a Random Forest classifier
rf = RandomForestClassifier()

# Fit the classifier to the training data
rf.fit(X_train, y_train)

# Predict the classes for the test set
y_pred = rf.predict(X_test)

# Calculate and print the evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(classification_rep)


Accuracy: 0.9944827586206897
Precision: 1.0
Recall: 0.9939301972685888
F1 Score: 0.9969558599695586
Confusion Matrix:
[[ 66   0]
 [  4 655]]
Classification Report:
              precision    recall  f1-score   support

           0       0.94      1.00      0.97        66
           1       1.00      0.99      1.00       659

    accuracy                           0.99       725
   macro avg       0.97      1.00      0.98       725
weighted avg       0.99      0.99      0.99       725



# 4.) Naive Bayes

In [23]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

# Create a Naive Bayes classifier
naive_bayes = GaussianNB()

# Fit the classifier to the training data
naive_bayes.fit(X_train, y_train)

# Predict the classes for the test set
y_pred = naive_bayes.predict(X_test)

# Calculate and print the evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(classification_rep)


Accuracy: 0.9475862068965517
Precision: 0.9506531204644412
Recall: 0.9939301972685888
F1 Score: 0.9718100890207716
Confusion Matrix:
[[ 32  34]
 [  4 655]]
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.48      0.63        66
           1       0.95      0.99      0.97       659

    accuracy                           0.95       725
   macro avg       0.92      0.74      0.80       725
weighted avg       0.95      0.95      0.94       725



# 5.) XGBoost

In [24]:
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

# Create an XGBoost classifier
xgb = XGBClassifier()

# Fit the classifier to the training data
xgb.fit(X_train, y_train)

# Predict the classes for the test set
y_pred = xgb.predict(X_test)

# Calculate and print the evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(classification_rep)


Accuracy: 0.9944827586206897
Precision: 1.0
Recall: 0.9939301972685888
F1 Score: 0.9969558599695586
Confusion Matrix:
[[ 66   0]
 [  4 655]]
Classification Report:
              precision    recall  f1-score   support

           0       0.94      1.00      0.97        66
           1       1.00      0.99      1.00       659

    accuracy                           0.99       725
   macro avg       0.97      1.00      0.98       725
weighted avg       0.99      0.99      0.99       725



# 6.) Kernel SVM

In [25]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

# Create an SVM classifier with a kernel (e.g., 'rbf' for Gaussian kernel)
svm = SVC(kernel='rbf')

# Fit the classifier to the training data
svm.fit(X_train, y_train)

# Predict the classes for the test set
y_pred = svm.predict(X_test)

# Calculate and print the evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(classification_rep)

Accuracy: 0.9337931034482758
Precision: 0.9321074964639321
Recall: 1.0
F1 Score: 0.9648609077598829
Confusion Matrix:
[[ 18  48]
 [  0 659]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.27      0.43        66
           1       0.93      1.00      0.96       659

    accuracy                           0.93       725
   macro avg       0.97      0.64      0.70       725
weighted avg       0.94      0.93      0.92       725

