In [None]:
#@title Import librarys
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.svm import LinearSVC, SVC
from sklearn.tree import DecisionTreeClassifier

In [None]:
#@title Initialize Variables
# Location of the thyroid CSV file read below.
path = "/content/thyroidDF.csv"
# `data` keeps the frame exactly as it was read from disk.
data = pd.read_csv(path)
# `df` is an independent deep copy; the cleaning cells reassign it.
df = data.copy(deep=True)
le = LabelEncoder()

## Data cleaning


### Understand the data

---


In [None]:
#@title `head()`
df.head(10)

In [None]:
#@title Find `columns`
col = df.columns
col

In [None]:
#@title Plotting the number of `null values`
def plotna(data=None, col=None):
  """Draw a bar chart of the number of missing values in each column.

  Args:
    data: DataFrame to inspect. When omitted, the notebook-level `df` is
      looked up at call time, so a bare `plotna()` always reflects the
      current frame rather than the one that existed when this cell ran.
    col: Labels for the x axis, one per column of `data`. When omitted,
      `data.columns` is used so labels and counts always line up.
  """
  if data is None:
    data = df
  if col is None:
    col = data.columns
  # One count per column: number of NaN/None entries.
  nulV = data.isna().sum()
  plt.bar(col, nulV)
  plt.xlabel("Columns")
  plt.ylabel("Number of null values")
  plt.xticks(rotation=90, ha='right')
  plt.show()

# Plot the missing-value counts of `df` before any cleaning has been applied.
plotna(df, col)
# NOTE(review): `plot` is not read by any visible cell — confirm before removing.
plot = 0

In [None]:
#@title `shape` <b>Before Cleaning the data </b>
df.shape

In [None]:
#@title `describe()`
df.describe()

In [None]:
#@title  `info()`
df.info()

In [None]:
#@title <b>Five summary</B>
summary = df.describe()

In [None]:
# @title <b>Visualization</b> of 5 Summary

# One figure per column reported by `df.describe()`: a box plot next to a
# histogram, both drawn from the rows that fall inside the 1.5 * IQR fences.
for column_name in summary.columns:
    values = df[column_name]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1

    # Keep only the rows within [q1 - 1.5 * iqr, q3 + 1.5 * iqr] (inclusive).
    inside_fences = values.between(q1 - 1.5 * iqr, q3 + 1.5 * iqr)
    filtered_data = df[inside_fences]

    # 1 row, 2 columns: box plot on the left, histogram on the right.
    fig, (box_ax, hist_ax) = plt.subplots(1, 2, figsize=(12, 6))

    sns.boxplot(x=filtered_data[column_name], width=0.35, ax=box_ax, color="lightblue")
    box_ax.set_xlabel(f"{column_name} Levels")
    box_ax.set_title(f"Box Plot of {column_name} Levels (Outliers Removed)")

    sns.histplot(filtered_data[column_name], bins=20, kde=True, ax=hist_ax, color="red")
    hist_ax.set_xlabel(f"{column_name} Levels")
    hist_ax.set_title(f"Distribution of {column_name}")

    plt.tight_layout()
    plt.show()




---


## Eliminate Null Values


---





In [None]:
plotna()

In [None]:
#@title Cleaning ` Age ` column
# Keep only the rows whose age is at most 100. Rows with a missing age fail
# the comparison as well, so they are removed by the same filter.
df = df.loc[df['age'].le(100)]
df['age'].isna().sum()

In [None]:
#@title Cleaning `Gender` column
df = df.dropna(subset = ['sex'])

In [None]:
"""
No need to Clean ['query_on_thyroxine',
       'on_antithyroid_meds', 'sick', 'pregnant', 'thyroid_surgery',
       'I131_treatment', 'query_hypothyroid', 'query_hyperthyroid', 'lithium',
       'goitre', 'tumor', 'hypopituitary', 'psych', 'TSH_measured',]
 """

In [None]:
df.hypopituitary.unique()

In [None]:
df['TSH'].isna().sum()

In [None]:
#@title Changing NaN of `'TSH', 'T3', 'TT4', 'T4U', 'FTI'` columns to their respective medians

features_to_impute = ['TSH', 'T3', 'TT4', 'T4U', 'FTI']
# Each missing value is replaced by the median of its own column; `fillna`
# aligns the per-column medians with the frame by column label.
column_medians = df[features_to_impute].median()
df[features_to_impute] = df[features_to_impute].fillna(column_medians)

In [None]:
#@title Drop `TBG` column as <b>too many NaN Values<b>
df = df.drop(columns=['TBG'])

In [None]:
col = df.columns #Re-checking columns since we droped `TBG`

In [None]:
plotna(df, col)

### There is a problem in the dataset: the `target` column holds many raw diagnosis codes instead of a few classes. Its unique values are `['-', 'S', 'F', 'AK', 'R', 'I', 'M', 'N', 'G', 'K', 'A', 'L', 'MK','Q', 'J', 'C|I', 'O', 'LJ', 'H|K', 'GK', 'MI', 'KJ', 'P', 'FK','B', 'GI', 'C', 'GKJ', 'OI', 'D|R', 'D', 'E']`

In [None]:
# Categorizing different classes into 3 major thyroid conditions: Hyperthyroid, Hypothyroid and Negative (No Thyroid Condition)
hyperthyroid_conditions = ['A', 'B', 'C', 'D', 'O', 'P', 'Q', 'S', 'T']
hypothyroid_conditions = ['E', 'F', 'G', 'H', 'M']
normal_conditions = ['-']

def categorize_target(value):
    """Map a raw diagnosis string to one of three coarse classes.

    The string is treated as a run of single-character diagnosis codes,
    optionally separated by '|' (for example 'A', 'GK' or 'C|I'). Every code
    is inspected, so concatenated diagnoses such as 'AK' or 'GKJ' are
    classified by the codes they contain instead of being left unmatched.

    Precedence when several codes are present: Hyperthyroid, then
    Hypothyroid, then Negative.

    Args:
        value: Raw diagnosis string taken from the `target` column.

    Returns:
        'Hyperthyroid', 'Hypothyroid' or 'Negative'; None when `value` is not
        a string or contains none of the listed codes (e.g. 'K', 'R').
    """
    # Missing values (NaN/None) have no codes to inspect.
    if not isinstance(value, str):
        return None

    # Individual codes, ignoring the '|' separator and any whitespace.
    codes = [ch for ch in value if ch != '|' and not ch.isspace()]

    # Ordered from highest to lowest precedence.
    categories = (
        ('Hyperthyroid', hyperthyroid_conditions),
        ('Hypothyroid', hypothyroid_conditions),
        ('Negative', normal_conditions),
    )
    for label, members in categories:
        if any(code in members for code in codes):
            return label
    return None

#Applying 'categorize_target' function on 'target' column to categorize the values into 3 classes
# Rows for which `categorize_target` returns None are left with a missing target.
df['target'] = df['target'].apply(categorize_target)

In [None]:
# df = df.dropna()
# df.target.isna().sum()

---
#### Hurray!!! Data Is Clean And Ready To Use  
---

In [None]:
# @title Removing Unwanted data `'patient_id', 'TBG_measured', 'TSH_measured', 'T3_measured', 'TT4_measured', 'T4U_measured', 'FTI_measured', 'referral_source'`
# Columns treated as irrelevant or redundant for modelling: the patient id,
# the "*_measured" flags and the referral source.
columns_to_drop = [
    'patient_id',
    'TBG_measured',
    'TSH_measured',
    'T3_measured',
    'TT4_measured',
    'T4U_measured',
    'FTI_measured',
    'referral_source',
]

# Drop them along the column axis.
df = df.drop(columns_to_drop, axis=1)

In [None]:
df.hypopituitary.unique()

## Encoding Data using `LabelEncoder`
---

In [None]:
# Columns that are label-encoded to integer codes.
# NOTE: despite the name, the list also contains the multi-class 'target' column.
binary_cols = [
    'sex', 'on_thyroxine', 'query_on_thyroxine', 'on_antithyroid_meds', 'sick',
    'pregnant', 'thyroid_surgery', 'I131_treatment', 'query_hypothyroid',
    'query_hyperthyroid', 'lithium', 'goitre', 'tumor', 'hypopituitary', 'psych', 'target'
]

# One fitted encoder is kept per column so the integer codes can be mapped
# back to the original labels with `label_encoders[name].inverse_transform(...)`.
label_encoders = {}
# The loop variable is deliberately not called `col`: that notebook-level name
# holds `df.columns` for the plotting cells and must not be overwritten here.
for column_name in binary_cols:
    le = LabelEncoder()
    df[column_name] = le.fit_transform(df[column_name])  # Encode each listed column
    label_encoders[column_name] = le

In [None]:
# @title Scaling columns using `MinMaxScaler`

numerical_features = ['TSH', 'T3', 'TT4', 'T4U', 'FTI']
scaler = MinMaxScaler()
# NOTE(review): the scaler is fitted on the whole frame, before the
# train/test split made further down — confirm this is intended.
scaler.fit(df[numerical_features])
df[numerical_features] = scaler.transform(df[numerical_features])

In [None]:
df.head()

## Finding the Correlation Between Features
---

In [None]:
# Pairwise correlation of every column of `df` (features and target together).
correlation_matrix = df.corr()

# Annotated heatmap of the correlation matrix drawn on an explicit axes object.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap="coolwarm", cbar=True, ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()

In [None]:
correlation_matrix.target

# Create Models
---

*   **Linear regression (*Multiple*)**
*   **KNN**
*   **SVM**
*   **Decision Tree**

---


## Linear Regression
---

In [None]:
target_column = 'target'

# Labels are the target column; features are every other column.
y = df[target_column]
X = df.drop(columns=[target_column])

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Linear regression fitted on the training split.
# NOTE(review): `y` holds label-encoded class codes, so this regresses on
# class indices — confirm this baseline is intended.
LinearRegression_model = LinearRegression()
LinearRegression_model.fit(X_train, y_train)

In [None]:
# Score on the held-out split only. Predicting on the full `X` would mix in
# the rows the model was fitted on and inflate the score.
ypred = LinearRegression_model.predict(X_test)
linear_result = r2_score(y_test, ypred)
print("R^2 Score:", linear_result)

---
## KNN

---

In [None]:
# k-nearest-neighbours classifier (k=3, Euclidean distance) fitted on the
# training split. `KNeighborsClassifier` is imported in the import cell at the
# top of the notebook together with the other scikit-learn estimators.
KNN_model = KNeighborsClassifier(n_neighbors=3, metric='euclidean')
KNN_model.fit(X_train, y_train)

In [None]:
ypred = KNN_model.predict(X)
print(ypred)

In [None]:
# Accuracy on the held-out split. Predictions are recomputed here so the
# score does not include the rows the model was fitted on.
knn_result = accuracy_score(y_test, KNN_model.predict(X_test))
knn_result

---
## SVM
---
* Linear SVM
* Non-Linear SVM

In [None]:
# @title Linear SVM
LinearSVC_model = LinearSVC()
LinearSVC_model.fit(X_train, y_train)

In [None]:
# `ypred` keeps its full-data meaning for any later cell that reads it.
ypred = LinearSVC_model.predict(X)
# The reported accuracy uses the held-out split only, so it does not include
# the rows the model was fitted on.
linear_svm_result = accuracy_score(y_test, LinearSVC_model.predict(X_test))
linear_svm_result

In [None]:
# Confusion matrix on the held-out split (rows: true class, columns: predicted class).
confusion_matrix(y_test, LinearSVC_model.predict(X_test))

### Non-linear *SVM*

In [None]:
#@title Finding the best degree
# for i in range(50):
#   model = SVC (kernel = 'poly', degree = ).fit(X, y)
#   print("POLY3: ",i , model.score(X_train, y_train))

# Just to find out the best suitable degree

In [None]:
#@title SVM using `Radial Basis Function` ' rbf ' kernel
# Fitted on the training split only, like the other models, so the held-out
# rows are not seen during fitting.
RBFSVM_model = SVC(kernel="rbf").fit(X_train, y_train)

In [None]:
# Mean accuracy of the RBF SVM on the held-out split.
rbf_result = RBFSVM_model.score(X_test, y_test)
rbf_result

In [None]:
#@title SVM using `Polynomial` ' poly ' kernel
# NOTE(review): degree 20 was reported as the best degree, but it requires
# high computational power; degree 15 is what is actually fitted here — confirm.
# Fitted on the training split only, like the other models, so the held-out
# rows are not seen during fitting.
PolynomialSVM_model = SVC(kernel="poly", degree=15).fit(X_train, y_train)

In [None]:
# Mean accuracy of the polynomial SVM on the held-out split.
poly_result = PolynomialSVM_model.score(X_test, y_test)
poly_result

---
## Decision Tree
---

In [None]:
# Fixed seed so the fitted tree is the same on every run; 42 matches the seed
# used for the train/test split.
DecisionTree_model = DecisionTreeClassifier(random_state=42)
DecisionTree_model.fit(X_train, y_train)

In [None]:
# Accuracy on the held-out split only, with the arguments in the documented
# (y_true, y_pred) order.
ypred = DecisionTree_model.predict(X_test)
decision_tree_result = accuracy_score(y_test, ypred)

## Visualizing the performance of all the models

In [None]:
# Scores gathered from the evaluation cells, listed in the same order as `names`.
# The first entry is an R^2 score; the remaining entries are accuracies.
names = ['LinearRegression', 'KNN', 'LinearSVM', 'RBFSVM', 'PolynomialSVM', 'DecisionTree']
results = [linear_result, knn_result, linear_svm_result, rbf_result,  poly_result, decision_tree_result]

# Bar chart with one bar per model.
plt.figure(figsize=(8, 6))
sns.barplot(x=names, y=results, color="lightgreen")

# Titles, axis labels and tilted model names.
plt.title("Comparison of Model Results", fontsize=16)
plt.xlabel("Models", fontsize=14)
plt.ylabel("Results", fontsize=14)
plt.xticks(rotation=45)
plt.tight_layout()

plt.show()

# Same numbers as text, one line per model.
for name, result in zip(names, results):
  print(f"Result of {name} : {result}")


## Copying models to `thyroid_dection_models` using a dictionary data structure

In [None]:
model_name = ['LinearRegression_model', 'KNN_model', 'LinearSVC_model', 'RBFSVM_model', 'PolynomialSVM_model', 'DecisionTree_model']
model_list = [LinearRegression_model, KNN_model, LinearSVC_model, RBFSVM_model, PolynomialSVM_model, DecisionTree_model]
# Pair every name with the estimator at the same position.
models = dict(zip(model_name, model_list))

models

In [None]:
# Persist the whole `models` dictionary (name -> fitted estimator) in one pickle file.
# NOTE(review): `label_encoders` and `scaler` are not part of this file, so the
# same preprocessing has to be recreated before using the saved models — confirm.
file_name = "thyroid_dection_models.pkl"
with open(file_name, "wb") as file:
    pickle.dump(models, file)
print(f"Model saved as '{file_name}'")

In [None]:
# Read the pickled dictionary of models back from disk.
# Only unpickle files you created yourself: unpickling can execute arbitrary code.
with open(file_name, "rb") as file:
    loaded_model = pickle.load(file)

# Pick the decision tree out of the dictionary and predict with it.
decision_tree = loaded_model['DecisionTree_model']
ypred = decision_tree.predict(X)
ypred