In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount recursively and print the full path of every file.
for folder, _, files_in_folder in os.walk('/kaggle/input'):
    for file_name in files_in_folder:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Introduction
This project predicts thyroid cancer recurrence after radioactive iodine (RAI) therapy using supervised and unsupervised machine learning. 

For supervised learning, Random Forest and Logistic Regression were used.
For unsupervised learning, K-Means and Hierarchical Clustering were used.

## Dataset Overview

- **Source**: Institutional dataset (uploaded manually)
- **Records**: 383 patients
- **Features**: 13 attributes
- **Target variable**: `Recurred` (Yes/No)

Part 1 – Pre-processing/Exploring the data:

In [None]:
# Load the dataset from the Kaggle input directory
df = pd.read_csv('/kaggle/input/thyroid2/filtered_thyroid_data.csv')
print(df.shape)        # Should return (383, 13)

# Preview the first rows. This must be the LAST expression of the cell:
# a bare `df.head()` in the middle of a cell is evaluated and thrown away,
# so the preview would never be rendered.
df.head()


In [None]:
# Report the (rows, columns) tuple, then the per-column dtype / non-null summary
dataset_shape = df.shape
print("Shape:", dataset_shape)
df.info()


In [None]:
# Encode categorical variables
# Every object-dtype column of `df` is replaced by integer codes in a copy
# (`df_encoded`); `df` itself is left untouched for the plots further down.
label_cols = df.select_dtypes(include='object').columns

df_encoded = df.copy()

# One fitted encoder per column. Refitting a single shared LabelEncoder would
# overwrite its `classes_` on every iteration, so the category <-> integer
# mapping of every column except the last would be lost. With the dict, the
# mapping stays available, e.g. `label_encoders['Recurred'].classes_` or
# `label_encoders[col].inverse_transform(...)`.
label_encoders = {}
for col in label_cols:
    le = LabelEncoder()
    df_encoded[col] = le.fit_transform(df_encoded[col])
    label_encoders[col] = le

df_encoded.head()


In [None]:
# Sanity check on the encoding: the columns that were `object` dtype in `df`
# should now be listed with an integer dtype in `df_encoded`.
df_encoded.info()

In [None]:


# Age histogram (20 bins) with a KDE curve overlaid
# Reference: https://seaborn.pydata.org/generated/seaborn.histplot.html
fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(data=df, x='Age', bins=20, kde=True, ax=ax)
ax.set_title('Age Distribution')
plt.show()


In [None]:
# Class balance of the target column `Recurred`
# Reference: https://seaborn.pydata.org/generated/seaborn.countplot.html
ax = sns.countplot(data=df, x='Recurred')
ax.set_title('Recurred Class Balance')
plt.show()

# Relative frequency of each class (fractions summing to 1)
recurred_share = df['Recurred'].value_counts(normalize=True)
print(recurred_share)

In [None]:
# Balance of the `Gender` feature
# Reference: https://seaborn.pydata.org/generated/seaborn.countplot.html
ax = sns.countplot(data=df, x='Gender')
ax.set_title('Gender  Balance')
plt.show()

# Relative frequency of each gender (fractions summing to 1)
gender_share = df['Gender'].value_counts(normalize=True)
print(gender_share)

The gender distribution is imbalanced.

In [None]:
# Age distribution per recurrence class, shown as box plots
# Reference: https://seaborn.pydata.org/generated/seaborn.boxplot.html
ax = sns.boxplot(data=df, x='Recurred', y='Age')
ax.set_title('Age vs Recurred')
plt.show()

In [None]:
# Count plots of the key categorical features, split by recurrence status.
# (Loop over the categories: cleaner version suggested by ChatGPT.)
# NOTE(review): 'Hx Radiothreapy' presumably matches the CSV header spelling
# exactly; do not "correct" it here without renaming the column.
cat_cols = ['Gender','Hx Radiothreapy', 'Adenopathy', 'Pathology', 'Focality', 'Risk', 'Stage', 'Response']
for col in cat_cols:
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.countplot(data=df, x=col, hue='Recurred', ax=ax)
    ax.set_title(f'{col} vs. Recurrence')
    plt.xticks(rotation=45)  # tilt the category labels
    plt.show()

In [None]:
# Only a small share of patients had prior radiotherapy, so test whether
# `Hx Radiothreapy` is associated with recurrence before deciding to drop it.
from scipy.stats import fisher_exact

# Cross-tabulate radiotherapy history (rows) against recurrence (columns)
radiotherapy_vs_recurred = pd.crosstab(index=df['Hx Radiothreapy'], columns=df['Recurred'])
odds_ratio, p_value = fisher_exact(radiotherapy_vs_recurred)
print(f"p-value: {p_value:.4f}")  # Significant if p < 0.05

p < 0.05, so the null hypothesis (that prior radiotherapy and recurrence are independent) is rejected. `Hx Radiothreapy` is therefore kept as a feature.

In [None]:
# 2-D PCA overview of the encoded features, colored by the target class.
# (Code block generated with ChatGPT 4-Turbo.)
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt

# 1) Separate the target from the features
y = df_encoded['Recurred']
X = df_encoded.drop(columns=['Recurred'])

# 2) Standardize every feature
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 3) Project onto the first two principal components
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# 4) Plotting frame: the two components plus the class label.
#    The label is attached by index, like a plain column assignment.
pca_df = pd.DataFrame(X_pca, columns=['PC1', 'PC2']).assign(Recurred=y)

# 5) Scatter plot of the projection, one color per class
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=pca_df, x='PC1', y='PC2', hue='Recurred', palette='Set1', ax=ax)
ax.set_title('PCA Projection (2D) Colored by Recurred Class')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.grid(True)
ax.legend(title='Recurred')
plt.show()

Part 2: Supervised learning

In [None]:
# Features / target for the supervised models
# Reference: https://www.geeksforgeeks.org/random-forest-algorithm-in-machine-learning/
target = 'Recurred'
y = df_encoded[target]
X = df_encoded.drop(columns=[target])

# Hold-out split: 70% train / 30% test, stratified on the target, fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

print("Train size:", X_train.shape)
print("Test size:", X_test.shape)


LogisticRegression

In [None]:
# Baseline logistic regression
# Adapted from https://www.kaggle.com/code/gururajbhase/logistic-regression
from sklearn.linear_model import LogisticRegression

# Default regularization; iteration cap raised to 1000
lr = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

# Per-class precision / recall / F1 on the held-out split
print("Logistic Regression Performance:")
print(classification_report(y_test, y_pred_lr))

# Confusion-matrix heatmap (snippet generated by ChatGPT 4-Turbo)
cm_lr = confusion_matrix(y_test, y_pred_lr)
ax = sns.heatmap(cm_lr, annot=True, fmt='d')
ax.set_title("Confusion Matrix: Logistic Regression")
plt.show()


In [None]:
# Sweep the regularization parameter C and print a report for each setting.
# NOTE(review): every setting is scored on X_test, so picking the best C from
# these reports would tune on the test split; consider a validation split
# or cross-validation.
for C in (0.01, 0.1, 1, 10):
    model = LogisticRegression(C=C, max_iter=1000).fit(X_train, y_train)
    preds = model.predict(X_test)
    print(f"Logistic Regression with C={C}")
    print(classification_report(y_test, preds))
    print('-'*50)


RandomForest

In [None]:
# Baseline random forest
# Adapted from https://www.kaggle.com/code/dansbecker/exercise-random-forests
from sklearn.ensemble import RandomForestClassifier

# 100 trees, seed fixed at 42 so the fit is repeatable
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predictions for the held-out split (X_test)
y_pred_rf = rf_model.predict(X_test)

# Per-class precision / recall / F1
print("Random Forest Performance:")
print(classification_report(y_test, y_pred_rf))

# Confusion-matrix heatmap (snippet generated by ChatGPT 4-Turbo)
cm_rf = confusion_matrix(y_test, y_pred_rf)
ax = sns.heatmap(cm_rf, annot=True, fmt='d')
ax.set_title("Confusion Matrix: Random Forest")
plt.show()

In [None]:
# Try different n_estimators and max_depth and print a report for each pair.
# Each candidate is bound to its own name (`rf_candidate`) so the sweep does
# not overwrite the baseline `rf_model` fitted in the previous cell.
# NOTE(review): candidates are scored on X_test; choosing hyperparameters from
# these reports would tune on the test split.
for n in [50, 100, 200]:
    for d in [None, 5, 10]:
        rf_candidate = RandomForestClassifier(n_estimators=n, max_depth=d, random_state=42)
        rf_candidate.fit(X_train, y_train)
        preds = rf_candidate.predict(X_test)
        print(f"Random Forest (n_estimators={n}, max_depth={d})")
        print(classification_report(y_test, preds))
        print('-'*60)


In [None]:
# Compare training and test accuracy; a large gap between them points to
# overfitting.
# The forest is bound to `rf_model` in the cells above (there is no `rf`
# variable in this notebook), so this scores whichever RandomForestClassifier
# was most recently assigned to that name.
train_acc = rf_model.score(X_train, y_train)
test_acc = rf_model.score(X_test, y_test)

print(f"Random Forest Training Accuracy: {train_acc:.2f}")
print(f"Random Forest Test Accuracy: {test_acc:.2f}")


Part 3 – Unsupervised learning
K-Means

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Use encoded data without the target column
# NOTE(review): unlike the earlier PCA cell, these features are not
# standardized; K-Means is distance-based, so a wide-range column such as Age
# may dominate the clustering. Confirm this is intended.
X_unsupervised = df_encoded.drop(columns=['Recurred'])

# Cleaned up with ChatGPT 4-Turbo: a loop over k instead of one line per
# cluster count.
# Try different k values and record the silhouette score of each clustering.
silhouette_scores = []

for k in range(2, 10):
    # n_init is pinned explicitly: scikit-learn warns about the implicit
    # default in 1.2-1.3 and changed it from 10 to 'auto' in 1.4, so leaving
    # it out makes the scores depend on the installed version.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    labels = kmeans.fit_predict(X_unsupervised)
    score = silhouette_score(X_unsupervised, labels)
    silhouette_scores.append(score)
    print(f"K={k} => Silhouette Score: {score:.4f}")


In [None]:
# Silhouette score as a function of k (same k range as the sweep: 2..9)
k_values = list(range(2, 10))

fig, ax = plt.subplots()
ax.plot(k_values, silhouette_scores, marker='o')
ax.set_title("K-Means Silhouette Scores")
ax.set_xlabel("Number of Clusters (k)")
ax.set_ylabel("Silhouette Score")
ax.grid(True)
plt.show()


In [None]:
# Fit KMeans with the chosen K. Set here at 2.
# n_init is pinned explicitly: scikit-learn warns about the implicit default
# in 1.2-1.3 and changed it from 10 to 'auto' in 1.4, so leaving it out makes
# the cluster assignment depend on the installed version.
kmeans_final = KMeans(n_clusters=2, n_init=10, random_state=42)
clusters = kmeans_final.fit_predict(X_unsupervised)

# Visualize with PCA: project the (unscaled) clustering input onto two
# components purely for plotting. `X_pca` is reused by the hierarchical
# clustering plot further down.
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_unsupervised)

plt.figure(figsize=(8,5))
sns.scatterplot(x=X_pca[:,0], y=X_pca[:,1], hue=clusters, palette='Set2')
plt.title("K-Means Clustering (PCA projection)")
plt.show()


Hierarchical Clustering

In [None]:
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html#scipy.cluster.hierarchy.linkage
from scipy.cluster.hierarchy import dendrogram, linkage

# Create linkage matrix (Ward linkage on the encoded features)
linked = linkage(X_unsupervised, method='ward')

# Draw the dendrogram. Title, axis labels and an explicit plt.show() are set
# so the figure stands on its own, like the other plots in this notebook.
fig = plt.figure(figsize=(25, 10))
dn = dendrogram(linked)
plt.title("Hierarchical Clustering Dendrogram (Ward linkage)")
plt.xlabel("Sample index")
plt.ylabel("Merge distance")
plt.show()


In [None]:
# Cluster labels from Agglomerative Clustering (Ward linkage, 3 clusters)
# Code section generated by ChatGPT 4-Turbo
from sklearn.cluster import AgglomerativeClustering

agg = AgglomerativeClustering(n_clusters=3, linkage='ward')
labels_agg = agg.fit_predict(X_unsupervised)

# Show the clusters on the 2-D PCA projection (`X_pca`) computed in the
# K-Means visualization cell above.
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=labels_agg, palette='Set1', ax=ax)
ax.set_title("Hierarchical Clustering (PCA projection)")
plt.show()
