In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

# Load dataset
df = pd.read_csv("dataset.csv")

# --- Preprocessing ---
# Drop irrelevant columns if any, or columns with too many unique values for one-hot encoding
# Based on the description, all columns seem relevant.

# Encode categorical features
# Identify categorical columns excluding the target 'Recurred' and numerical columns
categorical_cols = df.select_dtypes(include=['object', 'bool']).columns.tolist()
if 'Recurred' in categorical_cols:
    categorical_cols.remove('Recurred') # Exclude target if it's in this list

# Apply Label Encoding to categorical columns
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

# Encode the target variable 'Recurred'
le_recurred = LabelEncoder()
df['Recurred'] = le_recurred.fit_transform(df['Recurred'])
# Save the label encoder for 'Recurred' if you need to transform predictions back
joblib.dump(le_recurred, "label_encoder_recurred.pkl")


# Drop any rows with missing values (if any exist after encoding, though not explicitly mentioned for this dataset)
df = df.dropna()

# Features and target
X = df.drop(columns=['Recurred'])
y = df['Recurred']

# Ensure no NaNs are left
assert X.isnull().sum().sum() == 0, "NaNs found in features after preprocessing"
assert y.isnull().sum() == 0, "NaNs found in target after preprocessing"

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Save the scaler
joblib.dump(scaler, "scaler.pkl")

# =============================
# 1. Logistic Regression
# =============================
print("\n" + "="*30)
print("LOGISTIC REGRESSION")
print("="*30)
# Using 'liblinear' solver for binary classification for simplicity and good performance on small datasets
logreg = LogisticRegression(solver='liblinear', max_iter=1000, random_state=42)
param_grid_lr = {'C': [0.01, 0.1, 1, 10, 100]}
grid_lr = GridSearchCV(logreg, param_grid_lr, cv=5, n_jobs=-1, verbose=1)
grid_lr.fit(X_train, y_train)
y_pred_lr = grid_lr.predict(X_test)

print("Best Params:", grid_lr.best_params_)
acc_lr = accuracy_score(y_test, y_pred_lr)
print("Accuracy:", acc_lr)
print("Classification Report:\n", classification_report(y_test, y_pred_lr))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lr))

# =============================
# 2. Decision Tree
# =============================
print("\n" + "="*30)
print("DECISION TREE")
print("="*30)
dt = DecisionTreeClassifier(random_state=42)
param_grid_dt = {'max_depth': [3, 5, 7, 10], 'min_samples_split': [2, 5, 10]}
grid_dt = GridSearchCV(dt, param_grid_dt, cv=5, n_jobs=-1, verbose=1)
grid_dt.fit(X_train, y_train)
y_pred_dt = grid_dt.predict(X_test)

print("Best Params:", grid_dt.best_params_)
acc_dt = accuracy_score(y_test, y_pred_dt)
print("Accuracy:", acc_dt)
print("Classification Report:\n", classification_report(y_test, y_pred_dt))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_dt))

# =============================
# 3. Random Forest
# =============================
print("\n" + "="*30)
print("RANDOM FOREST")
print("="*30)
rf = RandomForestClassifier(random_state=42)
param_grid_rf = {'n_estimators': [50, 100, 200], 'max_depth': [3, 5, 7, 10]}
grid_rf = GridSearchCV(rf, param_grid_rf, cv=5, n_jobs=-1, verbose=1)
grid_rf.fit(X_train, y_train)
y_pred_rf = grid_rf.predict(X_test)

print("Best Params:", grid_rf.best_params_)
acc_rf = accuracy_score(y_test, y_pred_rf)
print("Accuracy:", acc_rf)
print("Classification Report:\n", classification_report(y_test, y_pred_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))

# =============================
# 📊 Accuracy Comparison
# =============================
print("\n" + "="*30)
print("MODEL ACCURACY COMPARISON")
print("="*30)
print(f"Logistic Regression Accuracy: {acc_lr:.4f}")
print(f"Decision Tree Accuracy     : {acc_dt:.4f}")
print(f"Random Forest Accuracy     : {acc_rf:.4f}")

# =============================
# 💾 Save the best model
# =============================
# Determine the best model based on accuracy
accuracies = {'Logistic Regression': acc_lr, 'Decision Tree': acc_dt, 'Random Forest': acc_rf}
best_model_name = max(accuracies, key=accuracies.get)

if best_model_name == 'Logistic Regression':
    best_model = grid_lr.best_estimator_
elif best_model_name == 'Decision Tree':
    best_model = grid_dt.best_estimator_
else: # Random Forest
    best_model = grid_rf.best_estimator_

joblib.dump(best_model, "best_thyroid_cancer_recurrence_model.pkl")
print(f"\n✅ Best model ({best_model_name}) saved to: best_thyroid_cancer_recurrence_model.pkl")


LOGISTIC REGRESSION
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best Params: {'C': 0.1}
Accuracy: 0.935064935064935
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.98      0.96        58
           1       0.94      0.79      0.86        19

    accuracy                           0.94        77
   macro avg       0.94      0.89      0.91        77
weighted avg       0.94      0.94      0.93        77

Confusion Matrix:
 [[57  1]
 [ 4 15]]

DECISION TREE
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best Params: {'max_depth': 3, 'min_samples_split': 10}
Accuracy: 0.974025974025974
Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.98        58
           1       1.00      0.89      0.94        19

    accuracy                           0.97        77
   macro avg       0.98      0.95      0.96        77
weighted avg       0.97