In [1]:
#Essential Libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

# Load the dataset

dataset = pd.read_csv('Thyroid_Diff.csv')
df = dataset

# Data Preprocessing
    # finding Correlation
le = LabelEncoder()
df_encoded = df.apply(lambda col: le.fit_transform(col) if col.dtypes == 'object' else col)
correlations = df_encoded.corr()['Recurred'].drop('Recurred').sort_values(ascending=False)

    # Selecting columns with low correlation
columns_to_drop = [
    'Hx Radiothreapy',
    'Hx Smoking',
    'Thyroid Function',
    'Pathology',
    'Physical Examination',
    'Adenopathy'
]
    # Dropping columns with low correlation
df_cleaned = df.drop(columns=columns_to_drop)
X= df_cleaned.iloc[:, :-1]
y= df_cleaned.iloc[:, -1].values

    # Encoding categorical variables
y = le.fit_transform(y)

columns = X.select_dtypes(include=['object']).columns.tolist()

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
ct = ColumnTransformer([('encoder', OneHotEncoder(drop='first'), columns)], remainder='passthrough')
X_encoded = ct.fit_transform(X)

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)


from sklearn.preprocessing import StandardScaler
scaler = StandardScaler(with_mean=False)
X_train[:,-1] = scaler.fit_transform(X_train[:,-1].reshape(-1, 1))
X_test[:,-1] = scaler.transform(X_test[:,-1].reshape(-1, 1))


from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
param_grid = {'n_estimators': list(range(100, 500, 100))}
grid_search = GridSearchCV(RandomForestClassifier(random_state=0), param_grid, cv=5)
grid_search.fit(X_train, y_train)

# print("Best n_estimators:", grid_search.best_params_['n_estimators'])
k= grid_search.best_params_['n_estimators']

# y_pred = classifier.predict(X_test)
# from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# cm = confusion_matrix(y_test, y_pred)
# print("Confusion Matrix:\n", cm)

# print('accuracy_score:', accuracy_score(y_test, y_pred))
classifier = RandomForestClassifier(n_estimators = k, criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)


import joblib

# Save your trained model
joblib.dump(classifier, "classifier.pkl")

# Save your ColumnTransformer
joblib.dump(ct, "column_transformer.pkl")

# Save your scaler
joblib.dump(scaler, "scaler.pkl")

# Save your LabelEncoder
joblib.dump(le, "label_encoder.pkl")


['label_encoder.pkl']

In [5]:
def predict_thyroid_recurrence(user_input_dict):
    # Convert to DataFrame
    user_df = pd.DataFrame([user_input_dict])

    # Drop unused columns (just like training)
    columns_to_drop = [
        'Hx Radiothreapy',
        'Hx Smoking',
        'Thyroid Function',
        'Pathology',
        'Physical Examination',
        'Adenopathy'
    ]
    user_df = user_df.drop(columns=columns_to_drop)

    # Encode categorical features using same transformer
    user_encoded = ct.transform(user_df)

    # Scale last column like in training
    user_encoded[:, -1] = scaler.transform(user_encoded[:, -1].reshape(-1, 1))

    # Predict
    prediction = classifier.predict(user_encoded)
    prediction_label = le.inverse_transform(prediction)

    return prediction_label[0]

def select_option(prompt, options):
    while True:
        print(f"\n{prompt}")
        for i, option in enumerate(options, 1):
            print(f"{i}. {option}")
        try:
            choice = int(input("Select option number: "))
            if 1 <= choice <= len(options):
                return options[choice - 1]
            else:
                print(f"⚠️ Please enter a number between 1 and {len(options)}.")
        except ValueError:
            print("⚠️ Invalid input. Please enter a number.")

def get_user_input_menu():
    user_input = {
        'Age': int(input("Enter Age: ")),
        'Gender': select_option("Gender", ['F', 'M']),
        'Smoking': select_option("Smoking", ['Yes', 'No']),
        'Hx Smoking': select_option("Hx Smoking", ['Yes', 'No']),
        'Hx Radiothreapy': select_option("Hx Radiothreapy", ['Yes', 'No']),
        'Thyroid Function': select_option("Thyroid Function", ['Euthyroid', 'Hypothyroid', 'Hyperthyroid']),
        'Physical Examination': select_option("Physical Examination", [
            'Single nodular goiter-left',
            'Single nodular goiter-right',
            'Multinodular goiter'
        ]),
        'Adenopathy': select_option("Adenopathy", [
            'No', 'Right', 'Left', 'Bilateral', 'Posterior', 'Extensive'
        ]),
        'Pathology': select_option("Pathology", [
            'Micropapillary', 'Hurthel cell', 'Papillary'
        ]),
        'Focality': select_option("Focality", ['Uni-Focal', 'Multi-Focal']),
        'Risk': select_option("Risk", ['Low', 'Intermediate', 'High']),
        'T': select_option("T Stage", ['T1a','T1b' ,'T2', 'T3a', 'T3b', 'T4a', 'T4b']),
        'N': select_option("N Stage", ['N0', 'N1a', 'N1b']),
        'M': select_option("M Stage", ['M0', 'M1']),
        'Stage': select_option("Stage", ['I', 'II', 'III', 'IVA', 'IVB']),
        'Response': select_option("Response", ['Excellent', 'Indeterminate', 'Structural Incomplete', 'Biochemical Incomplete']),
    }
    return user_input
# Get input from user via dropdown-style CLI
sample_input = get_user_input_menu()

# Predict recurrence
result = predict_thyroid_recurrence(sample_input)
print("\n🔮 Prediction (Recurred):", result)


ValueError: invalid literal for int() with base 10: ''


Gender
1. M
2. F

Smoking
1. Yes
2. No

Hx Smoking
1. Yes
2. No

Hx Radiothreapy
1. Yes
2. No

Thyroid Function
1. Euthyroid
2. Hypothyroid
3. Hyperthyroid

Physical Examination
1. Single nodular goiter-left
2. Single nodular goiter-right
3. Multi nodular goiter
4. Diffuse goiter

Adenopathy
1. Yes
2. No

Pathology
1. Micropapillary
2. Classic Papillary
3. Follicular Variant
4. Tall Cell

Focality
1. Uni-Focal
2. Multi-Focal

Risk
1. Low
2. Intermediate
3. High

T Stage
1. T1a
2. T1b
3. T2
4. T3
5. T4

N Stage
1. N0
2. N1a
3. N1b

M Stage
1. M0
2. M1

Stage
1. I
2. II
3. III
4. IV

Response
1. Excellent
2. Indeterminate
3. Incomplete


ValueError: Found unknown categories ['T4'] in column 4 during transform