In [1]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
# SMOTE is no longer needed: class imbalance is handled with class_weight='balanced' instead

# --- 1. Load Data ---
# Read the pre-processed dataset into `df`. If the CSV is missing, report it
# and stop with a non-zero exit status so shells/pipelines can see the failure.
print("Loading the processed data...")
try:
    # Only the read itself is guarded; nothing else here can raise
    # FileNotFoundError.
    df = pd.read_csv('thyroid_final_cleaned.csv')
except FileNotFoundError:
    print("❌ Error: 'thyroid_final_cleaned.csv' not found. Make sure it's in the same directory.")
    # `exit()` is an interactive-shell helper injected by the `site` module:
    # it is not guaranteed to exist and, called bare, exits with status 0.
    # Raising SystemExit directly always works and signals the error.
    raise SystemExit(1) from None
else:
    print("✅ Data loaded successfully.")

# --- 2. Separate Features (X) and Target (y) ---
# The 'target' column is the label; every other column is kept as a feature.
y = df['target']
X = df.drop(columns=['target'])

# --- 3. Split Data into Training and Testing Sets ---
print("\nSplitting data into training and testing sets...")
# Hold out 20% of the rows for testing. The split is stratified on y and uses
# a fixed seed so that it is reproducible from run to run.
split_options = {'test_size': 0.2, 'random_state': 42, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
print("✅ Data split complete.")

# --- 4. Train the Random Forest Model with Class Weighting ---
print("\nTraining the Random Forest model with class_weight='balanced'...")
# No resampling is applied: the forest is fitted directly on the original,
# imbalanced training split, and class_weight='balanced' is what handles the
# imbalance. fit() returns the estimator itself, so construction and training
# can be chained.
model = RandomForestClassifier(
    class_weight='balanced',
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)
print("✅ Model training complete.")

# --- 5. Save the Trained Model ---
# Pickle the fitted estimator to disk (binary mode) for later reuse.
model_filename = 'thyroid_model_weighted.pkl'
with open(model_filename, mode='wb') as model_file:
    pickle.dump(model, model_file)
print(f"✅ Trained model saved to '{model_filename}'")

Loading the processed data...
✅ Data loaded successfully.

Splitting data into training and testing sets...
✅ Data split complete.

Training the Random Forest model with class_weight='balanced'...
✅ Model training complete.
✅ Trained model saved to 'thyroid_model_weighted.pkl'
