In [1]:
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
import itertools

In [2]:
# Load the breast-cancer dataset bundled with scikit-learn and wrap it in
# pandas objects: X is the feature table (columns labelled with the dataset's
# feature names) and y is the series of target labels.
data = load_breast_cancer()
X = pd.DataFrame(data=data["data"], columns=data["feature_names"])
y = pd.Series(data=data["target"])


In [3]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [4]:
# Baseline model: a decision tree with no max_depth passed (only the seed is
# set), fitted on the training split.
dt_full = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predict on both splits so train and test accuracy can be compared.
y_train_pred = dt_full.predict(X_train)
y_test_pred = dt_full.predict(X_test)

print("Full Decision Tree:")
print("Train Accuracy:", accuracy_score(y_true=y_train, y_pred=y_train_pred))
print("Test Accuracy:", accuracy_score(y_true=y_test, y_pred=y_test_pred))

Full Decision Tree:
Train Accuracy: 1.0
Test Accuracy: 0.9473684210526315


In [5]:
# Same tree model, but with its depth capped at 3 levels, fitted on the same
# training split and seeded identically.
dt_pruned = DecisionTreeClassifier(max_depth=3, random_state=42).fit(X_train, y_train)

# Predict on both splits so train and test accuracy can be compared.
y_train_pred = dt_pruned.predict(X_train)
y_test_pred = dt_pruned.predict(X_test)

print("\nPruned Decision Tree (max_depth=3):")
print("Train Accuracy:", accuracy_score(y_true=y_train, y_pred=y_train_pred))
print("Test Accuracy:", accuracy_score(y_true=y_test, y_pred=y_test_pred))


Pruned Decision Tree (max_depth=3):
Train Accuracy: 0.978021978021978
Test Accuracy: 0.9473684210526315


In [7]:
# Ensemble model: a random forest of 100 trees, seeded for repeatability and
# fitted on the training split.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict on both splits so train and test accuracy can be compared.
y_train_pred = rf.predict(X_train)
y_test_pred = rf.predict(X_test)

print("\nRandom Forest (100 trees):")
print("Train Accuracy:", accuracy_score(y_true=y_train, y_pred=y_train_pred))
print("Test Accuracy:", accuracy_score(y_true=y_test, y_pred=y_test_pred))


Random Forest (100 trees):
Train Accuracy: 1.0
Test Accuracy: 0.9649122807017544


In [8]:
# Label the fitted forest's importance scores with the feature names, then
# order them from most to least important.
rf_importances = pd.Series(data=rf.feature_importances_, index=X.columns)
rf_importances = rf_importances.sort_values(ascending=False)

print("\nTop 5 Feature Importances (Random Forest):")
print(rf_importances.head(5))



Top 5 Feature Importances (Random Forest):
worst area              0.153892
worst concave points    0.144663
mean concave points     0.106210
worst radius            0.077987
mean concavity          0.068001
dtype: float64


In [9]:
# NOTE(review): this cell repeats the preceding Random Forest
# feature-importance cell and prints the same table; consider deleting one.
rf_importances = (
    pd.Series(rf.feature_importances_, index=X.columns)
    .sort_values(ascending=False)
)
print("\nTop 5 Feature Importances (Random Forest):")
print(rf_importances.head(5))



Top 5 Feature Importances (Random Forest):
worst area              0.153892
worst concave points    0.144663
mean concave points     0.106210
worst radius            0.077987
mean concavity          0.068001
dtype: float64


In [10]:
# Small hyperparameter grid for gradient boosting: every learning rate is
# paired with every ensemble size (2 x 3 = 6 combinations).
learning_rates = [0.01, 0.1]
n_estimators_list = [50, 100, 200]

print("\nGradient Boosting results:")
# Learning rate varies in the outer loop and the number of estimators in the
# inner loop, so results are reported grouped by learning rate.
for lr in learning_rates:
    for n_est in n_estimators_list:
        # Fit a fresh, identically seeded model for this combination.
        gb = GradientBoostingClassifier(
            learning_rate=lr,
            n_estimators=n_est,
            random_state=42,
        )
        gb.fit(X_train, y_train)

        # These prediction variables are overwritten on every iteration.
        y_train_pred = gb.predict(X_train)
        y_test_pred = gb.predict(X_test)

        print(
            f"learning_rate={lr}, n_estimators={n_est} --> "
            f"Train Acc: {accuracy_score(y_train, y_train_pred):.4f}, "
            f"Test Acc: {accuracy_score(y_test, y_test_pred):.4f}"
        )



Gradient Boosting results:
learning_rate=0.01, n_estimators=50 --> Train Acc: 0.9780, Test Acc: 0.9561
learning_rate=0.01, n_estimators=100 --> Train Acc: 0.9868, Test Acc: 0.9561
learning_rate=0.01, n_estimators=200 --> Train Acc: 0.9934, Test Acc: 0.9561
learning_rate=0.1, n_estimators=50 --> Train Acc: 1.0000, Test Acc: 0.9561
learning_rate=0.1, n_estimators=100 --> Train Acc: 1.0000, Test Acc: 0.9561
learning_rate=0.1, n_estimators=200 --> Train Acc: 1.0000, Test Acc: 0.9561


In [11]:
# Gradient boosting with default hyperparameters (only the seed is set),
# fitted on the training split.
gb_default = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)

# Label the fitted model's importance scores with the feature names, then
# order them from most to least important.
gb_importances = pd.Series(data=gb_default.feature_importances_, index=X.columns)
gb_importances = gb_importances.sort_values(ascending=False)

print("\nTop 5 Feature Importances (Gradient Boosting):")
print(gb_importances.head(5))


Top 5 Feature Importances (Gradient Boosting):
mean concave points     0.450528
worst concave points    0.240103
worst radius            0.075589
worst perimeter         0.051408
worst texture           0.039886
dtype: float64
