In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
# Load the dataset
file_path = 'path_merge.csv'  # Replace with your actual file path
data = pd.read_csv(file_path)

# Drop the 'Unnamed: 0' and 'patient' columns so they are never used as features
data = data.drop(columns=['Unnamed: 0', 'patient'])

# Encode the categorical label columns as integer codes. The fitted encoders
# are kept per column so codes can be mapped back with `inverse_transform`.
# NOTE(review): this runs before imputation, so a missing label value is
# handled by LabelEncoder rather than by the imputer -- confirm that the label
# columns contain no missing values.
label_columns = ['lesion_type', 'subtype', 'er', 'pr', 'her2']
label_encoders = {col: LabelEncoder() for col in label_columns}
for col in label_columns:
    data[col] = label_encoders[col].fit_transform(data[col])

# Handle missing values by imputing with the column mean
imputer = SimpleImputer(strategy='mean')
data_imputed = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)

# Select the features by name: everything that is not a label column.
# A positional slice such as `iloc[:, 2:]` can only skip two columns, so at
# least three of the five encoded label columns (possibly the target itself)
# would end up inside the PCA input.
# NOTE(review): this assumes every non-label column is meant to be a feature;
# verify against the CSV layout.
feature_columns = [col for col in data_imputed.columns if col not in label_columns]

# Standardize the features before applying PCA
# NOTE(review): the imputer, scaler and PCA are fit on all rows before
# cross-validation, so the validation folds influence the preprocessing and
# the scores below may be optimistic. Wrapping these steps and the estimator
# in a sklearn Pipeline would remove that leakage.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(data_imputed[feature_columns])

# Apply PCA to reduce dimensionality. The component count is capped because
# PCA cannot extract more components than there are samples or features.
n_components = min(20, *X_scaled.shape)
pca = PCA(n_components=n_components)
X_pca = pca.fit_transform(X_scaled)

# Create a new dataframe with the PCA components and the label columns
pca_columns = [f'PC{i+1}' for i in range(X_pca.shape[1])]
data_pca = pd.DataFrame(X_pca, columns=pca_columns)
data_pca = pd.concat([data_pca, data_imputed[label_columns].reset_index(drop=True)], axis=1)

# Features are the PCA components (selected by name, not by counting label
# columns from the end); the target is the encoded lesion type. The imputer
# returns floats, so cast the class codes back to integers (lossless, since
# LabelEncoder produces integer codes).
X = data_pca[pca_columns]
y = data_pca['lesion_type'].astype(int)

# Hold-out split. It is not used by the learning curves in this cell, which
# cross-validate on the full X / y.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize the models
models = {
    "Random Forest": RandomForestClassifier(
        n_estimators=200,
        max_depth=10,
        min_samples_split=5,
        min_samples_leaf=2,
        bootstrap=True,
        random_state=42
    ),
    "Gradient Boosting": GradientBoostingClassifier(
        n_estimators=200,
        learning_rate=0.1,
        max_depth=3,
        min_samples_split=5,
        min_samples_leaf=2,
        random_state=42
    ),
    "Logistic Regression": LogisticRegression(random_state=42, max_iter=1000)
}

# Plot learning curves (one figure per model)
train_fractions = np.linspace(0.1, 1.0, 5)
for name, model in models.items():
    # shuffle=True: the smaller training subsets are prefixes of each training
    # fold; without shuffling, a file ordered by label could yield subsets
    # that contain a single class. random_state keeps the curves reproducible.
    train_sizes, train_scores, test_scores = learning_curve(
        model, X, y, cv=5, n_jobs=-1, train_sizes=train_fractions,
        shuffle=True, random_state=42
    )

    # Aggregate over the CV folds (axis 1): mean curve plus spread
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)

    plt.figure(figsize=(8, 6))
    # Shaded bands show +/- one standard deviation across folds
    plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std,
                     color='blue', alpha=0.15)
    plt.fill_between(train_sizes, test_mean - test_std, test_mean + test_std,
                     color='green', alpha=0.15)
    plt.plot(train_sizes, train_mean, label='Training Score', color='blue')
    plt.plot(train_sizes, test_mean, label='Cross-Validation Score', color='green')
    plt.title(f'{name} Learning Curve')
    plt.xlabel('Training Size')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.grid(True)
    plt.show()













ModuleNotFoundError: No module named 'sklearn'