# In [None]:
import pandas as pd
# Load the raw cancer CSV into a module-level DataFrame for a quick look.
data = pd.read_csv('/content/Cancer data - egc (1).csv')
# Preview the first rows (pandas' default for head() is five); as the last
# expression of a notebook cell the result is displayed.
data.head()


# In [None]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score, confusion_matrix, roc_curve, auc
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from imblearn.over_sampling import SMOTE
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

def preprocess_data(file_path):
    """Load the cancer CSV and derive encoded features, scores and targets.

    The function reads ``file_path`` with pandas, checks that the columns it
    relies on exist, encodes the YES/NO and POS/NEG categorical columns as
    1/0, builds two heuristic scores and finally derives three binary target
    columns from those scores, survival time and vital status.

    Args:
        file_path: Path (or anything ``pd.read_csv`` accepts) of the CSV file.

    Returns:
        The loaded DataFrame with these columns added or overwritten:
        the three metastasis columns (now 1/0/NaN), ``HER2_status``,
        ``EBV_status``, ``risk_score``, ``risk_score_treatment``,
        ``survival_months``, ``is_deceased``, ``disease_progression``,
        ``treatment_response`` and ``immunotherapy_effectiveness``.

    Raises:
        KeyError: If any required column is absent from the file.

    Side effects:
        Calls ``plot_correlation_map(df)`` before returning, which shows a
        figure.
    """
    df = pd.read_csv(file_path)

    required_columns = [
        'Age at Diagnosis', 'ECOG At Stage IV Diagnosis', "Patient's Vital Status",
        'Overall Survival (Months)', 'HER2 IHC or FISH', 'EBV Tested',
        'Liver Metastasis', 'Lung Metastasis', 'Peritoneum Metastasis',
        'Fraction Genome Altered', 'Mutation Count'
    ]
    for col in required_columns:
        if col not in df.columns:
            raise KeyError(f"Missing required column: {col}")

    # Series.map leaves any value other than the listed keys as NaN.
    metastasis_features = ['Liver Metastasis', 'Lung Metastasis', 'Peritoneum Metastasis']
    for col in metastasis_features:
        df[col] = df[col].map({'YES': 1, 'NO': 0})

    df['HER2_status'] = df['HER2 IHC or FISH'].map({'POS': 1, 'NEG': 0})
    df['EBV_status'] = df['EBV Tested'].map({'YES': 1, 'NO': 0})

    # Unknown HER2 contributes nothing to either score. Both scores must use
    # the same filled series: a NaN here would make the whole score NaN, and
    # NaN compares False against the median, silently forcing the derived
    # label to 0 for every patient whose HER2 value was not POS/NEG.
    her2_filled = df['HER2_status'].fillna(0)

    # DataFrame.sum(axis=1) skips NaN, so unmapped metastasis values count as 0.
    df['risk_score'] = (
        (df['Age at Diagnosis'] > 70).astype(int) * 2 +
        df[metastasis_features].sum(axis=1) * 3 +
        df['ECOG At Stage IV Diagnosis'].fillna(0) * 2 +
        her2_filled * 1.5
    )
    # NOTE(review): a missing 'Age at Diagnosis' still propagates NaN into
    # this score (there is no obvious neutral fill value for age).
    df['risk_score_treatment'] = (
        her2_filled * 1.5 +
        df['Fraction Genome Altered'].fillna(0) * 1.5 +
        df['Mutation Count'].fillna(0) * 1.2 +
        df['Age at Diagnosis'] * 1.2
    )

    # Non-numeric survival entries become NaN instead of raising.
    df['survival_months'] = pd.to_numeric(df['Overall Survival (Months)'], errors='coerce')
    df['is_deceased'] = (df["Patient's Vital Status"] == 'DOD').astype(int)

    # NOTE(review): 'risk_score' is also fed to the models as a feature in
    # prepare_features, so a target derived from it leaks into the inputs.
    df['disease_progression'] = ((df['risk_score'] > df['risk_score'].median()) & (df['is_deceased'] == 1)).astype(int)
    df['treatment_response'] = (
        (df['risk_score_treatment'] < df['risk_score_treatment'].median()) &
        (df['survival_months'] > df['survival_months'].median()) &
        (df['is_deceased'] == 0)
    ).astype(int)
    df['immunotherapy_effectiveness'] = (
        (df['EBV_status'] == 1) & (df['survival_months'] > df['survival_months'].median())
    ).astype(int)

    plot_correlation_map(df)

    return df

def prepare_features(df):
    """Select the model input columns and impute their missing values.

    Args:
        df: DataFrame containing the feature columns listed below (as
            produced by ``preprocess_data``).

    Returns:
        A new DataFrame with the same columns and the same row index as
        ``df``, in which missing values have been filled by
        ``IterativeImputer(max_iter=10, random_state=42)``.

    Raises:
        KeyError: From the column selection if a feature column is absent;
            the printed warning only announces the problem beforehand.
    """
    features = [
        'Age at Diagnosis', 'ECOG At Stage IV Diagnosis', 'risk_score',
        'HER2_status', 'EBV_status',
        'Fraction Genome Altered', 'Mutation Count',
        'Liver Metastasis', 'Lung Metastasis', 'Peritoneum Metastasis'
    ]
    for col in features:
        if col not in df.columns:
            print(f"Warning: Feature {col} is missing. Imputation may fail.")

    X = df[features].copy()

    imp = IterativeImputer(max_iter=10, random_state=42)
    # fit_transform returns a bare ndarray; re-attach the original index so
    # the features stay row-aligned with target Series taken from ``df``
    # (pd.concat and other pandas operations align on index labels).
    X = pd.DataFrame(imp.fit_transform(X), columns=X.columns, index=X.index)

    return X

def plot_correlation_map(df):
    """Show an annotated heatmap of correlations between numeric columns.

    Only columns of dtype ``float64`` or ``int64`` are considered. The
    figure is displayed immediately and nothing is returned.
    """
    numeric_columns = df.select_dtypes(include=['float64', 'int64'])
    plt.figure(figsize=(12, 10))

    heatmap_options = dict(
        annot=True,
        fmt=".2f",
        cmap="coolwarm",
        linewidths=0.5,
    )
    sns.heatmap(numeric_columns.corr(), **heatmap_options)

    plt.title("Feature Correlation Map")
    plt.show()


def plot_confusion_matrix(y_true, y_pred, dataset_type, model_name, task_name):
    """Show a labelled 2x2 confusion-matrix heatmap for a binary task.

    Args:
        y_true: True labels, encoded as 0 (negative) / 1 (positive).
        y_pred: Predicted labels with the same encoding.
        dataset_type: Text shown in the title to name the split
            (e.g. "Train" or "Test").
        model_name: Model name shown in the title.
        task_name: Task name shown in the title.
    """
    class_names = ["Negative", "Positive"]
    # Pin both classes so the matrix is always 2x2. Without ``labels`` a
    # split that contains a single class yields a 1x1 matrix that no longer
    # matches the two tick labels below.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_names, yticklabels=class_names)
    plt.title(f"{model_name} - {task_name} ({dataset_type}) Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.show()

def plot_roc_curve(y_true, y_scores, model_name, task_name):
    """Plot the ROC curve for one model and show it.

    The curve is labelled with its area under the curve, and a dashed
    diagonal is drawn as the chance-level reference.

    Args:
        y_true: True binary labels.
        y_scores: Scores or probabilities for the positive class.
        model_name: Model name used in the legend and title.
        task_name: Task name used in the title.
    """
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_true, y_scores)
    area = auc(false_pos_rate, true_pos_rate)
    curve_label = f"{model_name} (AUC = {area:.2f})"

    plt.plot(false_pos_rate, true_pos_rate, label=curve_label)
    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], 'k--', lw=2)

    plt.title(f"{model_name} - {task_name} ROC Curve")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.legend(loc="lower right")
    plt.grid()
    plt.show()

def plot_feature_importance(importances, feature_names, model_name, task_name):
    """Show a horizontal bar chart of feature importances.

    Bars are ordered by ascending importance value.

    Args:
        importances: Importance value for each feature.
        feature_names: Feature names, in the same order as ``importances``.
        model_name: Model name used in the title.
        task_name: Task name used in the title.
    """
    ranked = pd.DataFrame(
        {'Feature': feature_names, 'Importance': importances}
    ).sort_values(by='Importance', ascending=True)

    ranked.plot.barh(x='Feature', y='Importance', legend=False)
    plt.title(f"{model_name} - {task_name} Feature Importance")
    plt.xlabel("Importance")
    plt.ylabel("Feature")
    plt.tight_layout()
    plt.show()

def scatter_features(X, features, target):
    """Show a corner pair plot of selected features coloured by the target.

    Args:
        X: Feature DataFrame.
        features: Column labels of ``X`` to include in the plot.
        target: Named Series used for the hue; it is joined to the features
            column-wise, so rows are matched by index label.
    """
    plot_frame = pd.concat([X[features], target], axis=1)
    sns.pairplot(
        plot_frame,
        hue=target.name,
        diag_kind="kde",
        corner=True,
    )
    plt.suptitle("Feature Scatter Plot")
    plt.show()

def train_and_evaluate_models(X, y, task_name):
    """Train three classifiers on one binary task and report their metrics.

    The data is split 80/20, standardised with statistics fitted on the
    training part only, and the training part is oversampled with SMOTE.
    LightGBM, a random forest and an MLP are then fitted on the balanced
    data. For each model the function prints accuracies and a classification
    report, and shows confusion matrices, a ROC curve and (when the model
    exposes ``feature_importances_``) a feature-importance chart.

    Args:
        X: Feature DataFrame; its column labels name the importance bars.
        y: Binary target aligned with ``X``.
        task_name: Task name used in printed output and plot titles.

    Returns:
        Dict mapping each model name to a dict with the keys
        ``'train_accuracy'``, ``'test_accuracy'`` and ``'roc_auc'``.
    """
    # Stratify so both classes keep their proportions in train and test.
    # With imbalanced targets an unstratified split can leave the test fold
    # with a single class, which makes ROC-AUC undefined. Stratification
    # itself needs at least two samples in every class, so fall back to the
    # plain split otherwise.
    _, class_counts = np.unique(y, return_counts=True)
    can_stratify = len(class_counts) > 1 and class_counts.min() >= 2
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42,
        stratify=y if can_stratify else None,
    )

    # Fit the scaler on the training part only to avoid test-set leakage.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Oversample the minority class in the training data only.
    smote = SMOTE(random_state=42)
    X_train_balanced, y_train_balanced = smote.fit_resample(X_train_scaled, y_train)

    models = {
        'LightGBM': LGBMClassifier(learning_rate=0.01, n_estimators=200, max_depth=4, num_leaves=32, random_state=42),
        'Random Forest': RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42),
        'Deep Neural Network': MLPClassifier(hidden_layer_sizes=(64, 32), activation='relu',
                                             solver='adam', max_iter=200, random_state=42)
    }

    results = {}

    for name, model in models.items():

        model.fit(X_train_balanced, y_train_balanced)

        # Training metrics are measured on the original (not oversampled)
        # training rows.
        y_train_pred = model.predict(X_train_scaled)
        y_test_pred = model.predict(X_test_scaled)
        y_test_scores = model.predict_proba(X_test_scaled)[:, 1] if hasattr(model, 'predict_proba') else model.decision_function(X_test_scaled)

        # Compute each accuracy once and reuse it for printing and results.
        train_accuracy = accuracy_score(y_train, y_train_pred)
        test_accuracy = accuracy_score(y_test, y_test_pred)

        print(f"\n{name} - {task_name} Results:")
        print(f"Train Accuracy: {train_accuracy:.4f}")
        print(f"Test Accuracy: {test_accuracy:.4f}")
        print(classification_report(y_test, y_test_pred))

        plot_confusion_matrix(y_train, y_train_pred, "Train", name, task_name)
        plot_confusion_matrix(y_test, y_test_pred, "Test", name, task_name)

        plot_roc_curve(y_test, y_test_scores, name, task_name)

        if hasattr(model, 'feature_importances_'):
            plot_feature_importance(model.feature_importances_, X.columns, name, task_name)

        results[name] = {
            'train_accuracy': train_accuracy,
            'test_accuracy': test_accuracy,
            'roc_auc': roc_auc_score(y_test, y_test_scores)
        }

    return results

def main():
    """Run the full pipeline: preprocess, build features, evaluate each task.

    For every target column the function shows a pair plot of the first four
    features, trains and evaluates the models, and prints a per-model summary
    of train accuracy, test accuracy and ROC-AUC.
    """
    df = preprocess_data('/content/Cancer data - egc (1).csv')
    X = prepare_features(df)

    # Display name of each task -> target column created by preprocessing.
    target_columns = {
        'Disease Progression': 'disease_progression',
        'Treatment Response': 'treatment_response',
        'Immunotherapy Effectiveness': 'immunotherapy_effectiveness',
    }

    for task_name, column in target_columns.items():
        y = df[column]
        print(f"\n--- {task_name} Analysis ---")
        scatter_features(X, X.columns[:4], y)
        results = train_and_evaluate_models(X, y, task_name)

        print("\nSummary Results:")
        for model, metrics in results.items():
            train_acc = metrics['train_accuracy']
            test_acc = metrics['test_accuracy']
            roc_auc = metrics['roc_auc']
            print(f"{model}:")
            print(f"  Train Accuracy: {train_acc:.4f}")
            print(f"  Test Accuracy: {test_acc:.4f}")
            print(f"  ROC-AUC: {roc_auc:.4f}")

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
