# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Read the raw dataset from disk.
# (Placeholder path - replace with the actual data loading.)
data = pd.read_csv(filepath_or_buffer="immune_response_data.csv")

# Data Preprocessing
def preprocess_data(df):
    """Clean, encode and standardize a table of features.

    Steps, in order:

    1. Drop every column in which more than 30% of the values are missing.
    2. Fill the remaining gaps: numeric columns with their median, object
       (categorical) columns with their most frequent value.
    3. Replace each object column with integer codes from ``LabelEncoder``.
    4. Standardize every numeric column, including the freshly encoded
       ones, with ``StandardScaler``.

    Step 4 rescales *all* numeric columns. A class-label column passed in
    together with the features therefore comes back as continuous floats;
    remove label columns before calling this function.

    Args:
        df: Input ``pandas.DataFrame``. It is not modified.

    Returns:
        A new ``pandas.DataFrame`` holding the transformed surviving columns.
    """
    # A column survives only if at least 70% of its entries are present.
    # ``thresh`` is documented as an integer count, so round up explicitly.
    min_present = int(np.ceil(0.7 * len(df)))
    # Work on an explicit copy so the assignments below never write into
    # (or warn about writing into) the caller's frame.
    df = df.dropna(thresh=min_present, axis=1).copy()

    # Impute numeric columns only: taking the median of a frame that also
    # holds string columns raises on recent pandas versions.
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) > 0:
        df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

    # Encode categorical features
    for col in df.select_dtypes(include=['object']).columns:
        # Impute before encoding so the encoder sees no missing values.
        most_frequent = df[col].mode()
        if not most_frequent.empty:
            df[col] = df[col].fillna(most_frequent.iloc[0])
        df[col] = LabelEncoder().fit_transform(df[col])

    # Normalize numerical features (recomputed so that the label-encoded
    # columns are included, as in the original pipeline).
    scale_cols = df.select_dtypes(include=[np.number]).columns
    if len(scale_cols) > 0:
        df[scale_cols] = StandardScaler().fit_transform(df[scale_cols])

    return df

# Rows without a label cannot be used for supervised training, and the label
# must not be imputed, so drop them up front.
data = data.dropna(subset=['immune_response'])

# Keep the target out of preprocess_data(): that function standardizes every
# numeric column, which would turn the class labels into continuous floats
# that the classifiers below cannot be fitted on.
raw_labels = data['immune_response']
features = preprocess_data(data.drop(columns=['immune_response']))

# Re-attach the target as integer class codes (0..n_classes-1).
data = features.assign(immune_response=LabelEncoder().fit_transform(raw_labels))

X = data.drop(columns=['immune_response'])  # Features
y = data['immune_response']  # Target

# Split data BEFORE feature selection so the held-out rows never influence
# which features are chosen.
# NOTE(review): the scaling/imputation inside preprocess_data() is still
# computed over all rows; move it into a Pipeline if strict isolation of the
# test set is required.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

# Feature Selection (fitted on the training split only; seeded so the selected
# feature set is the same on every run).
feature_importance_model = RandomForestClassifier(n_estimators=100, random_state=42)
feature_importance_model.fit(X_train, y_train)
feature_importances = pd.Series(feature_importance_model.feature_importances_, index=X_train.columns)
selected_features = feature_importances.nlargest(20).index  # Select top 20 features

# Apply the same column subset everywhere it is used downstream.
X = X[selected_features]
X_train = X_train[selected_features]
X_test = X_test[selected_features]

# Model Training & Evaluation
def train_and_evaluate(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Precision, recall and F1 are computed with scikit-learn's default
    settings, and the AUC uses a single score per sample, so this helper is
    meant for two-class targets.

    Args:
        model: Estimator exposing ``fit`` and ``predict`` plus either
            ``predict_proba`` or ``decision_function``.
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        dict mapping "Accuracy", "Precision", "Recall", "F1 Score" and
        "AUC-ROC" to the corresponding score on the test split.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # AUC needs a continuous score. Prefer the probability of the second
    # class; estimators without predict_proba (for example an SVC built
    # without probability=True) still expose decision_function, whose
    # output roc_auc_score accepts as well.
    if hasattr(model, "predict_proba"):
        y_score = model.predict_proba(X_test)[:, 1]
    else:
        y_score = model.decision_function(X_test)

    return {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1 Score": f1_score(y_test, y_pred),
        "AUC-ROC": roc_auc_score(y_test, y_score)
    }

# Train different models
# Candidate classifiers. The stochastic ones are seeded with the same value as
# the train/test split so the printed comparison is repeatable across runs.
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    # probability=True is required because train_and_evaluate reads class
    # probabilities for the AUC-ROC score.
    "SVM": SVC(probability=True, random_state=42),
    "Logistic Regression": LogisticRegression()
}

# One column per model, one row per metric.
results = {name: train_and_evaluate(model, X_train, X_test, y_train, y_test) for name, model in models.items()}
print(pd.DataFrame(results))

# Neural Network Model
def build_nn(input_dim):
    """Assemble and compile a small feed-forward network with a sigmoid output.

    Layer stack: Dense(64, relu) -> Dropout(0.3) -> Dense(32, relu)
    -> Dropout(0.2) -> Dense(1, sigmoid).

    Args:
        input_dim: Number of input features per sample.

    Returns:
        The compiled Keras ``Sequential`` model (Adam optimizer, binary
        cross-entropy loss, accuracy metric).
    """
    network = Sequential()

    # First hidden block; it also fixes the expected input width.
    network.add(Dense(64, activation='relu', input_shape=(input_dim,)))
    network.add(Dropout(0.3))

    # Second hidden block.
    network.add(Dense(32, activation='relu'))
    network.add(Dropout(0.2))

    # Single sigmoid unit as the output layer.
    network.add(Dense(1, activation='sigmoid'))

    network.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return network

# Seed TensorFlow's global RNG before the model is built, in line with the
# fixed random_state used for the train/test split.
# NOTE(review): this may not make GPU training fully deterministic - confirm
# if bit-exact repeatability is required.
tf.random.set_seed(42)

nn_model = build_nn(X_train.shape[1])
nn_model.fit(X_train, y_train, epochs=50, batch_size=16, validation_split=0.15, verbose=1)

# Run inference once and reuse the result: the flattened 1-D probabilities
# feed the AUC directly, and thresholding them at 0.5 gives the hard labels.
y_prob_nn = nn_model.predict(X_test).flatten()
y_pred_nn = (y_prob_nn > 0.5).astype(int)

nn_results = {
    "Accuracy": accuracy_score(y_test, y_pred_nn),
    "Precision": precision_score(y_test, y_pred_nn),
    "Recall": recall_score(y_test, y_pred_nn),
    "F1 Score": f1_score(y_test, y_pred_nn),
    "AUC-ROC": roc_auc_score(y_test, y_prob_nn)
}
print("Neural Network Results:", nn_results)
