In [None]:
import os
import pandas as pd
import numpy as np
import seaborn as sns                       #visualisation
import matplotlib.pyplot as plt             #visualisation
%matplotlib inline
sns.set(color_codes=True)

In [None]:
# Mount Google Drive into the Colab runtime so the dataset folder below is readable.
from google.colab import drive
drive.mount('/content/drive')
# TODO: change this to yours ...
# DATA_DIR is read by the later cells to build the paths of train.csv, valid.csv,
# test.csv and the EHR pickle, so this cell has to run before them.
DATA_DIR = "/content/drive/MyDrive/untitled folder"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import pointbiserialr, chi2_contingency
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import StandardScaler
import warnings

# Silence FutureWarning messages. NOTE: the filter is category-wide, so it hides
# every FutureWarning raised in this kernel, not only one specific pandas message.
warnings.simplefilter(action='ignore', category=FutureWarning)

# --- Configuration ---
# REVIEW POINT: DATA_DIR is not assigned in this cell; it must already be defined
# (the Drive-mount cell sets it) and point at the folder holding the data files.
# Example for Kaggle: DATA_DIR = "/kaggle/input/your-dataset-folder-name"

# REVIEW POINT: Choose Aggregation Method ('last', 'mean', 'max')
# Selects how each admission's feature matrix is collapsed into a single vector.
AGGREGATION_METHOD = 'mean'

# REVIEW POINT: Define Filtering Thresholds
# Adjust these values to experiment with initial feature removal.
# The filtering step removes a feature only when ALL applicable criteria below
# hold at the same time (the conditions are combined with `and`).
INITIAL_FILTER_THRESHOLDS = {
    'zero_ratio': { # Remove if ZERO RATIO is ABOVE this
        'icd': 0.95,
        'med': 0.95,
        'lab': 0.95,
        'demo': 0.90
    },
    'std_dev': 0.001, # Remove if STD DEV is BELOW this (only for non-ICD)
    'correlation': {
        # For ICD features the compared value is the chi-square statistic,
        # for all other features it is the absolute point-biserial correlation.
        'min_abs_corr': 0.001, # Remove if ABS CORRELATION/CHI2 is BELOW this
        'max_pvalue': 0.05 # Remove if P-VALUE is ABOVE this
    }
}

# REVIEW POINT: Define Collinearity Threshold
# Adjust this value to experiment with removing correlated features.
# Feature pairs whose absolute correlation exceeds it are treated as collinear.
COLLINEARITY_THRESHOLD = 0.9

# REVIEW POINT: Define Clinically Relevant Features
# These features will NOT be removed by the initial filtering step, and the
# collinearity step keeps them in preference to a non-listed partner.
# Review this list for appropriateness to your specific problem/data
CLINICALLY_RELEVANT = [
    'Creatinine Blood', 'Hemoglobin Blood', 'Hematocrit Blood',
    'Potassium Blood', 'Sodium Blood', 'Glucose Blood',
    'Troponin T Blood', 'Platelet Count Blood', 'Eosinophils Blood',
    'pH Urine', 'pO2 Blood', 'pCO2 Blood', 'Anion Gap Blood',
    'I10-I16', 'N17-N19', 'J09-J18', 'E70-E88', 'I30-I52',
    'J40-J47', 'B20-B20',
    'ANTICOAGULANTS', 'ANTIBIOTICS', 'IMMUNOSUPPRESSANTS',
    'ANTIINFLAM.TUMOR NECROSIS FACTOR INHIBITING AGENTS'
]
# --- End Configuration ---


# === 1. Data Loading ===
print("--- 1. Data Loading ---")

# Every input file lives directly under DATA_DIR.
train_csv_file, val_csv_file, test_csv_file, ehr_pkl_file = (
    os.path.join(DATA_DIR, file_name)
    for file_name in (
        "train.csv",
        "valid.csv",
        "test.csv",
        "ehr_preprocessed_seq_by_day_cat_embedding.pkl",
    )
)

# Split tables: a missing file is reported with a hint and then re-raised.
try:
    train_df = pd.read_csv(train_csv_file)
    val_df = pd.read_csv(val_csv_file)
    test_df = pd.read_csv(test_csv_file)
except FileNotFoundError as csv_error:
    print(f"Error loading CSV files: {csv_error}")
    print(f"Please ensure DATA_DIR '{DATA_DIR}' is correct and files exist.")
    raise
else:
    print("CSV data loaded successfully.")

# Keys the rest of the notebook expects to find in the pickled EHR bundle.
required_keys = ['feat_dict', 'feature_cols', 'cat_idxs', 'cat_dims', 'demo_cols', 'icd_cols', 'lab_cols', 'med_cols']

# EHR bundle: load it, then fail on the first required key that is absent.
# NOTE: unpickling executes code from the file; only load a pickle you trust.
try:
    with open(ehr_pkl_file, 'rb') as pkl_handle:
        ehr_data = pd.read_pickle(pkl_handle)
    print("EHR pickle data loaded successfully.")
    first_missing_key = next((key for key in required_keys if key not in ehr_data), None)
    if first_missing_key is not None:
        raise KeyError(f"Missing required key in ehr_data: '{first_missing_key}'")
    print(f"EHR data keys verified: {list(ehr_data.keys())}")
except FileNotFoundError as pkl_error:
    print(f"Error loading pickle file: {pkl_error}")
    print(f"Please ensure ehr_pkl_file path is correct.")
    raise
except KeyError as key_error:
    print(f"Error: {key_error}")
    raise
except Exception as other_error:
    print(f"An unexpected error occurred loading pickle file: {other_error}")
    raise

# === 2. Feature Aggregation ===
print("\n--- 2. Feature Aggregation ---")
def aggregate_admission_data(df, ehr_data_dict, aggregation, df_name="DataFrame", has_labels=True):
    """Collapse each admission's feature matrix into one vector, optionally with its label.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an 'id' column. When ``has_labels`` is True it must also
        contain 'readmitted_within_30days'; the label of the first row of each
        id is used.
    ehr_data_dict : dict
        'feat_dict' maps an admission id to a numpy array; 'feature_cols' is used
        only for its length, which is the expected size of an aggregated vector.
    aggregation : {'last', 'mean', 'max'}
        'last' takes the final row, 'mean' / 'max' reduce over axis 0.
    df_name : str
        Label used in the progress and warning messages.
    has_labels : bool
        When False no labels are collected and ``None`` is returned for them.

    Returns
    -------
    tuple
        ``(X_np, y_np, processed_ids)``: stacked vectors, labels (or ``None``)
        and the admission ids in the same row order.

    Raises
    ------
    ValueError
        If ``aggregation`` is not one of the supported methods.
    RuntimeError
        If the number of vectors or labels differs from the number of ids.
    """
    aggregators = {
        'last': lambda seq: seq[-1],
        'mean': lambda seq: np.mean(seq, axis=0),
        'max': lambda seq: np.max(seq, axis=0),
    }
    # Validate once, up front. Raising inside the per-admission try block would be
    # swallowed by the broad handler there and silently skip every admission.
    if aggregation not in aggregators:
        raise ValueError(f"Invalid aggregation method: {aggregation}")
    aggregate = aggregators[aggregation]

    print(f"Aggregating features for {df_name} using '{aggregation}' method...")
    admission_ids = df['id'].unique()
    feature_dict = ehr_data_dict.get('feat_dict', {})
    # Loop-invariant: expected length of one aggregated vector (0 disables the check).
    expected_dim = len(ehr_data_dict.get('feature_cols', []))

    # One pass over df gives id -> label of its first row, replacing a full
    # DataFrame scan per admission.
    label_by_id = {}
    if has_labels:
        first_rows = df.drop_duplicates(subset='id', keep='first')
        label_by_id = dict(zip(first_rows['id'].to_numpy(),
                               first_rows['readmitted_within_30days'].to_numpy()))

    X = []
    y = [] if has_labels else None
    processed_ids = []  # ids for which a feature vector was actually produced

    for adm_id in admission_ids:
        if adm_id not in feature_dict:
            print(f"Warning: Admission ID {adm_id} not found in ehr_data['feat_dict']. Skipping.")
            continue

        adm_features = feature_dict[adm_id]
        if not isinstance(adm_features, np.ndarray) or adm_features.size == 0:
            print(f"Warning: Invalid or empty features found for Admission ID {adm_id}. Skipping.")
            continue

        try:
            features = aggregate(adm_features)
        except IndexError:
            print(f"Warning: IndexError during '{aggregation}' aggregation for Admission ID {adm_id} (likely empty sequence). Skipping.")
            continue
        except Exception as e:
            print(f"Warning: Error during aggregation for Admission ID {adm_id}: {e}. Skipping.")
            continue

        # A scalar result (1-D input) has an empty shape; report it as size 0
        # rather than failing on shape[0].
        feat_shape = np.shape(features)
        actual_dim = feat_shape[0] if feat_shape else 0
        if expected_dim > 0 and actual_dim != expected_dim:
            print(f"Warning: Feature dimension mismatch for {adm_id}. Expected {expected_dim}, got {actual_dim}. Skipping.")
            continue

        if has_labels:
            # Resolve the label before storing anything so X, y and ids stay aligned.
            if adm_id not in label_by_id:
                print(f"Warning: Admission ID {adm_id} found in ehr_dict but not in {df_name}. Cannot get label. Removing feature vector.")
                continue
            y.append(label_by_id[adm_id])

        X.append(features)
        processed_ids.append(adm_id)

    X_np = np.array(X)
    y_np = np.array(y) if has_labels and y is not None else None

    # Final consistency check between ids and the produced arrays.
    if X_np.shape[0] != len(processed_ids):
        print(f"Error: Mismatch between number of processed IDs ({len(processed_ids)}) and feature rows ({X_np.shape[0]}) for {df_name}.")
        raise RuntimeError("Inconsistent number of samples after aggregation.")
    if has_labels and y_np is not None and y_np.shape[0] != len(processed_ids):
        print(f"Error: Mismatch between number of processed IDs ({len(processed_ids)}) and labels ({y_np.shape[0]}) for {df_name}.")
        raise RuntimeError("Inconsistent number of labels after aggregation.")

    print(f"Aggregation complete for {df_name}. Shape: {X_np.shape}")
    return X_np, y_np, processed_ids

# Aggregate every split with the configured method; the test split carries no labels.
X_train, y_train, train_ids = aggregate_admission_data(
    train_df, ehr_data, AGGREGATION_METHOD, df_name="train_df"
)
X_val, y_val, val_ids = aggregate_admission_data(
    val_df, ehr_data, AGGREGATION_METHOD, df_name="val_df"
)
X_test, _, test_ids = aggregate_admission_data(
    test_df, ehr_data, AGGREGATION_METHOD, df_name="test_df", has_labels=False
)

# Stop early when any split came back without data.
if any(split_matrix.size == 0 for split_matrix in (X_train, X_val, X_test)):
    raise ValueError("Aggregation resulted in empty datasets. Check input data and aggregation logic.")


# === 3. Initial Feature Filtering ===
print("\n--- 3. Initial Feature Filtering ---")
# Names of the aggregated columns, in column order.
feature_names = list(ehr_data["feature_cols"])

# The name list and the aggregated matrix must describe the same columns.
if len(feature_names) != X_train.shape[1]:
    raise ValueError(f"Mismatch: {len(feature_names)} feature names in 'feature_cols' but {X_train.shape[1]} columns in aggregated data.")

# --- Per-feature statistics on the training split ---
print("Calculating feature statistics...")
std_devs = np.std(X_train, axis=0)
std_df = pd.DataFrame({"Feature": feature_names, "Std_Dev": std_devs})

correlations = []
p_values = []
for i, feature_name in enumerate(feature_names):
    column_values = X_train[:, i]
    if std_devs[i] == 0:
        # Constant column: no association can be measured.
        stat, pval = 0, 1
    elif feature_name in ehr_data["icd_cols"]:
        # ICD columns: chi-square test on the value-by-label contingency table.
        contingency_table = pd.crosstab(column_values, y_train)
        try:
            stat, pval, _, _ = chi2_contingency(contingency_table)
        except ValueError:
            stat, pval = 0, 1
    else:
        # Everything else: point-biserial correlation with the binary label.
        try:
            if np.std(y_train) == 0:
                stat, pval = 0, 1
            else:
                stat, pval = pointbiserialr(column_values, y_train)
            if np.isnan(stat):
                stat, pval = 0, 1
        except ValueError:
            stat, pval = 0, 1
    correlations.append(stat)
    p_values.append(pval)

corr_df = pd.DataFrame({"Feature": feature_names, "Correlation": correlations, "P-value": p_values})
corr_df["Abs_Correlation"] = np.abs(corr_df["Correlation"])

# Function to analyze zero-value ratio
def analyze_zeros(ehr_data, feature_cols_category):
    """Compute, per feature, the share of zero entries across all admission matrices.

    Parameters
    ----------
    ehr_data : dict
        'feature_cols' is the ordered sequence of column names (any sequence,
        not only a list); 'feat_dict' maps an admission id to a 2-D array whose
        columns follow that order.
    feature_cols_category : iterable of str
        Features to report. Names missing from 'feature_cols' are skipped.

    Returns
    -------
    pandas.DataFrame
        Columns 'Feature' and 'Zero_Ratio', one row per reported feature, in the
        order given. The ratio is zero count / total row count over every matrix
        in 'feat_dict', or 0 when there are no rows at all.
    """
    # First position of each name, matching list.index() on duplicates.
    first_index = {}
    for position, name in enumerate(list(ehr_data["feature_cols"])):
        first_index.setdefault(name, position)

    reported = [name for name in feature_cols_category if name in first_index]
    column_ids = [first_index[name] for name in reported]

    matrices = list(ehr_data["feat_dict"].values())
    # Loop-invariant denominator: total number of rows over all admissions.
    total_values = sum(matrix.shape[0] for matrix in matrices)

    # Single pass over the matrices, counting zeros for all requested columns at once.
    zero_counts = np.zeros(len(column_ids), dtype=np.int64)
    if column_ids:
        for matrix in matrices:
            zero_counts += np.count_nonzero(matrix[:, column_ids] == 0, axis=0)

    all_zeros_info = [
        (name, (zero_counts[k] / total_values) if total_values > 0 else 0)
        for k, name in enumerate(reported)
    ]
    return pd.DataFrame(all_zeros_info, columns=["Feature", "Zero_Ratio"])

# Zero ratios are computed per feature category and stacked into one table.
demo_zero_df = analyze_zeros(ehr_data, ehr_data["demo_cols"])
icd_zero_df = analyze_zeros(ehr_data, ehr_data["icd_cols"])
lab_zero_df = analyze_zeros(ehr_data, ehr_data["lab_cols"])
med_zero_df = analyze_zeros(ehr_data, ehr_data["med_cols"])
zero_df = pd.concat([demo_zero_df, icd_zero_df, lab_zero_df, med_zero_df], ignore_index=True)

# One row per feature: std dev, association statistic / p-value and zero ratio.
combined_df = pd.merge(pd.merge(std_df, corr_df, on='Feature'), zero_df, on='Feature', how='left')
# Assign the filled column back. A chained `combined_df[col].fillna(..., inplace=True)`
# acts on a temporary under pandas Copy-on-Write and would leave the NaNs in place.
combined_df['Zero_Ratio'] = combined_df['Zero_Ratio'].fillna(0)
print("Feature statistics calculated.")

# --- Apply Filtering ---
print("Applying initial filtering based on thresholds...")
feature_types = {
    'icd': ehr_data["icd_cols"], 'med': ehr_data["med_cols"],
    'lab': ehr_data["lab_cols"], 'demo': ehr_data["demo_cols"]
}
protected_features = set(CLINICALLY_RELEVANT)  # never removed by this step
sig_threshold = INITIAL_FILTER_THRESHOLDS['correlation']['max_pvalue']
min_effect = INITIAL_FILTER_THRESHOLDS['correlation']['min_abs_corr']
min_std = INITIAL_FILTER_THRESHOLDS['std_dev']

features_to_remove = []
for ftype, cols in feature_types.items():
    zero_ratio_limit = INITIAL_FILTER_THRESHOLDS['zero_ratio'][ftype]
    # The variance criterion is not applied to ICD features.
    check_variance = ftype not in ['icd']
    for feature in cols:
        if feature in protected_features:
            continue
        row = combined_df[combined_df['Feature'] == feature]
        if row.empty:
            continue
        row = row.iloc[0]
        effect_size = row['Abs_Correlation']
        # A feature is dropped only when every applicable criterion holds at once.
        remove_condition = (
            (row['Zero_Ratio'] > zero_ratio_limit) and
            (not check_variance or row['Std_Dev'] < min_std) and
            (effect_size < min_effect) and
            (row['P-value'] > sig_threshold)
        )
        if remove_condition:
            features_to_remove.append(feature)

# Set lookups keep the index computations linear in the number of features.
removal_names = set(features_to_remove)
remove_indices = sorted([i for i, feature in enumerate(feature_names) if feature in removal_names], reverse=True)
print(f"Identified {len(features_to_remove)} features to remove based on initial filtering.")

# Remove features from data and names list
X_train_filtered = np.delete(X_train, remove_indices, axis=1)
X_val_filtered = np.delete(X_val, remove_indices, axis=1)
X_test_filtered = np.delete(X_test, remove_indices, axis=1)
removed_positions = set(remove_indices)
remaining_features_after_initial = [f for i, f in enumerate(feature_names) if i not in removed_positions]

print(f"Number of features remaining after initial filtering: {len(remaining_features_after_initial)}")
print(f"Shapes after initial filtering - Train X: {X_train_filtered.shape}, Val X: {X_val_filtered.shape}, Test X: {X_test_filtered.shape}")


# === 4. Collinearity Handling ===
print("\n--- 4. Collinearity Handling ---")
# --- Function Definitions ---
def find_collinear_features(X, feature_names, threshold):
    """List every feature pair whose absolute correlation exceeds ``threshold``.

    Returns a DataFrame with columns 'Feature1', 'Feature2', 'Correlation'; each
    pair appears once (upper triangle of the correlation matrix, diagonal
    excluded). With fewer than two columns the frame is empty.
    """
    pair_columns = ['Feature1', 'Feature2', 'Correlation']
    if X.shape[1] < 2:
        return pd.DataFrame(columns=pair_columns)

    # Standardise first, then take absolute pairwise correlations.
    standardized = StandardScaler().fit_transform(X)
    corr_matrix = pd.DataFrame(standardized, columns=feature_names).corr().abs()

    # Keep only entries strictly above the diagonal so each pair is seen once.
    above_diagonal = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
    upper = corr_matrix.where(above_diagonal)

    row_ids, col_ids = np.where(upper > threshold)
    collinear_pairs = []
    for r, c in zip(row_ids, col_ids):
        collinear_pairs.append((feature_names[r], feature_names[c], corr_matrix.iloc[r, c]))
    return pd.DataFrame(collinear_pairs, columns=pair_columns)

def select_non_collinear_features(X, y, feature_names, clinically_relevant, corr_threshold):
    """Drop one feature from each highly correlated pair and report what is left.

    Pairs come from ``find_collinear_features``. Within a pair a clinically
    relevant feature is kept over a non-listed one; otherwise the feature with
    the higher (or equal, for the first) mutual information with ``y`` is kept.
    Pairs that already involve a dropped feature, or two kept features, are
    skipped.

    Returns ``(keep_indices, final_feature_names)``, where the indices refer to
    the columns of ``X`` / positions in ``feature_names``.
    """
    n_columns = X.shape[1]
    if n_columns < 2:
        # Nothing can be collinear with fewer than two features.
        return list(range(n_columns)), feature_names

    collinear_df = find_collinear_features(X, feature_names, corr_threshold)
    print(f"Found {len(collinear_df)} pairs of features with correlation > {corr_threshold}")
    if collinear_df.empty:
        print("No highly collinear features found to remove.")
        return list(range(n_columns)), feature_names

    def _flat_importance():
        # Fallback table: every feature gets the same importance of 0.
        return pd.DataFrame({'Feature': feature_names, 'Importance': 0})

    # Mutual information needs at least two target classes.
    try:
        if len(np.unique(y)) > 1:
            importance = mutual_info_classif(X, y, random_state=42)
            importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importance})
        else:
            print("Warning: Target variable has only one class. Cannot calculate mutual information. Using fallback for selection.")
            importance_df = _flat_importance()
    except Exception as e:
        print(f"Error calculating mutual information: {e}. Using fallback for selection.")
        importance_df = _flat_importance()

    def _importance_of(name):
        # First matching row; raises IndexError when the name is absent.
        return importance_df.loc[importance_df['Feature'] == name, 'Importance'].iloc[0]

    clinically_relevant_remaining = [name for name in clinically_relevant if name in feature_names]
    features_to_keep = set(clinically_relevant_remaining)
    features_to_remove_coll = set()

    for f1, f2 in zip(collinear_df['Feature1'], collinear_df['Feature2']):
        # A pair is already resolved once either member has been dropped ...
        if f1 in features_to_remove_coll or f2 in features_to_remove_coll:
            continue
        # ... or once both members have been marked as kept.
        if f1 in features_to_keep and f2 in features_to_keep:
            continue

        if f1 in clinically_relevant_remaining:
            features_to_keep.add(f1)
            if f2 not in clinically_relevant_remaining:
                features_to_remove_coll.add(f2)
        elif f2 in clinically_relevant_remaining:
            features_to_keep.add(f2)
            if f1 not in clinically_relevant_remaining:
                features_to_remove_coll.add(f1)
        else:
            # Neither is clinically relevant: the more informative one wins.
            try:
                imp1 = _importance_of(f1)
                imp2 = _importance_of(f2)
                if imp1 >= imp2:
                    winner, loser = f1, f2
                else:
                    winner, loser = f2, f1
                features_to_keep.add(winner)
                features_to_remove_coll.add(loser)
            except IndexError:
                print(f"Warning: Feature {f1} or {f2} not found in importance df. Keeping first ({f1}).")
                features_to_keep.add(f1)
                features_to_remove_coll.add(f2)

    # Survivors, in their original column order.
    keep_indices = [idx for idx, name in enumerate(feature_names) if name not in features_to_remove_coll]
    final_feature_names = [feature_names[idx] for idx in keep_indices]

    print(f"Removed {len(features_to_remove_coll)} features due to collinearity.")
    return keep_indices, final_feature_names
# --- End of Function Definitions ---

# Decide on the training split which columns survive the collinearity check.
keep_indices_final, final_features = select_non_collinear_features(
    X_train_filtered,
    y_train,
    remaining_features_after_initial,  # names aligned with X_train_filtered's columns
    CLINICALLY_RELEVANT,
    corr_threshold=COLLINEARITY_THRESHOLD,
)

# Apply the same column selection to every split.
X_train_final, X_val_final, X_test_final = (
    filtered_split[:, keep_indices_final]
    for filtered_split in (X_train_filtered, X_val_filtered, X_test_filtered)
)

print(f"Number of features remaining after collinearity handling: {len(final_features)}")
print(f"Shapes after final filtering - Train X: {X_train_final.shape}, Val X: {X_val_final.shape}, Test X: {X_test_final.shape}")


# === 5. Feature Scaling ===
print("\n--- 5. Feature Scaling ---")
# The scaler is fitted on the training split only and reused for the others.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_final)
X_val_scaled = scaler.transform(X_val_final)
X_test_scaled = scaler.transform(X_test_final)
print("Scaling complete.")

print("\nPreprocessing finished. Ready for modeling using:")
for array_label, array_shape in (
    ("X_train_scaled", X_train_scaled.shape),
    ("y_train", y_train.shape),
    ("X_val_scaled", X_val_scaled.shape),
    ("y_val", y_val.shape),
    ("X_test_scaled", X_test_scaled.shape),
):
    print(f" {array_label} ({array_shape})")
print(f" final_features ({len(final_features)} names)")
# The id lists are row-aligned with the corresponding matrices.
print(f" train_ids ({len(train_ids)}), val_ids ({len(val_ids)}), test_ids ({len(test_ids)})")

# Optional: Save the processed data if needed for separate modeling notebooks
# np.savez('processed_data.npz',
#          X_train_scaled=X_train_scaled, y_train=y_train, train_ids=train_ids,
#          X_val_scaled=X_val_scaled, y_val=y_val, val_ids=val_ids,
#          X_test_scaled=X_test_scaled, test_ids=test_ids,
#          final_features=final_features)
# print("\nProcessed data saved to processed_data.npz")



--- 1. Data Loading ---
CSV data loaded successfully.
EHR pickle data loaded successfully.
EHR data keys verified: ['feat_dict', 'feature_cols', 'cat_idxs', 'cat_dims', 'demo_cols', 'icd_cols', 'lab_cols', 'med_cols']

--- 2. Feature Aggregation ---
Aggregating features for train_df using 'mean' method...
Aggregation complete for train_df. Shape: (8234, 171)
Aggregating features for val_df using 'mean' method...
Aggregation complete for val_df. Shape: (2788, 171)
Aggregating features for test_df using 'mean' method...
Aggregation complete for test_df. Shape: (2741, 171)

--- 3. Initial Feature Filtering ---
Calculating feature statistics...
Feature statistics calculated.
Applying initial filtering based on thresholds...
Identified 40 features to remove based on initial filtering.
Number of features remaining after initial filtering: 131
Shapes after initial filtering - Train X: (8234, 131), Val X: (2788, 131), Test X: (2741, 131)

--- 4. Collinearity Handling ---
Found 2 pairs of featu

In [None]:
# 1. Import necessary libraries (if not already imported)
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from sklearn.preprocessing import StandardScaler # Ensure scaler is available if needed
import pandas as pd
import numpy as np
import time # To time the training

# Inputs expected from the preprocessing cell: X_train_scaled, y_train,
# X_val_scaled, y_val, X_test_scaled, test_ids and test_csv_file.

# 2. Train an SVM Classifier Model
print("Training SVM Classifier model...")
print("(Note: SVM training with probability=True can be slow)...")
svm_settings = {
    'random_state': 42,
    'C': 1.0,                    # regularization parameter (library default)
    'kernel': 'rbf',
    'gamma': 'scale',            # library default
    'probability': True,         # required for predict_proba; slows training
    'class_weight': 'balanced',  # compensate for class imbalance
}
svm_model = SVC(**svm_settings)

# Wall-clock timing of the fit.
start_time = time.time()
svm_model.fit(X_train_scaled, y_train)
end_time = time.time()
print(f"Training complete. Time taken: {end_time - start_time:.2f} seconds")

# 3. Evaluate the Model on the Validation Set
print("\nEvaluating SVM model on validation set...")
y_val_pred_svm = svm_model.predict(X_val_scaled)
val_probabilities = svm_model.predict_proba(X_val_scaled)
y_val_pred_proba_svm = val_probabilities[:, 1]  # second probability column

accuracy_svm = accuracy_score(y_val, y_val_pred_svm)
auc_roc_svm = roc_auc_score(y_val, y_val_pred_proba_svm)

print(f"Validation Accuracy (SVM): {accuracy_svm:.4f}")
print(f"Validation AUC-ROC (SVM): {auc_roc_svm:.4f}")
print("\nValidation Classification Report (SVM):")
try:
    svm_report = classification_report(y_val, y_val_pred_svm)
    print(svm_report)
except ValueError as e:
    print(f"Could not generate classification report: {e}")


# 4. Predict Probabilities on the Test Set using SVM
print("\nPredicting probabilities on the test set using SVM...")
test_probabilities = svm_model.predict_proba(X_test_scaled)
y_test_pred_proba_svm = test_probabilities[:, 1]
print("Prediction complete.")

# 5. Create Submission File for SVM
# Use the test_ids returned by the aggregation function
# (they are row-aligned with X_test, and therefore with the predictions).
print(f"Number of test predictions: {len(y_test_pred_proba_svm)}")
print(f"Number of test IDs: {len(test_ids)}")

if len(test_ids) != len(y_test_pred_proba_svm):
    print(f"Error: Length mismatch! Test IDs ({len(test_ids)}) vs Predictions ({len(y_test_pred_proba_svm)})")
    # Attempt fallback using unique IDs from original test_df if lengths match
    # NOTE(review): a matching count does not prove the fallback ids are in the
    # same row order as the predictions - confirm before trusting this path.
    try:
        original_test_df = pd.read_csv(test_csv_file)
        fallback_test_ids = original_test_df['id'].unique()
        if len(fallback_test_ids) == len(y_test_pred_proba_svm):
            print("Warning: Using unique IDs from original test_df as fallback.")
            test_ids_to_use = fallback_test_ids
        else:
             # This ValueError is caught by the handler just below, which prints
             # it and raises a new ValueError with a shorter message.
             raise ValueError("Cannot reliably create submission file due to ID mismatch even with fallback.")
    except Exception as fallback_e:
         print(f"Fallback ID generation failed: {fallback_e}")
         raise ValueError("Cannot reliably create submission file due to ID mismatch.")
else:
    print("Test IDs and predictions match.")
    test_ids_to_use = test_ids # Use the IDs from aggregation


# Build the two-column submission table and write it to the working directory.
try:
    submission_df_svm = pd.DataFrame({
        'id': test_ids_to_use,
        'readmitted_within_30days': y_test_pred_proba_svm
    })

    # Define the output path
    output_path_svm = "submission_svm.csv" # Give it a distinct name

    # Save the submission file
    submission_df_svm.to_csv(output_path_svm, index=False)

    print(f"\nSubmission file for SVM created successfully at: {output_path_svm}")
    print("Submission file head (SVM):")
    print(submission_df_svm.head())

except Exception as e:
    # Report the failure, then re-raise so the cell still fails.
    print(f"An unexpected error occurred while preparing the submission file: {e}")
    raise


Training SVM Classifier model...
(Note: SVM training with probability=True can be slow)...
Training complete. Time taken: 37.07 seconds

Evaluating SVM model on validation set...
Validation Accuracy (SVM): 0.7848
Validation AUC-ROC (SVM): 0.7342

Validation Classification Report (SVM):
              precision    recall  f1-score   support

       False       0.88      0.85      0.87      2307
        True       0.39      0.45      0.42       481

    accuracy                           0.78      2788
   macro avg       0.64      0.65      0.64      2788
weighted avg       0.80      0.78      0.79      2788


Predicting probabilities on the test set using SVM...
Prediction complete.
Number of test predictions: 2741
Number of test IDs: 2741
Test IDs and predictions match.

Submission file for SVM created successfully at: submission_svm.csv
Submission file head (SVM):
                  id  readmitted_within_30days
0  16026764_21404901                  0.160818
1  18463717_24608289         

In [None]:
import os
import pandas as pd
import numpy as np
import time
from scipy.stats import pointbiserialr, chi2_contingency
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
import warnings

# Silence FutureWarning messages. NOTE: the filter is category-wide, so it hides
# every FutureWarning raised in this kernel, not one specific pandas message.
warnings.simplefilter(action='ignore', category=FutureWarning)

# --- Configuration ---
# REVIEW POINT: DATA_DIR is not assigned in this cell; it must already exist in
# the kernel (the Drive-mount cell sets it) before the paths below are built.

# --- Parameters to Test ---
# Each aggregation method and each collinearity threshold listed here is a
# candidate setting for the experiment.
AGGREGATION_METHODS_TO_TEST = ['last', 'mean', 'max']
COLLINEARITY_THRESHOLDS_TO_TEST = [0.6, 0.7, 0.8, 0.9] # Add or remove values as needed

# --- Fixed Parameters for this Experiment ---
# Same threshold values and feature list as the single-run preprocessing cell;
# this assignment rebinds those notebook-level names.
INITIAL_FILTER_THRESHOLDS = { # Using original strict thresholds for consistency in this test
    'zero_ratio': {'icd': 0.95, 'med': 0.95, 'lab': 0.95, 'demo': 0.90},
    'std_dev': 0.001,
    'correlation': {'min_abs_corr': 0.001, 'max_pvalue': 0.05}
}
CLINICALLY_RELEVANT = [
    'Creatinine Blood', 'Hemoglobin Blood', 'Hematocrit Blood',
    'Potassium Blood', 'Sodium Blood', 'Glucose Blood',
    'Troponin T Blood', 'Platelet Count Blood', 'Eosinophils Blood',
    'pH Urine', 'pO2 Blood', 'pCO2 Blood', 'Anion Gap Blood',
    'I10-I16', 'N17-N19', 'J09-J18', 'E70-E88', 'I30-I52',
    'J40-J47', 'B20-B20',
    'ANTICOAGULANTS', 'ANTIBIOTICS', 'IMMUNOSUPPRESSANTS',
    'ANTIINFLAM.TUMOR NECROSIS FACTOR INHIBITING AGENTS'
]
# Define file paths globally within the cell
# (run_preprocessing below reads these module-level names when it loads data).
train_csv_file = os.path.join(DATA_DIR, "train.csv")
val_csv_file = os.path.join(DATA_DIR, "valid.csv")
test_csv_file = os.path.join(DATA_DIR, "test.csv") # Needed for ID mapping if generating submissions later
ehr_pkl_file = os.path.join(DATA_DIR, "ehr_preprocessed_seq_by_day_cat_embedding.pkl")

# --- Preprocessing Function ---
def run_preprocessing(aggregation_method, collinearity_threshold):
    """Run the whole preprocessing pipeline for one experiment setting.

    Steps: load the train/validation CSVs and the EHR pickle, collapse each
    admission's feature matrix into one vector, drop uninformative features,
    prune collinear features, then standardise with statistics fitted on the
    training split only.

    Args:
        aggregation_method: 'last', 'mean' or 'max'; how an admission's matrix
            is reduced along its first axis.
        collinearity_threshold: absolute pairwise correlation above which one
            feature of the pair is removed.

    Returns:
        (X_train_scaled, y_train, X_val_scaled, y_val, True) on success, or
        (None, None, None, None, False) when any step raised. The error is
        printed rather than propagated so the experiment loop can continue.
    """
    print(f"\n--- Running Preprocessing: Aggregation='{aggregation_method}', Collinearity Threshold={collinearity_threshold} ---")
    try:
        # 1. Load Data (inside function to ensure fresh load each time)
        print("Loading data...")
        train_df = pd.read_csv(train_csv_file)
        val_df = pd.read_csv(val_csv_file)
        # NOTE(review): unpickling can execute arbitrary code; only load a trusted file.
        with open(ehr_pkl_file, 'rb') as f:
            ehr_data = pd.read_pickle(f)
        print("Data loaded.")

        # 2. Feature Aggregation
        print("Aggregating features...")

        def aggregate_admission_data(df, ehr_data_dict, aggregation, df_name="DataFrame", has_labels=True):
            """Collapse each admission's feature matrix into a single vector.

            Returns (X, y, processed_ids); y is None when has_labels is False.
            Admissions with no usable feature matrix are skipped.
            Raises ValueError for an unknown aggregation method.
            """
            aggregators = {
                'last': lambda matrix: matrix[-1],
                'mean': lambda matrix: np.mean(matrix, axis=0),
                'max': lambda matrix: np.max(matrix, axis=0),
            }
            # Validate up front: raised inside the per-admission try/except the
            # error would be swallowed and every admission silently skipped.
            if aggregation not in aggregators:
                raise ValueError(f"Invalid aggregation method: {aggregation}")
            aggregate = aggregators[aggregation]

            feature_dict = ehr_data_dict.get('feat_dict', {})
            expected_dim = len(ehr_data_dict.get('feature_cols', []))

            # First label seen per id, built once instead of filtering df per admission.
            label_by_id = None
            if has_labels:
                label_by_id = (
                    df.drop_duplicates(subset='id')
                    .set_index('id')['readmitted_within_30days']
                    .to_dict()
                )

            X = []
            y = [] if has_labels else None
            processed_ids = []
            for adm_id in df['id'].unique():
                if adm_id not in feature_dict: continue
                adm_features = feature_dict[adm_id]
                if not isinstance(adm_features, np.ndarray) or adm_features.size == 0: continue

                try:
                    features = aggregate(adm_features)
                except Exception:
                    continue  # best effort: skip a matrix that cannot be aggregated

                if expected_dim > 0 and features.shape[0] != expected_dim: continue

                if has_labels:
                    if adm_id not in label_by_id: continue  # no label available for this id
                    y.append(label_by_id[adm_id])
                X.append(features)
                processed_ids.append(adm_id)

            X_np = np.array(X)
            y_np = np.array(y) if has_labels else None
            if X_np.shape[0] != len(processed_ids): raise RuntimeError("Inconsistent samples after aggregation.")
            if y_np is not None and y_np.shape[0] != len(processed_ids): raise RuntimeError("Inconsistent labels after aggregation.")
            return X_np, y_np, processed_ids

        X_train, y_train, _ = aggregate_admission_data(train_df, ehr_data, aggregation_method, df_name="train_df")
        X_val, y_val, _ = aggregate_admission_data(val_df, ehr_data, aggregation_method, df_name="val_df")
        # We only need X_train/y_train and X_val/y_val for this experiment
        if X_train.size == 0 or X_val.size == 0 or y_train is None or y_val is None:
            raise ValueError("Aggregation resulted in empty train/val data or labels.")
        print("Aggregation complete.")

        # 3. Initial Feature Filtering
        print("Performing initial filtering...")
        feature_names = list(ehr_data["feature_cols"])
        if len(feature_names) != X_train.shape[1]: raise ValueError("Feature name/column mismatch.")
        std_devs = np.std(X_train, axis=0)
        std_df = pd.DataFrame({"Feature": feature_names, "Std_Dev": std_devs})

        # Association of every feature with the label. ICD columns go through a
        # chi-square test, so their "Correlation" entry holds the chi-square
        # statistic; every other column uses the point-biserial correlation.
        # Constant columns and failed tests are recorded as (0, 1).
        correlations = []; p_values = []
        for i in range(X_train.shape[1]):
            fname = feature_names[i]
            if std_devs[i] == 0: corr, pval = 0, 1
            elif fname in ehr_data["icd_cols"]:
                try: corr, pval, _, _ = chi2_contingency(pd.crosstab(X_train[:, i], y_train))
                except ValueError: corr, pval = 0, 1
            else:
                try:
                    if np.std(y_train) == 0: corr, pval = 0, 1
                    else: corr, pval = pointbiserialr(X_train[:, i], y_train)
                    if np.isnan(corr): corr, pval = 0, 1
                except ValueError: corr, pval = 0, 1
            correlations.append(corr); p_values.append(pval)
        corr_df = pd.DataFrame({"Feature": feature_names, "Correlation": correlations, "P-value": p_values})
        corr_df["Abs_Correlation"] = np.abs(corr_df["Correlation"])

        # Zero ratios: a single pass over every matrix in feat_dict counts the
        # zeros of all columns at once (previously each feature triggered its
        # own full scan and recomputed the row total).
        n_feature_cols = len(feature_names)
        zero_counts = np.zeros(n_feature_cols, dtype=np.int64)
        total_values = 0
        for m in ehr_data["feat_dict"].values():
            if m.shape[1] < n_feature_cols:
                raise IndexError("EHR matrix has fewer columns than feature_cols.")
            zero_counts += (m[:, :n_feature_cols] == 0).sum(axis=0)
            total_values += m.shape[0]

        def analyze_zeros(ehr_data, feature_cols_category):
            """Return a Feature/Zero_Ratio frame for the features of one category."""
            all_zeros_info = []
            for fname in feature_cols_category:
                try: fidx = feature_names.index(fname)
                except ValueError: continue  # name not among feature_cols
                zero_ratio = (zero_counts[fidx] / total_values) if total_values > 0 else 0
                all_zeros_info.append((fname, zero_ratio))
            return pd.DataFrame(all_zeros_info, columns=["Feature", "Zero_Ratio"])

        demo_zero_df = analyze_zeros(ehr_data, ehr_data["demo_cols"])
        icd_zero_df = analyze_zeros(ehr_data, ehr_data["icd_cols"])
        lab_zero_df = analyze_zeros(ehr_data, ehr_data["lab_cols"])
        med_zero_df = analyze_zeros(ehr_data, ehr_data["med_cols"])
        zero_df = pd.concat([demo_zero_df, icd_zero_df, lab_zero_df, med_zero_df], ignore_index=True)
        combined_df = pd.merge(pd.merge(std_df, corr_df, on='Feature'), zero_df, on='Feature', how='left')
        # Assign the result back: a chained in-place fillna works on a temporary
        # under pandas copy-on-write and would leave the NaNs in place.
        combined_df['Zero_Ratio'] = combined_df['Zero_Ratio'].fillna(0)

        # A feature is dropped only when ALL criteria agree (mostly zeros, near
        # constant unless ICD, negligible association, not significant);
        # clinically relevant features are never dropped here.
        feature_types = {'icd': ehr_data["icd_cols"], 'med': ehr_data["med_cols"], 'lab': ehr_data["lab_cols"], 'demo': ehr_data["demo_cols"]}
        features_to_remove = []
        for ftype, cols in feature_types.items():
            for feature in cols:
                if feature in CLINICALLY_RELEVANT: continue
                row = combined_df[combined_df['Feature'] == feature]
                if row.empty: continue
                row = row.iloc[0]
                check_variance = ftype not in ['icd']
                sig_threshold = INITIAL_FILTER_THRESHOLDS['correlation']['max_pvalue']
                effect_size = row['Abs_Correlation']
                remove_condition = (
                    (row['Zero_Ratio'] > INITIAL_FILTER_THRESHOLDS['zero_ratio'][ftype]) and
                    (not check_variance or row['Std_Dev'] < INITIAL_FILTER_THRESHOLDS['std_dev']) and
                    (effect_size < INITIAL_FILTER_THRESHOLDS['correlation']['min_abs_corr']) and
                    (row['P-value'] > sig_threshold)
                )
                if remove_condition: features_to_remove.append(feature)
        removed_names = set(features_to_remove)
        remove_indices = [i for i, feature in enumerate(feature_names) if feature in removed_names]
        X_train_filtered = np.delete(X_train, remove_indices, axis=1)
        X_val_filtered = np.delete(X_val, remove_indices, axis=1)
        remaining_features_after_initial = [f for f in feature_names if f not in removed_names]
        print(f"Initial filtering removed {len(features_to_remove)} features. {len(remaining_features_after_initial)} remaining.")

        # 4. Collinearity Handling
        print("Performing collinearity handling...")

        def find_collinear_features(X, feature_names, threshold):
            """List feature pairs whose absolute correlation exceeds threshold."""
            if X.shape[1] < 2: return pd.DataFrame(columns=['Feature1', 'Feature2', 'Correlation'])
            scaler_coll = StandardScaler()
            X_scaled_coll = scaler_coll.fit_transform(X)
            corr_matrix = pd.DataFrame(X_scaled_coll, columns=feature_names).corr().abs()
            # Strict upper triangle so each pair is reported once and the diagonal is ignored
            upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
            collinear_pairs = [(feature_names[i], feature_names[j], corr_matrix.iloc[i, j])
                               for i, j in zip(*np.where(upper > threshold))]
            return pd.DataFrame(collinear_pairs, columns=['Feature1', 'Feature2', 'Correlation'])

        def select_non_collinear_features(X, y, feature_names, clinically_relevant, corr_threshold):
            """Choose which columns survive collinearity pruning.

            For every collinear pair a clinically relevant feature wins over a
            non-relevant one; otherwise the feature with the lower mutual
            information with y is removed. Returns (keep_indices, kept_names).
            """
            if X.shape[1] < 2: return list(range(X.shape[1])), feature_names
            collinear_df = find_collinear_features(X, feature_names, corr_threshold)
            print(f"Found {len(collinear_df)} pairs with correlation > {corr_threshold}")
            if collinear_df.empty: return list(range(X.shape[1])), feature_names
            try:
                if len(np.unique(y)) > 1:
                    importance = mutual_info_classif(X, y, random_state=42)
                    importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importance})
                else: importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': 0})
            except Exception as mi_error:
                # Best effort: all-zero importances make the first feature of a pair win.
                print(f"Warning: mutual information failed ({mi_error}); using zero importance for all features.")
                importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': 0})
            clinically_relevant_remaining = [f for f in clinically_relevant if f in feature_names]
            features_to_keep = set(clinically_relevant_remaining)
            features_to_remove_coll = set()
            for _, row in collinear_df.iterrows():
                f1, f2 = row['Feature1'], row['Feature2']
                # A pair is already resolved once either member has been removed
                if f1 in features_to_remove_coll or f2 in features_to_remove_coll: continue
                if f1 in features_to_keep and f2 in features_to_keep: continue
                if f1 in clinically_relevant_remaining:
                    if f2 not in clinically_relevant_remaining: features_to_remove_coll.add(f2)
                    continue
                elif f2 in clinically_relevant_remaining:
                    if f1 not in clinically_relevant_remaining: features_to_remove_coll.add(f1)
                    continue
                else:
                    try:
                        imp1 = importance_df.loc[importance_df['Feature'] == f1, 'Importance'].iloc[0]
                        imp2 = importance_df.loc[importance_df['Feature'] == f2, 'Importance'].iloc[0]
                        if imp1 >= imp2: features_to_remove_coll.add(f2)
                        else: features_to_remove_coll.add(f1)
                    except IndexError: features_to_remove_coll.add(f2)  # Fallback: remove second one
            final_feature_set = (set(feature_names) - features_to_remove_coll)
            keep_indices = sorted([i for i, f in enumerate(feature_names) if f in final_feature_set])
            final_feature_names = [feature_names[i] for i in keep_indices]
            print(f"Collinearity handling removed {len(features_to_remove_coll)} features.")
            return keep_indices, final_feature_names

        keep_indices_final, final_features = select_non_collinear_features(
            X_train_filtered, y_train, remaining_features_after_initial,
            CLINICALLY_RELEVANT, corr_threshold=collinearity_threshold
        )
        if not final_features: raise ValueError("Collinearity handling removed all features.")

        X_train_final = X_train_filtered[:, keep_indices_final]
        X_val_final = X_val_filtered[:, keep_indices_final]
        print(f"Final feature count: {len(final_features)}")

        # 5. Feature Scaling (fit on train only, then apply to validation)
        print("Scaling features...")
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train_final)
        X_val_scaled = scaler.transform(X_val_final)
        print("Preprocessing complete.")

        return X_train_scaled, y_train, X_val_scaled, y_val, True # Success

    except Exception as e:
        # Deliberate catch-all: one failing configuration must not abort the whole grid.
        print(f"!!! Preprocessing failed for Agg='{aggregation_method}', Coll='{collinearity_threshold}': {e} !!!")
        return None, None, None, None, False # Failure

# --- Experiment Loop ---
# One SVM is trained per (aggregation, collinearity threshold) pair; every run,
# successful or not, contributes exactly one row to results_list.
results_list = []
print("\n=== Starting SVM Experiment Loop ===")
print(f"Testing Aggregations: {AGGREGATION_METHODS_TO_TEST}")
print(f"Testing Collinearity Thresholds: {COLLINEARITY_THRESHOLDS_TO_TEST}")
print("WARNING: This may take a long time due to SVM training!")

for agg_method in AGGREGATION_METHODS_TO_TEST:
    for coll_thresh in COLLINEARITY_THRESHOLDS_TO_TEST:
        run_start_time = time.time()

        # Preprocess for this configuration
        X_train_s, y_train_p, X_val_s, y_val_p, success = run_preprocessing(agg_method, coll_thresh)

        # Both stay None unless training and scoring succeed
        val_auc = None
        train_time = None

        if not success:
            print("Skipping SVM training due to preprocessing failure.")
        else:
            # Fit the RBF SVM and score it on the validation split
            print(f"\nTraining SVM (Agg='{agg_method}', Coll='{coll_thresh}')...")
            svm_model = SVC(
                kernel='rbf',
                C=1.0,
                gamma='scale',
                class_weight='balanced',
                probability=True,
                random_state=42,
            )
            try:
                svm_start_time = time.time()
                svm_model.fit(X_train_s, y_train_p)
                svm_end_time = time.time()
                train_time = svm_end_time - svm_start_time
                print(f"SVM Training complete. Time: {train_time:.2f}s")

                # Probability of the positive class drives the AUC
                y_val_pred_proba_svm = svm_model.predict_proba(X_val_s)[:, 1]
                val_auc = roc_auc_score(y_val_p, y_val_pred_proba_svm)
                print(f"Validation AUC-ROC: {val_auc:.4f}")
            except Exception as e:
                print(f"!!! SVM training/evaluation failed: {e} !!!")
                # A failure after fit() must not leave a stale train time behind
                val_auc = None
                train_time = None

        # Record the outcome of this run
        results_list.append(dict([
            ('Aggregation', agg_method),
            ('Collinearity Threshold', coll_thresh),
            ('Validation AUC', val_auc),
            ('SVM Train Time (s)', train_time),
            ('Preprocessing Success', success),
        ]))
        run_end_time = time.time()
        print(f"--- Finished run for Agg='{agg_method}', Coll='{coll_thresh}'. Total time: {run_end_time - run_start_time:.2f}s ---")


# --- Display Results ---
print("\n=== Experiment Results ===")
results_df = pd.DataFrame(results_list)
# Render the numeric columns as fixed-precision text; missing values are left as-is
results_df = results_df.assign(**{
    'Validation AUC': results_df['Validation AUC'].map('{:.4f}'.format, na_action='ignore'),
    'SVM Train Time (s)': results_df['SVM Train Time (s)'].map('{:.2f}'.format, na_action='ignore'),
})

print(results_df)
print("==========================")




=== Starting SVM Experiment Loop ===
Testing Aggregations: ['last', 'mean', 'max']
Testing Collinearity Thresholds: [0.6, 0.7, 0.8, 0.9]

--- Running Preprocessing: Aggregation='last', Collinearity Threshold=0.6 ---
Loading data...
Data loaded.
Aggregating features...
Aggregation complete.
Performing initial filtering...
Initial filtering removed 40 features. 131 remaining.
Performing collinearity handling...
Found 11 pairs with correlation > 0.6
Collinearity handling removed 6 features.
Final feature count: 125
Scaling features...
Preprocessing complete.

Training SVM (Agg='last', Coll='0.6')...
SVM Training complete. Time: 35.13s
Validation AUC-ROC: 0.7264
--- Finished run for Agg='last', Coll='0.6'. Total time: 99.06s ---

--- Running Preprocessing: Aggregation='last', Collinearity Threshold=0.7 ---
Loading data...
Data loaded.
Aggregating features...
Aggregation complete.
Performing initial filtering...
Initial filtering removed 40 features. 131 remaining.
Performing collinearity 

In [None]:
# 1. Import necessary libraries
import os
import pandas as pd
import numpy as np
import time
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score, classification_report, accuracy_score
import warnings

# Keep pandas FutureWarnings out of the cell output
warnings.simplefilter('ignore', FutureWarning)

# --- Configuration ---
AGGREGATION_METHOD = 'mean'  # switch to 'last' or 'max' to test another aggregation
# Input file locations, all resolved against DATA_DIR (defined by an earlier cell)
train_csv_file, val_csv_file, test_csv_file, ehr_pkl_file = (
    os.path.join(DATA_DIR, file_name)
    for file_name in (
        "train.csv",
        "valid.csv",
        "test.csv",
        "ehr_preprocessed_seq_by_day_cat_embedding.pkl",
    )
)

# --- Function Definition (Aggregation Step Only) ---
def aggregate_admission_data(df, ehr_data_dict, aggregation, df_name="DataFrame", has_labels=True):
    """Aggregate features per admission with optional labels.

    Args:
        df: DataFrame with an 'id' column and, when has_labels is True, a
            'readmitted_within_30days' column.
        ehr_data_dict: dict whose 'feat_dict' maps admission id -> numpy array
            and whose 'feature_cols' lists the expected feature names.
        aggregation: 'last' (final row), 'mean' or 'max' (reduced over axis 0).
        df_name: label used only in the progress messages.
        has_labels: when False no label column is read and y is None.

    Returns:
        (X, y, processed_ids): stacked feature vectors, the matching labels
        (None without labels) and the ids that were kept, in order of first
        appearance in df. Ids missing from 'feat_dict', with an empty or
        non-array entry, or with an unexpected vector length are skipped.

    Raises:
        ValueError: if aggregation is not 'last', 'mean' or 'max'.
        RuntimeError: if the sample or label count disagrees with the kept ids.
    """
    print(f"Aggregating features for {df_name} using '{aggregation}' method...")

    aggregators = {
        'last': lambda matrix: matrix[-1],
        'mean': lambda matrix: np.mean(matrix, axis=0),
        'max': lambda matrix: np.max(matrix, axis=0),
    }
    # Validate up front: raised inside the per-admission try/except the error
    # would be swallowed and every admission silently skipped.
    if aggregation not in aggregators:
        raise ValueError(f"Invalid aggregation method: {aggregation}")
    aggregate = aggregators[aggregation]

    feature_dict = ehr_data_dict.get('feat_dict', {})
    expected_dim = len(ehr_data_dict.get('feature_cols', []))

    # First label seen per id, built once instead of filtering df per admission.
    label_by_id = None
    if has_labels:
        label_by_id = (
            df.drop_duplicates(subset='id')
            .set_index('id')['readmitted_within_30days']
            .to_dict()
        )

    X = []
    y = [] if has_labels else None
    processed_ids = []
    for adm_id in df['id'].unique():
        if adm_id not in feature_dict: continue
        adm_features = feature_dict[adm_id]
        if not isinstance(adm_features, np.ndarray) or adm_features.size == 0: continue
        try:
            features = aggregate(adm_features)
        except Exception:
            continue  # best effort: skip a matrix that cannot be aggregated
        if expected_dim > 0 and features.shape[0] != expected_dim: continue
        if has_labels:
            if adm_id not in label_by_id: continue  # no label available for this id
            y.append(label_by_id[adm_id])
        X.append(features)
        processed_ids.append(adm_id)

    X_np = np.array(X)
    y_np = np.array(y) if has_labels else None
    if X_np.shape[0] != len(processed_ids): raise RuntimeError("Inconsistent samples after aggregation.")
    if y_np is not None and y_np.shape[0] != len(processed_ids): raise RuntimeError("Inconsistent labels after aggregation.")
    print(f"Aggregation complete for {df_name}. Shape: {X_np.shape}")
    return X_np, y_np, processed_ids

# --- Main Logic ---
# Baseline run: an SVM trained on the aggregated features with no feature
# filtering; its validation AUC is stored in results_agg_only.
results_agg_only = {}

try:
    # === Run SVM on Aggregated Data (No Filtering) ===
    print("--- Evaluating SVM on Aggregated Data (No Filtering) ---")

    # 1a. Read the three splits and the preprocessed EHR pickle
    print("Loading data...")
    train_df = pd.read_csv(train_csv_file)
    val_df = pd.read_csv(val_csv_file)
    test_df = pd.read_csv(test_csv_file)  # used for the optional submission below
    with open(ehr_pkl_file, 'rb') as ehr_file:
        ehr_data = pd.read_pickle(ehr_file)
    print("Data loaded.")

    # 1b. One feature vector per admission for every split (test has no labels)
    print("Aggregating features...")
    X_train_agg, y_train_agg, train_ids_agg = aggregate_admission_data(
        train_df, ehr_data, AGGREGATION_METHOD, df_name="train_df")
    X_val_agg, y_val_agg, val_ids_agg = aggregate_admission_data(
        val_df, ehr_data, AGGREGATION_METHOD, df_name="val_df")
    X_test_agg, _, test_ids_agg = aggregate_admission_data(
        test_df, ehr_data, AGGREGATION_METHOD, df_name="test_df", has_labels=False)
    print(f"Aggregated shapes: Train={X_train_agg.shape}, Val={X_val_agg.shape}, Test={X_test_agg.shape}")

    # 1c. Standardise: statistics come from the training split only
    print("Scaling aggregated features...")
    scaler_agg = StandardScaler()
    X_train_scaled_agg = scaler_agg.fit_transform(X_train_agg)
    X_val_scaled_agg = scaler_agg.transform(X_val_agg)
    X_test_scaled_agg = scaler_agg.transform(X_test_agg)
    print("Scaling complete.")

    # 1d. Fit the RBF SVM and time the fit
    print("Training SVM on aggregated (unfiltered) data...")
    print("(Note: SVM training with probability=True can be slow)...")
    svm_model_agg = SVC(
        kernel='rbf',
        C=1.0,
        gamma='scale',
        class_weight='balanced',
        probability=True,
        random_state=42,
    )
    start_time = time.time()
    svm_model_agg.fit(X_train_scaled_agg, y_train_agg)
    end_time = time.time()
    train_time_agg = end_time - start_time
    print(f"Training complete. Time: {train_time_agg:.2f}s")

    # Validation metrics: hard predictions for accuracy/report, probabilities for AUC
    print("\nEvaluating SVM model on validation set (aggregated data)...")
    y_val_pred_svm_agg = svm_model_agg.predict(X_val_scaled_agg)
    y_val_pred_proba_agg = svm_model_agg.predict_proba(X_val_scaled_agg)[:, 1]
    auc_agg = roc_auc_score(y_val_agg, y_val_pred_proba_agg)
    accuracy_agg = accuracy_score(y_val_agg, y_val_pred_svm_agg)
    results_agg_only['SVM (Aggregated Only)'] = auc_agg

    print(f"Validation Accuracy (SVM Aggregated): {accuracy_agg:.4f}")
    print(f"Validation AUC-ROC (SVM Aggregated): {auc_agg:.4f}")
    print("\nValidation Classification Report (SVM Aggregated):")
    try:
        print(classification_report(y_val_agg, y_val_pred_svm_agg))
    except ValueError as e:
        print(f"Could not generate classification report: {e}")

    # --- Optional: Create Submission File ---
    create_submission = True  # flip to False to skip writing the CSV
    if create_submission:
        print("\nPredicting probabilities on the test set (aggregated data)...")
        y_test_pred_proba_svm_agg = svm_model_agg.predict_proba(X_test_scaled_agg)[:, 1]
        print("Prediction complete.")

        print(f"Number of test predictions: {len(y_test_pred_proba_svm_agg)}")
        print(f"Number of test IDs: {len(test_ids_agg)}")

        # NOTE(review): ids and predictions both come from the same aggregation
        # call, so this check cannot detect test ids that aggregation skipped;
        # compare against test_df['id'] if full coverage is required.
        if len(test_ids_agg) == len(y_test_pred_proba_svm_agg):
            print("Test IDs and predictions match.")
            submission_df_svm_agg = pd.DataFrame({
                'id': test_ids_agg,
                'readmitted_within_30days': y_test_pred_proba_svm_agg,
            })
            output_path_svm_agg = "submission_svm_aggregated.csv"
            submission_df_svm_agg.to_csv(output_path_svm_agg, index=False)
            print(f"\nSubmission file for SVM (Aggregated) created successfully at: {output_path_svm_agg}")
            print("Submission file head (SVM Aggregated):")
            print(submission_df_svm_agg.head())
        else:
            print(f"Error: Length mismatch! Test IDs ({len(test_ids_agg)}) vs Predictions ({len(y_test_pred_proba_svm_agg)})")
            # No fallback is implemented; the submission is simply not written

except Exception as e:
    # Report the failure with its traceback but let the rest of the cell run
    print(f"\n!!! An error occurred: {e} !!!")
    import traceback
    traceback.print_exc()

print("\n--- Comparison Point ---")
# Format the score only when it exists: applying ':.4f' to the 'N/A' fallback
# string raises ValueError, which happened whenever the run above had failed.
auc_agg_only = results_agg_only.get('SVM (Aggregated Only)')
auc_agg_only_text = f"{auc_agg_only:.4f}" if auc_agg_only is not None else "N/A"
print(f"Validation AUC for SVM on Aggregated Data (171 features): {auc_agg_only_text}")
print("Compare this to the AUC score you obtained for SVM after full preprocessing.")
print("-" * 26)



--- Evaluating SVM on Aggregated Data (No Filtering) ---
Loading data...
Data loaded.
Aggregating features...
Aggregating features for train_df using 'mean' method...
Aggregation complete for train_df. Shape: (8234, 171)
Aggregating features for val_df using 'mean' method...
Aggregation complete for val_df. Shape: (2788, 171)
Aggregating features for test_df using 'mean' method...
Aggregation complete for test_df. Shape: (2741, 171)
Aggregated shapes: Train=(8234, 171), Val=(2788, 171), Test=(2741, 171)
Scaling aggregated features...
Scaling complete.
Training SVM on aggregated (unfiltered) data...
(Note: SVM training with probability=True can be slow)...
Training complete. Time: 39.63s

Evaluating SVM model on validation set (aggregated data)...
Validation Accuracy (SVM Aggregated): 0.7816
Validation AUC-ROC (SVM Aggregated): 0.7343

Validation Classification Report (SVM Aggregated):
              precision    recall  f1-score   support

       False       0.89      0.84      0.86    

In [None]:
# 1. Import necessary libraries
import os
import pandas as pd
import numpy as np
import time
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score, classification_report, accuracy_score
import warnings

# Keep pandas FutureWarnings out of the cell output
warnings.simplefilter('ignore', FutureWarning)

# --- Configuration ---
AGGREGATION_METHOD = 'mean'  # switch to 'last' or 'max' to test another aggregation
# Input file locations, all resolved against DATA_DIR (defined by an earlier cell)
train_csv_file, val_csv_file, test_csv_file, ehr_pkl_file = (
    os.path.join(DATA_DIR, file_name)
    for file_name in (
        "train.csv",
        "valid.csv",
        "test.csv",
        "ehr_preprocessed_seq_by_day_cat_embedding.pkl",
    )
)

# --- Function Definition (Aggregation Step Only) ---
def aggregate_admission_data(df, ehr_data_dict, aggregation, df_name="DataFrame", has_labels=True):
    """Collapse each admission's feature array into a single feature vector.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an 'id' column. When ``has_labels`` is True it must also
        contain 'readmitted_within_30days'; the label of the first row of each
        id is used.
    ehr_data_dict : dict
        Read through the keys 'feat_dict' (admission id -> numpy array) and
        'feature_cols' (its length is the expected vector size). Both default
        to empty when absent; an empty 'feature_cols' disables the size check.
    aggregation : str
        One of 'last' (final row), 'mean' or 'max' (both reduced over axis 0).
    df_name : str
        Label used only in the progress messages.
    has_labels : bool
        Whether to collect labels alongside the features.

    Returns
    -------
    tuple
        ``(X_np, y_np, processed_ids)``: the stacked feature vectors, the label
        array (``None`` when ``has_labels`` is False) and the list of admission
        ids that were kept, all in the same order.

    Raises
    ------
    ValueError
        If ``aggregation`` is not a supported method.
    RuntimeError
        If the number of feature rows or labels disagrees with the kept ids.
    """
    reducers = {
        'last': lambda arr: arr[-1],
        'mean': lambda arr: np.mean(arr, axis=0),
        'max': lambda arr: np.max(arr, axis=0),
    }
    # Validate once, up front. Previously the ValueError was raised inside the
    # per-admission try/except and swallowed, so a typo produced an empty
    # dataset instead of an error.
    if aggregation not in reducers:
        raise ValueError(f"Invalid aggregation method: {aggregation}")
    reduce_fn = reducers[aggregation]

    print(f"Aggregating features for {df_name} using '{aggregation}' method...")
    admission_ids = df['id'].unique()
    feature_dict = ehr_data_dict.get('feat_dict', {})
    feature_cols = ehr_data_dict.get('feature_cols', [])
    expected_dim = len(feature_cols)

    # Build the id -> first label mapping once instead of filtering the whole
    # frame for every admission (which was quadratic in the number of rows).
    label_by_id = None
    if has_labels:
        first_rows = df.drop_duplicates(subset='id', keep='first')
        label_by_id = dict(zip(first_rows['id'].to_numpy(),
                               first_rows['readmitted_within_30days'].to_numpy()))

    X = []
    y = [] if has_labels else None
    processed_ids = []
    failed_aggregations = 0

    for adm_id in admission_ids:
        if adm_id not in feature_dict:
            continue
        adm_features = feature_dict[adm_id]
        if not isinstance(adm_features, np.ndarray) or adm_features.size == 0:
            continue
        try:
            features = reduce_fn(adm_features)
        except Exception:
            # Best-effort: an array that cannot be reduced is skipped, but
            # counted so the loss is visible in the output below.
            failed_aggregations += 1
            continue
        if expected_dim > 0 and features.shape[0] != expected_dim:
            continue
        if has_labels:
            # Check the label before appending so nothing has to be rolled back.
            if adm_id not in label_by_id:
                continue
            y.append(label_by_id[adm_id])
        X.append(features)
        processed_ids.append(adm_id)

    if failed_aggregations:
        print(f"Warning: skipped {failed_aggregations} admission(s) in {df_name} "
              f"whose features could not be aggregated.")

    X_np = np.array(X)
    y_np = np.array(y) if has_labels else None
    if X_np.shape[0] != len(processed_ids):
        raise RuntimeError("Inconsistent samples after aggregation.")
    if y_np is not None and y_np.shape[0] != len(processed_ids):
        raise RuntimeError("Inconsistent labels after aggregation.")
    print(f"Aggregation complete for {df_name}. Shape: {X_np.shape}")
    return X_np, y_np, processed_ids

# --- Main Logic ---
results_comparison = {} # Validation AUC per SVM variant, read by the final comparison


def _fit_and_evaluate_svm(tag, class_weight, X_train, y_train, X_val, y_val):
    """Train one RBF-kernel SVC and print its validation metrics.

    Both runs of the balanced-vs-unbalanced comparison go through this helper
    so that they share hyper-parameters and differ only in ``class_weight``.

    Args:
        tag: Label used in the printed messages ("Unbalanced" or "Balanced").
        class_weight: Passed straight to ``SVC(class_weight=...)``.
        X_train, y_train: Training features and labels.
        X_val, y_val: Validation features and labels.

    Returns:
        Tuple ``(model, y_pred, y_proba, auc, accuracy, train_time)`` where
        ``y_proba`` is column 1 of ``predict_proba`` and ``train_time`` is the
        wall-clock duration of ``fit`` in seconds.
    """
    with_or_without = "WITH" if class_weight == 'balanced' else "WITHOUT"
    print(f"\n--- Training SVM {with_or_without} class_weight='balanced' ---")
    print("(Note: SVM training with probability=True can be slow)...")
    model = SVC(random_state=42, C=1.0, kernel='rbf', gamma='scale',
                probability=True, class_weight=class_weight)
    fit_started = time.time()
    model.fit(X_train, y_train)
    train_time = time.time() - fit_started
    print(f"Training complete. Time: {train_time:.2f}s")

    print(f"\nEvaluating {tag.upper()} SVM model on validation set...")
    y_pred = model.predict(X_val)
    y_proba = model.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_val, y_proba)
    accuracy = accuracy_score(y_val, y_pred)

    print(f"Validation Accuracy (SVM {tag}): {accuracy:.4f}")
    print(f"Validation AUC-ROC (SVM {tag}): {auc:.4f}")
    print(f"\nValidation Classification Report (SVM {tag}):")
    try:
        print(classification_report(y_val, y_pred))
    except ValueError as e:
        print(f"Could not generate classification report: {e}")
    return model, y_pred, y_proba, auc, accuracy, train_time


try:
    # === Run SVM Comparison on Aggregated Data (No Filtering) ===
    print("--- Comparing SVM (Balanced vs. Unbalanced) on Aggregated Data (No Filtering) ---")
    # 1a. Load Data
    print("Loading data...")
    train_df = pd.read_csv(train_csv_file)
    val_df = pd.read_csv(val_csv_file)
    test_df = pd.read_csv(test_csv_file) # Needed for the submission file below
    # NOTE: unpickling executes code embedded in the file; only load a trusted pickle.
    with open(ehr_pkl_file, 'rb') as f:
        ehr_data = pd.read_pickle(f)
    print("Data loaded.")

    # 1b. Aggregate Features
    print("Aggregating features...")
    X_train_agg, y_train_agg, train_ids_agg = aggregate_admission_data(train_df, ehr_data, AGGREGATION_METHOD, df_name="train_df")
    X_val_agg, y_val_agg, val_ids_agg = aggregate_admission_data(val_df, ehr_data, AGGREGATION_METHOD, df_name="val_df")
    X_test_agg, _, test_ids_agg = aggregate_admission_data(test_df, ehr_data, AGGREGATION_METHOD, df_name="test_df", has_labels=False)
    print(f"Aggregated shapes: Train={X_train_agg.shape}, Val={X_val_agg.shape}, Test={X_test_agg.shape}")

    # 1c. Scale Aggregated Features (fit on train only, then reuse for val/test)
    print("Scaling aggregated features...")
    scaler_agg = StandardScaler()
    X_train_scaled_agg = scaler_agg.fit_transform(X_train_agg)
    X_val_scaled_agg = scaler_agg.transform(X_val_agg)
    X_test_scaled_agg = scaler_agg.transform(X_test_agg)
    print("Scaling complete.")

    # --- Run 1: SVM WITHOUT Balancing ---
    (svm_model_unbalanced, y_val_pred_svm_unbalanced, y_val_pred_proba_unbalanced,
     auc_unbalanced, accuracy_unbalanced, train_time_unbalanced) = _fit_and_evaluate_svm(
        "Unbalanced", None, X_train_scaled_agg, y_train_agg, X_val_scaled_agg, y_val_agg)
    results_comparison['SVM (Unbalanced)'] = auc_unbalanced

    # --- Run 2: SVM WITH Balancing ---
    (svm_model_balanced, y_val_pred_svm_balanced, y_val_pred_proba_balanced,
     auc_balanced, accuracy_balanced, train_time_balanced) = _fit_and_evaluate_svm(
        "Balanced", 'balanced', X_train_scaled_agg, y_train_agg, X_val_scaled_agg, y_val_agg)
    results_comparison['SVM (Balanced)'] = auc_balanced


    # --- Optional: Create Submission File (Using the Balanced Model Results) ---
    create_submission = True # Set to False if you don't need a submission file now
    if create_submission:
        print("\nPredicting probabilities on the test set using BALANCED SVM...")
        y_test_pred_proba_svm_balanced = svm_model_balanced.predict_proba(X_test_scaled_agg)[:, 1]
        print("Prediction complete.")

        print(f"Number of test predictions: {len(y_test_pred_proba_svm_balanced)}")
        print(f"Number of test IDs: {len(test_ids_agg)}")

        # IDs and predictions both come from the aggregated test set, so their
        # lengths agree by construction. The failure that can actually happen
        # is an admission being dropped during aggregation, so report that.
        missing_test_ids = set(test_df['id'].unique()) - set(test_ids_agg)
        if missing_test_ids:
            print(f"Warning: {len(missing_test_ids)} test admission(s) were dropped during "
                  f"aggregation and are absent from the submission.")

        if len(test_ids_agg) != len(y_test_pred_proba_svm_balanced):
            print(f"Error: Length mismatch! Test IDs ({len(test_ids_agg)}) vs Predictions ({len(y_test_pred_proba_svm_balanced)})")
        else:
            print("Test IDs and predictions match.")
            submission_df_svm_balanced = pd.DataFrame({
                'id': test_ids_agg,
                'readmitted_within_30days': y_test_pred_proba_svm_balanced
            })
            output_path_svm_balanced = "submission_svm_aggregated_balanced.csv" # Specific name
            submission_df_svm_balanced.to_csv(output_path_svm_balanced, index=False)
            print(f"\nSubmission file for SVM (Aggregated, Balanced) created successfully at: {output_path_svm_balanced}")
            print("Submission file head (SVM Aggregated, Balanced):")
            print(submission_df_svm_balanced.head())

except Exception as e:
    # Report the failure but let the cell continue so the final comparison
    # still prints whatever results were collected.
    print(f"\n!!! An error occurred: {e} !!!")
    import traceback
    traceback.print_exc()

# --- Final Comparison Output ---
# A score missing from results_comparison is shown as 'N/A' instead of a number.
print("\n--- SVM AUC Comparison (Aggregated Data Only) ---")
print(f"Aggregation Method: {AGGREGATION_METHOD}")
print("-" * 50)
auc_unbal = results_comparison.get('SVM (Unbalanced)', 'N/A')
auc_bal = results_comparison.get('SVM (Balanced)', 'N/A')

for _caption, _auc in (("SVM (Unbalanced): ", auc_unbal), ("SVM (Balanced):   ", auc_bal)):
    # Only real floats get the 4-decimal format; the placeholder prints as-is.
    _shown = f"{_auc:.4f}" if isinstance(_auc, float) else f"{_auc}"
    print(_caption + _shown)
print("-" * 50)


--- Comparing SVM (Balanced vs. Unbalanced) on Aggregated Data (No Filtering) ---
Loading data...
Data loaded.
Aggregating features...
Aggregating features for train_df using 'mean' method...
Aggregation complete for train_df. Shape: (8234, 171)
Aggregating features for val_df using 'mean' method...
Aggregation complete for val_df. Shape: (2788, 171)
Aggregating features for test_df using 'mean' method...
Aggregation complete for test_df. Shape: (2741, 171)
Aggregated shapes: Train=(8234, 171), Val=(2788, 171), Test=(2741, 171)
Scaling aggregated features...
Scaling complete.

--- Training SVM WITHOUT class_weight='balanced' ---
(Note: SVM training with probability=True can be slow)...
Training complete. Time: 26.00s

Evaluating UNBALANCED SVM model on validation set...
Validation Accuracy (SVM Unbalanced): 0.8468
Validation AUC-ROC (SVM Unbalanced): 0.7176

Validation Classification Report (SVM Unbalanced):
              precision    recall  f1-score   support

       False       0.85