In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Cap the number of columns pandas renders when a DataFrame is displayed.
pd.options.display.max_columns = 20

In [None]:
# Read the raw CSV; the path is relative to the notebook's working directory.
df = pd.read_csv(
    'cardiac arrest dataset.csv'
)

# Report the (rows, columns) shape as loaded.
print(f"original Shape: {df.shape}")

# Preview the first five rows as the cell's rich output.
df.head(n=5)

In [None]:
# Flag every row that exactly repeats an earlier row (pandas keeps the first
# occurrence unflagged by default), and count the flagged rows.
duplicate_mask = df.duplicated()
duplicated_rows = duplicate_mask.sum()
print(f"Duplicate lines detected: {duplicated_rows}")

# Keep only the rows that were not flagged; index labels are left as they were.
df = df[~duplicate_mask]
print(f"New shape after removal: {df.shape}")

In [None]:
# Label column; every other column of df is treated as an input feature.
target_column = 'target'
feature_columns = df.columns.drop(target_column).tolist()

# Features that are standardised as continuous values. Every remaining
# feature is handled as a categorical variable.
numerical_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

# Filter in DataFrame column order instead of taking a set difference:
# iterating a set of strings follows hash order, which changes between
# interpreter sessions, so the encoded column layout would differ from one
# kernel restart to the next.
categorical_features = [
    column for column in feature_columns if column not in numerical_features
]

print("Numerical variables:", numerical_features)
print("Categorical variables:", categorical_features)

In [None]:
# Standardise the numerical columns.
numerical_transformer = StandardScaler()

# One-hot encode the categorical columns, dropping the first level of each.
# NOTE(review): with handle_unknown='ignore', a category not seen during fit
# is encoded as all zeros, i.e. the same encoding as the dropped first level.
categorical_transformer = OneHotEncoder(drop='first', handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    # Always stack the result as a dense ndarray. With the default threshold
    # the output can become a scipy sparse matrix when the encoded data is
    # sparse enough, and np.save would then write a pickled object array
    # rather than a plain numeric .npy file.
    sparse_threshold=0,
)

In [None]:
# Separate the feature matrix from the label vector.
X = df.loc[:, feature_columns]
y = df.loc[:, target_column]

# Fit the preprocessor on X and transform X in a single step.
# NOTE(review): the fit here uses every row of X, so the learned statistics
# cover all samples, not only those used for training.
X_processed = preprocessor.fit_transform(X)

print(f" X processed - shape: {X_processed.shape}")
print(f" y - shape: {y.shape}")

In [None]:
# Split the *raw* features before any fitting so that the preprocessor never
# sees test rows. The split is stratified on y with a fixed seed and the same
# test fraction as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Learn scaling statistics and category levels from the training rows only,
# then apply that fitted transform unchanged to the held-out rows. Fitting on
# the full dataset would leak test-set information into the features.
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)

print(f"Train set: {X_train.shape}, {y_train.shape}")
print(f"Test set: {X_test.shape}, {y_test.shape}")

In [None]:
# Persist the transformed feature matrices with numpy's binary format.
for features_path, features in (("X_train.npy", X_train), ("X_test.npy", X_test)):
    np.save(features_path, features)

# Persist the label vectors as CSV files without the index column.
for labels_path, labels in (("y_train.csv", y_train), ("y_test.csv", y_test)):
    labels.to_csv(labels_path, index=False)

print("Data saved in .npy format.")