In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import os

# Create data directory if it doesn't exist
os.makedirs('data', exist_ok=True)

# Load data. Column names are supplied explicitly via `names=`, so every line
# of the CSV is read as a data row (assumes the file has no header line).
columns = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
           'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target']
data = pd.read_csv('data/heart_disease.csv', names=columns)

# Display initial data info
print("Initial data info:")
# DataFrame.info() writes its report to stdout itself and returns None;
# wrapping it in print() would emit a stray "None" line after the report.
data.info()
print("\nSample data:")
print(data.head())

# The raw file marks missing entries with '?'; turn those into NaN
data = data.replace('?', np.nan)

# Column groups, reused by the imputation and scaling steps further down
numeric_columns = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'ca']
categorical_columns = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal', 'target']

# Coerce every column of both groups to a numeric dtype in a single pass;
# anything that cannot be parsed as a number becomes NaN (errors='coerce')
for col in numeric_columns + categorical_columns:
    data[col] = pd.to_numeric(data[col], errors='coerce')

# Now handle missing values column by column
print("\nMissing values before imputation:")
print(data.isnull().sum())

# Never impute the label: filling 'target' with the most frequent class would
# fabricate labels. Rows without a label are dropped instead.
rows_before = len(data)
data = data.dropna(subset=['target']).reset_index(drop=True)
if len(data) != rows_before:
    print(f"\nDropped {rows_before - len(data)} rows with a missing target")

# For numeric columns, use median imputation
for col in numeric_columns:
    data[col] = data[col].fillna(data[col].median())

# For categorical feature columns, use mode imputation
for col in categorical_columns:
    if col == 'target':
        continue  # label handled above
    modes = data[col].mode()  # most frequent value(s), NaN excluded
    if modes.empty:
        # mode() is empty when the column has no observed value at all;
        # fail with a clear message instead of an opaque KeyError on [0]
        raise ValueError(f"Column '{col}' has no non-missing values to impute from")
    data[col] = data[col].fillna(modes.iloc[0])

print("\nMissing values after imputation:")
print(data.isnull().sum())

from sklearn.model_selection import train_test_split

# Split features and target BEFORE scaling, so the scaler's mean/std are
# learned from the training rows only. Fitting the scaler on the full dataset
# would leak test-set statistics into the training features.
X = data.drop('target', axis=1)
y = data['target']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Work on explicit copies so the column assignments below cannot affect X
# or trigger pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Scale numerical features: fit on the training set, then apply the same
# transformation to the test set and to the full frame that is saved to disk
scaler = StandardScaler()
X_train[numeric_columns] = scaler.fit_transform(X_train[numeric_columns])
X_test[numeric_columns] = scaler.transform(X_test[numeric_columns])
data[numeric_columns] = scaler.transform(data[numeric_columns])

# Keep X in sync with the scaled frame
X = data.drop('target', axis=1)

# Save processed data (numeric columns scaled with the training-set statistics)
data.to_csv('data/processed_heart_data.csv', index=False)

print("\nProcessed data shape:", data.shape)
print(data.head())

print(f"\nTraining set: {X_train.shape[0]} samples")
print(f"Testing set: {X_test.shape[0]} samples")

# Persist the four split objects as a single pickled tuple, in the order
# (X_train, X_test, y_train, y_test), so later steps can reload them
import pickle

with open('data/train_test_split.pkl', 'wb') as split_file:
    pickle.dump((X_train, X_test, y_train, y_test), split_file)

print("\nData preprocessing completed successfully!")

Initial data info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    float64
 1   sex       303 non-null    float64
 2   cp        303 non-null    float64
 3   trestbps  303 non-null    float64
 4   chol      303 non-null    float64
 5   fbs       303 non-null    float64
 6   restecg   303 non-null    float64
 7   thalach   303 non-null    float64
 8   exang     303 non-null    float64
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    float64
 11  ca        303 non-null    object 
 12  thal      303 non-null    object 
 13  target    303 non-null    int64  
dtypes: float64(11), int64(1), object(2)
memory usage: 33.3+ KB
None

Sample data:
    age  sex   cp  trestbps   chol  fbs  restecg  thalach  exang  oldpeak  \
0  63.0  1.0  1.0     145.0  233.0  1.0      2.0    150.0    0.0      2.3   
1  67.0  1.0  4