In [17]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np

# Load the dataset
file_path = 'dataset/diabetic_data.csv'
data = pd.read_csv(file_path)

# Step 1: Replace '?' with NaN
data.replace('?', np.nan, inplace=True)

# Step 2: Convert 'readmitted' to binary (0 for NO, 1 for >30 or <=30)
data['readmitted'] = data['readmitted'].map({'NO': 0, '>30': 1, '<=30': 1})

# Step 3: Drop unnecessary columns
columns_to_drop = ['encounter_id', 'patient_nbr', 'payer_code', 'medical_specialty']
data.drop(columns=columns_to_drop, inplace=True, axis=1)

# Step 4: Separate features into categorical and numeric
categorical_cols = data.select_dtypes(include='object').columns
numeric_cols = data.select_dtypes(include=np.number).columns

# Step 5: Define preprocessing for numeric and categorical features
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessors in a column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Step 6: Apply preprocessing
processed_data = preprocessor.fit_transform(data)

# Extract feature names for one-hot encoded categorical columns
categorical_feature_names = preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out(categorical_cols)
all_feature_names = numeric_cols.tolist() + categorical_feature_names.tolist()

# Convert processed data to a DataFrame
processed_df = pd.DataFrame(processed_data, columns=all_feature_names)

# Add the target column back
processed_df['readmitted'] = data['readmitted'].values

# Step 7: Save processed data to CSV
output_file = 'dataset/preprocessed_data.csv'
processed_df.to_csv(output_file, index=False)
print(f"Preprocessed dataset saved to {output_file}")


Highly correlated features with 'readmitted':
readmitted                  1.000000
number_inpatient            0.166616
number_diagnoses            0.101320
number_emergency            0.086318
number_outpatient           0.081094
diabetesMed                 0.055338
admission_source_id         0.040878
time_in_hospital            0.037187
num_medications             0.034690
num_lab_procedures          0.034048
age                         0.025599
repaglinide                 0.017146
acarbose                    0.016817
glipizide                   0.016816
rosiglitazone               0.015029
pioglitazone                0.014171
A1Cresult                   0.009288
glimepiride                 0.005256
diag_3                      0.004434
nateglinide                 0.004388
glipizide-metformin         0.004310
glimepiride-pioglitazone    0.004132
acetohexamide               0.004132
chlorpropamide              0.003728
miglitol                    0.003290
glyburide-metformin         0

In [21]:
import pandas as pd
from sklearn.model_selection import cross_val_score, KFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_predict, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.pipeline import Pipeline

from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.svm import SVC

# Load the preprocessed dataset
df = pd.read_csv('dataset/preprocessed_data.csv')

# Separate features and target variable
target = df['readmitted']
features = df.drop('readmitted', axis=1)

# Split the dataset into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(features, target, 
                                                    test_size=0.2, random_state=0)

# Verify the split
print("Training features shape:", x_train.shape)
print("Testing features shape:", x_test.shape)
print("Training target shape:", y_train.shape)
print("Testing target shape:", y_test.shape)


Training features shape: (81412, 45)
Testing features shape: (20354, 45)
Training target shape: (81412,)
Testing target shape: (20354,)


In [22]:
# Define pipelines

# Decision Tree
pl_std_tree = Pipeline([('Standard Scaler', StandardScaler()),
                        ('DTClassifier', DecisionTreeClassifier())]) 
pl_mm_tree = Pipeline([('Min Max Scaler', MinMaxScaler()),
                       ('DTClassifier', DecisionTreeClassifier())]) 
# KNN
pl_std_knn = Pipeline([('Standard Scaler', StandardScaler()),
                       ('KNClassifier', KNeighborsClassifier())]) 
pl_mm_knn = Pipeline([('Min Max Scaler', MinMaxScaler()),
                      ('KNClassifier', KNeighborsClassifier())])

# Multinomial Naive Bayes
pl_mm_mnb = Pipeline([('Min Max Scaler', MinMaxScaler()),
                      ('MNBClassifier', MultinomialNB())])

# Bernoulli Naive Bayes
pl_mm_bnb = Pipeline([('Min Max Scaler', MinMaxScaler()),
                      ('BNBClassifier', BernoulliNB())])

# Gaussian Naive Bayes
pl_std_gnb = Pipeline([('Standard Scaler', StandardScaler()),
                       ('GNBClassifier', GaussianNB())])

# SVM
pl_std_svc = Pipeline([('Standard Scaler', StandardScaler()),
                       ('SVClassifier', SVC())]) 
pl_mm_svc = Pipeline([('Min Max Scaler', MinMaxScaler()),
                      ('SVClassifier', SVC())])

# Create list of pipelines
pipelines = [
    pl_std_tree, pl_mm_tree, 
    pl_std_knn, pl_mm_knn,
    pl_mm_mnb, pl_mm_bnb, pl_std_gnb, 
    # pl_std_svc, pl_mm_svc
]

# Pipeline dictionary for labeling
pipe_dict = {
    0: 'SS+Tree', 
    1: 'MM+Tree',
    2: 'SS+KNN',
    3: 'MM+KNN',
    4: 'MM+MNB',
    5: 'MM+BNB',
    6: 'SS+GNB',
    #7: 'SS+SVC',
    #8: 'MM+SVC'
}

# Handle mixed-type columns with non-numeric values (e.g., 'V58')
problematic_columns = ['diag_1', 'diag_2', 'diag_3']  # Update these with actual columns causing issues
for col in problematic_columns:
    x_train[col] = x_train[col].apply(lambda x: float(x) if str(x).replace('.', '', 1).isdigit() else -1)
    x_test[col] = x_test[col].apply(lambda x: float(x) if str(x).replace('.', '', 1).isdigit() else -1)

# Fitting the data and evaluating models
best_f1_score = 0

for i, pipeline in enumerate(pipelines):
    pipeline.fit(x_train, y_train)  # Fit the pipeline
    y_pred = pipeline.predict(x_test)  # Predict on the test set

    # Calculate precision, recall, f1-score
    report = classification_report(y_test, y_pred, output_dict=True)
    f1_score = report['weighted avg']['f1-score']
    
    print(f"\n{pipe_dict[i]} -> Classification Report:")
    
    # Header for table
    print(f"{'Class':<15}{'Precision':<12}{'Recall':<12}{'F1-Score':<12}{'Support':<12}")
    
    # Class-level metrics
    for label, metrics in report.items():
        if label not in ['accuracy', 'macro avg', 'weighted avg']:
            precision = metrics['precision']
            recall = metrics['recall']
            f1 = metrics['f1-score']
            support = metrics['support']
            print(f"{label:<15}{precision:<12.4f}{recall:<12.4f}{f1:<12.4f}{support:<12}")

    # Overall metrics (averages and accuracy)
    print(f"{'Weighted Avg':<15}{report['weighted avg']['precision']:<12.4f}"
          f"{report['weighted avg']['recall']:<12.4f}"
          f"{report['weighted avg']['f1-score']:<12.4f}{'-':<12}")
    print(f"{'Macro Avg':<15}{report['macro avg']['precision']:<12.4f}"
          f"{report['macro avg']['recall']:<12.4f}"
          f"{report['macro avg']['f1-score']:<12.4f}{'-':<12}")
    print(f"{'Accuracy':<15}{'-':<12}{'-':<12}"
          f"{f1_score:<12.4f}{'-':<12}")
    
    # Identify the best-performing pipeline based on F1 score
    if f1_score > best_f1_score:
        best_f1_score = f1_score
        best_pipeline = pipeline
        best_scaler = pipe_dict[i]

print(f"\nThe best pipeline for the dataset is {best_scaler} with an F1 score of {best_f1_score:.4f}")


ValueError: Unknown label type: continuous. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.