In [3]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Load the dataset
file_path = "Healthcare-Diabetes.csv"
data = pd.read_csv(file_path)

# Display the first few rows of the dataset
print("Dataset Loaded Successfully!")
data.head()


Dataset Loaded Successfully!


Unnamed: 0,Id,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,1,6,148,72,35,0,33.6,0.627,50,1
1,2,1,85,66,29,0,26.6,0.351,31,0
2,3,8,183,64,0,0,23.3,0.672,32,1
3,4,1,89,66,23,94,28.1,0.167,21,0
4,5,0,137,40,35,168,43.1,2.288,33,1


In [4]:
# Basic data info
print("Dataset Info:")
data.info()

# Handle missing values (if any)
print("\nMissing Values:")
print(data.isnull().sum())

# Replace missing values (example: mean for numerical columns)
data.fillna(data.mean(numeric_only=True), inplace=True)

# Drop irrelevant columns (if any)
# Example: Dropping an ID column
if "ID" in data.columns:
    data.drop(columns=["ID"], inplace=True)


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2768 entries, 0 to 2767
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Id                        2768 non-null   int64  
 1   Pregnancies               2768 non-null   int64  
 2   Glucose                   2768 non-null   int64  
 3   BloodPressure             2768 non-null   int64  
 4   SkinThickness             2768 non-null   int64  
 5   Insulin                   2768 non-null   int64  
 6   BMI                       2768 non-null   float64
 7   DiabetesPedigreeFunction  2768 non-null   float64
 8   Age                       2768 non-null   int64  
 9   Outcome                   2768 non-null   int64  
dtypes: float64(2), int64(8)
memory usage: 216.4 KB

Missing Values:
Id                          0
Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0


In [5]:
# Specify target and feature columns
target_column = "Outcome"  # Replace 'Outcome' with the actual target column name
X = data.drop(columns=[target_column])
y = data[target_column]

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Training Set Size: {X_train.shape}, Testing Set Size: {X_test.shape}")


Training Set Size: (2214, 9), Testing Set Size: (554, 9)


In [8]:
# Identify numerical and categorical columns
numeric_features = X.select_dtypes(include=["int64", "float64"]).columns
categorical_features = X.select_dtypes(include=["object"]).columns

# Preprocessing for numerical and categorical data
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessors into a column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)


In [9]:
# Create a full pipeline for preprocessing
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)
])

# Fit the pipeline to the training data
pipeline.fit(X_train)

# Transform the data
X_train_transformed = pipeline.transform(X_train)
X_test_transformed = pipeline.transform(X_test)

print("Data Preprocessing and Transformation Completed!")


Data Preprocessing and Transformation Completed!


In [10]:
# Convert the transformed arrays back to DataFrames for easier loading
X_train_df = pd.DataFrame(X_train_transformed.toarray())
X_test_df = pd.DataFrame(X_test_transformed.toarray())

# Save to CSV
X_train_df.to_csv("X_train_processed.csv", index=False)
X_test_df.to_csv("X_test_processed.csv", index=False)
y_train.to_csv("y_train.csv", index=False)
y_test.to_csv("y_test.csv", index=False)

print("Transformed Data Saved Successfully!")


AttributeError: 'numpy.ndarray' object has no attribute 'toarray'

In [11]:
# Convert the transformed arrays back to DataFrames for easier loading
X_train_df = pd.DataFrame(X_train_transformed)
X_test_df = pd.DataFrame(X_test_transformed)

# Save to CSV
X_train_df.to_csv("X_train_processed.csv", index=False)
X_test_df.to_csv("X_test_processed.csv", index=False)
y_train.to_csv("y_train.csv", index=False)
y_test.to_csv("y_test.csv", index=False)

print("Transformed Data Saved Successfully!")


Transformed Data Saved Successfully!
