# Task 1: Create a Pipeline for Data Preprocessing, Transformation, and Loading Using Tools Like pandas and scikit-learn

In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import warnings
# NOTE(review): with no category or module argument this filter ignores every
# warning raised after this cell runs, including deprecation notices from
# pandas and scikit-learn. Consider narrowing it to the specific warning
# categories that are known to be noise.
warnings.filterwarnings("ignore")

In [25]:
# Function to load the dataset
def load_df(file_path):
    """Load a dataset from a CSV or Excel file into a DataFrame.

    The file type is chosen from the file extension, compared
    case-insensitively so that names such as ``DATA.CSV`` are accepted.
    Both plain strings and path-like objects (e.g. ``pathlib.Path``) work.

    Args:
        file_path: Location of a ``.csv``, ``.xls`` or ``.xlsx`` file.

    Returns:
        The loaded ``pandas.DataFrame``, or ``None`` if the extension is
        unsupported or reading fails. The error is printed, not raised.
    """
    try:
        # Normalise once: str() lets path-like objects through, lower() makes
        # the extension check case-insensitive.
        normalized_name = str(file_path).lower()
        if normalized_name.endswith('.csv'):
            df = pd.read_csv(file_path)
        elif normalized_name.endswith(('.xls', '.xlsx')):
            df = pd.read_excel(file_path)
        else:
            raise ValueError("Unsupported file format.")
        return df
    except Exception as e:
        # Deliberate best-effort: callers test for None instead of catching.
        print(f"Error loading df: {e}")
        return None

In [26]:
# Function to preprocess the df
def preprocess_df(df, target_column):
    """Report missing values, impute numeric gaps, and split features from target.

    Steps:
      1. Print the per-column count of missing values.
      2. Fill missing values in numeric columns with that column's mean.
      3. One-hot encode the feature columns (first level of each categorical
         column is dropped). The target column is left as-is.

    The input frame is not modified; all work is done on new objects.

    Args:
        df: Input ``pandas.DataFrame`` containing features and the target.
        target_column: Name of the column to return as ``y``.

    Returns:
        Tuple ``(X, y)`` where ``X`` is the encoded feature DataFrame and
        ``y`` is the target column as a Series.

    Raises:
        KeyError: If ``target_column`` is not a column of ``df``.
    """
    # Check for missing values
    print("Checking for missing values...")
    print(df.isnull().sum())
    # Fill missing values with the mean for numerical columns.
    # numeric_only=True stops text columns from raising TypeError on
    # pandas >= 2.0; fillna without inplace leaves the caller's frame untouched.
    filled = df.fillna(df.mean(numeric_only=True))
    # Split into X and y *before* encoding so a categorical target is not
    # expanded into dummy columns (which would make the drop below fail).
    y = filled[target_column]
    features = filled.drop(target_column, axis=1)
    # Convert categorical feature columns to numerical using one-hot encoding.
    X = pd.get_dummies(features, drop_first=True)
    return X, y

In [27]:
# Function to scale the features
def scale_features(X):
    """Standardize the feature matrix with a freshly fitted ``StandardScaler``.

    Args:
        X: Feature data accepted by ``StandardScaler.fit_transform``.

    Returns:
        The result of ``fit_transform`` on ``X``. The fitted scaler itself
        is discarded, so the transformation cannot be reapplied or inverted
        by the caller.
    """
    return StandardScaler().fit_transform(X)

In [28]:
# Main function to run the ETL process
def main():
    """Run the interactive ETL process: load, preprocess, scale, and save.

    Prompts for a dataset path and a target column name, then writes the
    scaled features plus the target column to ``cleaned_dataset.csv``
    (a relative path, so it lands in the current working directory).

    Returns early, writing nothing, when the dataset cannot be loaded or the
    requested target column does not exist.
    """
    # User input for dataset and target column.
    # Pasted paths often carry stray whitespace or surrounding quotes.
    file_path = input("Enter the path to your dataset (CSV or Excel file): ").strip().strip('"\'')
    df = load_df(file_path)

    # load_df has already printed the reason for the failure.
    if df is None:
        return

    print("Available columns in the dataset:- ")
    print(df.columns.tolist())
    raw_target = input("Enter the name of the target column: ")
    # Prefer the exact text typed (a column name may itself contain spaces at
    # the edges); otherwise fall back to the whitespace-trimmed name.
    target_column = raw_target if raw_target in df.columns else raw_target.strip()
    if target_column not in df.columns:
        print("Target column not found in the dataset. Please check the name.")
        return

    # Preprocess the df
    X, y = preprocess_df(df, target_column)
    # Scale the features
    X_scaled = scale_features(X)
    # Combine scaled features and target variable into a single DataFrame
    cleaned_df = pd.DataFrame(X_scaled, columns=X.columns)
    # y.values (not y) so the target is attached positionally rather than
    # aligned on index labels, which the rebuilt frame no longer shares.
    cleaned_df[target_column] = y.values
    # Save the cleaned dataset to a CSV file
    cleaned_df.to_csv('cleaned_dataset.csv', index=False)
    print("ETL process completed and cleaned dataset saved as 'cleaned_dataset.csv'.")

# Start the interactive pipeline only when this code runs as the main module
# (a notebook cell or a script), not when it is imported.
if __name__ == "__main__":
    main()

Available columns in the dataset:- 
['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']
Checking for missing values...
Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64
ETL process completed and cleaned dataset saved as 'cleaned_dataset.csv'.
