# In [1]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Step 1: Extract - Load data from CSV
def extract_data(file_path):
    """Read the CSV at ``file_path`` into a DataFrame.

    Args:
        file_path: Path (or any other source accepted by ``pd.read_csv``)
            of the CSV data to load.

    Returns:
        The parsed contents as a ``pandas.DataFrame``, using the default
        ``pd.read_csv`` parsing options.
    """
    raw_frame = pd.read_csv(filepath_or_buffer=file_path)
    return raw_frame

# Step 2: Transform - Data Cleaning and Preprocessing
def transform_data(df):
    """Clean and encode ``df`` into a fully numeric feature table.

    Columns of dtype ``int64``/``float64`` are mean-imputed and then
    standardized with ``StandardScaler``. Columns of dtype ``object`` are
    imputed with their most frequent value and one-hot encoded. Columns of
    any other dtype are not listed in the ``ColumnTransformer`` and are
    therefore left out of the result.

    Args:
        df: Input ``pandas.DataFrame`` holding the raw data.

    Returns:
        A new ``pandas.DataFrame`` with one row per input row, numeric
        columns first and one-hot columns after them. Columns carry the
        feature names reported by the fitted preprocessor when the installed
        scikit-learn exposes them; otherwise they are numbered positionally.
    """
    numeric_features = df.select_dtypes(include=['int64', 'float64']).columns
    categorical_features = df.select_dtypes(include=['object']).columns

    # Numeric transformation - fill gaps with the column mean, then scale.
    numeric_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ])

    # Categorical transformation - fill gaps with the mode, then one-hot encode.
    categorical_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
    ])

    # sparse_threshold=0 forces a dense ndarray. With the default threshold a
    # mostly-zero one-hot block makes fit_transform return a SciPy sparse
    # matrix, which pd.DataFrame cannot turn into a proper numeric table.
    preprocessor = ColumnTransformer(
        [
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features),
        ],
        sparse_threshold=0,
    )

    transformed_data = preprocessor.fit_transform(df)

    # Label the output so the saved CSV has a meaningful header. Older
    # scikit-learn releases lack get_feature_names_out on some steps; fall
    # back to the previous positional column labels in that case.
    try:
        column_names = preprocessor.get_feature_names_out()
    except AttributeError:
        column_names = None

    return pd.DataFrame(transformed_data, columns=column_names)

# Step 3: Load - Save transformed data
def load_data(df, output_file):
    """Write ``df`` to ``output_file`` as CSV without the index column.

    Args:
        df: The ``pandas.DataFrame`` to persist.
        output_file: Destination path (or writable buffer) handed to
            ``DataFrame.to_csv``.
    """
    df.to_csv(path_or_buf=output_file, index=False)

if __name__ == "__main__":
    # Source CSV (replace with your actual data file) and destination CSV.
    input_file, output_file = "Covid 19.csv", "processed_data.csv"

    # Run the three ETL stages in order: extract -> transform -> load.
    raw_data = extract_data(input_file)
    cleaned_data = transform_data(raw_data)
    load_data(cleaned_data, output_file)

    print(f"ETL process completed. Processed data saved to {output_file}")


# Output: ETL process completed. Processed data saved to processed_data.csv
