In [2]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# --- Step 1: Upload File ---
import io

from google.colab import files

# files.upload() opens the browser file picker and returns a dict that maps
# each uploaded file's name to its raw bytes.
uploaded = files.upload()

# --- Step 2: Load Data ---
if uploaded:
    # Parse the bytes that were just uploaded instead of re-opening a fixed
    # path. When "sample_data.csv" already exists in the runtime, Colab stores
    # the new upload under a different name (e.g. "sample_data (1).csv"), so
    # reading the hard-coded path would silently load the older file.
    # If several files are selected, the first one is used.
    uploaded_bytes = next(iter(uploaded.values()))
    data = pd.read_csv(io.BytesIO(uploaded_bytes))
else:
    # Nothing was uploaded (e.g. the dialog was cancelled): fall back to a
    # copy that is already present in the working directory.
    data = pd.read_csv("sample_data.csv")

# --- Step 3: Separate Columns ---
# Any column matched by neither selector is left out of the preprocessing
# (ColumnTransformer drops unlisted columns by default), so the selectors are
# kept broad on purpose:
#   * "number" matches every numeric dtype (int32, float32, nullable Int64, ...),
#     not only int64/float64.
#   * "string" is listed next to "object" so text columns are still picked up
#     when pandas stores them with its dedicated string dtype.
numerical_features = data.select_dtypes(include="number").columns.tolist()
categorical_features = data.select_dtypes(include=["object", "string"]).columns.tolist()

# --- Step 4: Pipelines ---
# Numeric columns: fill missing values with the column mean, then standardise.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
]
numeric_pipeline = Pipeline(steps=numeric_steps)

# Categorical columns: fill missing values with the most frequent value, then
# one-hot encode; categories not seen during fit are ignored instead of raising.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown='ignore')),
]
categorical_pipeline = Pipeline(steps=categorical_steps)

# Route each group of columns through its own pipeline; outputs are stacked
# in the order listed here ("num" first, then "cat").
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, numerical_features),
        ("cat", categorical_pipeline, categorical_features),
    ]
)

# --- Step 5: Transform ---
processed_array = preprocessor.fit_transform(data)

# ColumnTransformer returns a SciPy sparse matrix when the stacked result is
# mostly zeros, which is common after one-hot encoding. pandas cannot lay a
# sparse matrix out as one labelled column per feature, so densify it first.
if hasattr(processed_array, "toarray"):
    processed_array = processed_array.toarray()

# --- Step 6: Create DataFrame ---
try:
    # Ask each fitted sub-pipeline for its output names rather than assuming
    # one output per input column: an imputer drops columns that are entirely
    # missing, and a transformer given no columns is never fitted at all.
    num_feature_names = (
        list(preprocessor.named_transformers_["num"].get_feature_names_out(numerical_features))
        if numerical_features
        else []
    )
    cat_feature_names = (
        list(preprocessor.named_transformers_["cat"].get_feature_names_out(categorical_features))
        if categorical_features
        else []
    )
    all_feature_names = num_feature_names + cat_feature_names
    processed_df = pd.DataFrame(processed_array, columns=all_feature_names)
except (AttributeError, ValueError) as name_error:
    # Best-effort fallback: keep the data but with positional column names,
    # and say so instead of failing silently. AttributeError covers estimators
    # without get_feature_names_out; ValueError covers a name/shape mismatch.
    print(f"⚠️ Could not label columns ({name_error}); using positional column names.")
    processed_df = pd.DataFrame(processed_array)

# --- Step 7: Save to CSV ---
# Write the processed table to the working directory; index=False leaves the
# DataFrame's row index out of the file.
output_csv = "processed_data.csv"
processed_df.to_csv(output_csv, index=False)
print("✅ Task 1 Completed. File saved as processed_data.csv")

Saving sample_data.csv to sample_data (1).csv
✅ Task 1 Completed. File saved as processed_data.csv


In [5]:
from google.colab import files

# Send the CSV written by the preprocessing cell to the browser as a download.
result_csv = "processed_data.csv"
files.download(result_csv)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>