In [None]:
import pandas as pd
import joblib
from etl_model_utils import predict_column_transformation

In [8]:
df = pd.read_csv('K:/ETL_Automation/ETLAutomation/data/test/Social_Media_Engagement_Dataset.csv')

In [9]:
df.dtypes

post_id                     object
timestamp                   object
day_of_week                 object
platform                    object
user_id                     object
location                    object
language                    object
text_content                object
hashtags                    object
mentions                    object
keywords                    object
topic_category              object
sentiment_score            float64
sentiment_label             object
emotion_type                object
toxicity_score             float64
likes_count                  int64
shares_count                 int64
comments_count               int64
impressions                  int64
engagement_rate            float64
brand_name                  object
product_name                object
campaign_name               object
campaign_phase              object
user_past_sentiment_avg    float64
user_engagement_growth     float64
buzz_change_rate           float64
dtype: object

In [17]:


model = joblib.load("../models/column_transform_model.pkl")
label_encoder = joblib.load("../models/label_encoder.pkl")

def smart_etl_with_model(filepath, model):
    df = pd.read_csv(filepath)
    transformed_df = df.copy()

    for col in df.columns:
        action = predict_column_transformation(df[col], model, label_encoder)
        print(f"{col}: {action}")

        if action == 'drop_column':
            transformed_df.drop(columns=[col], inplace=True)
        elif action == 'fill_zero':
            transformed_df[col] = transformed_df[col].fillna(0)
        elif action == 'fill_mean':
            if pd.api.types.is_numeric_dtype(df[col]):
                transformed_df[col] = transformed_df[col].fillna(df[col].mean())
        elif action == 'strip_string':
            transformed_df[col] = transformed_df[col].astype(str).str.strip()
        elif action == 'cast_to_string':
            transformed_df[col] = transformed_df[col].astype(str)
        elif action == 'fill_mode':
            mode_val = df[col].mode(dropna=True)
            if not mode_val.empty:
                transformed_df[col] = df[col].fillna(mode_val[0])
        elif action == 'fill_mode_date':
            if pd.api.types.is_datetime64_any_dtype(df[col]):
                mode_val = df[col].mode(dropna=True)
                if not mode_val.empty:
                    transformed_df[col] = df[col].fillna(mode_val[0])
        elif action == 'cast_to_int':
            transformed_df[col] = pd.to_numeric(df[col], errors='coerce')  # Convert to float, NaNs where conversion fails
            transformed_df[col] = transformed_df[col].fillna(0).astype(int)  # Fill NaNs (e.g., with 0) and cast to int

    #print("\n✅ Final transformed data:")
    #print(transformed_df.head())
    return transformed_df

# Example usage:
df = smart_etl_with_model("../data/test/Social_Media_Engagement_Dataset.csv", model)


TypeError: predict_column_transformation() takes 2 positional arguments but 3 were given

In [16]:
df.dtypes

post_id                     object
timestamp                   object
day_of_week                 object
platform                    object
user_id                     object
location                    object
language                    object
text_content                object
hashtags                    object
mentions                    object
keywords                    object
topic_category              object
sentiment_score            float64
sentiment_label             object
emotion_type                object
toxicity_score             float64
likes_count                  int64
shares_count                 int64
comments_count               int64
impressions                  int64
engagement_rate            float64
brand_name                  object
product_name                object
campaign_name               object
campaign_phase              object
user_past_sentiment_avg    float64
user_engagement_growth     float64
buzz_change_rate           float64
dtype: object