In [23]:
import pickle
import pandas as pd
import shap
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [24]:
# Load the feature-engineered dataset and discard the alternative target
# columns, so that 'roi' is the only target column left in df.
other_targets = ['impressions', 'clicks', 'ctr', 'conversions', 'conversion_rate']
df = pd.read_csv('../data/processed/sample_feature_engineered.csv').drop(columns=other_targets)

# Separate the target from the model inputs.
y = df['roi']
X = df.drop(columns=['roi'])

# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training split only, then apply it to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [25]:
# Show the columns left after dropping the other targets (the 'roi' target is still included).
df.columns

Index(['duration_seconds', 'brightness_score', 'text_to_image_ratio',
       'logo_size_ratio', 'has_human_face', 'face_count', 'sentiment_score',
       'word_count', 'music_tempo', 'speech_pace', 'roi',
       'color_palette_primary_R', 'color_palette_primary_G',
       'color_palette_primary_B', 'platform_encoded', 'ad_type_encoded',
       'industry_encoded', 'campaign_objective_encoded',
       'target_audience_gender_encoded', 'aspect_ratio_encoded',
       'cta_type_encoded'],
      dtype='object')

In [26]:
# Build one synthetic ad: every column's value is taken from an independently
# chosen random row of df (sampling with replacement), so the result is a mix of
# real values rather than a copy of a single existing row.
#
# A seeded Generator makes the sample (and every downstream output) reproducible
# on Restart & Run All. Row *positions* are drawn instead of index labels because
# the values are read from the positional array returned by df.to_numpy().
SAMPLE_SEED = 42
rng = np.random.default_rng(SAMPLE_SEED)

n_rows, n_cols = df.shape
# Shape (1, n_cols): one row position per column.
indices = rng.integers(0, n_rows, size=(1, n_cols))
ad_sample = pd.DataFrame(
    # Broadcasting pairs indices[0, j] with column j, giving a (1, n_cols) array.
    data=df.to_numpy()[indices, np.arange(n_cols)],
    columns=df.columns,
)

ad_sample.to_dict()

{'duration_seconds': {0: 0.0},
 'brightness_score': {0: 60.0},
 'text_to_image_ratio': {0: 0.42},
 'logo_size_ratio': {0: 0.07},
 'has_human_face': {0: 1.0},
 'face_count': {0: 2.0},
 'sentiment_score': {0: 0.68},
 'word_count': {0: 12.0},
 'music_tempo': {0: 0.0},
 'speech_pace': {0: 0.0},
 'roi': {0: 3.6},
 'color_palette_primary_R': {0: 30.0},
 'color_palette_primary_G': {0: 120.0},
 'color_palette_primary_B': {0: 205.0},
 'platform_encoded': {0: 2.0},
 'ad_type_encoded': {0: 3.0},
 'industry_encoded': {0: 7.0},
 'campaign_objective_encoded': {0: 4.0},
 'target_audience_gender_encoded': {0: 0.0},
 'aspect_ratio_encoded': {0: 1.0},
 'cta_type_encoded': {0: 11.0}}

In [27]:
import joblib

# Load the fitted label encoders (a dict keyed by the original column name).
# joblib artifacts are pickle-based, so only load files from a trusted source.
ENCODERS_PATH = "../models/encoders/label_encoders.pkl"
with open(ENCODERS_PATH, "rb") as encoders_file:
    label_encoders = joblib.load(encoders_file)

In [28]:
# Print, for every encoded column, which integer code each original label maps to
# (a label's code is its position in the encoder's classes_ array).
for column, encoder in label_encoders.items():
    classes = encoder.classes_
    mapping = dict(zip(classes, range(len(classes))))
    print((column, mapping))

('platform', {'LinkedIn': 0, 'Meta': 1, 'TikTok': 2, 'YouTube': 3})
('ad_type', {'Carousel': 0, 'Image': 1, 'Story': 2, 'Video': 3})
('industry', {'Automotive': 0, 'B2B Technology': 1, 'Beauty': 2, 'Cosmetics': 3, 'E-commerce': 4, 'Education': 5, 'Electronics': 6, 'Entertainment': 7, 'Fashion': 8, 'Finance': 9, 'Fitness': 10, 'Food & Beverage': 11, 'Gaming': 12, 'Health': 13, 'Home Goods': 14, 'Jewelry': 15, 'Professional Services': 16, 'Retail': 17, 'SaaS': 18, 'Travel': 19})
('campaign_objective', {'App Install': 0, 'Awareness': 1, 'Consideration': 2, 'Conversion': 3, 'Lead Generation': 4})
('target_audience_gender', {'all': 0, 'female': 1})
('aspect_ratio', {'16:9': 0, '1:1': 1, '4:3': 2, '4:5': 3, '9:16': 4})
('cta_type', {'Book Now': 0, 'Book Test Drive': 1, 'Browse Collection': 2, 'Contact Us': 3, 'Discover More': 4, 'Download Guide': 5, 'Download Now': 6, 'Download Report': 7, 'Get Demo': 8, 'Get Guide': 9, 'Join Now': 10, 'Learn More': 11, 'Order Now': 12, 'Register Now': 13, '

In [29]:
# Load the trained model from disk. The context manager guarantees the file
# handle is closed even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code; only load model files you trust.
with open('../models/trained_models/xgb_model.pkl', 'rb') as model_file:
    xgb_model = pickle.load(model_file)

In [30]:
# Drop the target column, then scale the sample's features with the scaler
# that was fitted on the training split.
ad_sample_features = ad_sample.drop(columns=['roi'])
ad_sample_scaled = scaler.transform(ad_sample_features)

In [31]:
# Check that ad_sample carries the same columns as df (including 'roi').
ad_sample.columns

Index(['duration_seconds', 'brightness_score', 'text_to_image_ratio',
       'logo_size_ratio', 'has_human_face', 'face_count', 'sentiment_score',
       'word_count', 'music_tempo', 'speech_pace', 'roi',
       'color_palette_primary_R', 'color_palette_primary_G',
       'color_palette_primary_B', 'platform_encoded', 'ad_type_encoded',
       'industry_encoded', 'campaign_objective_encoded',
       'target_audience_gender_encoded', 'aspect_ratio_encoded',
       'cta_type_encoded'],
      dtype='object')

In [32]:
# Model prediction for the scaled ad sample (the same value is stored as base_roi later on).
xgb_model.predict(ad_sample_scaled)

array([3.7682674], dtype=float32)

In [33]:
# Wrap the scaled training matrix in a DataFrame so it carries the feature names.
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train.columns)
# Build a SHAP explainer for the model; the scaled training data is passed as
# the second (masker / background data) argument.
explainer = shap.Explainer(xgb_model, X_train_scaled_df)

# Explain the single scaled ad sample; shap_values.values[0] holds one SHAP value per feature.
shap_values = explainer(ad_sample_scaled)

In [42]:
# Sanity check: number of SHAP values returned for the explained sample.
len(shap_values.values[0])

20

In [44]:
# Sanity check: number of model input features (should equal the number of SHAP values).
len(X.columns)

20

In [35]:
# Pair every feature name with its SHAP value for the explained ad sample.
shap_df = pd.DataFrame(
    {
        "feature": X.columns,
        "shap_value": shap_values.values[0],
    }
)

# Keep the 10 features with the lowest SHAP values, most negative first.
worst_features = shap_df.sort_values(by="shap_value", ascending=True).iloc[:10]

In [36]:
# Show the 5 features with the highest (most positive) SHAP values for this ad.
shap_df.sort_values(by="shap_value", ascending=False).head(5)

Unnamed: 0,feature,shap_value
16,campaign_objective_encoded,0.757703
12,color_palette_primary_B,0.1862
5,face_count,0.105521
0,duration_seconds,0.097354
13,platform_encoded,0.055809


In [37]:
# Show the 10 features with the lowest SHAP values for this ad.
worst_features

Unnamed: 0,feature,shap_value
19,cta_type_encoded,-0.242984
2,text_to_image_ratio,-0.235927
1,brightness_score,-0.121757
6,sentiment_score,-0.087157
9,speech_pace,-0.026471
7,word_count,-0.021558
10,color_palette_primary_R,-0.017185
8,music_tempo,-0.017149
14,ad_type_encoded,-0.007626
11,color_palette_primary_G,-0.001465


In [38]:
# Effort tiers used to score how hard each feature is to change.
# The ranking cell sorts ascending on "ease_score", so a lower number ranks
# first on ties -- presumably lower means easier to change (TODO confirm).
EASE_EASY = 1
EASE_MODERATE = 1.5
EASE_HARD = 2
EASE_VERY_HARD = 3

# Feature name -> effort tier. Features without an entry fall back to "default".
# NOTE(review): "ad_type_encoded" has no entry, so it uses the "default" tier.
ease_map = {
    "duration_seconds": EASE_MODERATE,
    "brightness_score": EASE_EASY,
    "text_to_image_ratio": EASE_EASY,
    "logo_size_ratio": EASE_EASY,
    "has_human_face": EASE_HARD,
    "face_count": EASE_HARD,
    "sentiment_score": EASE_EASY,
    "word_count": EASE_EASY,
    "music_tempo": EASE_MODERATE,
    "speech_pace": EASE_MODERATE,
    "color_palette_primary_R": EASE_EASY,
    "color_palette_primary_G": EASE_EASY,
    "color_palette_primary_B": EASE_EASY,
    "platform_encoded": EASE_VERY_HARD,
    "industry_encoded": EASE_VERY_HARD,
    "campaign_objective_encoded": EASE_VERY_HARD,
    "target_audience_gender_encoded": EASE_HARD,
    "aspect_ratio_encoded": EASE_EASY,
    "cta_type_encoded": EASE_EASY,
    "default": EASE_EASY,
}

In [39]:
def _predict_roi(sample):
    """Return the model's prediction for a one-row ad DataFrame.

    The 'roi' column is dropped and the remaining features are scaled with the
    already-fitted ``scaler`` before being passed to ``xgb_model``.
    """
    return xgb_model.predict(scaler.transform(sample.drop(columns=['roi'])))[0]


def _best_improvement(feature, current_display, candidates, base_roi):
    """Return the candidate change with the largest positive ROI gain, or None.

    Parameters
    ----------
    feature : str
        Column of ``ad_sample`` to modify.
    current_display : object
        Current value, reported under the "from" key.
    candidates : iterable of (new_value, new_display, extra)
        ``new_value`` is written into the column, ``new_display`` is reported
        under the "to" key, and ``extra`` is a dict of additional keys appended
        to the suggestion (e.g. "pct_change").
    base_roi : float
        Prediction for the unmodified sample.

    Returns
    -------
    dict or None
        The best suggestion, or None when no candidate beats ``base_roi``.
    """
    best_change = None
    for new_value, new_display, extra in candidates:
        new_sample = ad_sample.copy()
        new_sample[feature] = new_value
        new_roi = _predict_roi(new_sample)
        delta_roi = new_roi - base_roi

        # Keep only strict improvements, and among those the largest one.
        if delta_roi > 0 and (best_change is None or delta_roi > best_change["delta_roi"]):
            best_change = {
                "feature": feature,
                "from": current_display,
                "to": new_display,
                "roi": new_roi,
                "delta_roi": delta_roi,
                "ease_score": ease_map.get(feature, ease_map["default"]),
                **extra,
            }
    return best_change


# 15 relative perturbations between -30% and +30% for numeric features.
PERTURBATION_PCTS = np.linspace(-0.3, 0.3, 15)

# Baseline computed from ad_sample itself, through the same path as every
# candidate, so it cannot go stale relative to the sample being perturbed.
base_roi = _predict_roi(ad_sample)

best_changes = {}  # never populated here; kept so cells relying on the name still run
suggestions = []

# For each of the lowest-SHAP features, search for the single-feature change
# that raises the predicted ROI the most.
for feature in worst_features['feature']:
    encoder_label = feature.replace('_encoded', '')

    if encoder_label in label_encoders:
        # Categorical (label-encoded) feature: try every other category.
        encoder = label_encoders[encoder_label]
        current_code = int(ad_sample[feature].iloc[0])
        current_label = encoder.inverse_transform([current_code])[0]

        # A label's integer code is its position in classes_. Codes are compared
        # (not a label against a code) so the current category is really skipped.
        candidates = [
            (code, label, {})
            for code, label in enumerate(encoder.classes_)
            if code != current_code
        ]
        best_change = _best_improvement(feature, current_label, candidates, base_roi)

    else:
        # Numeric feature: try scaled versions of the current value.
        # NOTE(review): a current value of 0 or below yields no usable candidates,
        # and integer/binary features may be perturbed to fractional values.
        current_feature_value = ad_sample[feature].iloc[0]

        candidates = []
        for pct in PERTURBATION_PCTS:
            new_value = current_feature_value * (1 + pct)
            if new_value < 0:
                continue
            candidates.append(
                (new_value, round(new_value, 2), {"pct_change": f"{pct*100:.1f}%"})
            )
        best_change = _best_improvement(
            feature, round(current_feature_value, 2), candidates, base_roi
        )

    if best_change is not None:
        suggestions.append(best_change)

In [40]:
def _suggestion_rank(suggestion):
    """Sort key: largest ROI gain first; ease_score (ascending) only breaks exact ties."""
    return (-suggestion['delta_roi'], suggestion['ease_score'])


# Rank the suggestions from the most to the least beneficial.
suggestions_sorted = sorted(suggestions, key=_suggestion_rank)

In [41]:
# Display the ranked suggestions.
suggestions_sorted

[{'feature': 'cta_type_encoded',
  'from': 'Learn More',
  'to': 'Book Now',
  'roi': 11.208212,
  'delta_roi': 7.4399443,
  'ease_score': 1},
 {'feature': 'brightness_score',
  'from': 60.0,
  'to': 42.0,
  'roi': 5.774897,
  'delta_roi': 2.0066297,
  'ease_score': 1,
  'pct_change': '-30.0%'},
 {'feature': 'color_palette_primary_R',
  'from': 30.0,
  'to': 21.0,
  'roi': 3.7938452,
  'delta_roi': 0.025577784,
  'ease_score': 1,
  'pct_change': '-30.0%'},
 {'feature': 'text_to_image_ratio',
  'from': 0.42,
  'to': 0.38,
  'roi': 3.7809465,
  'delta_roi': 0.0126791,
  'ease_score': 1,
  'pct_change': '-8.6%'},
 {'feature': 'sentiment_score',
  'from': 0.68,
  'to': 0.85,
  'roi': 3.7729557,
  'delta_roi': 0.004688263,
  'ease_score': 1,
  'pct_change': '25.7%'}]