In [2]:
import os
import random
import warnings
from io import StringIO

import boto3
import igraph
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from dotenv import load_dotenv
from howso.engine import Trainee, delete_trainee, get_trainee
from howso.openapi.models.trainee_resources import TraineeResources
from howso.utilities import infer_feature_attributes
from igraph import EdgeSeq

# Notebook configuration: Howso client config selection, plotting renderer,
# warning filters and the shared S3 client.

# "internal" or "external" selects which environment variable supplies
# HOWSO_CONFIG; any other value removes HOWSO_CONFIG from the environment.
HOWSO_API_ENV = "external"

# Load variables from a local .env file into the process environment.
load_dotenv()

# Environment variable that provides the HOWSO_CONFIG value for each setting.
_HOWSO_CONFIG_SOURCES = {
    "internal": "HOWSO_API_KEY_INTERNAL",
    "external": "HOWSO_API_KEY_EXTERNAL",
}

if HOWSO_API_ENV in _HOWSO_CONFIG_SOURCES:
    _howso_config_source = _HOWSO_CONFIG_SOURCES[HOWSO_API_ENV]
    _howso_config_value = os.getenv(_howso_config_source)
    if _howso_config_value is None:
        # os.environ only accepts strings; without this guard an unset variable
        # surfaces as "TypeError: str expected, not NoneType".
        raise RuntimeError(
            f"Environment variable {_howso_config_source} is not set; "
            f"it is required when HOWSO_API_ENV == {HOWSO_API_ENV!r}."
        )
    os.environ["HOWSO_CONFIG"] = _howso_config_value
else:
    os.environ.pop("HOWSO_CONFIG", None)

print(f"HOWSO_CONFIG: {os.getenv('HOWSO_CONFIG')}")

pio.renderers.default = "notebook"

# Silence known-noisy warnings (matched against the start of the message).
warnings.filterwarnings("ignore", "This pattern has match groups")
warnings.filterwarnings("ignore", "The 'warn' parameter of")
warnings.filterwarnings(
    "ignore",
    "This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.",
)

# Disable pandas' chained-assignment warning for the whole notebook.
pd.options.mode.chained_assignment = None  # default='warn'

# Bucket that every S3 helper in this notebook reads from and writes to.
BUCKET = "prod-data-ml-pipeline"

s3_client = boto3.client(service_name="s3")

HOWSO_CONFIG: /home/joseja/repos/data_ml_serverless_stack/howso_external.yml


In [3]:
def get_df_from_s3(s3_path):
    """Load a CSV from the ``BUCKET`` S3 bucket into a DataFrame.

    Args:
        s3_path: Either the key of a ``.csv`` object, or a key prefix. For a
            prefix, the first listed object whose key contains ``.csv`` is
            read.

    Returns:
        The parsed ``pandas.DataFrame``, or ``False`` when ``s3_path`` is a
        prefix and no listed key contains ``.csv``.
    """
    if s3_path.endswith(".csv"):
        print(f"Reading {s3_path} from S3")
        response = s3_client.get_object(Bucket=BUCKET, Key=s3_path)
        return pd.read_csv(response["Body"])

    listing = s3_client.list_objects(Bucket=BUCKET, Prefix=s3_path)
    # The "Contents" entry is missing from the response when nothing matches
    # the prefix; treat that as an empty listing instead of raising KeyError.
    for entry in listing.get("Contents", []):
        key = entry["Key"]
        if ".csv" in key:
            print(f"Reading {key} from S3")
            response = s3_client.get_object(Bucket=BUCKET, Key=key)
            return pd.read_csv(response["Body"])
    return False


def upload_file_to_s3(df, s3_path):
    """Write ``df`` as CSV text (without the index) to ``s3_path`` in ``BUCKET``."""
    # With no target buffer, to_csv returns the CSV document as a string.
    csv_text = df.to_csv(index=False)
    s3_client.put_object(Body=csv_text, Key=s3_path, Bucket=BUCKET)


def plot_taxonomy(
    taxonomy,
    target_name,
    target_node_path,
    stats,
):
    """Draw the taxonomy as a tree and highlight nodes by feature quality.

    Args:
        taxonomy: DataFrame with ``node_id``, ``parent_id``, ``node_name`` and
            ``node_path`` columns. Rows with a null ``parent_id`` are attached
            to a synthetic ``ROOT`` node.
        target_name: Name shown in the figure title.
        target_node_path: ``node_path`` of the target; that node is drawn as a
            large square.
        stats: DataFrame with ``node_path`` and ``quality`` columns, where
            ``quality`` is one of ``"good"``, ``"average"`` or ``"bad"``.

    Returns:
        A ``plotly.graph_objects.Figure`` with the tree edges, the nodes and
        one legend entry per node class.
    """
    # Add ROOT node as first record in the taxonomy.
    # NOTE(review): node_id 1 is used for ROOT — assumes no real node has id 1.
    taxonomy = taxonomy.fillna({"parent_id": 1})
    root_node = pd.DataFrame(
        {"node_id": 1, "parent_id": None, "node_name": "ROOT", "node_path": "ROOT"},
        index=[0],
    )
    taxonomy = pd.concat([root_node, taxonomy], ignore_index=True)

    good_categories = stats[stats["quality"] == "good"]["node_path"].tolist()
    average_categories = stats[stats["quality"] == "average"]["node_path"].tolist()
    bad_categories = stats[stats["quality"] == "bad"]["node_path"].tolist()

    unique_categories = taxonomy["node_id"].unique()

    # Square matrix indexed by node_id; a 1 at [parent, child] is a tree edge.
    adjacency_matrix = pd.DataFrame(
        np.zeros((len(unique_categories), len(unique_categories))),
        columns=unique_categories,
        index=unique_categories,
    )

    for index, row in taxonomy[taxonomy["parent_id"].notnull()].iterrows():
        adjacency_matrix.at[row["parent_id"], row["node_id"]] = 1

    # Drop indexes not appearing in the columns.
    # NOTE(review): presumably this removes rows created by the assignment
    # above for parent ids that have no node of their own — confirm.
    for index in adjacency_matrix.index:
        if index not in adjacency_matrix.columns:
            adjacency_matrix.drop(index, inplace=True)

    # Map columns names from node_id to node_path.
    # NOTE(review): the index is built from a plain array, so its name does
    # not look like it is ever "node_id"; if so both assignments fall through
    # to the else branch and the labels stay node ids. Left as-is because the
    # labels are not used by the figure below.
    adjacency_matrix.columns = (
        taxonomy.set_index("node_id").loc[adjacency_matrix.columns, "node_path"].values
        if adjacency_matrix.index.name == "node_id"
        else adjacency_matrix.index
    )
    adjacency_matrix.index = (
        taxonomy.set_index("node_id").loc[adjacency_matrix.index, "node_path"].values
        if adjacency_matrix.index.name == "node_id"
        else adjacency_matrix.index
    )

    A = adjacency_matrix.to_numpy()
    G = igraph.Graph.Adjacency((A > 0).tolist())
    G.vs["label"] = adjacency_matrix.columns
    G.es["weight"] = A[A.nonzero()]

    # "rt" layout; vertex k of the graph corresponds to unique_categories[k].
    lay = G.layout("rt")

    position = {k: lay[k] for k, category in enumerate(unique_categories)}
    Y = [lay[k][1] for k, category in enumerate(unique_categories)]
    M = max(Y)

    E = [e.tuple for e in G.es]  # list of edges

    # Node coordinates; y is mirrored around M (2 * M - y).
    L = len(position)
    Xn = [position[k][0] for k in range(L)]
    Yn = [2 * M - position[k][1] for k in range(L)]
    # Edge coordinates; a None after each pair breaks the line between edges.
    Xe = []
    Ye = []
    for edge in E:
        Xe += [position[edge[0]][0], position[edge[1]][0], None]
        Ye += [2 * M - position[edge[0]][1], 2 * M - position[edge[1]][1], None]

    # From here on nodes are identified by node_path instead of node_id.
    unique_categories = (
        taxonomy.set_index("node_id").loc[unique_categories, "node_path"].values
    )

    def node_color(category):
        """Return the fill colour of a node, by precedence target > good > average > bad."""
        # NOTE(review): when target_node_path is a string this is a substring
        # test, so every node whose path is contained in the target path is
        # drawn black, not only the target itself — confirm this is intended.
        if category in target_node_path:
            return "rgba(0, 0, 0, 1)"
        if category in good_categories:
            return "rgba(0, 255, 0, 1)"
        if category in average_categories:
            return "rgba(0, 0, 255, 1)"
        if category in bad_categories:
            return "rgba(255, 0, 0, 1)"
        return "rgba(105, 105, 105, 0.2)"

    def node_symbol(category):
        """Return the marker symbol: a square for the target, a circle otherwise."""
        return "square" if category == target_node_path else "circle"

    def node_size(category):
        """Return the marker size: target 24, rated categories 12, all others 8."""
        if category == target_node_path:
            return 24
        if (
            category in good_categories
            or category in average_categories
            or category in bad_categories
        ):
            return 12
        return 8

    fig = go.Figure()
    fig.add_trace(
        go.Scatter(
            x=Xe,
            y=Ye,
            mode="lines",
            line=dict(color="rgba(0,0,0,1)", width=0.2),
            hoverinfo="none",
            showlegend=False,
        )
    )
    fig.add_trace(
        go.Scatter(
            x=Xn,
            y=Yn,
            mode="markers",
            marker=dict(
                symbol=[node_symbol(category) for category in unique_categories],
                size=[node_size(category) for category in unique_categories],
                color=[node_color(category) for category in unique_categories],
                line=dict(color="rgba(0,0,0,1)", width=1),
            ),
            text=unique_categories,
            hoverinfo="text",
            opacity=1,
            showlegend=False,
        )
    )

    # Legend-only traces (no data points). The "Target" entry uses the same
    # square symbol as the target node so the legend matches the plot.
    legend_entries = [
        ("Target", "square", 10, "rgba(0, 0, 0, 1)"),
        ("Good Performance Categories", "circle", 8, "rgba(0, 255, 0, 1)"),
        ("Average Performance Categories", "circle", 8, "rgba(0, 0, 255, 1)"),
        ("Bad Performance Categories", "circle", 8, "rgba(255, 0, 0, 1)"),
    ]
    for legend_name, legend_symbol, legend_size, legend_color in legend_entries:
        fig.add_trace(
            go.Scatter(
                x=[None],
                y=[None],
                mode="markers",
                name=legend_name,
                marker=dict(
                    symbol=legend_symbol,
                    size=legend_size,
                    color=legend_color,
                    line=dict(color="rgba(0,0,0,1)", width=1),
                ),
            ),
        )

    fig.update_layout(
        title=dict(
            text=f"Hierarchical Feature Selection for {target_name}",
            font=dict(size=24),
            x=0.475,
            y=0.9,
        ),
        font_family="Consolas",
        autosize=False,
        width=2000,
        height=500,
        template="simple_white",
    )
    # Both axes are purely decorative: framed, without ticks, labels or grid.
    axis_style = dict(
        mirror=True,
        ticks="",
        showline=True,
        linewidth=1,
        linecolor="black",
        showticklabels=False,
        showgrid=False,
    )
    fig.update_xaxes(**axis_style)
    fig.update_yaxes(**axis_style)
    return fig


def plot_mda_contribution_scatterplot(stats_df, target_name):
    """Scatter feature contribution against MDA, coloured by feature quality.

    Args:
        stats_df: DataFrame with ``feature_mda_robust``,
            ``feature_contributions_robust``, ``quality``, ``node_path`` and
            ``rank`` columns.
        target_name: Name shown in the figure title.

    Returns:
        A plotly figure with symmetric axes centred on the origin.
    """
    # Column names follow the renamed Howso metrics; the previous "mda" and
    # "contribution" columns are not the ones the axis range below reads.
    mda_column = "feature_mda_robust"
    contribution_column = "feature_contributions_robust"

    fig = px.scatter(
        stats_df,
        x=mda_column,
        y=contribution_column,
        color="quality",
        color_discrete_map={
            "good": "rgba(0, 255, 0, 1)",
            "average": "rgba(0, 0, 255, 1)",
            "bad": "rgba(255, 0, 0, 1)",
        },
        hover_data=["node_path", "rank"],
        text="node_path",
        template="plotly_white",
    )

    # Find the maximum absolute value in the x and y columns
    max_value = max(
        abs(stats_df[mda_column].max()),
        abs(stats_df[mda_column].min()),
        abs(stats_df[contribution_column].max()),
        abs(stats_df[contribution_column].min()),
    )
    # Same symmetric range on both axes so the origin (0, 0) sits in the centre.
    axis_range = [-max_value * 1.5, max_value * 1.5]

    fig.update_layout(
        title=dict(
            text=f"Normalized Feature Contribution/MDA Scatterplot for {target_name}",
            # The title size is set here only; the deprecated `titlefont`
            # layout argument duplicated this value.
            font=dict(size=24),
            x=0.5,
        ),
        font_family="Consolas",
        autosize=False,
        width=1200,
        height=1000,
        hovermode="closest",
    )

    fig.update_traces(marker_size=10, textposition="top center")

    # Framed axes without tick marks or grid lines; tick labels stay visible.
    axis_style = dict(
        tickfont=dict(size=14),
        mirror=True,
        ticks="",
        showline=True,
        linewidth=1,
        linecolor="black",
        showgrid=False,
    )
    fig.update_yaxes(range=axis_range, title_text="Feature Contribution", **axis_style)
    fig.update_xaxes(range=axis_range, title_text="MDA", **axis_style)
    return fig


def create_density_histogram(dataset):
    """Count non-zero values per feature and plot the counts as a bar chart.

    The ``device_id`` and ``hashed_email`` columns are excluded. Missing
    values count as zero.

    Returns:
        A tuple ``(density_table, fig)``: ``density_table`` has the columns
        ``feature``, ``nonzero_count`` and ``nonzero_pct`` sorted by
        ``nonzero_count`` descending, and ``fig`` is the bar chart built from it.
    """
    id_columns = ("device_id", "hashed_email")
    feature_columns = [name for name in dataset.columns if name not in id_columns]

    # A value is "non-zero" when it is truthy after NaN has been replaced by 0.
    nonzero_counts = dataset[feature_columns].fillna(0).astype(bool).sum(axis=0)
    total_rows = dataset.shape[0]

    density_table = pd.DataFrame(
        {
            "feature": nonzero_counts.index,
            "nonzero_count": nonzero_counts.values,
        }
    )
    density_table["nonzero_pct"] = (
        (density_table["nonzero_count"] / total_rows) * 100
    ).round(2)
    density_table = density_table.sort_values(by=["nonzero_count"], ascending=False)

    fig = px.bar(
        density_table,
        x="feature",
        y="nonzero_count",
        hover_data=["nonzero_pct"],
        text_auto=".2s",
        title="Features Density",
    )
    fig.update_layout(autosize=False, width=1200, height=500)
    # The y axis spans the full row count so bars read as a share of the dataset.
    fig.update_yaxes(range=[0, total_rows])
    fig.update_xaxes(tickangle=40)
    return density_table, fig


def run_model(model_id, seed, target):
    """Create, train and analyze a Howso trainee for ``target``.

    Args:
        model_id: Name of the trainee; an existing trainee with the same name
            is overwritten.
        seed: DataFrame of training cases. Every column other than ``target``
            is used as a context feature.
        target: Column of ``seed`` used as the action feature.

    Returns:
        The trained and analyzed ``Trainee``.
    """
    feature_attributes = infer_feature_attributes(seed)
    context_features = [name for name in seed.columns if name != target]

    trainee = Trainee(
        name=model_id,
        features=feature_attributes,
        resources=TraineeResources(
            cpu={"minimum": 4000, "maximum": 4000},
            memory={"minimum": 2000, "maximum": 2000},
        ),
        overwrite_existing=True,
    )
    trainee.train(cases=seed)
    trainee.analyze(
        action_features=[target],
        context_features=context_features,
    )

    # Howso update: `react_into_trainee` was merged into `react_aggregate`,
    # which computes and returns the stats on demand instead of caching them,
    # so no stats are pre-computed here any more.
    return trainee


def get_node_children(taxonomy, node_path):
    """Return the ``node_path`` of every direct child of a taxonomy node.

    Args:
        taxonomy: DataFrame with ``node_id``, ``parent_id`` and ``node_path``
            columns.
        node_path: Path of the node whose children are requested.

    Returns:
        A list of child paths, or ``None`` when the node has no children or
        ``node_path`` does not appear in the taxonomy.
    """
    matching_ids = taxonomy.loc[taxonomy["node_path"] == node_path, "node_id"]
    if matching_ids.empty:
        # An unknown node is treated like a leaf instead of raising IndexError.
        return None
    node_id = matching_ids.values[0]
    children_nodes = taxonomy.loc[taxonomy["parent_id"] == node_id, "node_path"].tolist()
    return children_nodes or None


def rank_features(taxonomy, model_id, seed, target):
    """Rank the context features of a Howso trainee for ``target``.

    The trainee named ``model_id`` is reused when it exists and returns both
    required metrics for every feature; otherwise it is deleted (if it was
    found) and retrained from ``seed`` through ``run_model``.

    Args:
        taxonomy: DataFrame with ``node_path`` and ``node_id`` columns.
        model_id: Name of the trainee to load or (re)train.
        seed: Training cases handed to ``run_model`` when retraining.
        target: Action feature the metrics are computed for.

    Returns:
        A DataFrame with one row per feature, sorted by ``rank`` descending,
        with the columns ``node_path``, the two metric columns, ``node_id``,
        ``quadrant``, the four percentile threshold columns, ``quality``
        ("good" / "average" / "bad") and ``rank``. Returns ``None`` when the
        retrained trainee yields no stats.
    """
    # Howso update: "mda" / "contribution" were renamed, and the output keys
    # match the `details` keys requested below. Robust metrics are used; for
    # the non-robust variants replace these with "feature_mda_full" and
    # "feature_contributions_full".
    mda_key = "feature_mda_robust"
    contribution_key = "feature_contributions_robust"
    required_metrics = (mda_key, contribution_key)

    def compute_stats(trainee):
        """Request both metrics; `react_aggregate` computes them on each call."""
        return trainee.react_aggregate(
            action_feature=target,
            details={metric: True for metric in required_metrics},
        )

    try:
        model = get_trainee(name_or_id=model_id)
        stats = compute_stats(model)

        # If either metric is missing for any feature, or there are no stats
        # at all, delete the current model and retrain.
        has_all_metrics = all(
            all(metric in stats[category] for metric in required_metrics)
            for category in stats
        )
        if len(stats) == 0 or not has_all_metrics:
            delete_trainee(name_or_id=model_id)
            raise ValueError("existing trainee returned incomplete stats")
        print(f" -- Model {model_id} already has cached stats. Continuing...")
    except Exception as error:
        # Deliberately broad: any failure to load or query the existing
        # trainee falls back to retraining, but the reason is reported.
        print(
            f" -- Model {model_id} is missing or is missing stats "
            f"({type(error).__name__}: {error}). Retraining..."
        )
        model = run_model(
            model_id,
            seed,
            target,
        )
        stats = compute_stats(model)

        if len(stats) == 0:
            print(
                f" -- Model {model_id} could not compute prediction stats. Skipping..."
            )
            return
        else:
            print(f" -- Model {model_id} has been retrained successfully.")

    # One row per feature, with the per-feature metrics expanded into columns.
    stats_df = pd.DataFrame(
        stats.items(),
        columns=["node_path", "metrics"],
    )
    stats_df = pd.concat(
        [stats_df.drop(["metrics"], axis=1), stats_df["metrics"].apply(pd.Series)],
        axis=1,
    )

    # Add node_id column to the stats_df. The ids are looked up by node_path
    # so each row gets its own id regardless of the taxonomy's row order;
    # paths missing from the taxonomy get NaN.
    node_id_by_path = taxonomy.drop_duplicates(subset="node_path").set_index(
        "node_path"
    )["node_id"]
    stats_df["node_id"] = stats_df["node_path"].map(node_id_by_path)

    def quadrant_selection(contribution, mda):
        """Name the quadrant of the (mda, contribution) plane; "null" on an axis or NaN."""
        if contribution > 0 and mda > 0:
            return "top_right"
        elif contribution < 0 and mda > 0:
            return "bottom_right"
        elif contribution < 0 and mda < 0:
            return "bottom_left"
        elif contribution > 0 and mda < 0:
            return "top_left"
        else:
            return "null"

    def percentile_classification(
        contribution,
        mda,
        contribution_good_thresold,
        mda_good_thresold,
        contribution_average_thresold,
        mda_average_thresold,
    ):
        """Classify a feature; both metrics must strictly exceed a tier's thresholds."""
        if contribution > contribution_good_thresold and mda > mda_good_thresold:
            return "good"
        if contribution > contribution_average_thresold and mda > mda_average_thresold:
            return "average"
        return "bad"

    stats_df["quadrant"] = stats_df.apply(
        lambda row: quadrant_selection(row[contribution_key], row[mda_key]), axis=1
    )

    contribution_good_performance_percentile = 0.6
    contribution_average_performance_percentile = 0.4
    mda_good_performance_percentile = 0.7
    mda_average_performance_percentile = 0.5

    # Threshold columns hold the same quantile value on every row; they are
    # kept in the output so the thresholds used are visible next to the data.
    contribution_good_column = f"c{contribution_good_performance_percentile}"
    mda_good_column = f"m{mda_good_performance_percentile}"
    contribution_average_column = f"c{contribution_average_performance_percentile}"
    mda_average_column = f"m{mda_average_performance_percentile}"
    threshold_columns = [
        (contribution_good_column, contribution_key, contribution_good_performance_percentile),
        (mda_good_column, mda_key, mda_good_performance_percentile),
        (contribution_average_column, contribution_key, contribution_average_performance_percentile),
        (mda_average_column, mda_key, mda_average_performance_percentile),
    ]
    for column, metric, percentile in threshold_columns:
        stats_df[column] = stats_df[metric].quantile(percentile, interpolation="lower")

    stats_df["quality"] = stats_df.apply(
        lambda row: percentile_classification(
            row[contribution_key],
            row[mda_key],
            row[contribution_good_column],
            row[mda_good_column],
            row[contribution_average_column],
            row[mda_average_column],
        ),
        axis=1,
    )

    # Good and average features are ranked by the sum of both metrics; bad
    # features all get rank 0.
    stats_df["rank"] = stats_df.apply(
        lambda row: (
            (row[contribution_key] + row[mda_key])
            if (row["quality"] == "good") or (row["quality"] == "average")
            else 0
        ),
        axis=1,
    )
    stats_df = stats_df.sort_values(by=["rank"], ascending=False)

    return stats_df


def rank_features_mock(taxonomy, seed, target_node_id):
    """Randomly split the seed's features into good / average / bad node ids.

    Stand-in for ``rank_features`` that needs no model: two random node ids
    are labelled good, two more average, and the remainder bad.

    Returns:
        A tuple ``(good, average, bad)`` of node-id lists. All three lists are
        empty when sampling fails, e.g. with fewer than four candidates.
    """
    seed_columns = seed.columns.to_list()
    candidate_ids = taxonomy[taxonomy["node_path"].isin(seed_columns)][
        "node_id"
    ].to_list()
    # The target itself is never a candidate feature.
    if target_node_id in candidate_ids:
        candidate_ids.remove(target_node_id)

    try:
        good_features = random.sample(candidate_ids, 2)
        candidate_ids = list(set(candidate_ids) - set(good_features))
        average_features = random.sample(candidate_ids, 2)
        bad_features = list(
            set(candidate_ids) - set(average_features) - set(good_features)
        )
    except Exception:
        return [], [], []
    return good_features, average_features, bad_features


def feature_selection(
    taxonomy,
    initial_seed,
    initial_raw_seed,
    target_name,
    target_node_path,
    target_definition,
    target_definition_method,  # {"regex", "categories"}
):
    """Select context features for a target by walking the taxonomy level by level.

    Each iteration ranks the current context features with ``rank_features``.
    The children of the good and average features become the context features
    of the next level, until the level of ``target_node_path`` is reached. On
    that last iteration the target column is built from ``initial_raw_seed``.

    Args:
        taxonomy: DataFrame with ``node_path``, ``node_level``, ``node_id`` and
            ``parent_id`` columns.
        initial_seed: DataFrame with ``hashed_email`` and one column per
            taxonomy ``node_path``.
        initial_raw_seed: DataFrame with ``hashed_email``, ``product_name`` and
            ``spend_pct`` columns, used to build the target column.
        target_name: Name of the target column and of the final model.
        target_node_path: Taxonomy node the target belongs to.
        target_definition: Regular expression matched against
            ``product_name`` when ``target_definition_method`` is "regex".
        target_definition_method: "regex" or "categories".

    Returns:
        ``(stats_df, optimized_seed)`` on success, ``(None, None)`` when the
        method is "categories", and a bare ``None`` when the run is aborted
        (unknown target node, no ranking available, or no context features
        left). Callers that unpack the result must handle the bare ``None``.

    Raises:
        ValueError: If the last iteration is reached with an unsupported
            ``target_definition_method``.
    """
    print(f"FEATURE SELECTION START -> {target_name} ({target_node_path})")
    MAX_FEATURES = 50
    target_rows = taxonomy[taxonomy["node_path"] == target_node_path]
    if target_rows.empty:
        print(f"target_node_path ({target_node_path}) not found in the taxonomy.")
        return

    target_node_level = target_rows["node_level"].values[0]

    def seed_features_at_level(level):
        """Return the node paths of ``level`` that exist as seed columns."""
        level_paths = taxonomy[taxonomy["node_level"] == level].node_path.to_list()
        return [path for path in level_paths if path in initial_seed.columns]

    initial_level = 1
    context_features = seed_features_at_level(1)
    if len(context_features) < 5:
        print(
            " -- Number of level 1 context features is less than 5. Skipping straight to level 2."
        )
        context_features = seed_features_at_level(2)
        initial_level = 2

    for iteration in range(initial_level, target_node_level + 1):
        print(f" ## ITERATION {iteration}")

        is_last_iteration = iteration == target_node_level
        # Intermediate levels predict the target's taxonomy node; only the
        # last iteration predicts the actual target.
        iteration_target = target_name if is_last_iteration else target_node_path
        model_id = (
            f"ibotta_{target_name}"
            if is_last_iteration
            else f"ibotta_{target_node_path}_{iteration}"
        )

        # FOR LAST ITERATION. Create the seed for the target based on the target definition.
        if is_last_iteration:
            if target_definition_method == "regex":
                # .assign returns a new frame, so the filtered slice of
                # initial_raw_seed is never written to.
                raw_seed_target = initial_raw_seed[
                    initial_raw_seed.product_name.str.contains(
                        target_definition, regex=True, na=False
                    )
                ].assign(category=target_name)

                # One row per hashed_email with the summed spend_pct of the
                # matching products in a column named after the target.
                target_pivoted = raw_seed_target.pivot_table(
                    index=["hashed_email"],
                    columns="category",
                    values="spend_pct",
                    aggfunc="sum",
                    fill_value=0,
                ).reset_index()
                target_pivoted.columns.name = None

                # Emails without any matching product get a target value of 0.
                seed_target = initial_seed.merge(
                    target_pivoted, on=["hashed_email"], how="left"
                ).fillna(0)
                seed = seed_target[context_features + [target_name]]
            elif target_definition_method == "categories":
                print(f" -- target_definition_method: {target_definition_method}")
                print(f"FEATURE SELECTION END - {target_name} ({target_node_path}) \n")
                return None, None
            else:
                # Without this check the previous iteration's seed (or no seed
                # at all) would be used for the final model.
                raise ValueError(
                    f"Unsupported target_definition_method: {target_definition_method!r}"
                )
        else:
            seed = initial_seed[context_features + [target_node_path]]

        target_density = seed[iteration_target].astype(bool).sum(axis=0)
        print(
            f" -- seed target density: {target_density} ({target_density / seed.shape[0]:.2%})"
        )
        print(f" -- context_features ({len(context_features)}): {context_features}")

        stats_df = rank_features(taxonomy, model_id, seed, iteration_target)
        if stats_df is None:
            # rank_features returns None when no stats could be computed.
            print(f" -- No feature ranking available for model {model_id}. Aborting")
            return

        good_features = stats_df[stats_df["quality"] == "good"]["node_path"].to_list()
        average_features = stats_df[stats_df["quality"] == "average"][
            "node_path"
        ].to_list()

        if is_last_iteration:
            optimized_seed = seed[good_features + average_features + [target_name]]
            print(f"FEATURE SELECTION END -> {target_name} ({target_node_path}) \n")
            return stats_df, optimized_seed

        # FOR NON-LAST ITERATIONS. Set the context features for the next
        # iteration based on the children of the good features followed by the
        # children of the average features, in the order rank_features
        # returned them.
        candidate_features = []
        for parent_feature in good_features + average_features:
            children = get_node_children(taxonomy, parent_feature)
            if children is not None:
                candidate_features.extend(children)

        # dict.fromkeys removes duplicates while keeping that order. The cap
        # is applied last so that dropping the target or features missing from
        # the seed does not eat into the MAX_FEATURES budget.
        context_features = [
            feature
            for feature in dict.fromkeys(candidate_features)
            if feature != target_node_path and feature in initial_seed.columns
        ][:MAX_FEATURES]
        if len(context_features) == 0:
            print(" -- context_features is empty. Aborting")
            return
    return

In [4]:
# Load the taxonomy CSV and expose its `full_path` column as `node_path`,
# the name the helper functions above expect.
taxonomy_s3_path = "datasets/taxonomies/iab_taxonomy_clean.csv"
taxonomy_raw = get_df_from_s3(taxonomy_s3_path)
if taxonomy_raw is False:
    # get_df_from_s3 signals "no CSV found" with False; fail here with a clear
    # message instead of an AttributeError on the next line.
    raise FileNotFoundError(f"No CSV found in S3 at {taxonomy_s3_path}")
taxonomy = taxonomy_raw.rename(columns={"full_path": "node_path"})
print(f"Taxonomy shape: {taxonomy.shape}")
print(f"Taxonomy count distinct node_path: {taxonomy['node_path'].nunique()}")
print(f"Taxonomy distinct node_id: {taxonomy['node_id'].nunique()}")
taxonomy.head(3)

Reading datasets/taxonomies/iab_taxonomy_clean.csv from S3
Taxonomy shape: (583, 5)
Taxonomy count distinct node_path: 583
Taxonomy distinct node_id: 583


Unnamed: 0,node_id,parent_id,node_name,node_path,node_level
0,1000,,ad_safety_risk,ad_safety_risk,1
1,1001,,adult_products_and_services,adult_products_and_services,1
2,1002,,alcohol,alcohol,1


In [5]:
# Load the seed dataset: the first CSV found under the S3 prefix.
seed_s3_path = "datasets/seeds/ibotta_seed_v6/"
seed = get_df_from_s3(seed_s3_path)
if seed is False:
    # get_df_from_s3 signals "no CSV found" with False; fail here with a clear
    # message instead of an AttributeError on `seed.shape`.
    raise FileNotFoundError(f"No CSV found in S3 under {seed_s3_path}")
print(f"Seed shape: {seed.shape}")
print(f"Unique seed colums: {len(set(seed.columns))}")
seed.head(3)

Reading datasets/seeds/ibotta_seed_v6/part-00000-971c994a-3be3-47d6-9127-5f8cf26a451d-c000.csv from S3
Seed shape: (7895, 205)
Unique seed colums: 205


Unnamed: 0,device_id,hashed_email,alcohol,alcohol|beer,alcohol|hard_sodas_seltzers_alco_pops,alcohol|spirits,alcohol|wine,clothing_and_accessories,clothing_and_accessories|clothing,clothing_and_accessories|clothing_accessories,...,sporting_goods|outdoor_recreation_equipment,tobacco,tobacco|cigarettes,tobacco|cigars,tobacco|smokeless_tobacco,vehicles,vehicles|automotive_products,vehicles|automotive_products|aftermarket_parts_and_accessories,vehicles|automotive_products|automotive_care_products,vehicles|automotive_products|automotive_parts_and_accessories
0,,1a572ea35f0ba09f63002cf8a576e93f3389d973751a7a...,3.959592,0.983785,2.77568,0.0,0.200127,2.126809,2.065271,0.0,...,0.0,0.409602,0.0,0.0,0.409602,0.143623,0.143623,0.0,0.066675,0.076948
1,,5a7d09db3ef6ffb42ec3ecb70c5c8aa85979402ae71825...,48.386719,48.158921,0.048413,0.044098,0.135287,0.059337,0.059337,0.0,...,0.210213,0.482946,0.482946,0.0,0.0,0.267581,0.156325,0.053538,0.036519,0.066268
2,,a83bb8e3a3752a8a4906e9e3896519ceaf32582c5889bb...,1.155229,0.353884,0.678825,0.012264,0.110255,0.0,0.0,0.0,...,0.137175,10.005648,10.005648,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Remove features from the taxonomy that are not in the seed.
taxonomy_features = taxonomy["node_path"].to_list()
seed_features = seed.columns.to_list()
seed_feature_set = set(seed_features)  # set membership instead of a list scan per feature
missing_features = [
    feature for feature in taxonomy_features if feature not in seed_feature_set
]
if len(missing_features) > 0:
    print(f"Missing features in the seed ({len(missing_features)}): {missing_features}")
    taxonomy = taxonomy[~taxonomy["node_path"].isin(missing_features)]
    print(f"Taxonomy shape: {taxonomy.shape}")

# Make sure all the parent_id appear in the node_id.
# Removing a row can orphan that row's own children, so the check is repeated
# until every remaining non-null parent_id refers to a remaining node_id.
# Root rows (null parent_id) are never treated as orphans.
missing_parent_ids = []
while True:
    parent_ids = taxonomy["parent_id"].dropna()
    orphaned_parent_ids = (
        parent_ids[~parent_ids.isin(taxonomy["node_id"])].unique().tolist()
    )
    if len(orphaned_parent_ids) == 0:
        break
    missing_parent_ids.extend(orphaned_parent_ids)
    taxonomy = taxonomy[~taxonomy["parent_id"].isin(orphaned_parent_ids)]

if len(missing_parent_ids) > 0:
    print(
        f"Missing parent_ids in the taxonomy ({len(missing_parent_ids)}): {missing_parent_ids}"
    )

taxonomy

Missing features in the seed (380): ['ad_safety_risk', 'adult_products_and_services', 'alcohol|bars', 'culture_and_fine_arts', 'culture_and_fine_arts|museums_and_galleries', 'business_and_industrial', 'business_and_industrial|advertising_and_marketing', 'business_and_industrial|business_services', 'business_and_industrial|business_services|consulting', 'business_and_industrial|business_services|employee_expense_and_time_tracking_services', 'business_and_industrial|business_services|event_planning', 'business_and_industrial|business_services|human_resources_and_recruiting', 'business_and_industrial|business_services|information_technology_services', 'business_and_industrial|business_services|laundry_and_dry_cleaning_services', 'business_and_industrial|business_services|logistics_and_delivery', 'business_and_industrial|business_services|office_equipment_and_supplies', 'business_and_industrial|business_services|photographers', 'business_and_industrial|business_services|printing_fax_wifi_s

Unnamed: 0,node_id,parent_id,node_name,node_path,node_level
2,1002,,alcohol,alcohol,1
4,1004,1002.0,beer,alcohol|beer,2
5,1005,1002.0,hard_sodas_seltzers_alco_pops,alcohol|hard_sodas_seltzers_alco_pops,2
6,1006,1002.0,spirits,alcohol|spirits,2
7,1007,1002.0,wine,alcohol|wine,2
...,...,...,...,...,...
551,1551,,vehicles,vehicles,1
556,1556,1551.0,automotive_products,vehicles|automotive_products,2
557,1557,1556.0,automotive_care_products,vehicles|automotive_products|automotive_care_p...,3
558,1558,1556.0,automotive_parts_and_accessories,vehicles|automotive_products|automotive_parts_...,3


In [7]:
# Feature columns are every seed column except the two identifier columns.
all_features = seed.columns.to_list()
for id_column in ("device_id", "hashed_email"):
    all_features.remove(id_column)

# The number of "|" separators in a column name is used as its depth:
# one separator -> level-2 path, two separators -> level-3 path.
# Level-1 names are the first path segment of the level-2 columns.
level1_categories = {
    feature.split("|")[0] for feature in all_features if feature.count("|") == 1
}
print(f"level1_categories({len(level1_categories)}): {level1_categories}")
level2_categories = {feature for feature in all_features if feature.count("|") == 1}
print(f"level2_categories categories ({len(level2_categories)}): {level2_categories}")
level3_categories = {feature for feature in all_features if feature.count("|") == 2}
print(f"level3_categories ({len(level3_categories)}): {level3_categories}")

# Add level_1 nodes to the seed.
for parent in level1_categories:
    # Selects every column whose first path segment equals the parent; this
    # includes deeper levels and the parent column itself when it already exists.
    # NOTE(review): re-running this cell would therefore add the previous total
    # into the new one — confirm whether that is intended.
    descendant_columns = [
        column for column in seed.columns if column.split("|")[0] == parent
    ]
    children_seed = seed[descendant_columns]
    seed[parent] = children_seed.sum(axis=1)
# NOTE(review): this assignment only succeeds when no column label is duplicated
# (the new Index must have the same length) — confirm what it is meant to guard.
seed.columns = seed.columns.drop_duplicates(keep=False)

level1_categories(13): {'sporting_goods', 'tobacco', 'collectables_and_antiques', 'vehicles', 'retail', 'clothing_and_accessories', 'food_and_beverage_services', 'gifts_and_holiday_items', 'media', 'alcohol', 'durable_goods', 'consumer_electronics', 'consumer_packaged_goods'}
level2_categories categories (76): {'consumer_packaged_goods|personal_cleansing', 'sporting_goods|athletics_equipment', 'consumer_packaged_goods|baby_and_toddler_products', 'consumer_packaged_goods|barbeque', 'sporting_goods|outdoor_recreation_equipment', 'consumer_electronics|televisions', 'consumer_electronics|video_game_consoles', 'consumer_packaged_goods|beverages', 'consumer_packaged_goods|hair_care', 'durable_goods|grooming_supplies', 'media|books_and_audio_books', 'consumer_packaged_goods|over_the_counter_medication', 'retail|arts_and_crafts_supplies', 'consumer_packaged_goods|household_plastics_storage', 'clothing_and_accessories|jewelry_and_watches', 'consumer_electronics|printers_copiers_scanners_fax', '

In [8]:
# Load the raw (per-product) seed from the first CSV found under this S3 prefix.
raw_seed_s3_path = "datasets/seeds/ibotta_seed_raw_v6/"
raw_seed = get_df_from_s3(raw_seed_s3_path)
# get_df_from_s3 returns False when no CSV is found under the prefix; fail here
# with a clear message instead of an AttributeError on the .head() call below.
if raw_seed is False:
    raise FileNotFoundError(f"No CSV found in s3://{BUCKET}/{raw_seed_s3_path}")
raw_seed.head(3)

Reading datasets/seeds/ibotta_seed_raw_v6/part-00000-834b7ff2-18b6-4415-99e0-441c3f51901f-c000.csv from S3


Unnamed: 0,device_id,hashed_email,product_name,category,spend_pct
0,,19843ff2ff2755bead60dff575535aaec5d2de0cefafb3...,Bud Light Beer 25oz Can,alcohol|beer,0.012105
1,,19843ff2ff2755bead60dff575535aaec5d2de0cefafb3...,Clearly Canadian Orchard Peach Qty 4 11oz Bott...,alcohol|beer,0.063309
2,,19843ff2ff2755bead60dff575535aaec5d2de0cefafb3...,Cayman Jack Margarita Variety 12 pack,alcohol|hard_sodas_seltzers_alco_pops,0.177164


# Extended Audiences creation

In [9]:
# Nielsen whitelabel (not a priority)
# Each entry names an audience ("target"), a category path ("target_category")
# and a case-insensitive regex ("target_regex").
# NOTE(review): these keys differ from the target_name / target_node_path /
# target_definition / target_method keys read by the feature-selection loop
# further down, and this list is reassigned by the cells that follow.
audiences = [
    {
        "target": "natural_competitive_pad",
        "target_category": "health_wellness",
        "target_regex": "(?i)rael.*organic|rael.*sanitary|lola.*feminine|lola.*ultra.*pads|lola.*liners|\\bcora\\b.*pads|\\bcora\\b.*liners|honey pot.*liners|honey pot.*pads",
    },
    {
        "target": "advanced_select_a_size",
        "target_category": "household_essentials|paper_goods",
        "target_regex": "(?i)(bounty.*advance.*select.*size).*",
    },
    {
        "target": "angostura",
        "target_category": "beverages|cocktail_mixes",
        "target_regex": "(?i)(angostura).*",
    },
    {
        "target": "lysol",
        "target_category": "household_essentials|all_purpose_cleaner",
        "target_regex": "(?i)(lysol.*complete.*clean).*",
    },
    {
        "target": "pillsbury_ready_to_bake",
        "target_category": "refrigerated|refrigerated_dough",
        "target_regex": "(?i)(Pillsbury.*Ready.*Bake).*",
    },
    {
        "target": "pillsbury_ready_to_bake_shape",
        "target_category": "refrigerated|refrigerated_dough",
        "target_regex": "(?i)(Pillsbury.*Ready.*bake.*shape).*",
    },
    {
        "target": "pillsbury_big_deluxe_classic",
        "target_category": "refrigerated|refrigerated_dough",
        "target_regex": "(?i)(Pillsbury.*Big Deluxe).*",
    },
    {
        "target": "vicks_nyquil",
        "target_category": "health_wellness|medicine",
        "target_regex": "(?i)(vicks.*nyquil).*",
    },
    {
        "target": "cold_allergy_sinus_liquid",
        "target_category": "health_wellness|medicine",
        # The last alternative is "\\bbottle" (word boundary + "bottle"); it was
        # previously "\\bottle", which can never match the word "bottle".
        "target_regex": "(?i)^(private label|\\baldi\\b|\\bblossom\\b|\\bpublix\\b|\\bcostco\\b|\\bsam.*\\bclub\\b|\\bmember.*\\bmark\\b|\\bgiant\\b|\\bcareone\\b|\\bcare one\\b|\\btarget\\b|\\bup\\b.*\\bup\\b|\\bH-E-B\\b|\\bHEB \\b|\\bWalmart\\b|\\bEquate\\b|\\bHy-Vee\\b|\\bHyvee\\b|\\bHy vee\\b|\\btopcare\\b|\\bwegmans\\b|\\bkroger\\b|\\bkirkland\\b|\\bwhole foods\\b|\\b365 \\b|\\bmeijer\\b).*?(\\bcold\\b|\\ballergy\\b|\\bsinus\\b|\\bmucus\\b|congestion|Sinusitis|\\bnasal\\b|\\brhinitis\\b).*?(\\bfl oz\\b|\\bfloz\\b|\\bliquid|\\bbottle).*",
    },
    {
        "target": "vicks_dayquil",
        "target_category": "health_wellness|medicine",
        "target_regex": "(?i)vicks.*dayquil",
    },
    {
        "target": "mucinex_fast_max",
        "target_category": "health_wellness|medicine",
        "target_regex": "(?i)(mucinex.*fast.*max.*\\bgel\\b|mucinex.*fast.*max.*liquid|mucinex.*fast.*max.*oz\\b|mucinex.*fast.*max.*bottle|mucinex.*fast.*max.*Hot Drink Mix).*",
    },
    {
        "target": "old_spice_body_wash",
        "target_category": "beauty_grooming|bath_body",
        "target_regex": "(?i)(old spice.*body.*wash|old spice.*shower.*gel).*",
    },
    {
        "target": "old_spice_red_zone",
        "target_category": "beauty_grooming|bath_body",
        "target_regex": "(?i)(old spice.*red.*zone.*body.*wash|old spice.*red.*zone.*shower.*gel).*",
    },
    {
        "target": "softsoap_luminous_oils",
        "target_category": "beauty_grooming|bath_body",
        "target_regex": "(?i)(Softsoap.*Luminous.*Oils.*body.*wash|Softsoap.*Luminous.*Oils.*shower.*gel).*",
    },
    {
        "target": "dial_all_day_freshness",
        "target_category": "beauty_grooming|bath_body",
        "target_regex": "(?i)(Dial.*all.*day.*Freshness.*body.*wash|Dial.*all.*day.*Freshness.*shower.*gel).*",
    },
    {
        "target": "aveeno_baby_bath",
        "target_category": "baby_kids|baby_kids_personal_care",
        "target_regex": "(?i)(aveeno.*kids.*body.*wash).*",
    },
    {
        "target": "bolthouse_farms",
        "target_category": "refrigerated|refrigerated_juice",
        "target_regex": "(?i)(bolthouse farm.*juice|bolthouse farm.*drink|bolthouse farm.*smoothie).*",
    },
    {
        "target": "naked_protein_zone",
        "target_category": "refrigerated|refrigerated_juice",
        "target_regex": "(?i)^(naked.*protein.*zone|naked.*protein).*",
    },
    {
        "target": "dental_floss",
        "target_category": "health_wellness|oral_care",
        "target_regex": "(?i)^(private label|\\baldi\\b|\\bblossom\\b|\\bpublix\\b|\\bcostco\\b|\\bsam.*\\bclub\\b|\\bmember.*\\bmark\\b|\\bgiant\\b|\\bcareone\\b|\\bcare one\\b|\\btarget\\b|\\bup\\b.*\\bup\\b|\\bH-E-B\\b|\\bHEB \\b|\\bWalmart\\b|\\bEquate\\b|\\bHy-Vee\\b|\\bHyvee\\b|\\bHy vee\\b|\\btopcare\\b|\\bwegmans\\b|\\bkroger\\b|\\bkirkland\\b|\\bwhole foods\\b|\\b365 \\b|\\bmeijer\\b).*(\\bfloss\\b).*",
    },
    {
        "target": "fruit_rolls_bar_snacks",
        "target_category": "snacks_cookies_chips|fruit_snacks",
        # "\\bHy-Vee\\b" needs the doubled backslash on both sides: a single "\b"
        # in a normal string literal is a backspace character, not a word boundary.
        "target_regex": "(?i)^(private label|\\baldi\\b|\\bblossom\\b|\\bpublix\\b|\\bcostco\\b|\\bsam.*\\bclub\\b|\\bmember.*\\bmark\\b|\\bgiant\\b|\\bcareone\\b|\\bcare one\\b|\\btarget\\b|\\bup\\b.*\\bup\\b|\\bH-E-B\\b|\\bHEB \\b|\\bWalmart\\b|\\bEquate\\b|\\bHy-Vee\\b|\\bHyvee\\b|\\bHy vee\\b|\\btopcare\\b|\\bwegmans\\b|\\bkroger\\b|\\bkirkland\\b|\\bwhole foods\\b|\\b365 \\b|\\bmeijer\\b).*(\\bfruit roll\\b|fruit.*\\bbar|blueberry.*granola.*\\bbar|raspberry.*bar|fruit.*\\bsnack|snack.*peach|Strawberry.*snack|apple.*bar|cherry.*snack|apple.*snack|snack.*fruit|Snack Blueberry|Bar Mango|Bar Pineapple|Strawberry Rhubarb Bar|lemon.*bar|Bars Blueberry).*",
    },
    {
        "target": "scotch_brite",
        "target_category": "household_essentials|cleaning_tools",
        "target_regex": "(?i)(scotch.*brite).*",
    },
    {
        "target": "swiffer",
        "target_category": "household_essentials|cleaning_tools",
        "target_regex": "(?i)(\\bswiffer\\b).*",
    },
    {
        "target": "swiffer_sweeper",
        "target_category": "household_essentials|cleaning_tools",
        "target_regex": "(?i)(\\bswiffer\\b.*sweeper).*",
    },
    {
        "target": "swiffer_duster",
        "target_category": "household_essentials|cleaning_tools",
        "target_regex": "(?i)^(?!.*(\\bfor\\b.*\\bdusters\\b)).*(\\bswiffer\\b.*duster).*",
    },
    {
        "target": "swiffer_sweeper_febreze",
        "target_category": "household_essentials|cleaning_tools",
        "target_regex": "(?i)(\\bswiffer\\b.*sweeper.*febreze|swiffer.*febreze.*sweeper).*",
    },
    {
        "target": "swiffer_wet_jet",
        "target_category": "household_essentials|cleaning_tools",
        "target_regex": "(?i)^(?!.*(\\bcompare)).*(\\bswiffer\\b.*wet.*jet\\b).*",
    },
    {
        "target": "swiffer_360_extender",
        "target_category": "household_essentials|cleaning_tools",
        "target_regex": "(?i)(swiffer.*360.*extend).*",
    },
    {
        "target": "pic_raid",
        "target_category": "household_essentials|pest_control",
        "target_regex": "(?i)(pic .*raid).*",
    },
]

In [10]:
# Stress test easy level.
# Two regex-defined audiences; each record carries the audience name, the
# taxonomy node path it targets, the regex definition and the matching method.
audiences = [
    dict(
        target_name="gatorade_zero",
        target_node_path="consumer_packaged_goods|beverages|carbonated_soft_drinks",
        target_definition="(?i)^gatorade.*zero",
        target_method="regex",
    ),
    dict(
        target_name="pic_raid",
        target_node_path="consumer_packaged_goods|pest_control|outdoor_insect_rodent_control_chem",
        target_definition="(?i)(?:pic .*raid).*",
        target_method="regex",
    ),
]

In [11]:
# Stress test mid level.
# Each record carries the audience name, the taxonomy node path it targets,
# the regex definition and the matching method.
audiences = [
    {
        "target_name": "mac_competitor",
        "target_node_path": "consumer_packaged_goods|cosmetics|facial",
        "target_definition": "(?i)^(?!.*yogurt).*(\\btarte\\b|benefit|lancome|too faced)",
        "target_method": "regex",
    },
    {
        "target_name": "mac_reach",
        "target_node_path": "consumer_packaged_goods|cosmetics|facial",
        "target_definition": "(?i)^(anastasia|benefit|buxom|dior\\b|\\belf\\b|hourglass|\\bilia\\b|\\bit cosmetics|\\bl.*oreal\\b|lancome|\\bnars\\b|\\bnyx\\b|summer fridays|\\btarte\\b|too faced|urban decay).*(mascara|primer|sharpener|setting spray|concealer|cream|moisturizer|Eyebrow|\\bbrow\\b|pencil|\\blip\\b|lipstick|liner|toner|lipbalm|lipstain|corrector|Cleanser|makeup|Glow Enhancer|shadow|\\bbath\\b|Remover|brush|serum|cleansing|hyaluronic|tanning|face mask|eye treatment|blush|foundation|glotion|powder|infallible|\\blumi\\b|lipgloss|True Match|bronzer|highlighter|visible lift|compact|Can.*Stop.*Won.*Stop).*",
        "target_method": "regex",
    },
    {
        "target_name": "ivory_apdo_competitor",
        "target_node_path": "consumer_packaged_goods|personal_cleansing|deodorant",
        # The exclusion is anchored at the start so that "ivory" is rejected
        # anywhere in the name; a lookahead placed after the keyword only saw
        # the text following it and let "Ivory ... Deodorant" through.
        "target_definition": "(?i)^(?!.*\\bivory\\b).*(deodorant|antiperspirant)",
        "target_method": "regex",
    },
    {
        "target_name": "ivory_competitor",
        "target_node_path": "consumer_packaged_goods|personal_cleansing|deodorant",
        "target_definition": "(?i)^(\\bdove\\b|suave|\\bdial\\b|softsoap|caress|irish spring|aveeno|white rain|dr.*squatch|\\bzest\\b.*soap|\\bzest\\b.*wash|\\bzest\\b.*deo|nivea)",
        "target_method": "regex",
    },
    {
        "target_name": "lysol_laundry_sanitizer",
        "target_node_path": "consumer_packaged_goods|home_care|laundry",
        "target_definition": "(?i)lysol.*laundry.*sanitizer",
        "target_method": "regex",
    },
    {
        "target_name": "non_always_discreet_ai_underwear",
        "target_node_path": "clothing_and_accessories|clothing|underwear_and_lingerie",
        "target_definition": "(?i)^(?!.*always discreet).*(incontinence.*underwear)",
        "target_method": "regex",
    },
    {
        "target_name": "anti_frizz",
        "target_node_path": "consumer_packaged_goods|hair_care|hair_conditioner",
        "target_definition": "(?i)anti-frizz|anti frizz",
        "target_method": "regex",
    },
    {
        "target_name": "bare_competitor",
        "target_node_path": "consumer_packaged_goods|hair_care|hair_conditioner",
        "target_definition": "(?i)^(dove|pacifica|neutrogena|shea moisture|sheamoisture|vanicream).*(shampoo|conditioner|hair)",
        "target_method": "regex",
    },
    {
        "target_name": "color_treated",
        "target_node_path": "consumer_packaged_goods|hair_care|hair_coloring",
        "target_definition": "(?i)color treated.*hair|color-treated.*hair|color friendly.*hair|color-friendly.*hair|color safe.*hair|color-safe.*hair|color care.*hair|color-care.*hair|color safe.*hair|color protect.*hair|color-protect.*hair",
        "target_method": "regex",
    },
    {
        "target_name": "conditioner",
        "target_node_path": "consumer_packaged_goods|hair_care|hair_conditioner",
        # The alternation is grouped so that both "cat" and "dog" are excluded
        # anywhere in the name; ungrouped, "(?!.*cat|dog)" only rejected "dog"
        # when it was the very first word.
        "target_definition": "(?i)^(?!.*(?:cat|dog)).*(conditioner)",
        "target_method": "regex",
    },
]

In [12]:
# High priority.
# A single regex-defined audience. The definition is written as three adjacent
# string literals (concatenated by Python into one pattern) to keep it readable.
audiences = [
    dict(
        target_name="hair_styling",
        target_node_path="consumer_packaged_goods|hair_care|hair_conditioner",
        target_definition=(
            # 1) Case-insensitive; reject names containing any excluded term.
            "(?i)^(?!.*(hair color|haircolor|hair brush|\\btoner\\b |\\bpasta\\b|angel hair|\\bclog\\b |conditioner|hair lightening|hairball|hair removal|developer|remover|\\bcomb\\b|collagen|hairbrush|\\bdoll\\b |shower gel|\\bbraid\\b|hair food color|halloween|\\bwow\\b|\\bmask\\b|hair lightener|hair dryer|\\bbarbie\\b|\\blice\\b |shears|scissor|trimmer|haircolour|hair colour|color hair spray|ponies))"
            # 2) Require the standalone word "hair".
            ".*(\\bhair\\b)"
            # 3) Followed later by one of the styling terms.
            ".*(\\bgel\\b|\\bspray\\b|\\bmousse\\b|\\bcream\\b|frizz|style|\\bfiber\\b|styling|\\bwax\\b|\\bclay\\b|\\bpomade\\b|\\bserum\\b|\\btexturizer\\b|curl enhance|\\bdefiner\\b|dry shampoo|styling|curler|\\bpin\\b|\\bclips\\b|\\bband\\b|\\btie\\b|\\btool\\b|scrunchie|appliance).*"
        ),
        target_method="regex",
    ),
]

In [13]:
# Run feature selection for every audience and keep the results of the ones
# that succeed. The loop is best-effort: a failing audience is reported and
# recorded in failed_audiences, and the next audience is processed.
#
# The previous `raise e` inside `except` combined with `finally: continue`
# never actually raised: a `continue` in `finally` discards the in-flight
# exception, including KeyboardInterrupt, so the loop could not be interrupted.
optimized_audiences = []
failed_audiences = []
for audience in audiences:
    try:
        audience_stats, audience_optimized_seed = feature_selection(
            taxonomy,
            seed,
            raw_seed,
            audience["target_name"],
            audience["target_node_path"],
            audience["target_definition"],
            audience["target_method"],
        )
        optimized_audiences.append(
            {
                "target_name": audience["target_name"],
                "target_node_path": audience["target_node_path"],
                "stats": audience_stats,
                "seed": audience_optimized_seed,
            }
        )
    except Exception as e:
        # Only Exception subclasses are swallowed; KeyboardInterrupt and
        # SystemExit propagate so the run can still be stopped.
        print(f"FEATURE_SELECTION_ERROR: {e}")
        failed_audiences.append({"audience": audience, "error": e})

FEATURE SELECTION START -> hair_styling (consumer_packaged_goods|hair_care|hair_conditioner)
 ## ITERATION 1
 -- seed target density: 5242 (66.40%)
 -- context_features (13): ['alcohol', 'clothing_and_accessories', 'collectables_and_antiques', 'consumer_electronics', 'consumer_packaged_goods', 'durable_goods', 'food_and_beverage_services', 'gifts_and_holiday_items', 'media', 'retail', 'sporting_goods', 'tobacco', 'vehicles']
FEATURE_SELECTION_ERROR: Trainee.__init__() got an unexpected keyword argument 'default_action_features'


In [14]:
# Name of the optimized audience whose stats, seed and plots are displayed.
inspected_target_name = "hair_growth"

inspected_audiences = [
    candidate
    for candidate in optimized_audiences
    if candidate["target_name"] == inspected_target_name
]
if not inspected_audiences:
    # Without this message the cell silently shows nothing when the name does
    # not match any optimized audience.
    available_target_names = [
        candidate["target_name"] for candidate in optimized_audiences
    ]
    print(
        f"No optimized audience named '{inspected_target_name}'. Available: {available_target_names}"
    )

for audience in inspected_audiences:
    print(f"{audience['target_name']} stats:")
    display(audience["stats"])
    print(f"{audience['target_name']} optimized seed:")
    display(audience["seed"])
    print(f"{audience['target_name']} taxonomy path:")
    taxonomy_path = plot_taxonomy(
        taxonomy,
        audience["target_name"],
        audience["target_node_path"],
        audience["stats"],
    )
    display(taxonomy_path)
    print(f"{audience['target_name']} mda/contribution scatterplot:")
    mda_contribution_scatterplot = plot_mda_contribution_scatterplot(
        audience["stats"], audience["target_name"]
    )
    display(mda_contribution_scatterplot)

In [15]:
# Upload the stats and the optimized seed of every optimized audience to
# projects/<target_name>/ through upload_file_to_s3.
for audience in optimized_audiences:
    target_name = audience["target_name"]
    project_prefix = f"projects/{target_name}"
    stats_s3_path = f"{project_prefix}/stats_{target_name}.csv"
    optimized_seed_s3_path = f"{project_prefix}/seed_{target_name}.csv"

    # Console link to the folder that receives both files.
    print(
        f"Uploading {target_name} stats and seed to https://us-east-1.console.aws.amazon.com/s3/buckets/prod-data-ml-pipeline?bucketType=general&prefix=projects%2F{target_name}%2F&region=us-east-1..."
    )
    upload_file_to_s3(audience["stats"], stats_s3_path)
    upload_file_to_s3(audience["seed"], optimized_seed_s3_path)