In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the raw customer export into memory.
# NOTE(review): absolute, user-specific path — consider a configurable data directory.
customer_df = pd.read_csv(
    '/Users/jacob.perius/psa_segment_testing/11_4_24_CustomerData_20241104_000000000000.csv',
    low_memory=False,
)

customer_df

In [None]:
# Show the available columns before filtering.
print(customer_df.columns)

# Keep only customers whose latest order is after 2024-01-01.
# The comparison is done on the raw column values against a string literal.
# NOTE(review): this is only chronologically correct if 'latest_order_date'
# holds ISO-8601 ("YYYY-MM-DD...") strings — confirm against the export.
# .copy() detaches the result from the original frame so that later column
# assignments on customer_df do not raise SettingWithCopyWarning.
customer_df = customer_df[customer_df['latest_order_date'] > '2024-01-01'].copy()

customer_df

In [None]:
# Distinct values present in the 'segment' column.
pd.unique(customer_df['segment'])

In [None]:
# Peek at the first five 'submission_count' values.
customer_df['submission_count'].iloc[:5]

In [None]:
import re

# Clean the 'zip_code' column and drop rows that do not yield any digits.
#
# All masks are computed on the full-length series first and the frame is
# filtered once, on an explicit copy. This replaces the earlier sequence of
# filter-then-assign steps, which wrote into filtered slices
# (SettingWithCopyWarning) and ran a Python-level regex per row.

# Keep only the part before the first hyphen (e.g. "12345-6789" -> "12345").
zip_raw = customer_df['zip_code'].str.split('-').str[0]

# Treat the literal string 'nan' as missing.
zip_raw = zip_raw.mask(zip_raw == 'nan')

has_zip = zip_raw.notna() & (zip_raw != '')

# Values containing '@' are discarded outright rather than reduced to digits.
has_at_sign = zip_raw.str.contains('@', regex=False, na=False)

# Strip every non-digit character; this also removes surrounding quotes.
zip_digits = zip_raw.str.replace(r'\D', '', regex=True)

keep_rows = has_zip & ~has_at_sign & (zip_digits != '')

customer_df = customer_df.loc[keep_rows].copy()
# Positional assignment: both sides were selected with the same mask.
customer_df['zip_code'] = zip_digits.loc[keep_rows].to_numpy()

In [None]:
# Left-pad ZIP codes with zeros to a width of five so they match the
# zero-padded keys built for the ZIP-to-DMA table. Strings already five or
# more characters long are left unchanged by zfill.
# assign() returns a new frame instead of writing into a filtered slice,
# which avoids SettingWithCopyWarning.
customer_df = customer_df.assign(zip_code=customer_df['zip_code'].str.zfill(5))

In [None]:
# Load the ZIP -> DMA lookup and normalise its key to a zero-padded,
# five-character string so it can be joined against customer ZIP codes.
dma_df = pd.read_csv(
    '/Users/jacob.perius/psa_segment_testing/ENV _ Census _ ZIP to DMA.csv'
).assign(
    zip_code=lambda frame: frame['zip_code'].astype('str').str.zfill(5)
)

dma_df.head()

In [None]:
# Attach DMA information to every customer by ZIP code. The left join keeps
# customers whose ZIP has no DMA match (their DMA columns are NaN).
# NOTE(review): if dma_df contains more than one row per zip_code, this join
# duplicates customers — confirm the lookup is unique per ZIP.
merged_df = (
    customer_df
    .merge(dma_df, on='zip_code', how='left')
    .drop(columns=['date_updated_at'])
)

merged_df


In [None]:
# Number of customer rows per DMA, stored in a 'count' column.
# groupby drops rows whose DMA keys are NaN, so unmatched customers are not counted.
group_counts_df = (
    merged_df
    .groupby(['dma_code', 'dma_description'])
    .size()
    .rename('count')
    .reset_index()
)

group_counts_df

In [None]:
# Aggregate customer metrics to one row per DMA.
# Every metric is summed except the two per-submission averages, which are
# averaged across the DMA's customers (an unweighted mean of customer means).
metric_cols = [
    'submission_count',
    'submission_total_qty',
    'submission_total_dv',
    'avg_dv_per_sub',
    'avg_item_per_sub',
    'total_orders',
    'total_qty_ordered',
    'total_order_revenue',
    'submission_count_2023',
    'submission_total_qty_2023',
    'submission_total_dv_2023',
    'total_orders_2023',
    'total_qty_ordered_2023',
    'total_order_revenue_2023',
]
mean_cols = {'avg_dv_per_sub', 'avg_item_per_sub'}

# Dict order is preserved, so output columns keep the order of metric_cols.
agg_spec = {col: ('mean' if col in mean_cols else 'sum') for col in metric_cols}

grouped_df = (
    merged_df
    .groupby(['dma_code', 'dma_description'])
    .agg(agg_spec)
    .reset_index()
)

In [None]:
# Combine the per-DMA aggregates with the per-DMA customer counts.
final_df = grouped_df.merge(
    group_counts_df,
    on=['dma_code', 'dma_description'],
    how='left',
)

final_df

In [None]:
# Load the census population table; it is joined on 'dma_code' further down.
population_df = pd.read_csv(
    '/Users/jacob.perius/psa_segment_testing/zip_grouped_census_df.csv'
)

population_df.head(5)

In [None]:
# Add the population columns to the per-DMA table.
# NOTE(review): this reassigns final_df from itself, so running the cell twice
# merges the population columns a second time (suffixed _x/_y). Re-run the
# cell that first builds final_df before re-running this one.
final_df = final_df.merge(population_df, on='dma_code', how='left')

final_df

In [None]:
# Convert every metric column to a per-capita value by dividing it by the
# DMA's population, store it as 'normalized_<col>', and drop the raw column.
#
# Columns already prefixed with 'normalized_' are skipped so that re-running
# this cell is a no-op instead of dividing by population a second time.
# NOTE(review): the per-submission averages and 'count' are also divided by
# population here — confirm that is intended.
# NOTE(review): a zero or missing population yields inf/NaN downstream.
id_cols = ['dma_code', 'dma_description', 'population']
old_cols = [
    col for col in final_df.columns
    if col not in id_cols and not str(col).startswith('normalized_')
]
new_cols = [f'normalized_{col}' for col in old_cols]

if old_cols:
    normalized = final_df[old_cols].div(final_df['population'], axis=0)
    normalized.columns = new_cols
    # Same layout as before: id columns first, normalized columns appended.
    final_df = pd.concat([final_df.drop(columns=old_cols), normalized], axis=1)

final_df

In [None]:
import re

# Load the Google Ads export and derive a 5-digit ZIP from 'Matched location'.
google_data_df = pd.read_csv('/Users/jacob.perius/psa_segment_testing/google_ads_data.csv')

# Drop the last six rows of the export.
# NOTE(review): presumably these are summary/total rows — confirm against the file.
# .copy() so the column assignment below does not write into a slice.
google_data_df = google_data_df.iloc[:-6].copy()

# Vectorised extraction of the leading five digits. Rows that do not start
# with five digits become NaN instead of raising AttributeError on
# `re.search(...).group(0)`.
google_data_df['zip_code'] = google_data_df['Matched location'].str.extract(
    r'^(\d{5})', expand=False
)

# Rows without a ZIP cannot be joined to a DMA; drop them and report how many.
unmatched = google_data_df['zip_code'].isna()
if unmatched.any():
    print(
        f"Dropping {int(unmatched.sum())} rows whose 'Matched location' "
        "does not start with a 5-digit ZIP"
    )
    google_data_df = google_data_df.loc[~unmatched].copy()

google_data_df

In [None]:
# Parse the thousands-separated count columns into integers.
# astype(str) first makes this safe to re-run, and safe when pandas already
# parsed a column as numeric; the .str accessor would otherwise raise.
for count_col in ('Impr.', 'Clicks'):
    google_data_df[count_col] = (
        google_data_df[count_col]
        .astype(str)
        .str.replace(',', '', regex=False)
        .str.strip()
        .astype('int64')
    )

# 'cvr' is clicks divided by impressions, i.e. a click-through ratio.
# Rows with zero impressions produce NaN (0/0) or inf (n/0).
google_data_df['cvr'] = google_data_df['Clicks'] / google_data_df['Impr.']
google_data_df

In [None]:
# Map each ZIP-level 'cvr' to its DMA. The right join keeps every ZIP in
# dma_df, so ZIPs without Google Ads rows carry a NaN 'cvr'.
google_dma_df = (
    google_data_df
    .merge(dma_df, on='zip_code', how='right')
    .loc[:, ['dma_code', 'dma_description', 'cvr']]
)

google_dma_df

In [None]:
# Average 'cvr' per DMA: an unweighted mean over the DMA's ZIP-level rows.
# NaN values are skipped by mean().
grouped_google_dma_df = google_dma_df.groupby(
    ['dma_code', 'dma_description'], as_index=False
).mean()

grouped_google_dma_df

In [None]:
# Add the per-DMA 'cvr' to the per-capita feature table. DMAs without Google
# Ads data keep a NaN 'cvr' because this is a left join.
final_df_with_cvr = final_df.merge(
    grouped_google_dma_df,
    on=['dma_code', 'dma_description'],
    how='left',
)

final_df_with_cvr

In [None]:
# Model inputs: every column except the DMA identifiers.
# Selecting by name rather than by position keeps this cell re-runnable after
# the 'group' label column has been added to final_df_with_cvr further down;
# with `iloc[:, 2:]` that string column would leak into the scaler.
features = final_df_with_cvr.drop(
    columns=['dma_code', 'dma_description', 'group'],
    errors='ignore',  # 'group' does not exist yet on a fresh top-to-bottom run
)

features

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise each feature column, then wrap the resulting array in a
# DataFrame (columns are positional integers) for inspection.
scaler = StandardScaler().fit(features)
X_scaled = scaler.transform(features)

x_scaled_df = pd.DataFrame(data=X_scaled)

In [None]:
# Summary statistics of the scaled features; columns are numbered by
# position because x_scaled_df was built without column names.
x_scaled_df.describe()

In [None]:
# Disabled 2-component PCA variant, kept for reference. It is wrapped in a
# string literal so none of it executes; the cell only echoes the text.
# The X_pca_df2 name it would have defined is populated by the UMAP cell instead.
'''
from sklearn.decomposition import PCA

pca2 = PCA(n_components=2, random_state=42)

X_pca2 = pca2.fit_transform(X_scaled)

X_pca_df2 = pd.DataFrame(X_pca2)

print(pca2.components_)
print(pca2.explained_variance_ratio_)
'''

In [None]:
from umap import UMAP

# Project the scaled features to two dimensions with UMAP (cosine metric,
# fixed random_state). The frame keeps the legacy name X_pca_df2 because
# later cells read it under that name.
# NOTE(review): any NaN left in X_scaled by the upstream left joins would
# reach UMAP here — confirm none are present.
umap = UMAP(
    n_neighbors=25,  # 20
    metric='cosine',
    n_components=2,
    random_state=42,
)

X_umap = umap.fit_transform(X_scaled)

X_pca_df2 = pd.DataFrame(data=X_umap)

In [None]:
# Display the 2-D embedding (despite the name, this holds the UMAP output).
X_pca_df2

In [None]:
# Put the DMA identifiers next to their 2-D coordinates.
# concat(axis=1) aligns on the index, so this relies on final_df and
# X_pca_df2 sharing the same row labels.
# NOTE(review): the embedding was fitted on rows of final_df_with_cvr —
# confirm it has the same row order and index as final_df.
dma_and_name_df = final_df[['dma_code', 'dma_description']]

final_df_pca = (
    pd.concat([dma_and_name_df, X_pca_df2], axis=1)
    .rename(columns={0: 'x', 1: 'y'})
)

final_df_pca

In [None]:
# Store each DMA's (x, y) pair as a single 2-element array in 'coords'.
final_df_pca['coords'] = [
    np.array(pair)
    for pair in final_df_pca[['x', 'y']].itertuples(index=False, name=None)
]

final_df_pca

Source of the explanation quoted below (ChatGPT conversation): <https://chatgpt.com/share/672cb242-6868-8001-9239-885af55ae727>

Exactly! You’ve nailed it. arctan2 is indeed more global because it calculates the angle of each point in the PCA space relative to a common reference point, the origin. This global perspective is why it’s so effective for creating a circular ordering around the origin, allowing you to systematically distribute points based on their positions in that space.

Here’s a quick breakdown of why this is powerful:

1. **Uniform Reference Point (Origin)**
   - By using the origin as a fixed point of reference, `arctan2` calculates the angle for each DMA consistently. This makes it a global ordering method, since every point’s angle is measured against the same central point.
   - This consistent reference allows you to arrange all DMAs around the origin, creating a natural circular ordering that reflects their global structure in the PCA space.
2. **Circular Arrangement in 2D Space**
   - Sorting by angle gives you a full rotation around the origin, effectively “unwrapping” the PCA space into a linear ordering along a circle.
   - This circular approach helps distribute similar points more evenly across groups, rather than grouping them based solely on local pairwise distances.
3. **Contrast with Local Pairwise Metrics**
   - Pairwise distances or local similarity measures (like cosine similarity to a centroid) are local; they only tell you about relationships between specific pairs of points.
   - In contrast, `arctan2` provides a holistic view of each point’s position in the 2D space, making it easier to systematically balance the distribution across groups.

Why This Global View Works Well for Balanced Grouping

When your goal is to spread similar DMAs across groups, having a global ordering based on arctan2:

- **Maintains the Global Structure:** Points that are near each other in PCA space will still be close in the circular ordering, but they won’t all end up in the same group.
- **Promotes Even Distribution:** Since the circular ordering is based on angle, using a round-robin approach after sorting ensures that similar points get distributed, balancing feature distributions across groups.

Summary

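Using `arctan2` gives you a global perspective that leverages the entire structure of your data relative to the origin, rather than focusing on local relationships only. This is why it’s such a great fit for your goal of balancing DMAs across groups based on their position in PCA space.

Caveat: this argument assumes the 2-D coordinates are centered on the origin, as PCA scores of standardized features are. This notebook currently builds the coordinates with UMAP (the PCA cell is disabled), and a UMAP embedding is not guaranteed to be centered on the origin; in that case the angles should be measured from the embedding’s centroid, otherwise all points can fall inside a narrow angular range and the ordering is no longer circular.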

It sounds like you’re getting a solid intuition for how this works—let me know if you’d like to dive deeper into any part of this approach!

In [None]:
# Angle of each DMA around the centre of the 2-D embedding.
#
# Two corrections to the earlier version:
# * The coordinates are centred on their centroid first. The angle-sorting
#   approach assumes points surround the reference point; that holds for PCA
#   scores but not for a UMAP embedding, which can sit entirely in one
#   quadrant, collapsing the "circular" order into an ordering by slope.
# * np.arctan2 takes (y, x) in that order; the arguments were passed as (x, y).
centered_x = final_df_pca['x'] - final_df_pca['x'].mean()
centered_y = final_df_pca['y'] - final_df_pca['y'].mean()
final_df_pca['angle'] = np.arctan2(centered_y, centered_x)

# Sort by angle to get a circular ordering; a stable sort keeps tied rows in
# their original order so the result is deterministic.
sorted_indices = final_df_pca.sort_values(by='angle', kind='stable').index

In [None]:
# Disabled alternative ordering: sort lexicographically by x, then y, instead
# of by angle. Left commented out; the angle-based sorted_indices is used.
#sorted_indices = final_df_pca.sort_values(by=['x', 'y']).index

In [None]:
# Assign groups in a round-robin fashion based on the sorted angle
group_assignments = {}
groups = ['Group A', 'Group B', 'Group C']

for i, idx in enumerate(sorted_indices):
    group_name = groups[i % 3]
    group_assignments[idx] = group_name

final_df_pca

In [None]:
# Attach each DMA's group label by looking up its row label in
# group_assignments. The lookup uses the index of the frame being written to
# (previously final_df.index), so the labels cannot be misaligned or fail on
# a length mismatch if the two frames ever diverge.
final_df_with_cvr['group'] = final_df_with_cvr.index.map(group_assignments)
final_df_with_cvr

In [None]:
from dotenv import load_dotenv
import matplotlib.pyplot as plt
import seaborn as sns

load_dotenv()

import pymc as pm

# Balance check: for every feature, fit an independent Normal model to each
# group and overlay the posterior distributions of the three group means.
# Heavily overlapping posteriors indicate the groups are comparable on that
# feature.

# Positional selection: skips the first three columns and the last one.
# NOTE(review): assumes the column order is dma_code, dma_description,
# population, <features...>, group — confirm, or select by name.
features_of_interest = final_df_with_cvr.columns[3:-1]

# Suffix used in the model variable names -> label stored in the 'group' column.
group_suffixes = {'a': 'Group A', 'b': 'Group B', 'c': 'Group C'}

for feature in features_of_interest:

    # Observed values of this feature, one series per group.
    group_data = {
        suffix: final_df_with_cvr.loc[final_df_with_cvr['group'] == label, feature]
        for suffix, label in group_suffixes.items()
    }

    with pm.Model() as model:

        for suffix, observed in group_data.items():
            # Wide prior on the group mean, half-normal prior on its spread.
            mu = pm.Normal(f"mu_{suffix}", mu=0, sigma=100)
            sigma = pm.HalfNormal(f"sigma_{suffix}", sigma=1)

            # Likelihood for the observed data of this group.
            pm.Normal(f"obs_{suffix}", mu=mu, sigma=sigma, observed=observed)

        # Sampling. A fixed seed makes the posteriors reproducible between
        # runs, matching the fixed random_state used for the embedding.
        trace = pm.sample(1000, chains=4, random_seed=42)

    print(trace.posterior)

    # Explicit figure/axes so each feature gets its own plot and the figure
    # can be released after it has been shown.
    fig, ax = plt.subplots(figsize=(10, 6))

    # One KDE per group mean, pooled across chains and draws.
    for suffix in group_suffixes:
        mu_samples = trace.posterior[f'mu_{suffix}'].values.flatten()
        sns.kdeplot(mu_samples, fill=True, alpha=0.5, label=f'mu_{suffix}', ax=ax)

    ax.set_xlabel(f"{feature}")
    ax.set_ylabel("Density")
    ax.set_title("Overlayed Posterior Distributions for mu_a, mu_b, mu_c")
    ax.legend()

    plt.show()
    plt.close(fig)

In [None]:
# Split the DMA table into one frame per assigned group.
dmas_a, dmas_b, dmas_c = (
    final_df_with_cvr[final_df_with_cvr['group'] == label]
    for label in ('Group A', 'Group B', 'Group C')
)

In [None]:
# Stack the three group frames (A, then B, then C), keep only the group label
# and the DMA identifiers, and store 'group' as a categorical.
#names_and_groups_df['dma_code'] = names_and_groups_df['dma_code'].astype('int64')
names_and_groups_df = (
    pd.concat([dmas_a, dmas_b, dmas_c], axis=0)
    .loc[:, ['group', 'dma_code', 'dma_description']]
    .astype({'group': 'category'})
)
names_and_groups_df

In [None]:
import json
import plotly.express as px

# Read the DMA boundary GeoJSON as text.
with open('/Users/jacob.perius/psa_segment_testing/neilsen-dma-markets-albers-projection_1356.geojson', 'r') as f:
    dma_geojson_str = f.read()

# Decode once; if the payload turns out to be a JSON string that itself
# contains JSON (double-encoded), decode a second time.
dma_geojson = json.loads(dma_geojson_str)
if isinstance(dma_geojson, str):
    dma_geojson = json.loads(dma_geojson)

# Fixed colour per group, and the columns shown on hover.
group_colors = {
    'Group A': 'blue',
    'Group B': 'orange',
    'Group C': 'green',
}
hover_columns = {'dma_code': True, 'dma_description': True, 'group': True}

# Colour each DMA polygon by its group; rows are matched to polygons through
# the feature property 'dma_code'.
# NOTE(review): matching requires names_and_groups_df['dma_code'] and the
# GeoJSON property to have the same type/format — confirm.
fig = px.choropleth(
    data_frame=names_and_groups_df,
    geojson=dma_geojson,
    featureidkey='properties.dma_code',
    locations='dma_code',
    color='group',
    color_discrete_map=group_colors,
    hover_data=hover_columns,
)

fig.update_traces(marker_line_width=1, marker_opacity=1.0)

# Zoom to the plotted DMAs and hide the base map.
fig.update_geos(fitbounds="locations", visible=False)
fig.update_layout(margin=dict(r=0, t=0, l=0, b=0))

fig.show()

In [None]:
# Propagate the DMA-level group label back to individual customers.
# Customers whose DMA has no group keep NaN in 'group' (left join).
grouped_customers_df = merged_df.merge(
    names_and_groups_df,
    on=['dma_code', 'dma_description'],
    how='left',
)

grouped_customers_df