In [8]:

import pandas as pd
import numpy as np
import plotly.express as px

import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
import plotly.graph_objects as go

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_colwidth', 100)

In [2]:
# updates for database and webscraping code
# TODO brand_source_id is not working - null
# TODO no actual product names in data - adjust displayname col


In [3]:
df = pd.read_csv('../data/preprocessed_data.csv')


### Day 1 - Basic scatter, mascara pricing at sephora

In [None]:
df.loc[df['size_refinement'].isna(),'size_refinement'] = 'Standard'
mascaras = df[df['category_root_name_l3']=='Mascara']

mascaras = mascaras[~mascaras['target_url'].str.contains('serum')]
mascaras = mascaras[mascaras['unit_ml']<20]



fig = px.scatter(
    mascaras, 
    x="unit_ml", 
    y="price", 
    symbol='size_refinement', 
    color='value_CAD_ml',
    opacity=0.75,
    title='Mascara pricing at Sephora (Jan 2025)',
    color_continuous_scale=px.colors.sequential.Agsunset,
    custom_data=["brand_name", "target_url", "product_code"]  # Add additional custom data
)

fig.update_traces(
    marker=dict(
        size=12,
        line=dict(
            width=1,
            color='black'
        )
    ),
    selector=dict(mode='markers'),
    hovertemplate=(
            "<b>Product Code:</b> %{customdata[2]}<br>" +
            "<b>Brand:</b> %{customdata[0]}<br>" +
            "<b>Product URL:</b> %{customdata[1]}<br>" +
            "<b>Price (CAD):</b> %{y}<br>" +
            "<b>Volume (mL):</b> %{x}<br>" +
            "<b>Unit Price (CAD/mL):</b> %{marker.color:.2}"
    )    
)


fig.update_layout(
     font=dict(
        # family="Geist, serif",  # Font family (as defined in Google Fonts)
        size=14,                     # Font size
        color="black"                # Font color
    ),
    title=dict(
        x=0.5,
        font=dict(
            size=20
        )),
    legend=dict(
        title='Product Type',
        yanchor="top",
        y=0.99,
        xanchor="left",
        x=0.01,
    ),
    margin=dict(l=50, r=50, t=50, b=50),
    template='simple_white',
    xaxis_title="Volume (mL)", 
    yaxis_title="Price (CAD)",
    coloraxis=dict(
        colorbar=dict(
            title=dict(
                text="Unit Price <br>(CAD/mL)", 
                side="top"      
            ),
            outlinecolor="black",  
            outlinewidth=1,     
            tickcolor="black",
            tickfont=dict(color="black") 
        )
    )
)

fig.show(height=400, width=600, scale=10)

fig.write_html("../figures/dynamic/scatter_mascara_pricing.html")
fig.write_image("../figures//scatter_mascara_pricing.png", height=400, width=600, scale=10)


### Day 2 - Basic scatter, mascara pricing at sephora

In [None]:
product_groups = mascaras.groupby(['product_code','size_refinement'], as_index=False).agg({
    'value_CAD_ml':max,
    'target_url':'first',
    'brand_name':'first',
    'parent_product_code':'first',
    'sku_id':'first'
})

product_groups = product_groups.pivot(index='product_code', columns='size_refinement', values='value_CAD_ml')

product_groups = product_groups[(product_groups['Mini'].notnull()) & (product_groups['Standard'].notnull())]

product_groups['size_ratio'] = product_groups['Mini'] / product_groups['Standard']
product_groups = product_groups.sort_values(by='size_ratio')


The provided callable <built-in function max> is currently using SeriesGroupBy.max. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "max" instead.



In [120]:
min_value = round(np.floor(product_groups[['Mini','Standard']].min().min()))
max_value = round(product_groups[['Mini','Standard']].max().max())

equal_value_line = [x for x in range(min_value,max_value+1)]



fig = go.Figure()

fig.add_trace(go.Scatter(x=product_groups['Mini'], y=product_groups['Standard'],
                    mode='markers',
                    name='Products',
                    opacity=0.8,
                    marker=dict(
                        color='DarkBlue',
                        size=12,
                        line=dict(width=1,
                                        color='white')
                    ),
                    ))
fig.add_trace(go.Scatter(x=equal_value_line, y=equal_value_line,
                    mode='lines',
                    name='Equal Value',
                    marker=dict(
                        color='RoyalBlue',
                    ),

                    opacity=0.5))


product_annotations = {
    'P111902':'Tarte, Lights Camera Lashes',
    'P466443':'Sephora Collection, Size Up Mascara',
    'P128706':'Lancôme, Clis Booster XL Super Enhancing Mascara Base'
}


fig.add_annotation(x=5.750000, y=8.181818,
            text="Lancôme<br>Clis Booster XL Super Enhancing Base",
            showarrow=True,
            arrowhead=1,
            ax=20,
            ay=-25)

fig.add_annotation(x=4.200000, y=5.285714,
            text="Tarte<br>Lights Camera Lashes",
            showarrow=True,
            arrowhead=1,
            ax=20,
            ay=-25)

fig.add_annotation(x=1.690674, y=1.259013,
            text="Sephora Collection<br>Size Up",
            showarrow=True,
            arrowhead=1,
            ax=20,
            ay=25,
            )


# Line of best fit 

model = LinearRegression()
model.fit(product_groups['Mini'].values.reshape(-1, 1), product_groups['Standard'].values.reshape(-1, 1) )

x_range = np.linspace(min_value, max_value+1, 50)
y_range = model.predict(x_range.reshape(-1, 1))

b_0 = model.intercept_[0]
b_1 = model.coef_[0][0]
equation = f"y={b_0:.2f}{b_1:+.2f}x"
fig.add_trace(go.Scatter(x=np.hstack(x_range), y=np.hstack(y_range),
                    mode='lines',
                    name=equation))

fig.update_layout(
    margin=dict(l=50, r=50, t=50, b=50),
    template='simple_white',
    yaxis_range=[0,10],
    xaxis_range=[0,10],
    yaxis_title="Mini Size Unit Price (CAD/mL)",
    xaxis_title="Standard Size Unit Price (CAD/mL)",
    title=dict(
    text='Unit Prices of Mascaras with Standard and Mini variations<br><sup>Mini size products under the Equal Value curve are cheaper than Standard size versions</sup>',
    x=0.5,
    font=dict(
        size=20
    )),
)

fig.show()
fig.write_image("../figures/scatter_mascara_mini_standard.png", scale=10)
