In [1]:
import pandas as pd 
import numpy as np
import altair as alt
import eco_style
alt.themes.enable("light")
alt.data_transformers.enable("vegafusion")

DataTransformerRegistry.enable('vegafusion')

In [2]:
full_df = pd.read_stata("forWeb/db_prices.dta")

# Strawberries Long-run

In [3]:
full_df.query("item_id == 212720").price.describe(percentiles=[0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99])

count    44131.000000
mean         5.605603
std          2.780360
min          0.250000
1%           0.950000
5%           1.490000
25%          3.750000
50%          5.000000
75%          7.480000
95%         10.130000
99%         12.220000
max         32.000000
Name: price, dtype: float64

In [4]:
df = full_df.query("item_id == 212720").copy()
df = df.query("quote_date > 202001.0")

percentiles = [10, 20, 30, 40, 50, 60, 70, 80, 90]

def make_percentile(p):
    return lambda x: np.percentile(x, p)

percentiles_df = df.groupby("quote_date").price.agg(
    **{f"{p}": make_percentile(p) for p in percentiles}
).reset_index()

temp_df = percentiles_df.copy()

# calculate just the addition over the percentile below
for i, p in enumerate(percentiles[1:]):
    upper_percentile = f"{p}"
    lower_percentile = f"{percentiles[i]}"
    percentiles_df[upper_percentile] = temp_df[upper_percentile] - temp_df[lower_percentile]



percentiles_df = percentiles_df.melt(
    id_vars=["quote_date"],
    value_vars= [f"{p}" for p in percentiles],
    var_name="percentile",
    value_name="value"
)        


percentiles_df['quote_date'] = pd.to_datetime(
    percentiles_df['quote_date'], format='%Y%m')

percentile_shades = alt.Chart(percentiles_df).mark_area(
    interpolate='monotone',
).encode(
    x=alt.X('quote_date:T',
            axis=alt.Axis(format='%b %Y', labelAngle=0, title=''),
             title=''),
    y=alt.Y('value:Q', title=''),
    color=alt.Color('percentile:O', 
                    sort=alt.EncodingSortField(field='percentile'),
                    scale=alt.Scale(range=[
  "rgba(255,230,240,0.0)",
  "rgba(255,200,215,0.3)",
  "rgba(255,170,200,0.4)",
  "rgba(250,135,180,0.5)",
  "rgba(244,67,120,0.8)",
  "rgba(250,135,180,0.5)",
  "rgba(255,170,200,0.4)",
  "rgba(255,200,215,0.3)",
  "rgba(255,230,240,0.0)"
]), title='')
)



median_df = df.groupby("quote_date").price.median().reset_index()
median_df['quote_date'] = pd.to_datetime(median_df['quote_date'].astype(str), format='%Y%m.0')
median_line = alt.Chart(median_df).mark_line(
    color='black',
    strokeWidth=2,
).encode(
    x=alt.X('quote_date:T', title=''),
    y=alt.Y('price:Q', title=''),
)

percentile_shades + median_line

percentile_shades

# Trying again

In [52]:
df = full_df.query("item_id == 212720").copy()
df = df.query("quote_date > 202001.0")

df['quote_date'] = pd.to_datetime(df.quote_date, format="%Y%m")



quantiles = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
#quantiles = [0.05, 0.15, 0.25, 0.35, 0.45, 0.5, 0.55, 0.65, 0.75, 0.85, 0.95]

# group by item_id and month and get the deciles
df = df[['item_id', 'quote_date', 'price']].groupby(['item_id', 'quote_date']).quantile(quantiles).reset_index()

df = df.rename(columns={'level_2': 'quantile', 'price': 'price'})

proper_quantiles = df.copy()

median_df = df.query("quantile == 0.5")
df = df.query("quantile != 0.5")

df = df.pivot_table(index=['item_id', 'quote_date'], columns='quantile', values='price').reset_index()




cols = [c for c in quantiles if c!= 0.5]
for i in range(len(cols), 0,-1):
    df[cols[i-1]] = df[cols[i-1]] - df[cols[i-2]] if i > 1 else df[cols[i-1]]

df = df.melt(id_vars=['item_id', 'quote_date'], value_vars=cols, var_name='quantile', value_name='price')
df = df.merge(proper_quantiles, on=['item_id', 'quote_date', 'quantile'], suffixes=('', '_true'))



In [53]:
# df = full_df.query("item_id == 212720").copy()
# df = df.query("quote_date > 202001.0")

# df['quote_date'] = pd.to_datetime(df.quote_date, format="%Y%m")



# quantiles = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
# #quantiles = [0.05, 0.15, 0.25, 0.35, 0.45, 0.5, 0.55, 0.65, 0.75, 0.85, 0.95]

# # group by item_id and month and get the deciles
# df = df[['item_id', 'quote_date', 'price']].groupby(['item_id', 'quote_date']).quantile(quantiles).reset_index()

# df = df.rename(columns={'level_2': 'quantile', 'price': 'price'})

# proper_quantiles = df.copy()


# proper_quantiles.pivot_table(
#     index=['item_id', 'quote_date'],
#     columns='quantile',
#     values='price'
# ).reset_index()

# df.query("quote_date == '2021-03-01'")

In [58]:
quantiles = alt.Chart(df).mark_area(
    interpolate='monotone',
).encode(
    x='quote_date:T',
    y=alt.Y('price:Q', title="", axis=alt.Axis(labelExpr="'£'+datum.label")),
    tooltip=[
        {'type': 'quantitative', 'field': 'quantile', 'title': 'Quantile'},
        {'type': 'quantitative', 'field': 'price_true', 'title': 'Price'},
        {'type': 'temporal', 'field': 'quote_date', 'title': 'Date'},
    ],
    color=alt.Color('quantile:O', 
                    legend=None,
                    scale=alt.Scale(
                        range=  ["rgba(255,230,240,0.0)",
  "rgba(235,51,54,0.3)",
  "rgba(235,51,54,0.4)",
  "rgba(235,51,54,0.5)",
  "rgba(235,51,54,0.8)",
  "rgba(235,51,54,0.5)",
  "rgba(235,51,54,0.4)",
  "rgba(235,51,54,0.3)",
  "rgba(255,230,240,0.0)"]
    # range=[
    #     "transparent",
    # ]+["rgb(235,51,54)"]*10

        )),
    order=alt.Order('quantile:O', sort='ascending')
)

median_line = alt.Chart(median_df).mark_line(color='rgb(235,51,54)', strokeWidth=2).encode(
    x=alt.X('quote_date:T', title='', axis=alt.Axis(format="%b %Y")),
    y=alt.Y('price:Q', title='', axis=alt.Axis(labelExpr="'£'+datum.label")),
)

chart = quantiles + median_line

chart = chart.properties(
    width=275,
    height=250
)

chart.save("strawberry_prices_pctiles.png", scale_factor=4)
chart.save("strawberry_prices_pctiles.json", scale_factor=4)  # Save as JSON for use in web applications
chart

## Range shaded

Unnamed: 0,item_id,quote_date,quantile,price
0,212720,2020-02-01,0.1,4.765
1,212720,2020-02-01,0.2,6.730
2,212720,2020-02-01,0.3,7.230
3,212720,2020-02-01,0.4,7.380
4,212720,2020-02-01,0.5,8.075
...,...,...,...,...
526,212720,2024-12-01,0.5,10.090
527,212720,2024-12-01,0.6,10.130
528,212720,2024-12-01,0.7,10.570
529,212720,2024-12-01,0.8,11.010


In [57]:
range_df = proper_quantiles.copy()

range_df = range_df.pivot_table(
    index=['item_id', 'quote_date'], 
    columns='quantile', 
    values='price'
).reset_index()
range_df = range_df.rename(columns={0.1: 'low', 0.9: 'high'})
range_shade = alt.Chart(range_df).mark_area(
    interpolate='monotone',
).encode(
    x=alt.X('quote_date:T', title='', axis=alt.Axis(format="%b %Y")),
    y=alt.Y('low:Q', title='', axis=alt.Axis(labelExpr="'£'+datum.label")),
    y2='high:Q',
    color=alt.value('rgba(224, 17, 43, 0.8)'),
    opacity=alt.value(0.5)
)

median_line = alt.Chart(median_df).mark_line(
    color='rgba(224, 17, 43, 1)',
    strokeWidth=2,
).encode(
    x=alt.X('quote_date:T', title='', axis=alt.Axis(format="%b %Y")),
    y=alt.Y('price:Q', title='', axis=alt.Axis(labelExpr="'£'+datum.label")),
)

chart = range_shade + median_line

chart.save("strawberry_prices_range.png", scale_factor=4)
chart.save("strawberry_prices_range.json", scale_factor=4)  # Save as
chart



In [30]:
df = full_df.query("item_id == 212720 and quote_date == 202201.0").copy()
df.price.describe(percentiles=[0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99])

count    302.000000
mean       8.376523
std        2.242547
min        1.000000
1%         2.000000
5%         2.990000
25%        7.480000
50%        8.810000
75%        9.910000
95%       11.000000
99%       12.110000
max       14.100000
Name: price, dtype: float64

# Cream vs Strawberries vs CPI vs Food vs Sparkling wine

In [288]:
targets = {
    211720: "Strawberries",
    211803: "Single cream",
    # 211809: "Single cream (old)",
    211815: "Double cream",
    # 310214: "Champagne (old)",
    310423: "Champagne",
    310426: "Sparkling wine"
}

df = full_df.query("item_id in @targets").copy()
df['description'] = df['item_id'].map(targets)

df.groupby(['item_id', 'description']).agg({
    "quote_date": ["min", "count", "max"],
})

first_date = 201504.0
df = df.query("quote_date >= 201504.0")
df = df.sort_values(by=['item_id', 'quote_date'])

df = df.groupby(['item_id', 'description', 'quote_date']).price.mean().reset_index()


# # calculate the index change since 2015
init_price = df.drop_duplicates(subset=['item_id'], keep='first')[['item_id', 'price']].copy()
df = df.merge(init_price[['item_id', 'price']], on=['item_id'], suffixes=('', '_init'))
df['index'] = df['price'] / df['price_init'] * 100

df['quote_date'] = pd.to_datetime(df.quote_date, format="%Y%m")


chart = alt.Chart(df).mark_line(
    interpolate='monotone', 
).encode(
    x=alt.X('quote_date:T', title='', axis=alt.Axis(format="%b %Y")),
    y=alt.Y('index:Q', title='', axis=alt.Axis(labelExpr="datum.label"), scale=alt.Scale(zero=False)),
    color=alt.Color('description:N', title=''),
)

chart.save("champage_cream_strawberries.png", scale_factor=4)
chart.save("champage_cream_strawberries.json", scale_factor=4)  #
chart

# Strawberries seasonality

In [284]:
df = full_df.query("item_id == 212720").copy()
#df = df.query("quote_date > 202001.0")

df['quote_date'] = pd.to_datetime(df.quote_date, format="%Y%m")

# Get the mean for each year
annual_df = df.groupby(df['quote_date'].dt.year).price.mean().reset_index()
annual_df = annual_df.rename(columns={'price': 'annual_mean', 'quote_date': 'year'})

df = df.groupby(df['quote_date']).agg({
    "price": "mean"
}).reset_index()

df['year'] = df['quote_date'].dt.year
df = df.merge(annual_df, on='year', how='left')
df['value'] = (df['price'] / df['annual_mean'])-1
df['month'] = df['quote_date'].dt.strftime("%b")

df = df.groupby(df['month']).agg({
    "value": "mean"
})

df['month'] = pd.to_datetime(df.index, format="%b")

# shade June-September
shade_df = pd.DataFrame([
    {"start": "2024-04-01", "end": "2024-10-30", "color": "rgba(0, 0, 0, 0.1)"},
])
shade = alt.Chart(shade_df).mark_rect(
    fillOpacity=0.5
).encode(
    x=alt.X('month(start):O', title=''),
    x2=alt.X2('month(end):O'),
    color=alt.value("rgba(0, 0, 0, 0.1)")
)

# A label for British strawberry season
label = alt.Chart(pd.DataFrame({
    "date": ["2024-10-01"],
    "text": ["(British strawberry season)"]
})).mark_text(
    align='right',
    fontSize=11,
    color='rgba(0, 0, 0, 0.8)',
).encode(
    x=alt.X('month(date):O', title=''),
    y=alt.value(8),
    text='text:N'
)


bars = alt.Chart(df).mark_bar(
    color="#36B7B4"
).encode(
    x=alt.X('month(month):O',
        axis=alt.Axis(format='%b',
            ticks=False,
         labelAngle=0, title=''),
     title=''),
    y=alt.Y('value:Q',
    axis=alt.Axis(
        format='%',
        # labelExpr="datum.label + (datum.value == 0.4 ? )",
    ),
     title=''),
)

chart = shade + bars + label

chart = chart.properties(
    width=275,
    height=200
)

chart.save("strawberry_prices_monthly.png", scale_factor=4)
chart.save("strawberry_prices_monthly.json", scale_factor=4)  # Save
chart

In [244]:
df

Unnamed: 0_level_0,value,month
month,Unnamed: 1_level_1,Unnamed: 2_level_1
Apr,-0.082039,1900-04-01
Aug,-0.127795,1900-08-01
Dec,0.412586,1900-12-01
Feb,0.186846,1900-02-01
Jan,0.362421,1900-01-01
Jul,-0.149672,1900-07-01
Jun,-0.124012,1900-06-01
Mar,0.002649,1900-03-01
May,-0.001297,1900-05-01
Nov,0.262838,1900-11-01


In [227]:
annual_df.dtypes

quote_date       int32
annual_mean    float32
dtype: object

In [226]:
df.dtypes

quote_date    datetime64[ns]
price                float32
dtype: object

In [None]:

df = full_df.query("item_id == '211720'").copy()
df

Unnamed: 0,quote_date,shop_code,item_id_raw,region,price,indicator_box,item_id


# Three types of cream

In [287]:
targets = {
    211803: "Single cream\n142ml carton", #FRESH CREAM SINGLE 142ML CARTN
    211809: "Single cream\n284ml-300ml", #FRESH SINGLE CREAM 284-300ML
    211815: "Double cream\n250ml-300ml",  #FRESH DOUBLE CREAM 250ML-300ML
}

multipliers = {
    211803: 1.0,  # FRESH CREAM SINGLE 142
    211809: 142/((284+300)/2), # FRESH SINGLE CREAM 284-300ML
    211815: 142/(250+300)/2,  # FRESH DOUBLE CREAM 250ML-300ML
}

df = full_df.query("item_id in @targets").copy()
df['description'] = df['item_id'].map(targets)
df = df.groupby(['item_id', 'description', 'quote_date']).price.mean().reset_index()
df['quote_date'] = pd.to_datetime(df.quote_date, format="%Y%m")

lines = alt.Chart(df).mark_line(
    interpolate='monotone', 
).encode(
    x=alt.X('quote_date:T', title='', axis=alt.Axis(format="%Y")),
    y=alt.Y('price:Q', title='', axis=alt.Axis(labelExpr="'£'+datum.label"), scale=alt.Scale(zero=False)),
    color=alt.Color('description:N', title='', legend=None)
)

df['label'] = np.where(df['quote_date'] == df.groupby('item_id')['quote_date'].transform('max'), df.description, '')
df['label'] = df.label.apply(lambda x: x.split('\n'))

end_labels = alt.Chart(df).mark_text(
    align='left',
    baseline='middle',
    dx=5,
).encode(
    x=alt.X('quote_date:T', title='', axis=alt.Axis(format="%b %Y")),
    y=alt.Y('price:Q', title='', axis=alt.Axis(labelExpr="'£'+datum.label"), scale=alt.Scale(zero=False)),
    text=alt.Text('label:N', title=''),
    color=alt.Color('description:N', title=''),
)

chart = (lines + end_labels).properties(
    width=275,
    height=230
)

chart.save("cream_prices.png", scale_factor=4)
chart.save("cream_prices.json", scale_factor=4)  # Save as JSON for use
chart


# LRPD (Historical)

In [89]:
items_df = pd.read_stata("forWeb/db_item_clean.dta")
items_df[items_df.description.str.contains("cream", case=False, na=False)].sort_values(by='description').sort_values(by='date_quote_s')

Unnamed: 0,item_id,description,date_quote_s,date_quote_e,n_obs
1084,520203,ANTISEPTIC CREAM,198802,199701,17607
340,212914,CHOC COVERED ICE CREAM BAR,198802,201201,54770
33,210301,CREAM CRACKERS PACK 200G-300G,198802,202412,65387
163,211803,FRESH CREAM SINGLE 142ML CARTN,198802,200101,24570
727,430509,HOUSEHOLD CLEANER CREAM/LIQUID,198802,202412,91543
339,212913,VANILLA ICE CREAM-2 LITRES,198802,200201,55950
341,212916,SALAD CREAM-10 OZ/ 285G BOTTLE,198903,200101,23131
41,210318,FROZN CAKE/GATEAU NO ICE-CREAM,199502,200601,18229
169,211809,FRESH SINGLE CREAM 284-300ML,200102,200901,11221
357,212933,ICE CREAM 450ML-1L,200202,202412,35570


In [91]:
# strawberries
scale = 0.453592  # 1 lb = 0.453592 kg

# 1988 price: STRAWBERRIES-PER 1/2LB
start_price_0_5lb = full_df.query("item_id == 212706 and quote_date == 198807").price.mean()
start_price_kg = start_price_0_5lb * (1 / scale)

# 2024 price: STRAWBERRIES PER KG OR PUNNET
end_price_kg = full_df.query("item_id == 212720 and quote_date == 202407").price.mean()
end_price_0_5lb = end_price_kg * scale

print(f"Strawberries price in 1988: £{start_price_0_5lb:.2f} per 1/2 lb")
print(f"Strawberries price in 1988: £{start_price_kg:.2f} per kg")
print(f"Strawberries price in 2024: £{end_price_kg:.2f} per kg")
print(f"Strawberries price in 2024: £{end_price_0_5lb:.2f} per 1/2 lb (scaled to 1988 price)")
print(f"Strawberries price change: {((end_price_kg - start_price_kg) / start_price_kg) * 100:.2f}%")
print(f"Strawberries price multiplier: {end_price_kg / start_price_kg:.2f}x")
print("\n\n")

# Cream
# 1988 price: FRESH CREAM SINGLE 142ML CARTN
scale = 142 / 275

start_price_cream_142ml = full_df.query("item_id == 211803 and quote_date == 198807").price.mean()
start_price_cream_275ml = start_price_cream_142ml * scale

# 2024 price: FRESH DOUBLE CREAM 250ML-300ML
end_price_cream_275ml = full_df.query("item_id == 211815 and quote_date == 202407").price.mean()
end_price_cream_142ml = end_price_cream_275ml * (1 / scale)

print(f"Single cream price in 1988: £{start_price_cream_142ml:.2f} per 142 ml")
print(f"Single cream price in 1988: £{start_price_cream_275ml:.2f} per 275 ml (scaled to 1988 price)")
print(f"Double cream price in 2024: £{end_price_cream_275ml:.2f} per 275 ml")
print(f"Double cream price in 2024: £{end_price_cream_142ml:.2f} per 142 ml (scaled to 1988 price)")
print(f"Cream price change: {((end_price_cream_275ml - start_price_cream_275ml) / start_price_cream_275ml) * 100:.2f}%")
print(f"Cream price multiplier: {end_price_cream_275ml / start_price_cream_275ml:.2f}x")


Strawberries price in 1988: £0.58 per 1/2 lb
Strawberries price in 1988: £1.28 per kg
Strawberries price in 2024: £6.50 per kg
Strawberries price in 2024: £2.95 per 1/2 lb (scaled to 1988 price)
Strawberries price change: 407.03%
Strawberries price multiplier: 5.07x



Single cream price in 1988: £0.32 per 142 ml
Single cream price in 1988: £0.16 per 275 ml (scaled to 1988 price)
Double cream price in 2024: £1.29 per 275 ml
Double cream price in 2024: £2.50 per 142 ml (scaled to 1988 price)
Cream price change: 687.25%
Cream price multiplier: 7.87x


# Gross median wages lr

In [110]:
df = pd.read_excel("19682023_gross_median.xls", sheet_name="GB Median Time-series", skiprows=7)
df.columns = ["empty", "date"] + list(df.columns[2:])
df = df[["date", "0.5.2"]]
df.columns = ['date', 'value']
df['date'] = pd.to_datetime(df['date'], format='%Y', errors='coerce')
df['value'] = pd.to_numeric(df['value'], errors='coerce')

df = df.dropna()
df = pd.concat([
    df,
    pd.DataFrame([
        {"date": pd.to_datetime("2024-01-01"), "value": 728} # https://www.ons.gov.uk/employmentandlabourmarket/peopleinwork/earningsandworkinghours/bulletins/annualsurveyofhoursandearnings/2024
    ])]
    )

avg_median_wage_1988 = df.query("date == '1988-01-01'").value.values[0]
avg_median_wage_2024 = df.query("date == '2024-01-01'").value.values[0]
print(f"Average gross median wage in 1988: £{avg_median_wage_1988:.2f}")
print(f"Average gross median wage in 2024: £{avg_median_wage_2024:.2f}")
print(f"Average gross median wage change: {((avg_median_wage_2024 - avg_median_wage_1988) / avg_median_wage_1988) * 100:.2f}%")
print(f"Average gross median wage multiplier: {avg_median_wage_2024 / avg_median_wage_1988:.2f}x")

Average gross median wage in 1988: £190.90
Average gross median wage in 2024: £728.00
Average gross median wage change: 281.35%
Average gross median wage multiplier: 3.81x


  avg_median_wage_1988 = df.query("date == '1988-01-01'").value.values[0]
  avg_median_wage_2024 = df.query("date == '2024-01-01'").value.values[0]


In [82]:
end_price

6.4998913

In [72]:
items_df = pd.read_stata("forWeb/db_item_clean.dta")
items_df[items_df.description.str.contains("cream", case=False, na=False)].sort_values(by='description')

Unnamed: 0,item_id,description,date_quote_s,date_quote_e,n_obs
1084,520203,ANTISEPTIC CREAM,198802,199701,17607
340,212914,CHOC COVERED ICE CREAM BAR,198802,201201,54770
362,212938,CHOCOLATE COVERED ICE CREAM,201202,202412,32953
33,210301,CREAM CRACKERS PACK 200G-300G,198802,202412,65387
505,310428,CREAM LIQUER 70CL-1LT 14-20%,201602,202412,23294
163,211803,FRESH CREAM SINGLE 142ML CARTN,198802,200101,24570
174,211815,FRESH DOUBLE CREAM 250ML-300ML,200902,202412,26191
169,211809,FRESH SINGLE CREAM 284-300ML,200102,200901,11221
41,210318,FROZN CAKE/GATEAU NO ICE-CREAM,199502,200601,18229
727,430509,HOUSEHOLD CLEANER CREAM/LIQUID,198802,202412,91543


In [63]:
items_df.query("item_id == 212720")

Unnamed: 0,item_id,description,date_quote_s,date_quote_e,n_obs
310,212720,STRAWBERRIES PER KG OR PUNNET,199905,202412,44131


In [185]:
df

Unnamed: 0,item_id,description,quote_date,price
0,211815,Double cream,2015-04-01,0.979930
1,211815,Double cream,2015-05-01,0.980414
2,211815,Double cream,2015-06-01,0.979861
3,211815,Double cream,2015-07-01,0.932429
4,211815,Double cream,2015-08-01,0.961631
...,...,...,...,...
346,310426,Sparkling wine,2024-08-01,8.491310
347,310426,Sparkling wine,2024-09-01,8.788590
348,310426,Sparkling wine,2024-10-01,8.861217
349,310426,Sparkling wine,2024-11-01,8.589192


In [180]:
init_price[['item_id', 'description', 'price']]

Unnamed: 0,item_id,description,price
0,211815,Double cream,0.97993
1,310423,Champagne,28.526964
2,310426,Sparkling wine,7.991739


In [178]:
df

Unnamed: 0,item_id,description,quote_date,price
0,211815,Double cream,2015-04-01,0.979930
1,211815,Double cream,2015-05-01,0.980414
2,211815,Double cream,2015-06-01,0.979861
3,211815,Double cream,2015-07-01,0.932429
4,211815,Double cream,2015-08-01,0.961631
...,...,...,...,...
346,310426,Sparkling wine,2024-08-01,8.491310
347,310426,Sparkling wine,2024-09-01,8.788590
348,310426,Sparkling wine,2024-10-01,8.861216
349,310426,Sparkling wine,2024-11-01,8.589193


In [177]:
df

Unnamed: 0,item_id,description,quote_date,price
0,211815,Double cream,2015-04-01,0.979930
1,211815,Double cream,2015-05-01,0.980414
2,211815,Double cream,2015-06-01,0.979861
3,211815,Double cream,2015-07-01,0.932429
4,211815,Double cream,2015-08-01,0.961631
...,...,...,...,...
346,310426,Sparkling wine,2024-08-01,8.491310
347,310426,Sparkling wine,2024-09-01,8.788590
348,310426,Sparkling wine,2024-10-01,8.861216
349,310426,Sparkling wine,2024-11-01,8.589193


In [173]:
init_price


Unnamed: 0,item_id,description,price
0,211815,Double cream,0.97993
1,310423,Champagne,28.526964
2,310426,Sparkling wine,7.991739


In [82]:
percentiles_df.query("quote_date == 202412.0")

Unnamed: 0,quote_date,percentile,value
58,202412.0,10_percentile,8.33
117,202412.0,20_percentile,1.32
176,202412.0,30_percentile,8.33
235,202412.0,40_percentile,1.67
294,202412.0,50_percentile,8.42
353,202412.0,60_percentile,1.71
412,202412.0,70_percentile,8.86
471,202412.0,80_percentile,2.150001
530,202412.0,90_percentile,9.52


In [None]:


percentiles_df['percentile'] = percentiles_df['percentile'].str.replace('_percentile', '').astype(int)

# boost up the percentiles by 5: e.g. the 10-20th should now be 15, 25, 35, etc.
# percentiles_df['percentile'] = percentiles_df['percentile'] + 5

percentiles_df['quote_date'] = pd.to_datetime(percentiles_df['quote_date'].astype(str), format='%Y%m.0')



In [61]:
percentiles_df

Unnamed: 0,quote_date,percentile,value
0,2020-02-01,10,10.00
1,2020-03-01,10,10.00
2,2020-04-01,10,10.00
3,2020-05-01,10,10.00
4,2020-06-01,10,6.67
...,...,...,...
526,2024-08-01,90,7.88
527,2024-09-01,90,8.81
528,2024-10-01,90,11.01
529,2024-11-01,90,11.01


In [None]:
df.query("quote_date == 202304.0").price.median()
df.query("quote_date == 202304.0").price.median()


7.480000019073486

In [55]:
np.percentile(df.query("quote_date == 202304.0").price, 50)

7.480000019073486

In [49]:
df.query("quote_date == 202304.0")

Unnamed: 0,quote_date,shop_code,item_id_raw,region,price,indicator_box,item_id
10615897,202304.0,127.0,212720,13.0,7.48,Q,212720
10616032,202304.0,54.0,212720,7.0,5.48,C,212720
10616054,202304.0,9803.0,212720,5.0,7.48,C,212720
10616291,202304.0,67.0,212720,10.0,5.48,C,212720
10616344,202304.0,4.0,212720,5.0,9.91,C,212720
...,...,...,...,...,...,...,...
10659568,202304.0,9802.0,212720,6.0,7.38,C,212720
10659619,202304.0,72.0,212720,3.0,5.47,Q,212720
10659687,202304.0,24.0,212720,3.0,5.75,C,212720
10659693,202304.0,814.0,212720,12.0,9.91,C,212720


In [42]:
percentiles_df.query("percentile == 50")

Unnamed: 0,quote_date,percentile,value
236,2020-02-01,50,1.0
237,2020-03-01,50,1.0068
238,2020-04-01,50,1.90389
239,2020-05-01,50,1.96685
240,2020-06-01,50,2.0
241,2020-07-01,50,2.0
242,2020-08-01,50,1.0
243,2020-09-01,50,1.39056
244,2020-10-01,50,2.06165
245,2020-11-01,50,2.0236


In [39]:
median_df

Unnamed: 0,quote_date,price
0,2020-02-01,8.075
1,2020-03-01,4.89
2,2020-04-01,5.0
3,2020-05-01,5.0
4,2020-06-01,5.0
5,2020-07-01,5.0
6,2020-08-01,4.38
7,2020-09-01,4.41
8,2020-10-01,5.25
9,2020-11-01,6.67


In [8]:
points = alt.Chart(df).mark_circle(
    opacity=0.2,
    clip=True,
).encode(
    x=alt.X('quote_date:T', title=''),
    y=alt.Y('price:Q', title='')

)

points

Unnamed: 0,quote_date,shop_code,item_id_raw,region,price,indicator_box,item_id
10615828,202308.0,9802.0,212720,4.0,5.63,,212720
10615830,202306.0,98.0,212720,8.0,7.48,,212720
10615831,202407.0,941.0,212720,5.0,7.50,C,212720
10615832,202109.0,92.0,212720,6.0,3.98,Q,212720
10615835,202211.0,801.0,212720,3.0,8.81,C,212720
...,...,...,...,...,...,...,...
10659940,202107.0,86.0,212720,4.0,5.00,C,212720
10659941,202002.0,77.0,212720,3.0,9.69,C,212720
10659943,202010.0,807.0,212720,3.0,7.50,C,212720
10659951,202307.0,101.0,212720,7.0,4.73,C,212720
