In [104]:
import glob
import json
import pandas as pd
import matplotlib.pyplot as plt
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
import re

import sys
sys.path.append('src/')


from webscraper import parse_url_info, drop_duplicate_product_urls

In [105]:
# thank you chatgpt <3
def parse_volume_string(input_string):
    pattern = r'(\d+(\.\d+)?)\s*(\w+)\s+(\d+(\.\d+)?)\s*(\w+)\s*(.*)'
    matches = re.search(pattern, input_string)
    if matches:
        amount1 = matches.group(1)
        unit1 = matches.group(3)
        amount2 = matches.group(4)
        unit2 = matches.group(6)
        trailing_text = matches.group(7).strip()
        return amount1, unit1, amount2, unit2, trailing_text
    return None

def parse_single_volume(input_string):
    pattern = r'\s*(\d+(\.\d*)?|\.\d+)\s*(\w+)\s*'
    matches = re.match(pattern, input_string)
    if matches:
        amount = matches.group(1)
        unit = matches.group(3)
        return amount, unit
    return None

def shorthand_amt_conversion(value):
    if value:
        return value.replace(".","").replace("K","00").replace("M","000000")
    return None

In [106]:
product_files = glob.glob("data/products/*")
products = []
for product_file in product_files:
    with open(product_file) as file:
        products.append(pd.json_normalize(json.loads(file.read())))
        
df_products = pd.concat(products, axis=0)
df_products = df_products[df_products['product_name'].notnull()]
df_products = df_products.reset_index(drop=True).reset_index().rename(columns={'index':'internal_product_id'})

In [108]:
# product ratings
df_products['rating'] = df_products['rating'].str.replace("width:","")
df_products['rating'] = df_products['rating'].str.replace("%","")
df_products['rating'] = df_products['rating'].astype(float)
df_products['rating'] = df_products['rating']/100 * 5

df_products['n_loves'] = df_products['n_loves'].apply(shorthand_amt_conversion)
df_products['n_loves'] = df_products['n_loves'].astype(float)

df_products['product_reviews'] = df_products['product_reviews'].apply(shorthand_amt_conversion)
df_products['product_reviews'] = df_products['product_reviews'].astype(float)


product_options = []
for product in df_products.iterrows():
    url = product[1]['url']
    df_options = pd.json_normalize(product[1]['options'])
    df_options['url'] = url
    product_options.append(df_options)
    
df_products = df_products.merge(pd.concat(product_options), how='left', on='url')

df_products['name'] =  df_products['name'].str.lower()
df_products['size'] =  df_products['size'].str.lower()

df_products['price'] = df_products['price'].str.replace("$","")
df_products['price'] = df_products['price'].astype(float)

df_products['out_of_stock'] = df_products['name'].str.contains('out of stock')
df_products['limited_edition'] = df_products['name'].str.contains('limited edition')
df_products['new_product'] = df_products['name'].str.contains('new')
df_products['few_left'] = df_products['name'].str.contains('only a few left')
df_products['sale'] = df_products['name'].str.contains('sale')
df_products['refill'] = df_products['name'].str.contains('refill')

df_products.loc[df_products['size'].isna(), 'size'] = df_products['name']
df_products.loc[df_products['size']==df_products['name'],'name'] = None

misc_text = {
    "out of stock":"",
    "limited edition":"",
    "new":"",
    "only a few left":"",
    "sale":"",
    "size":"",
    "refill":"",
    "color":"",
    ":":"",
    "-":"",
    "mini":"",
    "fl oz":"floz",
    "fl. oz":"floz",
    "oz.":"oz ",
    "/":" ",
    r'\s+': ' '
}

def series_replace(df, ids):
    df['size'] = df['size'].replace(ids, regex=True)
    return df

df_products =  series_replace(df_products,misc_text)

df_products['sku'] = df_products['sku'].str.replace("Item ","")

df_products['size'] = df_products['size'].fillna("")

df_products['amount_a'], df_products['unit_a'], df_products['amount_b'], df_products['unit_b'], df_products['misc_info'] = df_products['size'].apply(parse_volume_string).str
df_products[['amount_a','amount_b']] = df_products[['amount_a','amount_b']].astype(float)

df_products['amount_single'], df_products['unit_single'] = df_products[df_products['amount_a'].isna()]['size'].apply(parse_single_volume).str

df_products['url_path'], df_products['url_sku'], df_products['url_params'] =  df_products['url'].apply(parse_url_info).str
df_products = df_products.drop_duplicates(subset=['brand_name','product_name','url_path', 'sku','price'], keep='last')
df_products['amount_single']= df_products['amount_single'].astype(float)

df_products.loc[df_products['amount_a'].isna(), 'amount_a'] = df_products['amount_single']
df_products.loc[df_products['amount_a'].isna(), 'unit_a'] = df_products['unit_single']

df_products = df_products.drop(['amount_single','unit_single'],axis=1)

df_products['categories'] = df_products['categories'].fillna('   ')
df_products['lvl_0_cat'], df_products['lvl_1_cat'], df_products['lvl_2_cat'] = df_products['categories'].str

df_products['swatch_group'] = df_products['swatch_group'].str.replace('Matte finish - ',"")
df_products['swatch_group'] = df_products['swatch_group'].str.replace('Natural finish - ',"")


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.


Columnar iteration over characters will be deprecated in future releases.


Columnar iteration over characters will be deprecated in future releases.


Columnar iteration over characters will be deprecated in future releases.


Columnar iteration over characters will be deprecated in future releases.



In [16]:
df_products_tree = df_products.groupby(['lvl_0_cat', 'lvl_1_cat', 'lvl_2_cat','brand_name'], as_index=False)['url'].count()

In [17]:
import plotly.express as px
df = px.data.tips()
fig = px.treemap(df_products_tree, path=[px.Constant("all"), 'lvl_0_cat', 'lvl_1_cat', 'lvl_2_cat','brand_name'], values='url')
fig.update_traces(root_color="lightgrey")
fig.update_layout(margin = dict(t=50, l=25, r=25, b=25))
fig.show()

In [18]:
df_products[(df_products['lvl_2_cat']=='Concealer') & (df_products['brand_name']=='Glossier')]

Unnamed: 0,internal_product_id,url,product_name,brand_name,options,rating,product_reviews,ingredients,n_loves,categories,swatch_group,size,name,price,sku,out_of_stock,limited_edition,new_product,few_left,sale,refill,amount_a,unit_a,amount_b,unit_b,misc_info,amount_single,unit_single,url_path,url_sku,url_params,lvl_0_cat,lvl_1_cat,lvl_2_cat
14190,4924,https://www.sephora.com/ca/en/product/glossier...,Stretch Concealer for Dewy Buildable Coverage,Glossier,[{'swatch_group': 'Radiant finish - Standard s...,4.2775,191.0,-Beeswax and Microcrystalline Wax: Give concea...,41900.0,"[Makeup, Face, Concealer]",Radiant finish - Standard size,0.17 oz 4.8 g,color: g1 - deepest,26.0,2650018,False,False,False,False,False,False,0.17,oz,4.8,g,,,,/ca/en/product/glossier-stretch-concealer-for-...,2650018,"{'skuId': ['2650018'], 'icid2': ['products gri...",Makeup,Face,Concealer
14191,4924,https://www.sephora.com/ca/en/product/glossier...,Stretch Concealer for Dewy Buildable Coverage,Glossier,[{'swatch_group': 'Radiant finish - Standard s...,4.2775,191.0,-Beeswax and Microcrystalline Wax: Give concea...,41900.0,"[Makeup, Face, Concealer]",Radiant finish - Standard size,0.17 oz 4.8 g,color: g2 - very deep warm,26.0,2650026,False,False,False,False,False,False,0.17,oz,4.8,g,,,,/ca/en/product/glossier-stretch-concealer-for-...,2650018,"{'skuId': ['2650018'], 'icid2': ['products gri...",Makeup,Face,Concealer
14192,4924,https://www.sephora.com/ca/en/product/glossier...,Stretch Concealer for Dewy Buildable Coverage,Glossier,[{'swatch_group': 'Radiant finish - Standard s...,4.2775,191.0,-Beeswax and Microcrystalline Wax: Give concea...,41900.0,"[Makeup, Face, Concealer]",Radiant finish - Standard size,0.17 oz 4.8 g,color: g3 - very deep,26.0,2650133,False,False,False,False,False,False,0.17,oz,4.8,g,,,,/ca/en/product/glossier-stretch-concealer-for-...,2650018,"{'skuId': ['2650018'], 'icid2': ['products gri...",Makeup,Face,Concealer
14193,4924,https://www.sephora.com/ca/en/product/glossier...,Stretch Concealer for Dewy Buildable Coverage,Glossier,[{'swatch_group': 'Radiant finish - Standard s...,4.2775,191.0,-Beeswax and Microcrystalline Wax: Give concea...,41900.0,"[Makeup, Face, Concealer]",Radiant finish - Standard size,0.17 oz 4.8 g,color: g4 - deep cool,26.0,2650141,False,False,False,False,False,False,0.17,oz,4.8,g,,,,/ca/en/product/glossier-stretch-concealer-for-...,2650018,"{'skuId': ['2650018'], 'icid2': ['products gri...",Makeup,Face,Concealer
14194,4924,https://www.sephora.com/ca/en/product/glossier...,Stretch Concealer for Dewy Buildable Coverage,Glossier,[{'swatch_group': 'Radiant finish - Standard s...,4.2775,191.0,-Beeswax and Microcrystalline Wax: Give concea...,41900.0,"[Makeup, Face, Concealer]",Radiant finish - Standard size,0.17 oz 4.8 g,color: g5 - deep warm,26.0,2650158,False,False,False,False,False,False,0.17,oz,4.8,g,,,,/ca/en/product/glossier-stretch-concealer-for-...,2650018,"{'skuId': ['2650018'], 'icid2': ['products gri...",Makeup,Face,Concealer
14195,4924,https://www.sephora.com/ca/en/product/glossier...,Stretch Concealer for Dewy Buildable Coverage,Glossier,[{'swatch_group': 'Radiant finish - Standard s...,4.2775,191.0,-Beeswax and Microcrystalline Wax: Give concea...,41900.0,"[Makeup, Face, Concealer]",Radiant finish - Standard size,0.17 oz 4.8 g,color: g6 - deep neutral,26.0,2650166,False,False,False,False,False,False,0.17,oz,4.8,g,,,,/ca/en/product/glossier-stretch-concealer-for-...,2650018,"{'skuId': ['2650018'], 'icid2': ['products gri...",Makeup,Face,Concealer
14196,4924,https://www.sephora.com/ca/en/product/glossier...,Stretch Concealer for Dewy Buildable Coverage,Glossier,[{'swatch_group': 'Radiant finish - Standard s...,4.2775,191.0,-Beeswax and Microcrystalline Wax: Give concea...,41900.0,"[Makeup, Face, Concealer]",Radiant finish - Standard size,0.17 oz 4.8 g,color: g7 - medium deep,26.0,2650174,False,False,False,False,False,False,0.17,oz,4.8,g,,,,/ca/en/product/glossier-stretch-concealer-for-...,2650018,"{'skuId': ['2650018'], 'icid2': ['products gri...",Makeup,Face,Concealer
14197,4924,https://www.sephora.com/ca/en/product/glossier...,Stretch Concealer for Dewy Buildable Coverage,Glossier,[{'swatch_group': 'Radiant finish - Standard s...,4.2775,191.0,-Beeswax and Microcrystalline Wax: Give concea...,41900.0,"[Makeup, Face, Concealer]",Radiant finish - Standard size,0.17 oz 4.8 g,color: g8 - medium neutral,26.0,2650182,False,False,False,False,False,False,0.17,oz,4.8,g,,,,/ca/en/product/glossier-stretch-concealer-for-...,2650018,"{'skuId': ['2650018'], 'icid2': ['products gri...",Makeup,Face,Concealer
14198,4924,https://www.sephora.com/ca/en/product/glossier...,Stretch Concealer for Dewy Buildable Coverage,Glossier,[{'swatch_group': 'Radiant finish - Standard s...,4.2775,191.0,-Beeswax and Microcrystalline Wax: Give concea...,41900.0,"[Makeup, Face, Concealer]",Radiant finish - Standard size,0.17 oz 4.8 g,color: g9 - medium warm,26.0,2648954,False,False,False,False,False,False,0.17,oz,4.8,g,,,,/ca/en/product/glossier-stretch-concealer-for-...,2650018,"{'skuId': ['2650018'], 'icid2': ['products gri...",Makeup,Face,Concealer
14199,4924,https://www.sephora.com/ca/en/product/glossier...,Stretch Concealer for Dewy Buildable Coverage,Glossier,[{'swatch_group': 'Radiant finish - Standard s...,4.2775,191.0,-Beeswax and Microcrystalline Wax: Give concea...,41900.0,"[Makeup, Face, Concealer]",Radiant finish - Standard size,0.17 oz 4.8 g,color: g10 - light to medium,26.0,2648962,False,False,False,False,False,False,0.17,oz,4.8,g,,,,/ca/en/product/glossier-stretch-concealer-for-...,2650018,"{'skuId': ['2650018'], 'icid2': ['products gri...",Makeup,Face,Concealer


In [88]:
# df_products['unit_a'].value_counts()
mascara_df = df_products[(df_products['lvl_2_cat']=='Mascara') & (df_products['unit_a']=='oz')].groupby(['brand_name','product_name','swatch_group'], as_index=False).agg({
    'price':'first',
    'amount_a':'first',
    'unit_a':'first'
})

In [92]:
fig = px.scatter(mascara_df, x="amount_a", y="price", color="swatch_group", hover_data=['brand_name', 'product_name'])
fig.show()

In [90]:
df_products[(df_products['brand_name']=='ONE/SIZE by Patrick Starrr') &(df_products['lvl_2_cat']=='Mascara')]

Unnamed: 0,internal_product_id,url,product_name,brand_name,options,rating,product_reviews,ingredients,n_loves,categories,swatch_group,size,name,price,sku,out_of_stock,limited_edition,new_product,few_left,sale,refill,amount_a,unit_a,amount_b,unit_b,misc_info,url_path,url_sku,url_params,lvl_0_cat,lvl_1_cat,lvl_2_cat
18988,6498,https://www.sephora.com/ca/en/product/mini-fan...,Mini Fantasize Lifting & Lengthening Mascara,ONE/SIZE by Patrick Starrr,"[{'swatch_group': 'Mini size', 'size': None, '...",4.0705,312.0,-Castor Seed Oil: Known to nourish and visibly...,9600.0,"[Makeup, Eye, Mascara]",Mini size,26 oz 6 ml black,,19.0,2589620,False,False,False,False,False,False,26.0,oz,6.0,ml,black,/ca/en/product/mini-fantasize-lifting-lengthen...,2589620,"{'skuId': ['2589620'], 'icid2': ['products gri...",Makeup,Eye,Mascara
19032,6506,https://www.sephora.com/ca/en/product/fantasiz...,Fantasize Lifting & Lengthening Mascara,ONE/SIZE by Patrick Starrr,"[{'swatch_group': 'Standard size', 'size': Non...",4.2731,443.0,-Castor Seed Oil: Known to nourish and visibly...,13500.0,"[Makeup, Eye, Mascara]",Standard size,.46 oz 9 ml black,,33.0,2589612,False,False,False,False,False,False,46.0,oz,9.0,ml,black,/ca/en/product/fantasize-lifting-lengthening-m...,2589612,"{'skuId': ['2589612'], 'icid2': ['products gri...",Makeup,Eye,Mascara
19033,6506,https://www.sephora.com/ca/en/product/fantasiz...,Fantasize Lifting & Lengthening Mascara,ONE/SIZE by Patrick Starrr,"[{'swatch_group': 'Standard size', 'size': Non...",4.2731,443.0,-Castor Seed Oil: Known to nourish and visibly...,13500.0,"[Makeup, Eye, Mascara]",Mini size,26 oz 6 ml black,,19.0,2589620,False,False,False,False,False,False,26.0,oz,6.0,ml,black,/ca/en/product/fantasize-lifting-lengthening-m...,2589612,"{'skuId': ['2589612'], 'icid2': ['products gri...",Makeup,Eye,Mascara


In [98]:
df_products['out_of_stock'].value_counts()

False    19044
True       963
Name: out_of_stock, dtype: int64