In [1]:
import pandas as pd
import re

df = pd.read_csv("cleaned_costco_data.csv")

In [2]:
keywords = ["organic", "gluten free", "gluten-free", "GF", "vegan", "non-GMO", "non GMO", "kosher", "no added preservatives",  "no preservatives", "fat free", "fat-free", "low-fat", "low fat", "less fat", "reduced fat", "unsweetened", "no sugar added", "zero sugar", "sugar-free", "sugar free"]

In [3]:
def count_keywords(text1, text2, keywords):
    combined_text = f"{text1} {text2}".lower() if pd.notnull(text1) and pd.notnull(text2) else str(text1).lower() if pd.notnull(text1) else str(text2).lower()
    return {kw: int(bool(re.search(rf"\b{kw}\b", combined_text))) for kw in keywords}

keyword_counts = df.apply(lambda row: count_keywords(row["Product Description"], row["Feature"], keywords), axis=1)
keyword_df = pd.DataFrame(keyword_counts.tolist())

df = pd.concat([df, keyword_df], axis=1)

In [4]:
price_bins = [0, 15, 30, 50, 100, 200, float('inf')]
price_labels = ["<$15", "$15-30", "$30-50", "$50-100", "$100-200", ">$200"]

df["price_category"] = pd.cut(df["Price"], bins=price_bins, labels=price_labels, right=False)

In [5]:
keyword_groups = {
    "organic": ["organic"],
    "gluten-free": ["gluten free", "gluten-free", "GF"],
    "vegan": ["vegan"],
    "kosher": ["kosher"],
    "non-GMO": ["non-GMO", "non GMO"],
    "no-preservatives": ["no added preservatives", "no preservatives"],
    "low-fat": ["low-fat", "low fat", "reduced fat", "less fat", "fat free", "fat-free"],
    "sugar-free": ["unsweetened", "no sugar added", "zero sugar", "sugar-free", "sugar free"]
}

In [6]:
combined_keyword_counts = pd.DataFrame()

for new_col, old_cols in keyword_groups.items():
    combined_keyword_counts[new_col] = df[old_cols].sum(axis=1)

df_combined = pd.concat([df[["price_category"]], combined_keyword_counts], axis=1)

keyword_summary = df_combined.groupby("price_category").sum().reset_index()

print(keyword_summary)

  price_category  organic  gluten-free  vegan  kosher  non-GMO  \
0           <$15       63           93     21     187        0   
1         $15-30       44           91     15     150        0   
2         $30-50       47           61     15      76        0   
3        $50-100       11           26     11      31        0   
4       $100-200        3           12      1       5        0   
5          >$200        0            5      1       5        0   

   no-preservatives  low-fat  sugar-free  
0                16       15          32  
1                 9       23          23  
2                20        3          14  
3                16        0           2  
4                 9        1           0  
5                 2        0           0  


  keyword_summary = df_combined.groupby("price_category").sum().reset_index()


In [7]:
df_combined.to_csv("updated_keywords_price_data.csv", index=False)