<a href="https://colab.research.google.com/github/haydenkirkeide/Data-Mining-Final-Project/blob/main/ANOVA_Anon.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Code adapted from: https://www.pythonfordatascience.org/anova-python/

import pandas as pd
import scipy.stats as stats

# Read the survey answers and the trip log from their CSV exports
survey = pd.read_csv("OPT ANON Preliminary Survey.csv")
trips = pd.read_csv("OPT ANON Trip-by-Trip.csv")

# The trip columns arrive labelled "Unnamed: 0" .. "Unnamed: 6"; relabel them
# in positional order. Labels that are absent from the frame are left untouched
# by rename().
trip_column_names = [
    "ID",
    "Store",
    "Date",
    "Cost",
    "Time Spent",
    "Items",
    "Additional Info",
]
trips = trips.rename(
    columns={f"Unnamed: {idx}": label for idx, label in enumerate(trip_column_names)}
)

# Vet trips data
# Keep only rows whose ID contains "#"; all other rows are discarded.
# The explicit .copy() makes `trips` an independent frame, so the column
# assignment below does not write into a slice of the unfiltered frame
# (which raises pandas' SettingWithCopyWarning on pandas < 3).
trips = trips[trips["ID"].astype(str).str.contains("#", na=False)].copy()

# Strip the currency formatting ("$", thousands separators, surrounding
# whitespace) and convert to numbers. Anything that still fails to parse,
# including missing costs, becomes NaN.
cost_text = (
    trips["Cost"]
    .astype(str)
    .str.replace("$", "", regex=False)
    .str.replace(",", "", regex=False)
    .str.strip()
)
trips["Cost"] = pd.to_numeric(cost_text, errors="coerce")

# Record which membership answers are missing BEFORE the string cast:
# astype(str) turns NaN into the literal "nan", which would otherwise be
# indistinguishable from a real answer and end up coded as a non-member.
membership_missing = survey['membership'].isna()

# Normalise the answer text (case and surrounding whitespace) for comparison
survey['membership'] = survey['membership'].astype(str).str.lower().str.strip()

# 1.0 = answered "yes", 0.0 = any other answer, NaN = no answer given.
# Keeping unanswered rows as NaN lets them be excluded from the analysis
# instead of silently inflating the non-member group.
survey['is_member'] = (
    (survey['membership'] == "yes").astype(float).mask(membership_missing)
)

# Attach each trip's membership flag via the participant ID. The left join
# keeps every trip; trips with no matching survey row get is_member = NaN.
membership_lookup = survey[["ID", "is_member"]]
merged = pd.merge(trips, membership_lookup, how="left", on="ID")

# Discard trips that lack either a membership flag or a numeric cost
has_required_values = merged[["is_member", "Cost"]].notna().all(axis=1)
merged = merged[has_required_values]

# Split the trip costs into the two groups being compared.
# NOTE(review): each trip is one observation here, so a participant with
# several trips contributes several data points — confirm this is intended.
member_spending = merged.loc[merged["is_member"].eq(1), "Cost"]
nonmember_spending = merged.loc[merged["is_member"].eq(0), "Cost"]

# One-way ANOVA on trip cost across the two membership groups
spending_groups = (member_spending, nonmember_spending)
f_stat, p_value = stats.f_oneway(*spending_groups)

# Degrees of freedom: (groups - 1) between, (observations - groups) within
n_groups = len(spending_groups)
df_between = n_groups - 1
df_within = sum(len(group) for group in spending_groups) - n_groups

# Eta-squared (share of variance explained), recovered from the F-statistic:
#   eta^2 = F * df_between / (F * df_between + df_within)
effect_term = f_stat * df_between
eta_squared = effect_term / (effect_term + df_within)

# Report group sizes and the test statistics
print("=== ANOVA: Grocery Spending vs Costco Membership ===")
print(f"Number of member trips: {len(member_spending)}")
print(f"Number of non-member trips: {len(nonmember_spending)}\n")

print(f"F-statistic: {f_stat:.4f}")
print(f"P-value:     {p_value:.6f}")
print(f"Eta-squared: {eta_squared:.4f}")

# Interpret the p-value at the 5% significance level
if pd.isna(p_value):
    # A NaN p-value compares False against any threshold, so without this
    # branch an uncomputable test would be reported as "no significant
    # difference". scipy yields NaN when, for example, a group is empty.
    print("\nResult: ANOVA could not be computed (check group sizes and variance).")
elif p_value < 0.05:
    print("\nResult: Significant difference detected between groups.")
else:
    print("\nResult: No statistically significant difference detected.")

=== ANOVA: Grocery Spending vs Costco Membership ===
Number of member trips: 12
Number of non-member trips: 2

F-statistic: 0.3652
P-value:     0.556897
Eta-squared: 0.0295

Result: No statistically significant difference detected.
