In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency
from scipy import stats

# Read the product CSV (path is relative to the notebook's working directory)
# into the DataFrame that every later cell works from.
df = pd.read_csv(filepath_or_buffer="amazon_uk_product_data.csv")


In [None]:
# Contingency table of counts: one row per category, one column per
# distinct isBestSeller value.
crosstab = pd.crosstab(index=df["category"], columns=df["isBestSeller"])
crosstab.head(5)


In [None]:
# Mean of the isBestSeller flag per category, largest first. With a
# boolean / 0-1 flag this is the share of best sellers in the category
# (NOTE(review): confirm the column's dtype).
best_seller_ratio = df.groupby("category")["isBestSeller"].mean()
best_seller_ratio = best_seller_ratio.sort_values(ascending=False)

best_seller_ratio.head(10)


In [None]:
# Chi-square test of independence on the category x isBestSeller table.
# Keeps the statistic, p-value, degrees of freedom and expected counts;
# the statistic is reused by the Cramér's V cell.
chi2, p, dof, expected = chi2_contingency(observed=crosstab)

(chi2, p)


In [None]:
# Cramér's V effect size: sqrt(chi2 / (n * (k - 1))), where n is the total
# number of observations in the table and k is its smaller dimension.
n = crosstab.sum().sum()
smaller_dim = min(crosstab.shape)
cramers_v = np.sqrt(chi2 / (n * (smaller_dim - 1)))

cramers_v


In [None]:
# Divide every row by its own total so each category's counts become
# proportions that sum to 1.
crosstab_norm = crosstab.div(crosstab.sum(axis="columns"), axis="index")

# Stacked bars: one bar per category, split by isBestSeller value.
crosstab_norm.plot.bar(stacked=True).set(
    title="Best Seller Distribution by Category",
    ylabel="Proportion",
)
plt.show()


In [None]:
# Tukey fences: keep only rows whose price lies within 1.5 * IQR of the
# first and third quartiles (both bounds inclusive).
Q1 = df["price"].quantile(q=0.25)
Q3 = df["price"].quantile(q=0.75)
IQR = Q3 - Q1

lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# between() is inclusive on both ends by default; rows with a missing
# price fail the test and are dropped as well.
df_clean = df.loc[df["price"].between(lower, upper)]


In [None]:
# Violin plot of price for the 20 categories with the most rows in the
# outlier-filtered data.
top20 = df_clean["category"].value_counts().head(20).index
df_top20 = df_clean[df_clean["category"].isin(top20)]

plt.figure(figsize=(12, 6))
sns.violinplot(data=df_top20, x="category", y="price")
# Titled like the notebook's other figures so the plot stands on its own.
plt.title("Price Distribution - Top 20 Categories")
plt.xticks(rotation=90)  # category names are long; rotate to avoid overlap
plt.show()


In [None]:
# Five categories with the highest median price in the outlier-filtered data.
(
    df_clean.groupby("category")["price"]
    .median()
    .sort_values(ascending=False)
    .head()
)


In [None]:
# The 10 categories with the most rows in the outlier-filtered data.
top10 = df_clean["category"].value_counts().head(10).index

# Mean price within each of those categories, most expensive first.
avg_price = (
    df_clean.loc[df_clean["category"].isin(top10)]
    .groupby("category")["price"]
    .mean()
    .sort_values(ascending=False)
)

avg_price.plot.bar().set_title("Average Price - Top 10 Categories")
plt.show()


In [None]:
# Five categories with the highest mean price in the outlier-filtered data.
(
    df_clean.groupby("category")["price"]
    .mean()
    .sort_values(ascending=False)
    .head()
)


In [None]:
# Box plot of star ratings for the 10 categories with the most rows in the
# outlier-filtered data.
top10 = df_clean["category"].value_counts().head(10).index
df_top10 = df_clean[df_clean["category"].isin(top10)]

plt.figure(figsize=(12, 6))
sns.boxplot(data=df_top10, x="category", y="stars")
# Titled like the notebook's other figures so the plot stands on its own.
plt.title("Rating Distribution - Top 10 Categories")
plt.xticks(rotation=90)  # category names are long; rotate to avoid overlap
plt.show()


In [None]:
# Five categories with the highest median star rating in the
# outlier-filtered data.
(
    df_clean.groupby("category")["stars"]
    .median()
    .sort_values(ascending=False)
    .head()
)


In [None]:
# Pearson correlation (the pandas default, spelled out here) between price
# and star rating on the outlier-filtered data.
correlation = df_clean["price"].corr(other=df_clean["stars"], method="pearson")
correlation


In [None]:
# Scatter of price against star rating on the outlier-filtered data.
plt.figure()
# Low alpha so dense regions of overlapping points remain distinguishable.
plt.scatter(df_clean["stars"], df_clean["price"], alpha=0.3)
# Titled like the notebook's other figures so the plot stands on its own.
plt.title("Price vs. Rating")
plt.xlabel("Rating")
plt.ylabel("Price")
plt.show()


In [None]:
# Heatmap of pairwise correlations between the numeric columns of the
# outlier-filtered data.
plt.figure()
sns.heatmap(
    df_clean.corr(numeric_only=True),
    annot=True,
    cmap="coolwarm",
    # Pin the colour scale to the full correlation range so the neutral
    # midpoint of the diverging colormap corresponds to zero correlation,
    # rather than to the middle of whatever values occur in the data.
    vmin=-1,
    vmax=1,
)
# Titled like the notebook's other figures so the plot stands on its own.
plt.title("Correlation Heatmap")
plt.show()


In [None]:
# Normal Q-Q plot of the outlier-filtered prices. probplot draws through
# pyplot, so the axes it creates are current when the title is set.
stats.probplot(x=df_clean["price"], dist="norm", plot=plt)
plt.gca().set_title("QQ Plot - Price")
plt.show()
