In [62]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency
from scipy import stats
import numpy as np

In [None]:
# Data loading and preprocessing: read the CSV into a DataFrame and preview it.
dataset_path = 'data/amz_uk_price_prediction_dataset.csv'
data = pd.read_csv(dataset_path)

# Show the first rows alongside the (rows, columns) shape.
preview = data.head()
display(preview, data.shape)


In [None]:
# Cross-tabulate product category against the isBestSeller flag (raw counts).
category_col = data['category']
bestseller_col = data['isBestSeller']

crosstab_raw = pd.crosstab(category_col, bestseller_col)
display(crosstab_raw)

# Same table expressed as per-category proportions (each row sums to 1),
# ordered by the share in the `True` column, largest first.
crosstab = (
    pd.crosstab(category_col, bestseller_col, normalize='index')
    .sort_values(by=True, ascending=False)
)
display(crosstab)

# Finding noted by the author: best-seller status is more prevalent in the
# categories 'Grocery' and 'Smart Home Security & Lighting'.



In [None]:
# Chi-square test of independence between product category and best-seller
# status, run on the raw-count contingency table.
chi2, p, dof, ex = chi2_contingency(crosstab_raw)

print(f"Chi2 Statistic: {chi2}")
print(f"P-value: {p}")
print(f"Degrees of Freedom: {dof}")

# Author's reading of the result: the p-value is less than 0.05, so the
# best-seller status is dependent on the product category.

# Cramér's V measures the strength of the association:
#     V = sqrt(chi2 / (n * (min(rows, cols) - 1)))
# `n` is the number of observations that actually entered the contingency
# table, so it is taken from the table itself rather than from the row count
# of `data`.
n = crosstab_raw.to_numpy().sum()
# The "- 1" applies to the smaller table dimension only; it must be
# parenthesised so it is not subtracted from the product n * min(...).
min_dim = min(crosstab_raw.shape) - 1
# A table with a single row or column has no association to measure.
cramers_v = np.sqrt(chi2 / (n * min_dim)) if min_dim > 0 else np.nan

print(f"Cramér's V: {cramers_v}")

# Author's reading: Cramér's V indicates a low association between the
# category and best-seller status (re-check against the printed value).



In [None]:
# Stacked bar chart of the best-seller proportions for the first 100 rows of
# the sorted proportion table (one bar per product category).
top_rows = crosstab.head(100)
ax = top_rows.plot(kind='bar', stacked=True, figsize=(12, 8))
ax.set_xlabel('Product Category')
ax.set_ylabel('Proportion')
ax.legend(title='Best-Seller Status')
plt.show()


In [None]:
# Drop price outliers using the 1.5 * IQR rule.
price = data['price']
Q1 = price.quantile(0.25)
Q3 = price.quantile(0.75)
IQR = Q3 - Q1

# Fences: anything outside [lower_bound, upper_bound] counts as an outlier.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Keep only the rows whose price lies within the fences (both ends inclusive).
within_fences = price.between(lower_bound, upper_bound)
filtered_data = data[within_fences]
display(filtered_data.shape)


In [None]:
# The 20 most frequent categories, counted on the full dataset.
category_counts = data['category'].value_counts()
top_20_categories = category_counts.nlargest(20).index
print(top_20_categories)

# Restrict the outlier-free data to those categories.
in_top_20 = filtered_data['category'].isin(top_20_categories)
filtered_top_20_data = filtered_data[in_top_20]

# Violin plot: price distribution for each of the selected categories.
plt.figure(figsize=(15, 10))
ax = sns.violinplot(data=filtered_top_20_data, x='category', y='price')
plt.xticks(rotation=90)
ax.set_xlabel('Product Category')
ax.set_ylabel('Price')
plt.show()

# Median price per category over the full (unfiltered) dataset, highest first.
median_prices = (
    data.groupby('category')['price']
    .median()
    .sort_values(ascending=False)
)
display(median_prices)

# Category holding the highest median price, and that price.
highest_median_category = median_prices.idxmax()
highest_median_price = median_prices.max()

print(f"The product category with the highest median price is '{highest_median_category}' with a median price of {highest_median_price}.")


In [None]:
# Mean price per category over the full dataset, highest first.
average_prices = (
    data.groupby('category')['price']
    .mean()
    .sort_values(ascending=False)
)

# The 10 most frequent categories.
category_counts = data['category'].value_counts()
top_10_categories = category_counts.nlargest(10).index

# Mean price of just those 10 categories (kept in frequency order).
top_10_average_prices = average_prices[top_10_categories]

# Bar chart of the mean prices.
plt.figure(figsize=(12, 8))
ax = top_10_average_prices.plot(kind='bar')
ax.set_xlabel('Product Category')
ax.set_ylabel('Average Price')
plt.show()

# Category with the highest mean price among the 10, and that price.
highest_avg_price_category = top_10_average_prices.idxmax()
highest_avg_price = top_10_average_prices.max()

print(f"The product category with the highest average price is '{highest_avg_price_category}' with an average price of {highest_avg_price:.2f}.")


In [None]:
# Keep only products with a positive star rating; rows with stars == 0 are
# excluded from every rating statistic below.
ratings = data[data['stars'] > 0]

# Build the category mask from `ratings` itself so its index matches the frame
# being filtered (a mask built from `data` has a different index and makes
# pandas warn and reindex implicitly).
filtered_top_10_ratings = ratings[ratings['category'].isin(top_10_categories)]

# Side-by-side box plots: rating distribution for each of the top 10 categories.
plt.figure(figsize=(15, 10))
sns.boxplot(x='category', y='stars', data=filtered_top_10_ratings)
plt.xticks(rotation=90)
plt.xlabel('Product Category')
plt.ylabel('Rating')
plt.show()

# Median rating per category, highest first. Computed on `ratings` so the
# zero-star rows dropped above do not pull the medians down.
median_ratings = ratings.groupby('category')['stars'].median().sort_values(ascending=False)
highest_median_rating_category = median_ratings.idxmax()
highest_median_rating = median_ratings.max()

print(f"The product category with the highest median rating is '{highest_median_rating_category}' with a median rating of {highest_median_rating}.")


In [None]:
# --- Average price of the 10 most frequent categories ---

# The 10 most frequent categories.
category_counts = data['category'].value_counts()
top_10_categories = category_counts.nlargest(10).index

# Mean price for those categories, looked up in the per-category means.
top_10_average_prices = average_prices[top_10_categories]

# Bar chart of the mean prices.
plt.figure(figsize=(12, 8))
ax = top_10_average_prices.plot(kind='bar')
ax.set_xlabel('Product Category')
ax.set_ylabel('Average Price')
plt.show()

# Category with the highest mean price among the 10, and that price.
highest_avg_price_category = top_10_average_prices.idxmax()
highest_avg_price = top_10_average_prices.max()

print(f"The product category with the highest average price is '{highest_avg_price_category}' with an average price of {highest_avg_price:.2f}.")


# --- Ratings of the 10 most frequent categories ---

# Side-by-side box plots of the rating distribution per category.
plt.figure(figsize=(15, 10))
ax = sns.boxplot(data=filtered_top_10_ratings, x='category', y='stars')
plt.xticks(rotation=90)
ax.set_xlabel('Product Category')
ax.set_ylabel('Rating')
plt.show()

# Median rating per category over the rated products only, highest first.
median_ratings = (
    ratings.groupby('category')['stars']
    .median()
    .sort_values(ascending=False)
)
highest_median_rating_category = median_ratings.idxmax()
highest_median_rating = median_ratings.max()

print(f"The product category with the highest median rating is '{highest_median_rating_category}' with a median rating of {highest_median_rating}.")


In [None]:
# Correlation between price and star rating, computed with Series.corr's
# default method on the rated products only.
price_values = ratings['price']
star_values = ratings['stars']
correlation_coefficient = price_values.corr(star_values)
print(f"Correlation Coefficient between price and stars: {correlation_coefficient}")
# Author's reading: the coefficient is close to zero, i.e. a weak relationship.



In [None]:

# Scatter plot of product rating against price. The figure opened here was
# previously left empty; the plot is now actually drawn and shown.
plt.figure(figsize=(12, 8))
sns.scatterplot(x='stars', y='price', data=ratings)
plt.xlabel('Rating')
plt.ylabel('Price')
plt.title('Product Rating vs. Price')
plt.show()

# Keep only the numeric columns used for the correlation matrix.
numeric_data = ratings[['price', 'stars', 'reviews','boughtInLastMonth']]
display(numeric_data.head())
# Pairwise correlations between the selected numeric columns.
correlation_matrix = numeric_data.corr()

# Heatmap of the correlation matrix, annotated with two-decimal values.
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.show()

# QQ plot of price against a normal distribution, to check how closely the
# prices follow it.
plt.figure(figsize=(12, 8))
stats.probplot(ratings['price'], dist="norm", plot=plt)
plt.show()