In [26]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Data loading and preprocessing.
# The path is relative, so it resolves against the notebook's working directory.
DATASET_CSV = 'data/amz_uk_price_prediction_dataset.csv'
data = pd.read_csv(DATASET_CSV)

# Preview: the first five rows, followed by the (rows, columns) shape tuple.
display(data.head())
display(data.shape)


In [None]:
# Frequency table for the product category: number of listings per category.
# value_counts() orders the result from most to least frequent by default.
category_column = data['category']
category_freq = category_column.value_counts()
display(category_freq)

In [None]:
# Keep the first five entries of the frequency table, i.e. the five most
# frequent categories, since category_freq comes from value_counts().
top_categories = category_freq.iloc[:5]
display(top_categories)

In [None]:
# Bar chart of the listing counts for the 20 most frequent categories.
top_20 = category_freq.head(20)

fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(x=top_20.index, y=top_20.values, ax=ax)
# Category names are long; turn the tick labels vertical so they do not overlap.
plt.xticks(rotation=90)
ax.set_xlabel('Category')
ax.set_ylabel('Frequency')
plt.show()


In [None]:
# Pie chart of the top categories: each slice is that category's share of the
# listings within this subset (not of the whole catalogue).
pie_ax = top_categories.plot.pie(y='count', autopct='%1.1f%%', figsize=(8, 8), legend=False)
pie_ax.set_ylabel('')  # blank out the y-axis label next to the pie
pie_ax.set_title('Proportion of Top Categories')
plt.show()

# Observation: Sports & Outdoors dominates the listings.



In [None]:
# Measures of centrality for the product price: mean, median, and mode.
price_mean = data['price'].mean()
price_median = data['price'].median()
# Series.mode() returns a Series, because several values can tie for most
# common. price_mode is kept as that Series; only the printed text is flattened.
price_mode = data['price'].mode()
# Interpolating the Series directly into an f-string would print its full repr
# (index column plus a dtype footer) rather than the mode value(s).
mode_text = ", ".join(str(value) for value in price_mode)

# What's the average price point of products listed? How does this compare with the most common price point (mode)?
print(f"Mean Price: {price_mean}", f"Median Price: {price_median}", f"Mode Price: {mode_text}", sep='\n')
# Recorded finding: the average price point of products listed is 89, while the most common price point (mode) is 9.99.


In [None]:
# Measures of dispersion for the product price:
# variance, standard deviation, range, and interquartile range.
prices = data['price']

price_variance = prices.var()
price_std_dev = prices.std()
price_range = prices.max() - prices.min()
price_iqr = prices.quantile(0.75) - prices.quantile(0.25)

# One "label: value" line per statistic.
dispersion_report = [
    f"Variance: {price_variance}",
    f"Standard Deviation: {price_std_dev}",
    f"Range: {price_range}",
    f"Interquartile Range: {price_iqr}",
]
print("\n".join(dispersion_report))

# Q: How varied are the product prices? Are there any indicators of a significant spread in prices?
# A: The variance and standard deviation of the product prices are quite high, indicating a significant spread in prices.


In [None]:
# Histogram of the product prices, split into 30 bins.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(data['price'], bins=30, ax=ax)
ax.set_xlabel('Price')
ax.set_ylabel('Frequency')
plt.show()

# Q: If it's hard to read these diagrams, think about why that is, and explain how it could be solved.
# A: Too many low-priced products make the histogram hard to read. We could plot a subset of the data, or use price ranges.


In [None]:
# Box plot showing the spread and potential outliers in product pricing.
fig, ax = plt.subplots(figsize=(10, 5))
# Pass the prices as x= explicitly. Given positionally, seaborn 0.12+ takes the
# Series as `data` and draws the box vertically, which would leave the 'Price'
# label on the axis that does not carry the prices; older seaborn versions emit
# a FutureWarning asking for a keyword argument. With x= the box is horizontal
# and the price values lie along the labelled x-axis.
sns.boxplot(x=data['price'], ax=ax)
ax.set_xlabel('Price')
plt.show()


In [None]:
""" Business Question: How do customers rate products on Amazon UK, and are there any patterns or tendencies in the ratings?

Measures of Centrality:

Calculate the mean, median, and mode for the rating of products.
How do customers generally rate products? Is there a common trend?
Measures of Dispersion:

Determine the variance, standard deviation, and interquartile range for product rating.
Are the ratings consistent, or is there a wide variation in customer feedback?
Shape of the Distribution:

Calculate the skewness and kurtosis for the rating column.
Are the ratings normally distributed, or do they lean towards higher or lower values?
Visualizations:

Plot a histogram to visualize the distribution of product ratings. Is there a specific rating that is more common? """

# --- Measures of centrality: mean, median, and mode of the product ratings ---

# Rows whose star value is 0 are left out of every rating statistic below.
# NOTE(review): presumably 0 marks a product without any rating yet; confirm
# against the dataset description.
ratings = data[data['stars'] != 0]
rated_stars = ratings['stars']

rating_mean = rated_stars.mean()
rating_median = rated_stars.median()
rating_mode = rated_stars.mode()  # a Series: several values can tie for most common

display(rating_mean, rating_median, rating_mode)
# Finding: products are generally rated around 4.5 stars.

# --- Measures of dispersion: variance, standard deviation, interquartile range ---
rating_variance = rated_stars.var()
rating_std_dev = rated_stars.std()
upper_quartile = rated_stars.quantile(0.75)
lower_quartile = rated_stars.quantile(0.25)
rating_iqr = upper_quartile - lower_quartile

display(rating_variance, rating_std_dev, rating_iqr)

# Q: Are the ratings consistent, or is there a wide variation in customer feedback?
# A: The variance and standard deviation of the ratings are relatively low, indicating that there is not a wide variation in customer feedback.

# --- Shape of the distribution: skewness and kurtosis of the rating column ---
rating_skewness = rated_stars.skew()
rating_kurtosis = rated_stars.kurt()

display(rating_skewness, rating_kurtosis)

# Q: (skewness: -2.28, kurtosis: 9.78) Are the ratings normally distributed, or do they lean towards higher or lower values?
# A: The skewness value of -2.28 indicates that the ratings are negatively skewed, leaning towards higher values.
#    The kurtosis value of 9.78 indicates that the distribution has heavy tails and is leptokurtic.

# --- Graphical representation: histogram of the ratings ---
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(rated_stars, ax=ax)
ax.set_xlabel('Rating')
ax.set_ylabel('Frequency')
plt.show()

# Finding: 4.5 is the most common rating, followed by 5.0.