In [None]:
import numpy as np  # For numerical operations and handling arrays
import pandas as pd  # For data manipulation and analysis (DataFrames)
import matplotlib.pyplot as plt  # For plotting static, interactive, and animated visualizations
import seaborn as sns  # For advanced data visualization (built on top of matplotlib)

In [None]:
# Location of the CSV dataset. This is a placeholder: point it at the real
# file before running the notebook.
filepath = "path/to/your_dataset.csv"  # TODO: set your dataset path
df = pd.read_csv(filepath)  # Load the CSV data into a DataFrame
print(df.head())  # Display the first 5 rows

In [None]:
# Inspect 'retail_price': its dtype, the first 20 distinct values, and the column itself
retail_price = df["retail_price"]
print(retail_price.dtype)
print(retail_price.unique()[:20])
print(retail_price)

In [None]:
# Number of missing values per column (isna is the alias of isnull)
print(df.isna().sum())

In [None]:
# Show the column labels of the dataset
print(df.columns)

In [None]:
# Impute the two price columns with their own median
for price_col in ("retail_price", "price"):
    df[price_col] = df[price_col].fillna(df[price_col].median())

# Missing crawl months get the literal placeholder 'UNKNOWN'
df["crawl_month"] = df["crawl_month"].fillna("UNKNOWN")

In [None]:
# Print the dtype of 'crawl_month' and then of 'units_sold'
for column in ("crawl_month", "units_sold"):
    print(df[column].dtype)

In [None]:
# Preview the first 10 rows of the dataset
print(df.head(n=10))

In [None]:
# Show the dtype of every column in the dataset
print(df.dtypes)

In [None]:
# Bin 'retail_price' into (up to) 10 quantile bins.
# duplicates="drop" merges repeated quantile edges -- which can occur when many
# rows share the same price, e.g. after median imputation -- instead of raising
# "Bin edges must be unique". With unique edges the result is unchanged.
df["retail_price_bin"] = pd.qcut(df["retail_price"], q=10, duplicates="drop")

# Total units sold per price bin; observed=True skips bins that contain no rows
sales_grouped = df.groupby("retail_price_bin", observed=True)["units_sold"].sum()

plt.figure(figsize=(12, 5))  # Set the figure size for the plot
# Interval labels are converted to strings so they can be used as x tick labels
plt.plot(sales_grouped.index.astype(str), sales_grouped.values, marker='o', linestyle='-', color='b')
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
plt.xlabel("Retail_Price")  # Set the x-axis label
plt.ylabel("Units Sold")  # Set the y-axis label
plt.title("Retail Price and Units Sold Plot")  # Set the title of the plot
plt.savefig("retail_price_vs_units_sold.png", dpi=300, bbox_inches="tight")  # Save as high-resolution PNG
plt.show()  # Display the plot

In [None]:
# Split 'retail_price' into 10 equal-width bins
df["price_bin"] = pd.cut(df["retail_price"], bins=10)

plt.figure(figsize=(15, 8))  # Set the figure size for the plot
# Bar height = sum of 'price' within each bin.
# The hue follows the bins themselves (one colour per bar). Using the raw
# 'retail_price' column as hue would create one hue level per distinct price.
price_ax = sns.barplot(
    data=df,
    x="price_bin",
    y="price",
    hue="price_bin",
    estimator=np.sum,
    palette="coolwarm",
    dodge=False,
)
# The hue duplicates the x axis, so a legend would add nothing; drop it if one was drawn
price_legend = price_ax.get_legend()
if price_legend is not None:
    price_legend.remove()
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
plt.xlabel("retail_price bins")  # Set the x-axis label
plt.ylabel("price")  # Set the y-axis label
plt.title("total price vs. retail price bins")  # Set the title of the plot
plt.savefig("total_price_vs_retail_price_bins.png", dpi=300, bbox_inches="tight")  # Save as high-resolution PNG
plt.show()  # Display the plot

In [None]:
# Keep only the numeric columns; correlations are undefined for the rest
df_numeric = df.select_dtypes(include=[np.number])

# Absolute pairwise correlations, so strong negative relationships count too
corr_matrix = df_numeric.corr().abs()
# Boolean frame marking the cells whose |corr| exceeds 0.5
strong_corr = corr_matrix > 0.5

# Long-format table of the strong pairs, with self-correlations removed
filtered_corr = corr_matrix[strong_corr].stack().reset_index()
filtered_corr = filtered_corr[filtered_corr['level_0'] != filtered_corr['level_1']]

plt.figure(figsize=(20, 10))  # Set the figure size for the heatmap
# Mask every cell that is not a strong correlation so the figure actually shows
# the filtered matrix its title announces (the diagonal, |corr| = 1, stays visible)
sns.heatmap(corr_matrix, mask=~strong_corr, annot=True, cmap="coolwarm", linewidths=0.5)
plt.title("Filtered Correlation Matrix (|corr| > 0.5)")  # Set the title of the heatmap
plt.savefig("correlation_heatmap.png", dpi=300, bbox_inches="tight")  # Save as high-resolution PNG
plt.show()  # Display the heatmap

In [None]:
# Total units sold per product colour, highest first; keep the ten best sellers
units_per_color = df.groupby("product_color")["units_sold"].sum()
top_products = units_per_color.sort_values(ascending=False).head(10)
print(top_products)

In [None]:
# Horizontal bar chart of the ten colours with the highest total units sold
plt.figure(figsize=(12, 8))
top_axis = sns.barplot(
    x=top_products.values,
    y=top_products.index,
    hue=top_products.index,
    palette="viridis",
    dodge=False,
)
top_axis.set_xlabel("Total Units Sold")
top_axis.set_ylabel("Product Color")
top_axis.set_title("Top 10 best selling Product Color")
# Write the chart to disk as a high-resolution PNG, then render it
plt.savefig("top_10_best_selling_product_colors.png", dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Total units sold per product colour, lowest first; keep the ten weakest sellers
units_per_color_asc = df.groupby("product_color")["units_sold"].sum().sort_values(ascending=True)
worst_products = units_per_color_asc.head(10)
print(worst_products)

In [None]:
# Horizontal bar chart of the ten colours with the lowest total units sold
plt.figure(figsize=(12, 8))
worst_axis = sns.barplot(
    x=worst_products.values,
    y=worst_products.index,
    hue=worst_products.index,
    palette="coolwarm",
    dodge=False,
)
worst_axis.set_xlabel("Total Units Sold")
worst_axis.set_ylabel("Product Color")
worst_axis.set_title("Bottom 10 worst selling Product Color")
# Write the chart to disk as a high-resolution PNG, then render it
plt.savefig("bottom_10_worst_selling_product_colors.png", dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Sum of 'rating_five_count' per product colour, highest first; keep the top ten
five_star_per_color = df.groupby("product_color")["rating_five_count"].sum()
best_ratings = five_star_per_color.sort_values(ascending=False).head(10)
print(best_ratings)

In [None]:
plt.figure(figsize=(12, 8))  # Set the figure size for the plot
# Horizontal bar plot of the ten colours with the most five-star ratings
sns.barplot(x=best_ratings.values, y=best_ratings.index, palette="coolwarm", hue=best_ratings.index, dodge=False)
# Labels are applied after drawing, matching the other bar-chart cells, so the
# plotting call has no chance to replace them
plt.xlabel("Best Ratings")  # Set the x-axis label
plt.ylabel("Product Color")  # Set the y-axis label
plt.title("Top 10 best rated Product Color")  # Set the title of the plot
plt.savefig("top_10_best_rated_product_colors.png", dpi=300, bbox_inches="tight")  # Save as high-resolution PNG
plt.show()  # Display the plot

In [None]:
# Show the 'crawl_month' column
print(df["crawl_month"])

In [None]:
# Convert 'crawl_month' (expected as 'YYYY-MM') to datetime.
# errors="coerce" turns values that do not match the format -- such as the
# 'UNKNOWN' placeholder used when filling missing months -- into NaT instead of
# raising ValueError and aborting the notebook.
df["crawl_month"] = pd.to_datetime(df["crawl_month"], format="%Y-%m", errors="coerce")
# Print the data type of the 'crawl_month' column after conversion
print(df["crawl_month"].dtype)

In [None]:
# Derive calendar 'year' and 'month' columns from the datetime 'crawl_month'
crawl_dt = df["crawl_month"].dt
df["year"] = crawl_dt.year
df["month"] = crawl_dt.month
# Preview the source column next to the two derived columns (first 5 rows)
print(df[["crawl_month", "year", "month"]].head())

In [None]:
# How many distinct crawl months exist (1 means every row shares the same month),
# followed by the distinct values themselves
crawl_months = df["crawl_month"]
print(crawl_months.nunique())
print(crawl_months.unique())

In [None]:
# Quartiles and interquartile range of 'retail_price'
Q1, Q3 = (df["retail_price"].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1

# Rows below Q1 - 1.5*IQR or above Q3 + 1.5*IQR count as outliers
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
is_outlier = (df["retail_price"] < lower_fence) | (df["retail_price"] > upper_fence)

# Keep every row that is not flagged as an outlier
df_clean = df[~is_outlier]

In [None]:
# Scatter plot of units sold against retail price, saved as a high-resolution PNG
sns.scatterplot(data=df, x="retail_price", y="units_sold")
plt.savefig("retail_price_vs_units_sold_scatter_plot.png", dpi=300, bbox_inches="tight")

In [None]:
# Box plot of the 'retail_price' distribution, one box per 'product_color'
plt.figure(figsize=(15, 8))
sns.boxplot(data=df, x="product_color", y="retail_price")
plt.xticks(rotation=45)  # Tilt the colour names so they stay readable
plt.title("Retail Price Distribution by Product Color")
# Write the chart to disk as a high-resolution PNG
plt.savefig("retail_price_distribution_by_product_color.png", dpi=300, bbox_inches="tight")

In [None]:
# Discount as a percentage of the retail price: (retail_price - price) / retail_price * 100
price_gap = df["retail_price"] - df["price"]
df["discount_percentage"] = price_gap / df["retail_price"] * 100
print(df["discount_percentage"])

In [None]:
# Group by 'rating_five_count', average 'units_sold', and keep the first 10 groups (groupby sorts keys ascending, so these are the 10 lowest rating counts, not a "top 10")
# Mean units sold per distinct five-star count. groupby sorts its keys in
# ascending order, so head(10) keeps the ten lowest 'rating_five_count' values.
sales_by_rating = df.groupby("rating_five_count")["units_sold"].mean().head(10)

# Draw on an explicit, fresh axes so the bars cannot end up on a figure left
# open by an earlier cell, and label both axes so the chart stands alone.
rating_fig, rating_ax = plt.subplots()
sales_by_rating.plot(kind="bar", ax=rating_ax, title="Average Units Sold by Five-Star Rating Count")
rating_ax.set_xlabel("Five-Star Rating Count")
rating_ax.set_ylabel("Average Units Sold")
rating_fig.savefig("average_units_sold_by_rating.png", dpi=300, bbox_inches="tight")  # Save as high-resolution PNG
plt.show()  # Display the plot

In [None]:
# Correlation coefficient between 'price' and 'units_sold' (Series.corr default method)
price_series = df["price"]
units_series = df["units_sold"]
corr = price_series.corr(units_series)
# Report the coefficient rounded to two decimals
print(f"Correlation between price and units sold: {corr:.2f}")

In [None]:
# Descriptive statistics of 'units_sold' for each product colour
# (the bare last expression is rendered as the cell output)
units_sold_grouped = df.groupby("product_color")["units_sold"]
units_sold_grouped.describe()

In [None]:
# Create synthetic months covering 2020-06 through 2020-08 inclusive.
# period_range enumerates whole months. A month-end date_range between
# "2020-06" and "2020-08" stops at 2020-07-31 and silently drops August.
months = pd.period_range("2020-06", "2020-08", freq="M").strftime("%Y-%m").tolist()

# Randomly assign one of the months to every row. A seeded generator keeps the
# assignment identical across Restart & Run All.
# NOTE: this overwrites the existing 'crawl_month' values with synthetic data.
month_rng = np.random.default_rng(42)
df["crawl_month"] = month_rng.choice(months, size=len(df))


In [None]:
# Total units sold per crawl month (informative once several months exist),
# drawn as a bar chart
monthly_sales = df.groupby("crawl_month")["units_sold"].sum()
monthly_sales.plot.bar(figsize=(12, 6), title="Units Sold by Month")
plt.xlabel("Month")
plt.ylabel("Units Sold")
# Write the chart to disk as a high-resolution PNG, then render it
plt.savefig("monthly_sales_units_sold.png", dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Parse 'crawl_month' once, then derive the 'month' and 'year' columns from it
crawl_month_parsed = pd.to_datetime(df["crawl_month"])
df["month"] = crawl_month_parsed.dt.month
df["year"] = crawl_month_parsed.dt.year