# Import Required Libraries
Import the necessary libraries, including pandas, numpy, and matplotlib.

In [2]:
# Importing the necessary libraries
import pandas as pd  # For data manipulation and analysis
import numpy as np  # For numerical operations
import matplotlib.pyplot as plt  # For data visualization

# Ensuring that the plots are displayed inline in the Jupyter notebook
%matplotlib inline

# Load the Dataset
Use pandas to load the ecom.csv dataset.

In [12]:
# Load the raw e-commerce transactions from CSV into a DataFrame.

# Location of the CSV, relative to the notebook's working directory.
# NOTE: despite its name, `df` holds the file path (a str), not a DataFrame.
df = 'data/ecom.csv'

# Decode the file as ISO-8859-1 (the codec also known by the alias 'latin1').
ecom_data = pd.read_csv(filepath_or_buffer=df, encoding='iso-8859-1')

# Preview the first five rows as a sanity check on the load.
ecom_data.head(n=5)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


# Data Cleaning
Identify and handle missing values, outliers, and incorrect data types.

In [17]:
# Data cleaning: report missing values, then keep only line items that have a
# customer, a positive quantity and a positive unit price, with parsed dates.

# Count missing values per column before anything is modified, and print the
# report right away so it clearly describes the pre-cleaning state.
# NOTE: this cell rebinds `ecom_data`; re-running it without first reloading
# the CSV reports on the already-cleaned frame.
missing_values = ecom_data.isnull().sum()
print(missing_values)

ecom_data = (
    ecom_data
    # Keep rows that lack a description, labelling them explicitly.
    .assign(Description=lambda frame: frame['Description'].fillna('No Description'))
    # Drop all rows with a missing CustomerID.
    .dropna(subset=['CustomerID'])
    # Parse the invoice timestamp strings into datetime values.
    .assign(InvoiceDate=lambda frame: pd.to_datetime(frame['InvoiceDate']))
)

# Keep only rows whose Quantity and UnitPrice are both strictly positive.
# The explicit .copy() makes the result an independent frame, so adding
# columns to it later does not trigger pandas' SettingWithCopyWarning.
has_positive_values = (ecom_data['Quantity'] > 0) & (ecom_data['UnitPrice'] > 0)
ecom_data = ecom_data.loc[has_positive_values].copy()

# Display the first few rows to confirm the cleaning steps were applied
ecom_data.head()

InvoiceNo      0
StockCode      0
Description    0
Quantity       0
InvoiceDate    0
UnitPrice      0
CustomerID     0
Country        0
dtype: int64


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


# Exploratory Data Analysis
Perform exploratory data analysis to understand the data better. This can include computing descriptive statistics, creating visualizations, and checking for correlations.

In [24]:
# Exploratory Data Analysis
# Computes summary statistics, per-line revenue, the top products by revenue
# and by quantity, and plots the ten best sellers by quantity.

# Summary statistics (evaluate `summary_stats` in its own cell to inspect)
summary_stats = ecom_data.describe()
# print(summary_stats)

# Count unique items sold (distinct stock codes)
unique_items = ecom_data['StockCode'].nunique()
# print(unique_items)

# Revenue per line item. .assign() builds a new frame rather than writing a
# column into a frame that may be a filtered slice, which avoids pandas'
# SettingWithCopyWarning and keeps the cell safe to re-run.
ecom_data = ecom_data.assign(Sales=ecom_data['Quantity'] * ecom_data['UnitPrice'])

# What are the total sales? Sum of revenue over every line item.
total_sales = ecom_data['Sales'].sum()

# Top 10 products by revenue
top_selling_items = (
    ecom_data
    .groupby('Description')['Sales']
    .sum()
    .sort_values(ascending=False)
    .head(10)
)
# print(top_selling_items)

# Product Popularity: What are the most popular products based on quantity sold?
product_sales = (
    ecom_data
    .groupby('Description')
    .agg(Total_Quantity_Sold=('Quantity', 'sum'))
    .reset_index()
)
product_sales_sorted = product_sales.sort_values(by='Total_Quantity_Sold', ascending=False)
top_10_products = product_sales_sorted.head(10)

# Explicit figure/axes objects instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(top_10_products['Description'], top_10_products['Total_Quantity_Sold'], color='skyblue')
ax.set_xlabel('Total Quantity Sold')
ax.set_ylabel('Product Description')
ax.set_title('Top 10 Products by Total Quantity Sold')
ax.invert_yaxis()  # To have the product with the highest quantity sold at the top
plt.show()


# # Revenue Generated: What is the total revenue generated by each product by multiplying the quantity sold by the unit price?
# revenue_generated = ecom_data.groupby('Description')['Sales'].sum().sort_values(ascending=False).head(10)
# print(revenue_generated)

# # Product trends over time: analyze how popular a product is over time by grouping the data by month and year
# ecom_data['YearMonth'] = ecom_data['InvoiceDate'].dt.to_period('M')
# product_trends = ecom_data.groupby(['YearMonth', 'Description'])['Quantity'].sum().unstack().fillna(0)
# print(product_trends)


# # Stock Code Analysis: Investigate the most popular stock codes based on quantities sold and revenue generated
# stock_code_popularity = ecom_data.groupby('StockCode')[['Quantity', 'Sales']].sum().sort_values(by='Quantity', ascending=False).head(10)
# print(stock_code_popularity)

# # Product Description Analysis: Investigate the most popular product descriptions based on quantities sold and revenue generated
# product_description_popularity = ecom_data.groupby('Description')[['Quantity', 'Sales']].sum().sort_values(by='Quantity', ascending=False).head(10)
# print(product_description_popularity)

# -------------------------------------------- 

# # Compute descriptive statistics for numerical columns
# numerical_descriptive_stats = ecom_data.describe()
# print(numerical_descriptive_stats)

# # Compute descriptive statistics for categorical columns
# categorical_descriptive_stats = ecom_data.describe(include=['O'])
# print(categorical_descriptive_stats)

# # Visualize the distribution of the 'UnitPrice' column
# plt.figure(figsize=(10, 6))
# plt.hist(ecom_data['UnitPrice'], bins=30, color='blue', alpha=0.7)
# plt.title('Distribution of Unit Price')
# plt.xlabel('Unit Price')
# plt.ylabel('Frequency')
# plt.show()

# # Visualize the correlation matrix (numeric columns only; text columns cannot be correlated)
# correlation_matrix = ecom_data.select_dtypes(include='number').corr()
# plt.figure(figsize=(10, 8))
# plt.matshow(correlation_matrix, cmap='coolwarm', fignum=1)
# plt.colorbar()
# plt.xticks(np.arange(correlation_matrix.shape[1]), correlation_matrix.columns, rotation=90)
# plt.yticks(np.arange(correlation_matrix.shape[1]), correlation_matrix.columns)
# plt.title('Correlation Matrix', pad=90)
# plt.show()

# # Check for correlations between 'UnitPrice' and other numerical variables
# price_correlations = ecom_data.select_dtypes(include='number').corr()['UnitPrice'].sort_values(ascending=False)
# print(price_correlations)