# Import Required Libraries
Import the necessary libraries, including pandas, numpy, and matplotlib.

In [2]:
# Importing the necessary libraries
import matplotlib.pyplot as plt  # For data visualization
import numpy as np  # For numerical operations
import pandas as pd  # For data manipulation and analysis
from scipy import stats  # For statistical hypothesis tests

# Ensuring that the plots are displayed inline in the Jupyter notebook
%matplotlib inline

# Load the Dataset
Use pandas to load the ecom.csv dataset.

In [None]:
# Load the Dataset
# Read the e-commerce transactions CSV into a pandas DataFrame.

# Relative location of the CSV file.
# NOTE: despite its name, `df` holds the file path (a string), not a DataFrame;
# the loaded table lives in `ecom_data`.
df = 'data/ecom.csv'

# 'iso-8859-1' and 'latin1' are aliases for the same codec; the file is read
# with this encoding rather than pandas' default UTF-8.
ecom_data = pd.read_csv(filepath_or_buffer=df, encoding='iso-8859-1')

# Preview the first rows as a quick sanity check on the load
ecom_data.head()

# Data Cleaning
Identify and handle missing values, outliers, and incorrect data types.

In [None]:
# Data cleaning: report missing values, then build a cleaned copy of the data.

# Count missing values per column BEFORE any cleaning, so the report below
# describes the data as loaded.
missing_values = ecom_data.isnull().sum()

# Print the number of missing values in each column
print(missing_values)

# Each step returns a new frame, so no intermediate slice is mutated:
#   1. fill missing descriptions with the placeholder "No Description"
#   2. drop rows that have no CustomerID
#   3. parse InvoiceDate into datetime64 so the .dt accessor works later
#   4. keep only rows where both Quantity and UnitPrice are strictly positive
# The final .copy() makes the result an independent DataFrame, so later cells
# can add columns (e.g. 'Sales') without raising SettingWithCopyWarning.
ecom_data = (
    ecom_data
    .assign(Description=lambda d: d['Description'].fillna('No Description'))
    .dropna(subset=['CustomerID'])
    .assign(InvoiceDate=lambda d: pd.to_datetime(d['InvoiceDate']))
    .loc[lambda d: (d['Quantity'] > 0) & (d['UnitPrice'] > 0)]
    .copy()
)

# Display the first few rows to confirm the cleaning steps were applied
ecom_data.head()

# Exploratory Data Analysis
Perform exploratory data analysis to understand the data better. This can include computing descriptive statistics, creating visualizations, and checking for correlations.

In [None]:
# Exploratory Data Analysis

# Summary statistics of the cleaned data (computed before 'Sales' is added,
# so that column is not part of this summary)
summary_stats = ecom_data.describe()
print(summary_stats)

# Count unique items sold (distinct stock codes)
unique_items = ecom_data['StockCode'].nunique()
print(unique_items)

# Revenue per line item. `assign` returns a new frame instead of writing into
# a possibly-sliced one, which avoids SettingWithCopyWarning and keeps this
# cell safe to re-run.
ecom_data = ecom_data.assign(Sales=ecom_data['Quantity'] * ecom_data['UnitPrice'])

# What are the total sales? (sum of line-item revenue)
total_sales = ecom_data['Sales'].sum()
print(total_sales)

# Which 10 products generated the most revenue?
top_selling_items = (
    ecom_data
    .groupby('Description')['Sales']
    .sum()
    .sort_values(ascending=False)
    .head(10)
)
print(top_selling_items)


#### **Product Popularity**: What are the most popular products based on quantity sold?

In [None]:
# Total units sold per product description, one row per product
product_sales = (
    ecom_data
    .groupby('Description', as_index=False)['Quantity']
    .sum()
    .rename(columns={'Quantity': 'Total_Quantity_Sold'})
)

# Rank products from most to least units sold and keep the ten best sellers
product_sales_sorted = product_sales.sort_values('Total_Quantity_Sold', ascending=False)
top_10_products = product_sales_sorted.iloc[:10]

# Horizontal bar chart of the top ten products
fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(
    top_10_products['Description'],
    top_10_products['Total_Quantity_Sold'],
    color='skyblue',
)
ax.set_xlabel('Total Quantity Sold')
ax.set_ylabel('Product Description')
ax.set_title('Top 10 Products by Total Quantity Sold')
ax.invert_yaxis()  # Best seller at the top of the chart
plt.show()

#### **Time Series Analysis**: 'World War 2 Gliders' Sales Over Time

In [None]:
# Make sure InvoiceDate is datetime so the .dt accessor below works
ecom_data['InvoiceDate'] = pd.to_datetime(ecom_data['InvoiceDate'])

# Keep only the line items for the World War 2 gliders product
is_glider = ecom_data['Description'] == 'WORLD WAR 2 GLIDERS ASSTD DESIGNS'
gliders_data = ecom_data[is_glider]

# Total quantity sold per calendar month (monthly periods of InvoiceDate)
invoice_month = gliders_data['InvoiceDate'].dt.to_period('M')
monthly_quantity = gliders_data.groupby(invoice_month)['Quantity'].sum()
gliders_sales_over_time = monthly_quantity.reset_index()

# Turn the monthly periods back into timestamps so matplotlib can plot them
gliders_sales_over_time['InvoiceDate'] = (
    gliders_sales_over_time['InvoiceDate'].dt.to_timestamp()
)

# Line chart of monthly glider quantities
fig, ax = plt.subplots(figsize=(14, 7))
ax.plot(
    gliders_sales_over_time['InvoiceDate'],
    gliders_sales_over_time['Quantity'],
    marker='o',
)
ax.set_title('World War 2 Gliders Sales Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Quantity Sold')
ax.grid(True)
plt.show()

#### **Analyzing Seasonal Sales**: Comparing November 2011 with March 2011

In [33]:
# Significance level used to decide whether the difference is significant
ALPHA = 0.05


def monthly_quantities(frame, year, month):
    """Return the 'Quantity' values of the rows invoiced in a given month.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain a datetime 'InvoiceDate' column and a 'Quantity' column.
    year : int
        Calendar year to select.
    month : int
        Calendar month to select (1-12).

    Returns
    -------
    pandas.Series
        The 'Quantity' column restricted to the matching rows (may be empty).
    """
    invoice_dates = frame['InvoiceDate']
    in_period = (invoice_dates.dt.month == month) & (invoice_dates.dt.year == year)
    return frame.loc[in_period, 'Quantity']


# NOTE: these samples are per-line-item quantities, not revenue.
november_sales = monthly_quantities(ecom_data, year=2011, month=11)

march_sales = monthly_quantities(ecom_data, year=2011, month=3)

# equal_var=False selects Welch's t-test, which does not assume the two months
# have equal variance. The sign of the t-statistic follows
# mean(november_sales) - mean(march_sales).
t_stat, p_value = stats.ttest_ind(november_sales, march_sales, equal_var=False)

print(f"T-statistic: {t_stat}, P-value: {p_value}")

if np.isnan(p_value):
    # An empty or degenerate sample yields NaN; `NaN < ALPHA` is False, which
    # would otherwise be misreported as "not statistically significant".
    print("The t-test could not be computed (empty or degenerate sample)")
elif p_value < ALPHA:
    print("The difference in sales between November and March is statistically significant")
else:
    print("The difference in sales between November and March is not statistically significant")



T-statistic: -8.844858351959235, P-value: 9.508165097584817e-19
The difference in sales between November and March is statistically significant
