# Import Required Libraries
Import the necessary libraries, including pandas, numpy, and matplotlib.

In [None]:
# Importing the necessary libraries
import pandas as pd  # For data manipulation and analysis
import numpy as np  # For numerical operations
import matplotlib.pyplot as plt  # For data visualization

# Ensuring that the plots are displayed inline in the Jupyter notebook
%matplotlib inline

# Load the Dataset
Use pandas to load the ecom.csv dataset.

In [None]:
# Load the Dataset
# Read ecom.csv from disk into a pandas DataFrame

# Relative path of the CSV file to read
data_path = 'data/ecom.csv'

# Parse the CSV into a DataFrame with pandas' default parsing options
ecom_data = pd.read_csv(filepath_or_buffer=data_path)

# Preview the top five rows as a quick sanity check that the load worked
ecom_data.head(n=5)

# Data Cleaning
Identify and handle missing values, outliers, and incorrect data types.

In [None]:
# Count the missing entries in every column and report the totals
missing_values = ecom_data.isna().sum()
print(missing_values)

# Simplest possible strategy for missing data:
# discard every row that has at least one missing value.
ecom_data = ecom_data.dropna(axis=0, how='any')

# Report each column's dtype so that mis-typed columns can be spotted
print(ecom_data.dtypes)

# A column stored with the wrong dtype can be cast explicitly.
# E.g. if 'Price' had been loaded as object instead of float:
# ecom_data['Price'] = ecom_data['Price'].astype(float)

# IQR rule for 'Price' outliers: take the first and third quartiles ...
Q1, Q3 = ecom_data['Price'].quantile([0.25, 0.75])
IQR = Q3 - Q1

# ... and accept only prices within 1.5 * IQR of the quartiles
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Keep the rows whose 'Price' lies inside the accepted range (both ends inclusive)
price_in_range = ecom_data['Price'].between(lower_bound, upper_bound)
ecom_data = ecom_data[price_in_range]

# Exploratory Data Analysis
Perform exploratory data analysis to understand the data better. This can include computing descriptive statistics, creating visualizations, and checking for correlations.

In [None]:
# Exploratory Data Analysis

# Descriptive statistics for the numerical columns
numerical_descriptive_stats = ecom_data.describe()
print(numerical_descriptive_stats)

# Descriptive statistics for the categorical (object-dtype) columns
categorical_descriptive_stats = ecom_data.describe(include=['O'])
print(categorical_descriptive_stats)

# Histogram of the 'Price' column
plt.figure(figsize=(10, 6))
plt.hist(ecom_data['Price'], bins=30, color='blue', alpha=0.7)
plt.title('Distribution of Price')
plt.xlabel('Price')
plt.ylabel('Frequency')
plt.show()

# Correlation matrix.
# Restrict to numeric/bool columns first: since pandas 2.0, DataFrame.corr()
# no longer drops non-numeric columns silently and raises ValueError when the
# frame still contains object columns (which this dataset is expected to have,
# given the categorical statistics computed above).
numeric_data = ecom_data.select_dtypes(include=['number', 'bool'])
correlation_matrix = numeric_data.corr()

# Draw into the figure created here by passing its own number, rather than
# assuming that the new figure happens to be figure 1.
correlation_figure = plt.figure(figsize=(10, 8))
plt.matshow(correlation_matrix, cmap='coolwarm', fignum=correlation_figure.number)
plt.colorbar()
tick_positions = np.arange(len(correlation_matrix.columns))
plt.xticks(tick_positions, correlation_matrix.columns, rotation=90)
plt.yticks(tick_positions, correlation_matrix.columns)
plt.title('Correlation Matrix', pad=90)
plt.show()

# Correlation of 'Price' with the other numerical variables, strongest positive first.
# Reuses the matrix computed above instead of recomputing it.
price_correlations = correlation_matrix['Price'].sort_values(ascending=False)
print(price_correlations)