### 1. The Setup - Preparing the Groundwork

#### 1.1 Importing Necessary Libraries

In [3]:
# Import pandas for data manipulation and analysis
import pandas as pd
# Import numpy for numerical operations (used only sparingly in this notebook)
import numpy as np
# Import matplotlib.pyplot for basic plotting
import matplotlib.pyplot as plt
# Import seaborn for enhanced statistical data visualization
import seaborn as sns

# Widen pandas' display limits so large frames print without truncation.
# set_option accepts alternating (option name, value) pairs in a single call.
pd.set_option(
    "display.max_rows", 500,
    "display.max_columns", 500,
    "display.width", 1000,
)

#### 1.2 Loading the Sales Data

In [4]:
# Read the retail sales CSV into the working DataFrame `df`.
# NOTE(review): this is an absolute, machine-specific Windows path, so the cell
# only runs where that exact file exists — consider a configurable data directory.
sales_csv_path = r"C:\Users\HP\Desktop\Retail\retail_sales_synthetic.csv"
df = pd.read_csv(sales_csv_path)

### 2. Data Preparation

#### 2.1 Removing Invalid Records

In [5]:
# Keep only rows whose 'revenue' is non-zero; a zero is treated here as an
# invalid or incomplete sales record. Missing values (NaN) are not equal to
# zero, so such rows are retained, as they were before.
initial_rows = len(df)

# Filter with a boolean mask rather than an in-place, label-based drop:
#   * the mask is evaluated row by row, so a repeated index label can never
#     cause a valid row to be removed along with a zero-revenue one;
#   * .copy() produces an independent frame, so the column assignments made in
#     later cells do not trigger SettingWithCopyWarning.
df = df.loc[df["revenue"] != 0].copy()
final_rows = len(df)

print(f"Initial number of rows: {initial_rows}")
print(f"Rows removed (Revenue = 0): {initial_rows - final_rows}")
print(f"Final number of rows: {final_rows}")

Initial number of rows: 164400
Rows removed (Revenue = 0): 4749
Final number of rows: 159651


#### 2.2 Converting the Date Column to Datetime Format

In [6]:
# Parse the 'date' column into pandas datetimes and write it back to the frame
date_values = pd.to_datetime(df["date"])
df["date"] = date_values

# Derive calendar year and month columns to simplify later grouping
df["Year"] = date_values.dt.year
df["Month"] = date_values.dt.month

print("New columns 'Year' and 'Month' created successfully.")
print(df.loc[:, ["date", "Year", "Month"]].head())

New columns 'Year' and 'Month' created successfully.
        date  Year  Month
0 2024-09-05  2024      9
1 2022-10-24  2022     10
2 2023-04-19  2023      4
3 2024-06-22  2024      6
4 2024-07-20  2024      7
