In [None]:
import pandas as pd

In [None]:
# Load the raw sales records from the Excel workbook into a DataFrame.
# The path is relative, so it is resolved against the kernel's current working directory.
sales = pd.read_excel("data/sales.xlsx")

In [None]:
# Inspect the structure of the sales data: column names, dtypes and non-null counts
sales.info()

In [None]:
# Count the missing values in each column of the sales data
sales.isna().sum()

In [None]:
# Remove every row that has a missing value in any column.
# This is the simplest strategy; a real analysis might handle missing values more carefully, e.g. by imputing them.
sales = sales.dropna()

In [None]:
# Verify the result: every column's non-null count should now equal the total number of rows
sales.info()

In [None]:
# Count the rows that exactly repeat an earlier row.
# With the default settings, the first occurrence of each row is not counted as a duplicate.
sales.duplicated().sum()

In [None]:
# Show the rows flagged as duplicates (the repeat occurrences; the first copy of each row is not flagged)
sales.loc[sales.duplicated()]

In [None]:
# Remove the repeated rows, keeping the first occurrence of each
sales = sales.drop_duplicates()

In [None]:
# Verify the result: there should be fewer rows now if any duplicates were removed
sales.info()

In [None]:
# Calculate total cost, total revenue, profit, and margin for every sales row
sales["Total Cost"] = sales["Order Quantity"] * sales["Unit Cost"]
sales["Total Revenue"] = sales["Order Quantity"] * sales["Unit Price"]
sales["Profit"] = sales["Total Revenue"] - sales["Total Cost"]
# Margin is undefined when revenue is zero. Masking those rows makes the division
# yield NaN instead of +/-inf, which would otherwise leak into the exported data.
sales["Margin"] = sales["Profit"] / sales["Total Revenue"].where(sales["Total Revenue"] != 0)

In [None]:
# Load the raw customer records from the Excel workbook into a DataFrame.
# The path is relative, so it is resolved against the kernel's current working directory.
customers = pd.read_excel("data/customers.xlsx")

In [None]:
# Inspect the structure of the customer data: column names, dtypes and non-null counts
customers.info()

In [None]:
# Replace missing gender values with the explicit label "Unknown" (the rows are kept, not dropped)
customers["Gender"] = customers["Gender"].fillna("Unknown")

In [None]:
# Count the customers in each gender category ("Unknown" appears if any values were missing)
customers["Gender"].value_counts()

In [None]:
# Verify the result: the "Gender" column's non-null count should now equal the total number of rows
customers.info()

In [None]:
# Calculate customer age groups
def age_group(age):
    """Map a customer's age to a coarse age-group label.

    Parameters
    ----------
    age : number, or a missing value (None / NaN / pd.NA)
        The customer's age (presumably in years, given the thresholds).

    Returns
    -------
    str
        "Child" for ages below 18, "Young Adult" for 18-29, "Adult" for 30-59,
        "Senior" for 60 and above, and "Unknown" when the age is missing.
    """
    # NaN compares False against every threshold, so without this guard a
    # missing age would fall through to the final branch and be labelled "Senior"
    # (and None would raise a TypeError on the first comparison).
    if pd.isna(age):
        return "Unknown"
    if age < 18:
        return "Child"
    elif age < 30:
        return "Young Adult"
    elif age < 60:
        return "Adult"
    else:
        return "Senior"
# Label every customer with the age group derived from their "Age" value
customers["Age Group"] = customers["Age"].map(age_group)

In [None]:
# Review the final columns of both datasets ahead of the merge
for frame in (sales, customers):
    print(frame.columns)

In [None]:
# Attach the customer attributes to each sale. A left join keeps every sales row,
# even when its "Customer ID" has no match in `customers`.
# NOTE(review): assumes "Customer ID" is unique in `customers`; duplicate IDs would multiply sales rows — TODO confirm.
sm = sales.merge(customers, on="Customer ID", how="left")

In [None]:
# The customer columns should contain null values here, because some sales have no matching customer record
sm.info() 

In [None]:
# Inspect the category counts of a customer column before choosing a fill value for it.
# Swap in one of the commented-out lines below to inspect a different column.
sm["MaritalStatus"].value_counts()
#sm["EnglishEducation"].value_counts()
#sm["HouseOwnerFlag"].value_counts()

In [None]:
# Handle missing customer data in the merged frame.
# Label-like columns receive an explicit placeholder value.
placeholder_fills = {
    "MaritalStatus": "Unknown",
    "EnglishEducation": "Unknown",
    "HouseOwnerFlag": False,
    "Gender": "Unknown",
    "Age Group": "Unknown",
}
for column, placeholder in placeholder_fills.items():
    sm[column] = sm[column].fillna(placeholder)

# Numeric columns receive their own median, computed from the values that are present.
for column in ("YearlyIncome", "TotalChildren", "Age"):
    sm[column] = sm[column].fillna(sm[column].median())

In [None]:
# Final data check: review the columns, dtypes and non-null counts of the merged frame before exporting it
sm.info()

In [None]:
# Export the final merged data to an Excel workbook (relative path; the row index is not written)
sm.to_excel("sales_customers.xlsx", index=False)

In [None]:
# Export a small sample (10% of the rows) for testing.
# The fixed random_state makes the sample reproducible: re-running the notebook
# on the same data writes the same rows instead of a different random subset each time.
sm.sample(frac=0.1, random_state=42).to_excel("sales_customer_small.xlsx", index=False)