## Feature Engineering

#### Import Libraries

In [2]:
import pandas as pd
import numpy as np

#### Load Dataset

In [4]:
df = pd.read_csv('/Users/hpourmand/Desktop/Retail/PreparedDataset.csv')

#### Extract Date Components for Order Date

In [9]:
# Convert 'Order Date' and 'Ship Date' columns to datetime format
df['Order Date'] = pd.to_datetime(df['Order Date'], errors='coerce')
df['Ship Date'] = pd.to_datetime(df['Ship Date'], errors='coerce')

# Extract year, month, day, weekday, and ISO week number
df['Order Year'] = df['Order Date'].dt.year
df['Order Month'] = df['Order Date'].dt.month
df['Order Day'] = df['Order Date'].dt.day
df['Order Weekday'] = df['Order Date'].dt.weekday  # Monday=0, Sunday=6
df['Order Week'] = df['Order Date'].dt.isocalendar().week  # ISO week number

#### Calculate Order Fulfillment Time (Shipping Duration)

In [10]:
df['Fulfillment Time (Days)'] = (df['Ship Date'] - df['Order Date']).dt.days

#### Create Revenue per Order (Sales per Order)

In [12]:
df['Revenue per Order'] = df['Sales']

####  Identify Holiday Seasons

In [14]:
df['Is_Holiday_Season'] = df['Order Month'].apply(lambda x: 1 if x in [11, 12] else 0)

#### RFM Features for Customer Segmentation

##### Recency: Days since the last purchase, Frequency: Total number of orders by the customer, Monetary: Total spending by the customer

In [15]:
rfm = df.groupby('Customer ID').agg({
    'Order Date': lambda x: (df['Order Date'].max() - x.max()).days,  # Recency
    'Order ID': 'nunique',  # Frequency
    'Sales': 'sum'          # Monetary
}).reset_index()

# Rename columns for clarity in RFM
rfm.columns = ['Customer ID', 'Recency', 'Frequency', 'Monetary']

# Merge RFM data back with the main DataFrame
df = df.merge(rfm, on='Customer ID', how='left')

#### Product-Level Aggregates

In [17]:
# Calculate total sales and total units sold per product
product_agg = df.groupby(['Product ID']).agg({
    'Sales': 'sum',
    'Order ID': 'count'  # Assuming each row in 'Order ID' represents a unit sold
}).reset_index()

# Rename columns for clarity
product_agg.columns = ['Product ID', 'Total Sales', 'Total Units Sold']

# Merge Product-Level Aggregates back to the main DataFrame
df = df.merge(product_agg, on='Product ID', how='left')

#### Customer Lifetime Value (CLV) Approximation

In [18]:
df['Avg_Monetary_Value'] = df['Monetary'] / df['Frequency']
df['CLV'] = df['Avg_Monetary_Value'] * df['Frequency']

#### Encoding Categorical Variables

In [20]:
df = pd.get_dummies(df, columns=['Ship Mode', 'Segment', 'Region'], drop_first=True)

KeyError: "None of [Index(['Ship Mode', 'Segment', 'Region'], dtype='object')] are in the [columns]"