In [1]:
import pandas as pd
import os
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Diagnostic function to examine an Excel file
def examine_excel_file(file_path):
    """Examine the structure of an Excel file to understand its columns"""
    try:
        print(f"\nExamining file: {os.path.basename(file_path)}")
        
        # Try to read with pandas
        df = pd.read_excel(file_path)
        print(f"Successfully read the file. Shape: {df.shape}")
        
        # Display column info
        print("\nColumns:")
        for i, col in enumerate(df.columns):
            print(f"{i+1}. {col} - Type: {df[col].dtype} - Sample: {df[col].dropna().iloc[0] if not df[col].dropna().empty else 'No data'}")
        
        # Check for potential duplicate columns
        dupe_cols = df.columns[df.columns.duplicated()].tolist()
        if dupe_cols:
            print(f"\nWarning: Found {len(dupe_cols)} duplicate column names: {dupe_cols}")
        
        # Look for specific columns we need
        print("\nLooking for required columns:")
        email_cols = [col for col in df.columns if 'email' in str(col).lower() or 'mail' in str(col).lower()]
        date_cols = [col for col in df.columns if 'date' in str(col).lower() or 'time' in str(col).lower()]
        type_cols = [col for col in df.columns if 'type' in str(col).lower() or 'source' in str(col).lower()]
        order_type_cols = [col for col in df.columns if 'type' in str(col).lower() or 'return' in str(col).lower() or 'order' in str(col).lower()]
        
        print(f"Potential email columns: {email_cols}")
        print(f"Potential date columns: {date_cols}")
        print(f"Potential type columns: {type_cols}")
        print(f"Potential order type columns: {order_type_cols}")
        
        return df
    except Exception as e:
        print(f"Error examining file {file_path}: {str(e)}")
        return None

# Examine a sample file from each financial year
import os

# Sample from financial year 1
fy1_path = r"C:\Users\91843\Documents\VsCode Codes\ReportAutomation\q\financial_year_1"
fy1_files = [f for f in os.listdir(fy1_path) if f.endswith('.xlsx') and not f.startswith('~$')]
if fy1_files:
    sample_file1 = os.path.join(fy1_path, fy1_files[0])
    print(f"Examining sample from Financial Year 1: {fy1_files[0]}")
    sample_df1 = examine_excel_file(sample_file1)

# Sample from financial year 2
fy2_path = r"C:\Users\91843\Documents\VsCode Codes\ReportAutomation\q\financial_year_2"
fy2_files = [f for f in os.listdir(fy2_path) if f.endswith('.xlsx') and not f.startswith('~$')]
if fy2_files:
    sample_file2 = os.path.join(fy2_path, fy2_files[0])
    print(f"Examining sample from Financial Year 2: {fy2_files[0]}")
    sample_df2 = examine_excel_file(sample_file2)

Examining sample from Financial Year 1: CUSTOMER PROFILE AUG23.xlsx

Examining file: CUSTOMER PROFILE AUG23.xlsx
Successfully read the file. Shape: (464, 2)

Columns:
1. Item Dim Cat Code (L1) - Type: object - Sample: Family Code (L2)
2. (All) - Type: object - Sample: (All)

Looking for required columns:
Potential email columns: []
Potential date columns: []
Potential type columns: []
Potential order type columns: []
Examining sample from Financial Year 2: CUSTOMER PROFILE _Sep24.xlsx

Examining file: CUSTOMER PROFILE _Sep24.xlsx
Successfully read the file. Shape: (464, 2)

Columns:
1. Item Dim Cat Code (L1) - Type: object - Sample: Family Code (L2)
2. (All) - Type: object - Sample: (All)

Looking for required columns:
Potential email columns: []
Potential date columns: []
Potential type columns: []
Potential order type columns: []
Examining sample from Financial Year 2: CUSTOMER PROFILE _Sep24.xlsx

Examining file: CUSTOMER PROFILE _Sep24.xlsx
Successfully read the file. Shape: (10483

In [2]:
# Define paths to financial year directories
fy1_path = r"C:\Users\91843\Documents\VsCode Codes\ReportAutomation\q\financial_year_1"
fy2_path = r"C:\Users\91843\Documents\VsCode Codes\ReportAutomation\q\financial_year_2"
q_main_path = r"C:\Users\91843\Documents\VsCode Codes\ReportAutomation\q"  # Main q directory with 2023 files and April25 data

print("Starting customer analysis across financial years...")

def process_excel_file(file_path):
    """Process each Excel file and extract relevant data."""
    try:
        # Skip temporary Excel files
        if os.path.basename(file_path).startswith('~$'):
            print(f"Skipping temporary file: {os.path.basename(file_path)}")
            return None
        
        # Get all sheet names in the Excel file
        excel_file = pd.ExcelFile(file_path)
        sheet_names = excel_file.sheet_names
        
        # Look for sheets with 'export' in the name (case-insensitive)
        export_sheets = [sheet for sheet in sheet_names if 'export' in sheet.lower()]
        
        if export_sheets:
            # Use the first sheet with 'export' in the name
            sheet_name = export_sheets[0]
            print(f"Reading '{sheet_name}' sheet from {os.path.basename(file_path)}")
            df = pd.read_excel(file_path, sheet_name=sheet_name)
        else:
            # If no export sheet found, use the default first sheet
            sheet_name = sheet_names[0]
            print(f"No 'Export' sheet found, using first sheet '{sheet_name}' from {os.path.basename(file_path)}")
            df = pd.read_excel(file_path, sheet_name=sheet_name)
        
        # Print basic file info
        print(f"File: {os.path.basename(file_path)} - Shape: {df.shape}")
        
        # Convert all column names to strings and lowercase for easier matching
        df.columns = [str(col).lower().strip() for col in df.columns]
        
        # Based on the examination, directly map common column names found in the files
        # Initialize columns
        email_col = None
        date_col = None
        channel_col = None
        order_type_col = None
        
        # Map column names based on common patterns in the files
        for col in df.columns:
            col_lower = col.lower()
            
            # Email columns
            if 'customer' in col_lower and 'email' in col_lower:
                email_col = col
            elif 'email' in col_lower:
                email_col = col
            elif 'customer' in col_lower and 'mail' in col_lower:
                email_col = col
            elif col_lower == 'email':
                email_col = col
            
            # Date columns
            if 'order' in col_lower and 'date' in col_lower:
                date_col = col
            elif 'transaction' in col_lower and 'date' in col_lower:
                date_col = col
            elif 'purchase' in col_lower and 'date' in col_lower:
                date_col = col
            elif col_lower == 'date' or col_lower == 'orderdate':
                date_col = col
            
            # Type columns (replacing channel columns)
            if col_lower.strip() == 'type':
                channel_col = col
            elif 'sales' in col_lower and 'type' in col_lower:
                channel_col = col
            elif 'customer' in col_lower and 'type' in col_lower:
                channel_col = col
            elif 'retail' in col_lower and 'type' in col_lower:
                channel_col = col
            
            # Order type columns
            if 'order' in col_lower and 'type' in col_lower:
                order_type_col = col
            elif col_lower == 'retailordertype':
                order_type_col = col
            elif 'retail' in col_lower and 'type' in col_lower and col_lower != channel_col:
                order_type_col = col
            elif 'transaction' in col_lower and 'type' in col_lower:
                order_type_col = col
            elif col_lower == 'ordertype':
                order_type_col = col
        
        # Fall back to more general searches if the specific patterns didn't match
        if email_col is None:
            # First try exact column names that might be email fields
            exact_email_cols = ['email', 'customeremail', 'customer email', 'mail', 'customer_email']
            found = False
            for exact_col in exact_email_cols:
                matches = [col for col in df.columns if col.lower() == exact_col]
                if matches:
                    email_col = matches[0]
                    print(f"Using exact match {email_col} as email column")
                    found = True
                    break
            
            # If still not found, try broader pattern matching
            if not found:
                for col in df.columns:
                    col_lower = col.lower()
                    if ('mail' in col_lower or 'customer' in col_lower or 'id' in col_lower) and len(col_lower) > 2:
                        # Check if column contains email-like values (with @ symbol)
                        if df[col].astype(str).str.contains('@', na=False).any():
                            email_col = col
                            print(f"Using column with @ symbols: {col} as email column")
                            found = True
                            break
                
                # Last resort, try any column with "type" in the name
                if not found:
                    for col in df.columns:
                        if 'type' in col.lower() or 'customer' in col.lower():
                            email_col = col
                            print(f"Using fallback column: {col} as email column")
                            break
        
        if date_col is None:
            for col in df.columns:
                col_lower = col.lower()
                if 'date' in col_lower or 'time' in col_lower or 'day' in col_lower:
                    date_col = col
                    print(f"Using {col} as date column")
                    break
        
        if channel_col is None:
            # First try exact columns that might be type fields
            exact_type_cols = ['type', 'salestype', 'sales type', 'customertype']
            found = False
            for exact_col in exact_type_cols:
                matches = [col for col in df.columns if col.lower() == exact_col]
                if matches:
                    channel_col = matches[0]
                    print(f"Using exact match {channel_col} as type column")
                    found = True
                    break
            
            # If still not found, try broader pattern matching
            if not found:
                for col in df.columns:
                    col_lower = col.lower()
                    if ('type' in col_lower or 'source' in col_lower or 'medium' in col_lower or 
                        'platform' in col_lower or 'store' in col_lower):
                        channel_col = col
                        print(f"Using {col} as type column")
                        break
                        
                # If still no type, use retailcustomertype if available (based on your data structure)
                if not found and 'retailcustomertype' in [c.lower() for c in df.columns]:
                    for col in df.columns:
                        if col.lower() == 'retailcustomertype':
                            channel_col = col
                            print(f"Using {col} as fallback type column")
                            break
        
        if order_type_col is None:
            exact_type_cols = ['ordertype', 'retailordertype', 'ordertype', 'order_type']
            found = False
            for exact_col in exact_type_cols:
                matches = [col for col in df.columns if col.lower() == exact_col]
                if matches:
                    order_type_col = matches[0]
                    print(f"Using exact match {order_type_col} as order type column")
                    found = True
                    break
            
            if not found:
                for col in df.columns:
                    col_lower = col.lower()
                    if ('type' in col_lower or 'status' in col_lower or 'order' in col_lower):
                        order_type_col = col
                        print(f"Using {col} as order type column")
                        break
                        
                # If still no order type, use retailcustomertype if available
                if not found and channel_col and channel_col.lower() == 'retailcustomertype':
                    order_type_col = channel_col
                    print(f"Using same field for both channel and order type: {order_type_col}")
        
        # Log what we found
        print(f"Column mapping for {os.path.basename(file_path)}:")
        print(f"  Email: {email_col}")
        print(f"  Date: {date_col}")
        print(f"  Type: {channel_col}")
        print(f"  Order Type: {order_type_col}")
        
        # Check if we found all required columns
        if email_col is None:
            print(f"Warning: Required email column not found in {os.path.basename(file_path)}")
            return None
        
        # If channel or order_type is missing, we'll create them with default values
        missing = []
        if channel_col is None: 
            missing.append("type")
            print(f"Warning: Type column not found in {os.path.basename(file_path)}, will use default value 'UNKNOWN'")
        
        if order_type_col is None: 
            missing.append("order type")
            print(f"Warning: Order type column not found in {os.path.basename(file_path)}, will use default value 'sales order'")
        
        if missing:
            print(f"Creating default values for missing columns: {', '.join(missing)}")
        
        # Create a clean DataFrame with consistent column names
        new_df = pd.DataFrame()
        
        # Add each column with standardized names
        try:
            new_df['customeremail'] = df[email_col].astype(str)
        except:
            print(f"Error extracting email column '{email_col}' from {os.path.basename(file_path)}")
            return None
            
        # Add date if available
        if date_col:
            try:
                new_df['order_date'] = pd.to_datetime(df[date_col], errors='coerce')
                # Extract month and year
                new_df['month'] = new_df['order_date'].dt.month
                new_df['year'] = new_df['order_date'].dt.year
            except:
                print(f"Error extracting date column '{date_col}' from {os.path.basename(file_path)}")
                new_df['order_date'] = None
        
        # Add channel and order type (with fallback defaults)
        if channel_col:
            try:
                new_df['channel'] = df[channel_col].astype(str)
            except:
                print(f"Error extracting type column '{channel_col}' from {os.path.basename(file_path)}")
                new_df['channel'] = 'UNKNOWN'
        else:
            # Default type value if column is missing
            new_df['channel'] = 'UNKNOWN'
            
        if order_type_col:
            try:
                new_df['retailordertype'] = df[order_type_col].astype(str)
            except:
                print(f"Error extracting order type column '{order_type_col}' from {os.path.basename(file_path)}")
                new_df['retailordertype'] = 'sales order'
        else:
            # Default order type value if column is missing
            new_df['retailordertype'] = 'sales order'
        
        # Add file name as metadata
        new_df['file_name'] = os.path.basename(file_path)
        
        # Drop rows with missing values in critical columns
        new_df = new_df.dropna(subset=['customeremail', 'channel', 'retailordertype'])
        
        # Final check on data quality
        valid_rows = len(new_df)
        print(f"Processed {os.path.basename(file_path)}: Found {valid_rows} valid records (after filtering for POS/Jumbo.ae types)")
        
        if valid_rows == 0:
            print(f"Warning: No valid data found in {os.path.basename(file_path)} after processing and type filtering")
            return None
            
        return new_df
    
    except Exception as e:
        print(f"Error processing file {file_path}: {str(e)}")
        return None

# Process all files in financial year 1 (excluding temporary Excel files)
fy1_files = [os.path.join(fy1_path, f) for f in os.listdir(fy1_path) 
             if f.endswith('.xlsx') and not f.startswith('~$')]
print(f"Found {len(fy1_files)} Excel files in Financial Year 1")

# Process all files in financial year 2 (excluding temporary Excel files)
fy2_files = [os.path.join(fy2_path, f) for f in os.listdir(fy2_path) 
             if f.endswith('.xlsx') and not f.startswith('~$')]
print(f"Found {len(fy2_files)} Excel files in Financial Year 2")

# Process files in the main q directory (2023 files)
main_q_files = [os.path.join(q_main_path, f) for f in os.listdir(q_main_path) 
                if f.endswith('.xlsx') and not f.startswith('~$') and os.path.isfile(os.path.join(q_main_path, f))]
print(f"Found {len(main_q_files)} Excel files in Main Q Directory (2023 files)")

# Process FY1 files in chunks to optimize memory usage
print("Processing Financial Year 1 data...")
fy1_dfs = []
fy1_processed_files = set()  # Track which files have been processed

for file in fy1_files:
    filename = os.path.basename(file)
    
    # Skip if already processed
    if filename in fy1_processed_files:
        print(f"Skipping already processed file: {filename}")
        continue
        
    print(f"Processing: {filename}")
    fy1_processed_files.add(filename)
    
    df = process_excel_file(file)
    if df is not None and not df.empty:
        fy1_dfs.append(df)
        print(f"Added {len(df)} records from {filename}")

# Combine all FY1 data
if fy1_dfs:
    fy1_data = pd.concat(fy1_dfs, ignore_index=True)
    print(f"FY1 data shape: {fy1_data.shape} with {fy1_data['customeremail'].nunique()} unique customers")
else:
    print("No valid data found for Financial Year 1")
    fy1_data = pd.DataFrame()

# Process FY2 files in chunks to optimize memory usage
print("\nProcessing Financial Year 2 data...")
fy2_dfs = []
fy2_processed_files = set()  # Track which files have been processed

for file in fy2_files:
    filename = os.path.basename(file)
    
    # Skip if already processed
    if filename in fy2_processed_files:
        print(f"Skipping already processed file: {filename}")
        continue
        
    print(f"Processing: {filename}")
    fy2_processed_files.add(filename)
    
    df = process_excel_file(file)
    if df is not None and not df.empty:
        fy2_dfs.append(df)
        print(f"Added {len(df)} records from {filename}")

# Combine all FY2 data
if fy2_dfs:
    fy2_data = pd.concat(fy2_dfs, ignore_index=True)
    print(f"FY2 data shape: {fy2_data.shape} with {fy2_data['customeremail'].nunique()} unique customers")
else:
    print("No valid data found for Financial Year 2")
    fy2_data = pd.DataFrame()

# Process Main Q files (2023 data) in chunks to optimize memory usage
print("\nProcessing Main Q directory data (2023 files)...")
q_2023_dfs = []
q_2023_processed_files = set()  # Track which files have been processed

for file in main_q_files:
    filename = os.path.basename(file)
    
    # Skip April25 file as it belongs to 2025, not 2023
    if 'april25' in filename.lower() or 'april_25' in filename.lower():
        print(f"Skipping April25 file (belongs to 2025): {filename}")
        continue
    
    # Skip if already processed
    if filename in q_2023_processed_files:
        print(f"Skipping already processed file: {filename}")
        continue
        
    print(f"Processing: {filename}")
    q_2023_processed_files.add(filename)
    
    df = process_excel_file(file)
    if df is not None and not df.empty:
        q_2023_dfs.append(df)
        print(f"Added {len(df)} records from {filename}")

# Combine all 2023 data
if q_2023_dfs:
    q_2023_data = pd.concat(q_2023_dfs, ignore_index=True)
    print(f"2023 data shape: {q_2023_data.shape} with {q_2023_data['customeremail'].nunique()} unique customers")
else:
    print("No valid data found for 2023")
    q_2023_data = pd.DataFrame()

Starting customer analysis across financial years...
Found 12 Excel files in Financial Year 1
Found 12 Excel files in Financial Year 2
Found 4 Excel files in Main Q Directory (2023 files)
Processing Financial Year 1 data...
Processing: CUSTOMER PROFILE AUG23.xlsx
Reading 'Export' sheet from CUSTOMER PROFILE AUG23.xlsx
Reading 'Export' sheet from CUSTOMER PROFILE AUG23.xlsx
File: CUSTOMER PROFILE AUG23.xlsx - Shape: (83994, 49)
Column mapping for CUSTOMER PROFILE AUG23.xlsx:
  Email: customeremail
  Date: orderdate
  Type: type
  Order Type: retailordertype
Processed CUSTOMER PROFILE AUG23.xlsx: Found 83994 valid records (after filtering for POS/Jumbo.ae types)
Added 83994 records from CUSTOMER PROFILE AUG23.xlsx
Processing: CUSTOMER PROFILE SEP23.xlsx
File: CUSTOMER PROFILE AUG23.xlsx - Shape: (83994, 49)
Column mapping for CUSTOMER PROFILE AUG23.xlsx:
  Email: customeremail
  Date: orderdate
  Type: type
  Order Type: retailordertype
Processed CUSTOMER PROFILE AUG23.xlsx: Found 83994 

In [3]:
# Customer Analysis with Visualizations
from IPython.display import display, HTML
import pandas as pd

print("\n===== CUSTOMER ANALYSIS REPORT =====\n")

# === Handle empty dataframes ===
if fy1_data.empty and fy2_data.empty:
    print("Error: Data for both financial years is missing. Cannot complete analysis.")
elif fy1_data.empty:
    print("Warning: No data for Financial Year 1. Partial analysis will be performed.")
elif fy2_data.empty:
    print("Warning: No data for Financial Year 2. Partial analysis will be performed.")
else:
    # === Clean up emails for better matching ===
    print("Standardizing email formats for accurate matching...")
    
    # Function to clean and standardize email addresses
    def clean_email(email):
        if pd.isna(email) or not isinstance(email, str):
            return email
        return email.strip().lower()
    
    fy1_data['customeremail'] = fy1_data['customeremail'].apply(clean_email)
    fy2_data['customeremail'] = fy2_data['customeremail'].apply(clean_email)
    
    # === ANALYSIS 1: Basic Customer Counts ===
    print("\n1. BASIC CUSTOMER COUNTS")
    print("This table shows the total unique customers in each financial year and their year-over-year change.")
    
    fy1_total_emails = fy1_data['customeremail'].nunique()
    fy2_total_emails = fy2_data['customeremail'].nunique()
    
    # Create a DataFrame for display
    basic_counts_df = pd.DataFrame({
        'Metric': ['Unique Customers', 'Year-over-Year Change'],
        'FY1': [f"{fy1_total_emails:,}", ""],
        'FY2': [f"{fy2_total_emails:,}", ""],
        'Change': ["", f"{((fy2_total_emails - fy1_total_emails) / fy1_total_emails * 100):.2f}% ({'increase' if fy2_total_emails >= fy1_total_emails else 'decrease'})"]
    })
    
    # Calculate change percentage for later use
    change_percent = ((fy2_total_emails - fy1_total_emails) / fy1_total_emails * 100) if fy1_total_emails > 0 else 0
    
    # Apply styling to the DataFrame
    styled_basic_counts = basic_counts_df.style.set_properties(**{'text-align': 'center'})
    styled_basic_counts = styled_basic_counts.set_table_styles([
        {'selector': 'th', 'props': [('background-color', '#4472C4'), ('color', 'white'), ('font-weight', 'bold'), ('text-align', 'center')]},
        {'selector': '.col0', 'props': [('text-align', 'left')]}
    ])
    
    print("\nData Sources and Formulas:")
    print("- FY1 Unique Customers: Count of distinct emails in fy1_data dataframe")
    print("- FY2 Unique Customers: Count of distinct emails in fy2_data dataframe")
    print("- Year-over-Year Change: ((FY2 Count - FY1 Count) / FY1 Count) * 100%")
    print("  Formula: ((fy2_total_emails - fy1_total_emails) / fy1_total_emails * 100)")
    
    display(styled_basic_counts)
    
    # === ANALYSIS 2: Customer Retention Analysis ===
    print("\n2. CUSTOMER RETENTION ANALYSIS")
    print("This table shows customer retention metrics including retained customers from FY1, new customers in FY2, and lost customers from FY1.")
    
    fy1_unique_customers = set(fy1_data['customeremail'].unique())
    fy2_unique_customers = set(fy2_data['customeremail'].unique())
    
    # Find overlapping customers (retained)
    retained_customers = fy1_unique_customers.intersection(fy2_unique_customers)
    new_customers_in_fy2 = fy2_unique_customers - fy1_unique_customers
    lost_customers_from_fy1 = fy1_unique_customers - fy2_unique_customers
    
    # Create retention analysis DataFrame
    retention_df = pd.DataFrame({
        'Metric': ['Retained Customers', 'New Customers in FY2', 'Lost Customers from FY1'],
        'Count': [
            len(retained_customers),
            len(new_customers_in_fy2),
            len(lost_customers_from_fy1)
        ],
        'Percentage': [
            f"{len(retained_customers)/len(fy1_unique_customers)*100:.2f}% of FY1",
            f"{len(new_customers_in_fy2)/len(fy2_unique_customers)*100:.2f}% of FY2",
            f"{len(lost_customers_from_fy1)/len(fy1_unique_customers)*100:.2f}% of FY1"
        ]
    })
    
    # Apply styling to retention DataFrame
    styled_retention = retention_df.style.set_properties(**{'text-align': 'center'})
    styled_retention = styled_retention.set_table_styles([
        {'selector': 'th', 'props': [('background-color', '#4472C4'), ('color', 'white'), ('font-weight', 'bold'), ('text-align', 'center')]},
        {'selector': '.col0', 'props': [('text-align', 'left')]}
    ])
    
    print("\nData Sources and Formulas:")
    print("- Retained Customers: Intersection of unique customers in FY1 and FY2")
    print("  Formula: len(fy1_unique_customers.intersection(fy2_unique_customers))")
    print("- New Customers in FY2: Customers in FY2 not present in FY1")
    print("  Formula: len(fy2_unique_customers - fy1_unique_customers)")
    print("- Lost Customers from FY1: Customers in FY1 not present in FY2")
    print("  Formula: len(fy1_unique_customers - fy2_unique_customers)")
    print("- Percentage calculations: (Count / Total relevant customers) * 100%")
    
    display(styled_retention)
    
    # === ANALYSIS 3: Type Distribution Analysis ===
    print("\n3. TYPE DISTRIBUTION ANALYSIS")
    print("These tables show how customers are distributed across different types (POS and Jumbo.ae) for each financial year.")
    
    # FY1 Type Distribution
    print("\nFinancial Year 1 Type Distribution:")
    fy1_channel_dist = fy1_data.groupby('channel')['customeremail'].nunique().reset_index()
    fy1_channel_dist.columns = ['Type', 'Unique Customers']
    fy1_channel_dist['Percentage'] = fy1_channel_dist['Unique Customers'] / fy1_channel_dist['Unique Customers'].sum() * 100
    fy1_channel_dist['Percentage'] = fy1_channel_dist['Percentage'].apply(lambda x: f"{x:.2f}%")
    
    # Style FY1 type distribution
    styled_fy1_channel = fy1_channel_dist.style.set_properties(**{'text-align': 'center'})
    styled_fy1_channel = styled_fy1_channel.set_table_styles([
        {'selector': 'th', 'props': [('background-color', '#5B9BD5'), ('color', 'white'), ('font-weight', 'bold'), ('text-align', 'center')]},
        {'selector': '.col0', 'props': [('text-align', 'left')]}
    ])
    
    print("\nFY1 Type Distribution - Data Sources and Formulas:")
    print("- Data Source: fy1_data dataframe, grouped by 'channel' column (now containing Type data)")
    print("- Unique Customers: Count of distinct email addresses per type")
    print("  Formula: fy1_data.groupby('channel')['customeremail'].nunique()")
    print("- Percentage: (Type customer count / Total unique customers) * 100%")
    print("  Formula: type_count / total_count * 100")
    
    display(styled_fy1_channel)
    
    # FY2 Type Distribution
    print("\nFinancial Year 2 Type Distribution:")
    fy2_channel_dist = fy2_data.groupby('channel')['customeremail'].nunique().reset_index()
    fy2_channel_dist.columns = ['Type', 'Unique Customers']
    fy2_channel_dist['Percentage'] = fy2_channel_dist['Unique Customers'] / fy2_channel_dist['Unique Customers'].sum() * 100
    fy2_channel_dist['Percentage'] = fy2_channel_dist['Percentage'].apply(lambda x: f"{x:.2f}%")
    
    # Style FY2 type distribution
    styled_fy2_channel = fy2_channel_dist.style.set_properties(**{'text-align': 'center'})
    styled_fy2_channel = styled_fy2_channel.set_table_styles([
        {'selector': 'th', 'props': [('background-color', '#5B9BD5'), ('color', 'white'), ('font-weight', 'bold'), ('text-align', 'center')]},
        {'selector': '.col0', 'props': [('text-align', 'left')]}
    ])
    
    print("\nFY2 Type Distribution - Data Sources and Formulas:")
    print("- Data Source: fy2_data dataframe, grouped by 'channel' column (now containing Type data)")
    print("- Unique Customers: Count of distinct email addresses per type")
    print("  Formula: fy2_data.groupby('channel')['customeremail'].nunique()")
    print("- Percentage: (Type customer count / Total unique customers) * 100%")
    print("  Formula: type_count / total_count * 100")
    
    display(styled_fy2_channel)
    
    # Type Shift Analysis
    print("\nType Shift Analysis:")
    print("This table shows how customer type distribution has changed from FY1 to FY2.")
    
    if not fy1_channel_dist.empty and not fy2_channel_dist.empty:
        # Convert percentage strings back to floats for calculations
        fy1_channel_dist['Percentage_num'] = fy1_channel_dist['Percentage'].str.rstrip('%').astype(float)
        fy2_channel_dist['Percentage_num'] = fy2_channel_dist['Percentage'].str.rstrip('%').astype(float)
        
        # Merge and calculate changes
        channel_shift = pd.merge(
            fy1_channel_dist[['Type', 'Percentage_num']], 
            fy2_channel_dist[['Type', 'Percentage_num']], 
            on='Type', 
            suffixes=('_FY1', '_FY2')
        )
        
        if not channel_shift.empty:
            channel_shift['Change'] = channel_shift['Percentage_num_FY2'] - channel_shift['Percentage_num_FY1']
            
            # Format for display
            channel_shift['FY1 %'] = channel_shift['Percentage_num_FY1'].apply(lambda x: f"{x:.2f}%")
            channel_shift['FY2 %'] = channel_shift['Percentage_num_FY2'].apply(lambda x: f"{x:.2f}%")
            channel_shift['Change %'] = channel_shift['Change'].apply(lambda x: f"{x:.2f}%")
            
            # Final display columns
            shift_display = channel_shift[['Type', 'FY1 %', 'FY2 %', 'Change %']]
            
            # Style channel shift table
            styled_shift = shift_display.style.set_properties(**{'text-align': 'center'})
            styled_shift = styled_shift.set_table_styles([
                {'selector': 'th', 'props': [('background-color', '#4472C4'), ('color', 'white'), ('font-weight', 'bold'), ('text-align', 'center')]},
                {'selector': '.col0', 'props': [('text-align', 'left')]}
            ])
            
            # Apply conditional formatting to change column
            def style_change(val):
                val_num = float(val.rstrip('%'))
                if val_num > 0:
                    return 'color: green'
                elif val_num < 0:
                    return 'color: red'
                return ''
            
            styled_shift = styled_shift.applymap(style_change, subset=['Change %'])
            
            print("\nType Shift Analysis - Data Sources and Formulas:")
            print("- Data Sources: Percentages from FY1 and FY2 type distribution tables")
            print("- FY1 %: Percentage of customers using each type in FY1")
            print("- FY2 %: Percentage of customers using each type in FY2")
            print("- Change %: FY2 percentage - FY1 percentage")
            print("  Formula: channel_shift['Percentage_num_FY2'] - channel_shift['Percentage_num_FY1']")
            print("- Positive values (green) indicate growth, negative values (red) indicate decline")
            
            display(styled_shift)
    
   


===== CUSTOMER ANALYSIS REPORT =====

Standardizing email formats for accurate matching...

1. BASIC CUSTOMER COUNTS
This table shows the total unique customers in each financial year and their year-over-year change.

1. BASIC CUSTOMER COUNTS
This table shows the total unique customers in each financial year and their year-over-year change.

Data Sources and Formulas:
- FY1 Unique Customers: Count of distinct emails in fy1_data dataframe
- FY2 Unique Customers: Count of distinct emails in fy2_data dataframe
- Year-over-Year Change: ((FY2 Count - FY1 Count) / FY1 Count) * 100%
  Formula: ((fy2_total_emails - fy1_total_emails) / fy1_total_emails * 100)

Data Sources and Formulas:
- FY1 Unique Customers: Count of distinct emails in fy1_data dataframe
- FY2 Unique Customers: Count of distinct emails in fy2_data dataframe
- Year-over-Year Change: ((FY2 Count - FY1 Count) / FY1 Count) * 100%
  Formula: ((fy2_total_emails - fy1_total_emails) / fy1_total_emails * 100)


Unnamed: 0,Metric,FY1,FY2,Change
0,Unique Customers,247749.0,215634.0,
1,Year-over-Year Change,,,-12.96% (decrease)



2. CUSTOMER RETENTION ANALYSIS
This table shows customer retention metrics including retained customers from FY1, new customers in FY2, and lost customers from FY1.

Data Sources and Formulas:
- Retained Customers: Intersection of unique customers in FY1 and FY2
  Formula: len(fy1_unique_customers.intersection(fy2_unique_customers))
- New Customers in FY2: Customers in FY2 not present in FY1
  Formula: len(fy2_unique_customers - fy1_unique_customers)
- Lost Customers from FY1: Customers in FY1 not present in FY2
  Formula: len(fy1_unique_customers - fy2_unique_customers)
- Percentage calculations: (Count / Total relevant customers) * 100%

Data Sources and Formulas:
- Retained Customers: Intersection of unique customers in FY1 and FY2
  Formula: len(fy1_unique_customers.intersection(fy2_unique_customers))
- New Customers in FY2: Customers in FY2 not present in FY1
  Formula: len(fy2_unique_customers - fy1_unique_customers)
- Lost Customers from FY1: Customers in FY1 not present in FY2

Unnamed: 0,Metric,Count,Percentage
0,Retained Customers,44578,17.99% of FY1
1,New Customers in FY2,171056,79.33% of FY2
2,Lost Customers from FY1,203171,82.01% of FY1



3. TYPE DISTRIBUTION ANALYSIS
These tables show how customers are distributed across different types (POS and Jumbo.ae) for each financial year.

Financial Year 1 Type Distribution:

FY1 Type Distribution - Data Sources and Formulas:
- Data Source: fy1_data dataframe, grouped by 'channel' column (now containing Type data)
- Unique Customers: Count of distinct email addresses per type
  Formula: fy1_data.groupby('channel')['customeremail'].nunique()
- Percentage: (Type customer count / Total unique customers) * 100%
  Formula: type_count / total_count * 100

FY1 Type Distribution - Data Sources and Formulas:
- Data Source: fy1_data dataframe, grouped by 'channel' column (now containing Type data)
- Unique Customers: Count of distinct email addresses per type
  Formula: fy1_data.groupby('channel')['customeremail'].nunique()
- Percentage: (Type customer count / Total unique customers) * 100%
  Formula: type_count / total_count * 100


Unnamed: 0,Type,Unique Customers,Percentage
0,EA,10233,4.04%
1,Jumbo.ae,9468,3.74%
2,POS,232934,92.03%
3,,479,0.19%



Financial Year 2 Type Distribution:

FY2 Type Distribution - Data Sources and Formulas:
- Data Source: fy2_data dataframe, grouped by 'channel' column (now containing Type data)
- Unique Customers: Count of distinct email addresses per type
  Formula: fy2_data.groupby('channel')['customeremail'].nunique()
- Percentage: (Type customer count / Total unique customers) * 100%
  Formula: type_count / total_count * 100

FY2 Type Distribution - Data Sources and Formulas:
- Data Source: fy2_data dataframe, grouped by 'channel' column (now containing Type data)
- Unique Customers: Count of distinct email addresses per type
  Formula: fy2_data.groupby('channel')['customeremail'].nunique()
- Percentage: (Type customer count / Total unique customers) * 100%
  Formula: type_count / total_count * 100


Unnamed: 0,Type,Unique Customers,Percentage
0,EA,14282,6.45%
1,Jumbo.ae,13284,6.00%
2,POS,193790,87.55%
3,,1,0.00%



Type Shift Analysis:
This table shows how customer type distribution has changed from FY1 to FY2.

Type Shift Analysis - Data Sources and Formulas:
- Data Sources: Percentages from FY1 and FY2 type distribution tables
- FY1 %: Percentage of customers using each type in FY1
- FY2 %: Percentage of customers using each type in FY2
- Change %: FY2 percentage - FY1 percentage
  Formula: channel_shift['Percentage_num_FY2'] - channel_shift['Percentage_num_FY1']
- Positive values (green) indicate growth, negative values (red) indicate decline


Unnamed: 0,Type,FY1 %,FY2 %,Change %
0,EA,4.04%,6.45%,2.41%
1,Jumbo.ae,3.74%,6.00%,2.26%
2,POS,92.03%,87.55%,-4.48%
3,,0.19%,0.00%,-0.19%


In [4]:
# === ANALYSIS 4: Order Type Analysis (Sales vs. Returns) ===
print("\n4. ORDER TYPE ANALYSIS")
print("These tables show the distribution of customers by order type (sales orders vs. returned orders) for each financial year.")

# Function to analyze order types with tables
def analyze_order_types(data, year_label):
    # Count unique customers by order type
    order_type_counts = data.groupby('retailordertype')['customeremail'].nunique().reset_index()
    order_type_counts.columns = ['Order Type', 'Unique Customers']
    order_type_counts['Percentage'] = order_type_counts['Unique Customers'] / order_type_counts['Unique Customers'].sum() * 100
    order_type_counts['Percentage'] = order_type_counts['Percentage'].apply(lambda x: f"{x:.2f}%")
    
    # Style the table
    styled_order_types = order_type_counts.style.set_properties(**{'text-align': 'center'})
    styled_order_types = styled_order_types.set_table_styles([
        {'selector': 'th', 'props': [('background-color', '#5B9BD5'), ('color', 'white'), ('font-weight', 'bold'), ('text-align', 'center')]},
        {'selector': '.col0', 'props': [('text-align', 'left')]}
    ])
    
    print(f"\n{year_label} Order Type Distribution:")
    print(f"Data Sources and Formulas for {year_label}:")
    print(f"- Data Source: '{year_label.lower()}' dataframe, grouped by 'retailordertype' column")
    print("- Unique Customers: Count of distinct email addresses per order type")
    print("  Formula: data.groupby('retailordertype')['customeremail'].nunique()")
    print("- Percentage: (Order type customer count / Total unique customers) * 100%")
    print("  Formula: count / total * 100")
    
    display(styled_order_types)
    
    # Identify customers who only returned products
    all_customers = set(data['customeremail'].unique())
    sales_customers = set(data[data['retailordertype'] == 'sales order']['customeremail'].unique())
    only_return_customers = all_customers - sales_customers
    
    return_percent = len(only_return_customers) / len(all_customers) * 100 if all_customers else 0
    
    # Create a DataFrame for customers with only returns
    return_only_df = pd.DataFrame({
        'Metric': ['Customers with only returns (no purchases)'],
        'Count': [len(only_return_customers)],
        'Percentage': [f"{return_percent:.2f}% of total customers"]
    })
    
    # Style the returns-only table
    styled_returns = return_only_df.style.set_properties(**{'text-align': 'center'})
    styled_returns = styled_returns.set_table_styles([
        {'selector': 'th', 'props': [('background-color', '#C00000'), ('color', 'white'), ('font-weight', 'bold'), ('text-align', 'center')]},
        {'selector': '.col0', 'props': [('text-align', 'left')]}
    ])
    
    print(f"\n{year_label} Returns-Only Analysis:")
    print("- Data Source: Comparison of all customers vs customers who made sales")
    print("- Formula: Customers with only returns = all_customers - sales_customers")
    print("  where sales_customers = customers with retailordertype = 'sales order'")
    print("- Percentage: (Count / Total unique customers) * 100%")
    
    display(styled_returns)
    
    return only_return_customers

# Analyze order types for both financial years
fy1_only_returns = analyze_order_types(fy1_data, "Financial Year 1")
fy2_only_returns = analyze_order_types(fy2_data, "Financial Year 2")

# === ANALYSIS 5: Cross-Type Customer Behavior ===
print("\n5. CROSS-TYPE CUSTOMER BEHAVIOR")
print("These tables show how customers use different types - whether they shop exclusively through one type (POS or Jumbo.ae) or use multiple types.")

# Function to analyze cross-type behavior
def analyze_cross_type(data, year_label):
    # Count customers who used both types
    all_customers = set(data['customeremail'].unique())
    pos_customers = set(data[data['channel'] == 'POS']['customeremail'].unique())
    jumbo_customers = set(data[data['channel'] == 'Jumbo.ae']['customeremail'].unique())
    
    cross_type_customers = pos_customers.intersection(jumbo_customers)
    pos_only_customers = pos_customers - cross_type_customers
    jumbo_only_customers = jumbo_customers - cross_type_customers
    
    total = len(all_customers)
    
    # Create DataFrame for cross-type analysis
    cross_type_df = pd.DataFrame({
        'Customer Type': ['POS-only customers', 'Jumbo.ae-only customers', 'Cross-type customers'],
        'Count': [len(pos_only_customers), len(jumbo_only_customers), len(cross_type_customers)],
        'Percentage': [
            f"{len(pos_only_customers)/total*100:.2f}%",
            f"{len(jumbo_only_customers)/total*100:.2f}%",
            f"{len(cross_type_customers)/total*100:.2f}%"
        ]
    })
    
    # Style the cross-type table
    styled_cross_type = cross_type_df.style.set_properties(**{'text-align': 'center'})
    styled_cross_type = styled_cross_type.set_table_styles([
        {'selector': 'th', 'props': [('background-color', '#4472C4'), ('color', 'white'), ('font-weight', 'bold'), ('text-align', 'center')]},
        {'selector': '.col0', 'props': [('text-align', 'left')]}
    ])
    
    print(f"\n{year_label} Cross-Type Analysis:")
    print(f"Data Sources and Formulas for {year_label}:")
    print("- Data Source: Customer sets categorized by type usage")
    print("- POS-only customers: Customers who only used POS type")
    print("  Formula: pos_customers - cross_type_customers")
    print("- Jumbo.ae-only customers: Customers who only used Jumbo.ae type")
    print("  Formula: jumbo_customers - cross_type_customers")
    print("- Cross-type customers: Customers who used both POS and Jumbo.ae")
    print("  Formula: pos_customers.intersection(jumbo_customers)")
    print("- Percentage: (Customer count / Total unique customers) * 100%")
    
    display(styled_cross_type)
    
    return cross_type_customers, pos_only_customers, jumbo_only_customers

# Analyze cross-type behavior for both financial years
fy1_cross, fy1_pos_only, fy1_jumbo_only = analyze_cross_type(fy1_data, "Financial Year 1")
fy2_cross, fy2_pos_only, fy2_jumbo_only = analyze_cross_type(fy2_data, "Financial Year 2")

# Cross-type loyalty analysis
print("\nCross-Type Customer Loyalty:")
print("This table shows the retention rates for different types of customers based on their type usage patterns.")

fy1_cross_retained = fy1_cross.intersection(fy2_unique_customers)
cross_type_retention_rate = len(fy1_cross_retained) / len(fy1_cross) * 100 if fy1_cross else 0

fy1_pos_only_retained = fy1_pos_only.intersection(fy2_unique_customers)
pos_only_retention_rate = len(fy1_pos_only_retained) / len(fy1_pos_only) * 100 if fy1_pos_only else 0

fy1_jumbo_only_retained = fy1_jumbo_only.intersection(fy2_unique_customers)
jumbo_only_retention_rate = len(fy1_jumbo_only_retained) / len(fy1_jumbo_only) * 100 if fy1_jumbo_only else 0

# Create DataFrame for retention rates by type
type_retention_df = pd.DataFrame({
    'Customer Type': ['Cross-type customers', 'POS-only customers', 'Jumbo.ae-only customers'],
    'Retention Rate': [
        f"{cross_type_retention_rate:.2f}%",
        f"{pos_only_retention_rate:.2f}%",
        f"{jumbo_only_retention_rate:.2f}%"
    ],
    'FY1 Count': [len(fy1_cross), len(fy1_pos_only), len(fy1_jumbo_only)],
    'Retained Count': [len(fy1_cross_retained), len(fy1_pos_only_retained), len(fy1_jumbo_only_retained)]
})

# Style the retention rates table
styled_type_retention = type_retention_df.style.set_properties(**{'text-align': 'center'})
styled_type_retention = styled_type_retention.set_table_styles([
    {'selector': 'th', 'props': [('background-color', '#5B9BD5'), ('color', 'white'), ('font-weight', 'bold'), ('text-align', 'center')]},
    {'selector': '.col0', 'props': [('text-align', 'left')]}
])

# Apply conditional formatting to retention rate column
def highlight_retention(val):
    rate = float(val.rstrip('%'))
    if rate >= 70:
        return 'background-color: #c6efce; color: #006100'  # Green for high retention
    elif rate >= 50:
        return 'background-color: #ffeb9c; color: #9c6500'  # Yellow for medium retention
    else:
        return 'background-color: #ffc7ce; color: #9c0006'  # Red for low retention

styled_type_retention = styled_type_retention.applymap(highlight_retention, subset=['Retention Rate'])

print("\nCross-Type Customer Loyalty - Data Sources and Formulas:")
print("- Data Source: FY1 customer sets by type compared with FY2 customer set")
print("- Cross-type customers retention: Percentage of FY1 cross-type customers also present in FY2")
print("  Formula: len(fy1_cross_retained) / len(fy1_cross) * 100")
print("- POS-only customers retention: Percentage of FY1 POS-only customers also present in FY2")
print("  Formula: len(fy1_pos_only_retained) / len(fy1_pos_only) * 100")
print("- Jumbo.ae-only customers retention: Percentage of FY1 Jumbo.ae-only customers also present in FY2")
print("  Formula: len(fy1_jumbo_only_retained) / len(fy1_jumbo_only) * 100")
print("- Color coding: Green ≥70%, Yellow ≥50%, Red <50%")

display(styled_type_retention)


4. ORDER TYPE ANALYSIS
These tables show the distribution of customers by order type (sales orders vs. returned orders) for each financial year.

Financial Year 1 Order Type Distribution:
Data Sources and Formulas for Financial Year 1:
- Data Source: 'financial year 1' dataframe, grouped by 'retailordertype' column
- Unique Customers: Count of distinct email addresses per order type
  Formula: data.groupby('retailordertype')['customeremail'].nunique()
- Percentage: (Order type customer count / Total unique customers) * 100%
  Formula: count / total * 100

Financial Year 1 Order Type Distribution:
Data Sources and Formulas for Financial Year 1:
- Data Source: 'financial year 1' dataframe, grouped by 'retailordertype' column
- Unique Customers: Count of distinct email addresses per order type
  Formula: data.groupby('retailordertype')['customeremail'].nunique()
- Percentage: (Order type customer count / Total unique customers) * 100%
  Formula: count / total * 100


Unnamed: 0,Order Type,Unique Customers,Percentage
0,Returned order,14654,5.65%
1,Sales order,244533,94.35%
2,,1,0.00%



Financial Year 1 Returns-Only Analysis:
- Data Source: Comparison of all customers vs customers who made sales
- Formula: Customers with only returns = all_customers - sales_customers
  where sales_customers = customers with retailordertype = 'sales order'
- Percentage: (Count / Total unique customers) * 100%

- Data Source: Comparison of all customers vs customers who made sales
- Formula: Customers with only returns = all_customers - sales_customers
  where sales_customers = customers with retailordertype = 'sales order'
- Percentage: (Count / Total unique customers) * 100%


Unnamed: 0,Metric,Count,Percentage
0,Customers with only returns (no purchases),247749,100.00% of total customers



Financial Year 2 Order Type Distribution:
Data Sources and Formulas for Financial Year 2:
- Data Source: 'financial year 2' dataframe, grouped by 'retailordertype' column
- Unique Customers: Count of distinct email addresses per order type
  Formula: data.groupby('retailordertype')['customeremail'].nunique()
- Percentage: (Order type customer count / Total unique customers) * 100%
  Formula: count / total * 100


Unnamed: 0,Order Type,Unique Customers,Percentage
0,Returned order,12226,5.43%
1,Sales order,212946,94.57%
2,,1,0.00%



Financial Year 2 Returns-Only Analysis:
- Data Source: Comparison of all customers vs customers who made sales
- Formula: Customers with only returns = all_customers - sales_customers
  where sales_customers = customers with retailordertype = 'sales order'
- Percentage: (Count / Total unique customers) * 100%


Unnamed: 0,Metric,Count,Percentage
0,Customers with only returns (no purchases),215634,100.00% of total customers



5. CROSS-TYPE CUSTOMER BEHAVIOR
These tables show how customers use different types - whether they shop exclusively through one type (POS or Jumbo.ae) or use multiple types.

Financial Year 1 Cross-Type Analysis:
Data Sources and Formulas for Financial Year 1:
- Data Source: Customer sets categorized by type usage
- POS-only customers: Customers who only used POS type
  Formula: pos_customers - cross_type_customers
- Jumbo.ae-only customers: Customers who only used Jumbo.ae type
  Formula: jumbo_customers - cross_type_customers
- Cross-type customers: Customers who used both POS and Jumbo.ae
  Formula: pos_customers.intersection(jumbo_customers)
- Percentage: (Customer count / Total unique customers) * 100%

Financial Year 1 Cross-Type Analysis:
Data Sources and Formulas for Financial Year 1:
- Data Source: Customer sets categorized by type usage
- POS-only customers: Customers who only used POS type
  Formula: pos_customers - cross_type_customers
- Jumbo.ae-only customers: Customers 

Unnamed: 0,Customer Type,Count,Percentage
0,POS-only customers,231665,93.51%
1,Jumbo.ae-only customers,8199,3.31%
2,Cross-type customers,1269,0.51%



Financial Year 2 Cross-Type Analysis:
Data Sources and Formulas for Financial Year 2:
- Data Source: Customer sets categorized by type usage
- POS-only customers: Customers who only used POS type
  Formula: pos_customers - cross_type_customers
- Jumbo.ae-only customers: Customers who only used Jumbo.ae type
  Formula: jumbo_customers - cross_type_customers
- Cross-type customers: Customers who used both POS and Jumbo.ae
  Formula: pos_customers.intersection(jumbo_customers)
- Percentage: (Customer count / Total unique customers) * 100%


Unnamed: 0,Customer Type,Count,Percentage
0,POS-only customers,192559,89.30%
1,Jumbo.ae-only customers,12053,5.59%
2,Cross-type customers,1231,0.57%



Cross-Type Customer Loyalty:
This table shows the retention rates for different types of customers based on their type usage patterns.

Cross-Type Customer Loyalty - Data Sources and Formulas:
- Data Source: FY1 customer sets by type compared with FY2 customer set
- Cross-type customers retention: Percentage of FY1 cross-type customers also present in FY2
  Formula: len(fy1_cross_retained) / len(fy1_cross) * 100
- POS-only customers retention: Percentage of FY1 POS-only customers also present in FY2
  Formula: len(fy1_pos_only_retained) / len(fy1_pos_only) * 100
- Jumbo.ae-only customers retention: Percentage of FY1 Jumbo.ae-only customers also present in FY2
  Formula: len(fy1_jumbo_only_retained) / len(fy1_jumbo_only) * 100
- Color coding: Green ≥70%, Yellow ≥50%, Red <50%


Unnamed: 0,Customer Type,Retention Rate,FY1 Count,Retained Count
0,Cross-type customers,35.15%,1269,446
1,POS-only customers,18.08%,231665,41895
2,Jumbo.ae-only customers,14.33%,8199,1175


In [5]:
# === ANALYSIS 6: Type Switching Behavior ===
print("\n6. TYPE SWITCHING BEHAVIOR")
print("This analysis shows how customers switch between types (POS and Jumbo.ae) from FY1 to FY2.")

# Analyze all customers present in both years for accuracy
common_customers = retained_customers
print(f"Analyzing type switching for all {len(common_customers):,} customers that appear in both years")

if common_customers:
    # More efficient approach: pre-compute primary type for all customers at once
    print("Computing primary types for all customers (this may take a moment for large datasets)...")
    
    # Create optimized dataframes with just email and type columns to minimize memory usage
    fy1_type_data = fy1_data[['customeremail', 'channel']].copy()
    fy2_type_data = fy2_data[['customeremail', 'channel']].copy()
    
    # Get primary type for each customer in FY1
    print("Calculating primary types for FY1...")
    fy1_primary_types = fy1_type_data[fy1_type_data['customeremail'].isin(common_customers)] \
                      .groupby('customeremail')['channel'] \
                      .agg(lambda x: x.value_counts().index[0] if len(x.value_counts()) > 0 else None) \
                      .to_dict()
    
    # Get primary type for each customer in FY2
    print("Calculating primary types for FY2...")
    fy2_primary_types = fy2_type_data[fy2_type_data['customeremail'].isin(common_customers)] \
                      .groupby('customeremail')['channel'] \
                      .agg(lambda x: x.value_counts().index[0] if len(x.value_counts()) > 0 else None) \
                      .to_dict()
    
    # Count type switching patterns
    print("Analyzing type switching patterns...")
    same_type = 0
    pos_to_jumbo = 0
    jumbo_to_pos = 0
    unknown_pattern = 0
    
    # Get customers with valid type data in both years
    valid_customers = [c for c in common_customers if c in fy1_primary_types and c in fy2_primary_types]
    print(f"Found {len(valid_customers):,} customers with valid type data in both years")
    
    # Count the different switching patterns
    for customer in valid_customers:
        fy1_type = fy1_primary_types[customer]
        fy2_type = fy2_primary_types[customer]
        
        if fy1_type == fy2_type:
            same_type += 1
        elif fy1_type == 'POS' and fy2_type == 'Jumbo.ae':
            pos_to_jumbo += 1
        elif fy1_type == 'Jumbo.ae' and fy2_type == 'POS':
            jumbo_to_pos += 1
        else:
            # This covers cases with UNKNOWN types or other edge cases
            unknown_pattern += 1
    
    total_tracked = same_type + pos_to_jumbo + jumbo_to_pos + unknown_pattern
    
    # Further analysis - Break down the "same type" category
    stayed_pos = 0
    stayed_jumbo = 0
    
    for customer in valid_customers:
        fy1_type = fy1_primary_types[customer]
        fy2_type = fy2_primary_types[customer]
        
        if fy1_type == fy2_type == 'POS':
            stayed_pos += 1
        elif fy1_type == fy2_type == 'Jumbo.ae':
            stayed_jumbo += 1
    
    # Create DataFrame for type switching analysis
    switching_df = pd.DataFrame({
        'Switching Pattern': [
            'Stayed with same type (total)',
            '- Stayed with POS',
            '- Stayed with Jumbo.ae',
            'Switched from POS to Jumbo.ae',
            'Switched from Jumbo.ae to POS',
            'Other patterns (involving UNKNOWN)'
        ],
        'Count': [
            same_type,
            stayed_pos,
            stayed_jumbo,
            pos_to_jumbo,
            jumbo_to_pos,
            unknown_pattern
        ],
        'Percentage': [
            f"{same_type/total_tracked*100:.2f}%" if total_tracked > 0 else "N/A",
            f"{stayed_pos/total_tracked*100:.2f}%" if total_tracked > 0 else "N/A",
            f"{stayed_jumbo/total_tracked*100:.2f}%" if total_tracked > 0 else "N/A",
            f"{pos_to_jumbo/total_tracked*100:.2f}%" if total_tracked > 0 else "N/A",
            f"{jumbo_to_pos/total_tracked*100:.2f}%" if total_tracked > 0 else "N/A",
            f"{unknown_pattern/total_tracked*100:.2f}%" if total_tracked > 0 else "N/A"
        ]
    })
    
    # Style the type switching table
    styled_switching = switching_df.style.set_properties(**{'text-align': 'center'})
    styled_switching = styled_switching.set_table_styles([
        {'selector': 'th', 'props': [('background-color', '#4472C4'), ('color', 'white'), ('font-weight', 'bold'), ('text-align', 'center')]},
        {'selector': '.col0', 'props': [('text-align', 'left')]}
    ])
    
    print("\nType Switching Behavior for All Retained Customers:")
    print("Data Sources and Formulas:")
    print("- Data Source: Primary type analysis of retained customers between FY1 and FY2")
    print("- Primary type: Most frequent type used by each customer in each year")
    print("  Formula: customer_df.groupby('customeremail')['channel'].agg(lambda x: x.value_counts().index[0])")
    print("- Same type: Customers whose primary type remained the same from FY1 to FY2")
    print("- Stayed with POS/Jumbo.ae: Subset of 'same type' customers by specific type")
    print("- Switched from POS to Jumbo.ae: Customers with primary type POS in FY1 and Jumbo.ae in FY2")
    print("- Switched from Jumbo.ae to POS: Customers with primary type Jumbo.ae in FY1 and POS in FY2")
    print("- Percentage: (Count / Total valid customers with type data) * 100%")
    
    display(styled_switching)
    
    # Additional insights
    print("\nType Switching Insights:")
    if pos_to_jumbo > jumbo_to_pos:
        print(f"- More customers are moving from POS to Jumbo.ae ({pos_to_jumbo:,} vs {jumbo_to_pos:,})")
        print(f"- Net shift towards Jumbo.ae: {pos_to_jumbo - jumbo_to_pos:,} customers ({(pos_to_jumbo - jumbo_to_pos)/total_tracked*100:.2f}% of tracked customers)")
    elif jumbo_to_pos > pos_to_jumbo:
        print(f"- More customers are moving from Jumbo.ae to POS ({jumbo_to_pos:,} vs {pos_to_jumbo:,})")
        print(f"- Net shift towards POS: {jumbo_to_pos - pos_to_jumbo:,} customers ({(jumbo_to_pos - pos_to_jumbo)/total_tracked*100:.2f}% of tracked customers)")
    else:
        print(f"- Customer movement between types is perfectly balanced ({pos_to_jumbo:,} each way)")
        
    print(f"- Type loyalty is {same_type/total_tracked*100:.2f}%, with {stayed_pos/same_type*100:.2f}% loyal to POS and {stayed_jumbo/same_type*100:.2f}% loyal to Jumbo.ae")
else:
    print("No common customers found between financial years to analyze switching behavior.")

# === ANALYSIS 7: Comprehensive Summary Table ===
print("\n7. COMPREHENSIVE SUMMARY TABLE")
print("This table provides a consolidated view of all key customer metrics across both financial years.")

# Create a summary table
summary_data = {
    'Metric': [
        'Total Unique Customers',
        'Retained Customers',
        'Retention Rate (%)',
        'New Customers in FY2',
        'Lost Customers from FY1',
        'POS-only Customers',
        'Jumbo.ae-only Customers',
        'Cross-type Customers',
        'Customers with Only Returns',
        'POS-to-Jumbo.ae Switchers',
        'Jumbo.ae-to-POS Switchers'
    ],
    'FY1 Value': [
        f"{fy1_total_emails:,}",
        'N/A',
        'N/A',
        'N/A',
        'N/A',
        f"{len(fy1_pos_only):,}",
        f"{len(fy1_jumbo_only):,}",
        f"{len(fy1_cross):,}",
        f"{len(fy1_only_returns):,}",
        'N/A',
        'N/A'
    ],
    'FY2 Value': [
        f"{fy2_total_emails:,}",
        f"{len(retained_customers):,}",
        f"{len(retained_customers)/len(fy1_unique_customers)*100:.2f}%" if fy1_unique_customers else 'N/A',
        f"{len(new_customers_in_fy2):,}",
        f"{len(lost_customers_from_fy1):,}",
        f"{len(fy2_pos_only):,}",
        f"{len(fy2_jumbo_only):,}",
        f"{len(fy2_cross):,}",
        f"{len(fy2_only_returns):,}",
        f"{pos_to_jumbo:,}" if 'pos_to_jumbo' in locals() and isinstance(pos_to_jumbo, int) else 'N/A',
        f"{jumbo_to_pos:,}" if 'jumbo_to_pos' in locals() and isinstance(jumbo_to_pos, int) else 'N/A'
    ],
    'Change (%)': [
        f"{change_percent:.2f}%",
        'N/A',
        'N/A',
        f"{len(new_customers_in_fy2)/fy2_total_emails*100:.2f}%" if fy2_total_emails > 0 else 'N/A',
        f"{len(lost_customers_from_fy1)/fy1_total_emails*100:.2f}%" if fy1_total_emails > 0 else 'N/A',
        f"{(len(fy2_pos_only) - len(fy1_pos_only))/len(fy1_pos_only)*100:.2f}%" if len(fy1_pos_only) > 0 else 'N/A',
        f"{(len(fy2_jumbo_only) - len(fy1_jumbo_only))/len(fy1_jumbo_only)*100:.2f}%" if len(fy1_jumbo_only) > 0 else 'N/A',
        f"{(len(fy2_cross) - len(fy1_cross))/len(fy1_cross)*100:.2f}%" if len(fy1_cross) > 0 else 'N/A',
        f"{(len(fy2_only_returns) - len(fy1_only_returns))/len(fy1_only_returns)*100:.2f}%" if len(fy1_only_returns) > 0 else 'N/A',
        'N/A',
        'N/A'
    ]
}

summary_df = pd.DataFrame(summary_data)

# Style the summary table
styled_summary = summary_df.style.set_properties(**{'text-align': 'center'})
styled_summary = styled_summary.set_table_styles([
    {'selector': 'th', 'props': [('background-color', '#4472C4'), ('color', 'white'), ('font-weight', 'bold'), ('text-align', 'center')]},
    {'selector': '.col0', 'props': [('text-align', 'left')]}
])

# Apply conditional formatting to Change column
def style_change_column(val):
    if val == 'N/A':
        return ''
    
    try:
        val_num = float(val.rstrip('%'))
        if val_num > 5:
            return 'color: green; font-weight: bold'
        elif val_num < -5:
            return 'color: red; font-weight: bold'
        return ''
    except:
        return ''

styled_summary = styled_summary.applymap(style_change_column, subset=['Change (%)'])

print("\nComprehensive Summary Table - Data Sources and Formulas:")
print("- Data Source: Consolidated metrics from all previous analyses")
print("- Total Unique Customers: Count of distinct customers in each financial year")
print("- Retained Customers: Intersection of FY1 and FY2 customer sets")
print("- Retention Rate: (Retained customers / FY1 customers) * 100%")
print("- New Customers in FY2: FY2 customers not present in FY1")
print("- Lost Customers from FY1: FY1 customers not present in FY2")
print("- POS/Jumbo.ae/Cross-type customers: Counts from respective type analyses")
print("- Customers with Only Returns: Derived from order type analysis")
print("- Type Switchers: Derived from type switching analysis")
print("- Change percentages: ((FY2 value - FY1 value) / FY1 value) * 100%")

display(styled_summary)

# === FINAL REPORT AND KEY INSIGHTS ===
print("\n===== FINAL REPORT AND KEY INSIGHTS =====\n")

# Calculate retention rates for different segments
overall_retention_rate = len(retained_customers) / fy1_total_emails * 100 if fy1_total_emails > 0 else 0

# Create insights DataFrame
insights_data = {
    'Key Insight': [
        f"Overall customer retention rate: {overall_retention_rate:.2f}%",
        f"Cross-type customers retention rate: {cross_type_retention_rate:.2f}%",
        f"POS-only customers retention rate: {pos_only_retention_rate:.2f}%",
        f"Jumbo.ae-only customers retention rate: {jumbo_only_retention_rate:.2f}%"
    ],
    'Interpretation': [
        "Percentage of FY1 customers who remained in FY2",
        "Shows how loyal multi-type customers are",
        "Shows loyalty of POS customers",
        "Shows loyalty of Jumbo.ae-only customers"
    ]
}

insights_df = pd.DataFrame(insights_data)

# Style the insights table
styled_insights = insights_df.style.set_properties(**{'text-align': 'left'})
styled_insights = styled_insights.set_table_styles([
    {'selector': 'th', 'props': [('background-color', '#5B9BD5'), ('color', 'white'), ('font-weight', 'bold'), ('text-align', 'center')]}
])

print("KEY RETENTION INSIGHTS:")
print("Data Sources and Formulas:")
print("- Overall retention rate: (Retained customers / Total FY1 customers) * 100%")
print("  Formula: len(retained_customers) / fy1_total_emails * 100")
print("- Cross-type customers retention: (Retained cross-type / Total FY1 cross-type) * 100%")
print("  Formula: len(fy1_cross_retained) / len(fy1_cross) * 100")
print("- Type-specific retention rates use the same formula pattern for their respective customer sets")

display(styled_insights)

# Customer overlap metrics
overlap_data = {
    'Metric': [
        f"Total unique customers across both years: {len(fy1_unique_customers.union(fy2_unique_customers)):,}",
        f"Customers present in both years: {len(retained_customers):,} ({len(retained_customers)/len(fy1_unique_customers)*100:.2f}% of FY1)",
        f"New customers in FY2: {len(new_customers_in_fy2):,} ({len(new_customers_in_fy2)/len(fy2_unique_customers)*100:.2f}% of FY2)",
        f"Lost customers from FY1: {len(lost_customers_from_fy1):,} ({len(lost_customers_from_fy1)/len(fy1_unique_customers)*100:.2f}% of FY1)"
    ]
}

overlap_df = pd.DataFrame(overlap_data)

# Style the overlap metrics table
styled_overlap = overlap_df.style.set_properties(**{'text-align': 'left'})
styled_overlap = styled_overlap.set_table_styles([
    {'selector': 'th', 'props': [('background-color', '#5B9BD5'), ('color', 'white'), ('font-weight', 'bold'), ('text-align', 'center')]}
])

print("\nCUSTOMER OVERLAP METRICS:")
print("Data Sources and Formulas:")
print("- Total unique customers across both years: Union of FY1 and FY2 customer sets")
print("  Formula: len(fy1_unique_customers.union(fy2_unique_customers))")
print("- Customers present in both years: Intersection of FY1 and FY2 customer sets")
print("  Formula: len(retained_customers)")
print("- New customers in FY2: FY2 customers not in FY1")
print("  Formula: len(new_customers_in_fy2)")
print("- Lost customers from FY1: FY1 customers not in FY2")
print("  Formula: len(lost_customers_from_fy1)")
print("- Percentages calculated relative to their respective base year totals")

display(styled_overlap)

# Type distribution insights
fy1_pos_percent = len(fy1_pos_only) / fy1_total_emails * 100 if fy1_total_emails > 0 else 0
fy1_jumbo_percent = len(fy1_jumbo_only) / fy1_total_emails * 100 if fy1_total_emails > 0 else 0
fy1_cross_percent = len(fy1_cross) / fy1_total_emails * 100 if fy1_total_emails > 0 else 0

fy2_pos_percent = len(fy2_pos_only) / fy2_total_emails * 100 if fy2_total_emails > 0 else 0
fy2_jumbo_percent = len(fy2_jumbo_only) / fy2_total_emails * 100 if fy2_total_emails > 0 else 0
fy2_cross_percent = len(fy2_cross) / fy2_total_emails * 100 if fy2_total_emails > 0 else 0

type_insight_data = {
    'Type Insight': [
        f"POS-only customers: FY1 {fy1_pos_percent:.2f}% → FY2 {fy2_pos_percent:.2f}% ({fy2_pos_percent-fy1_pos_percent:.2f}% change)",
        f"Jumbo.ae-only customers: FY1 {fy1_jumbo_percent:.2f}% → FY2 {fy2_jumbo_percent:.2f}% ({fy2_jumbo_percent-fy1_jumbo_percent:.2f}% change)",
        f"Cross-type customers: FY1 {fy1_cross_percent:.2f}% → FY2 {fy2_cross_percent:.2f}% ({fy2_cross_percent-fy1_cross_percent:.2f}% change)"
    ]
}

type_insight_df = pd.DataFrame(type_insight_data)

# Style the type insights table
styled_type_insight = type_insight_df.style.set_properties(**{'text-align': 'left'})
styled_type_insight = styled_type_insight.set_table_styles([
    {'selector': 'th', 'props': [('background-color', '#5B9BD5'), ('color', 'white'), ('font-weight', 'bold'), ('text-align', 'center')]}
])

print("\nTYPE DISTRIBUTION INSIGHTS:")
print("Data Sources and Formulas:")
print("- POS-only customer percentages: len(fy1_pos_only) / fy1_total_emails * 100 for FY1")
print("  and len(fy2_pos_only) / fy2_total_emails * 100 for FY2")
print("- Jumbo.ae-only customer percentages: len(fy1_jumbo_only) / fy1_total_emails * 100 for FY1")
print("  and len(fy2_jumbo_only) / fy2_total_emails * 100 for FY2")
print("- Cross-type customer percentages: len(fy1_cross) / fy1_total_emails * 100 for FY1")
print("  and len(fy2_cross) / fy2_total_emails * 100 for FY2")
print("- Change calculated as FY2 percentage - FY1 percentage")

display(styled_type_insight)

# Final conclusions
conclusion_data = {
    'Conclusion & Recommendation': [
        f"Retention Analysis: The data shows an overall retention rate of {overall_retention_rate:.2f}%.",
        "Type Strategy: " + (
            "Cross-type customers show the highest retention rate, suggesting customers using both POS and Jumbo.ae are most loyal." 
            if cross_type_retention_rate > pos_only_retention_rate and cross_type_retention_rate > jumbo_only_retention_rate
            else "POS-only customers have higher retention than Jumbo.ae-only customers, suggesting focusing on in-store experience."
            if pos_only_retention_rate > jumbo_only_retention_rate
            else "Jumbo.ae-only customers have higher retention than POS-only customers, suggesting focusing on Jumbo.ae platform experience."
        ),
        f"Customer Growth: {'Increased' if fy2_total_emails > fy1_total_emails else 'Decreased'} by {abs(change_percent):.2f}% from FY1 to FY2.",
        f"Type Shift: " + (
            "More customers are migrating from POS to Jumbo.ae."
            if 'pos_to_jumbo' in locals() and 'jumbo_to_pos' in locals() and pos_to_jumbo > jumbo_to_pos
            else "More customers are returning to POS from Jumbo.ae."
            if 'pos_to_jumbo' in locals() and 'jumbo_to_pos' in locals() and jumbo_to_pos > pos_to_jumbo
            else "Similar numbers of customers switching between POS and Jumbo.ae."
        )
    ]
}

conclusion_df = pd.DataFrame(conclusion_data)

# Style the conclusions table
styled_conclusion = conclusion_df.style.set_properties(**{'text-align': 'left'})
styled_conclusion = styled_conclusion.set_table_styles([
    {'selector': 'th', 'props': [('background-color', '#4472C4'), ('color', 'white'), ('font-weight', 'bold'), ('text-align', 'center')]}
])

print("\nCONCLUSION AND RECOMMENDATIONS:")
print("Data Sources and Formulas for Conclusions:")
print("- Retention Analysis: Based on overall retention rate calculated from retained customers")
print("- Type Strategy: Based on comparison of retention rates across different type segments (POS-only, Jumbo.ae-only, Cross-type)")
print("- Customer Growth: Based on year-over-year change in total unique customers")
print("- Type Shift: Based on customer migration patterns between POS and Jumbo.ae")

display(styled_conclusion)

print("\nThis analysis provides a comprehensive overview of customer behavior across the two financial years, focusing on retention rates, channel preferences, and buying patterns.")


6. TYPE SWITCHING BEHAVIOR
This analysis shows how customers switch between types (POS and Jumbo.ae) from FY1 to FY2.
Analyzing type switching for all 44,578 customers that appear in both years
Computing primary types for all customers (this may take a moment for large datasets)...
Calculating primary types for FY1...Calculating primary types for FY1...

Calculating primary types for FY2...
Calculating primary types for FY2...
Analyzing type switching patterns...
Found 44,578 customers with valid type data in both years

Type Switching Behavior for All Retained Customers:
Data Sources and Formulas:
- Data Source: Primary type analysis of retained customers between FY1 and FY2
- Primary type: Most frequent type used by each customer in each year
  Formula: customer_df.groupby('customeremail')['channel'].agg(lambda x: x.value_counts().index[0])
- Same type: Customers whose primary type remained the same from FY1 to FY2
- Stayed with POS/Jumbo.ae: Subset of 'same type' customers by speci

Unnamed: 0,Switching Pattern,Count,Percentage
0,Stayed with same type (total),42049,94.33%
1,- Stayed with POS,41097,92.19%
2,- Stayed with Jumbo.ae,285,0.64%
3,Switched from POS to Jumbo.ae,509,1.14%
4,Switched from Jumbo.ae to POS,318,0.71%
5,Other patterns (involving UNKNOWN),1702,3.82%



Type Switching Insights:
- More customers are moving from POS to Jumbo.ae (509 vs 318)
- Net shift towards Jumbo.ae: 191 customers (0.43% of tracked customers)
- Type loyalty is 94.33%, with 97.74% loyal to POS and 0.68% loyal to Jumbo.ae

7. COMPREHENSIVE SUMMARY TABLE
This table provides a consolidated view of all key customer metrics across both financial years.

Comprehensive Summary Table - Data Sources and Formulas:
- Data Source: Consolidated metrics from all previous analyses
- Total Unique Customers: Count of distinct customers in each financial year
- Retained Customers: Intersection of FY1 and FY2 customer sets
- Retention Rate: (Retained customers / FY1 customers) * 100%
- New Customers in FY2: FY2 customers not present in FY1
- Lost Customers from FY1: FY1 customers not present in FY2
- POS/Jumbo.ae/Cross-type customers: Counts from respective type analyses
- Customers with Only Returns: Derived from order type analysis
- Type Switchers: Derived from type switching analys

Unnamed: 0,Metric,FY1 Value,FY2 Value,Change (%)
0,Total Unique Customers,247749.0,215634,-12.96%
1,Retained Customers,,44578,
2,Retention Rate (%),,17.99%,
3,New Customers in FY2,,171056,79.33%
4,Lost Customers from FY1,,203171,82.01%
5,POS-only Customers,231665.0,192559,-16.88%
6,Jumbo.ae-only Customers,8199.0,12053,47.01%
7,Cross-type Customers,1269.0,1231,-2.99%
8,Customers with Only Returns,247749.0,215634,-12.96%
9,POS-to-Jumbo.ae Switchers,,509,



===== FINAL REPORT AND KEY INSIGHTS =====

KEY RETENTION INSIGHTS:
Data Sources and Formulas:
- Overall retention rate: (Retained customers / Total FY1 customers) * 100%
  Formula: len(retained_customers) / fy1_total_emails * 100
- Cross-type customers retention: (Retained cross-type / Total FY1 cross-type) * 100%
  Formula: len(fy1_cross_retained) / len(fy1_cross) * 100
- Type-specific retention rates use the same formula pattern for their respective customer sets


Unnamed: 0,Key Insight,Interpretation
0,Overall customer retention rate: 17.99%,Percentage of FY1 customers who remained in FY2
1,Cross-type customers retention rate: 35.15%,Shows how loyal multi-type customers are
2,POS-only customers retention rate: 18.08%,Shows loyalty of POS customers
3,Jumbo.ae-only customers retention rate: 14.33%,Shows loyalty of Jumbo.ae-only customers



CUSTOMER OVERLAP METRICS:
Data Sources and Formulas:
- Total unique customers across both years: Union of FY1 and FY2 customer sets
  Formula: len(fy1_unique_customers.union(fy2_unique_customers))
- Customers present in both years: Intersection of FY1 and FY2 customer sets
  Formula: len(retained_customers)
- New customers in FY2: FY2 customers not in FY1
  Formula: len(new_customers_in_fy2)
- Lost customers from FY1: FY1 customers not in FY2
  Formula: len(lost_customers_from_fy1)
- Percentages calculated relative to their respective base year totals


Unnamed: 0,Metric
0,"Total unique customers across both years: 418,805"
1,"Customers present in both years: 44,578 (17.99% of FY1)"
2,"New customers in FY2: 171,056 (79.33% of FY2)"
3,"Lost customers from FY1: 203,171 (82.01% of FY1)"



TYPE DISTRIBUTION INSIGHTS:
Data Sources and Formulas:
- POS-only customer percentages: len(fy1_pos_only) / fy1_total_emails * 100 for FY1
  and len(fy2_pos_only) / fy2_total_emails * 100 for FY2
- Jumbo.ae-only customer percentages: len(fy1_jumbo_only) / fy1_total_emails * 100 for FY1
  and len(fy2_jumbo_only) / fy2_total_emails * 100 for FY2
- Cross-type customer percentages: len(fy1_cross) / fy1_total_emails * 100 for FY1
  and len(fy2_cross) / fy2_total_emails * 100 for FY2
- Change calculated as FY2 percentage - FY1 percentage


Unnamed: 0,Type Insight
0,POS-only customers: FY1 93.51% → FY2 89.30% (-4.21% change)
1,Jumbo.ae-only customers: FY1 3.31% → FY2 5.59% (2.28% change)
2,Cross-type customers: FY1 0.51% → FY2 0.57% (0.06% change)



CONCLUSION AND RECOMMENDATIONS:
Data Sources and Formulas for Conclusions:
- Retention Analysis: Based on overall retention rate calculated from retained customers
- Type Strategy: Based on comparison of retention rates across different type segments (POS-only, Jumbo.ae-only, Cross-type)
- Customer Growth: Based on year-over-year change in total unique customers
- Type Shift: Based on customer migration patterns between POS and Jumbo.ae


Unnamed: 0,Conclusion & Recommendation
0,Retention Analysis: The data shows an overall retention rate of 17.99%.
1,"Type Strategy: Cross-type customers show the highest retention rate, suggesting customers using both POS and Jumbo.ae are most loyal."
2,Customer Growth: Decreased by 12.96% from FY1 to FY2.
3,Type Shift: More customers are migrating from POS to Jumbo.ae.



This analysis provides a comprehensive overview of customer behavior across the two financial years, focusing on retention rates, channel preferences, and buying patterns.


In [6]:
# === ANALYSIS 8: 2025 Customer Repeat Analysis by 2024 Half-Year ===
print("\n8. CUSTOMER REPEAT ANALYSIS FROM 2024 HALF-YEARS")
print("This analysis examines customers from 2025 who were also present in the first and second half of 2024.")

# Using already loaded data from fy1_data and fy2_data instead of re-reading Excel files
print("Using already loaded data from previous cells to perform analysis")

# Extract data from different time periods using the existing dataframes
# 1. Extract customers from 2025 files (in FY2 data)
print("\nExtracting data from different time periods...")
customers_2025 = set()
if 'year' in fy2_data.columns:
    # If year information is directly available in the data
    for _, row in fy2_data[fy2_data['year'] == 2025].iterrows():
        customers_2025.add(str(row['customeremail']).strip().lower())
else:
    # Otherwise, detect year from file name
    for _, row in fy2_data[fy2_data['file_name'].str.contains('25|2025', case=False)].iterrows():
        customers_2025.add(str(row['customeremail']).strip().lower())

print(f"Found {len(customers_2025)} unique customers in 2025 files")

# 2. Extract customers from first half of 2024 (Jan-Jun) - could be in both FY1 and FY2
first_half_2024 = set()

# First half patterns to search for in filenames
first_half_patterns = ['jan.*24', 'feb.*24', 'mar.*24', 'apr.*24', 'may.*24', 'jun.*24', 
                       '24.*jan', '24.*feb', '24.*mar', '24.*apr', '24.*may', '24.*jun']

# Check in FY1 data
for pattern in first_half_patterns:
    mask = fy1_data['file_name'].str.contains(pattern, case=False, regex=True)
    for _, row in fy1_data[mask].iterrows():
        first_half_2024.add(str(row['customeremail']).strip().lower())

# Check in FY2 data
for pattern in first_half_patterns:
    mask = fy2_data['file_name'].str.contains(pattern, case=False, regex=True)
    for _, row in fy2_data[mask].iterrows():
        first_half_2024.add(str(row['customeremail']).strip().lower())

print(f"Found {len(first_half_2024)} unique customers in first half of 2024 files")

# 3. Extract customers from second half of 2024 (Jul-Dec) - likely only in FY2
second_half_2024 = set()

# Second half patterns to search for in filenames
second_half_patterns = ['jul.*24', 'aug.*24', 'sep.*24', 'oct.*24', 'nov.*24', 'dec.*24',
                        '24.*jul', '24.*aug', '24.*sep', '24.*oct', '24.*nov', '24.*dec']

# Check in FY1 data (just in case)
for pattern in second_half_patterns:
    mask = fy1_data['file_name'].str.contains(pattern, case=False, regex=True)
    for _, row in fy1_data[mask].iterrows():
        second_half_2024.add(str(row['customeremail']).strip().lower())

# Check in FY2 data
for pattern in second_half_patterns:
    mask = fy2_data['file_name'].str.contains(pattern, case=False, regex=True)
    for _, row in fy2_data[mask].iterrows():
        second_half_2024.add(str(row['customeremail']).strip().lower())

print(f"Found {len(second_half_2024)} unique customers in second half of 2024 files")

# Find repeated customers
print("\nAnalyzing customer overlap between periods...")
repeat_from_first_half = customers_2025.intersection(first_half_2024)
repeat_from_second_half = customers_2025.intersection(second_half_2024)
repeat_from_both_halves = repeat_from_first_half.intersection(repeat_from_second_half)
completely_new = customers_2025 - first_half_2024.union(second_half_2024)

# Print detailed metrics for debugging
print(f"Customers present in both 2025 and first half 2024: {len(repeat_from_first_half):,}")
print(f"Customers present in both 2025 and second half 2024: {len(repeat_from_second_half):,}")
print(f"Customers present across all periods (2025, H1 2024, H2 2024): {len(repeat_from_both_halves):,}")
print(f"Completely new customers in 2025 (not in any 2024 period): {len(completely_new):,}")

# Calculate metrics
if customers_2025:
    repeat_first_half_percent = len(repeat_from_first_half) / len(customers_2025) * 100
    repeat_second_half_percent = len(repeat_from_second_half) / len(customers_2025) * 100
    repeat_both_halves_percent = len(repeat_from_both_halves) / len(customers_2025) * 100
    completely_new_percent = len(completely_new) / len(customers_2025) * 100
else:
    repeat_first_half_percent = repeat_second_half_percent = repeat_both_halves_percent = completely_new_percent = 0
    print("WARNING: No customers found in 2025 files")

# Create DataFrame for half-year repeat analysis
half_year_df = pd.DataFrame({
    'Customer Source': [
        'Repeat customers from first half of 2024 (Jan-Jun)',
        'Repeat customers from second half of 2024 (Jul-Dec)',
        'Repeat customers from both halves of 2024',
        'Completely new customers in 2025'
    ],
    'Count': [
        f"{len(repeat_from_first_half):,}",
        f"{len(repeat_from_second_half):,}",
        f"{len(repeat_from_both_halves):,}",
        f"{len(completely_new):,}"
    ],
    'Percentage of 2025 Customers': [
        f"{repeat_first_half_percent:.2f}%",
        f"{repeat_second_half_percent:.2f}%",
        f"{repeat_both_halves_percent:.2f}%",
        f"{completely_new_percent:.2f}%"
    ]
})

# Style the half-year analysis table
styled_half_year = half_year_df.style.set_properties(**{'text-align': 'center'})
styled_half_year = styled_half_year.set_table_styles([
    {'selector': 'th', 'props': [('background-color', '#5B9BD5'), ('color', 'white'), ('font-weight', 'bold'), ('text-align', 'center')]},
    {'selector': '.col0', 'props': [('text-align', 'left')]}
])

print("\nRepeat Customer Analysis by 2024 Half-Year:")
print("Data Sources and Formulas:")
print("- Data Source: Customer sets from first half 2024, second half 2024, and 2025")
print("- First half 2024: Customers from Jan-Jun 2024 files (based on filename patterns)")
print("- Second half 2024: Customers from Jul-Dec 2024 files (based on filename patterns)")
print("- Repeat from first half: Intersection of 2025 and first half 2024 customers")
print("  Formula: customers_2025.intersection(first_half_2024)")
print("- Repeat from second half: Intersection of 2025 and second half 2024 customers")
print("  Formula: customers_2025.intersection(second_half_2024)")
print("- Repeat from both halves: Intersection of 2025, first half, and second half customers")
print("  Formula: repeat_from_first_half.intersection(repeat_from_second_half)")
print("- Completely new: 2025 customers not found in any 2024 data")
print("  Formula: customers_2025 - first_half_2024.union(second_half_2024)")
print("- Percentage: (Customer count / Total 2025 customers) * 100%")

display(styled_half_year)

# Create a visual comparison of retention from each half-year
# Calculate retention rates with safeguards against division by zero
if first_half_2024:
    first_half_rate = f"{len(repeat_from_first_half) / len(first_half_2024) * 100:.2f}%"
else:
    first_half_rate = 'N/A (no customers in first half 2024)'

if second_half_2024:
    second_half_rate = f"{len(repeat_from_second_half) / len(second_half_2024) * 100:.2f}%"
else:
    second_half_rate = 'N/A (no customers in second half 2024)'

both_halves = first_half_2024.intersection(second_half_2024)
if both_halves:
    both_halves_rate = f"{len(repeat_from_both_halves) / len(both_halves) * 100:.2f}%"
else:
    both_halves_rate = 'N/A (no customers in both halves)'

half_year_comparison = pd.DataFrame({
    'Time Period': [
        'First Half 2024 → 2025',
        'Second Half 2024 → 2025',
        'Both Halves → 2025'
    ],
    'Retention Rate': [
        first_half_rate,
        second_half_rate,
        both_halves_rate
    ],
    'Numerator/Denominator': [
        f"{len(repeat_from_first_half):,} / {len(first_half_2024):,}" if first_half_2024 else "N/A",
        f"{len(repeat_from_second_half):,} / {len(second_half_2024):,}" if second_half_2024 else "N/A",
        f"{len(repeat_from_both_halves):,} / {len(both_halves):,}" if both_halves else "N/A"
    ]
})

# Style the half-year comparison table
styled_comparison = half_year_comparison.style.set_properties(**{'text-align': 'center'})
styled_comparison = styled_comparison.set_table_styles([
    {'selector': 'th', 'props': [('background-color', '#4472C4'), ('color', 'white'), ('font-weight', 'bold'), ('text-align', 'center')]},
    {'selector': '.col0', 'props': [('text-align', 'left')]}
])

print("\nRetention Rate Comparison:")
print("This shows what percentage of customers from each 2024 time period returned in 2025")
print("Data Sources and Formulas:")
print("- First Half 2024 → 2025 Retention Rate: (Customers in both periods / First half 2024 customers) * 100%")
print("  Formula: len(repeat_from_first_half) / len(first_half_2024) * 100")
print("- Second Half 2024 → 2025 Retention Rate: (Customers in both periods / Second half 2024 customers) * 100%")
print("  Formula: len(repeat_from_second_half) / len(second_half_2024) * 100")
print("- Both Halves → 2025 Retention Rate: (Customers in all periods / Customers in both halves of 2024) * 100%")
print("  Formula: len(repeat_from_both_halves) / len(both_halves) * 100")
print("- Numerator/Denominator shows the raw counts used in each calculation")

display(styled_comparison)

# Key insights with data validation
print("\nKey Insights:")
if not customers_2025:
    print("- WARNING: No customer data found for 2025. Check file naming patterns and data availability.")
elif len(customers_2025) < 10:
    print(f"- WARNING: Only {len(customers_2025)} customers found in 2025 files, which is suspiciously low.")
    print("  Check file naming patterns and email column detection.")
else:
    # Normal insights when we have sufficient data
    if repeat_first_half_percent > repeat_second_half_percent:
        print(f"- Customers from the first half of 2024 showed stronger repeat purchase behavior ({repeat_first_half_percent:.2f}%)")
    else:
        print(f"- Customers from the second half of 2024 showed stronger repeat purchase behavior ({repeat_second_half_percent:.2f}%)")
    
    print(f"- {repeat_both_halves_percent:.2f}% of 2025 customers were consistent throughout 2024 (both halves)")
    print(f"- {completely_new_percent:.2f}% of 2025 customers are completely new (not seen in 2024)")

# Calculate full year 2024 to 2025 retention metrics
all_2024_customers = first_half_2024.union(second_half_2024)
repeating_full_year = customers_2025.intersection(all_2024_customers)
full_year_retention_percent = len(repeating_full_year) / len(all_2024_customers) * 100 if all_2024_customers else 0
full_year_repeat_percent = len(repeating_full_year) / len(customers_2025) * 100 if customers_2025 else 0

# Create a DataFrame for the full-year analysis
full_year_df = pd.DataFrame({
    'Analysis Type': [
        'Full Year 2024 → 2025 Retention',
        'Full Year 2024 → 2025 Repeat Rate',
        'New in 2025 (not in 2024)',
        'In 2024 but not in 2025'
    ],
    'Customer Count': [
        f"{len(repeating_full_year):,} / {len(all_2024_customers):,}",
        f"{len(repeating_full_year):,} / {len(customers_2025):,}",
        f"{len(customers_2025 - all_2024_customers):,}",
        f"{len(all_2024_customers - customers_2025):,}"
    ],
    'Percentage': [
        f"{full_year_retention_percent:.2f}%",
        f"{full_year_repeat_percent:.2f}%",
        f"{len(customers_2025 - all_2024_customers) / len(customers_2025) * 100:.2f}%",
        f"{len(all_2024_customers - customers_2025) / len(all_2024_customers) * 100:.2f}%"
    ]
})

# Style the full-year analysis table
styled_full_year = full_year_df.style.set_properties(**{'text-align': 'center'})
styled_full_year = styled_full_year.set_table_styles([
    {'selector': 'th', 'props': [('background-color', '#5B9BD5'), ('color', 'white'), ('font-weight', 'bold'), ('text-align', 'center')]},
    {'selector': '.col0', 'props': [('text-align', 'left')]}
])

print("\nFULL YEAR 2024 TO 2025 ANALYSIS:")
print("This shows the complete year-over-year customer retention metrics")
print("Data Sources and Formulas:")
print("- Data Source: Combined customer sets from all 2024 files and all 2025 files")
print("- Full Year 2024 → 2025 Retention: (Customers in both years / All 2024 customers) * 100%")
print("  Formula: len(repeating_full_year) / len(all_2024_customers) * 100")
print("- Full Year 2024 → 2025 Repeat Rate: (Customers in both years / All 2025 customers) * 100%")
print("  Formula: len(repeating_full_year) / len(customers_2025) * 100")
print("- New in 2025: Customers in 2025 not found in any 2024 data")
print("  Formula: len(customers_2025 - all_2024_customers) / len(customers_2025) * 100")
print("- In 2024 but not in 2025: Customers in 2024 not present in 2025")
print("  Formula: len(all_2024_customers - customers_2025) / len(all_2024_customers) * 100")
print("- Customer Count column shows the raw counts used in each calculation")

display(styled_full_year)

# Add analytical insights
print(f"\nAdditional Context:")
print(f"- Total unique customers in all 2024 files: {len(all_2024_customers):,}")
print(f"- Total unique customers in 2025 files: {len(customers_2025):,}")
print(f"- Customer overlap between first and second half of 2024: {len(first_half_2024.intersection(second_half_2024)):,} " +
      f"({len(first_half_2024.intersection(second_half_2024)) / len(all_2024_customers) * 100:.2f}% of all 2024 customers)" if all_2024_customers else "- No data available for 2024")

# === ANALYSIS 9: Three-Year Comparison (First 4 Months 2025 vs 2024 vs 2023) ===
print("\n9. THREE-YEAR COMPARISON (FIRST 4 MONTHS 2025 vs 2024 vs 2023)")
print("This analysis compares the first four months of 2025 (Jan-Apr, including April25 file) with full year data from 2024 and 2023")

# Get customers from first 4 months 2025 (Jan-Apr 2025, including April25 file)
first_4_months_2025_customers = set()
if 'year' in fy2_data.columns and 'month' in fy2_data.columns:
    # If year and month information is directly available in the data
    first_4_mask = (fy2_data['year'] == 2025) & (fy2_data['month'].isin([1, 2, 3, 4]))
    for _, row in fy2_data[first_4_mask].iterrows():
        first_4_months_2025_customers.add(str(row['customeremail']).strip().lower())
else:
    # Otherwise, detect from file name
    first_4_patterns = ['jan.*25', 'feb.*25', 'mar.*25', 'apr.*25', 'april.*25', 
                       '25.*jan', '25.*feb', '25.*mar', '25.*apr', '25.*april',
                       'jan.*2025', 'feb.*2025', 'mar.*2025', 'apr.*2025', 'april.*2025']
    for pattern in first_4_patterns:
        mask = fy2_data['file_name'].str.contains(pattern, case=False, regex=True)
        for _, row in fy2_data[mask].iterrows():
            first_4_months_2025_customers.add(str(row['customeremail']).strip().lower())

# Also check for April25 file in main q directory if it exists
april25_file_path = os.path.join(q_main_path, "CUSTOMER PROFILE_April25.xlsx")
if os.path.exists(april25_file_path):
    april25_df = process_excel_file(april25_file_path)
    if april25_df is not None and not april25_df.empty:
        for _, row in april25_df.iterrows():
            first_4_months_2025_customers.add(str(row['customeremail']).strip().lower())
        print(f"Added {len(april25_df)} records from April25 file")

print(f"Found {len(first_4_months_2025_customers):,} unique customers in first 4 months of 2025 (Jan-Apr)")

# Get all 2023 customers
all_2023_customers = set()
for _, row in q_2023_data.iterrows():
    all_2023_customers.add(str(row['customeremail']).strip().lower())

# Add any 2023 customers from fy1_data (just in case there are any files from 2023 there)
if 'year' in fy1_data.columns:
    mask_2023 = fy1_data['year'] == 2023
    for _, row in fy1_data[mask_2023].iterrows():
        all_2023_customers.add(str(row['customeremail']).strip().lower())
else:
    # Look for files with 2023 in the name
    mask_2023 = fy1_data['file_name'].str.contains('23|2023', case=False)
    for _, row in fy1_data[mask_2023].iterrows():
        all_2023_customers.add(str(row['customeremail']).strip().lower())

print(f"Found {len(all_2023_customers):,} unique customers in 2023")

# Calculate overlaps between the years
overlap_2025_4m_2024 = first_4_months_2025_customers.intersection(all_2024_customers)
overlap_2025_4m_2023 = first_4_months_2025_customers.intersection(all_2023_customers)
overlap_2024_2023 = all_2024_customers.intersection(all_2023_customers)
overlap_all_years = first_4_months_2025_customers.intersection(all_2024_customers).intersection(all_2023_customers)

# Calculate percentages for the comparative analysis
if first_4_months_2025_customers:
    first_4m_2025_to_2024_percent = len(overlap_2025_4m_2024) / len(first_4_months_2025_customers) * 100
    first_4m_2025_to_2023_percent = len(overlap_2025_4m_2023) / len(first_4_months_2025_customers) * 100
    first_4m_2025_new_percent = len(first_4_months_2025_customers - all_2024_customers - all_2023_customers) / len(first_4_months_2025_customers) * 100
else:
    first_4m_2025_to_2024_percent = first_4m_2025_to_2023_percent = first_4m_2025_new_percent = 0

if all_2024_customers:
    y2024_to_first_4m_2025_percent = len(overlap_2025_4m_2024) / len(all_2024_customers) * 100
    y2024_to_2023_percent = len(overlap_2024_2023) / len(all_2024_customers) * 100
else:
    y2024_to_first_4m_2025_percent = y2024_to_2023_percent = 0

if all_2023_customers:
    y2023_to_first_4m_2025_percent = len(overlap_2025_4m_2023) / len(all_2023_customers) * 100
    y2023_to_2024_percent = len(overlap_2024_2023) / len(all_2023_customers) * 100
else:
    y2023_to_first_4m_2025_percent = y2023_to_2024_percent = 0

# Create a DataFrame for the three-year comparison
three_year_df = pd.DataFrame({
    'Comparison': [
        'First 4 months 2025 customers also in 2024',
        'First 4 months 2025 customers also in 2023',
        'First 4 months 2025 customers in both 2024 & 2023',
        'First 4 months 2025 customers new (not in 2023/2024)',
        '2024 customers also in first 4 months 2025',
        '2024 customers also in 2023',
        '2023 customers also in first 4 months 2025',
        '2023 customers also in 2024',
        'Customers present in all three periods'
    ],
    'Customer Count': [
        f"{len(overlap_2025_4m_2024):,}",
        f"{len(overlap_2025_4m_2023):,}",
        f"{len(overlap_all_years):,}",
        f"{len(first_4_months_2025_customers - all_2024_customers - all_2023_customers):,}",
        f"{len(overlap_2025_4m_2024):,}",
        f"{len(overlap_2024_2023):,}",
        f"{len(overlap_2025_4m_2023):,}",
        f"{len(overlap_2024_2023):,}",
        f"{len(overlap_all_years):,}"
    ],
    'Percentage': [
        f"{first_4m_2025_to_2024_percent:.2f}% of first 4 months 2025",
        f"{first_4m_2025_to_2023_percent:.2f}% of first 4 months 2025",
        f"{len(overlap_all_years) / len(first_4_months_2025_customers) * 100:.2f}% of first 4 months 2025" if first_4_months_2025_customers else "0.00%",
        f"{first_4m_2025_new_percent:.2f}% of first 4 months 2025",
        f"{y2024_to_first_4m_2025_percent:.2f}% of 2024",
        f"{y2024_to_2023_percent:.2f}% of 2024",
        f"{y2023_to_first_4m_2025_percent:.2f}% of 2023",
        f"{y2023_to_2024_percent:.2f}% of 2023",
        f"Present in all three periods"
    ]
})

# Style the three-year comparison table
styled_three_year = three_year_df.style.set_properties(**{'text-align': 'center'})
styled_three_year = styled_three_year.set_table_styles([
    {'selector': 'th', 'props': [('background-color', '#5B9BD5'), ('color', 'white'), ('font-weight', 'bold'), ('text-align', 'center')]},
    {'selector': '.col0', 'props': [('text-align', 'left')]}
])

print("\nThree-Year Customer Comparison (First 4 Months 2025 vs 2024 vs 2023):")
print("Data Sources and Formulas:")
print("- Data Source: Customer sets from first 4 months 2025 (including April25), full year 2024, and full year 2023")
print("- First 4 months 2025: Customers from Jan-Apr 2025 files (based on filename patterns) including April25 file")
print("- 2024: All unique customers from 2024 files in either financial year")
print("- 2023: All unique customers from 2023 files")
print("- First 4 months 2025 customers also in 2024: first_4_months_2025_customers.intersection(all_2024_customers)")
print("- First 4 months 2025 customers also in 2023: first_4_months_2025_customers.intersection(all_2023_customers)")
print("- First 4 months 2025 customers in both 2024 & 2023: first_4_months_2025_customers.intersection(all_2024_customers).intersection(all_2023_customers)")
print("- New customers: first_4_months_2025_customers - all_2024_customers - all_2023_customers")
print("- Percentages calculated relative to their respective base customer sets")

display(styled_three_year)

# Create a visual representation of year-over-year retention trends
retention_trend_df = pd.DataFrame({
    'Retention Metric': [
        'First 4 months 2025 customers retained from 2024',
        'First 4 months 2025 customers retained from 2023',
        '2024 customers retained from 2023',
        'First 4 months 2025 completely new customers'
    ],
    'Percentage': [
        f"{first_4m_2025_to_2024_percent:.2f}%",
        f"{first_4m_2025_to_2023_percent:.2f}%",
        f"{y2024_to_2023_percent:.2f}%",
        f"{first_4m_2025_new_percent:.2f}%"
    ],
    'Count': [
        f"{len(overlap_2025_4m_2024):,} / {len(first_4_months_2025_customers):,}",
        f"{len(overlap_2025_4m_2023):,} / {len(first_4_months_2025_customers):,}",
        f"{len(overlap_2024_2023):,} / {len(all_2024_customers):,}",
        f"{len(first_4_months_2025_customers - all_2024_customers - all_2023_customers):,} / {len(first_4_months_2025_customers):,}"
    ]
})

# Style the retention trend table
styled_trend = retention_trend_df.style.set_properties(**{'text-align': 'center'})
styled_trend = styled_trend.set_table_styles([
    {'selector': 'th', 'props': [('background-color', '#4472C4'), ('color', 'white'), ('font-weight', 'bold'), ('text-align', 'center')]},
    {'selector': '.col0', 'props': [('text-align', 'left')]}
])

print("\nRetention Trend Analysis:")
print("Data Sources and Formulas:")
print("- First 4 months 2025 customers retained from 2024: (Overlap between first 4 months 2025 and 2024 / Total first 4 months 2025 customers) * 100%")
print("  Formula: len(overlap_2025_4m_2024) / len(first_4_months_2025_customers) * 100")
print("- First 4 months 2025 customers retained from 2023: (Overlap between first 4 months 2025 and 2023 / Total first 4 months 2025 customers) * 100%")
print("  Formula: len(overlap_2025_4m_2023) / len(first_4_months_2025_customers) * 100")
print("- 2024 customers retained from 2023: (Overlap between 2024 and 2023 / Total 2024 customers) * 100%")
print("  Formula: len(overlap_2024_2023) / len(all_2024_customers) * 100")
print("- First 4 months 2025 completely new customers: (New customers in first 4 months 2025 / Total first 4 months 2025 customers) * 100%")
print("  Formula: len(first_4_months_2025_customers - all_2024_customers - all_2023_customers) / len(first_4_months_2025_customers) * 100")
print("- Count column shows the raw counts (numerator/denominator) used in each calculation")

display(styled_trend)

# === ANALYSIS 10: Cross-Type Customer Comparison Across Years ===
print("\n10. CROSS-TYPE CUSTOMER COMPARISON ACROSS YEARS")
print("This analysis compares cross-type shopping behavior (POS and Jumbo.ae) across 2023, 2024, and first 4 months 2025")

# Function to identify cross-type customers in a dataset
def identify_cross_type_customers(df):
    pos_customers = set()
    jumbo_customers = set()
    for _, row in df[df['channel'] == 'POS'].iterrows():
        pos_customers.add(str(row['customeremail']).strip().lower())
    for _, row in df[df['channel'] == 'Jumbo.ae'].iterrows():
        jumbo_customers.add(str(row['customeremail']).strip().lower())
    cross_type = pos_customers.intersection(jumbo_customers)
    return cross_type, pos_customers, jumbo_customers

# Identify cross-type customers in each year/period
print("\nAnalyzing cross-type shopping behavior...")

# For 2023
cross_2023, pos_2023, jumbo_2023 = identify_cross_type_customers(q_2023_data)
print(f"2023: Found {len(cross_2023):,} cross-type customers out of {len(all_2023_customers):,} total customers")

# For 2024 (combining both halves)
cross_2024 = set()
pos_2024 = set()
jumbo_2024 = set()

# First check FY1 data
cross_fy1, pos_fy1, jumbo_fy1 = identify_cross_type_customers(fy1_data[fy1_data['file_name'].str.contains('24|2024', case=False)])
cross_2024.update(cross_fy1)
pos_2024.update(pos_fy1)
jumbo_2024.update(jumbo_fy1)

# Then check FY2 data
cross_fy2, pos_fy2, jumbo_fy2 = identify_cross_type_customers(fy2_data[fy2_data['file_name'].str.contains('24|2024', case=False)])
cross_2024.update(cross_fy2)
pos_2024.update(pos_fy2)
jumbo_2024.update(jumbo_fy2)

print(f"2024: Found {len(cross_2024):,} cross-type customers out of {len(all_2024_customers):,} total customers")

# For first 4 months 2025
first_4m_2025_df = fy2_data[fy2_data['file_name'].str.contains('jan.*25|feb.*25|mar.*25|apr.*25|april.*25|25.*jan|25.*feb|25.*mar|25.*apr|25.*april', case=False, regex=True)]
cross_first_4m_2025, pos_first_4m_2025, jumbo_first_4m_2025 = identify_cross_type_customers(first_4m_2025_df)
print(f"First 4 months 2025: Found {len(cross_first_4m_2025):,} cross-type customers out of {len(first_4_months_2025_customers):,} total customers")

# Create a DataFrame to compare cross-type behavior
cross_type_comparison = pd.DataFrame({
    'Time Period': ['2023', '2024', 'First 4 Months 2025'],
    'Cross-Type Customers': [
        f"{len(cross_2023):,} ({len(cross_2023)/len(all_2023_customers)*100:.2f}%)" if all_2023_customers else "N/A",
        f"{len(cross_2024):,} ({len(cross_2024)/len(all_2024_customers)*100:.2f}%)" if all_2024_customers else "N/A", 
        f"{len(cross_first_4m_2025):,} ({len(cross_first_4m_2025)/len(first_4_months_2025_customers)*100:.2f}%)" if first_4_months_2025_customers else "N/A"
    ],
    'POS-Only Customers': [
        f"{len(pos_2023 - cross_2023):,} ({len(pos_2023 - cross_2023)/len(all_2023_customers)*100:.2f}%)" if all_2023_customers else "N/A",
        f"{len(pos_2024 - cross_2024):,} ({len(pos_2024 - cross_2024)/len(all_2024_customers)*100:.2f}%)" if all_2024_customers else "N/A",
        f"{len(pos_first_4m_2025 - cross_first_4m_2025):,} ({len(pos_first_4m_2025 - cross_first_4m_2025)/len(first_4_months_2025_customers)*100:.2f}%)" if first_4_months_2025_customers else "N/A"
    ],
    'Jumbo.ae-Only Customers': [
        f"{len(jumbo_2023 - cross_2023):,} ({len(jumbo_2023 - cross_2023)/len(all_2023_customers)*100:.2f}%)" if all_2023_customers else "N/A",
        f"{len(jumbo_2024 - cross_2024):,} ({len(jumbo_2024 - cross_2024)/len(all_2024_customers)*100:.2f}%)" if all_2024_customers else "N/A",
        f"{len(jumbo_first_4m_2025 - cross_first_4m_2025):,} ({len(jumbo_first_4m_2025 - cross_first_4m_2025)/len(first_4_months_2025_customers)*100:.2f}%)" if first_4_months_2025_customers else "N/A"
    ]
})

# Style the cross-type comparison table
styled_cross_type = cross_type_comparison.style.set_properties(**{'text-align': 'center'})
styled_cross_type = styled_cross_type.set_table_styles([
    {'selector': 'th', 'props': [('background-color', '#5B9BD5'), ('color', 'white'), ('font-weight', 'bold'), ('text-align', 'center')]},
    {'selector': '.col0', 'props': [('text-align', 'left')]}
])

print("\nCross-Type Shopping Behavior Comparison:")
print("Data Sources and Formulas:")
print("- Data Source: Customer type behavior analysis across all three time periods")
print("- 2023 data: From q_2023_data dataframe")
print("- 2024 data: Combined from fy1_data and fy2_data with '24' or '2024' in filename")
print("- First 4 months 2025 data: From fy2_data with Jan/Feb/Mar/Apr 2025 patterns in filename")
print("- Cross-Type Customers: Customers who used both POS and Jumbo.ae types")
print("  Formula: pos_customers.intersection(jumbo_customers) for each time period")
print("- POS-Only Customers: Customers who only used POS type (not cross-type)")
print("  Formula: pos_customers - cross_type_customers for each time period")
print("- Jumbo.ae-Only Customers: Customers who only used Jumbo.ae type (not cross-type)")
print("  Formula: jumbo_customers - cross_type_customers for each time period")
print("- Percentages: (Customer count / Total period customers) * 100%")

display(styled_cross_type)

# === ANALYSIS 11: Combined 2023+2024 Retention in 2025 Analysis ===
print("\n11. COMBINED 2023+2024 RETENTION IN 2025 ANALYSIS")
print("This analysis examines customers from 2025 who were retained from both 2024 and 2023")

# Get customers present in both 2023 and 2024
customers_2023_2024_combined = all_2023_customers.union(all_2024_customers)
customers_both_2023_2024 = all_2023_customers.intersection(all_2024_customers)
print(f"Total unique customers across 2023 and 2024 combined: {len(customers_2023_2024_combined):,}")
print(f"Customers present in both 2023 and 2024: {len(customers_both_2023_2024):,}")

# Calculate retention rates for full 2025 (using all 2025 customers found)
retained_from_combined = customers_2025.intersection(customers_2023_2024_combined)
retained_from_both = customers_2025.intersection(customers_both_2023_2024)

# Calculate percentages
combined_retention_rate = len(retained_from_combined) / len(customers_2023_2024_combined) * 100 if customers_2023_2024_combined else 0
both_retention_rate = len(retained_from_both) / len(customers_both_2023_2024) * 100 if customers_both_2023_2024 else 0

# For 2025 perspective
combined_repeat_rate = len(retained_from_combined) / len(customers_2025) * 100 if customers_2025 else 0
both_repeat_rate = len(retained_from_both) / len(customers_2025) * 100 if customers_2025 else 0

# Create DataFrame for the combined retention analysis
combined_retention_df = pd.DataFrame({
    'Customer Group': [
        'Customers from either 2023 or 2024 retained in 2025',
        'Customers from both 2023 and 2024 retained in 2025',
        '2025 customers who were also customers in either 2023 or 2024',
        '2025 customers who were also customers in both 2023 and 2024',
        '2025 customers who are completely new (not in 2023 or 2024)'
    ],
    'Customer Count': [
        f"{len(retained_from_combined):,} / {len(customers_2023_2024_combined):,}",
        f"{len(retained_from_both):,} / {len(customers_both_2023_2024):,}",
        f"{len(retained_from_combined):,} / {len(customers_2025):,}",
        f"{len(retained_from_both):,} / {len(customers_2025):,}",
        f"{len(customers_2025 - customers_2023_2024_combined):,} / {len(customers_2025):,}"
    ],
    'Percentage': [
        f"{combined_retention_rate:.2f}%",
        f"{both_retention_rate:.2f}%",
        f"{combined_repeat_rate:.2f}%",
        f"{both_repeat_rate:.2f}%",
        f"{len(customers_2025 - customers_2023_2024_combined) / len(customers_2025) * 100:.2f}%"
    ]
})

# Style the combined retention table
styled_combined_retention = combined_retention_df.style.set_properties(**{'text-align': 'center'})
styled_combined_retention = styled_combined_retention.set_table_styles([
    {'selector': 'th', 'props': [('background-color', '#4472C4'), ('color', 'white'), ('font-weight', 'bold'), ('text-align', 'center')]},
    {'selector': '.col0', 'props': [('text-align', 'left')]}
])

print("\nCombined 2023+2024 Retention Analysis:")
print("Data Sources and Formulas:")
print("- Data Source: Combined customer sets from 2023, 2024, and 2025")
print("- Customers from either 2023 or 2024 retained in 2025:")
print("  Formula: customers_2025.intersection(customers_2023_2024_combined) / len(customers_2023_2024_combined) * 100")
print("- Customers from both 2023 and 2024 retained in 2025:")
print("  Formula: customers_2025.intersection(customers_both_2023_2024) / len(customers_both_2023_2024) * 100")
print("- 2025 customers who were also customers in either 2023 or 2024:")
print("  Formula: len(retained_from_combined) / len(customers_2025) * 100")
print("- 2025 customers who were also customers in both 2023 and 2024:")
print("  Formula: len(retained_from_both) / len(customers_2025) * 100")
print("- 2025 customers who are completely new:")
print("  Formula: len(customers_2025 - customers_2023_2024_combined) / len(customers_2025) * 100")
print("- Customer Count column shows the raw counts (numerator/denominator) used in each calculation")

display(styled_combined_retention)

# Key three-year insights
print("\nKey Three-Year Insights:")
print(f"- Total unique customers across all three periods (First 4 months 2025, 2024, 2023): {len(first_4_months_2025_customers.union(all_2024_customers).union(all_2023_customers)):,}")
print(f"- Customers present in all three periods: {len(overlap_all_years):,}")

# Insights about cross-type behavior
if len(cross_2023) > 0 and len(cross_2024) > 0 and len(cross_first_4m_2025) > 0:
    cross_trend = "increasing" if (len(cross_2024)/len(all_2024_customers) > len(cross_2023)/len(all_2023_customers) and 
                                   len(cross_first_4m_2025)/len(first_4_months_2025_customers) > len(cross_2024)/len(all_2024_customers)) else "decreasing"
    print(f"- Cross-type shopping behavior (POS + Jumbo.ae) is {cross_trend} over the three-year period")
    
    # Calculate retention rate for cross-type customers
    cross_2023_retained_in_2025 = cross_2023.intersection(customers_2025)
    cross_type_retention = len(cross_2023_retained_in_2025) / len(cross_2023) * 100 if cross_2023 else 0
    print(f"- Cross-type customers from 2023 have a {cross_type_retention:.2f}% retention rate in 2025")

if first_4_months_2025_customers:
    if first_4m_2025_to_2024_percent > first_4m_2025_to_2023_percent:
        print(f"- First 4 months 2025 has stronger customer overlap with 2024 ({first_4m_2025_to_2024_percent:.2f}%) than with 2023 ({first_4m_2025_to_2023_percent:.2f}%)")
    else:
        print(f"- First 4 months 2025 has stronger customer overlap with 2023 ({first_4m_2025_to_2023_percent:.2f}%) than with 2024 ({first_4m_2025_to_2024_percent:.2f}%)")

print(f"- Of all 2025 customers, {both_repeat_rate:.2f}% were loyal customers present in both 2023 and 2024")
print(f"- {combined_repeat_rate - both_repeat_rate:.2f}% of 2025 customers were present in either 2023 or 2024, but not both years")

# Calculate customer base growth/decline
if all_2023_customers and all_2024_customers:
    growth_2023_to_2024 = (len(all_2024_customers) - len(all_2023_customers)) / len(all_2023_customers) * 100
    print(f"- Customer base {'grew' if growth_2023_to_2024 >= 0 else 'declined'} by {abs(growth_2023_to_2024):.2f}% from 2023 to 2024")

# Estimate annualized growth for 2025 based on first 4 months
if all_2024_customers and first_4_months_2025_customers:
    first_4m_2025_annualized = len(first_4_months_2025_customers) * 3  # Approximate full year based on first 4 months (12/4 = 3)
    estimated_growth = (first_4m_2025_annualized - len(all_2024_customers)) / len(all_2024_customers) * 100
    print(f"- Based on first 4 months data, 2025 customer base is projected to {'grow' if estimated_growth >= 0 else 'decline'} by approximately {abs(estimated_growth):.2f}% compared to 2024")


8. CUSTOMER REPEAT ANALYSIS FROM 2024 HALF-YEARS
This analysis examines customers from 2025 who were also present in the first and second half of 2024.
Using already loaded data from previous cells to perform analysis

Extracting data from different time periods...
Found 51548 unique customers in 2025 files
Found 51548 unique customers in 2025 files
Found 109146 unique customers in first half of 2024 files
Found 109146 unique customers in first half of 2024 files
Found 154952 unique customers in second half of 2024 files

Analyzing customer overlap between periods...
Customers present in both 2025 and first half 2024: 4,887
Customers present in both 2025 and second half 2024: 7,558
Customers present across all periods (2025, H1 2024, H2 2024): 3,456
Completely new customers in 2025 (not in any 2024 period): 42,559

Repeat Customer Analysis by 2024 Half-Year:
Data Sources and Formulas:
- Data Source: Customer sets from first half 2024, second half 2024, and 2025
- First half 2024: Cust

Unnamed: 0,Customer Source,Count,Percentage of 2025 Customers
0,Repeat customers from first half of 2024 (Jan-Jun),4887,9.48%
1,Repeat customers from second half of 2024 (Jul-Dec),7558,14.66%
2,Repeat customers from both halves of 2024,3456,6.70%
3,Completely new customers in 2025,42559,82.56%



Retention Rate Comparison:
This shows what percentage of customers from each 2024 time period returned in 2025
Data Sources and Formulas:
- First Half 2024 → 2025 Retention Rate: (Customers in both periods / First half 2024 customers) * 100%
  Formula: len(repeat_from_first_half) / len(first_half_2024) * 100
- Second Half 2024 → 2025 Retention Rate: (Customers in both periods / Second half 2024 customers) * 100%
  Formula: len(repeat_from_second_half) / len(second_half_2024) * 100
- Both Halves → 2025 Retention Rate: (Customers in all periods / Customers in both halves of 2024) * 100%
  Formula: len(repeat_from_both_halves) / len(both_halves) * 100
- Numerator/Denominator shows the raw counts used in each calculation


Unnamed: 0,Time Period,Retention Rate,Numerator/Denominator
0,First Half 2024 → 2025,4.48%,"4,887 / 109,146"
1,Second Half 2024 → 2025,4.88%,"7,558 / 154,952"
2,Both Halves → 2025,5.82%,"3,456 / 59,429"



Key Insights:
- Customers from the second half of 2024 showed stronger repeat purchase behavior (14.66%)
- 6.70% of 2025 customers were consistent throughout 2024 (both halves)
- 82.56% of 2025 customers are completely new (not seen in 2024)

FULL YEAR 2024 TO 2025 ANALYSIS:
This shows the complete year-over-year customer retention metrics
Data Sources and Formulas:
- Data Source: Combined customer sets from all 2024 files and all 2025 files
- Full Year 2024 → 2025 Retention: (Customers in both years / All 2024 customers) * 100%
  Formula: len(repeating_full_year) / len(all_2024_customers) * 100
- Full Year 2024 → 2025 Repeat Rate: (Customers in both years / All 2025 customers) * 100%
  Formula: len(repeating_full_year) / len(customers_2025) * 100
- New in 2025: Customers in 2025 not found in any 2024 data
  Formula: len(customers_2025 - all_2024_customers) / len(customers_2025) * 100
- In 2024 but not in 2025: Customers in 2024 not present in 2025
  Formula: len(all_2024_customers - 

Unnamed: 0,Analysis Type,Customer Count,Percentage
0,Full Year 2024 → 2025 Retention,"8,989 / 204,669",4.39%
1,Full Year 2024 → 2025 Repeat Rate,"8,989 / 51,548",17.44%
2,New in 2025 (not in 2024),42559,82.56%
3,In 2024 but not in 2025,195680,95.61%



Additional Context:
- Total unique customers in all 2024 files: 204,669
- Total unique customers in 2025 files: 51,548
- Customer overlap between first and second half of 2024: 59,429 (29.04% of all 2024 customers)

9. THREE-YEAR COMPARISON (FIRST 4 MONTHS 2025 vs 2024 vs 2023)
This analysis compares the first four months of 2025 (Jan-Apr, including April25 file) with full year data from 2024 and 2023
Reading 'Export' sheet from CUSTOMER PROFILE_April25.xlsx
Reading 'Export' sheet from CUSTOMER PROFILE_April25.xlsx
File: CUSTOMER PROFILE_April25.xlsx - Shape: (65647, 49)
Column mapping for CUSTOMER PROFILE_April25.xlsx:
  Email: customeremail
  Date: orderdate
  Type: type
  Order Type: retailordertype
Processed CUSTOMER PROFILE_April25.xlsx: Found 65647 valid records (after filtering for POS/Jumbo.ae types)
File: CUSTOMER PROFILE_April25.xlsx - Shape: (65647, 49)
Column mapping for CUSTOMER PROFILE_April25.xlsx:
  Email: customeremail
  Date: orderdate
  Type: type
  Order Type: reta

Unnamed: 0,Comparison,Customer Count,Percentage
0,First 4 months 2025 customers also in 2024,11129,16.69% of first 4 months 2025
1,First 4 months 2025 customers also in 2023,9170,13.76% of first 4 months 2025
2,First 4 months 2025 customers in both 2024 & 2023,3965,5.95% of first 4 months 2025
3,First 4 months 2025 customers new (not in 2023/2024),50328,75.50% of first 4 months 2025
4,2024 customers also in first 4 months 2025,11129,5.44% of 2024
5,2024 customers also in 2023,30339,14.82% of 2024
6,2023 customers also in first 4 months 2025,9170,3.44% of 2023
7,2023 customers also in 2024,30339,11.37% of 2023
8,Customers present in all three periods,3965,Present in all three periods



Retention Trend Analysis:
Data Sources and Formulas:
- First 4 months 2025 customers retained from 2024: (Overlap between first 4 months 2025 and 2024 / Total first 4 months 2025 customers) * 100%
  Formula: len(overlap_2025_4m_2024) / len(first_4_months_2025_customers) * 100
- First 4 months 2025 customers retained from 2023: (Overlap between first 4 months 2025 and 2023 / Total first 4 months 2025 customers) * 100%
  Formula: len(overlap_2025_4m_2023) / len(first_4_months_2025_customers) * 100
- 2024 customers retained from 2023: (Overlap between 2024 and 2023 / Total 2024 customers) * 100%
  Formula: len(overlap_2024_2023) / len(all_2024_customers) * 100
- First 4 months 2025 completely new customers: (New customers in first 4 months 2025 / Total first 4 months 2025 customers) * 100%
  Formula: len(first_4_months_2025_customers - all_2024_customers - all_2023_customers) / len(first_4_months_2025_customers) * 100
- Count column shows the raw counts (numerator/denominator) used in ea

Unnamed: 0,Retention Metric,Percentage,Count
0,First 4 months 2025 customers retained from 2024,16.69%,"11,129 / 66,662"
1,First 4 months 2025 customers retained from 2023,13.76%,"9,170 / 66,662"
2,2024 customers retained from 2023,14.82%,"30,339 / 204,669"
3,First 4 months 2025 completely new customers,75.50%,"50,328 / 66,662"



10. CROSS-TYPE CUSTOMER COMPARISON ACROSS YEARS
This analysis compares cross-type shopping behavior (POS and Jumbo.ae) across 2023, 2024, and first 4 months 2025

Analyzing cross-type shopping behavior...
2023: Found 202 cross-type customers out of 266,780 total customers
2023: Found 202 cross-type customers out of 266,780 total customers
2024: Found 896 cross-type customers out of 204,669 total customers
2024: Found 896 cross-type customers out of 204,669 total customers
First 4 months 2025: Found 245 cross-type customers out of 66,662 total customers

Cross-Type Shopping Behavior Comparison:
Data Sources and Formulas:
- Data Source: Customer type behavior analysis across all three time periods
- 2023 data: From q_2023_data dataframe
- 2024 data: Combined from fy1_data and fy2_data with '24' or '2024' in filename
- First 4 months 2025 data: From fy2_data with Jan/Feb/Mar/Apr 2025 patterns in filename
- Cross-Type Customers: Customers who used both POS and Jumbo.ae types
  Formula: po

Unnamed: 0,Time Period,Cross-Type Customers,POS-Only Customers,Jumbo.ae-Only Customers
0,2023,202 (0.08%),"76,761 (28.77%)","2,280 (0.85%)"
1,2024,896 (0.44%),"186,499 (91.12%)","9,680 (4.73%)"
2,First 4 Months 2025,245 (0.37%),"44,540 (66.81%)","4,671 (7.01%)"



11. COMBINED 2023+2024 RETENTION IN 2025 ANALYSIS
This analysis examines customers from 2025 who were retained from both 2024 and 2023
Total unique customers across 2023 and 2024 combined: 441,110
Customers present in both 2023 and 2024: 30,339

Combined 2023+2024 Retention Analysis:
Data Sources and Formulas:
- Data Source: Combined customer sets from 2023, 2024, and 2025
- Customers from either 2023 or 2024 retained in 2025:
  Formula: customers_2025.intersection(customers_2023_2024_combined) / len(customers_2023_2024_combined) * 100
- Customers from both 2023 and 2024 retained in 2025:
  Formula: customers_2025.intersection(customers_both_2023_2024) / len(customers_both_2023_2024) * 100
- 2025 customers who were also customers in either 2023 or 2024:
  Formula: len(retained_from_combined) / len(customers_2025) * 100
- 2025 customers who were also customers in both 2023 and 2024:
  Formula: len(retained_from_both) / len(customers_2025) * 100
- 2025 customers who are completely new:


Unnamed: 0,Customer Group,Customer Count,Percentage
0,Customers from either 2023 or 2024 retained in 2025,"13,019 / 441,110",2.95%
1,Customers from both 2023 and 2024 retained in 2025,"3,194 / 30,339",10.53%
2,2025 customers who were also customers in either 2023 or 2024,"13,019 / 51,548",25.26%
3,2025 customers who were also customers in both 2023 and 2024,"3,194 / 51,548",6.20%
4,2025 customers who are completely new (not in 2023 or 2024),"38,529 / 51,548",74.74%



Key Three-Year Insights:
- Total unique customers across all three periods (First 4 months 2025, 2024, 2023): 491,438
- Customers present in all three periods: 3,965
- Cross-type shopping behavior (POS + Jumbo.ae) is decreasing over the three-year period
- Cross-type customers from 2023 have a 11.39% retention rate in 2025
- First 4 months 2025 has stronger customer overlap with 2024 (16.69%) than with 2023 (13.76%)
- Of all 2025 customers, 6.20% were loyal customers present in both 2023 and 2024
- 19.06% of 2025 customers were present in either 2023 or 2024, but not both years
- Customer base declined by 23.28% from 2023 to 2024
- Based on first 4 months data, 2025 customer base is projected to decline by approximately 2.29% compared to 2024
