# Excel File Analysis - Price Comparison Report

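This notebook reads the `Price comparision_DD_MM.xlsx` workbooks in the `ComparisionData` folder, analyzes the "detailed report" sheet of each one, and writes a combined price comparison report and/or a report for the latest date only.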

In [4]:
import json
import pandas as pd
import numpy as np
import re
import os
from datetime import datetime
import glob

# Columns that identify one product row in the pivoted report.
_ID_COLUMNS = ['SKU', 'TITLE', 'CATEGORY']

# Preferred left-to-right order of the seller columns; 'Jumbo' is the
# reference seller every other seller is compared against.
_PRIORITY_ORDER = ['Jumbo', 'Sharaf DG', 'Emax', 'Noon', 'Amazon', 'Carrefour', 'Dyson']

# Lower-cased priority name -> spelling used for the column header.
_CANONICAL_NAMES = {name.lower(): name for name in _PRIORITY_ORDER}

# Two prices closer than this are treated as equal in the comparisons.
_PRICE_TOLERANCE = 1


def _date_from_filename(filename, year):
    """Build the report date string from a 'Price comparision_DD_MM.xlsx' name.

    Returns 'YYYY-MM-DD' when the name carries a valid day/month, the raw
    '{year}-{month}-{day}' text when the parts do not form a real date, and
    '{year}-01-01' when the name does not match the pattern at all.
    """
    date_match = re.search(r'Price comparision_(\d{2})_(\d{2})\.xlsx', filename)
    if not date_match:
        print("Could not extract date from filename")
        return f"{year}-01-01"  # Default date

    day, month = date_match.group(1), date_match.group(2)
    date_str = f"{year}-{month}-{day}"
    print(f"Extracted date from filename: {date_str}")

    # Round-trip through datetime purely to validate the extracted parts.
    try:
        return datetime.strptime(date_str, "%Y-%m-%d").strftime("%Y-%m-%d")
    except ValueError:
        print("Invalid date extracted from filename")
        return date_str


def _is_self_listing(value):
    """Return True for 'name-name' strings whose two halves match.

    The halves are compared ignoring case and surrounding spaces, and each
    must contain at least one alphabetic character.
    """
    if not isinstance(value, str):  # also rejects NaN / None cells
        return False
    parts = value.split('-')
    if len(parts) != 2:
        return False
    left, right = parts[0].strip(), parts[1].strip()
    if not (any(c.isalpha() for c in left) and any(c.isalpha() for c in right)):
        return False
    return left.lower() == right.lower()


def _company_label(org_seller):
    """Turn an ORG_SELLER value into the column header for that seller.

    The text before the hyphen is trimmed, and a name that matches a priority
    seller in any letter case is mapped to the priority spelling, so that
    'Jumbo - jumbo' and 'JUMBO-jumbo' both end up in the 'Jumbo' column.
    """
    label = org_seller.split('-')[0].strip()
    return _CANONICAL_NAMES.get(label.lower(), label)


def _compare_with_jumbo(row, competitors, jumbo_is_higher):
    """List the competitors Jumbo is priced above (or below) in one row.

    A competitor is listed only when the price gap exceeds _PRICE_TOLERANCE.
    Returns a ', '-joined string, or '' when the row has no Jumbo price or
    nothing qualifies.
    """
    jumbo_price = row.get('Jumbo', None)
    if jumbo_price is None or pd.isna(jumbo_price):
        return ""

    matched = []
    for company in competitors:
        if company not in row or pd.isna(row[company]):
            continue
        company_price = row[company]
        if jumbo_is_higher:
            qualifies = jumbo_price > (company_price + _PRICE_TOLERANCE)
        else:
            qualifies = jumbo_price < (company_price - _PRICE_TOLERANCE)
        if qualifies:
            matched.append(company)
    return ", ".join(matched)


def process_single_file(file_path, year):
    """
    Process a single Price comparison file and return pivot table

    Parameters:
        file_path: path of a workbook named 'Price comparision_DD_MM.xlsx'.
        year: year text combined with the day/month taken from the file name.

    Returns:
        A DataFrame with the columns Date, SKU, TITLE, CATEGORY, one
        'OFFER PRICE' column per seller (priority sellers first), and the
        text columns 'Jumbo Higher than' and 'Jumbo Lower than'.
        None when the workbook has no 'detailed report' sheet, when no
        ORG_SELLER value has the 'name-name' form, or when any error occurs
        while reading or transforming the workbook (the error is printed).
    """
    print(f"\nProcessing file: {file_path}")

    filename = os.path.basename(file_path)
    formatted_date = _date_from_filename(filename, year)

    try:
        # The context manager closes the workbook handle even on early return,
        # and parsing from the open handle avoids reading the file twice.
        with pd.ExcelFile(file_path) as excel_file:
            target_sheet = next(
                (sheet for sheet in excel_file.sheet_names if sheet.lower() == "detailed report"),
                None,
            )
            if target_sheet is None:
                print(f"No 'detailed report' sheet found in {filename}")
                return None
            df = excel_file.parse(target_sheet)

        # Keep only rows whose ORG_SELLER reads 'name-name'; astype(bool) keeps
        # the mask boolean even when the sheet has no rows.
        filtered_df = df[df['ORG_SELLER'].apply(_is_self_listing).astype(bool)]

        if len(filtered_df) == 0:
            print(f"No matching data found in {filename}")
            return None

        unique_sellers = sorted(filtered_df['ORG_SELLER'].unique())
        print(f"Found sellers: {unique_sellers}")

        pivot_base = filtered_df[_ID_COLUMNS + ['ORG_SELLER', 'OFFER PRICE']].copy()
        pivot_base['Company'] = pivot_base['ORG_SELLER'].map(_company_label)

        # One row per product, one price column per seller; when a seller
        # appears more than once for a product the first price is kept.
        pivot_table = pivot_base.pivot_table(
            index=list(_ID_COLUMNS),
            columns='Company',
            values='OFFER PRICE',
            aggfunc='first'
        ).reset_index()

        # Add Date column as the first column
        pivot_table.insert(0, 'Date', formatted_date)

        # Priority sellers first (in priority order), then any other seller.
        leading_columns = ['Date'] + _ID_COLUMNS
        priority_present = [c for c in _PRIORITY_ORDER if c in pivot_table.columns]
        other_companies = [
            c for c in pivot_table.columns
            if c not in _PRIORITY_ORDER and c not in leading_columns
        ]
        pivot_table = pivot_table[leading_columns + priority_present + other_companies].copy()

        # Every seller except Jumbo itself takes part in the comparison.
        competitors = _PRIORITY_ORDER[1:] + other_companies
        pivot_table['Jumbo Higher than'] = pivot_table.apply(
            lambda row: _compare_with_jumbo(row, competitors, True), axis=1
        )
        pivot_table['Jumbo Lower than'] = pivot_table.apply(
            lambda row: _compare_with_jumbo(row, competitors, False), axis=1
        )

        print(f"Pivot table created with shape: {pivot_table.shape}")
        return pivot_table

    except Exception as e:
        # Best effort per file: report the problem and let the caller continue.
        print(f"Error processing {filename}: {str(e)}")
        return None

def process_folder_and_combine(folder_path, year, output_filename="Combined_Price_Analysis.xlsx", create_latest_file=True):
    """
    Process all Price comparison files in a folder and combine them into one Excel file
    Also creates a separate file with just the latest date's data

    Parameters:
        folder_path: folder searched for 'Price comparision_*.xlsx' workbooks;
            the output workbooks are written into the same folder.
        year: year text passed through to process_single_file.
        output_filename: file name of the combined workbook.
        create_latest_file: when True, also write
            'Latest_Date_Analysis_<date>.xlsx' holding only the rows of the
            greatest 'Date' value.

    Returns:
        The combined DataFrame sorted by Date and SKU, or None when the
        folder has no matching workbook or no workbook could be processed.
    """
    print(f"Processing folder: {folder_path}")

    # glob returns matches in arbitrary order; sorting makes the processing
    # order (and with it the column order after concat) reproducible.
    file_pattern = os.path.join(folder_path, "Price comparision_*.xlsx")
    files = sorted(glob.glob(file_pattern))

    if not files:
        print(f"No files found matching pattern in {folder_path}")
        return

    print(f"Found {len(files)} files to process:")
    for file in files:
        print(f"  - {os.path.basename(file)}")

    # Process each file; files that fail return None and are skipped.
    all_pivot_tables = []
    for file_path in files:
        pivot_table = process_single_file(file_path, year)
        if pivot_table is not None:
            all_pivot_tables.append(pivot_table)

    if not all_pivot_tables:
        print("No valid pivot tables were created")
        return

    print(f"\nCombining {len(all_pivot_tables)} pivot tables...")
    combined_df = pd.concat(all_pivot_tables, ignore_index=True)

    # concat appends columns in order of first appearance, so a seller that
    # first shows up in a later file would land after the comparison columns.
    # Move the comparison columns back to the end of the table.
    summary_columns = [c for c in ('Jumbo Higher than', 'Jumbo Lower than') if c in combined_df.columns]
    leading_columns = [c for c in combined_df.columns if c not in summary_columns]
    combined_df = combined_df[leading_columns + summary_columns]

    # Sort by Date and SKU
    combined_df = combined_df.sort_values(['Date', 'SKU'])

    print(f"Combined dataframe shape: {combined_df.shape}")

    output_path = os.path.join(folder_path, output_filename)
    combined_df.to_excel(output_path, index=False)

    print(f"\nCombined data saved to: {output_path}")
    print(f"Total rows: {len(combined_df)}")
    print(f"Date range: {combined_df['Date'].min()} to {combined_df['Date'].max()}")

    if create_latest_file:
        latest_date = combined_df['Date'].max()
        latest_data = combined_df[combined_df['Date'] == latest_date].copy()

        if not latest_data.empty:
            latest_filename = f"Latest_Date_Analysis_{latest_date}.xlsx"
            latest_output_path = os.path.join(folder_path, latest_filename)

            # Sort latest data by SKU for better readability
            latest_data = latest_data.sort_values(['SKU', 'TITLE'])
            latest_data.to_excel(latest_output_path, index=False)

            print(f"\nLatest date data saved to: {latest_output_path}")
            print(f"Latest date: {latest_date}")
            print(f"Latest date rows: {len(latest_data)}")
            print(f"Unique SKUs in latest date: {latest_data['SKU'].nunique()}")

            # A seller column exists for every seller seen in ANY file, so
            # report only the sellers that have at least one price on the
            # latest date instead of checking against a fixed name list.
            non_company_columns = {'Date', 'SKU', 'TITLE', 'CATEGORY', 'Jumbo Higher than', 'Jumbo Lower than'}
            companies_in_latest = [
                col for col in latest_data.columns
                if col not in non_company_columns and latest_data[col].notna().any()
            ]
            print(f"Companies with data in latest date: {companies_in_latest}")

        else:
            print("\nNo latest date data found")

    return combined_df

def process_latest_file_only(folder_path, year):
    """
    Process only the newest Price comparison workbook of a folder.

    The newest workbook is the one whose 'Price comparision_DD_MM.xlsx' name,
    combined with `year`, gives the greatest calendar date; names that do not
    match the pattern or do not form a real date are ignored. The resulting
    pivot table is sorted by SKU and TITLE and written to
    'Latest_Date_Analysis_<YYYY-MM-DD>.xlsx' inside `folder_path`.

    Returns the sorted pivot table, or None when the folder has no matching
    workbook, no workbook name yields a valid date, or processing fails.
    """
    print(f"Finding latest file in: {folder_path}")

    candidates = glob.glob(os.path.join(folder_path, "Price comparision_*.xlsx"))
    if not candidates:
        print(f"No files found matching pattern in {folder_path}")
        return None

    # Collect (date, path) pairs for every workbook with a usable date.
    name_pattern = re.compile(r'Price comparision_(\d{2})_(\d{2})\.xlsx')
    dated_files = []
    for candidate in candidates:
        found = name_pattern.search(os.path.basename(candidate))
        if found is None:
            continue
        day, month = found.groups()
        try:
            parsed = datetime.strptime(f"{year}-{month}-{day}", "%Y-%m-%d")
        except ValueError:
            continue
        dated_files.append((parsed, candidate))

    if not dated_files:
        print("No valid date files found")
        return None

    # max() keeps the first pair among equal dates, like a strict '>' scan.
    latest_date, latest_file = max(dated_files, key=lambda pair: pair[0])
    latest_label = latest_date.strftime('%Y-%m-%d')

    print(f"Processing latest file: {os.path.basename(latest_file)}")
    print(f"Latest date: {latest_label}")

    report = process_single_file(latest_file, year)
    if report is None:
        print("Failed to process latest file")
        return None

    output_path = os.path.join(folder_path, f"Latest_Date_Analysis_{latest_label}.xlsx")

    # Sort by SKU for better readability
    report = report.sort_values(['SKU', 'TITLE'])
    report.to_excel(output_path, index=False)

    print(f"\nLatest date analysis saved to: {output_path}")
    print(f"Rows processed: {len(report)}")
    print(f"Unique SKUs: {report['SKU'].nunique()}")

    return report

# Main execution with 3 processing options
def _print_banner(title):
    """Print a section title framed by two 60-character '=' rules."""
    print("\n" + "="*60)
    print(title)
    print("="*60)


# Read the data folder from config.json; on any problem report it and fall
# back to the default folder name so the notebook can still run.
try:
    with open('config.json', 'r', encoding='utf-8') as f:
        config_data = json.load(f)
    folder_path = config_data['paths']['comparision_data_folder']
except Exception as e:
    print(f"❌ Error loading config.json: {e}")
    print("Using default DSR folder path...")
    folder_path = "ComparisionData"

# The year is combined with the day/month from each file name, so it must be
# exactly four digits; stray spaces or other text would produce invalid dates.
year = input("Please enter the year (e.g., 2024): ").strip()
while not re.fullmatch(r"[0-9]{4}", year):
    year = input("Invalid year. Please enter a 4-digit year (e.g., 2024): ").strip()

print("")

choice = input("\nChoose processing option:\n1. Combined file only (all historical data)\n2. Combined file + Latest date file (all data + separate latest)\n3. Latest date file only (fastest - processes only newest file)\nEnter your choice (1, 2, or 3): ").strip()

if choice in ("1", "2"):
    # Options 1 and 2 differ only in whether the latest-date file is written.
    with_latest = choice == "2"
    _print_banner("PROCESSING: COMBINED + LATEST FILES" if with_latest else "PROCESSING: COMBINED FILE ONLY")
    combined_result = process_folder_and_combine(folder_path, year, create_latest_file=with_latest)

    if combined_result is not None:
        _print_banner("PROCESSING COMPLETE")
        print("Files created:" if with_latest else "File created:")
        print("✓ Combined_Price_Analysis.xlsx - All historical data")
        if with_latest:
            print("✓ Latest_Date_Analysis_[DATE].xlsx - Latest date data only")
        print(f"\nTotal rows processed: {len(combined_result)}")
        print(f"Date range: {combined_result['Date'].min()} to {combined_result['Date'].max()}")
    else:
        print("Failed to process files")

elif choice == "3":
    _print_banner("PROCESSING: LATEST DATE FILE ONLY")
    latest_result = process_latest_file_only(folder_path, year)

    if latest_result is not None:
        _print_banner("PROCESSING COMPLETE")
        print("File created:")
        print("✓ Latest_Date_Analysis_[DATE].xlsx - Latest date data only")
        print(f"\nRows processed: {len(latest_result)}")
        print("\nFirst 5 rows of latest data:")
        print(latest_result.head())
    else:
        print("Failed to process latest file")

else:
    print("Invalid choice. Please run again and select 1, 2, or 3.")



PROCESSING: COMBINED FILE ONLY
Processing folder: ComparisionData
Found 7 files to process:
  - Price comparision_03_06.xlsx
  - Price comparision_04_06.xlsx
  - Price comparision_06_06.xlsx
  - Price comparision_10_06.xlsx
  - Price comparision_11_06.xlsx
  - Price comparision_12_06.xlsx
  - Price comparision_13_06.xlsx

Processing file: ComparisionData\Price comparision_03_06.xlsx
Extracted date from filename: 2025-06-03

PROCESSING: COMBINED FILE ONLY
Processing folder: ComparisionData
Found 7 files to process:
  - Price comparision_03_06.xlsx
  - Price comparision_04_06.xlsx
  - Price comparision_06_06.xlsx
  - Price comparision_10_06.xlsx
  - Price comparision_11_06.xlsx
  - Price comparision_12_06.xlsx
  - Price comparision_13_06.xlsx

Processing file: ComparisionData\Price comparision_03_06.xlsx
Extracted date from filename: 2025-06-03
Found sellers: ['Amazon-Amazon', 'Carrefour-Carrefour', 'Dyson-dyson', 'Emax-emax', 'Jumbo-jumbo', 'Noon-noon', 'Sharaf DG-Sharaf DG', 'Sony-So