In [1]:
import pandas as pd
import numpy as np
import os
from sqlalchemy import create_engine
import time
import pdb
import bdb  # Added to handle BdbQuit
from collections import Counter

In [2]:
if 1: # Create SQLAlchemy engine
    connection_string = (
        f"mssql+pyodbc://{"LouisHaehle"}:{"QuiltStanceCurler51"}"
        f"@10.0.30.16:1433/RTID_SourceData"
        f"?driver=ODBC+Driver+17+for+SQL+Server"
        f"&TrustServerCertificate=yes"
        f"&Connection+Timeout=5"
        )
    engine = create_engine(connection_string)

In [3]:
# Query all rows from the US_HY_NVR_TEST_2023_2025_REDO table
live_query = """
    SELECT [REPORT_YEAR_WEEK]
      ,[VEH_AT_PHYS_LOC]
      ,[INV_LST_PRCE_AMT]
      ,[INV_MSRP_AMT]
      ,[VIN_SOLD]
      ,[INV_MIN_VIN_SOLD_DT]
      ,[DAYS_LSTD_BFR_SLD]
      ,[SRC_VEH_FIRST_SCRAPED_DT]
      ,[TRIM_DESCRIPTION]
      ,[NVI_OWNSHP_DT]
      ,[NVI_EFCTV_START_DT]
      ,[NVI_CENSUS_TRACT]
      ,[NVI_RPT_YYYYMM]
      ,[SLS_REPORT_YEAR_MONTH]
      ,[SALES_DT]
      ,[CATG_CD]
      ,[INV_CENSUS_TRACT]
      ,[INV_CONTROL_NBR]
      ,[INV_DEALER_NAME]
      ,[INV_TOWN_NAME]
      ,[INV_STATE_ABBRV]
      ,[SLS_CENSUS_TRACT]
      ,[SLS_CONTROL_NBR]
      ,[SLS_DEALER_NAME]
      ,[SLS_TOWN_NAME]
      ,[SLS_STATE_ABBRV]
      ,[NVI_CONTROL_NBR]
      ,[NVI_DEALER_NAME]
      ,[NVI_TOWN_NAME]
      ,[NVI_STATE_ABBRV]
      ,[MAKE_DESC]
      ,[MODEL_DESC]
      ,[SERIES_TEXT]
      ,[FUEL_DESC]
      ,[SEGMENT_DESC]
      ,[ADVNC_VEH_TYPE_DESC]
      ,[VEH_MDL_YR]
      ,[MODEL_KEY_SHORT]
      ,[SLS_VEHICLE_COUNT]
      ,[INV_COUNT]
    FROM [SPGM_Live].[SPGM_Weekly_INV_NVI_SLS_20251107]
"""

# Execute the query and fetch all rows
full_df_live = pd.read_sql(live_query, engine)

# Display the shape and first few rows of the DataFrame
print(f"Total rows fetched: {len(full_df_live):,}")
print(full_df_live.head())

Total rows fetched: 30,625,458
  REPORT_YEAR_WEEK VEH_AT_PHYS_LOC  INV_LST_PRCE_AMT  INV_MSRP_AMT  VIN_SOLD  \
0          2025-42               Y           38778.0       39528.0       0.0   
1          2025-36               Y           58560.0       67810.0       0.0   
2          2025-38               Y           64569.0       62774.0       2.0   
3          2025-41               Y           65633.0       69580.0       0.0   
4          2025-37               N           33161.0       34130.0       0.0   

  INV_MIN_VIN_SOLD_DT  DAYS_LSTD_BFR_SLD SRC_VEH_FIRST_SCRAPED_DT  \
0          1899-12-30             -999.0               2025-08-05   
1          1899-12-30             -999.0               2024-11-14   
2          2025-09-30               62.0               2025-07-30   
3          1899-12-30             -999.0               2025-09-28   
4          1899-12-30             -999.0               2025-07-12   

  TRIM_DESCRIPTION NVI_OWNSHP_DT  ...   MAKE_DESC MODEL_DESC SERIES_TEXT 

In [4]:
def analyze_duplicates_with_variations(df, duplicate_check_columns, variation_columns, 
                                       sample_groups=10, export_to_excel=False, 
                                       output_file='duplicate_analysis.xlsx'):
    """
    Find duplicates based on specific columns and analyze what varies in other columns.
    
    Parameters:
    df: DataFrame to analyze
    duplicate_check_columns: List of column names to check for duplicates
    variation_columns: List of column names to check for variations within duplicate groups
    sample_groups: Number of duplicate groups to display (default: 10)
    export_to_excel: Whether to export full results to Excel (default: False)
    output_file: Excel filename if export_to_excel=True
    
    Returns:
    DataFrame with duplicate groups and their variations
    """
    print(f"Analyzing duplicates in {len(df):,} rows...")
    print(f"Checking duplicates on: {duplicate_check_columns}")
    print(f"Analyzing variations in: {variation_columns}")
    print("=" * 80)
    
    # Find duplicates based on specified columns
    is_duplicate = df.duplicated(subset=duplicate_check_columns, keep=False)
    num_duplicates = is_duplicate.sum()
    num_unique = (~df.duplicated(subset=duplicate_check_columns)).sum()
    
    # Calculate the number of duplicate groups (unique combinations that appear more than once)
    num_duplicate_groups = len(df[is_duplicate].drop_duplicates(subset=duplicate_check_columns))
    
    print(f"\nDuplicate Summary:")
    print(f"  Total rows: {len(df):,}")
    print(f"  Unique combinations (total): {num_unique:,}")
    print(f"  Duplicate rows: {num_duplicates:,} ({100*num_duplicates/len(df):.2f}%)")
    print(f"  Number of duplicate groups: {num_duplicate_groups:,}")
    print(f"  Average rows per duplicate group: {num_duplicates/num_duplicate_groups:.1f}")
    
    if num_duplicates == 0:
        print("\nNo duplicates found!")
        return None
    
    # Get all duplicate rows
    duplicate_rows = df[is_duplicate].copy()
    
    # Analyze variations within each duplicate group
    print(f"\n{'=' * 80}")
    print(f"Analyzing variations in duplicate groups...")
    print(f"{'=' * 80}\n")
    
    # Group by duplicate check columns and analyze variations
    all_columns = duplicate_check_columns + variation_columns
    grouped = duplicate_rows[all_columns].groupby(duplicate_check_columns)
    
    results = []
    for name, group in grouped:
        group_size = len(group)
        
        # For each variation column, count unique values
        variations = {}
        for col in variation_columns:
            unique_vals = group[col].dropna().unique()
            variations[f'{col}_unique_count'] = len(unique_vals)
            variations[f'{col}_values'] = ', '.join([str(v) for v in unique_vals[:5]])  # First 5 values
            if len(unique_vals) > 5:
                variations[f'{col}_values'] += f' ... ({len(unique_vals)} total)'
        
        # Create result row
        result_row = {
            'duplicate_group_size': group_size,
        }
        
        # Add duplicate check column values
        if isinstance(name, tuple):
            for i, col in enumerate(duplicate_check_columns):
                result_row[col] = name[i]
        else:
            result_row[duplicate_check_columns[0]] = name
        
        # Add variation statistics
        result_row.update(variations)
        results.append(result_row)
    
    # Create results DataFrame
    results_df = pd.DataFrame(results)
    results_df = results_df.sort_values('duplicate_group_size', ascending=False)
    
    # Display sample groups
    print(f"Top {sample_groups} duplicate groups by size:")
    print("-" * 80)
    
    for idx, row in results_df.head(sample_groups).iterrows():
        print(f"\nGroup size: {row['duplicate_group_size']:,} rows")
        
        # Print duplicate check column values
        for col in duplicate_check_columns:
            print(f"  {col}: {row[col]}")
        
        # Print variations
        print(f"  Variations:")
        for col in variation_columns:
            unique_count = row[f'{col}_unique_count']
            values = row[f'{col}_values']
            print(f"    {col}: {unique_count} unique values - [{values}]")
        print("-" * 80)
    
    # Export results if requested
    if export_to_excel:
        num_results = len(results_df)
        max_excel_rows = 1048576
        
        if num_results > max_excel_rows:
            # Too large for Excel, use CSV instead
            csv_file = output_file.replace('.xlsx', '.csv')
            results_df.to_csv(csv_file, index=False)
            print(f"\n⚠️  Results too large for Excel ({num_results:,} rows > {max_excel_rows:,} limit)")
            print(f"✓ Full results exported to CSV: {csv_file}")
            
            # Also export top 100k to Excel for easier viewing
            excel_sample_file = output_file.replace('.xlsx', '_top100k.xlsx')
            results_df.head(100000).to_excel(excel_sample_file, index=False, sheet_name='Top_100k_Groups')
            print(f"✓ Top 100,000 groups exported to Excel: {excel_sample_file}")
        else:
            results_df.to_excel(output_file, index=False, sheet_name='Duplicate_Analysis')
            print(f"\n✓ Full results exported to: {output_file}")
    
    return results_df

In [5]:
duplicate_check_cols = ['INV_MIN_VIN_SOLD_DT', 'SRC_VEH_FIRST_SCRAPED_DT', 'TRIM_DESCRIPTION', 'INV_CENSUS_TRACT', 'INV_CONTROL_NBR', 'INV_DEALER_NAME', 'INV_TOWN_NAME', 'INV_STATE_ABBRV', 'MAKE_DESC', 'MODEL_DESC', 'SERIES_TEXT', 'FUEL_DESC', 'SEGMENT_DESC', 'ADVNC_VEH_TYPE_DESC', 'MODEL_KEY_SHORT', 'INV_COUNT']

# Removed 'INV_MIN_VIN_SOLD_DT' from variation_cols since it's already in duplicate_check_cols
variation_cols = ['VEH_AT_PHYS_LOC', 'VIN_SOLD', 'DAYS_LSTD_BFR_SLD', 'INV_LST_PRCE_AMT', 'INV_MSRP_AMT', 'SALES_DT', 'SLS_CONTROL_NBR', 'NVI_CONTROL_NBR']

In [6]:
# Run duplicate analysis


results = analyze_duplicates_with_variations(
    df=full_df_live,
    duplicate_check_columns=duplicate_check_cols,
    variation_columns=variation_cols,
    sample_groups=20,  # Show top 20 duplicate groups
    export_to_excel=True,  # Export full results to Excel
    output_file='duplicate_analysis_results.xlsx'
)

Analyzing duplicates in 30,625,458 rows...
Checking duplicates on: ['INV_MIN_VIN_SOLD_DT', 'SRC_VEH_FIRST_SCRAPED_DT', 'TRIM_DESCRIPTION', 'INV_CENSUS_TRACT', 'INV_CONTROL_NBR', 'INV_DEALER_NAME', 'INV_TOWN_NAME', 'INV_STATE_ABBRV', 'MAKE_DESC', 'MODEL_DESC', 'SERIES_TEXT', 'FUEL_DESC', 'SEGMENT_DESC', 'ADVNC_VEH_TYPE_DESC', 'MODEL_KEY_SHORT', 'INV_COUNT']
Analyzing variations in: ['VEH_AT_PHYS_LOC', 'VIN_SOLD', 'DAYS_LSTD_BFR_SLD', 'INV_LST_PRCE_AMT', 'INV_MSRP_AMT', 'SALES_DT', 'SLS_CONTROL_NBR', 'NVI_CONTROL_NBR']

Duplicate Summary:
  Total rows: 30,625,458
  Unique combinations (total): 6,287,548
  Duplicate rows: 29,657,088 (96.84%)
  Number of duplicate groups: 5,319,178
  Average rows per duplicate group: 5.6

Duplicate Summary:
  Total rows: 30,625,458
  Unique combinations (total): 6,287,548
  Duplicate rows: 29,657,088 (96.84%)
  Number of duplicate groups: 5,319,178
  Average rows per duplicate group: 5.6

Analyzing variations in duplicate groups...


Analyzing variations i

In [None]:
# Find duplicate groups where INV_LST_PRCE_AMT varies
# Filter for groups where INV_LST_PRCE_AMT_unique_count > 1
price_varying_groups = results[results['INV_LST_PRCE_AMT_unique_count'] > 1].copy()

print(f"Found {len(price_varying_groups):,} duplicate groups where INV_LST_PRCE_AMT varies")
print(f"That's {100*len(price_varying_groups)/len(results):.2f}% of all duplicate groups")
print("\n" + "="*80)
print("Top 10 groups with price variations (sorted by group size):")
print("="*80)

# Show top 10 by group size
for idx, row in price_varying_groups.head(10).iterrows():
    print(f"\nGroup size: {row['duplicate_group_size']:,} rows")
    print(f"  INV_LST_PRCE_AMT: {row['INV_LST_PRCE_AMT_unique_count']} unique values - {row['INV_LST_PRCE_AMT_values']}")
    print(f"  INV_DEALER_NAME: {row['INV_DEALER_NAME']}")
    print(f"  MAKE_DESC: {row['MAKE_DESC']}")
    print(f"  MODEL_DESC: {row['MODEL_DESC']}")
    print("-"*80)

In [None]:
# Get the actual rows from the first duplicate group with price variations
# This will show you all the rows that belong to that group

if len(price_varying_groups) > 0:
    # Get the first group's identifying values
    first_group = price_varying_groups.iloc[0]
    
    print(f"Fetching all rows for the largest duplicate group with price variations...")
    print(f"Group size: {first_group['duplicate_group_size']:,} rows")
    print("="*80)
    
    # Build a filter to get all rows from this duplicate group
    filter_mask = pd.Series(True, index=full_df_live.index)
    for col in duplicate_check_cols:
        filter_mask &= (full_df_live[col] == first_group[col])
    
    # Get the actual rows
    example_group = full_df_live[filter_mask].copy()
    
    # Display key columns including the varying price
    display_cols = ['REPORT_YEAR_WEEK', 'VEH_AT_PHYS_LOC', 'INV_LST_PRCE_AMT', 'INV_MSRP_AMT', 
                    'VIN_SOLD', 'DAYS_LSTD_BFR_SLD', 'INV_DEALER_NAME', 'MAKE_DESC', 'MODEL_DESC']
    
    print(f"\nShowing key columns for this duplicate group:")
    print(example_group[display_cols].to_string())
    
    print(f"\n\nINV_LST_PRCE_AMT value counts in this group:")
    print(example_group['INV_LST_PRCE_AMT'].value_counts())
else:
    print("No duplicate groups with price variations found!")

In [9]:
def get_duplicate_insights(df, duplicate_check_columns, variation_columns):
    """
    Get comprehensive high-level insights about duplicates in the dataset.
    
    Parameters:
    df: DataFrame to analyze
    duplicate_check_columns: List of column names to check for duplicates
    variation_columns: List of column names to check for variations
    
    Returns:
    Dictionary with various insights
    """
    print("=" * 80)
    print("HIGH-LEVEL DUPLICATE INSIGHTS")
    print("=" * 80)
    
    # Basic duplicate statistics
    is_duplicate = df.duplicated(subset=duplicate_check_columns, keep=False)
    num_duplicates = is_duplicate.sum()
    num_unique = (~df.duplicated(subset=duplicate_check_columns)).sum()
    duplicate_rows = df[is_duplicate].copy()
    
    # Count duplicate groups
    duplicate_groups = duplicate_rows.groupby(duplicate_check_columns).size().reset_index(name='group_size')
    num_duplicate_groups = len(duplicate_groups)
    
    print(f"\n1. OVERALL STATISTICS:")
    print(f"   Total rows: {len(df):,}")
    print(f"   Unique combinations: {num_unique:,} ({100*num_unique/len(df):.2f}%)")
    print(f"   Duplicate rows: {num_duplicates:,} ({100*num_duplicates/len(df):.2f}%)")
    print(f"   Number of duplicate groups: {num_duplicate_groups:,}")
    if num_duplicate_groups > 0:
        print(f"   Average rows per duplicate group: {num_duplicates/num_duplicate_groups:.2f}")
    
    # Group size distribution
    print(f"\n2. DUPLICATE GROUP SIZE DISTRIBUTION:")
    size_distribution = duplicate_groups['group_size'].value_counts().sort_index()
    print(f"   Group Size  |  Count  |  % of Groups")
    print(f"   " + "-" * 40)
    for size, count in size_distribution.items():
        pct = 100 * count / num_duplicate_groups
        print(f"   {size:>5} rows  |  {count:>6,}  |  {pct:>5.2f}%")
    
    # Largest groups
    print(f"\n3. TOP 10 LARGEST DUPLICATE GROUPS:")
    top_groups = duplicate_groups.nlargest(10, 'group_size')
    for idx, row in top_groups.iterrows():
        print(f"   Group with {row['group_size']:,} rows")
    
    # Variation analysis summary
    print(f"\n4. VARIATION PATTERNS ACROSS ALL DUPLICATE GROUPS:")
    variation_stats = {}
    
    for col in variation_columns:
        # For each duplicate group, count unique values in variation column
        group_variations = duplicate_rows.groupby(duplicate_check_columns)[col].nunique()
        
        # How many groups have multiple different values?
        groups_with_variation = (group_variations > 1).sum()
        pct_with_variation = 100 * groups_with_variation / num_duplicate_groups if num_duplicate_groups > 0 else 0
        
        variation_stats[col] = {
            'groups_with_variation': groups_with_variation,
            'pct_with_variation': pct_with_variation,
            'max_unique_values': group_variations.max() if len(group_variations) > 0 else 0
        }
        
        print(f"   {col}:")
        print(f"      {groups_with_variation:,} groups ({pct_with_variation:.1f}%) have variations")
        print(f"      Max unique values in a single group: {variation_stats[col]['max_unique_values']}")
    
    print("\n" + "=" * 80)
    
    return {
        'total_rows': len(df),
        'unique_combinations': num_unique,
        'duplicate_rows': num_duplicates,
        'duplicate_groups': num_duplicate_groups,
        'size_distribution': size_distribution,
        'variation_stats': variation_stats,
        'duplicate_groups_df': duplicate_groups
    }

In [8]:
def inspect_duplicate_group(df, duplicate_check_columns, variation_columns, 
                           group_index=0, group_criteria=None, display_all_cols=False):
    """
    Inspect a specific duplicate group to see all rows and their variations.
    
    Parameters:
    df: DataFrame to analyze
    duplicate_check_columns: List of column names that define duplicates
    variation_columns: List of column names to show variations
    group_index: Index of group to inspect (0 = largest group, 1 = 2nd largest, etc.)
    group_criteria: Dict to filter specific group (e.g., {'VIN_SOLD': '12345', 'MAKE_DESC': 'FORD'})
    display_all_cols: If True, show all columns; if False, show only check + variation columns
    
    Returns:
    DataFrame with all rows from the selected duplicate group
    """
    # Find all duplicate rows
    is_duplicate = df.duplicated(subset=duplicate_check_columns, keep=False)
    duplicate_rows = df[is_duplicate].copy()
    
    if len(duplicate_rows) == 0:
        print("No duplicates found!")
        return None
    
    # Select group based on criteria or index
    if group_criteria:
        # Filter by specific criteria
        mask = pd.Series(True, index=duplicate_rows.index)
        for col, val in group_criteria.items():
            mask &= (duplicate_rows[col] == val)
        
        group_rows = duplicate_rows[mask]
        
        if len(group_rows) == 0:
            print(f"No duplicate group found matching criteria: {group_criteria}")
            return None
        
        print(f"Found duplicate group matching criteria: {group_criteria}")
        
    else:
        # Get group by index (sorted by size)
        grouped = duplicate_rows.groupby(duplicate_check_columns).size().reset_index(name='group_size')
        grouped = grouped.sort_values('group_size', ascending=False)
        
        if group_index >= len(grouped):
            print(f"Group index {group_index} out of range. Only {len(grouped)} duplicate groups exist.")
            return None
        
        # Get the group at specified index
        target_group = grouped.iloc[group_index]
        
        # Build filter for this group
        mask = pd.Series(True, index=duplicate_rows.index)
        for col in duplicate_check_columns:
            mask &= (duplicate_rows[col] == target_group[col])
        
        group_rows = duplicate_rows[mask]
        print(f"Inspecting duplicate group #{group_index} (largest = 0)")
        print(f"Group size: {len(group_rows):,} rows")
    
    # Display the duplicate check column values (what's the same)
    print("\n" + "=" * 80)
    print("DUPLICATE CHECK COLUMNS (these are IDENTICAL across all rows):")
    print("=" * 80)
    for col in duplicate_check_columns:
        value = group_rows[col].iloc[0]
        print(f"  {col}: {value}")
    
    # Display variation statistics
    print("\n" + "=" * 80)
    print("VARIATION COLUMNS (these DIFFER across rows):")
    print("=" * 80)
    for col in variation_columns:
        unique_vals = group_rows[col].dropna().unique()
        null_count = group_rows[col].isna().sum()
        
        print(f"\n  {col}:")
        print(f"    Unique values: {len(unique_vals)}")
        print(f"    Null values: {null_count}")
        
        if len(unique_vals) <= 10:
            # Show all values if there aren't too many
            value_counts = group_rows[col].value_counts(dropna=False)
            for val, count in value_counts.items():
                print(f"      {val}: {count} rows")
        else:
            # Show top 10 most common
            print(f"    Top 10 most common values:")
            value_counts = group_rows[col].value_counts(dropna=False).head(10)
            for val, count in value_counts.items():
                print(f"      {val}: {count} rows")
            print(f"    ... and {len(unique_vals) - 10} more unique values")
    
    # Return the actual rows
    print("\n" + "=" * 80)
    print("ACTUAL ROWS IN THIS DUPLICATE GROUP:")
    print("=" * 80)
    
    if display_all_cols:
        display_df = group_rows
    else:
        # Show only relevant columns
        cols_to_show = duplicate_check_columns + variation_columns
        # Only include columns that actually exist in the dataframe
        cols_to_show = [col for col in cols_to_show if col in group_rows.columns]
        display_df = group_rows[cols_to_show]
    
    print(f"\nShowing {len(display_df)} rows:")
    print(f"(Use display_all_cols=True to see all columns)")
    
    return display_df

In [10]:
# Step 1: Get overall insights about duplicates
insights = get_duplicate_insights(full_df_live, duplicate_check_cols, variation_cols)

# The insights dictionary contains useful information:
# - insights['stats']: Overall statistics
# - insights['group_size_distribution']: How many groups have 2 rows, 3 rows, etc.
# - insights['top_groups']: Top 10 largest duplicate groups
# - insights['duplicate_groups_df']: Full DataFrame for custom analysis

HIGH-LEVEL DUPLICATE INSIGHTS

1. OVERALL STATISTICS:
   Total rows: 30,625,458
   Unique combinations: 6,287,548 (20.53%)
   Duplicate rows: 29,657,088 (96.84%)
   Number of duplicate groups: 5,319,178
   Average rows per duplicate group: 5.58

2. DUPLICATE GROUP SIZE DISTRIBUTION:
   Group Size  |  Count  |  % of Groups
   ----------------------------------------
       2 rows  |  815,323  |  15.33%
       3 rows  |  721,400  |  13.56%
       4 rows  |  665,614  |  12.51%
       5 rows  |  552,308  |  10.38%
       6 rows  |  474,764  |   8.93%
       7 rows  |  1,686,936  |  31.71%
       8 rows  |  53,394  |   1.00%
       9 rows  |  22,961  |   0.43%
      10 rows  |  43,837  |   0.82%
      11 rows  |  12,610  |   0.24%
      12 rows  |  43,389  |   0.82%
      13 rows  |  10,826  |   0.20%
      14 rows  |  127,423  |   2.40%
      15 rows  |   9,844  |   0.19%
      16 rows  |   5,824  |   0.11%
      17 rows  |   3,663  |   0.07%
      18 rows  |   7,721  |   0.15%
      19 ro

In [12]:
# Export duplicate groups to Excel with formatting for readability
from openpyxl import load_workbook
from openpyxl.styles import Font, Alignment, PatternFill
from openpyxl.utils import get_column_letter

# Get the duplicate groups DataFrame
all_duplicate_groups = insights['duplicate_groups_df']

# Limit to top groups (Excel can't handle all 4.4M rows)
# Export top 100k largest groups - these are the most important ones
num_groups_to_export = min(100000, len(all_duplicate_groups))
top_duplicate_groups = all_duplicate_groups.head(num_groups_to_export)

print(f"Exporting top {num_groups_to_export:,} groups out of {len(all_duplicate_groups):,} total")
print(f"These groups contain {top_duplicate_groups['group_size'].sum():,} rows")

# Export to Excel
output_file = 'duplicate_groups_summary.xlsx'
top_duplicate_groups.to_excel(output_file, index=False, sheet_name='Duplicate_Groups')

# Load the workbook for formatting
wb = load_workbook(output_file)
ws = wb['Duplicate_Groups']

# Define styles
header_font = Font(bold=True, size=12, color='FFFFFF')
header_fill = PatternFill(start_color='366092', end_color='366092', fill_type='solid')
header_alignment = Alignment(horizontal='center', vertical='center', wrap_text=True)

cell_alignment = Alignment(horizontal='left', vertical='top', wrap_text=True)
number_alignment = Alignment(horizontal='right', vertical='center')

# Format header row
for col_num, col in enumerate(top_duplicate_groups.columns, 1):
    cell = ws.cell(row=1, column=col_num)
    cell.font = header_font
    cell.fill = header_fill
    cell.alignment = header_alignment

# Set column widths and apply formatting
for col_num, col in enumerate(top_duplicate_groups.columns, 1):
    col_letter = get_column_letter(col_num)
    
    # Set column width based on column type
    if col == 'group_size':
        ws.column_dimensions[col_letter].width = 15
        # Format group_size column
        for row_num in range(2, len(top_duplicate_groups) + 2):
            ws.cell(row=row_num, column=col_num).alignment = number_alignment
    elif col in ['TRIM_DESCRIPTION', 'INV_DEALER_NAME', 'MODEL_DESC']:
        ws.column_dimensions[col_letter].width = 35
        # Apply text alignment
        for row_num in range(2, len(top_duplicate_groups) + 2):
            ws.cell(row=row_num, column=col_num).alignment = cell_alignment
    elif col in ['MAKE_DESC', 'SERIES_TEXT', 'FUEL_DESC', 'SEGMENT_DESC', 
                 'ADVNC_VEH_TYPE_DESC', 'MODEL_KEY_SHORT']:
        ws.column_dimensions[col_letter].width = 25
        for row_num in range(2, len(top_duplicate_groups) + 2):
            ws.cell(row=row_num, column=col_num).alignment = cell_alignment
    elif col in ['INV_TOWN_NAME', 'INV_STATE_ABBRV']:
        ws.column_dimensions[col_letter].width = 20
        for row_num in range(2, len(top_duplicate_groups) + 2):
            ws.cell(row=row_num, column=col_num).alignment = cell_alignment
    elif col in ['INV_CONTROL_NBR', 'INV_CENSUS_TRACT']:
        ws.column_dimensions[col_letter].width = 18
        for row_num in range(2, len(top_duplicate_groups) + 2):
            ws.cell(row=row_num, column=col_num).alignment = cell_alignment
    else:
        ws.column_dimensions[col_letter].width = 22
        for row_num in range(2, len(top_duplicate_groups) + 2):
            ws.cell(row=row_num, column=col_num).alignment = cell_alignment

# Set row height for better readability
ws.row_dimensions[1].height = 30  # Header row
for row_num in range(2, min(1000, len(top_duplicate_groups) + 2)):  # First 1000 rows
    ws.row_dimensions[row_num].height = 20

# Add alternating row colors for easier reading
light_fill = PatternFill(start_color='F2F2F2', end_color='F2F2F2', fill_type='solid')
for row_num in range(2, len(top_duplicate_groups) + 2):
    if row_num % 2 == 0:
        for col_num in range(1, len(top_duplicate_groups.columns) + 1):
            ws.cell(row=row_num, column=col_num).fill = light_fill

# Freeze the header row
ws.freeze_panes = 'A2'

# Add auto-filter
ws.auto_filter.ref = ws.dimensions

# Save the formatted workbook
wb.save(output_file)

print(f"✓ Duplicate groups exported to: {output_file}")
print(f"  Groups exported: {len(top_duplicate_groups):,} (top {num_groups_to_export:,} largest)")
print(f"  Total groups in dataset: {len(all_duplicate_groups):,}")
print(f"  Rows covered by exported groups: {top_duplicate_groups['group_size'].sum():,}")
print(f"  Sorted by group_size (largest first)")
print(f"  Features: Formatted headers, wider columns, alternating row colors, frozen header, auto-filter")

Exporting top 100,000 groups out of 5,319,178 total
These groups contain 700,062 rows
✓ Duplicate groups exported to: duplicate_groups_summary.xlsx
  Groups exported: 100,000 (top 100,000 largest)
  Total groups in dataset: 5,319,178
  Rows covered by exported groups: 700,062
  Sorted by group_size (largest first)
  Features: Formatted headers, wider columns, alternating row colors, frozen header, auto-filter


## Inspect Specific Duplicate Groups

Now that you have the Excel summary, you can pick any group and see all the actual rows with all columns.

In [None]:
# Example 1: Look at the largest duplicate group (index 0) with ALL columns
largest_group_all_cols = inspect_duplicate_group(
    full_df_live,
    duplicate_check_cols,
    variation_cols,
    group_index=0,  # Largest group
    display_all_cols=True  # Show ALL 40 columns, not just duplicate_check + variation
)

# This will return a DataFrame with all rows from that group
largest_group_all_cols

In [None]:
# Example 2: Pick a specific group from your Excel file
# Say you found an interesting group in the Excel - row 10 for example
interesting_group = top_duplicate_groups.iloc[9]  # Row 10 (0-indexed)

# Build criteria from that row
criteria = {col: interesting_group[col] for col in duplicate_check_cols}

# Get all actual rows for this group with ALL columns
group_details = inspect_duplicate_group(
    full_df_live,
    duplicate_check_cols,
    variation_cols,
    group_criteria=criteria,
    display_all_cols=True  # Show all 40 columns
)

group_details

In [None]:
# Example 3: Export a specific group's rows to Excel for detailed examination
# Pick any group you're interested in
group_to_export = top_duplicate_groups.iloc[0]  # Largest group
criteria = {col: group_to_export[col] for col in duplicate_check_cols}

# Get all rows
group_rows = inspect_duplicate_group(
    full_df_live,
    duplicate_check_cols,
    variation_cols,
    group_criteria=criteria,
    display_all_cols=True
)

# Export to Excel
if group_rows is not None:
    excel_filename = f"duplicate_group_detail_{group_rows['MAKE_DESC'].iloc[0]}_{group_rows['MODEL_DESC'].iloc[0]}.xlsx"
    group_rows.to_excel(excel_filename, index=False)
    print(f"\n✓ Group details exported to: {excel_filename}")
    print(f"  Rows in group: {len(group_rows):,}")
    print(f"  All {len(group_rows.columns)} columns included")

## Create Excel File with Summary + All Details

This creates a multi-sheet Excel file where you can browse the summary and use filters to see detailed rows.

Preparing to export 29,933,432 duplicate rows...
Across 4,400,732 duplicate groups

Will export top 1,038,576 rows (leaving room for headers and formulas)

✓ Exporting to Excel: duplicates_complete.xlsx
  Formatting Summary sheet...
  Formatting All_Duplicate_Rows sheet...
  Adding row formatting...


KeyboardInterrupt: 

## Analyze Which Columns Vary Within Duplicate Groups

Check which columns actually differ within duplicate groups to understand what makes them "duplicates" vs truly different records.

In [13]:
def analyze_column_variations_in_duplicates(df, duplicate_check_columns):
    """
    For each column NOT used in duplicate detection, check how often it varies within duplicate groups.
    
    This helps identify which columns are actually different vs identical within duplicate groups.
    
    Parameters:
    df: DataFrame to analyze
    duplicate_check_columns: List of columns used to identify duplicates
    
    Returns:
    DataFrame with variation statistics for each column
    """
    print("Analyzing which columns vary within duplicate groups...")
    print("=" * 80)
    
    # Get duplicate rows
    is_duplicate = df.duplicated(subset=duplicate_check_columns, keep=False)
    duplicate_rows = df[is_duplicate].copy()
    
    if len(duplicate_rows) == 0:
        print("No duplicates found!")
        return None
    
    # Get all columns that are NOT in the duplicate check (these can potentially vary)
    all_columns = df.columns.tolist()
    columns_to_analyze = [col for col in all_columns if col not in duplicate_check_columns]
    
    print(f"\nTotal duplicate rows: {len(duplicate_rows):,}")
    print(f"Columns used for duplicate detection: {len(duplicate_check_columns)}")
    print(f"Columns to analyze for variations: {len(columns_to_analyze)}")
    print()
    
    # Group by duplicate check columns
    grouped = duplicate_rows.groupby(duplicate_check_columns)
    num_groups = len(grouped)
    
    print(f"Number of duplicate groups: {num_groups:,}")
    print()
    
    # For each column, calculate variation statistics
    variation_stats = []
    
    for col in columns_to_analyze:
        # Count how many unique values exist in each group for this column
        unique_counts = grouped[col].nunique()
        
        # Groups with variation (more than 1 unique value)
        groups_with_variation = (unique_counts > 1).sum()
        pct_groups_with_variation = 100 * groups_with_variation / num_groups
        
        # Groups with no variation (only 1 unique value or all null)
        groups_without_variation = num_groups - groups_with_variation
        pct_groups_without_variation = 100 * groups_without_variation / num_groups
        
        # Average number of unique values per group
        avg_unique_per_group = unique_counts.mean()
        max_unique_in_group = unique_counts.max()
        
        variation_stats.append({
            'column': col,
            'pct_groups_with_variation': pct_groups_with_variation,
            'avg_unique_per_group': avg_unique_per_group
        })
    
    # Create results DataFrame
    results_df = pd.DataFrame(variation_stats)
    
    # Sort by percentage of groups with variation (descending)
    results_df = results_df.sort_values('pct_groups_with_variation', ascending=False)
    
    return results_df

# Run the analysis
variation_analysis = analyze_column_variations_in_duplicates(full_df_live, duplicate_check_cols)

# Display results
print("\n" + "=" * 80)
print("COLUMN VARIATION ANALYSIS")
print("=" * 80)
print(f"\nColumns ranked by how often they vary within duplicate groups:\n")

# Format and display the results
pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.width', 200)
pd.set_option('display.max_columns', None)

print(variation_analysis.to_string(index=False))

print("\n" + "=" * 80)
print("INTERPRETATION:")
print("=" * 80)
print("• High 'pct_groups_with_variation' = Column often differs within duplicate groups")
print("• Low 'pct_groups_with_variation' = Column usually identical within duplicate groups")
print("• 'avg_unique_per_group' shows average number of different values per group")
print("=" * 80)

Analyzing which columns vary within duplicate groups...

Total duplicate rows: 29,657,088
Columns used for duplicate detection: 16
Columns to analyze for variations: 24

Number of duplicate groups: 5,319,178


COLUMN VARIATION ANALYSIS

Columns ranked by how often they vary within duplicate groups:

               column  pct_groups_with_variation  avg_unique_per_group
     REPORT_YEAR_WEEK                  98.248696              4.847148
     INV_LST_PRCE_AMT                  48.840422              1.873822
         INV_MSRP_AMT                  16.474312              1.245404
             VIN_SOLD                  11.729801              1.117298
      VEH_AT_PHYS_LOC                   7.331960              1.073320
     NVI_CENSUS_TRACT                   3.354710              1.042082
        NVI_OWNSHP_DT                   2.662347              1.032254
      NVI_CONTROL_NBR                   2.501552              1.026663
   NVI_EFCTV_START_DT                   2.314907            

In [31]:
# Export variation analysis to Excel with formatting
from openpyxl import load_workbook
from openpyxl.styles import Font, Alignment, PatternFill
from openpyxl.utils import get_column_letter

output_file = 'column_variation_analysis.xlsx'
variation_analysis.to_excel(output_file, index=False, sheet_name='Variation_Analysis')

# Load and format
wb = load_workbook(output_file)
ws = wb['Variation_Analysis']

# Header formatting
header_font = Font(bold=True, size=12, color='FFFFFF')
header_fill = PatternFill(start_color='366092', end_color='366092', fill_type='solid')
header_alignment = Alignment(horizontal='center', vertical='center', wrap_text=True)

for col_num in range(1, len(variation_analysis.columns) + 1):
    cell = ws.cell(row=1, column=col_num)
    cell.font = header_font
    cell.fill = header_fill
    cell.alignment = header_alignment

# Column widths
ws.column_dimensions['A'].width = 35  # column name
ws.column_dimensions['B'].width = 28  # pct_groups_with_variation
ws.column_dimensions['C'].width = 25  # avg_unique_per_group

# Format data rows
for row_num in range(2, len(variation_analysis) + 2):
    # Column name - left aligned
    ws.cell(row=row_num, column=1).alignment = Alignment(horizontal='left', vertical='center')
    
    # Numbers - right aligned with appropriate formatting
    for col_num in range(2, len(variation_analysis.columns) + 1):
        cell = ws.cell(row=row_num, column=col_num)
        cell.alignment = Alignment(horizontal='right', vertical='center')
        
        # Format percentage (column B)
        if col_num == 2:  # pct_groups_with_variation
            cell.number_format = '0.00'
        # Format decimals (column C)
        elif col_num == 3:  # avg_unique_per_group
            cell.number_format = '0.00'

# Alternating row colors
light_fill = PatternFill(start_color='F2F2F2', end_color='F2F2F2', fill_type='solid')
for row_num in range(2, len(variation_analysis) + 2):
    if row_num % 2 == 0:
        for col_num in range(1, len(variation_analysis.columns) + 1):
            ws.cell(row=row_num, column=col_num).fill = light_fill

# Freeze header
ws.freeze_panes = 'A2'

# Auto-filter
ws.auto_filter.ref = ws.dimensions

# Row heights
ws.row_dimensions[1].height = 40
for row_num in range(2, len(variation_analysis) + 2):
    ws.row_dimensions[row_num].height = 18

wb.save(output_file)

print(f"✓ Variation analysis exported to: {output_file}")
print(f"  {len(variation_analysis)} columns analyzed")
print(f"  Sorted by pct_groups_with_variation (highest first)")
print(f"  Features: Formatted headers, proper number formats, alternating colors, frozen header, auto-filter")

✓ Variation analysis exported to: column_variation_analysis.xlsx
  26 columns analyzed
  Sorted by pct_groups_with_variation (highest first)
  Features: Formatted headers, proper number formats, alternating colors, frozen header, auto-filter


## Focus on REPORT_YEAR_WEEK Variations

Since you suspect REPORT_YEAR_WEEK is the main differentiator, let's specifically analyze it.

In [None]:
def analyze_report_week_as_only_difference(df, duplicate_check_columns):
    """
    Specifically check if REPORT_YEAR_WEEK is the ONLY difference in duplicate groups.
    
    This identifies "true duplicates" where the same record appears in multiple weeks.
    """
    print("=" * 80)
    print("REPORT_YEAR_WEEK SPECIFIC ANALYSIS")
    print("=" * 80)
    
    # Get duplicate rows
    is_duplicate = df.duplicated(subset=duplicate_check_columns, keep=False)
    duplicate_rows = df[is_duplicate].copy()
    
    if len(duplicate_rows) == 0:
        print("No duplicates found!")
        return None
    
    # Group by duplicate check columns
    grouped = duplicate_rows.groupby(duplicate_check_columns)
    num_groups = len(grouped)
    
    # Get all columns except duplicate_check_columns and REPORT_YEAR_WEEK
    all_columns = df.columns.tolist()
    other_columns = [col for col in all_columns 
                     if col not in duplicate_check_columns and col != 'REPORT_YEAR_WEEK']
    
    print(f"\nTotal duplicate groups: {num_groups:,}")
    print(f"Checking if REPORT_YEAR_WEEK is the ONLY varying column...")
    print()
    
    # Count groups where ONLY REPORT_YEAR_WEEK varies
    groups_only_week_varies = 0
    groups_week_and_others_vary = 0
    groups_week_constant = 0
    
    week_variation_details = []
    
    for name, group in grouped:
        # Check if REPORT_YEAR_WEEK varies in this group
        week_unique = group['REPORT_YEAR_WEEK'].nunique()
        week_varies = week_unique > 1
        
        # Check if ANY other column varies
        other_columns_vary = False
        varying_other_cols = []
        
        for col in other_columns:
            if group[col].nunique() > 1:
                other_columns_vary = True
                varying_other_cols.append(col)
        
        # Categorize this group
        if week_varies and not other_columns_vary:
            groups_only_week_varies += 1
            week_variation_details.append({
                'group_size': len(group),
                'num_weeks': week_unique,
                'weeks': sorted(group['REPORT_YEAR_WEEK'].unique()),
                'category': 'ONLY_WEEK_VARIES'
            })
        elif week_varies and other_columns_vary:
            groups_week_and_others_vary += 1
            week_variation_details.append({
                'group_size': len(group),
                'num_weeks': week_unique,
                'weeks': sorted(group['REPORT_YEAR_WEEK'].unique()),
                'other_varying_cols': varying_other_cols,
                'category': 'WEEK_AND_OTHERS_VARY'
            })
        else:
            groups_week_constant += 1
    
    # Calculate percentages
    pct_only_week = 100 * groups_only_week_varies / num_groups
    pct_week_and_others = 100 * groups_week_and_others_vary / num_groups
    pct_week_constant = 100 * groups_week_constant / num_groups
    
    print(f"Results:")
    print(f"  Groups where ONLY REPORT_YEAR_WEEK varies: {groups_only_week_varies:,} ({pct_only_week:.2f}%)")
    print(f"  Groups where REPORT_YEAR_WEEK + other columns vary: {groups_week_and_others_vary:,} ({pct_week_and_others:.2f}%)")
    print(f"  Groups where REPORT_YEAR_WEEK is constant: {groups_week_constant:,} ({pct_week_constant:.2f}%)")
    
    print("\n" + "=" * 80)
    print("INTERPRETATION:")
    print("=" * 80)
    print(f"✓ {pct_only_week:.1f}% of duplicate groups differ ONLY in REPORT_YEAR_WEEK")
    print(f"  → These are the same record appearing in multiple weekly reports")
    print()
    print(f"✓ {pct_week_and_others:.1f}% differ in REPORT_YEAR_WEEK AND other columns")
    print(f"  → These records changed over time (price, location, etc.)")
    print()
    print(f"✓ {pct_week_constant:.1f}% have the SAME REPORT_YEAR_WEEK")
    print(f"  → These are true duplicates within the same week")
    print("=" * 80)
    
    return {
        'summary': {
            'total_groups': num_groups,
            'only_week_varies': groups_only_week_varies,
            'week_and_others_vary': groups_week_and_others_vary,
            'week_constant': groups_week_constant,
            'pct_only_week': pct_only_week,
            'pct_week_and_others': pct_week_and_others,
            'pct_week_constant': pct_week_constant
        },
        'details': week_variation_details
    }

# Run the analysis
week_analysis = analyze_report_week_as_only_difference(full_df_live, duplicate_check_cols)

In [None]:
# Show some examples of groups where ONLY the week varies
print("\n" + "=" * 80)
print("SAMPLE GROUPS WHERE ONLY REPORT_YEAR_WEEK VARIES")
print("=" * 80)

only_week_examples = [d for d in week_analysis['details'] if d['category'] == 'ONLY_WEEK_VARIES']

if only_week_examples:
    # Sort by group size and show top 5
    only_week_examples_sorted = sorted(only_week_examples, key=lambda x: x['group_size'], reverse=True)
    
    print(f"\nShowing top 5 largest groups (out of {len(only_week_examples):,} total):\n")
    
    for i, example in enumerate(only_week_examples_sorted[:5], 1):
        print(f"{i}. Group with {example['group_size']} rows across {example['num_weeks']} weeks")
        print(f"   Weeks: {example['weeks']}")
        print()
    
    # Show distribution of how many weeks duplicates span
    week_counts = [d['num_weeks'] for d in only_week_examples]
    week_count_dist = pd.Series(week_counts).value_counts().sort_index()
    
    print("\nDistribution of weeks spanned by 'only week varies' groups:")
    print(f"{'Num Weeks':<12} {'Count':<12} {'Percentage'}")
    print("-" * 40)
    for num_weeks, count in week_count_dist.items():
        pct = 100 * count / len(only_week_examples)
        print(f"{num_weeks:<12} {count:<12,} {pct:>6.2f}%")
else:
    print("\nNo groups found where only REPORT_YEAR_WEEK varies.")

## Find Duplicate Groups Where INV_LST_PRCE_AMT Varies

Looking for duplicate groups where the list price (INV_LST_PRCE_AMT) has different values.

In [17]:
# Find duplicate groups where INV_LST_PRCE_AMT varies
# Filter for groups where INV_LST_PRCE_AMT_unique_count > 1 AND group size between 6 and 10
price_varying_groups = results[
    (results['INV_LST_PRCE_AMT_unique_count'] > 1) &
    (results['duplicate_group_size'] >= 6) &
    (results['duplicate_group_size'] <= 10)
].copy()

print(f"Found {len(price_varying_groups):,} duplicate groups where:")
print(f"  - INV_LST_PRCE_AMT varies")
print(f"  - Group size is between 6 and 10 rows")
print(f"That's {100*len(price_varying_groups)/len(results):.2f}% of all duplicate groups")
print("\n" + "="*80)
print("Groups with price variations (group size 6-10, sorted by group size):")
print("="*80)

# Show top 20 by group size
for i, (idx, row) in enumerate(price_varying_groups.head(20).iterrows(), 1):
    print(f"\n{'='*80}")
    print(f"GROUP {i} - Size: {row['duplicate_group_size']:,} rows")
    print(f"{'='*80}")
    
    # Show ALL duplicate check columns (these define the group)
    print("\nDuplicate Check Columns (what makes this a group):")
    for col in duplicate_check_cols:
        print(f"  {col}: {row[col]}")
    
    # Show variation info
    print(f"\nPrice Variations:")
    print(f"  INV_LST_PRCE_AMT: {row['INV_LST_PRCE_AMT_unique_count']} unique values - {row['INV_LST_PRCE_AMT_values']}")
    
    # Show other variation columns summary
    print(f"\nOther Variations:")
    for col in variation_cols:
        if col != 'INV_LST_PRCE_AMT':
            unique_count = row[f'{col}_unique_count']
            if unique_count > 1:
                values = row[f'{col}_values']
                print(f"  {col}: {unique_count} unique values - {values}")

Found 1,352,764 duplicate groups where:
  - INV_LST_PRCE_AMT varies
  - Group size is between 6 and 10 rows
That's 25.43% of all duplicate groups

Groups with price variations (group size 6-10, sorted by group size):

GROUP 1 - Size: 10 rows

Duplicate Check Columns (what makes this a group):
  INV_MIN_VIN_SOLD_DT: 2025-09-30 00:00:00
  SRC_VEH_FIRST_SCRAPED_DT: 2025-06-25 00:00:00
  TRIM_DESCRIPTION: SEL SPORT
  INV_CENSUS_TRACT: 37155009613
  INV_CONTROL_NBR: 0545961
  INV_DEALER_NAME: ALM HYUNDAI LUMBERTON
  INV_TOWN_NAME: LUMBERTON
  INV_STATE_ABBRV: NC
  MAKE_DESC: HYUNDAI
  MODEL_DESC: ELANTRA
  SERIES_TEXT: SLS
  FUEL_DESC: GASOLINE
  SEGMENT_DESC: *COMPACT CAR
  ADVNC_VEH_TYPE_DESC: Gasoline
  MODEL_KEY_SHORT: 148799
  INV_COUNT: 1.0

Price Variations:
  INV_LST_PRCE_AMT: 4 unique values - 23772.0, 25165.0, 25105.0, 23717.0

Other Variations:
  INV_MSRP_AMT: 2 unique values - 25165.0, 25105.0

GROUP 2 - Size: 10 rows

Duplicate Check Columns (what makes this a group):
  INV_MIN

In [15]:
# Get the actual rows from the first duplicate group with price variations
# This will show you all the rows that belong to that group

if len(price_varying_groups) > 0:
    # Get the first group's identifying values
    first_group = price_varying_groups.iloc[0]
    
    print(f"Fetching all rows for the largest duplicate group with price variations...")
    print(f"Group size: {first_group['duplicate_group_size']:,} rows")
    print("="*80)
    
    # Build a filter to get all rows from this duplicate group
    filter_mask = pd.Series(True, index=full_df_live.index)
    for col in duplicate_check_cols:
        filter_mask &= (full_df_live[col] == first_group[col])
    
    # Get the actual rows
    example_group = full_df_live[filter_mask].copy()
    
    # Display key columns including the varying price
    display_cols = ['REPORT_YEAR_WEEK', 'VEH_AT_PHYS_LOC', 'INV_LST_PRCE_AMT', 'INV_MSRP_AMT', 
                    'VIN_SOLD', 'DAYS_LSTD_BFR_SLD', 'INV_DEALER_NAME', 'MAKE_DESC', 'MODEL_DESC']
    
    print(f"\nShowing key columns for this duplicate group:")
    print(example_group[display_cols].to_string())
    
    print(f"\n\nINV_LST_PRCE_AMT value counts in this group:")
    print(example_group['INV_LST_PRCE_AMT'].value_counts())
else:
    print("No duplicate groups with price variations found!")

Fetching all rows for the largest duplicate group with price variations...
Group size: 424 rows

Showing key columns for this duplicate group:
         REPORT_YEAR_WEEK VEH_AT_PHYS_LOC  INV_LST_PRCE_AMT  INV_MSRP_AMT  VIN_SOLD  DAYS_LSTD_BFR_SLD      INV_DEALER_NAME   MAKE_DESC   MODEL_DESC
9754              2025-42               N          159095.0      159095.0       0.0             -999.0  JAGUAR SANTA MONICA  LAND ROVER  RANGE ROVER
115133            2025-40               N          130900.0      130900.0       0.0             -999.0  JAGUAR SANTA MONICA  LAND ROVER  RANGE ROVER
156267            2025-40               N          159950.0      159950.0       0.0             -999.0  JAGUAR SANTA MONICA  LAND ROVER  RANGE ROVER
166297            2025-41               N          151325.0      151325.0       0.0             -999.0  JAGUAR SANTA MONICA  LAND ROVER  RANGE ROVER
275320            2025-41               N          150330.0      150330.0       0.0             -999.0  JAGUAR S