# Cell 1: Imports and Setup

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide pandas display settings:
#   - max_columns=None  -> never elide columns when a frame is rendered
#   - float_format      -> render every float with exactly two decimals
display_options = {
    'display.max_columns': None,
    'display.float_format': '{:.2f}'.format,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

print("Libraries loaded successfully")

Libraries loaded successfully


# Cell 2: Load All Tables

In [2]:
# Base names (no extension) of the raw CSV extracts read from ../data/raw/.
files = [
    'transaction_data',
    'hh_demographic',
    'product',
    'campaign_desc',
    'campaign_table',
    'coupon',
    'coupon_redempt',
    'causal_data',
]

# Read each extract into `tables`, keyed by its base name, and report the
# shape right after each load so progress is visible on the large files.
tables = {}
for table_name in files:
    frame = pd.read_csv(f'../data/raw/{table_name}.csv')
    tables[table_name] = frame
    n_rows, n_cols = frame.shape
    print(f"{table_name}: {n_rows:,} rows, {n_cols} columns")

transaction_data: 2,595,732 rows, 12 columns
hh_demographic: 801 rows, 8 columns
product: 92,353 rows, 7 columns
campaign_desc: 30 rows, 4 columns
campaign_table: 7,208 rows, 3 columns
coupon: 124,548 rows, 3 columns
coupon_redempt: 2,318 rows, 4 columns
causal_data: 36,786,524 rows, 5 columns


# Cell 3: Profile Transaction Data

In [3]:
trans = tables['transaction_data']

# Compute the headline figures up front so the report below is pure formatting.
first_day = trans['DAY'].min()
last_day = trans['DAY'].max()
household_count = trans['household_key'].nunique()
basket_count = trans['BASKET_ID'].nunique()
product_count = trans['PRODUCT_ID'].nunique()
revenue_total = trans['SALES_VALUE'].sum()
retail_disc_total = trans['RETAIL_DISC'].sum()
coupon_disc_total = trans['COUPON_DISC'].sum()

print("=== TRANSACTION DATA PROFILE ===")
print(f"\nDate range: Day {first_day} to Day {last_day}")
print(f"Unique households: {household_count:,}")
print(f"Unique baskets: {basket_count:,}")
print(f"Unique products: {product_count:,}")
print(f"\nTotal revenue: ${revenue_total:,.2f}")
print(f"Total discounts: ${retail_disc_total:,.2f} (retail)")
print(f"               ${coupon_disc_total:,.2f} (coupon)")

# Last expression of the cell: rendered as the summary-statistics table.
trans.describe()

=== TRANSACTION DATA PROFILE ===

Date range: Day 1 to Day 711
Unique households: 2,500
Unique baskets: 276,484
Unique products: 92,339

Total revenue: $8,057,463.08
Total discounts: $-1,398,334.84 (retail)
               $-42,611.54 (coupon)


Unnamed: 0,household_key,BASKET_ID,DAY,PRODUCT_ID,QUANTITY,SALES_VALUE,STORE_ID,RETAIL_DISC,TRANS_TIME,WEEK_NO,COUPON_DISC,COUPON_MATCH_DISC
count,2595732.0,2595732.0,2595732.0,2595732.0,2595732.0,2595732.0,2595732.0,2595732.0,2595732.0,2595732.0,2595732.0,2595732.0
mean,1271.95,34026199138.89,388.76,2891435.16,100.43,3.1,3142.67,-0.54,1561.59,56.22,-0.02,-0.0
std,726.07,4711649037.86,189.72,3837403.69,1153.44,4.18,8937.11,1.25,399.84,27.1,0.22,0.04
min,1.0,26984851472.0,1.0,25671.0,0.0,0.0,1.0,-180.0,0.0,1.0,-55.93,-7.7
25%,656.0,30408046256.0,229.0,917459.0,1.0,1.29,330.0,-0.69,1308.0,33.0,0.0,0.0
50%,1272.0,32760806548.0,390.0,1028816.0,1.0,2.0,372.0,-0.01,1613.0,56.0,0.0,0.0
75%,1913.0,40126853628.0,553.0,1133018.0,1.0,3.49,422.0,0.0,1843.0,80.0,0.0,0.0
max,2500.0,42305362535.0,711.0,18316298.0,89638.0,840.0,34280.0,3.99,2359.0,102.0,0.0,0.0


# Cell 4: CHECK FOR ANOMALY - Positive Discounts

In [4]:
# Build each "discount is positive" mask once; the counts, the branch
# condition and the row filter below all reuse them.
retail_mask = trans['RETAIL_DISC'] > 0
coupon_mask = trans['COUPON_DISC'] > 0
either_mask = retail_mask | coupon_mask

retail_positive = retail_mask.sum()
coupon_positive = coupon_mask.sum()

print("=== DATA QUALITY CHECK: DISCOUNTS ===")
print(f"Rows with RETAIL_DISC > 0: {retail_positive}")
print(f"Rows with COUPON_DISC > 0: {coupon_positive}")

if either_mask.any():
    print("\n⚠️ ANOMALY DETECTED: Positive discount values found!")
    print("These should be negative (discounts reduce price)")

    # Pull the offending rows (positive in either column) for inspection.
    anomalies = trans.loc[either_mask]
    print(f"\nAnomalous rows: {len(anomalies)}")
    print(anomalies.head(10))

=== DATA QUALITY CHECK: DISCOUNTS ===
Rows with RETAIL_DISC > 0: 36
Rows with COUPON_DISC > 0: 0

⚠️ ANOMALY DETECTED: Positive discount values found!
These should be negative (discounts reduce price)

Anomalous rows: 36
         household_key    BASKET_ID  DAY  PRODUCT_ID  QUANTITY  SALES_VALUE  \
968972            1306  31624096856  310      990941         0         0.01   
1001608           1306  31735306143  318     7409789         0         0.00   
1023085           1321  31803641901  323     1051069         0         0.00   
1023086           1321  31803641901  323     1055863         0         0.00   
1067337             37  31944616637  334     8090539         0         0.00   
1143255           2241  32173242560  353      995242         0         0.00   
1210504            923  32446036423  369     1047226         0         0.00   
1215551           1535  32478752229  370      999270         1         7.98   
1216489           2491  32478901200  370    13945244         0      

# Cell 5: Profile Demographics

In [5]:
demog = tables['hh_demographic']

# Coverage = share of *transacting* households that have a demographic record.
# Dividing raw row counts (len(demog) / n_households) silently overstates this
# when the demographic table holds duplicate households or households that
# never transacted, so measure the actual key overlap instead.
# NOTE(review): assumes hh_demographic carries the same `household_key` column
# as transaction_data -- confirm against the raw file.
trans_households = trans['household_key'].dropna().drop_duplicates()
n_trans_households = len(trans_households)
n_covered = trans_households.isin(demog['household_key']).sum()
# Guard the division so an empty transaction table reports nan instead of
# raising ZeroDivisionError.
coverage_pct = (
    n_covered / n_trans_households * 100 if n_trans_households else float('nan')
)

print("=== DEMOGRAPHIC COVERAGE ===")
print(f"Households with demographics: {len(demog)}")
print(f"Total households in transactions: {n_trans_households}")
print(f"Coverage: {coverage_pct:.1f}%")
if n_covered != len(demog):
    # Only shown when the simple row count and the real overlap disagree.
    print(
        f"(note: {len(demog) - n_covered} demographic rows are duplicates "
        "or have no matching transactions)"
    )

print("\n=== DEMOGRAPHIC DISTRIBUTIONS ===")
for col in ['AGE_DESC', 'INCOME_DESC', 'HH_COMP_DESC']:
    print(f"\n{col}:")
    # dropna=False: a profiling pass must surface missing values rather than
    # silently dropping them from the distribution.
    print(demog[col].value_counts(dropna=False))

=== DEMOGRAPHIC COVERAGE ===
Households with demographics: 801
Total households in transactions: 2500
Coverage: 32.0%

=== DEMOGRAPHIC DISTRIBUTIONS ===

AGE_DESC:
AGE_DESC
45-54    288
35-44    194
25-34    142
65+       72
55-64     59
19-24     46
Name: count, dtype: int64

INCOME_DESC:
INCOME_DESC
50-74K       192
35-49K       172
75-99K        96
25-34K        77
15-24K        74
Under 15K     61
125-149K      38
100-124K      34
150-174K      30
250K+         11
175-199K      11
200-249K       5
Name: count, dtype: int64

HH_COMP_DESC:
HH_COMP_DESC
2 Adults No Kids    255
2 Adults Kids       187
Single Female       144
Single Male          95
Unknown              73
1 Adult Kids         47
Name: count, dtype: int64
