In [None]:
import os
from urllib.parse import quote

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine

# Load DB credentials from the .env file kept alongside the Modeling code.
load_dotenv(os.path.join('..', 'Modeling', '.env'))

db_username = os.getenv('DB_USERNAME')
db_password = os.getenv('DB_PASSWORD')
if not db_username or not db_password:
    # Without this guard the f-string below would silently embed the literal
    # text "None" as the login and only fail later with an opaque auth error.
    raise RuntimeError(
        'DB_USERNAME and DB_PASSWORD must be set (expected in ../Modeling/.env)'
    )

# Percent-encode the credentials so characters such as '@', ':' or '/' in the
# password cannot be mistaken for URL delimiters when SQLAlchemy parses the URL.
connection_string = (
    f"mssql+pyodbc://{quote(db_username, safe='')}:{quote(db_password, safe='')}"
    f"@10.0.30.16:1433/RTID_SourceData"
    f"?driver=ODBC+Driver+17+for+SQL+Server"
    f"&TrustServerCertificate=yes"
    f"&Connection+Timeout=5"
)
engine = create_engine(connection_string)

# create_engine() is lazy and does not contact the server, so open (and
# immediately release) one connection to make the message below truthful.
with engine.connect():
    pass
print('Connected')

In [None]:
# Load the tmp CSV that Stage 1 produced
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, avoiding mixed-dtype columns.
tmp = pd.read_csv(os.path.join('..', 'Modeling', 'tmp_20260205.csv'), low_memory=False)
print(f'Rows: {len(tmp)}')

# The leading '\n' must be an escape sequence: a raw line break inside a
# single-quoted string literal is a SyntaxError.
print('\nis_reporting_brand value counts:')
print(tmp['is_reporting_brand'].value_counts())
print(f'\nUnique brands: {tmp["MAKE_DESC"].nunique()}')
print(tmp['MAKE_DESC'].value_counts())

In [None]:
# Check the raw DB table: which brands exist and how many have SLS vs NVI data?
# A date counts as present only when it is later than 1900-01-01; earlier
# values and NULLs are tallied as missing (presumably 1900-01-01 is the
# table's "no date" sentinel -- TODO confirm against the source schema).
query = """
SELECT 
    MAKE_DESC,
    COUNT(*) as total_rows,
    SUM(CASE WHEN SLS_OWNSHP_DT > '1900-01-01' THEN 1 ELSE 0 END) as has_sales,
    SUM(CASE WHEN NVI_OWNSHP_DT > '1900-01-01' THEN 1 ELSE 0 END) as has_registration,
    SUM(CASE WHEN SLS_OWNSHP_DT <= '1900-01-01' OR SLS_OWNSHP_DT IS NULL THEN 1 ELSE 0 END) as no_sales
FROM SPGM_Live.SPGM_Weekly_INV_NVI_SLS_20260209
GROUP BY MAKE_DESC
ORDER BY total_rows DESC
"""
brands = pd.read_sql(query, engine)

# Rounded percentage is for display only.
brands['pct_reported'] = (brands['has_sales'] / brands['total_rows'] * 100).round(1)

print(f'Total brands in DB: {len(brands)}')
print('\nBrands with <100% sales reporting (potential non-reporting):')
# Filter on the exact counts, not the rounded percentage: a brand at e.g.
# 99.97% rounds to 100.0 and would otherwise be dropped from this list.
print(brands[brands['has_sales'] < brands['total_rows']].to_string(index=False))
print('\nTop 40 brands:')
print(brands.head(40).to_string(index=False))