<a href="https://colab.research.google.com/github/joshuadollison/smallbizpulse/blob/jd-model/notebooks/model_exploration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip -q uninstall -y pyarrow datasets
!pip -q install --no-cache-dir -U pyarrow datasets

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.6/47.6 MB[0m [31m272.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m515.2/515.2 kB[0m [31m296.3 MB/s[0m eta [36m0:00:00[0m
[?25h

# Setup

In [2]:
# ============================================================
# SETUP: Mount Drive, Install Dependencies, Configure Styling
# ============================================================

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Install VADER for sentiment analysis
!pip install vaderSentiment -q

import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import seaborn as sns
from collections import Counter
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# ── Consistent Plot Styling ──────────────────────────────────
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'figure.dpi': 120,
    'font.family': 'sans-serif',
    'font.size': 11,
    'axes.titlesize': 14,
    'axes.titleweight': 'bold',
    'axes.labelsize': 12,
    'axes.spines.top': False,
    'axes.spines.right': False,
    'axes.grid': True,
    'grid.alpha': 0.3,
    'grid.linestyle': '--',
})

# SmallBizPulse color palette
COLORS = {
    'primary': '#2563EB',
    'secondary': '#F59E0B',
    'open': '#10B981',
    'closed': '#EF4444',
    'accent1': '#8B5CF6',
    'accent2': '#EC4899',
    'neutral': '#6B7280',
    'bg': '#F9FAFB',
}
PALETTE_OC = [COLORS['open'], COLORS['closed']]

print("Setup complete — libraries loaded, styling configured.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Setup complete — libraries loaded, styling configured.


In [3]:
# ============================================================
# DATA LOADING
# ============================================================
# >>> UPDATE THIS PATH to match your Google Drive folder <<<
DATA_PATH = '/content/drive/MyDrive/Colab Notebooks/CIS509/yelp_dataset_new/'

def load_json(filename):
    filepath = DATA_PATH + filename
    with open(filepath, 'r') as f:
        first_char = f.read(1)
        f.seek(0)
        if first_char == '[':
            return pd.DataFrame(json.load(f))
        else:
            return pd.read_json(f, lines=True)

print("Loading datasets...")
business_df = load_json('yelp_academic_dataset_business.json')
print(f"  Business: {len(business_df):,} records")

review_df = load_json('yelp_academic_dataset_review.json')
print(f"  Review:   {len(review_df):,} records")

tip_df = load_json('yelp_academic_dataset_tip.json')
print(f"  Tip:      {len(tip_df):,} records")

checkin_df = load_json('yelp_academic_dataset_checkin.json')
print(f"  Checkin:  {len(checkin_df):,} records")

user_df = load_json('yelp_academic_dataset_user.json')
print(f"  User:     {len(user_df):,} records")

print("\nAll datasets loaded successfully.")

Loading datasets...
  Business: 9,973 records
  Review:   100,000 records
  Tip:      264,693 records
  Checkin:  9,337 records
  User:     79,345 records

All datasets loaded successfully.


In [4]:
# ============================================================
# DATA SOURCES & FILTERING CRITERIA
# ============================================================

# Step 1: Filter for restaurants
# A business is classified as a "restaurant" if its Yelp categories
# contain the word "Restaurants" (case-insensitive).
restaurant_df = business_df[
    business_df['categories'].str.contains('Restaurants', case=False, na=False)
].copy()

# Step 2: Get restaurant business IDs
restaurant_ids = set(restaurant_df['business_id'])

# Step 3: Filter reviews to restaurant-only
rest_review_df = review_df[review_df['business_id'].isin(restaurant_ids)].copy()
rest_review_df['date'] = pd.to_datetime(rest_review_df['date'])
rest_review_df['year'] = rest_review_df['date'].dt.year

# Step 4: Filter tips to restaurant-only
rest_tip_df = tip_df[tip_df['business_id'].isin(restaurant_ids)].copy()

# Step 5: Filter checkins to restaurant-only
rest_checkin_df = checkin_df[checkin_df['business_id'].isin(restaurant_ids)].copy()

# Step 6: Merge business status onto reviews
status_map = restaurant_df.set_index('business_id')['is_open'].to_dict()
rest_review_df['is_open'] = rest_review_df['business_id'].map(status_map)
rest_review_df['status'] = rest_review_df['is_open'].map({1: 'Open', 0: 'Closed'})

# ── Print Summary ────────────────────────────────────────────
print("=" * 65)
print("DATA SOURCES & FILTERING SUMMARY")
print("=" * 65)

print("\nPRIMARY DATA SOURCE: Yelp Academic Dataset")
print("-" * 45)

print("\nFull Dataset:")
print(f"  Businesses:  {len(business_df):>8,}")
print(f"  Reviews:     {len(review_df):>8,}")
print(f"  Tips:        {len(tip_df):>8,}")
print(f"  Check-ins:   {len(checkin_df):>8,}")
print(f"  Users:       {len(user_df):>8,}")

print("\nFiltered to Restaurants (categories contain 'Restaurants'):")
print(f"  Restaurants:        {len(restaurant_df):>8,}")
print(f"  Restaurant Reviews: {len(rest_review_df):>8,}")
print(f"  Restaurant Tips:    {len(rest_tip_df):>8,}")
print(f"  Restaurant Checkins:{len(rest_checkin_df):>8,}")

n_open = (restaurant_df['is_open'] == 1).sum()
n_closed = (restaurant_df['is_open'] == 0).sum()
print("\nRestaurant Status:")
print(f"  Open:   {n_open:>5,}  ({n_open / len(restaurant_df) * 100:.1f}%)")
print(f"  Closed: {n_closed:>5,}  ({n_closed / len(restaurant_df) * 100:.1f}%)")

print(f"\nDate Range: {rest_review_df['date'].min().strftime('%Y-%m-%d')} to "
      f"{rest_review_df['date'].max().strftime('%Y-%m-%d')}")

print("\nFILTERING CRITERIA APPLIED:")
print("  1. Category filter: categories.str.contains('Restaurants')")
print("  2. Reviews, tips, and check-ins filtered by restaurant business_id")
print("  3. No minimum review count threshold (preserving data-sparse")
print("     businesses is important for studying closure patterns)")
print("  4. No date range restriction (full temporal span needed for time-series)")

DATA SOURCES & FILTERING SUMMARY

PRIMARY DATA SOURCE: Yelp Academic Dataset
---------------------------------------------

Full Dataset:
  Businesses:     9,973
  Reviews:      100,000
  Tips:         264,693
  Check-ins:      9,337
  Users:         79,345

Filtered to Restaurants (categories contain 'Restaurants'):
  Restaurants:           4,132
  Restaurant Reviews:   72,124
  Restaurant Tips:      20,394
  Restaurant Checkins:   4,085

Restaurant Status:
  Open:   2,575  (62.3%)
  Closed: 1,557  (37.7%)

Date Range: 2005-03-01 to 2018-10-04

FILTERING CRITERIA APPLIED:
  1. Category filter: categories.str.contains('Restaurants')
  2. Reviews, tips, and check-ins filtered by restaurant business_id
  3. No minimum review count threshold (preserving data-sparse
     businesses is important for studying closure patterns)
  4. No date range restriction (full temporal span needed for time-series)


# Get some counts

- wanted to see counts per month to get a sense of the types of model and windows we would want for regression/time-series

In [5]:
import pandas as pd

# Safety - ensure datetime (EDA notebook already does this, but this won't hurt)
rest_review_df['date'] = pd.to_datetime(rest_review_df['date'], errors='coerce')

# 1) Overall monthly review counts
monthly_counts = (
    rest_review_df
      .dropna(subset=['date'])
      .groupby(rest_review_df['date'].dt.to_period('M'))
      .size()
      .rename('review_count')
      .reset_index(name='review_count')
      .rename(columns={'date': 'month'})
)

# Convert Period to timestamp for easy plotting/merging (month start)
monthly_counts['month'] = monthly_counts['month'].dt.to_timestamp()

print(monthly_counts.head(12))
print('\nRows:', len(monthly_counts))
print('Date range:', monthly_counts['month'].min(), 'to', monthly_counts['month'].max())

# 2) Monthly counts split by business status (Open vs Closed) - if you created 'status' in EDA
if 'status' in rest_review_df.columns:
    monthly_counts_by_status = (
        rest_review_df
          .dropna(subset=['date'])
          .groupby([rest_review_df['date'].dt.to_period('M'), 'status'])
          .size()
          .rename('review_count')
          .reset_index()
          .rename(columns={'date': 'month'})
    )
    monthly_counts_by_status['month'] = monthly_counts_by_status['month'].dt.to_timestamp()
    print('\nBy status:')
    print(monthly_counts_by_status.head(12))

# 3) Monthly counts per business_id (useful for later time-series modeling)
monthly_counts_by_business = (
    rest_review_df
      .dropna(subset=['date'])
      .groupby(['business_id', rest_review_df['date'].dt.to_period('M')])
      .size()
      .rename('review_count')
      .reset_index()
      .rename(columns={'date': 'month'})
)
monthly_counts_by_business['month'] = monthly_counts_by_business['month'].dt.to_timestamp()

print('\nPer business:')
print(monthly_counts_by_business.head(12))

        month  review_count
0  2005-03-01             4
1  2005-04-01             3
2  2005-05-01             4
3  2005-06-01             1
4  2005-07-01            14
5  2005-08-01             1
6  2005-09-01             7
7  2005-10-01             2
8  2005-11-01             7
9  2005-12-01             4
10 2006-01-01            13
11 2006-02-01             5

Rows: 156
Date range: 2005-03-01 00:00:00 to 2018-10-01 00:00:00

By status:
        month  status  review_count
0  2005-03-01  Closed             1
1  2005-03-01    Open             3
2  2005-04-01  Closed             2
3  2005-04-01    Open             1
4  2005-05-01  Closed             2
5  2005-05-01    Open             2
6  2005-06-01    Open             1
7  2005-07-01  Closed             7
8  2005-07-01    Open             7
9  2005-08-01    Open             1
10 2005-09-01  Closed             3
11 2005-09-01    Open             4

Per business:
               business_id      month  review_count
0   --ZVrH2X2QXBFdCilbi

In [6]:
# Pivot for quick plot-ready table (month rows, status columns)
if 'status' in rest_review_df.columns:
    pivot = monthly_counts_by_status.pivot(index='month', columns='status', values='review_count').fillna(0).astype(int)
    print(pivot.tail(12))


status      Closed  Open
month                   
2017-11-01     129   529
2017-12-01     181   904
2018-01-01     141   652
2018-02-01     202   888
2018-03-01     220  1035
2018-04-01     202  1012
2018-05-01     177   924
2018-06-01     144   844
2018-07-01     182   967
2018-08-01     137   799
2018-09-01     105   749
2018-10-01       8    38


In [7]:
import pandas as pd

# Ensure datetime
rest_review_df['date'] = pd.to_datetime(rest_review_df['date'], errors='coerce')
df = rest_review_df.dropna(subset=['date']).copy()

# Ensure status exists (Open/Closed)
if 'status' not in df.columns:
    if 'is_open' in df.columns:
        df['status'] = df['is_open'].map({1: 'Open', 0: 'Closed'})
    else:
        raise ValueError("Need either 'status' or 'is_open' in rest_review_df.")

# Build year-month grain counts (so the averaging is fair across years)
df['year'] = df['date'].dt.year
df['month_num'] = df['date'].dt.month
df['month_name'] = df['date'].dt.strftime('%b')  # Jan, Feb, ...

monthly_counts = (
    df.groupby(['status', 'year', 'month_num', 'month_name'])
      .size()
      .reset_index(name='review_count')
)

# Average by calendar month across years
avg_by_month = (
    monthly_counts.groupby(['status', 'month_num', 'month_name'])['review_count']
      .mean()
      .reset_index(name='avg_reviews_per_month')
      .sort_values(['month_num', 'status'])
)

# Nice pivot view (rows = month, cols = status)
avg_by_month_pivot = (
    avg_by_month.pivot(index=['month_num', 'month_name'], columns='status', values='avg_reviews_per_month')
      .reset_index()
      .sort_values('month_num')
)

print(avg_by_month_pivot)

status  month_num month_name      Closed        Open
0               1        Jan  133.230769  343.076923
1               2        Feb  159.300000  356.083333
2               3        Mar  148.333333  392.833333
3               4        Apr  121.538462  348.230769
4               5        May  131.000000  358.285714
5               6        Jun  118.307692  318.000000
6               7        Jul  140.615385  404.461538
7               8        Aug  142.692308  367.571429
8               9        Sep  104.923077  320.833333
9              10        Oct  135.692308  342.153846
10             11        Nov  101.833333  273.000000
11             12        Dec  119.000000  298.166667


# Build the Business-Month Feature Table

We start by creating a clean monthly view of the reviews dataset.  First, we convert each review timestamp into a month bucket (YYYY-MM) so we can measure activity and behavior at a consistent time grain.  

We then compute:
1. Total monthly review volume split by business status (Open vs Closed)
2. A business-month feature table that aggregates review behavior for each restaurant each month (review count, average star rating, rating mix, engagement signals, and basic text length statistics).  

This business-month table becomes the backbone for the modeling pipeline - we will later enrich it with neural-network sentiment scores and BERTopic topic proportions, then feed sequences of monthly features into a GRU/RNN to predict future sentiment direction and closure risk.

In [8]:
import pandas as pd

df = rest_review_df.copy()

# Ensure datetime
df['date'] = pd.to_datetime(df['date'], errors='coerce')
df = df.dropna(subset=['date'])

# Month bucket (month start timestamp)
df['month'] = df['date'].dt.to_period('M').dt.to_timestamp()

# 1) Monthly totals by status (Open vs Closed)
monthly_by_status = (
    df.groupby(['status', 'month'])
      .size()
      .reset_index(name='review_count')
      .sort_values(['month', 'status'])
)

print(monthly_by_status.head(24))

# 2) Business-month backbone table (this is what the RNN will consume)
biz_month = (
    df.groupby(['business_id', 'status', 'month'])
      .agg(
          review_count=('review_id', 'count'),
          avg_stars=('stars', 'mean'),
          pct_1star=('stars', lambda s: (s <= 1.0).mean()),
          pct_5star=('stars', lambda s: (s >= 5.0).mean()),
          avg_useful=('useful', 'mean'),
          avg_funny=('funny', 'mean'),
          avg_cool=('cool', 'mean'),
          avg_text_len=('text', lambda x: x.fillna('').str.len().mean()),
          avg_word_count=('text', lambda x: x.fillna('').str.split().str.len().mean()),
      )
      .reset_index()
      .sort_values(['business_id', 'month'])
)

print(biz_month.head(20))

# 3) Optional: filter to businesses with enough activity for monthly sequences
# Example rule: at least 12 total business-month rows in the dataset
eligible = (
    biz_month.groupby('business_id')['month']
             .nunique()
             .reset_index(name='n_months')
)
eligible_ids = eligible.loc[eligible['n_months'] >= 12, 'business_id']

biz_month_eligible = biz_month[biz_month['business_id'].isin(eligible_ids)].copy()
print("Eligible businesses:", biz_month_eligible['business_id'].nunique())
print("Eligible rows:", len(biz_month_eligible))

     status      month  review_count
0    Closed 2005-03-01             1
150    Open 2005-03-01             3
1    Closed 2005-04-01             2
151    Open 2005-04-01             1
2    Closed 2005-05-01             2
152    Open 2005-05-01             2
153    Open 2005-06-01             1
3    Closed 2005-07-01             7
154    Open 2005-07-01             7
155    Open 2005-08-01             1
4    Closed 2005-09-01             3
156    Open 2005-09-01             4
5    Closed 2005-10-01             1
157    Open 2005-10-01             1
6    Closed 2005-11-01             2
158    Open 2005-11-01             5
7    Closed 2005-12-01             2
159    Open 2005-12-01             2
8    Closed 2006-01-01             7
160    Open 2006-01-01             6
9    Closed 2006-02-01             2
161    Open 2006-02-01             3
10   Closed 2006-03-01             4
162    Open 2006-03-01             6
               business_id  status      month  review_count  avg_stars  \
0

# VADER comparison

## 1) Create a baseline sentiment score (VADER) per review

This gives you an immediate, cheap sentiment channel to compare against the NN later.

In [9]:
import numpy as np
import pandas as pd

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

df = rest_review_df.copy()
df['date'] = pd.to_datetime(df['date'], errors='coerce')
df = df.dropna(subset=['date'])

analyzer = SentimentIntensityAnalyzer()

# VADER scores
vader = df['text'].fillna('').apply(analyzer.polarity_scores)
df['vader_neg'] = vader.apply(lambda d: d['neg'])
df['vader_neu'] = vader.apply(lambda d: d['neu'])
df['vader_pos'] = vader.apply(lambda d: d['pos'])
df['vader_compound'] = vader.apply(lambda d: d['compound'])

df[['review_id','stars','vader_compound']].head()

Unnamed: 0,review_id,stars,vader_compound
0,KU_O5udG6zpxOg-VcAEodg,3.0,0.8597
2,saUsX_uimxRlCVr67Z4Jig,3.0,0.9201
3,AqPFMleE6RsU23_auESxiA,5.0,0.9588
4,Sx8TMOWLNuJBWer-0pcmoA,4.0,0.9815
5,JrIxlS1TzJ-iCu79ul40cQ,1.0,0.7117


## 2) Aggregate VADER to business-month features

This becomes part of the sequence input.

In [10]:
df['month'] = df['date'].dt.to_period('M').dt.to_timestamp()

biz_month_vader = (
    df.groupby(['business_id', 'status', 'month'])
      .agg(
          review_count=('review_id', 'count'),
          avg_stars=('stars', 'mean'),
          vader_mean=('vader_compound', 'mean'),
          vader_std=('vader_compound', 'std'),
          neg_share=('vader_compound', lambda s: (s < -0.05).mean()),
          pos_share=('vader_compound', lambda s: (s >  0.05).mean()),
      )
      .reset_index()
      .sort_values(['business_id', 'month'])
)

biz_month_vader.head(20)

Unnamed: 0,business_id,status,month,review_count,avg_stars,vader_mean,vader_std,neg_share,pos_share
0,--ZVrH2X2QXBFdCilbirsw,Closed,2013-07-01,1,5.0,0.8856,,0.0,1.0
1,--ZVrH2X2QXBFdCilbirsw,Closed,2014-03-01,1,5.0,0.7777,,0.0,1.0
2,--ZVrH2X2QXBFdCilbirsw,Closed,2014-12-01,1,5.0,0.8646,,0.0,1.0
3,--ZVrH2X2QXBFdCilbirsw,Closed,2015-02-01,1,3.0,0.8921,,0.0,1.0
4,--ZVrH2X2QXBFdCilbirsw,Closed,2015-05-01,1,5.0,0.6468,,0.0,1.0
5,--ZVrH2X2QXBFdCilbirsw,Closed,2016-02-01,2,5.0,0.8821,0.076085,0.0,1.0
6,--ZVrH2X2QXBFdCilbirsw,Closed,2016-03-01,1,5.0,0.9449,,0.0,1.0
7,--ZVrH2X2QXBFdCilbirsw,Closed,2017-07-01,1,5.0,0.9794,,0.0,1.0
8,--ZVrH2X2QXBFdCilbirsw,Closed,2018-02-01,1,5.0,0.0,,0.0,0.0
9,-1MhPXk1FglglUAmuPLIGg,Open,2009-03-01,1,3.0,0.802,,0.0,1.0


# Sentiment Analysis

In this section we fine-tune a transformer-based sentiment model on our own Yelp review data to produce a high-quality, domain-specific sentiment signal.  We create a supervised training set using clearly polarized reviews (1-star = negative, 5-star = positive), split it into train/validation sets, and fine-tune DistilBERT to classify review sentiment.  After training, we use the best checkpoint to score every review with a continuous probability of positive sentiment (0-1).  Finally, we aggregate these transformer sentiment scores to the business-month level (mean, variability, and positive/negative share), creating time-series features that will later feed our GRU model for forecasting sentiment trajectories and predicting closure risk.

In [11]:
!pip -q install -U transformers datasets accelerate

In [13]:
# ============================
# Cell 0 - Setup + Imports
# ============================
import os
import math
import random
import numpy as np
import pandas as pd

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer
)

# ============================
# Cell 1 - Reproducibility
# ============================
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

print("CUDA available:", torch.cuda.is_available())

# ============================
# Cell 2 - Prep Data
# Assumes rest_review_df is already loaded like your EDA notebook.
# ============================
df = rest_review_df.copy()

df["date"] = pd.to_datetime(df["date"], errors="coerce")
df = df.dropna(subset=["date"]).copy()

df["text"] = df["text"].fillna("").astype(str)

# Fine-tune labels: 1-star = 0 (negative), 5-star = 1 (positive)
train_df = df[df["stars"].isin([1.0, 5.0])].copy()
train_df["label"] = (train_df["stars"] == 5.0).astype(int)


train_df = df[df["stars"].isin([1.0, 2.0, 4.0, 5.0])].copy()
train_df["label"] = train_df["stars"].isin([4.0, 5.0]).astype(int)

print("Fine-tune rows:", len(train_df))
print(train_df["label"].value_counts())

# ============================
# Cell 3 - Stratified Train/Val Split (no sklearn)
# ============================
y = train_df["label"].values

idx_pos = np.where(y == 1)[0]
idx_neg = np.where(y == 0)[0]

np.random.shuffle(idx_pos)
np.random.shuffle(idx_neg)

split_pos = int(0.8 * len(idx_pos))
split_neg = int(0.8 * len(idx_neg))

tr_idx = np.concatenate([idx_pos[:split_pos], idx_neg[:split_neg]])
va_idx = np.concatenate([idx_pos[split_pos:], idx_neg[split_neg:]])

np.random.shuffle(tr_idx)
np.random.shuffle(va_idx)

train_split = train_df.iloc[tr_idx].reset_index(drop=True)
val_split = train_df.iloc[va_idx].reset_index(drop=True)

print("Train split:", len(train_split), "Val split:", len(val_split))
print("Train label dist:\n", train_split["label"].value_counts(normalize=True))
print("Val label dist:\n", val_split["label"].value_counts(normalize=True))

# ============================
# Cell 4 - Tokenize + Build Torch Datasets (NO HuggingFace datasets/pyarrow)
# ============================
MODEL_NAME = "distilbert-base-uncased"
MAX_LEN = 256

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

from torch.utils.data import Dataset as TorchDataset

class ReviewTorchDataset(TorchDataset):
    def __init__(self, texts, labels, tokenizer, max_len=256):
        self.texts = list(texts)
        self.labels = list(labels)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        enc = self.tokenizer(
            self.texts[idx],
            truncation=True,
            max_length=self.max_len,
        )
        enc["labels"] = int(self.labels[idx])
        return enc

train_tds = ReviewTorchDataset(train_split["text"], train_split["label"], tokenizer, MAX_LEN)
val_tds   = ReviewTorchDataset(val_split["text"],   val_split["label"],   tokenizer, MAX_LEN)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

# ============================
# Cell 5 - Model + Metrics
# ============================
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)

    tp = int(((preds == 1) & (labels == 1)).sum())
    tn = int(((preds == 0) & (labels == 0)).sum())
    fp = int(((preds == 1) & (labels == 0)).sum())
    fn = int(((preds == 0) & (labels == 1)).sum())

    acc = (tp + tn) / max(1, tp + tn + fp + fn)
    precision = tp / max(1, tp + fp)
    recall = tp / max(1, tp + fn)
    f1 = 2 * precision * recall / max(1e-12, (precision + recall))

    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

# ============================
# Cell 6 - TrainingArguments (Transformers v5-safe) + Trainer + Train
# ============================
OUT_DIR = "../artifacts/transformer_sentiment_distilbert"

EPOCHS = 3
PER_DEVICE_TRAIN_BS = 16
PER_DEVICE_EVAL_BS = 32
GRAD_ACCUM = 2

steps_per_epoch = math.ceil(len(train_tds) / (PER_DEVICE_TRAIN_BS * GRAD_ACCUM))
total_steps = steps_per_epoch * EPOCHS
warmup_steps = int(0.06 * total_steps)

training_args = TrainingArguments(
    output_dir=OUT_DIR,
    seed=SEED,

    eval_strategy="epoch",
    save_strategy="epoch",

    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,

    num_train_epochs=EPOCHS,
    learning_rate=2e-5,
    per_device_train_batch_size=PER_DEVICE_TRAIN_BS,
    per_device_eval_batch_size=PER_DEVICE_EVAL_BS,
    gradient_accumulation_steps=GRAD_ACCUM,

    warmup_steps=warmup_steps,
    weight_decay=0.01,

    logging_steps=100,
    fp16=torch.cuda.is_available(),
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tds,
    eval_dataset=val_tds,
    data_collator=data_collator,
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()
eval_metrics = trainer.evaluate()
print("Eval metrics:", eval_metrics)

trainer.save_model(OUT_DIR)
tokenizer.save_pretrained(OUT_DIR)
print("Saved to:", OUT_DIR)

# ============================
# Cell 7 - Temperature scaling (calibration) on validation set
# Output: best_T
# ============================
pred_out = trainer.predict(val_tds)
val_logits = torch.tensor(pred_out.predictions, dtype=torch.float32)
val_labels = torch.tensor(pred_out.label_ids, dtype=torch.long)

def nll_for_T(T: float) -> float:
    scaled = val_logits / T
    probs = torch.softmax(scaled, dim=1)
    p = probs[torch.arange(len(val_labels)), val_labels]
    return (-torch.log(p.clamp_min(1e-12))).mean().item()

Ts = np.linspace(0.5, 5.0, 46)  # 0.5, 0.6, ..., 5.0
losses = [nll_for_T(float(T)) for T in Ts]
best_T = float(Ts[int(np.argmin(losses))])

print("Best temperature:", best_T)
print("NLL @ best_T:", min(losses))

# ============================
# Cell 8 - Score ALL Reviews (fast batch logits, temperature-scaled)
# Output: df['tx_sent'] = calibrated P(positive), 0..1
# ============================
ft_model = AutoModelForSequenceClassification.from_pretrained(OUT_DIR)
ft_tokenizer = AutoTokenizer.from_pretrained(OUT_DIR)

use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
ft_model.to(device)
ft_model.eval()

collator = DataCollatorWithPadding(tokenizer=ft_tokenizer, return_tensors="pt")

texts = df["text"].fillna("").astype(str).tolist()
BATCH_SIZE = 64

p_pos = []

with torch.no_grad():
    for i in range(0, len(texts), BATCH_SIZE):
        batch_texts = texts[i:i + BATCH_SIZE]

        enc = ft_tokenizer(
            batch_texts,
            truncation=True,
            max_length=MAX_LEN
        )

        features = [{k: enc[k][j] for k in enc.keys()} for j in range(len(batch_texts))]
        batch = collator(features)
        batch = {k: v.to(device) for k, v in batch.items()}

        logits = ft_model(**batch).logits
        probs = torch.softmax(logits / best_T, dim=1)[:, 1].detach().cpu().numpy()
        p_pos.extend(probs.tolist())

df["tx_sent"] = np.array(p_pos, dtype=float)

print(df[["review_id", "stars", "tx_sent"]].head(10))
print("tx_sent range:", df["tx_sent"].min(), "to", df["tx_sent"].max())

# ============================
# Cell 9 - Aggregate to Business-Month Features (feeds GRU time-series later)
# Output: biz_month_tx
# ============================
df["month"] = df["date"].dt.to_period("M").dt.to_timestamp()

biz_month_tx = (
    df.groupby(["business_id", "status", "month"])
      .agg(
          review_count=("review_id", "count"),
          avg_stars=("stars", "mean"),
          tx_sent_mean=("tx_sent", "mean"),
          tx_sent_std=("tx_sent", "std"),
          tx_neg_share=("tx_sent", lambda s: (s < 0.30).mean()),
          tx_pos_share=("tx_sent", lambda s: (s > 0.70).mean()),
      )
      .reset_index()
      .sort_values(["business_id", "month"])
)

MIN_N = 5

biz_month_tx["tx_pos_share"] = np.where(
    biz_month_tx["review_count"] >= MIN_N,
    biz_month_tx["tx_pos_share"],
    np.nan
)

biz_month_tx["tx_neg_share"] = np.where(
    biz_month_tx["review_count"] >= MIN_N,
    biz_month_tx["tx_neg_share"],
    np.nan
)

# std is NaN when review_count == 1 - make it numeric for modeling
biz_month_tx["tx_sent_std"] = biz_month_tx["tx_sent_std"].fillna(0.0)

print(biz_month_tx.head(20))
print("Rows:", len(biz_month_tx), "Businesses:", biz_month_tx["business_id"].nunique())

CUDA available: True
Fine-tune rows: 63032
label
1    49496
0    13536
Name: count, dtype: int64
Train split: 50424 Val split: 12608
Train label dist:
 label
1    0.785261
0    0.214739
Name: proportion, dtype: float64
Val label dist:
 label
1    0.785216
0    0.214784
Name: proportion, dtype: float64


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]



vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

[1mDistilBertForSequenceClassification LOAD REPORT[0m from: distilbert-base-uncased
Key                     | Status     | 
------------------------+------------+-
vocab_transform.weight  | UNEXPECTED | 
vocab_layer_norm.weight | UNEXPECTED | 
vocab_projector.bias    | UNEXPECTED | 
vocab_transform.bias    | UNEXPECTED | 
vocab_layer_norm.bias   | UNEXPECTED | 
pre_classifier.weight   | MISSING    | 
classifier.bias         | MISSING    | 
pre_classifier.bias     | MISSING    | 
classifier.weight       | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.181096,0.094598,0.97105,0.979194,0.98404,0.981611
2,0.109126,0.083634,0.974699,0.984427,0.983333,0.98388
3,0.047096,0.105613,0.973985,0.983337,0.983535,0.983436


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

There were missing keys in the checkpoint model loaded: ['distilbert.embeddings.LayerNorm.weight', 'distilbert.embeddings.LayerNorm.bias'].
There were unexpected keys in the checkpoint model loaded: ['distilbert.embeddings.LayerNorm.beta', 'distilbert.embeddings.LayerNorm.gamma'].


Eval metrics: {'eval_loss': 0.08364104479551315, 'eval_accuracy': 0.9746192893401016, 'eval_precision': 0.9843276036400405, 'eval_recall': 0.9833333333333333, 'eval_f1': 0.9838302172814554, 'eval_runtime': 26.9956, 'eval_samples_per_second': 467.038, 'eval_steps_per_second': 14.595, 'epoch': 3.0}


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Saved to: ../artifacts/transformer_sentiment_distilbert
Best temperature: 1.4
NLL @ best_T: 0.07390153408050537


Loading weights:   0%|          | 0/104 [00:00<?, ?it/s]

                 review_id  stars   tx_sent
0   KU_O5udG6zpxOg-VcAEodg    3.0  0.213885
2   saUsX_uimxRlCVr67Z4Jig    3.0  0.996652
3   AqPFMleE6RsU23_auESxiA    5.0  0.996723
4   Sx8TMOWLNuJBWer-0pcmoA    4.0  0.987097
5   JrIxlS1TzJ-iCu79ul40cQ    1.0  0.020942
7   _ZeMknuYdlQcUqng_Im3yg    5.0  0.997374
9   pUycOfUwM8vqX7KjRRhUEA    3.0  0.037991
11  l3Wk_mvAog6XANIuGQ9C7Q    4.0  0.996316
12  XW_LfMv0fV21l9c6xQd_lw    4.0  0.995593
13  8JFGBuHMoiNDyfcxuWNtrA    4.0  0.994428
tx_sent range: 0.00818129163235426 to 0.9976353645324707
               business_id  status      month  review_count  avg_stars  \
0   --ZVrH2X2QXBFdCilbirsw  Closed 2013-07-01             1        5.0   
1   --ZVrH2X2QXBFdCilbirsw  Closed 2014-03-01             1        5.0   
2   --ZVrH2X2QXBFdCilbirsw  Closed 2014-12-01             1        5.0   
3   --ZVrH2X2QXBFdCilbirsw  Closed 2015-02-01             1        3.0   
4   --ZVrH2X2QXBFdCilbirsw  Closed 2015-05-01             1        5.0   
5   --ZVrH2X2QX

In [14]:
biz_month_tx["tx_neg_share"] = biz_month_tx["tx_neg_share"].fillna(0.5)
biz_month_tx["tx_pos_share"] = biz_month_tx["tx_pos_share"].fillna(0.5)

# GRU

In [25]:
# ============================
# Cell 10 - GRU End-to-End (triage version - censoring + recent-only business risk)
# - Masks "zombie" Open businesses as Unknown (excluded)
# - Masks right-censored Open windows where full horizon is unobserved (excluded)
# - Builds trajectory features (deltas + rolling)
# - Business-stratified split (no leakage)
# - Balanced batches for training
# - Better model + better calibration (bias init, AdamW, label smoothing, clipnorm)
# - Evaluates PR-AUC + top-K lift (window + business)
# - Business triage uses RECENT windows only (last RECENT_K_MONTHS of dataset)
# - Saves triage artifacts CSVs
# ============================

import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# ----------------------------
# Knobs (you can tweak safely)
# ----------------------------
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

SEQ_LEN = 12          # months per window
H = 6                 # predict closure within next H months
INACTIVE_K = 12       # "zombie" if Open and no reviews in last K months of dataset
MIN_ACTIVE_MONTHS = 6
MIN_REVIEWS_IN_WINDOW = 10

POS_BATCH_RATE = 0.30
BATCH_SIZE = 256
EPOCHS = 40
LR = 1e-3
WEIGHT_DECAY = 1e-4
LABEL_SMOOTHING = 0.01
CLIPNORM = 1.0

RECENT_K_MONTHS = 3   # business risk computed from windows ending in last K months of dataset

OUT_DIR = "../artifacts"
os.makedirs(OUT_DIR, exist_ok=True)

# ----------------------------
# Required columns
# ----------------------------
REQ = [
    "business_id", "status", "month",
    "review_count", "avg_stars",
    "tx_sent_mean", "tx_sent_std",
    "tx_neg_share", "tx_pos_share"
]
missing = [c for c in REQ if c not in biz_month_tx.columns]
if missing:
    raise ValueError(f"biz_month_tx missing required columns: {missing}")

dfm = biz_month_tx.copy()
dfm["month"] = pd.to_datetime(dfm["month"], errors="coerce")
dfm = dfm.dropna(subset=["month"]).copy()
dfm = dfm.sort_values(["business_id", "month"]).reset_index(drop=True)

# Enforce numeric types
NUM_COLS = ["review_count", "avg_stars", "tx_sent_mean", "tx_sent_std", "tx_neg_share", "tx_pos_share"]
for c in NUM_COLS:
    dfm[c] = pd.to_numeric(dfm[c], errors="coerce")
dfm[NUM_COLS] = dfm[NUM_COLS].fillna(0.0)

# ----------------------------
# Build business last-observed month + closure proxy
# ----------------------------
global_last_month = dfm["month"].max()

biz_last = (
    dfm.groupby(["business_id", "status"], as_index=False)["month"]
       .max()
       .rename(columns={"month": "last_review_month"})
)

biz_last["closure_month"] = pd.NaT
biz_last.loc[biz_last["status"] == "Closed", "closure_month"] = biz_last.loc[
    biz_last["status"] == "Closed", "last_review_month"
].values

zombie_cutoff = global_last_month - pd.DateOffset(months=INACTIVE_K)
biz_last["is_zombie_open"] = (biz_last["status"] == "Open") & (biz_last["last_review_month"] <= zombie_cutoff)

print("Global last month:", global_last_month.date())
print("Zombie cutoff:", zombie_cutoff.date())
print("Open zombies:", int(biz_last["is_zombie_open"].sum()))

dfm2 = dfm.merge(
    biz_last[["business_id", "status", "last_review_month", "closure_month", "is_zombie_open"]],
    on=["business_id", "status"],
    how="left"
)

# ----------------------------
# Feature engineering (levels + trajectories)
# ----------------------------
BASE_FEATS = ["review_count", "avg_stars", "tx_sent_mean", "tx_sent_std", "tx_neg_share", "tx_pos_share"]

def add_features_per_business(g: pd.DataFrame) -> pd.DataFrame:
    g = g.sort_values("month").copy()

    x = g[BASE_FEATS].astype(float)
    mu = x.mean(axis=0)
    sd = x.std(axis=0).replace(0.0, 1.0)

    for c in BASE_FEATS:
        g[f"{c}_z"] = (g[c].astype(float) - float(mu[c])) / float(sd[c])

    Z = [f"{c}_z" for c in BASE_FEATS]

    for c in Z:
        g[f"{c}_d1"] = g[c].diff(1)

    for c in Z:
        g[f"{c}_rm3"] = g[c].rolling(3, min_periods=1).mean()
        g[f"{c}_rs3"] = g[c].rolling(3, min_periods=1).std()
        g[f"{c}_rs6"] = g[c].rolling(6, min_periods=1).std()

    m0 = int(g["month"].iloc[0].year) * 12 + int(g["month"].iloc[0].month)
    m  = g["month"].dt.year.astype(int) * 12 + g["month"].dt.month.astype(int)
    g["months_since_first"] = (m - m0).astype(float)

    feat_cols = [c for c in g.columns if c.endswith("_z") or c.endswith("_d1") or c.endswith("_rm3") or c.endswith("_rs3") or c.endswith("_rs6")]
    g[feat_cols] = g[feat_cols].replace([np.inf, -np.inf], np.nan).fillna(0.0)

    return g

dfm2 = dfm2.groupby("business_id", group_keys=False).apply(add_features_per_business)

FEAT_COLS = [c for c in dfm2.columns if (
    c.endswith("_z") or c.endswith("_d1") or c.endswith("_rm3") or c.endswith("_rs3") or c.endswith("_rs6")
)]
FEAT_COLS = FEAT_COLS + ["months_since_first"]
print("Feature count:", len(FEAT_COLS))

# ----------------------------
# Build windows + labels with zombie masking + right-censor masking
# ----------------------------
X_list, y_list, meta = [], [], []

for (bid, status), g in dfm2.groupby(["business_id", "status"]):
    g = g.sort_values("month").reset_index(drop=True)

    is_zombie_open = bool(g["is_zombie_open"].iloc[0])
    closure_month = g["closure_month"].iloc[0]

    # Exclude zombie Open entirely (unknown label)
    if status == "Open" and is_zombie_open:
        continue

    if len(g) < SEQ_LEN:
        continue

    for start in range(0, len(g) - SEQ_LEN + 1):
        end = start + SEQ_LEN
        w = g.iloc[start:end].copy()
        window_end = w["month"].iloc[-1]

        # activity filters
        active_months = int((w["review_count"] > 0).sum())
        total_reviews = float(w["review_count"].sum())
        if active_months < MIN_ACTIVE_MONTHS:
            continue
        if total_reviews < MIN_REVIEWS_IN_WINDOW:
            continue

        # Exclude post-closure windows (Closed class)
        if status == "Closed" and pd.notna(closure_month):
            if window_end >= closure_month:
                continue

        # Right-censoring:
        # If Open but we cannot observe full horizon after window_end, label is unknown -> exclude
        horizon_end = window_end + pd.DateOffset(months=H)
        if status == "Open" and horizon_end > global_last_month:
            continue

        # label
        if status == "Closed" and pd.notna(closure_month):
            y_seq = int((closure_month > window_end) and (closure_month <= horizon_end))
        else:
            y_seq = 0

        X_seq = w[FEAT_COLS].to_numpy(dtype=np.float32)

        X_list.append(X_seq)
        y_list.append(y_seq)
        meta.append({
            "business_id": bid,
            "status": status,
            "start_month": w["month"].iloc[0],
            "end_month": window_end,
            "horizon_end": horizon_end,
            "closure_month": closure_month,
            "y": y_seq,
            "last_review_month": g["last_review_month"].iloc[0],
        })

X = np.stack(X_list, axis=0) if len(X_list) else np.empty((0, SEQ_LEN, len(FEAT_COLS)), dtype=np.float32)
y = np.asarray(y_list, dtype=np.int64)
meta_df = pd.DataFrame(meta)

print("X shape:", X.shape)
print("y balance:\n", pd.Series(y).value_counts())
print("Windows:", len(meta_df), "Businesses:", meta_df["business_id"].nunique())

# ----------------------------
# Business-level stratified split (no leakage)
# ----------------------------
biz_outcome = (
    meta_df.groupby("business_id")["y"]
      .max()
      .reset_index()
      .rename(columns={"y": "y_business"})
)

pos_biz = biz_outcome[biz_outcome["y_business"] == 1]["business_id"].to_numpy()
neg_biz = biz_outcome[biz_outcome["y_business"] == 0]["business_id"].to_numpy()

rng = np.random.default_rng(SEED)
rng.shuffle(pos_biz)
rng.shuffle(neg_biz)

def split_ids(arr, frac=0.8):
    cut = int(frac * len(arr))
    return arr[:cut], arr[cut:]

pos_tr, pos_va = split_ids(pos_biz, 0.8)
neg_tr, neg_va = split_ids(neg_biz, 0.8)

train_biz = set(pos_tr.tolist() + neg_tr.tolist())
val_biz   = set(pos_va.tolist() + neg_va.tolist())

train_mask = meta_df["business_id"].isin(train_biz).values
val_mask   = meta_df["business_id"].isin(val_biz).values

X_train, y_train = X[train_mask], y[train_mask]
X_val, y_val     = X[val_mask], y[val_mask]

meta_train = meta_df.loc[train_mask].reset_index(drop=True)
meta_val   = meta_df.loc[val_mask].reset_index(drop=True)

print("Train windows:", len(meta_train), "Val windows:", len(meta_val))
print("Train y dist:\n", pd.Series(y_train).value_counts(normalize=True))
print("Val y dist:\n", pd.Series(y_val).value_counts(normalize=True))

val_biz_outcome = (
    meta_val.groupby("business_id")["y"]
      .max()
      .reset_index()
      .rename(columns={"y":"y_business"})
)
print("Val businesses:", val_biz_outcome["business_id"].nunique(),
      "Val positive businesses:", int((val_biz_outcome["y_business"] == 1).sum()),
      "(", float((val_biz_outcome["y_business"] == 1).mean()), ")")

print("X_train shape:", X_train.shape, "X_val shape:", X_val.shape)
print("Feature count:", len(FEAT_COLS))

# ----------------------------
# Balanced-batch tf.data pipeline
# ----------------------------
pos_idx = np.where(y_train == 1)[0]
neg_idx = np.where(y_train == 0)[0]

print("Train positives:", len(pos_idx), "negatives:", len(neg_idx),
      "pos_rate:", float(len(pos_idx) / max(1, len(y_train))))

ds_pos = tf.data.Dataset.from_tensor_slices((X_train[pos_idx], y_train[pos_idx]))
ds_neg = tf.data.Dataset.from_tensor_slices((X_train[neg_idx], y_train[neg_idx]))

ds_pos = ds_pos.shuffle(min(len(pos_idx), 20000), seed=SEED, reshuffle_each_iteration=True).repeat()
ds_neg = ds_neg.shuffle(min(len(neg_idx), 20000), seed=SEED, reshuffle_each_iteration=True).repeat()

train_ds = tf.data.Dataset.sample_from_datasets(
    [ds_pos, ds_neg],
    weights=[POS_BATCH_RATE, 1.0 - POS_BATCH_RATE],
    seed=SEED
).batch(BATCH_SIZE, drop_remainder=True).prefetch(tf.data.AUTOTUNE)

val_ds = tf.data.Dataset.from_tensor_slices((X_val, y_val)).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

steps_per_epoch = int(np.ceil(len(y_train) / BATCH_SIZE))

# ----------------------------
# Model (LayerNorm GRUs + attention pooling + bias init)
# ----------------------------
pos_rate = float((y_train == 1).mean())
eps = 1e-7
init_bias = float(np.log((pos_rate + eps) / (1.0 - pos_rate + eps)))
print("Base pos_rate:", pos_rate, "logit bias init:", init_bias)

def build_model(seq_len: int, n_feats: int, init_bias: float):
    inp = layers.Input(shape=(seq_len, n_feats))

    x = layers.GRU(128, return_sequences=True)(inp)
    x = layers.LayerNormalization()(x)
    x = layers.Dropout(0.20)(x)

    x = layers.GRU(64, return_sequences=True)(x)
    x = layers.LayerNormalization()(x)
    x = layers.Dropout(0.20)(x)

    # Attention pooling (Keras-safe)
    attn = layers.Dense(1)(x)                                # (B, T, 1)
    attn = layers.Softmax(axis=1, name="attn_softmax")(attn) # (B, T, 1)
    x = layers.Multiply()([x, attn])                         # (B, T, H)
    x = layers.Lambda(lambda t: tf.reduce_sum(t, axis=1),
                      name="attn_pool_sum")(x)               # (B, H)

    x = layers.Dense(64, activation="relu")(x)
    x = layers.Dropout(0.25)(x)

    out = layers.Dense(
        1,
        activation="sigmoid",
        bias_initializer=keras.initializers.Constant(init_bias)
    )(x)

    return keras.Model(inp, out)

N_FEATS = X_train.shape[2]
model_gru = build_model(SEQ_LEN, N_FEATS, init_bias)

try:
    optimizer = keras.optimizers.AdamW(
        learning_rate=LR,
        weight_decay=WEIGHT_DECAY,
        clipnorm=CLIPNORM
    )
except Exception:
    optimizer = keras.optimizers.Adam(
        learning_rate=LR,
        clipnorm=CLIPNORM
    )

loss_fn = keras.losses.BinaryCrossentropy(label_smoothing=LABEL_SMOOTHING)

model_gru.compile(
    optimizer=optimizer,
    loss=loss_fn,
    metrics=[
        keras.metrics.AUC(name="roc_auc"),
        keras.metrics.AUC(name="pr_auc", curve="PR"),
        keras.metrics.Precision(name="precision"),
        keras.metrics.Recall(name="recall"),
    ],
)

callbacks = [
    keras.callbacks.EarlyStopping(monitor="val_pr_auc", mode="max", patience=6, restore_best_weights=True),
    keras.callbacks.ReduceLROnPlateau(monitor="val_pr_auc", mode="max", factor=0.5, patience=3, min_lr=1e-5),
]

history = model_gru.fit(
    train_ds,
    validation_data=val_ds,
    epochs=EPOCHS,
    steps_per_epoch=steps_per_epoch,
    callbacks=callbacks,
    verbose=1
)

val_metrics = model_gru.evaluate(val_ds, verbose=0)
print(dict(zip(model_gru.metrics_names, val_metrics)))

# ----------------------------
# Window-level triage metrics (top-K)
# ----------------------------
p_val = model_gru.predict(X_val, batch_size=512, verbose=0).reshape(-1)
ytrue = y_val.astype(int)

val_meta2 = meta_val.copy()
val_meta2["p_closed"] = p_val
val_meta2["y_true"] = ytrue

n = len(val_meta2)
pos_total = int((val_meta2["y_true"] == 1).sum())
print("\nVal windows:", n)
print("Val positives:", pos_total, "(", pos_total / max(1, n), ")")

print("\nTop-K% window triage (higher p_closed = higher risk):")
for pct in [0.5, 1, 2, 5, 10, 15, 20]:
    k = max(1, int(n * (pct / 100.0)))
    topk = val_meta2.sort_values("p_closed", ascending=False).head(k)
    tp = int((topk["y_true"] == 1).sum())
    precision = tp / max(1, k)
    recall = tp / max(1, pos_total)
    print(f"Top {pct:>4}% (k={k:>5}): precision={precision:.3f}  recall={recall:.3f}  tp={tp}")

print("\nTop 20 highest-risk windows (sanity check):")
cols = ["business_id", "status", "start_month", "end_month", "closure_month", "y_true", "p_closed"]
print(val_meta2.sort_values("p_closed", ascending=False)[cols].head(20))

print("\nBottom 20 lowest-risk windows (sanity check):")
print(val_meta2.sort_values("p_closed", ascending=True)[cols].head(20))

# ----------------------------
# Business-level triage (RECENT ONLY)
# - risk_score = max probability over last 3 windows, but only windows ending in last RECENT_K_MONTHS
# - fallback: if a business has 0 recent windows in val, use its overall last-3 max
# ----------------------------
val_meta2 = val_meta2.sort_values(["business_id", "end_month"]).reset_index(drop=True)

biz_grp = val_meta2.groupby(["business_id", "status"], as_index=False).agg(
    end_month_last=("end_month", "max"),
    p_last=("p_closed", "last"),
    p_max=("p_closed", "max"),
    p_mean=("p_closed", "mean"),
    n_windows=("p_closed", "size"),
    y_business=("y_true", "max"),
)

recent_cutoff = global_last_month - pd.DateOffset(months=RECENT_K_MONTHS)
val_recent = val_meta2[val_meta2["end_month"] >= recent_cutoff].copy()

# Recent max over last 3 windows within recent subset
recent = (
    val_recent.sort_values(["business_id", "end_month"])
      .groupby("business_id", group_keys=False)
      .tail(3)
      .groupby("business_id", as_index=False)["p_closed"]
      .max()
      .rename(columns={"p_closed": "p_recent_max"})
)

# Fallback max over last 3 windows overall
overall_last3 = (
    val_meta2.sort_values(["business_id", "end_month"])
      .groupby("business_id", group_keys=False)
      .tail(3)
      .groupby("business_id", as_index=False)["p_closed"]
      .max()
      .rename(columns={"p_closed": "p_last3_max"})
)

biz_triage = biz_grp.merge(recent, on="business_id", how="left").merge(overall_last3, on="business_id", how="left")

# Final risk score: prefer recent, else fallback to last3 overall, else p_max
biz_triage["risk_score"] = (
    biz_triage["p_recent_max"]
      .fillna(biz_triage["p_last3_max"])
      .fillna(biz_triage["p_max"])
)

bins = [0.0, 0.50, 0.65, 0.75, 0.85, 1.0]
labels = ["low", "medium", "elevated", "high", "very_high"]
biz_triage["risk_bucket"] = pd.cut(biz_triage["risk_score"], bins=bins, labels=labels, include_lowest=True)

biz_triage = biz_triage.sort_values("risk_score", ascending=False).reset_index(drop=True)

n_biz = len(biz_triage)
pos_biz_total = int((biz_triage["y_business"] == 1).sum())
print("\nVal businesses:", n_biz, "Positives:", pos_biz_total, "Pos rate:", pos_biz_total / max(1, n_biz))
print("Recent cutoff for business triage:", recent_cutoff.date())

print("\nBusiness-level Top-K% workload metrics (sorted by risk_score desc):")
for pct in [0.5, 1, 2, 5, 10, 15, 20]:
    k = max(1, int(n_biz * (pct / 100.0)))
    topk = biz_triage.head(k)
    tp = int((topk["y_business"] == 1).sum())
    precision = tp / max(1, k)
    recall = tp / max(1, pos_biz_total)
    thr = float(topk["risk_score"].iloc[-1])
    print(f"Top {pct:>4}% (k={k:>4})  thr>={thr:.4f}  precision={precision:.3f}  recall={recall:.3f}  tp={tp}")

# ----------------------------
# Save artifacts
# ----------------------------
triage_path = os.path.join(OUT_DIR, "gru_business_triage.csv")
top5_path   = os.path.join(OUT_DIR, "gru_business_triage_top5pct.csv")
top10_path  = os.path.join(OUT_DIR, "gru_business_triage_top10pct.csv")

biz_triage.to_csv(triage_path, index=False)
biz_triage.head(max(1, int(0.05 * n_biz))).to_csv(top5_path, index=False)
biz_triage.head(max(1, int(0.10 * n_biz))).to_csv(top10_path, index=False)

print("\nSaved:", triage_path)
print("Saved:", top5_path)
print("Saved:", top10_path)

Global last month: 2018-10-01
Zombie cutoff: 2017-10-01
Open zombies: 854
Feature count: 31
X shape: (18942, 12, 31)
y balance:
 0    18222
1      720
Name: count, dtype: int64
Windows: 18942 Businesses: 1135
Train windows: 15116 Val windows: 3826
Train y dist:
 0    0.961829
1    0.038171
Name: proportion, dtype: float64
Val y dist:
 0    0.962624
1    0.037376
Name: proportion, dtype: float64
Val businesses: 228 Val positive businesses: 61 ( 0.2675438596491228 )
X_train shape: (15116, 12, 31) X_val shape: (3826, 12, 31)
Feature count: 31
Train positives: 577 negatives: 14539 pos_rate: 0.038171473934903416
Base pos_rate: 0.038171473934903416 logit bias init: -3.2267451906380225
Epoch 1/40
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 25ms/step - loss: 0.6697 - pr_auc: 0.2998 - precision: 0.2936 - recall: 0.0988 - roc_auc: 0.5156 - val_loss: 0.3972 - val_pr_auc: 0.0678 - val_precision: 0.1379 - val_recall: 0.0280 - val_roc_auc: 0.6441 - learning_rate: 0.0010
Epoch 2/4