In [None]:
import pandas as pd
import numpy as np
from torch.nn.utils.rnn import pack_padded_sequence

from google.colab import drive
# Mount Google Drive into the Colab filesystem at /content/drive.
# force_remount=True re-mounts even when a mount already exists at that path.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Load the raw transaction export from the mounted Drive folder.
df_usa = pd.read_csv('/content/drive/MyDrive/usa_data.csv')

In [None]:
# Summarise column dtypes and non-null counts of the raw frame.
df_usa.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 447860 entries, 0 to 447859
Data columns (total 14 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   Unnamed: 0           447860 non-null  int64  
 1   transaction_id       447860 non-null  object 
 2   customer_id          447860 non-null  object 
 3   timestamp            447860 non-null  object 
 4   merchant_category    447860 non-null  object 
 5   merchant_type        447860 non-null  object 
 6   merchant             447860 non-null  object 
 7   amount               447860 non-null  float64
 8   currency             447860 non-null  object 
 9   country              447860 non-null  object 
 10  channel              447860 non-null  object 
 11  distance_from_home   447860 non-null  int64  
 12  weekend_transaction  447860 non-null  bool   
 13  date_transact        447860 non-null  object 
dtypes: bool(1), float64(1), int64(2), object(10)
memory usage: 44.8+ MB


In [None]:
# Work on a trimmed copy of the raw frame: the identifier, currency and
# leftover CSV index columns are not used by the feature pipeline.
unused_columns = ['currency', 'transaction_id', 'Unnamed: 0']
df = df_usa.drop(columns=unused_columns)
del unused_columns

# Day of month (1-based) taken from the transaction date; month and year are discarded.
df['day'] = pd.to_datetime(df['date_transact']).dt.day

In [None]:
# Preview the first rows of the trimmed frame, including the derived 'day' column.
df.head()

Unnamed: 0,customer_id,timestamp,merchant_category,merchant_type,merchant,amount,country,channel,distance_from_home,weekend_transaction,date_transact,day
0,CUST_34400,2024-10-01 00:00:32.479189+00:00,Healthcare,medical,Lab Corp,1005.88,USA,web,1,False,2024-10-01,1
1,CUST_85031,2024-10-01 00:00:34.482833+00:00,Travel,airlines,United Airlines,1879.34,USA,mobile,1,False,2024-10-01,1
2,CUST_90442,2024-10-01 00:00:44.472679+00:00,Gas,major,Mobil,403.57,USA,web,1,False,2024-10-01,1
3,CUST_37447,2024-10-01 00:00:57.244594+00:00,Gas,major,Mobil,474.12,USA,web,0,False,2024-10-01,1
4,CUST_62600,2024-10-01 00:01:09.607236+00:00,Retail,online,Wayfair,588.1,USA,web,1,False,2024-10-01,1


In [None]:
# NOTE(review): a notebook cell renders only its final expression, so the
# describe() summary below is computed but never displayed.
df['amount'].describe()
# Number of distinct customers in the data set.
df['customer_id'].nunique()

4869

In [None]:
# Distinct merchant categories, in order of first appearance in df.
categories = df['merchant_category'].unique()
categories

array(['Healthcare', 'Travel', 'Gas', 'Retail', 'Grocery', 'Restaurant',
       'Education', 'Entertainment'], dtype=object)

In [None]:
# Intended layout of a per-day feature vector: feature name -> column index.
# '<name>_running' entries are spend columns, '<name>_tx' entries are
# transaction-count columns, and 'normalized_date' is the final column.
# NOTE(review): nothing in the visible cells reads this mapping, and
# build_tensors derives its own column order rather than consulting it —
# confirm the two agree before using these indices to slice tensors.
feature_map = {
    'total_running': 0,
    'total_tx': 1,
    'restaurant_running': 2,
    'restaurant_tx': 3,
    'healthcare_running': 4,
    'healthcare_tx': 5,
    'gas_running': 6,
    'gas_tx': 7,
    'education_running': 8,
    'education_tx': 9,
    'entertainment_running': 10,
    'entertainment_tx': 11,
    'retail_running': 12,
    'retail_tx': 13,
    'grocery_running': 14,
    'grocery_tx': 15,
    'travel_running': 16,
    'travel_tx': 17,
    'normalized_date': 18
}

In [None]:
# Per customer and day of month: total amount spent ('sum') and number of transactions ('count').
daily_totals= df.groupby(['customer_id','day'])['amount'].agg(['sum','count']).reset_index()

In [None]:
# Same aggregation as the daily totals, additionally broken down by merchant category.
category_totals= df.groupby(['customer_id','day','merchant_category'])['amount'].agg(['sum','count']).reset_index()
category_totals

Unnamed: 0,customer_id,day,merchant_category,sum,count
0,CUST_10000,1,Healthcare,501.90,1
1,CUST_10000,8,Travel,1642.06,1
2,CUST_10000,12,Retail,2302.74,2
3,CUST_10000,18,Travel,2426.40,1
4,CUST_10000,20,Gas,1322.68,1
...,...,...,...,...,...
169659,CUST_99971,26,Retail,416.19,1
169660,CUST_99971,27,Travel,1089.54,1
169661,CUST_99971,28,Gas,359.06,1
169662,CUST_99971,29,Healthcare,290.03,1


In [None]:
def build_tensors(df, categories=None, num_days=30):
  """Build one (num_days, num_features) feature matrix per customer.

  Column layout of every matrix, with ``C = len(categories)``:
    0            cumulative spend over all categories up to and including that day
    1            number of transactions made on that day
    2 + 2*i      cumulative spend in ``categories[i]`` up to and including that day
    3 + 2*i      number of transactions in ``categories[i]`` on that day
    2 + 2*C      day of month divided by ``num_days``

  The normalized-day feature has its own trailing column; it no longer
  overwrites the last category's transaction count. Column 0 is a true running
  total, so ``matrix[-1, 0]`` is the customer's total spend over the period.

  Args:
    df: transactions with ``customer_id``, ``day`` (1-based day of month),
      ``merchant_category`` and ``amount`` columns.
    categories: optional ordered sequence of category names that fixes the
      column order. Defaults to the sorted unique categories found in ``df``.
      Pass the same sequence for every split (train/test) to guarantee that
      the columns line up. Transactions whose category is not listed still
      count towards columns 0 and 1 but get no category columns.
    num_days: number of rows (days) in every matrix.

  Returns:
    dict mapping customer_id -> float64 ndarray of shape
    ``(num_days, 2 * C + 3)``, keyed in order of first appearance in ``df``.

  Raises:
    IndexError: if any ``day`` value lies outside ``1..num_days``.
  """
  if categories is None:
    # Sorted order is deterministic; unique() alone depends on row order and
    # therefore differs between differently-shuffled subsets of the data.
    categories = sorted(df['merchant_category'].dropna().unique())
  else:
    categories = list(categories)
  cat_to_idx = {cat: idx for idx, cat in enumerate(categories)}
  n_cats = len(categories)

  # total spend, total tx, (spend, tx) per category, normalized day
  num_features = 2 + (n_cats * 2) + 1

  if len(df) and not df['day'].between(1, num_days).all():
    raise IndexError(f"'day' values must lie in 1..{num_days}")

  daily_totals = df.groupby(['customer_id','day'])['amount'].agg(['sum','count']).reset_index()
  category_totals = df.groupby(['customer_id','day','merchant_category'])['amount'].agg(['sum','count']).reset_index()
  category_totals = category_totals[category_totals['merchant_category'].isin(categories)]

  # Split the aggregates once per customer instead of re-filtering the whole
  # frame inside the customer loop.
  daily_by_user = {cust: rows for cust, rows in daily_totals.groupby('customer_id')}
  category_by_user = {cust: rows for cust, rows in category_totals.groupby('customer_id')}

  spend_cols = slice(2, 2 + 2 * n_cats, 2)  # every category spend column
  normalized_day = np.arange(1, num_days + 1) / num_days

  all_user_dfs = {}

  for cust_id in df['customer_id'].unique():
    tensor = np.zeros((num_days, num_features))

    user_daily_df = daily_by_user.get(cust_id)
    if user_daily_df is not None:
      day_rows = user_daily_df['day'].to_numpy().astype(int) - 1
      tensor[day_rows, 0] = user_daily_df['sum'].to_numpy()
      tensor[day_rows, 1] = user_daily_df['count'].to_numpy()

    user_category_df = category_by_user.get(cust_id)
    if user_category_df is not None:
      day_rows = user_category_df['day'].to_numpy().astype(int) - 1
      cat_cols = 2 + 2 * user_category_df['merchant_category'].map(cat_to_idx).to_numpy().astype(int)
      tensor[day_rows, cat_cols] = user_category_df['sum'].to_numpy()
      tensor[day_rows, cat_cols + 1] = user_category_df['count'].to_numpy()

    # Turn per-day spend into running totals; days without activity carry the
    # previous total forward.
    tensor[:, 0] = np.cumsum(tensor[:, 0])
    tensor[:, spend_cols] = np.cumsum(tensor[:, spend_cols], axis=0)

    tensor[:, -1] = normalized_day  # day position within the period, in (0, 1]

    all_user_dfs[cust_id] = tensor

  return all_user_dfs

In [None]:
#user_dfs_dict = build_tensors(df)

In [None]:
#user_dfs_dict["CUST_24106"]

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn

class FiveDayCheckPointDataset(Dataset):
  """
  Prefix-sequence dataset: for every user and every cutoff day, one sample
  made of the first `cutoff` rows of the user's feature matrix, the cutoff
  itself (the true sequence length) and a scalar regression target.

  Args:
    user_tensors: dict mapping user id -> 2-D numpy array (days, features).
    cutoffs: iterable of prefix lengths (in days) to generate per user.
      Cutoffs smaller than 1 or larger than the number of rows are skipped.
    running_total_idx: column whose value in the LAST row is used as target.
      NOTE(review): this is only the user's total spend if that column holds
      a cumulative total — confirm against the code that builds the matrices.

  Each item is (sequence float32 tensor [cutoff, features], cutoff int,
  target float32 scalar tensor).
  """
  def __init__(self, user_tensors, cutoffs, running_total_idx=0):
    self.samples = []
    # Materialise once so a one-shot iterable can be reused for every user.
    cutoffs = list(cutoffs)

    for user_id, tensor in user_tensors.items():
      # value of the chosen column on the final day of the matrix
      total_spend = float(tensor[-1, running_total_idx])
      n_days = tensor.shape[0]

      # Iterate the `cutoffs` argument (previously a notebook global was read
      # here and the argument was silently ignored).
      for cutoff in cutoffs:
        if 1 <= cutoff <= n_days:
          seq = tensor[:cutoff, :].astype(np.float32) #all days up to the cutoff
          self.samples.append([seq, cutoff, total_spend])

  def __len__(self):
    return len(self.samples)

  def __getitem__(self,idx):
    seq, length, target = self.samples[idx]
    return torch.from_numpy(seq), length, torch.tensor(target, dtype=torch.float32)


In [None]:
def collate_fn(batch):
  """Zero-pad variable-length samples into dense batch tensors.

  Args:
    batch: list of (sequence, length, target) items, where sequence is a
      2-D tensor (steps, features) and target is a scalar tensor.

  Returns:
    padded:  float32 tensor (batch, longest, features), zero beyond each
             sequence's own steps.
    lengths: long tensor (batch,) built from the items' length entries.
    mask:    bool tensor (batch, longest), True on real (non-padded) steps.
    targets: the stacked target tensors, shape (batch,).
  """
  sequences, raw_lengths, raw_targets = zip(*batch)

  targets = torch.stack(raw_targets)
  lengths = torch.tensor(raw_lengths, dtype=torch.long)

  n_samples = len(sequences)
  longest = int(lengths.max())
  n_features = sequences[0].shape[1]

  # Start from all-zero / all-False and copy the real steps in.
  padded = torch.zeros((n_samples, longest, n_features), dtype=torch.float32)
  mask = torch.zeros((n_samples, longest), dtype=torch.bool)

  for row, sequence in enumerate(sequences):
    steps = sequence.shape[0]
    padded[row, :steps, :] = sequence
    mask[row, :steps] = True

  return padded, lengths, mask, targets

In [None]:
class LSTMPredictor(nn.Module):
  """LSTM encoder followed by a linear head that emits one scalar per sequence.

  Args:
    input_size: number of features per time step.
    hidden_size: width of the LSTM hidden state.
    num_layers: number of stacked LSTM layers.
  """
  def __init__(self, input_size, hidden_size=128, num_layers=1):
    super().__init__()

    # batch_first=True -> inputs are padded sequences laid out as
    # (batch_size, seq_len, features)
    lstm_config = dict(input_size=input_size,
                       hidden_size=hidden_size,
                       num_layers=num_layers,
                       batch_first=True)
    self.lstm = nn.LSTM(**lstm_config)

    self.head = nn.Linear(hidden_size, 1)

  def forward(self,x, lengths):
    """Predict one value per sequence.

    Args:
      x: padded batch, (batch_size, seq_len, features).
      lengths: true length of every sequence in the batch.

    Returns:
      Tensor of shape (batch_size,).
    """
    # pack_padded_sequence wants the lengths on the CPU; packing makes the
    # LSTM stop at each sequence's real last step instead of reading padding.
    cpu_lengths = lengths.cpu()
    packed_input = pack_padded_sequence(
        x, cpu_lengths, batch_first=True, enforce_sorted=False)

    _, (h_n, _) = self.lstm(packed_input)

    top_layer_state = h_n[-1] #(batch size, hidden size)
    return self.head(top_layer_state).squeeze(-1)


In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [None]:
# Checkpoint days: each one is used as a sequence length (number of leading
# days kept) when samples are generated for a user.
benchmarks = [5,10, 15, 20, 25]

In [None]:
import pandas as pd
from sklearn.model_selection import GroupShuffleSplit

# Split by customer rather than by row so that every customer's transactions
# land entirely in train or entirely in test.
gss = GroupShuffleSplit(n_splits=1, train_size=0.8, random_state=42)

# n_splits=1, so the first (and only) split yields the row positions we need.
train_idx, test_idx = next(gss.split(df, groups=df['customer_id']))

train_df = df.iloc[train_idx].reset_index(drop=True)
test_df = df.iloc[test_idx].reset_index(drop=True)

# Sanity check on the number of distinct customers in each split.
print("Train customers:", train_df['customer_id'].nunique())
print("Test customers:", test_df['customer_id'].nunique())

Train customers: 3895
Test customers: 974


In [None]:
# Build the per-customer daily feature matrices separately for each split.
# NOTE(review): the two calls are independent, so the feature columns only
# line up if build_tensors orders categories identically for both frames — confirm.
train_tensors = build_tensors(train_df)
test_tensors = build_tensors(test_df)

In [None]:
# Wrap the per-customer matrices as datasets of (sequence prefix, length, target)
# samples; column 0 of each matrix's last row is taken as the target.
lstm_train_set = FiveDayCheckPointDataset(train_tensors,
                                          cutoffs=benchmarks,
                                          running_total_idx=0)

lstm_test_set = FiveDayCheckPointDataset(test_tensors,
                                          cutoffs=benchmarks,
                                          running_total_idx=0)

In [None]:
# Training batches are reshuffled every epoch; collate_fn pads each batch to
# its longest sequence.
lstm_train_loaded = DataLoader(lstm_train_set, batch_size=32, shuffle=True, collate_fn=collate_fn)

# Evaluation gains nothing from shuffling: a fixed order keeps the batches
# (and therefore the reported test loss) identical from epoch to epoch.
lstm_test_loaded = DataLoader(lstm_test_set, batch_size=32, shuffle=False, collate_fn=collate_fn)

In [None]:
# Batches from collate_fn are shaped (batch_size, max_len, features). The LSTM
# input size is the LAST dimension; index 1 is the padded sequence length.
input_size = next(iter(lstm_train_loaded))[0].shape[-1]  #features per timestep

# Seed before the model is constructed so the initial weights are reproducible.
torch.manual_seed(42)
model_0 = LSTMPredictor(input_size=input_size, hidden_size=128, num_layers=1)

input_size

25

In [None]:
# Number of passes over the training set per training cell.
TRAIN_EPOCHS = 500
# NOTE(review): TEST_EPOCHS is not referenced by any visible cell.
TEST_EPOCHS = 3

# Adam on all model parameters; mean-squared error against the spend target.
optimizer = torch.optim.Adam(model_0.parameters(), lr = 1e-3)
loss_fn = nn.MSELoss()


In [None]:
from tqdm.auto import tqdm
import torch
from timeit import default_timer as timer

device = "cuda" if torch.cuda.is_available() else "cpu"
model_0.to(device)
torch.manual_seed(42)

train_time_start_on_cpu = timer()

for epoch in tqdm(range(TRAIN_EPOCHS)):

    model_0.train()
    train_loss = 0
    train_samples = 0

    for batch_idx, (batch_seqs, batch_lengths, batch_mask, batch_targets) in enumerate(lstm_train_loaded):

        # Move batch to device
        batch_seqs = batch_seqs.to(device)
        batch_lengths = batch_lengths.to(device)
        batch_mask = batch_mask.to(device)
        batch_targets = batch_targets.to(device)

        # Forward pass
        preds = model_0(batch_seqs, batch_lengths)

        # Loss (mean over the batch). Accumulate it weighted by batch size so
        # the short final batch does not count as much as a full one.
        loss = loss_fn(preds, batch_targets)
        train_loss += loss.item() * batch_targets.size(0)
        train_samples += batch_targets.size(0)

        # Backprop
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Average train loss per sample
    train_loss /= max(train_samples, 1)

    # Evaluation
    test_loss = 0
    test_samples = 0
    model_0.eval()
    with torch.inference_mode():
        for test_seqs, test_lengths, test_mask, test_targets in lstm_test_loaded:
            test_seqs = test_seqs.to(device)
            test_lengths = test_lengths.to(device)
            test_mask = test_mask.to(device)
            test_targets = test_targets.to(device)

            test_preds = model_0(test_seqs, test_lengths)
            test_loss += loss_fn(test_preds, test_targets).item() * test_targets.size(0)
            test_samples += test_targets.size(0)

        # Per-sample average: independent of how samples are split into batches
        test_loss /= max(test_samples, 1)

    # Print progress **only every 50 epochs**
    if (epoch + 1) % 50 == 0:
        print(f"Epoch {epoch+1} | Train loss: {train_loss:.5f} | Test loss: {test_loss:.5f}")

# Total training time (wall clock, whichever device was used)
train_time_end_on_cpu = timer()
total_train_time = train_time_end_on_cpu - train_time_start_on_cpu
print(f"Total training time: {total_train_time:.2f} seconds")


  0%|          | 0/500 [00:00<?, ?it/s]

Epoch 50 | Train loss: 22584957.43155 | Test loss: 22239595.64708
Epoch 100 | Train loss: 18589512.60458 | Test loss: 18949830.97478
Epoch 150 | Train loss: 15828035.59313 | Test loss: 16808882.05122
Epoch 200 | Train loss: 14389612.74831 | Test loss: 16083528.35366
Epoch 250 | Train loss: 13116410.71416 | Test loss: 15321776.44833
Epoch 300 | Train loss: 12381743.65302 | Test loss: 13978644.79289
Epoch 350 | Train loss: 12643484.29992 | Test loss: 15044904.01593
Epoch 400 | Train loss: 12184302.05940 | Test loss: 14257132.22120
Epoch 450 | Train loss: 12262099.60955 | Test loss: 15525271.00276
Epoch 500 | Train loss: 11421868.05532 | Test loss: 14013487.64920
Total training time: 1411.57 seconds


## Additional 500 Epochs

In [None]:
from tqdm.auto import tqdm
import torch
from timeit import default_timer as timer

device = "cuda" if torch.cuda.is_available() else "cpu"
# Idempotent: keeps this cell runnable on its own instead of relying on an
# earlier cell having already moved the model.
model_0.to(device)
torch.manual_seed(42)

train_time_start_on_cpu = timer()

# Continues training the same model_0 / optimizer for another TRAIN_EPOCHS
# epochs; the printed epoch numbers restart at 1 for this cell.
for epoch in tqdm(range(TRAIN_EPOCHS)):

    model_0.train()
    train_loss = 0
    train_samples = 0

    for batch_idx, (batch_seqs, batch_lengths, batch_mask, batch_targets) in enumerate(lstm_train_loaded):

        # Move batch to device
        batch_seqs = batch_seqs.to(device)
        batch_lengths = batch_lengths.to(device)
        batch_mask = batch_mask.to(device)
        batch_targets = batch_targets.to(device)

        # Forward pass
        preds = model_0(batch_seqs, batch_lengths)

        # Loss (mean over the batch). Accumulate it weighted by batch size so
        # the short final batch does not count as much as a full one.
        loss = loss_fn(preds, batch_targets)
        train_loss += loss.item() * batch_targets.size(0)
        train_samples += batch_targets.size(0)

        # Backprop
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Average train loss per sample
    train_loss /= max(train_samples, 1)

    # Evaluation
    test_loss = 0
    test_samples = 0
    model_0.eval()
    with torch.inference_mode():
        for test_seqs, test_lengths, test_mask, test_targets in lstm_test_loaded:
            test_seqs = test_seqs.to(device)
            test_lengths = test_lengths.to(device)
            test_mask = test_mask.to(device)
            test_targets = test_targets.to(device)

            test_preds = model_0(test_seqs, test_lengths)
            test_loss += loss_fn(test_preds, test_targets).item() * test_targets.size(0)
            test_samples += test_targets.size(0)

        # Per-sample average: independent of how samples are split into batches
        test_loss /= max(test_samples, 1)

    # Print progress **only every 50 epochs**
    if (epoch + 1) % 50 == 0:
        print(f"Epoch {epoch+1} | Train loss: {train_loss:.5f} | Test loss: {test_loss:.5f}")

# Total training time (wall clock, whichever device was used)
train_time_end_on_cpu = timer()
total_train_time = train_time_end_on_cpu - train_time_start_on_cpu
print(f"Total training time: {total_train_time:.2f} seconds")


  0%|          | 0/500 [00:00<?, ?it/s]

Epoch 50 | Train loss: 11323683.26042 | Test loss: 13984876.84278
Epoch 100 | Train loss: 11678908.34778 | Test loss: 14002855.15625
Epoch 150 | Train loss: 11222491.95849 | Test loss: 14522477.38235
Epoch 200 | Train loss: 11825622.26896 | Test loss: 14277373.89502
Epoch 250 | Train loss: 12670165.43776 | Test loss: 16328279.90155
Epoch 300 | Train loss: 11092085.04595 | Test loss: 14277697.97161
Epoch 350 | Train loss: 11307788.87018 | Test loss: 15314639.11244
Epoch 400 | Train loss: 11098501.78654 | Test loss: 14108018.09028
