In [None]:
import pandas as pd
import numpy as np

# Colab-only: mount Google Drive at /content/drive so files under MyDrive can be read.
# force_remount=True asks for a fresh mount even if the path is already mounted.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Load the raw transaction CSV from the mounted Drive into a DataFrame.
df_usa = pd.read_csv('/content/drive/MyDrive/usa_data.csv')

In [None]:
# Print column names, dtypes, non-null counts and memory usage of the raw frame.
df_usa.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 447860 entries, 0 to 447859
Data columns (total 14 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   Unnamed: 0           447860 non-null  int64  
 1   transaction_id       447860 non-null  object 
 2   customer_id          447860 non-null  object 
 3   timestamp            447860 non-null  object 
 4   merchant_category    447860 non-null  object 
 5   merchant_type        447860 non-null  object 
 6   merchant             447860 non-null  object 
 7   amount               447860 non-null  float64
 8   currency             447860 non-null  object 
 9   country              447860 non-null  object 
 10  channel              447860 non-null  object 
 11  distance_from_home   447860 non-null  int64  
 12  weekend_transaction  447860 non-null  bool   
 13  date_transact        447860 non-null  object 
dtypes: bool(1), float64(1), int64(2), object(10)
memory usage: 44.8+ MB


In [None]:
# Working frame: the raw data minus three columns that are dropped here, plus a
# 'day' column holding the day-of-month parsed from 'date_transact'.
df = (
    df_usa
    .drop(columns=['currency', 'transaction_id', 'Unnamed: 0'])
    .assign(day=lambda frame: pd.to_datetime(frame['date_transact']).dt.day)
)

In [None]:
# Preview the first rows of the working frame (including the derived 'day' column).
df.head()

Unnamed: 0,customer_id,timestamp,merchant_category,merchant_type,merchant,amount,country,channel,distance_from_home,weekend_transaction,date_transact,day
0,CUST_34400,2024-10-01 00:00:32.479189+00:00,Healthcare,medical,Lab Corp,1005.88,USA,web,1,False,2024-10-01,1
1,CUST_85031,2024-10-01 00:00:34.482833+00:00,Travel,airlines,United Airlines,1879.34,USA,mobile,1,False,2024-10-01,1
2,CUST_90442,2024-10-01 00:00:44.472679+00:00,Gas,major,Mobil,403.57,USA,web,1,False,2024-10-01,1
3,CUST_37447,2024-10-01 00:00:57.244594+00:00,Gas,major,Mobil,474.12,USA,web,0,False,2024-10-01,1
4,CUST_62600,2024-10-01 00:01:09.607236+00:00,Retail,online,Wayfair,588.1,USA,web,1,False,2024-10-01,1


In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the transaction amounts.
df['amount'].describe()

Unnamed: 0,amount
count,447860.0
mean,520.383641
std,432.886229
min,25.65
25%,255.84
50%,419.96
75%,590.9925
max,2969.94


In [None]:
# Distinct merchant categories; pandas returns them in order of first appearance,
# which is also the order build_tensors uses for its per-category columns.
categories = df['merchant_category'].unique()
categories

array(['Healthcare', 'Travel', 'Gas', 'Retail', 'Grocery', 'Restaurant',
       'Education', 'Entertainment'], dtype=object)

In [None]:
# Column index of every feature in the per-customer tensors:
#   0 total_running, 1 total_tx, then a (<category>_running, <category>_tx) pair
#   per merchant category, and finally normalized_date.
# The per-category slots are derived from `categories` (order of first appearance
# in df['merchant_category']) because build_tensors lays its columns out in that
# same order; a hand-typed order silently points at the wrong columns.
feature_map = {
    'total_running': 0,
    'total_tx': 1,
    **{
        f'{cat.lower()}_{kind}': 2 + 2 * cat_idx + slot
        for cat_idx, cat in enumerate(categories)
        for slot, kind in enumerate(('running', 'tx'))
    },
    'normalized_date': 2 + 2 * len(categories),
}

In [None]:
# One row per (customer, day): total amount spent ('sum') and number of
# transactions ('count') on that day.
daily_totals = (
    df.groupby(['customer_id', 'day'])['amount']
    .agg(['sum', 'count'])
    .reset_index()
)

In [None]:
# One row per (customer, day, merchant category): amount spent ('sum') and
# number of transactions ('count') in that category on that day.
category_totals = (
    df.groupby(['customer_id', 'day', 'merchant_category'])['amount']
    .agg(['sum', 'count'])
    .reset_index()
)
category_totals

Unnamed: 0,customer_id,day,merchant_category,sum,count
0,CUST_10000,1,Healthcare,501.90,1
1,CUST_10000,8,Travel,1642.06,1
2,CUST_10000,12,Retail,2302.74,2
3,CUST_10000,18,Travel,2426.40,1
4,CUST_10000,20,Gas,1322.68,1
...,...,...,...,...,...
169659,CUST_99971,26,Retail,416.19,1
169660,CUST_99971,27,Travel,1089.54,1
169661,CUST_99971,28,Gas,359.06,1
169662,CUST_99971,29,Healthcare,290.03,1


In [None]:
def build_tensors(df, categories=None, num_days=31):
  """Build one (num_days x num_features) feature matrix per customer.

  Row ``d - 1`` describes day-of-month ``d``. Columns are laid out as:

    0          running (cumulative) total spend up to and including that day
    1          number of transactions on that day
    2 + 2*i    running (cumulative) spend in ``categories[i]``
    3 + 2*i    number of transactions in ``categories[i]`` on that day
    last       ``day / num_days`` -- position in the month, in (0, 1]

  Args:
    df: transactions with 'customer_id', 'day' (1-based day of month),
      'merchant_category' and 'amount' columns.
    categories: optional ordered sequence of categories that fixes the order
      of the per-category column pairs. Defaults to
      ``df['merchant_category'].unique()``, i.e. order of first appearance.
      Transactions in a category that is not listed still count towards
      columns 0 and 1 but get no per-category columns.
    num_days: number of rows (days) in every matrix.

  Returns:
    dict mapping each customer_id (in order of first appearance) to a float64
    ndarray of shape ``(num_days, 3 + 2 * len(categories))``.

  Raises:
    IndexError: if a 'day' value lies outside ``1..num_days``.
  """
  if categories is None:
    categories = df['merchant_category'].unique()
  category_index = pd.Index(categories)

  # running total, tx count, (running total, tx count) per category, normalized day.
  # The trailing +1 gives the normalized-day feature its own column so it does
  # not overwrite the last category's transaction count.
  num_features = 2 + (len(category_index) * 2) + 1

  customers = df['customer_id'].unique()
  customer_index = pd.Index(customers)
  cube = np.zeros((len(customers), num_days, num_features))

  daily_totals = df.groupby(['customer_id', 'day'])['amount'].agg(['sum', 'count']).reset_index()
  category_totals = df.groupby(['customer_id', 'day', 'merchant_category'])['amount'].agg(['sum', 'count']).reset_index()

  # Every day seen in category_totals also appears in daily_totals, so one check suffices.
  if len(daily_totals) and not daily_totals['day'].between(1, num_days).all():
    raise IndexError(f"'day' values must lie in 1..{num_days}")

  # Scatter the per-day totals into columns 0 / 1 for all customers at once.
  rows = customer_index.get_indexer(daily_totals['customer_id'])
  days = daily_totals['day'].to_numpy().astype(int) - 1
  cube[rows, days, 0] = daily_totals['sum'].to_numpy()
  cube[rows, days, 1] = daily_totals['count'].to_numpy()

  # Scatter the per-category values; get_indexer yields -1 for unlisted categories.
  cat_pos = category_index.get_indexer(category_totals['merchant_category'])
  known = cat_pos >= 0
  rows = customer_index.get_indexer(category_totals['customer_id'])[known]
  days = category_totals['day'].to_numpy().astype(int)[known] - 1
  cols = 2 + 2 * cat_pos[known]
  cube[rows, days, cols] = category_totals['sum'].to_numpy()[known]
  cube[rows, days, cols + 1] = category_totals['count'].to_numpy()[known]

  # Even columns before the last one hold money amounts (overall + each
  # category): turn the per-day amounts into running totals along the day axis.
  cube[:, :, 0:-1:2] = cube[:, :, 0:-1:2].cumsum(axis=1)

  # Normalized day for every row, including the final day of the month.
  cube[:, :, -1] = np.arange(1, num_days + 1) / num_days

  # Copy so each customer owns an independent array.
  return {cust_id: cube[i].copy() for i, cust_id in enumerate(customers)}

In [None]:
# Build the per-customer feature matrices; the result is a dict keyed by customer_id.
user_dfs_dict = build_tensors(df)

In [None]:
# Spot-check the feature matrix of a single customer.
user_dfs_dict["CUST_24106"]

array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 3.22580645e-02],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 6.45161290e-02],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 9.67741935e-02],
       [0.000

In [None]:
# PyTorch plus the Dataset base class used by the dataset defined below.
import torch
from torch.utils.data import Dataset

class 5DayCheckPointDataset(Dataset):
  def __init__(self, user_tensors, cutoffs, running_total_idx=0):
    """Pre-compute one training sample per (user, cutoff) pair.

    Args:
      user_tensors: dict mapping a user id to a 2-D numpy array with one row
        per day.
      cutoffs: iterable of day counts; for each one the first ``cutoff`` rows
        of a user's array become an input sequence.
      running_total_idx: column whose value in the LAST row is used as the
        regression target. NOTE(review): this assumes that column holds a
        cumulative total, so the last row is the month-end spend -- confirm
        against the code that builds the arrays.

    Each entry of ``self.samples`` is ``[seq, cutoff, total_spend]`` where
    ``seq`` is a float32 array of shape ``(cutoff, num_features)``.
    """
    self.samples = []

    for tensor in user_tensors.values():
      num_days = tensor.shape[0]
      # Target: value of the running-total column on the final day.
      total_spend = float(tensor[-1, running_total_idx])

      for cutoff in cutoffs:
        # Skip cutoffs beyond the available days; non-positive cutoffs would
        # give an empty (or wrongly sliced) sequence, so they are skipped too.
        if 1 <= cutoff <= num_days:
          seq = tensor[:cutoff, :].astype(np.float32)  # all days up to the cutoff
          self.samples.append([seq, cutoff, total_spend])

  def __len__(self):
    """Return the number of samples stored in ``self.samples``."""
    return len(self.samples)

  # Python only looks up the lowercase ``__len__`` for len(); the original
  # mis-capitalised name is kept as an alias so existing calls still work.
  __Len__ = __len__

  def __getitem__(self,idx):
    seq, length, target = self.samples[idx]
    return torch.from_numpy(seq



