In [1]:
import pandas as pd
import json

Example row of the dataset

In [2]:
# Show the first record of the raw data as an example of the available columns.
first_record = pd.read_json("raw_data.json").loc[0]
first_record

amount                                         16.36
month                                              1
day                                               17
year                                            2023
vendor                                          ALDI
from_account    Assets:Discover:Main:Needs:Groceries
Name: 0, dtype: object

In [3]:
def from_account_counts(df):
    """Count the rows of ``df`` for each ``from_account`` value.

    Returns a Series indexed by ``from_account`` holding the number of rows
    per account, ordered from the least to the most frequent account.
    """
    rows_per_account = df.groupby("from_account").size()
    return rows_per_account.sort_values()

In [4]:
# Baseline: how many rows does each account have before any relabelling?
raw_rows = pd.read_json("raw_data.json")
from_account_counts(raw_rows)

from_account
Assets:Discover:FutureWants:Macbook           2
Assets:Discover:Furniture                     3
Assets:Discover:FutureWants:Snowboarding      3
Assets:Discover:FutureWants:Christmas        10
Assets:Discover:FutureWants                  12
Assets:Discover:Main:Needs:Other             24
Assets:Discover:Main:Needs:Monthly           31
Assets:Discover:Main:Needs:Gas               38
Assets:Discover:Travel                       62
Assets:Discover:Main:Needs:Groceries        159
Assets:Discover:Main:Wants:Monthly          168
Assets:Discover:Main:Wants:Other            418
dtype: int64

A number of accounts have very low counts because they are very specific. I'll group the `FutureWants` sub-accounts together under their parent account, `Assets:Discover:FutureWants`.

In [5]:
# Collapse every account that mentions "FutureWants" (e.g. the Macbook,
# Snowboarding and Christmas sub-accounts) into the single parent account, so
# the rare sub-accounts are pooled into one larger class.
#
# The match is a vectorised, literal substring test (regex=False) instead of a
# per-row Python lambda. na=False keeps the condition strictly boolean: a row
# whose from_account is missing simply does not match and is left unchanged,
# rather than raising a TypeError as `"FutureWants" in <NaN>` would.
processed = pd.read_json("raw_data.json").assign(
    from_account=lambda df: df["from_account"].mask(
        df["from_account"].str.contains("FutureWants", regex=False, na=False),
        "Assets:Discover:FutureWants",
    )
)

from_account_counts(processed)

from_account
Assets:Discover:Furniture                 3
Assets:Discover:Main:Needs:Other         24
Assets:Discover:FutureWants              27
Assets:Discover:Main:Needs:Monthly       31
Assets:Discover:Main:Needs:Gas           38
Assets:Discover:Travel                   62
Assets:Discover:Main:Needs:Groceries    159
Assets:Discover:Main:Wants:Monthly      168
Assets:Discover:Main:Wants:Other        418
dtype: int64

In [6]:
# Stratified split: sample 80% of the rows of every account (fixed seed, so the
# split is reproducible) for training; whatever is left becomes the test set.
sampled_per_account = [
    rows.sample(frac=0.8, random_state=0)
    for _, rows in processed.groupby("from_account")
]
train_indices = [
    label for sampled in sampled_per_account for label in sampled.index
]

train = processed.loc[train_indices]
test = processed.drop(train_indices)

print(f"train:\n{from_account_counts(train)}\n")
print(f"test:\n{from_account_counts(test)}")

train:
from_account
Assets:Discover:Furniture                 2
Assets:Discover:Main:Needs:Other         19
Assets:Discover:FutureWants              22
Assets:Discover:Main:Needs:Monthly       25
Assets:Discover:Main:Needs:Gas           30
Assets:Discover:Travel                   50
Assets:Discover:Main:Needs:Groceries    127
Assets:Discover:Main:Wants:Monthly      134
Assets:Discover:Main:Wants:Other        334
dtype: int64

test:
from_account
Assets:Discover:Furniture                1
Assets:Discover:FutureWants              5
Assets:Discover:Main:Needs:Other         5
Assets:Discover:Main:Needs:Monthly       6
Assets:Discover:Main:Needs:Gas           8
Assets:Discover:Travel                  12
Assets:Discover:Main:Needs:Groceries    32
Assets:Discover:Main:Wants:Monthly      34
Assets:Discover:Main:Wants:Other        84
dtype: int64


In [7]:
# Serialise each split as a list of row records, keyed by the split name.
writable = {
    split_name: frame.to_dict(orient="records")
    for split_name, frame in (("train", train), ("test", test))
}

with open("processed_data.json", "w") as out_file:
    json.dump(writable, out_file, indent=4)