# Document table generation: stateful architecture

The document table has the following columns:

            'doc_id'
            'client_id'
            'issue_date'
            'due_date'
            'state'
            'previous_state'
            'state_date'
            'amount_ht'
            'amount_tva'
            'amount_ttc'
            'currency'
            'payment_status'
            'payment_date'
            'related_doc_id'
            'comment'

In [1]:
import random
import uuid
from datetime import datetime, timedelta
import pandas as pd

In [5]:
# Load the synthetic client table and keep the ids of professional clients.
# Forward slashes replace the original backslashes, which formed the invalid
# escape sequences "\d", "\s" and "\c" and only resolved as a path on Windows.
clients_df = pd.read_csv("../data/synthetic/clients_synthetic.csv")
professional_clients = clients_df[clients_df["client_category"] == "professional"]["client_id"].tolist()

# Rule-based document dataset


In [6]:
# Generation settings: number of document chains to build and the currency
# code written on every generated row.
num_docs, currency = 100, "tnd"

# Allowed transitions: each state maps to its candidate next states.
# A None candidate means the chain may stop at that state.
state_flow = {
    "devis": [
        "bon de livraison",
        None,
    ],
    "bon de livraison": [
        "bon de sortie",
        "facture",
    ],
    "bon de sortie": [
        "facture",
    ],
    "facture": [
        None,
        "avoir",
    ],
    "avoir": [
        None,
    ],
}

# Reference "now" and the start of a five-year (5 x 365 days) look-back window.
today = datetime.today()
start_date = today - timedelta(days=365 * 5)

In [8]:
# Reload the synthetic client table. Forward slashes replace the original
# backslashes, which formed the invalid escape sequences "\d", "\s" and "\c"
# and only resolved as a path on Windows.
clients_df = pd.read_csv("../data/synthetic/clients_synthetic.csv")
# Registration dates of all clients, parsed to datetimes.
client_reg_date = pd.to_datetime(clients_df["registration_date"])
# Keep the full rows (not just the ids) of professional clients.
professional_clients = clients_df[clients_df["client_category"] == "professional"]

In [9]:
def random_date(start, end):
    """Return ``start`` shifted forward by a random whole number of seconds.

    The span ``end - start`` is truncated to whole seconds and the offset is
    drawn with ``random.randint``, so both bounds (0 and the full span) are
    possible outcomes.
    """
    span_seconds = int((end - start).total_seconds())
    offset = timedelta(seconds=random.randint(0, span_seconds))
    return start + offset

In [10]:
def generate_doc_chain(doc_id, client_id, reg_date):
    """Generate one row per state for a single document, starting at "devis".

    The chain follows the module-level ``state_flow`` mapping: after each row
    the next state is picked at random among the candidates of the current
    one, and the chain stops once ``None`` is drawn. Every row shares
    ``doc_id``, ``client_id`` and the same base amounts (a random pre-tax
    amount between 100 and 5000 plus 19% TVA); the "avoir" row carries the
    negated amounts.

    Args:
        doc_id: Identifier written on every row of the chain.
        client_id: Client identifier written on every row.
        reg_date: Lower bound for the date of the first state; the first
            ``state_date`` is drawn between it and the module-level ``today``.

    Returns:
        A list of row dicts, one per visited state, in the order visited.
        Each following state is dated 1 to 10 days after the previous one.
    """
    ht = round(random.uniform(100, 5000), 2)
    vat = round(ht * 0.19, 2)
    ttc = round(ht + vat, 2)

    chain = []
    prior = None
    state = "devis"
    when = random_date(reg_date, today)

    while state:
        # A credit note ("avoir") reverses the amounts of the document.
        is_credit_note = state == "avoir"
        if is_credit_note:
            signed_ht, signed_vat, signed_ttc = -ht, -vat, -ttc
        else:
            signed_ht, signed_vat, signed_ttc = ht, vat, ttc

        # Payment fields stay empty except on invoices and credit notes.
        status = paid_on = linked_doc = None
        if state == "facture":
            status = random.choice(["unpaid","partially_paid","paid"])
            if status == "paid":
                # Only fully paid invoices get a payment date (1-30 days later).
                paid_on = (when + timedelta(days=random.randint(1, 30))).date()
        elif is_credit_note:
            linked_doc = doc_id
            status = "refunded"
            paid_on = (when + timedelta(days=1)).date()

        chain.append(
            {
                "doc_id": doc_id,
                "client_id": client_id,
                "state": state,
                "state_date": when.date(),
                "previous_state": prior,
                "amount_ht": signed_ht,
                "amount_tva": signed_vat,
                "amount_ttc": signed_ttc,
                "currency": currency,
                "payment_status": status,
                "payment_date": paid_on,
                "related_doc_id": linked_doc,
            }
        )

        # Advance to the next state and move the clock forward 1-10 days.
        prior = state
        state = random.choice(state_flow[state])
        when += timedelta(days=random.randint(1, 10))

    return chain

In [11]:
# Build num_docs document chains, each for one randomly sampled professional
# client, and collect every generated row in a single flat list.
all_rows = []
for _ in range(num_docs):
    client = professional_clients.sample(n=1).iloc[0]
    client_id, reg_date = client["client_id"], pd.to_datetime(client["registration_date"])
    # Short document id: "doc_" followed by the first 8 hex digits of a UUID4.
    doc_id = f"doc_{uuid.uuid4().hex[:8]}"
    all_rows += generate_doc_chain(doc_id, client_id, reg_date)

In [12]:
# Assemble all generated rows into a DataFrame and preview the first five.
df = pd.DataFrame(data=all_rows)
print(df.head(n=5))

         doc_id   client_id             state  state_date    previous_state  \
0  doc_6d844989  #client179             devis  2024-01-19              None   
1  doc_ab550847   #client28             devis  2023-10-27              None   
2  doc_ab550847   #client28  bon de livraison  2023-11-06             devis   
3  doc_ab550847   #client28     bon de sortie  2023-11-08  bon de livraison   
4  doc_ab550847   #client28           facture  2023-11-13     bon de sortie   

   amount_ht  amount_tva  amount_ttc currency  payment_status payment_date  \
0     549.30      104.37      653.67      tnd            None         None   
1    2752.84      523.04     3275.88      tnd            None         None   
2    2752.84      523.04     3275.88      tnd            None         None   
3    2752.84      523.04     3275.88      tnd            None         None   
4    2752.84      523.04     3275.88      tnd  partially_paid         None   

  related_doc_id  
0           None  
1           None  

In [13]:
# Persist the generated documents without the DataFrame index column.
# Forward slashes replace the original backslashes, which formed the invalid
# escape sequences "\d" and "\s" and only resolved as a path on Windows.
df.to_csv("../data/synthetic/documents_synth.csv", index=False)