In [43]:
import teradatasql
import pandas as pd
import warnings
from dotenv import dotenv_values
from modelling_pkg.config import *
from time import time
from pathlib import Path

# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation and dtype warnings from pandas.
warnings.filterwarnings("ignore")

In [44]:
# Read the Teradata credentials from "../.env" (relative to the working directory)
# so that no secret is written into the notebook itself.
creds = dotenv_values("../.env")

# `dotenv_values` yields an empty mapping when the file does not exist, which would
# otherwise surface as an unhelpful `KeyError: 'user'`. Name the missing keys instead.
required_keys = ("user", "pwd", "logmech")
missing_keys = [key for key in required_keys if key not in creds]
if missing_keys:
    raise KeyError(
        f"Missing {missing_keys} in '../.env'; expected the keys {list(required_keys)}"
    )

username = creds["user"]
password = creds["pwd"]
logmech = creds["logmech"]

In [None]:
# Database that the queries below read from. Alternative databases (disabled):
# db = 'BICOEGH_MDD_DEV'
# db = 'BICOEMDD_RTP'
db = "DBT_MDD_DEV"

# Root output folder as a plain string.
# DATADIR is presumably provided by the star import of modelling_pkg.config.
# outdir = DATADIR
outdir = str(DATADIR)
outdir

In [46]:
# Open the Teradata session that every query in this notebook reuses.
# It is closed explicitly in the last cell.
td_host = "dhl31.prg-dc.dhl.com"
session = teradatasql.connect(
    None,
    host=td_host,
    user=username,
    password=password,
    logmech=logmech,
)

In [None]:
%%time
# List every table name registered for `db` in the dbc.tablesv catalog view.
tables_query = f" select tablename from dbc.tablesv where databasename = '{db}'"
res = pd.read_sql(tables_query, session)
tables = res["TableName"].values

In [None]:
# Display the tables whose name contains "_DS", followed by the full sorted list.
(
    sorted(name for name in tables if "_DS" in name),
    sorted(tables),
)

**ORGANIZATIONS**

In [None]:
%%time

# Database holding the two reference tables read in this cell.
db2 = "DBT_MDD_DEV"

# --- Exchange rates ---------------------------------------------------------
table = "T_exchange_rate"
print(table)
ers_sql = f"LOCK ROW FOR ACCESS select from_currency, bill_per_dt, rate_eur_ber, rate_usd_ber from {db2}.{table}"
ers = pd.read_sql(ers_sql, session)
# Optional inspection / local export of the raw table (disabled):
# display(ers.head())
# print(f"Saving table locally as '{outdir}/{table}.csv'")
# ers.to_csv(f"{outdir}/{table}.csv", index=False)

# --- Countries --------------------------------------------------------------
table = "COUNTRIES"
print(table)
ctrs_sql = f"LOCK ROW FOR ACCESS select name, organization_id, code, region_code, region_name, int_prod_cur_cd as int_cur, dom_prod_cur_cd as dom_cur from {db2}.{table}"
# Rows are ordered by country name right after loading.
ctrs = pd.read_sql(ctrs_sql, session).sort_values("name")
# Optional inspection / local export of the raw table (disabled):
# display(pd.read_sql(f'select * from {db}.{table}', session))
# display(ctrs.head())
# print(f"Saving table locally as '{outdir}/{table}.csv'")
# ctrs.to_csv(f"{outdir}/{table}.csv", index=False)


# Disabled fallback for countries without an international currency. It relies on a
# `currency_cd` column, which the COUNTRIES query of this notebook does not select.
# if any(ctrs.int_cur.isna()):
#     ctrs.loc[ctrs.int_cur.isna(), ['int_cur']] = ctrs.loc[ctrs.int_cur.isna(), 'currency_cd']

# Flag countries whose domestic currency differs from their international one.
ctrs["dual_currency"] = ctrs.dom_cur != ctrs.int_cur

# Keep only the EUR-based rate. `drop` and `rename` each return a new frame, so `ers`
# stays untouched without an explicit copy.
ex_rates = ers.drop(columns="rate_usd_ber").rename(columns={"rate_eur_ber": "ex_rate"})
# Vectorised strip: a missing currency code is propagated instead of raising
# AttributeError the way `apply(lambda s: s.strip())` would.
ex_rates["from_currency"] = ex_rates["from_currency"].str.strip()

# Latest rate per currency: order by billing period and keep the last entry of each
# currency group. The result is indexed by `from_currency`.
last_rates = (
    ex_rates.sort_values("bill_per_dt")
    .groupby("from_currency")
    .last()
    .drop(columns="bill_per_dt")
)

# Attach the EUR rate of the international currency ...
organizations = ctrs.merge(
    last_rates, left_on="int_cur", right_on="from_currency", how="left"
)
# ... and of the domestic currency; overlapping columns from this second merge get
# the "_dom2eur" suffix.
organizations = organizations.merge(
    last_rates,
    left_on="dom_cur",
    right_on="from_currency",
    how="left",
    suffixes=["", "_dom2eur"],
)
organizations = organizations.rename(columns={"ex_rate": "ex_rate_int2eur"})
# Ratio of the two EUR-based rates.
organizations["ex_rate_dom2int"] = (
    organizations["ex_rate_dom2eur"] / organizations["ex_rate_int2eur"]
)

# Normalise country names to UPPER_SNAKE form, e.g. "Czech Republic" -> "CZECH_REPUBLIC".
organizations["name"] = (
    organizations["name"].str.replace(" ", "_", regex=False).str.upper()
)


def get_og_names(s):
    """Convert an UPPER_SNAKE country name into its capitalised `og_name` form.

    A ",_THE" fragment is removed, every "_"-separated word is capitalised
    (first character upper-case, the rest lower-case), and two spellings are
    then rewritten: "Czechia" -> "Czech_Republic" and "Columbia" -> "Colombia".

    Args:
        s: Country name with words separated by underscores.

    Returns:
        The converted name, still separated by underscores.
    """
    words = s.replace(",_THE", "").split("_")
    og_name = "_".join(word.capitalize() for word in words)
    # Applied in this fixed order, after capitalisation.
    for old, new in (("Czechia", "Czech_Republic"), ("Columbia", "Colombia")):
        og_name = og_name.replace(old, new)
    return og_name


# Add the capitalised name as the second column.
organizations.insert(1, "og_name", organizations.name.apply(get_og_names))

# One id per row of `organizations`; the training-data export iterates over these.
org_ids = list(organizations.organization_id)

print("ORGANIZATIONS")
display(organizations.head())

# Create the output folder if needed so `to_csv` cannot fail on a missing directory
# (the training-data export creates its own sub-folders the same way).
Path(outdir).mkdir(parents=True, exist_ok=True)
print(f"Saving table locally as '{outdir}/organizations.csv'")
organizations.sort_values("name").to_csv(f"{outdir}/organizations.csv", index=False)

# Reload check (disabled):
# pd.read_csv(f"{outdir}/organizations.csv", index_col=0).organization_id.to_dict()

**TRAINING DATA**

In [None]:
%%time
# 1h 6min

# Views are addressed as "<prefix><TABLE><suffix>", e.g. V_CUSTOMER_DS.
prefix = "V_"
suffix = "_DS"

# Reference point for the elapsed-seconds column of the progress log.
start = time()

n_orgs = len(org_ids)

for table in ("CUSTOMER", "OFFERS", "OFFER_AUDIT", "OPPORTUNITY", "SHIPMENT_PROFILE"):
    table2 = f"{prefix}{table}{suffix}"

    # One folder per table; every organization gets its own CSV inside it.
    outdir_ = f"{outdir}/raw_tables/{table}"
    Path(outdir_).mkdir(parents=True, exist_ok=True)

    print(f'\nQuerying table {table}... saving to "{outdir_}"')

    for i, org_id in enumerate(org_ids):
        # The view is fetched one organization at a time.
        query = f"""
                LOCK ROW FOR ACCESS 
                select * 
                from {db}.{table2}
                where organization_id = '{org_id}'
            """
        result = pd.read_sql(query, session)

        # Progress line: elapsed seconds, row count, position in the list, id.
        elapsed_s = int(time() - start)
        print(
            f" - {elapsed_s:5,} s ||| len: {len(result):9,} ||| {i + 1:2}/{n_orgs} - {org_id}"
        )

        out_path = f"{outdir_}/{org_id}.csv"
        result.to_csv(out_path, index=False)

In [None]:
# %%time
# # 2h 30min

# # db = 'DBT_MDD_DEV'

# for table in tqdm([
#     # 'V_CUSTOMER',
#     # 'V_OFFERS',
#     # 'V_OFFER_AUDIT',
#     # 'V_OPPORTUNITY',
#     # 'V_SHIPMENT_PROFILE',
#     'V_CUSTOMER_DS',
#     'V_OFFERS_DS',
#     'V_OFFER_AUDIT_DS',
#     'V_OPPORTUNITY_DS',
#     'V_SHIPMENT_PROFILE_DS'
# ]):

#     print(f'Querying table {table}...')

#     result = pd.read_sql(
#         f"select * from {db}.{table}",
#         session
#     )

#     print(f"Saving table locally as '{outdir}/raw_tables/{table}.csv'")
#     display(result)
#     result.to_csv(
#         os.path.join(outdir, f"{table}.csv"),
#         index=False
#     )

In [108]:
# Close the Teradata session opened by `teradatasql.connect` at the top of the notebook.
session.close()