In [1]:
import time
import pandas as pd
import numpy as np
import math

# internal tools
from dsgtools.reporting import make_format
from dsgtools.reporting import TableWriter
from dsgtools.reporting import freq
from dsgtools.reporting import bivariate
from dsgtools import azure

Matplotlib is building the font cache; this may take a moment.


In [2]:
# Production-side inputs: one parquet dataset per client, all read from the same folder.
# NOTE(review): pd.read_adls is not part of stock pandas; presumably it is registered by
# one of the dsgtools imports -- confirm.
path = "Analytics/Customer Projects/RND_multiple_clint_rv6_attr_rerun - proj 12092 (RV 6.0)/File from the Customer/ANCUMA821_bpl_4clients/input/"
prod_datasets = (
    "107571_cnu_parquet",
    "107671_bankers_healthcare_parquet",
    "107891_bpl_parquet",
    "108706_blue_chip_parquet",
)
# The datasets are read in the order listed and bound to the per-client names used later on.
df_prod_107571, df_prod_107671, df_prod_107891, df_prod_108706 = (
    pd.read_adls(path + dataset, reader=pd.read_parquet) for dataset in prod_datasets
)

# One shape per line for each client frame, then the shape of the stacked frame.
prod_frames = [df_prod_107571, df_prod_107671, df_prod_107891, df_prod_108706]
print(*(frame.shape for frame in prod_frames), sep="\n")

df_prod = pd.concat(prod_frames, ignore_index=True)
print(df_prod.shape)

(43516, 401)
(27572, 401)
(11494, 401)
(108596, 401)
(191178, 401)


In [3]:
# Distribution of the first 7 characters of date_added (rendered as text) for two client frames.
for client_frame in (df_prod_107891, df_prod_108706):
    display(client_frame["date_added"].astype(str).str[:7].value_counts())

2023-10    11494
Name: date_added, dtype: int64

2023-10    108596
Name: date_added, dtype: int64

In [4]:
# Retro-side files: one CSV per client, all read from the "processing" folder with the same options.
path = "Analytics/Customer Projects/RND_multiple_clint_rv6_attr_rerun - proj 12092 (RV 6.0)/File from the Customer/ANCUMA821_bpl_4clients/processing/"
retro_files = (
    "ln_12092_cons1_input_107571_cnuohorvxml_riskview_v6_normal_mask_total.csv",
    "ln_12092_cons1_input_107671_bhgrvxml_riskview_v6_normal_mask_total.csv",
    "ln_12092_cons1_input_107891_gdsbpllrvxml_riskview_v6_normal_mask_total.csv",
    "ln_12092_cons1_input_108706_bluechiprvxml_riskview_v6_normal_mask_total.csv",
)
csv_options = dict(verbose=False, low_memory=False, encoding="iso-8859-1")
# The files are read in the order listed and bound to the per-client names.
df_107571, df_107671, df_107891, df_108706 = (
    pd.read_adls(path + file_name, **csv_options) for file_name in retro_files
)

# One shape per line for each client frame, then the shape of the stacked frame.
retro_frames = [df_107571, df_107671, df_107891, df_108706]
print(*(frame.shape for frame in retro_frames), sep="\n")

df = pd.concat(retro_frames, ignore_index=True)
print(df.shape)

(43516, 507)
(27572, 507)
(11494, 507)
(108596, 507)
(191178, 507)


In [5]:
# Print whether each frame's join key is unique (retro: acctno, prod: transaction_id).
for key_values in (df["acctno"], df_prod["transaction_id"]):
    print(key_values.is_unique)

True
True


In [6]:
# Frequency of the first 7 characters of date_added (rendered as text) over the stacked prod frame.
added_month = df_prod["date_added"].astype(str).str[:7]
freq(added_month)

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
date_added,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2023-10,191178.0,1.0,191178.0,1.0


In [7]:
# Frequency of company_id over the stacked production frame.
freq(df_prod["company_id"])

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
company_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
107571,43516.0,0.22762,43516.0,0.22762
107671,27572.0,0.144222,71088.0,0.371842
107891,11494.0,0.060122,82582.0,0.431964
108706,108596.0,0.568036,191178.0,1.0


In [8]:
## Merge the retro and production frames on the account / transaction key.
# Production side: key, date and the lexid column carried by the prod frame.
temp_prod = df_prod.loc[:, ["transaction_id", "date_added", "o_lexid"]]

# Retro side: key and the lexid column carried by the retro frame.
keep = ["acctno", "lexid"]
temp_retro = df.loc[:, keep]

# Default (inner) join: retro acctno is matched against prod transaction_id.
merged = pd.merge(temp_retro, temp_prod, left_on="acctno", right_on="transaction_id")
print(merged.shape)

(191178, 5)


In [9]:
# lexid_change is 0 where o_lexid and lexid compare equal and 1 everywhere else.
# NaN-style missing values never compare equal, so such rows are flagged 1.
same_lexid = merged["o_lexid"] == merged["lexid"]
merged["lexid_change"] = np.where(same_lexid, 0, 1)

# Two-bucket format around 0, applied to both lexid columns in the frequency table.
fmt = make_format(cuts=[-np.inf, 0, np.inf])
freq(
    merged["lexid_change"],
    merged["o_lexid"],
    merged["lexid"],
    format=[None, fmt, fmt],
    cross=False,
    observed=True,
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Count,Pct,Cuml Count,Cuml Pct
lexid_change,o_lexid,lexid,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1+,1+,188221.0,0.984533,188221.0,0.984533
1,<= 0,1+,506.0,0.002647,188727.0,0.987179
1,<= 0,Missing,1465.0,0.007663,190192.0,0.994843
1,1+,1+,623.0,0.003259,190815.0,0.998101
1,1+,Missing,363.0,0.001899,191178.0,1.0


In [10]:
# Keep only the accounts whose lexid did not change (lexid_change == 0).
temp = merged[merged.lexid_change == 0]
print(temp.shape)

# acctno <-> transaction_id pairs of the retained accounts.
key_pairs = temp[["acctno", "transaction_id"]]

# Restrict both frames to the retained accounts; each merge also adds the other side's key column.
# The column about to be added is dropped first (a no-op on the first run), so that
# re-running this cell does not produce transaction_id_x/_y or acctno_x/_y duplicates.
# validate="one_to_one" makes pandas raise MergeError if a key is duplicated on either
# side, instead of silently multiplying rows.
df = df.drop(columns="transaction_id", errors="ignore").merge(
    key_pairs, on="acctno", validate="one_to_one"
)
df_prod = df_prod.drop(columns="acctno", errors="ignore").merge(
    key_pairs, on="transaction_id", validate="one_to_one"
)
print(df.shape)
print(df_prod.shape)

(188221, 6)


(188221, 508)
(188221, 402)


In [11]:
# Number of retro columns whose name contains "pl_", leaving out pl_fileaddrtype.
sum(1 for name in df.columns if "pl_" in name and name != "pl_fileaddrtype")

310

In [12]:
# Every retro column whose name contains "pl_" or "pi_".
col = [name for name in df.columns if "pl_" in name or "pi_" in name]
print(len(col))

# Columns excluded from the float cast, lower-cased before being compared with the names in col.
not_in = list(map(str.lower, ['PL_FileAddrTaxValYr', 'PL_FileAddrType', 'PL_DrgBkNewChType10Y', 'PL_DrgBkNewDispType10Y', 'PI_InpAddrTaxValYr', 'PI_InpAddrType']))

# Cast the remaining columns to float in both frames.
float_cols = [name for name in col if name not in not_in]
df[float_cols] = df[float_cols].astype(float)
df_prod[float_cols] = df_prod[float_cols].astype(float)

# Append the acctno key to the column list.
col = [*col, "acctno"]

340


In [23]:
from dsgtools.reporting import ImpactAnalysisReport

# Retro vs. prod comparison over the selected columns, keyed on acctno.
# NOTE(review): the three "exceptions" values look like special codes -- confirm how
# ImpactAnalysisReport treats them.
cr = ImpactAnalysisReport(
    df[col],
    df_prod[col],
    keys="acctno",
    exceptions=[-99_999, -99_998, -99_997],
    dataframe_names=("retro", "prod"),
)
cr.to_xlsx("./_temp/riskview-comparison.xlsx", overwrite=True)

<dsgtools.reporting.tablewriter.TableWriter at 0x7fe527271d20>

In [20]:
from dsgtools.reporting import ImpactAnalysisReport

# Same comparison, with the production side limited to company_id 107891.
# The retro frame is passed unfiltered.
temp_prod = df_prod.loc[df_prod["company_id"] == 107891]
print(temp_prod.shape)

cr = ImpactAnalysisReport(
    df[col],
    temp_prod[col],
    keys="acctno",
    exceptions=[-99_999, -99_998, -99_997],
    dataframe_names=("retro", "prod"),
)
cr.to_xlsx("./_temp/riskview-comparison_test.xlsx", overwrite=True)

(31772, 407)


<dsgtools.reporting.tablewriter.TableWriter at 0x7f63b32aef20>

In [13]:
# Three attributes to inspect row by row.
samples = ['pl_srcoldmsncev', 'pl_inqcnt2y', "pl_inqcnt1m"]

temp_retro = df[[*samples, 'acctno', 'lexid']]
temp_prod = df_prod[[*samples, 'acctno', 'o_lexid', "company_id", "date_added"]]

# The sample columns exist on both sides, so pandas' default suffixes apply:
# "_x" marks the retro (left) value and "_y" the prod (right) value.
merged = pd.merge(temp_retro, temp_prod, on="acctno")
print(merged.shape)

(188221, 11)


In [14]:
# flag is 0 where lexid and o_lexid compare equal and 1 otherwise.
lexid_matches = merged["lexid"] == merged["o_lexid"]
merged["flag"] = np.where(lexid_matches, 0, 1)
freq(merged["flag"])

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
flag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,188221.0,1.0,188221.0,1.0


In [15]:
# For each sample attribute, store "<attr>_x" minus "<attr>_y" in "<attr>_diff",
# unless a side is negative, in which case a sentinel code is stored instead:
#   -999_999_999  both sides negative
#   -999_999_998  only the "_y" side negative
#   -999_999_997  only the "_x" side negative
# Rows matching none of the three conditions (including rows with a NaN on either side)
# fall through to the plain difference.
for attr in samples:
    x_side = merged[attr + "_x"]
    y_side = merged[attr + "_y"]
    merged[attr + "_diff"] = np.select(
        [
            (x_side < 0) & (y_side < 0),
            (x_side >= 0) & (y_side < 0),
            (x_side < 0) & (y_side >= 0),
        ],
        [-999_999_999, -999_999_998, -999_999_997],
        default=x_side - y_side,
    )

In [16]:
# Example rows for pl_srcoldmsncev: the first 25 rows with a difference of exactly 3
# and the last 25 rows with a difference above 50, written to one CSV.
srcold_diff = merged["pl_srcoldmsncev_diff"]
temp = merged[srcold_diff == 3].head(25)
temp_2 = merged[srcold_diff > 50].tail(25)

examples = pd.concat([temp, temp_2], ignore_index=True)
examples.to_csv("./_temp/example_for_engineering_1222.csv", index=False)

In [20]:
# Bucket edges for the difference; the three sentinel codes are passed separately as exceptions.
diff_cuts = [-np.inf, -51, -21, -16, -11, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 10, 15, 20, 50, np.inf]
diff_exceptions = [-999_999_999, -999_999_998, -999_999_997]
fmt_2 = make_format(cuts=diff_cuts, exceptions=diff_exceptions)

freq('pl_srcoldmsncev_diff', format=fmt_2, df=merged, observed=True)

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
pl_srcoldmsncev_diff,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
<= -51,4.0,2.1e-05,4.0,2.1e-05
-50--21,6.0,3.2e-05,10.0,5.3e-05
-15--11,1.0,5e-06,11.0,5.8e-05
-10--6,6.0,3.2e-05,17.0,9e-05
-4,1.0,5e-06,18.0,9.6e-05
-3,2.0,1.1e-05,20.0,0.000106
-2,1.0,5e-06,21.0,0.000112
-1,1.0,5e-06,22.0,0.000117
0,2.0,1.1e-05,24.0,0.000128
1,5.0,2.7e-05,29.0,0.000154
