In [2]:
import time
import pandas as pd
import numpy as np
import math

# internal tools
from dsgtools.reporting import make_format
from dsgtools.reporting import TableWriter
from dsgtools.reporting import freq
from dsgtools.reporting import bivariate
from dsgtools import azure

### collection score

In [2]:
# Load the weighted collection sample and report its dimensions.
# NOTE(review): read_adls is not a stock pandas reader; presumably it is
# registered by the dsgtools azure import above — confirm.
COLLECTION_SAMPLE_PATH = 'Analytics/RnD Projects/Product RnD/Credit/202311-CRD-Model-Revalidations/Formatted_Files/collection_samples_weighted_1117.parquet'
collection = pd.read_adls(COLLECTION_SAMPLE_PATH)
print(collection.shape)

(711654, 95)


In [3]:
# unique_id is the join key used further down; confirm it has no duplicates.
collection["unique_id"].is_unique

True

In [4]:
# Load the model-build dataset, print its full shape, then keep only the join
# key plus the demographic and payment-score/reason-code columns needed later.
STACKED_FINAL_PATH = "Analytics/RnD Projects/Product RnD/Credit/Payment_Collection_Score_202306/QB_11843/redo_20230925/parquet_data/merged_data_for_model_build_0927.parquet"
stacked_final = pd.read_adls(STACKED_FINAL_PATH)
print(stacked_final.shape)

stacked_keep_cols = [
    # join key and demographic attributes
    "unique_id",
    "age",
    "senior",
    "gender",
    "cfpb_race_estimate_80",
    "lexid",
    # r_rv_* payment score and reason codes
    "r_rv_score_payment",
    "r_rv_payment_reason",
    "r_rv_payment_reason2",
    "r_rv_payment_reason3",
    "r_rv_payment_reason4",
    "r_rv_payment_reason5",
    # rv_* payment score and reason codes
    "rv_score_payment",
    "rv_payment_reason",
    "rv_payment_reason2",
    "rv_payment_reason3",
    "rv_payment_reason4",
    "rv_payment_reason5",
]
stacked_final = stacked_final[stacked_keep_cols]

(6314179, 408)


In [5]:
# Attach the columns selected from stacked_final to the collection sample.
# This is an inner join on unique_id: keys missing from stacked_final would
# drop rows and duplicate keys in stacked_final would multiply them. The
# row-count check makes either case fail loudly instead of relying on a
# visual comparison of the two printed shapes.
rows_before_merge = len(collection)
print(collection.shape)
collection_merged = collection.merge(stacked_final, on=["unique_id"], how="inner")
assert len(collection_merged) == rows_before_merge, (
    f"merge on unique_id changed the row count: "
    f"{rows_before_merge} -> {len(collection_merged)}"
)
# Rebind only after the check passes so a failed merge leaves `collection` intact.
collection = collection_merged
print(collection.shape)

(711654, 95)
(711654, 112)


In [6]:
# Distribution of the distinct sample-weight values.
collection["Std_weight"].value_counts()

4.116161     236490
1.000000     225164
2.000000     198063
42.511337     51937
Name: Std_weight, dtype: int64

In [7]:
# Record counts per customer in the merged collection sample.
freq(collection["Std_customer"])

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
Std_customer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Pendrick_Capital_Partners,250000.0,0.351294,250000.0,0.351294
Penn_Credit_Corp,250000.0,0.351294,500000.0,0.702589
Phillips_and_Cohen,211654.0,0.297411,711654.0,1.0


In [8]:
# Persist the merged collection sample for the existing-score revalidation.
COLLECTION_OUTPUT_PATH = "Analytics/RnD Projects/Product RnD/Credit/202311-CRD-Model-Revalidations/Formatted_Files/Existing_score/collection/collection.parquet"
collection.to_adls(COLLECTION_OUTPUT_PATH)

### IAG10

In [15]:
# Load the IMOC model-ready attribute file and report its dimensions.
IMOC_ATTRIBUTES_PATH = "Analytics/RnD Projects/Product RnD/Credit/IMOC/LexisNexis-proj 11882/processed/20231115_model_ready_attributes_v3.parquet"
imoc = pd.read_adls(IMOC_ATTRIBUTES_PATH)
print(imoc.shape)

(2000000, 507)


In [16]:
# Combine the per-client synthetic tags into a single flag:
#   1.0 -> at least one tag column equals 1
#   0.0 -> no tag equals 1, but at least one tag is populated
#   NaN -> every tag column is missing for the row
tags_synth = [
    "tag_citi_synthetic",
    "tag_kohls_synth",
    "tag_synch_syn_tag",
    "tag_usba_fprc_flag",
]
synth_tag_values = imoc[tags_synth]
# Build the column as float in one step. Writing NaN into an int column through
# .loc relies on silent upcasting, which newer pandas versions deprecate, and it
# leaves the dtype dependent on whether any all-missing row happens to exist.
imoc["tag_synth"] = (
    synth_tag_values.eq(1)
    .any(axis="columns")
    .astype("float")
    .mask(synth_tag_values.isnull().all(axis="columns"))
)
# Weighted distribution first, then the unweighted one.
display("tag_synth", freq("tag_synth", df=imoc, sample_weight="std_weight"))
display("tag_synth", freq("tag_synth", df=imoc))

'tag_synth'

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
tag_synth,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.0,17432130.0,0.861671,17432130.0,0.861671
1.0,61586.0,0.003044,17493710.0,0.864715
Missing,2736887.0,0.135285,20230600.0,1.0


'tag_synth'

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
tag_synth,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.0,1536945.0,0.768473,1536945.0,0.768473
1.0,61586.0,0.030793,1598531.0,0.799265
Missing,401469.0,0.200735,2000000.0,1.0


In [17]:
# Joint distribution of the combined synthetic flag and the IMOC tag.
freq(
    "tag_synth",
    "tag_imoc",
    df=imoc,
    cross=False,
    observed=True,
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Count,Pct,Cuml Count,Cuml Pct
tag_synth,tag_imoc,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Missing,0.0,363715.0,0.181858,363715.0,0.181858
Missing,Missing,1557.0,0.000779,365272.0,0.182636
Missing,1.0,36197.0,0.018099,401469.0,0.200734
0.0,0.0,1386017.0,0.693009,1787486.0,0.893743
0.0,Missing,101880.0,0.05094,1889366.0,0.944683
0.0,1.0,49048.0,0.024524,1938414.0,0.969207
1.0,Missing,36610.0,0.018305,1975024.0,0.987512
1.0,1.0,24976.0,0.012488,2000000.0,1.0


In [18]:
# Final target: the row-wise maximum of the two tags. The max skips missing
# values by default, so a row with one populated tag takes that tag's value and
# a row where both tags are missing stays NaN without a separate guard.
tag_pair = imoc[["tag_synth", "tag_imoc"]]
imoc["imoc_synth_final"] = tag_pair.max(axis=1)
freq(imoc["imoc_synth_final"])

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
imoc_synth_final,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.0,1851612.0,0.925806,1851612.0,0.925806
1.0,146831.0,0.073415,1998443.0,0.999221
Missing,1557.0,0.000779,2000000.0,1.0


In [19]:
# Optional check, left disabled: frequency of the first six characters of
# Input_historydate (presumably a YYYYMM prefix — confirm), useful for choosing
# the cut-off applied in the filter that follows.
# freq(imoc.Input_historydate.astype(str).str.slice(0, 6))

In [20]:
# Keep rows whose history-date prefix (first six characters, read as a number)
# is 201908 or later and whose final target is populated.
history_prefix = imoc["Input_historydate"].astype(str).str[:6].astype(float)
on_or_after_cutoff = history_prefix >= 201908
target_present = imoc["imoc_synth_final"].notnull()
imoc = imoc[on_or_after_cutoff & target_present]
print(imoc.shape)

(1810630, 509)


In [21]:
# Weighted bivariate summary of the final target by client.
bivariate(
    imoc["client"],
    imoc["imoc_synth_final"],
    sample_weight=imoc["std_weight"],
)

tag,imoc_synth_final,imoc_synth_final,imoc_synth_final,imoc_synth_final,imoc_synth_final,imoc_synth_final
stats,N,PctN,Sum,Mean,WoE,IV
client,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
alliance,2717085.0,0.146264,36197.0,0.013322,0.621163,0.077752
citi,8839260.0,0.47583,48872.0,0.005529,-0.266119,0.029637
kohls,1472367.0,0.07926,12327.0,0.008372,0.151663,0.001966
synch,5399607.0,0.290668,36100.0,0.006686,-0.07499,0.001576
usba,148205.7,0.007978,302.0,0.002038,-1.267799,0.007306
Missing,0.0,0.0,0.0,0.0,0.0,0.0
Total,18576520.0,1.0,133798.0,0.007203,0.0,0.118236


In [22]:
# Columns carried into the IAG10 existing-score file.
# NOTE(review): the list includes raw identity inputs (e.g. Input_SSN,
# Input_DateOfBirth); confirm the destination is approved for them.
keep = [
    # keys, targets and weight
    "p_inpacct", "imoc_synth_final", "tag_synth", "tag_imoc", "std_weight",
    # segmentation
    "client", "innovis_segment",
    # coim and its reason codes
    "coim",
    "coim_reasoncode1", "coim_reasoncode2", "coim_reasoncode3",
    "coim_reasoncode4", "coim_reasoncode5",
    # demographic attributes
    "age_category", "gender", "cfpb_race_estimate_80",
    # original input fields
    "Input_FirstName", "Input_MiddleName", "Input_LastName",
    "Input_StreetAddress", "Input_City", "Input_State", "Input_Zip",
    "Input_HomePhone", "Input_SSN",
    "Input_DateOfBirth", "Input_WorkPhone", "Input_income",
    "Input_DLNumber", "Input_DLState", "Input_BALANCE",
    "Input_CHARGEOFFD", "Input_FormerName",
    "Input_EMAIL", "Input_employername", "Input_historydate",
    "Input_IPAddress",
    # standardised bookkeeping fields
    "Std_customer", "Std_quickbase_project", "Std_account_on_file",
]
# Select the kept columns and drop the "alliance" client in a single step.
from_alliance = imoc["client"].isin(["alliance"])
imoc = imoc.loc[~from_alliance, keep]
print(imoc.shape)
IMOC_OUTPUT_PATH = "Analytics/RnD Projects/Product RnD/Credit/202311-CRD-Model-Revalidations/Formatted_Files/Existing_score/IAG10/imoc_201908p_w_perf_1204_w_synth.parquet"
imoc.to_adls(IMOC_OUTPUT_PATH, overwrite=True)

(1410718, 40)
