In [1]:
import time
import pandas as pd
import numpy as np
import math

# internal tools
from dsgtools.reporting import make_format
from dsgtools.reporting import TableWriter
from dsgtools.reporting import freq
from dsgtools.reporting import bivariate

In [2]:
# All inputs and outputs live under one shared "Formatted_Files/" folder.
_formatted_root = "Analytics/RnD Projects/Product RnD/Credit/202311-CRD-Model-Revalidations/Formatted_Files/"

# Files read via full_path, and the folder their converted versions are saved to.
full_path = _formatted_root + "Full_sample_formatted_w_flags/ready_to_be_formatted/"
full_path_save = _formatted_root + "Full_Converted/"

# Files read via cleaned_path (the root folder itself), and their save folder.
cleaned_path = _formatted_root
cleaned_path_save = _formatted_root + "ReVal_Converted/"

### Convert

In [3]:
def reformat(column, input_df):
    """Convert one client's raw columns into the standard field layout.

    Relies on two module-level names: ``format_tbl`` (one row per standard
    field in "Field Name", plus one mapping column per client) and ``output``
    (the DataFrame that receives the converted columns and is returned).
    Fields already present in ``output`` are skipped, so callers must reset
    ``output = pd.DataFrame()`` before every call.

    Mapping rules, applied to each "Field Name" not yet in ``output``:

    * empty (NaN) mapping -> the column is filled with NaN;
    * field name contains "_name", or is Std_customer / Std_quickbase_project /
      Std_industry -> the mapping text is written as a constant;
    * mapping contains "pycode: a + b" -> the listed ``input_df`` columns are
      joined per row with ", ", skipping nulls;
    * anything else -> the mapping is an ``input_df`` column name and that
      column is copied.

    Parameters
    ----------
    column : str
        Name of the ``format_tbl`` column holding this client's mapping.
    input_df : pandas.DataFrame
        Raw client data. ``Input_Account`` is built from ``index + 1``, so the
        index must support integer addition.

    Returns
    -------
    pandas.DataFrame
        The module-level ``output`` frame, with one column per mapped field.

    Raises
    ------
    KeyError
        If a mapping names a column that ``input_df`` does not have.
    IndexError
        If a required field has no row in ``format_tbl``.
    """
    def _mapping(field):
        # First format_tbl entry for `field` in this client's column.
        return format_tbl.loc[format_tbl["Field Name"] == field, column].values[0]

    literal_fields = ("Std_customer", "Std_quickbase_project", "Std_industry")
    pycode_tag = "pycode: "

    customer = _mapping("Std_customer")

    # The first assignment also gives the (empty) output frame input_df's index.
    output["Std_account_on_file"] = input_df[_mapping("Std_account_on_file")]
    output["Input_Account"] = customer + "_" + column + "_" + (output.index + 1).astype(str)
    output["Std_sample_indicator"] = (
        _mapping("Std_industry") + "_" + customer + "_" + _mapping("Std_quickbase_project")
    )

    for field in format_tbl["Field Name"].to_list():
        if field in output.columns:
            continue

        value = _mapping(field)
        if pd.isna(value) or "_name" in field or field in literal_fields:
            # No mapping (NaN) or a constant: broadcast the value to every row.
            output[field] = value
        elif pycode_tag in value:
            # "pycode: a + b": take the text after the tag and split it into
            # column names. The original re-split the list returned by
            # split(": "), which raised AttributeError.
            source_cols = value.split(pycode_tag, 1)[1].split(" + ")
            output[field] = input_df[source_cols].apply(
                lambda row: ", ".join(row.dropna().astype(str).values), axis=1
            )
        else:
            output[field] = input_df[value]
    return output

In [4]:
# Load the field-mapping sheet that reformat() looks values up in; every
# column is read with dtype=str.
format_tbl = pd.read_excel(
    "./doc/reformat_data.xlsx",
    sheet_name="Revalidation_2023",
    verbose=False,
    dtype=str,
)
print(format_tbl.shape)

(94, 16)


In [5]:
# Show the mapping table's column labels: "Field Name" plus the per-client
# columns whose names are passed to reformat() as `column`.
format_tbl.columns

Index(['Field Name', 'Required', 'Python Data Type', 'PNC_11414',
       'TMobile_10869', 'AFF_10698', 'Kohl_11696', 'OneMain_11012',
       'Elevate_11167', 'AllyMetro2', 'GLSMetro2', 'DiscoverMetro2',
       'Toyota 11056', 'Extra 11103', 'Uprova 10907', 'Description'],
      dtype='object')

In [6]:
from dsgtools import azure

### PNC

In [7]:
# PNC, file under full_path: load, convert to the standard layout, save.
output = pd.DataFrame()  # reformat() writes into this module-level frame; start empty
pnc = pd.read_adls(f"{full_path}pnc_11414_full_file.parquet")
pnc["weight"] = 1  # constant weight of 1 on every row
print(pnc.shape)

# Convert, then keep the columns in format_tbl's "Field Name" order.
pnc_input_fmt = reformat("PNC_11414", pnc).loc[:, format_tbl["Field Name"].to_list()]
print(pnc_input_fmt.shape)

# NOTE(review): full_path_save already ends with "/", so this target contains "//" - confirm intended.
pnc_input_fmt.to_adls(f"{full_path_save}/pnc_input_fmt.parquet", overwrite=True)

(1857583, 866)
(1857583, 94)


In [8]:
# PNC, file under cleaned_path: load, convert to the standard layout, save.
output = pd.DataFrame()  # reformat() writes into this module-level frame; start empty
pnc = pd.read_adls(f"{cleaned_path}pnc_250K_11_17.parquet")
print(pnc.shape)

# Convert, then keep the columns in format_tbl's "Field Name" order.
pnc_input_fmt = reformat("PNC_11414", pnc).loc[:, format_tbl["Field Name"].to_list()]
print(pnc_input_fmt.shape)

# NOTE(review): cleaned_path_save already ends with "/", so this target contains "//" - confirm intended.
pnc_input_fmt.to_adls(f"{cleaned_path_save}/pnc_cleaned_input_fmt.parquet", overwrite=True)

(250000, 866)
(250000, 94)


### TMobile

In [14]:
# TMobile, file under full_path: load, convert to the standard layout, save.
output = pd.DataFrame()  # reformat() writes into this module-level frame; start empty
tmobile = pd.read_adls(f"{full_path}tmobile_10869_full_file.parquet")
tmobile["weight"] = 1  # constant weight of 1 on every row
print(tmobile.shape)

# Convert, then keep the columns in format_tbl's "Field Name" order.
tmobile_input_fmt = reformat("TMobile_10869", tmobile).loc[:, format_tbl["Field Name"].to_list()]
print(tmobile_input_fmt.shape)

# NOTE(review): full_path_save already ends with "/", so this target contains "//" - confirm intended.
tmobile_input_fmt.to_adls(f"{full_path_save}/tmobile_full_input_fmt.parquet", overwrite=True)

(3516579, 150)
(3516579, 94)


In [16]:
# TMobile, file under cleaned_path: load, convert to the standard layout, save.
output = pd.DataFrame()  # reformat() writes into this module-level frame; start empty
tmobile = pd.read_adls(f"{cleaned_path}tmobile_250K_11_20.parquet")
print(tmobile.shape)

# Convert, then keep the columns in format_tbl's "Field Name" order.
tmobile_input_fmt = reformat("TMobile_10869", tmobile).loc[:, format_tbl["Field Name"].to_list()]
print(tmobile_input_fmt.shape)

# NOTE(review): cleaned_path_save already ends with "/", so this target contains "//" - confirm intended.
tmobile_input_fmt.to_adls(f"{cleaned_path_save}/tmobile_cleaned_input_fmt.parquet", overwrite=True)

(250000, 150)
(250000, 94)


### AFF

In [17]:
# AFF, file under full_path: load, convert to the standard layout, save.
output = pd.DataFrame()  # reformat() writes into this module-level frame; start empty
aff = pd.read_adls(f"{full_path}aff_10698_full_file.parquet")
aff["weight"] = 1  # constant weight of 1 on every row
print(aff.shape)

# Convert, then keep the columns in format_tbl's "Field Name" order.
aff_input_fmt = reformat("AFF_10698", aff).loc[:, format_tbl["Field Name"].to_list()]
print(aff_input_fmt.shape)

# NOTE(review): full_path_save already ends with "/", so this target contains "//" - confirm intended.
aff_input_fmt.to_adls(f"{full_path_save}/aff_full_input_fmt.parquet", overwrite=True)

(249833, 26)
(249833, 94)


In [20]:
# AFF, file under cleaned_path: load, convert to the standard layout, save.
output = pd.DataFrame()  # reformat() writes into this module-level frame; start empty
aff = pd.read_adls(f"{cleaned_path}aff_cleaned_11_20.parquet")
print(aff.shape)

# Convert, then keep the columns in format_tbl's "Field Name" order.
aff_input_fmt = reformat("AFF_10698", aff).loc[:, format_tbl["Field Name"].to_list()]
print(aff_input_fmt.shape)

# Unlike the other save calls in this notebook, no extra "/" is prepended to the file name here.
aff_input_fmt.to_adls(f"{cleaned_path_save}aff_cleaned_input_fmt.parquet", overwrite=True)

(198310, 25)
(198310, 94)


### Kohl

In [25]:
# Kohl's, file under full_path: load, convert to the standard layout, save.
output = pd.DataFrame()  # reformat() writes into this module-level frame; start empty
kohl = pd.read_adls(f"{full_path}kohls_11696_full_file.parquet")
kohl["weight"] = 1  # constant weight of 1 on every row
print(kohl.shape)

# Convert, then keep the columns in format_tbl's "Field Name" order.
kohl_input_fmt = reformat("Kohl_11696", kohl).loc[:, format_tbl["Field Name"].to_list()]
print(kohl_input_fmt.shape)

# NOTE(review): full_path_save already ends with "/", so this target contains "//" - confirm intended.
kohl_input_fmt.to_adls(f"{full_path_save}/kohl_full_input_fmt.parquet", overwrite=True)

(4652190, 73)
(4652190, 94)


In [26]:
# Kohl's, file under cleaned_path: load, convert to the standard layout, save.
output = pd.DataFrame()  # reformat() writes into this module-level frame; start empty
kohl = pd.read_adls(f"{cleaned_path}kohls_250K_11_21.parquet")
print(kohl.shape)

# Convert, then keep the columns in format_tbl's "Field Name" order.
kohl_input_fmt = reformat("Kohl_11696", kohl).loc[:, format_tbl["Field Name"].to_list()]
print(kohl_input_fmt.shape)

# NOTE(review): cleaned_path_save already ends with "/", so this target contains "//" - confirm intended.
kohl_input_fmt.to_adls(f"{cleaned_path_save}/kohl_cleaned_input_fmt.parquet", overwrite=True)

(250000, 73)
(250000, 94)


### OneMain

In [27]:
# OneMain, file under full_path: load, convert to the standard layout, save.
output = pd.DataFrame()  # reformat() writes into this module-level frame; start empty
onemain = pd.read_adls(f"{full_path}OneMain_11012_full_file.parquet")
print(onemain.shape)

# Convert, then keep the columns in format_tbl's "Field Name" order.
onemain_input_fmt = reformat("OneMain_11012", onemain).loc[:, format_tbl["Field Name"].to_list()]
print(onemain_input_fmt.shape)

# NOTE(review): full_path_save already ends with "/", so this target contains "//" - confirm intended.
onemain_input_fmt.to_adls(f"{full_path_save}/onemain_full_input_fmt.parquet", overwrite=True)

(1574646, 38)
(1574646, 94)


In [28]:
# OneMain, file under cleaned_path: load, convert to the standard layout, save.
output = pd.DataFrame()  # reformat() writes into this module-level frame; start empty
onemain = pd.read_adls(f"{cleaned_path}OneMain_11_21.parquet")
print(onemain.shape)

# Convert, then keep the columns in format_tbl's "Field Name" order.
onemain_input_fmt = reformat("OneMain_11012", onemain).loc[:, format_tbl["Field Name"].to_list()]
print(onemain_input_fmt.shape)

# NOTE(review): cleaned_path_save already ends with "/", so this target contains "//" - confirm intended.
onemain_input_fmt.to_adls(f"{cleaned_path_save}/onemain_cleaned_input_fmt.parquet", overwrite=True)

(194187, 38)
(194187, 94)


### Elevate

In [6]:
from dsgtools import azure

In [7]:
# Elevate, file under full_path: load, convert to the standard layout, save.
output = pd.DataFrame()  # reformat() writes into this module-level frame; start empty
elevate = pd.read_adls(f"{full_path}elevate_11167_full_file.parquet")
print(elevate.shape)

# Convert, then keep the columns in format_tbl's "Field Name" order.
elevate_input_fmt = reformat("Elevate_11167", elevate).loc[:, format_tbl["Field Name"].to_list()]
print(elevate_input_fmt.shape)

# NOTE(review): full_path_save already ends with "/", so this target contains "//" - confirm intended.
elevate_input_fmt.to_adls(f"{full_path_save}/elevate_full_input_fmt.parquet", overwrite=True)

(379951, 43)
(379951, 94)


In [8]:
# Elevate, file under cleaned_path: load, convert to the standard layout, save.
output = pd.DataFrame()  # reformat() writes into this module-level frame; start empty
elevate = pd.read_adls(f"{cleaned_path}elevate_11_21.parquet")
print(elevate.shape)

# Convert, then keep the columns in format_tbl's "Field Name" order.
elevate_input_fmt = reformat("Elevate_11167", elevate).loc[:, format_tbl["Field Name"].to_list()]
print(elevate_input_fmt.shape)

# NOTE(review): cleaned_path_save already ends with "/", so this target contains "//" - confirm intended.
elevate_input_fmt.to_adls(f"{cleaned_path_save}/elevate_cleaned_input_fmt.parquet", overwrite=True)

(131691, 43)
(131691, 94)


In [12]:
# Row counts per first six characters of Input_historydate (YYYYMM-style
# values in the output below).
elevate_input_fmt["Input_historydate"].str[:6].value_counts()

202108    20035
202107    19019
202106    16329
202109    11892
202111    11099
202110    10345
202105    10048
202112     8360
202201     6050
202104     4373
202101     4359
202202     3919
202102     3249
202103     2614
Name: Input_historydate, dtype: int64