In [1]:
import os
# Blank out both proxy variables for this process.
# NOTE(review): presumably so the ADLS reads/writes below bypass the proxy — confirm.
for proxy_var in ("HTTPS_PROXY", "HTTP_PROXY"):
    os.environ[proxy_var] = ""

In [2]:
import time
import pandas as pd
import numpy as np
import math

# internal tools
from dsgtools.reporting import make_format
from dsgtools.reporting import TableWriter
from dsgtools.reporting import freq
from dsgtools.reporting import bivariate

import warnings
warnings.filterwarnings('ignore')

In [3]:
## Formatted input
# NOTE(review): `azure` is never referenced by name; presumably importing it is
# what makes pd.read_adls / DataFrame.to_adls available — confirm.
from dsgtools import azure

input_path = "Analytics/RnD Projects/Product RnD/Credit/Payment_Collection_Score_202306/QB_11843/input_data.parquet"
input_df = pd.read_adls(input_path)
# Show (rows, columns) of the loaded frame.
print(input_df.shape)

(6803471, 96)


In [4]:
# Rolled up data: load the merged model-build file, then reduce it to the
# unique_id keys of the three customers used in this notebook.
path = "Analytics/RnD Projects/Product RnD/Credit/Payment_Collection_Score_202306/QB_11843/redo_20230925/parquet_data/merged_data_for_model_build_0927.parquet"
rolled_up = pd.read_adls(path)
print(rolled_up.shape)

kept_customers = ['Pendrick_Capital_Partners', 'Penn_Credit_Corp', 'Phillips_and_Cohen']
is_kept_customer = rolled_up["std_customer"].isin(kept_customers)
# Only the key column is needed downstream (it is used as a merge key).
rolled_up = rolled_up.loc[is_kept_customer, ["unique_id"]]
print(rolled_up.shape)

(6314179, 408)
(3814887, 1)


In [5]:
# Keep only the input rows whose unique_id is present in rolled_up. The inner
# merge is used purely as a row filter, so validate="many_to_one" makes pandas
# raise MergeError instead of silently multiplying input rows if unique_id is
# ever duplicated on the rolled_up side.
input_df = input_df.merge(rolled_up, on="unique_id", how="inner", validate="many_to_one")
print(input_df.shape)
# Std_perf1 summary (N, Sum, Mean, WoE, IV) by Std_customer on the filtered data.
bivariate(input_df["Std_customer"], input_df["Std_perf1"])


(3814887, 96)


tag,Std_perf1,Std_perf1,Std_perf1,Std_perf1,Std_perf1,Std_perf1
stats,N,PctN,Sum,Mean,WoE,IV
Std_customer,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Pendrick_Capital_Partners,986941.0,0.258708,13510.0,0.013689,-2.180089,0.553991
Penn_Credit_Corp,2616292.0,0.685811,395521.0,0.151176,0.371902,0.109505
Phillips_and_Cohen,211654.0,0.055481,8160.0,0.038553,-1.119085,0.045135
Missing,0.0,0.0,0.0,0.0,0.0,0.0
Total,3814887.0,1.0,417191.0,0.109359,0.0,0.708632


In [6]:
# bivariate(input_df.Input_historydate.str.slice(0, 6), input_df.Std_perf1, groups = input_df.Std_customer)

In [7]:
# Remove the sufficient_input column and show the resulting shape.
# NOTE: re-running this cell raises KeyError, because the column is already gone.
input_df = input_df.drop('sufficient_input', axis=1)
print(input_df.shape)

(3814887, 95)


In [8]:
# Start every row at weight 1; the downsampled groups get larger weights later.
# Bracket assignment is used on purpose: attribute-style assignment
# (input_df.Std_weight = 1) only sets a plain Python attribute when the column
# does not already exist, and the warning pandas emits for that is suppressed
# by the global warnings filter in this notebook.
input_df["Std_weight"] = 1

### Pendrick_Capital_Partners (PCP)

In [9]:
# Split the Pendrick_Capital_Partners rows by Std_perf1 (1 vs 0) into
# independent copies, then show the size of each part.
is_pcp = input_df["Std_customer"] == "Pendrick_Capital_Partners"
PCP_1 = input_df.loc[is_pcp & (input_df["Std_perf1"] == 1)].copy()
PCP_0 = input_df.loc[is_pcp & (input_df["Std_perf1"] == 0)].copy()
print(PCP_1.shape)
print(PCP_0.shape)

(13510, 95)
(973431, 95)


In [10]:
# Downsample the Std_perf1 == 0 rows so that, together with every
# Std_perf1 == 1 row, the customer contributes 250,000 rows in total.
pcp_zero_quota = 250_000 - PCP_1.shape[0]
PCP_0_sample = PCP_0.sample(n=pcp_zero_quota, random_state=1)

# Each sampled row stands in for (population size / sample size) original rows.
# Bracket assignment guarantees this lands in the Std_weight column; attribute
# assignment would silently set a plain attribute if the column were missing.
PCP_0_sample["Std_weight"] = PCP_0.shape[0] / pcp_zero_quota
print(PCP_0_sample["Std_weight"].unique())
print(PCP_0_sample.shape)

[4.11616136]
(236490, 95)


In [11]:
# Recombine all Std_perf1 == 1 rows with the downsampled Std_perf1 == 0 rows,
# renumbering the index from 0.
pcp_parts = [PCP_1, PCP_0_sample]
PCP = pd.concat(pcp_parts, ignore_index=True)
print(PCP.shape)

(250000, 95)


### Penn_Credit_Corp

In [12]:
# Split the Penn_Credit_Corp rows by Std_perf1 (1 vs 0) into independent
# copies, then show the size of each part.
is_pcc = input_df["Std_customer"] == "Penn_Credit_Corp"
PCC_1 = input_df.loc[is_pcc & (input_df["Std_perf1"] == 1)].copy()
PCC_0 = input_df.loc[is_pcc & (input_df["Std_perf1"] == 0)].copy()
print(PCC_1.shape)
print(PCC_0.shape)

(395521, 95)
(2220771, 95)


In [13]:
# Downsample the Std_perf1 == 0 rows so that, together with every
# Std_perf1 == 1 row, the customer contributes 500,000 rows at this stage.
pcc_zero_quota = 500_000 - PCC_1.shape[0]
PCC_0_sample = PCC_0.sample(n=pcc_zero_quota, random_state=1)

# Each sampled row stands in for (population size / sample size) original rows.
# Bracket assignment guarantees this lands in the Std_weight column; attribute
# assignment would silently set a plain attribute if the column were missing.
PCC_0_sample["Std_weight"] = PCC_0.shape[0] / pcc_zero_quota
print(PCC_0_sample["Std_weight"].unique())
print(PCC_0_sample.shape)

[21.2556686]
(104479, 95)


In [14]:
# Recombine all Std_perf1 == 1 rows with the downsampled Std_perf1 == 0 rows,
# renumbering the index from 0, then tabulate Std_perf1 in the result.
pcc_parts = [PCC_1, PCC_0_sample]
PCC = pd.concat(pcc_parts, ignore_index=True)
print(PCC.shape)
freq(PCC["Std_perf1"])

(500000, 95)


Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
Std_perf1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,104479.0,0.208958,104479.0,0.208958
1,395521.0,0.791042,500000.0,1.0


In [15]:
# Draw a simple random subsample of the 500,000-row PCC frame (not stratified
# by Std_perf1), then tabulate Std_perf1 in the subsample.
pcc_keep_rows = 250_000
PCC_sample = PCC.sample(n=pcc_keep_rows, random_state=1)
freq(PCC_sample["Std_perf1"])

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
Std_perf1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,51937.0,0.207748,51937.0,0.207748
1,198063.0,0.792252,250000.0,1.0


In [16]:
# Scale the weights up by the subsampling factor len(PCC) / len(PCC_sample)
# (500,000 / 250,000 = 2 here) so the subsample still represents the full frame.
# The base weights are re-read from PCC (PCC_sample keeps PCC's index labels)
# instead of from PCC_sample itself, so re-running this cell cannot compound
# the factor into 4x, 8x, ... weights.
# NOTE(review): the subsample is not stratified by Std_perf1, so weighted
# totals only approximate the population counts; per-class rescaling would
# make them exact — confirm whether that matters downstream.
pcc_subsample_factor = len(PCC) / len(PCC_sample)
PCC_sample["Std_weight"] = pcc_subsample_factor * PCC.loc[PCC_sample.index, "Std_weight"]

In [17]:
# List the distinct weight values present after rescaling.
PCC_sample["Std_weight"].unique()

array([ 2.        , 42.51133721])

### Phillips_and_Cohen

In [18]:
# Take every Phillips_and_Cohen row as an independent copy; this customer is
# not downsampled in this notebook.
is_pac = input_df["Std_customer"] == "Phillips_and_Cohen"
PAC = input_df.loc[is_pac].copy()
print(PAC.shape)

(211654, 95)


### Stack the three customer samples

In [19]:
# Stack the three per-customer frames into one, renumbering the index from 0.
customer_samples = [PCP, PCC_sample, PAC]
sample = pd.concat(customer_samples, ignore_index=True)
print(sample.shape)

(711654, 95)


In [20]:
# Unweighted Std_perf1 summary by customer: reflects the raw row counts of the
# stacked sample.
bivariate(sample["Std_customer"], sample["Std_perf1"])

tag,Std_perf1,Std_perf1,Std_perf1,Std_perf1,Std_perf1,Std_perf1
stats,N,PctN,Sum,Mean,WoE,IV
Std_customer,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Pendrick_Capital_Partners,250000.0,0.351294,13510.0,0.05404,-2.056572,0.862246
Penn_Credit_Corp,250000.0,0.351294,198063.0,0.792252,2.144457,1.706556
Phillips_and_Cohen,211654.0,0.297411,8160.0,0.038553,-2.410489,0.907635
Missing,0.0,0.0,0.0,0.0,0.0,0.0
Total,711654.0,1.0,219733.0,0.308764,0.0,3.476437


In [21]:
# Same summary with Std_weight applied as the sample weight, for comparison
# against the summary computed on the full filtered input earlier.
bivariate(sample["Std_customer"], sample["Std_perf1"], sample_weight=sample["Std_weight"])

tag,Std_perf1,Std_perf1,Std_perf1,Std_perf1,Std_perf1,Std_perf1
stats,N,PctN,Sum,Mean,WoE,IV
Std_customer,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Pendrick_Capital_Partners,986941.0,0.259542,13510.0,0.013689,-2.18533,0.557804
Penn_Credit_Corp,2604037.0,0.684799,396126.0,0.15212,0.373997,0.110642
Phillips_and_Cohen,211654.0,0.05566,8160.0,0.038553,-1.124326,0.045634
Missing,0.0,0.0,0.0,0.0,0.0,0.0
Total,3802632.0,1.0,417796.0,0.10987,0.0,0.714081


In [22]:
# Persist the stacked, weighted sample; overwrite=True is passed, so an
# existing file at this path is presumably replaced — confirm.
output_path = "Analytics/RnD Projects/Product RnD/Credit/202311-CRD-Model-Revalidations/Formatted_Files/collection_samples_weighted_1117.parquet"
sample.to_adls(output_path, overwrite=True)

In [23]:
# Final check: (rows, columns) of the stacked sample frame.
sample.shape

(711654, 95)