In [1]:
import os
# Blank out both proxy environment variables for this process.
# NOTE(review): presumably so the storage calls below bypass a configured proxy — confirm.
for proxy_var in ("HTTPS_PROXY", "HTTP_PROXY"):
    os.environ[proxy_var] = ""

In [2]:
import time
import pandas as pd
import numpy as np
import math

# internal tools
from dsgtools.reporting import make_format
from dsgtools.reporting import TableWriter
from dsgtools.reporting import freq
from dsgtools.reporting import bivariate

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides any pandas warnings raised by later cells
# (e.g. on column assignment after row filtering) — confirm that is intended.
warnings.filterwarnings('ignore')

In [3]:
# Location of the customer-supplied CSV, written as folder pieces + file name.
path = (
    "Analytics/Customer Projects/American First Finance-proj 10698 (RVS Custom)/"
    "File from the Customer/"
    "LN_custom_credit_model_file_2021-12-10.csv"
)

In [4]:
# Load the customer CSV from `path`, reading every column as text (dtype=str)
# with ISO-8859-1 decoding, then report the row/column count.
# NOTE(review): `azure` is never referenced by name; presumably importing it is
# what makes `pd.read_adls` available — confirm before removing the import.
from dsgtools import azure
data = pd.read_adls(path, reader = pd.read_csv, dtype = str, encoding='iso-8859-1')
print(data.shape)

(250000, 19)


In [5]:
# Does every row carry a distinct loan_num?
data["loan_num"].is_unique

True

## Overall File

In [6]:
# Count missing values in each identity/contact column.
identity_columns = ['first_name', 'last_name', 'address_1', 'city', 'state', 'zip', 'ssn', 'dob', 'phone']
data[identity_columns].isna().sum()

first_name      1
last_name       5
address_1       0
city            0
state           0
zip             0
ssn             1
dob           355
phone           0
dtype: int64

In [7]:
# Names of the input columns that feed the sufficient-input check.
Input_FirstName = "first_name"
Input_LastName = "last_name"
Input_StreetAddress = "address_1"
Input_City = "city"
Input_State = "state"
Input_Zip = "zip"
Input_SSN = "ssn"

# Per-field presence masks (True where the field is populated).
has_full_name = data[Input_FirstName].notnull() & data[Input_LastName].notnull()
has_street = data[Input_StreetAddress].notnull()
has_zip = data[Input_Zip].notnull()
has_city_state = data[Input_City].notnull() & data[Input_State].notnull()
has_ssn = data[Input_SSN].notnull()

# A record is flagged 1 when it has a full name plus any one of:
#   street + zip, street + city + state, or SSN.
is_sufficient = (
    (has_full_name & has_street & has_zip)
    | (has_full_name & has_street & has_city_state)
    | (has_full_name & has_ssn)
)
data["sufficient_input"] = np.where(is_sufficient, 1, 0)
freq(data["sufficient_input"])

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
sufficient_input,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,6.0,2.4e-05,6.0,2.4e-05
1,249994.0,0.999976,250000.0,1.0


In [8]:
# Keep only the records flagged as having sufficient input. The explicit
# .copy() makes the result an independent frame, so the column assignments
# below are unambiguous writes to `data` rather than writes to a filtered
# selection (the pattern pandas reports as SettingWithCopyWarning).
data = data[data["sufficient_input"] == 1].copy()

# Text form of application_date, computed once and reused for both keys.
app_date_text = data['application_date'].astype(str)

# Characters 0-3 and 5-6 -> year + month key (values such as 202012).
data["APPDATETIME_fmt"] = app_date_text.str.slice(0, 4) + app_date_text.str.slice(5, 7)
# Same key with characters 8-9 appended.
# NOTE(review): presumably the day of month, giving YYYYMMDD — confirm the
# application_date layout.
data["APPDATETIME_fmt_dt"] = data["APPDATETIME_fmt"] + app_date_text.str.slice(8, 10)

# Volume by application month.
freq(data["APPDATETIME_fmt"])

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
APPDATETIME_fmt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
202012,13443.0,0.053773,13443.0,0.053773
202101,20366.0,0.081466,33809.0,0.135239
202102,18396.0,0.073586,52205.0,0.208825
202103,24109.0,0.096438,76314.0,0.305263
202104,23524.0,0.094098,99838.0,0.399362
202105,26658.0,0.106635,126496.0,0.505996
202106,26692.0,0.106771,153188.0,0.612767
202107,26835.0,0.107343,180023.0,0.720109
202108,24414.0,0.097658,204437.0,0.817768
202109,24665.0,0.098662,229102.0,0.91643


In [11]:
# Recode never_pay into an integer flag:
#   -1 when the value is missing, 1 when it equals the text "1", 0 otherwise.
# np.select takes the first matching condition, so the missing check wins.
never_pay_missing = data["never_pay"].isnull()
never_pay_is_one = data["never_pay"] == "1"
data["never_pay_fmt"] = np.select([never_pay_missing, never_pay_is_one], [-1, 1], default=0)
freq(data["never_pay_fmt"])

Unnamed: 0_level_0,Count,Pct,Cuml Count,Cuml Pct
never_pay_fmt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-1,49999.0,0.200001,49999.0,0.200001
0,193685.0,0.774759,243684.0,0.974759
1,6310.0,0.025241,249994.0,1.0


In [12]:
# Columns that define a duplicate: identity fields plus the application date key.
identity_keys = ['first_name', 'last_name', 'address_1', 'city', 'state', 'zip', 'ssn', 'dob', 'phone']
dedup_keys = identity_keys + ['APPDATETIME_fmt_dt']

# Sort descending on the duplicate keys and then on never_pay_fmt, so that
# within each duplicate group the row with the highest never_pay_fmt is first.
sort_keys = dedup_keys + ["never_pay_fmt"]
data = data.sort_values(by=sort_keys, ascending=False)
print(data.shape)

# Keep that first row of every duplicate group.
data = data.drop_duplicates(subset=dedup_keys, keep="first")
print(data.shape)

(249994, 23)
(249833, 23)


In [20]:
# Persist the de-duplicated frame as parquet, replacing any existing file at
# this location (overwrite = True).
data.to_adls("Analytics/RnD Projects/Product RnD/Credit/202311-CRD-Model-Revalidations/Formatted_Files/Full_sample_formatted_w_flags/ready_to_be_formatted/aff_10698_full_file.parquet",
             overwrite = True)

In [3]:
# Reload the parquet file and report its shape.
# NOTE(review): the execution counter restarts at In [3] here, which suggests a
# fresh kernel; this cell uses `pd`, so the import cell must be re-run first.
# NOTE(review): `azure` is not referenced by name; presumably the import is
# what registers `pd.read_adls` — confirm.
from dsgtools import azure
data = pd.read_adls("Analytics/RnD Projects/Product RnD/Credit/202311-CRD-Model-Revalidations/Formatted_Files/Full_sample_formatted_w_flags/ready_to_be_formatted/aff_10698_full_file.parquet",)
print(data.shape)

(249833, 23)


In [16]:
from datetime import datetime  # NOTE(review): not referenced in this cell

# Parse the first 10 characters of dob as YYYY-MM-DD; values that are missing
# or do not match the format become NaT (errors='coerce').
data["dob_fmt"] = pd.to_datetime(data.dob.str.slice(0, 10), errors='coerce', format="%Y-%m-%d")

# Compact YYYYMMDD text key. dt.strftime leaves NaT rows as missing values,
# whereas converting the datetimes with str() and slicing would store the
# literal text "NaT" for every missing or unparseable date of birth.
data["dob_fmt_dt"] = data["dob_fmt"].dt.strftime("%Y%m%d")

## Sample for validation

In [21]:
# Row counts for each observed combination of is_approved and is_funded.
freq(data.is_approved, data.is_funded, cross = False, observed = True)

Unnamed: 0_level_0,Unnamed: 1_level_0,Count,Pct,Cuml Count,Cuml Pct
is_approved,is_funded,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1,199872.0,0.800022,199872.0,0.800022
1,0,22161.0,0.088703,222033.0,0.888726
0,0,27800.0,0.111274,249833.0,1.0


In [22]:
# Restrict the sample to applications that were both approved and funded
# (both flags are stored as the text "1").
approved_mask = data["is_approved"].eq("1")
funded_mask = data["is_funded"].eq("1")
data = data.loc[approved_mask & funded_mask]
print(data.shape)

# Row counts for each observed combination of the four outcome flags.
freq(data["is_fraud"], data["fpd_45"], data["spd"], data["never_pay"], cross = False, observed = True)

(199872, 25)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Count,Pct,Cuml Count,Cuml Pct
is_fraud,fpd_45,spd,never_pay,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0,0,0,177886.0,0.89,177886.0,0.89
0,0,1,0,13683.0,0.068459,191569.0,0.958458
0,1,1,0,1126.0,0.005634,192695.0,0.964092
0,1,1,1,5615.0,0.028093,198310.0,0.992185
0,Missing,Missing,0,4.0,2e-05,198314.0,0.992205
1,0,0,0,616.0,0.003082,198930.0,0.995287
1,0,1,0,234.0,0.001171,199164.0,0.996458
1,1,1,0,31.0,0.000155,199195.0,0.996613
1,1,1,1,676.0,0.003382,199871.0,0.999995
1,Missing,Missing,0,1.0,5e-06,199872.0,1.0


In [23]:
# Drop fraud-flagged records and records with no fpd_45 value.
non_fraud_mask = data["is_fraud"].eq("0")
has_fpd_45 = data["fpd_45"].notna()
data = data.loc[non_fraud_mask & has_fpd_45]
print(data.shape)

(198310, 25)


In [24]:
# fpd_45 summarised by application month; fpd_45 is stored as text, so it is
# cast to float for the call.
bivariate(data.APPDATETIME_fmt, data.fpd_45.astype(float))

tag,fpd_45,fpd_45,fpd_45,fpd_45,fpd_45,fpd_45
stats,N,PctN,Sum,Mean,WoE,IV
APPDATETIME_fmt,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
202012,11276.0,0.05686,242.0,0.021462,-0.472831,0.010261
202101,16809.0,0.084761,324.0,0.019275,-0.582494,0.02213
202102,15159.0,0.076441,285.0,0.018801,-0.607913,0.0215
202103,19823.0,0.09996,445.0,0.022449,-0.426851,0.015002
202104,19668.0,0.099178,759.0,0.038591,0.131577,0.001826
202105,22489.0,0.113403,1080.0,0.048023,0.360118,0.017446
202106,22283.0,0.112364,1192.0,0.053494,0.473754,0.031609
202107,22224.0,0.112067,866.0,0.038967,0.141671,0.002404
202108,19743.0,0.099556,599.0,0.03034,-0.117515,0.001302
202109,20166.0,0.101689,719.0,0.035654,0.049381,0.000254


In [27]:
# Write the filtered sample as parquet under Formatted_Files/cleaned/,
# replacing any existing file (overwrite = True).
data.to_adls("Analytics/RnD Projects/Product RnD/Credit/202311-CRD-Model-Revalidations/Formatted_Files/cleaned/aff_cleaned_11_20.parquet", overwrite = True)

In [28]:
# Write the same frame under the same file name directly in Formatted_Files/.
# NOTE(review): the previous cell writes this file name under
# Formatted_Files/cleaned/ — confirm that both copies are intended.
data.to_adls("Analytics/RnD Projects/Product RnD/Credit/202311-CRD-Model-Revalidations/Formatted_Files/aff_cleaned_11_20.parquet", overwrite = True)

In [29]:
# Row and column count of the frame that was written out.
data.shape

(198310, 25)