# Import Packages

In [None]:
# library(devtools)
# install_github(
#     'jasonchang2018/opploansanalytics',
#     auth_token = Sys.getenv('GITHUB_PAT_OPPLOANSANALYTICS')
# )

library(opploansanalytics)
load.packages()

library(mlr)
library(pdp)
library(vip)
library(reshape2)

# Prepare Data

In [None]:
#### Clarity Field Analysis ####
clarityAnalysis = function () {

# ####  Clarity Report Type Validation  ####
# test %>%
#     filter(
#         ! report_received %>% str_detect('(?:FWB)?Leads01.*')
#     ) %>% 
#     transmute(
#         lead_id,
#         report_received,
#         report_requested,
#         lead_time,
#         report_time,
#         lead.date = lead_time %>% as.Date(),
#         report.date = report_time %>% as.Date(),
#         diff = lead.date - report.date
#     ) %>% 
#     arrange(
#         diff
#     )
# #     ) %T>%
# #     write.csv("..\\docs\\received-not-leads01.csv")

# test %>% group_by(report_received) %>% summarize(n = n()) %>% ungroup() %>% arrange(desc(n))

#### Get Clarity Data

####  Existing Fields  ####
# Pull one lead's Clarity report and list the report fields of interest.
#
# Arguments:
#   lead.id  UUID of the lead whose report is used as the field template.
#            Defaults to the lead id that used to be hard-coded in this
#            function, so existing zero-argument calls behave as before.
#
# Side effects (kept so that code reading these objects keeps working):
#   assigns `all.fields`, `inquiry.fields`, `ccr.fields` and `crh.fields`
#   outside this function via `<<-`.
#
# Returns: character vector holding the `key` of every retained inquiry,
#   clear_credit_risk and clear_recent_history field, in that order.
getClarityFields = function (lead.id = '99f418da-8b66-471e-9586-f4112718ed21') {

    # lead.id is pasted into the SQL text below, so only a single UUID-shaped
    # string is accepted.
    stopifnot(
        is.character(lead.id),
        length(lead.id) == 1,
        grepl('^[0-9a-fA-F]{8}(-[0-9a-fA-F]{4}){3}-[0-9a-fA-F]{12}$', lead.id)
    )

    leads <- queryReporting(
        paste0(
    "
    select
        *
    from
        lde4.leads
    where
        --lead_time >= now()::date - '5 days'::interval
        lead_id = '", lead.id, "'
    limit 100
    "
        )
    ) %>%
        select(
            lead_id,
            clarity_report,
            accepted
        )

    reports <- leads %>%
        filter(
            !is.na(clarity_report),
            lead_id == !!lead.id
        ) %>%
        .$clarity_report

    if (length(reports) == 0) {
        stop('No Clarity report found for lead ', lead.id, call. = FALSE)
    }

    # Flatten the nested report into one row per leaf field. The first report
    # is used if the query returned more than one row for the lead.
    # The as.data.frame() step stays in the pipe on purpose: piping names the
    # single column ".", which the rename() below relies on.
    fields <- reports[[1]] %>%
        fromJSON() %>%
        .$xml_response %>%
        unlist() %>%
        as.data.frame(
            stringsAsFactors = FALSE
        ) %>%
        rownames_to_column(
            var = 'key'
        ) %>%
        rename(
            value = "."
        )

    # Rows whose key matches `prefix.pattern` and whose value is populated.
    populatedFields = function (prefix.pattern) {
        fields %>%
            filter(
                str_detect(key, prefix.pattern) &
                value != '' &
                !is.na(value)
            )
    }

    # Inquiry section: keep an explicit whitelist of fields.
    inquiry <- populatedFields('^inquiry\\..*') %>%
        filter(
            key %in% paste0(
                'inquiry.',
                c(
                    'ofac_match',
                    'ofac_score',
                    'social_security_valid',
                    'social_security_deceased',
                    'ssn_distinct_first_last_name_count',
                    'paycheck_direct_deposit',
                    'bank_routing_valid',
                    'inquiry_purpose_type'
                )
            )
        )

    # Clear Credit Risk section: drop every key that matches one of these
    # patterns (nested inquiry/tradeline/stability records, free text, codes,
    # dates, ...).
    ccr <- populatedFields('^clear_credit_risk\\..*') %>%
        filter(
            !str_detect(key, 'inquiries\\.member_id') &
            !str_detect(key, 'inquiry_received_at') &
            !str_detect(key, 'inquiry_purpose_type') &
            !str_detect(key, 'inquiry_tradeline_type') &
            !str_detect(key, 'tradelines\\..*') &
            !str_detect(key, 'stabilities\\..*') &
            !str_detect(key, 'experian_attribute\\..*') &
            !str_detect(key, 'description') &
            !str_detect(key, 'full_name') &
            !str_detect(key, 'code') &
            !str_detect(key, 'date') &
            !str_detect(key, 'first') &
            !str_detect(key, '48')
        )

    # Clear Recent History section: drop stabilities, dates, names and any key
    # that contains a digit.
    crh <- populatedFields('^clear_recent_history\\..*') %>%
        filter(
            !str_detect(key, 'tradeline_stabilities') &
            !str_detect(key, 'date') &
            !str_detect(key, 'name') &
            !str_detect(key, '\\d+')
        )

    # Publish the intermediate tables exactly as the previous version did.
    all.fields <<- fields
    inquiry.fields <<- inquiry
    ccr.fields <<- ccr
    crh.fields <<- crh

    rbind(
        inquiry,
        ccr,
        crh
    ) %>% .$key

}

####  Pull Test Clarity Report  ####
# Sample of accepted 'LenderEdge 4' leads (lead_time on/after 2020-03-09, at
# most 1000 rows) with selected Clarity report fields flattened into columns.
#
# The five `ccr_worst_payment_rating` / `ccr_days_since_last_*` columns are
# selected because the conversion step below derives indicator columns from
# them and then drops them via `processed.features`; without them that step
# fails on missing columns.
# NOTE(review): 'worst_payment_rating' is the key used by getLeadsPerformance;
# the 'days_since_last_loan_paid_off', 'days_since_last_ontime_payment',
# 'days_since_last_loan_payment' and 'days_since_last_loan_opened' keys are
# assumed to follow the same naming as 'days_since_last_loan_charged_off' --
# confirm against a real report.
test = queryReporting(
"
select

    --  Identifiers --
    lde.lead_id
    , lde.leadofferid
    , lde.passthru_lead_offer_id
    , lde.lead_time at time zone 'America/Chicago' as lead_time
    , lde.partnerid

    --  Credit  --
    , case when lde.clarity_report notnull then TRUE else FALSE end as has_clarity
    , lde.clarity_report -> 'xml_response' -> 'inquiry' ->> 'inquiry_received_at' as report_time
    , lde.clarity_report -> 'xml_response' -> 'opploans' ->> 'received' as report_received
    , lde.clarity_report -> 'xml_response' -> 'opploans' ->> 'requested_file' as report_requested

    , lde.clarity_report -> 'xml_response' -> 'inquiry' ->> 'ofac_score' as ofac_score
    , lde.clarity_report -> 'xml_response' -> 'inquiry' ->> 'paycheck_direct_deposit' as paycheck_direct_deposit
    , lde.clarity_report -> 'xml_response' -> 'inquiry' ->> 'ssn_distinct_first_last_name_count' as ssn_distinct_first_last_name_count

    , lde.clarity_report -> 'xml_response' -> 'clear_credit_risk' ->> 'score' as ccr_score
    , lde.clarity_report -> 'xml_response' -> 'clear_credit_risk' ->> 'clarity_seen' as ccr_clarity_seen
    , lde.clarity_report -> 'xml_response' -> 'clear_credit_risk' ->> 'number_of_loans' as ccr_number_of_loans
    , lde.clarity_report -> 'xml_response' -> 'clear_credit_risk' ->> 'number_of_bank_accounts' as ccr_number_of_bank_accounts
    , lde.clarity_report -> 'xml_response' -> 'clear_credit_risk' ->> 'highest_number_of_days_past_due' as ccr_highest_number_of_days_past_due
    , lde.clarity_report -> 'xml_response' -> 'clear_credit_risk' ->> 'current_inquiry_cluster_position' as ccr_current_inquiry_cluster_position
    , lde.clarity_report -> 'xml_response' -> 'clear_credit_risk' ->> 'days_since_last_loan_charged_off' as ccr_days_since_last_loan_charged_off
    , lde.clarity_report -> 'xml_response' -> 'clear_credit_risk' ->> 'days_since_inquiry_previously_seen' as ccr_days_since_inquiry_previously_seen
    , lde.clarity_report -> 'xml_response' -> 'clear_credit_risk' ->> 'number_of_employers_last_six_months' as ccr_number_of_employers_last_six_months

    , lde.clarity_report -> 'xml_response' -> 'clear_credit_risk' ->> 'worst_payment_rating' as ccr_worst_payment_rating
    , lde.clarity_report -> 'xml_response' -> 'clear_credit_risk' ->> 'days_since_last_loan_paid_off' as ccr_days_since_last_loan_paid_off
    , lde.clarity_report -> 'xml_response' -> 'clear_credit_risk' ->> 'days_since_last_ontime_payment' as ccr_days_since_last_ontime_payment
    , lde.clarity_report -> 'xml_response' -> 'clear_credit_risk' ->> 'days_since_last_loan_payment' as ccr_days_since_last_loan_payment
    , lde.clarity_report -> 'xml_response' -> 'clear_credit_risk' ->> 'days_since_last_loan_opened' as ccr_days_since_last_loan_opened

    , lde.clarity_report -> 'xml_response' -> 'clear_recent_history' -> 'supplier_recent_history' -> 'summary_recent_history' ->> 'loans_in_collections' as srh_loans_in_collections
    , lde.clarity_report -> 'xml_response' -> 'clear_recent_history' -> 'supplier_recent_history' -> 'summary_recent_history' ->> 'spml_average_rollovers' as srh_spml_average_rollovers
    , lde.clarity_report -> 'xml_response' -> 'clear_recent_history' -> 'supplier_recent_history' -> 'summary_recent_history' ->> 'amount_loans_charged_off' as srh_amount_loans_charged_off
    , lde.clarity_report -> 'xml_response' -> 'clear_recent_history' -> 'supplier_recent_history' -> 'summary_recent_history' ->> 'online_loan_opened_in_the_last_year' as srh_online_loan_opened_in_the_last_year
    , lde.clarity_report -> 'xml_response' -> 'clear_recent_history' -> 'supplier_recent_history' -> 'summary_recent_history' ->> 'online_loan_inquiry_in_the_last_thirty_days' as srh_online_loan_inquiry_in_the_last_thirty_days

    , lde.clarity_report -> 'xml_response' -> 'clear_recent_history' -> 'inquiry_cluster_recent_history' -> 'total_inquiry_clusters_recent_history' ->> 'ninety_days_ago' as ticrh_ninety_days_ago
    , lde.clarity_report -> 'xml_response' -> 'clear_recent_history' -> 'inquiry_cluster_recent_history' -> 'total_inquiry_clusters_recent_history' ->> 'twentyfour_hours_ago' as ticrh_twentyfour_hours_ago


    from
        lde4.leads as lde
    inner join
        cloudlending.advertising_method as c_am
        on lde.partnerid = c_am.external_id
        and c_am.name = 'LenderEdge 4' 
    where
        lde.accepted = TRUE
        and lde.lead_time >= '2020-03-09'::date
    limit 1000
"
)

####  Identify Data Types  ####
# Column groups used by the conversion, imputation and correlation steps.
# Trailing comments in the two `correlated.features.*` vectors name the
# feature each entry was found to be correlated with.

# Identifier / bookkeeping columns that are never model inputs.
not.features <- c(
    'lead_id',
    'leadofferid',
    'passthru_lead_offer_id',
    'lead_time',
    'partnerid',
    'has_clarity',
    'report_time',
    'report_received',
    'report_requested'
)

# Columns converted with as.logical().
boolean.features <- c(
    'paycheck_direct_deposit',
    'ccr_hit',
    'ccr_clarity_seen',
    'srh_online_loan_opened_in_the_last_year',
    'srh_online_loan_inquiry_in_the_last_thirty_days'
)

# Every remaining column of `test` is treated as numeric: everything that is
# not boolean, not an identifier, and not the worst-payment-rating column or
# one of its derived indicator columns.
numeric.features <- setdiff(
    colnames(test),
    c(
        boolean.features,
        not.features,
        'ccr_worst_payment_rating_null',
        'ccr_worst_payment_rating_plus',
        'ccr_worst_payment_rating_zero',
        'ccr_worst_payment_rating_hash',
        'ccr_worst_payment_rating_else',
        'ccr_worst_payment_rating'
    )
)

# Raw columns that are replaced by derived columns and then dropped.
processed.features <- c(
    'ccr_worst_payment_rating',
    'ccr_days_since_last_loan_charged_off',
    'ccr_days_since_last_loan_paid_off',
    'ccr_days_since_last_ontime_payment',
    'ccr_days_since_last_loan_payment',
    'ccr_days_since_last_loan_opened'
)

# Columns whose missing values are filled with the column median.
impute.median <- c(
    'ccr_days_since_previous_bank_account_previously_seen',
    'ccr_days_since_reported_income_previously_seen',
    'ccr_days_since_inquiry_previously_seen',
    'ccr_highest_number_of_days_past_due',
    'paycheck_direct_deposit'
)

# Columns whose missing values are filled with the column mean.
# ('ccr_number_of_loans_paid_off' used to be listed twice; the duplicate
# entry has been removed.)
impute.mean <- c(
    'ccr_number_of_loans',
    'ccr_number_of_bank_accounts',
    'ccr_number_of_loans_paid_off',
    'ccr_number_of_loans_past_due',
    'ccr_current_inquiry_cluster_position',
    'ccr_number_of_loans_current_and_open',
    'ccr_number_of_employers_last_six_months',
    'ccr_score'
)

correlated.features.numeric <- c(
    'icrh_ten_minutes_ago',                                    #ccr_current_inquiry_cluster_position
    'icrh_twenty_minutes_ago',                                 #ccr_current_inquiry_cluster_position
    'icrh_thirty_minutes_ago',                                 #ccr_current_inquiry_cluster_position
    'icrh_one_hour_ago',                                       #ccr_current_inquiry_cluster_position
    'icrh_twentyfour_hours_ago',                               #ccr_current_inquiry_cluster_position
    'icrh_seven_days_ago',                                     #ccr_current_inquiry_cluster_position
    'icrh_thirty_days_ago',                                    #ccr_current_inquiry_cluster_position
    'icrh_ninety_days_ago',                                    #ccr_current_inquiry_cluster_position
    'icrh_recent_history_current_inquiry_cluster_position',    #ccr_current_inquiry_cluster_position
    
    'ticrh_seven_days_ago',                                    #ticrh_twentyfour_hours_ago
    'ticrh_thirty_days_ago',                                   #ticrh_twentyfour_hours_ago
    
    'ccr_number_of_loans_paid_off',                            #ccr_number_of_loans
    'ccr_number_of_loans_past_due',                            #ccr_number_of_loans
    'ccr_number_of_loans_current_and_open',                    #ccr_number_of_loans,
    'ccr_days_since_reported_income_previously_seen',          #ccr_days_since_inquiry_previously_seen
    'ccr_days_since_previous_bank_account_previously_seen',    #ccr_days_since_inquiry_previously_seen
    
    'srh_amount_loans_in_collections',                         #srh_loans_in_collections
    'srh_days_with_open_loans_in_the_last_ninety_days',        #srh_loans_in_collections
    'srh_days_with_open_loans_in_the_last_year'                #srh_loans_in_collections
)

correlated.features.logical <- c(
    'ccr_hit',                                                 #ccr_clarity_seen
    'ccr_worst_payment_rating_plus',                           #ccr_has_previous_loan_charged_off
    'ccr_worst_payment_rating_null',                           #ccr_has_previous_loan_opened  
    'ccr_has_previous_loan_payment',                           #ccr_has_previous_loan_opened  
    'ccr_has_previous_ontime_payment',                         #ccr_has_previous_loan_opened  
    'ccr_has_previous_loan_paid_off',                          #ccr_has_previous_loan_opened   
    'ccr_has_previous_loan_charged_off'                        #ccr_has_previous_loan_opened  
)

#### Convert Data Types

####  Convert Data Types  ####
# Build the converter list once instead of once per derived column.
# NOTE(review): assumes getClarityMapping() is stateless, as in the
# commented-out definition further down in this file, which returns a plain
# list of converter functions.
clarity.mapping <- getClarityMapping()

# Typed copy of `test`:
#   * identifier columns are dropped,
#   * boolean / numeric feature columns are cast from the raw query values,
#   * the worst payment rating is expanded into indicator columns,
#   * each `ccr_days_since_last_*` column becomes a `ccr_has_previous_*`
#     column via the matching converter,
#   * the raw columns in `processed.features` are dropped at the end.
test.clean <- test %>%
    select(
        -not.features
    ) %>%
    mutate_at(
        # Boolean features must be screened against the *logical* correlated
        # list. Screening against the numeric list removed nothing, so names
        # such as 'ccr_hit' were still passed to mutate_at().
        .vars = setdiff(boolean.features, correlated.features.logical),
        .funs = as.logical
    ) %>%
    mutate_at(
        .vars = setdiff(numeric.features, correlated.features.numeric),
        .funs = as.numeric
    ) %>%
    mutate(
        # The `!..._null &` guard turns the NA comparison result into FALSE
        # for rows with a missing rating.
        ccr_worst_payment_rating_null = is.na(ccr_worst_payment_rating),
        ccr_worst_payment_rating_plus = !ccr_worst_payment_rating_null & ccr_worst_payment_rating == '+',
        ccr_worst_payment_rating_zero = !ccr_worst_payment_rating_null & ccr_worst_payment_rating == '0',
        ccr_worst_payment_rating_hash = !ccr_worst_payment_rating_null & ccr_worst_payment_rating == '#',
        ccr_worst_payment_rating_else = !(
            ccr_worst_payment_rating_plus |
            ccr_worst_payment_rating_zero |
            ccr_worst_payment_rating_hash |
            ccr_worst_payment_rating_null
        ),

        ccr_has_previous_loan_charged_off = ccr_days_since_last_loan_charged_off %>%
            clarity.mapping$convertDaysChargedOff(),
        ccr_has_previous_loan_paid_off = ccr_days_since_last_loan_paid_off %>%
            clarity.mapping$convertDaysPaidOff(),
        ccr_has_previous_ontime_payment = ccr_days_since_last_ontime_payment %>%
            clarity.mapping$convertDaysOntimePayment(),
        ccr_has_previous_loan_payment = ccr_days_since_last_loan_payment %>%
            clarity.mapping$convertDaysAnyPayment(),
        ccr_has_previous_loan_opened = ccr_days_since_last_loan_opened %>%
            clarity.mapping$convertDaysLoanOpened()
    ) %>%
    select(
        -processed.features
    )
# test.clean %>% str()

#### Impute

####  Examine Values in Field  ####
# Column under inspection; change the quoted name to look at another field.
field <- quo(paycheck_direct_deposit)

# Median and mean of the cleaned column (the candidate imputation values).
field.values <- test.clean[[quo_name(field)]]
median(field.values, na.rm = TRUE)
mean(field.values, na.rm = TRUE)

# Frequency of each raw (unconverted) value in `test`, ordered by value.
# Optional tweaks used during exploration: wrap the field in as.numeric(),
# filter out NA / large values, or sort by desc(n) instead of by value.
field.counts <- test %>%
    group_by(var = !!field) %>%
    summarize(n = n()) %>%
    ungroup() %>%
    arrange(var)

# Bar chart of the frequency table.
ggplot(field.counts, aes(x = var, y = n)) +
    geom_bar(stat = 'identity')

####  Impute and/or Remove Missing Values  ####
# Fill missing values: column median for `impute.median`, column mean for
# `impute.mean`. Names listed in `correlated.features.numeric` are skipped.
test.impute.value <- test.clean %>%
    mutate_at(
        .vars = setdiff(impute.median, correlated.features.numeric),
        .funs = ~ replace_na(.x, replace = median(.x, na.rm = TRUE))
    ) %>%
    mutate_at(
        .vars = setdiff(impute.mean, correlated.features.numeric),
        .funs = ~ replace_na(.x, replace = mean(.x, na.rm = TRUE))
    ) %>%
    mutate(
        # 'paycheck_direct_deposit' is also in `impute.median`; cast it back
        # to logical after the median fill.
        paycheck_direct_deposit = as.logical(paycheck_direct_deposit)
    )

# Keep only rows with no remaining missing value in any column.
# complete.cases() replaces the row-wise apply(), which coerced the whole data
# frame to a matrix just to count NAs per row.
test.impute <- test.impute.value %>%
    filter(
        complete.cases(test.impute.value)
    )

#### Numeric Collinearity

####  Calculate Correlation Matrix (Numeric)  ####
# Pairwise correlation of the numeric features that survive processing.
test.numeric.cor <- cor(
    test.impute %>%
        select(
            setdiff(numeric.features, processed.features)
        )
)

# Blank the entries above the diagonal so each pair appears once, then reshape
# the remaining entries to long form, one row per (Var1, Var2, value).
test.numeric.cor[upper.tri(test.numeric.cor)] <- NA
test.numeric.cor.upper <- melt(test.numeric.cor, na.rm = TRUE)

# ####  Sum Missing (NA) Values for Numeric  ####
# apply(
# #     X = test.clean %>%
#     X = test.impute %>%
#         select(
#             numeric.features[ which(!numeric.features %in% c(processed.features, correlated.features.numeric)) ]
#         ),
#     FUN = function (x) { is.na(x) %>% sum() },
#     MARGIN = 2
# ) %>% 
# as.data.frame() %>% select(n = '.') %>% rownames_to_column('field') %>% arrange(desc(n))

# ####  Find / Remove Collinear Features (Numeric)  ####
# correlated.features.numeric = c(
#     'icrh_ten_minutes_ago',                                    #ccr_current_inquiry_cluster_position
#     'icrh_twenty_minutes_ago',                                 #ccr_current_inquiry_cluster_position
#     'icrh_thirty_minutes_ago',                                 #ccr_current_inquiry_cluster_position
#     'icrh_one_hour_ago',                                       #ccr_current_inquiry_cluster_position
#     'icrh_twentyfour_hours_ago',                               #ccr_current_inquiry_cluster_position
#     'icrh_seven_days_ago',                                     #ccr_current_inquiry_cluster_position
#     'icrh_thirty_days_ago',                                    #ccr_current_inquiry_cluster_position
#     'icrh_ninety_days_ago',                                    #ccr_current_inquiry_cluster_position
#     'icrh_recent_history_current_inquiry_cluster_position',    #ccr_current_inquiry_cluster_position
    
#     'ticrh_seven_days_ago',                                    #ticrh_twentyfour_hours_ago
#     'ticrh_thirty_days_ago',                                   #ticrh_twentyfour_hours_ago
    
#     'ccr_number_of_loans_paid_off',                            #ccr_number_of_loans
#     'ccr_number_of_loans_past_due',                            #ccr_number_of_loans
#     'ccr_number_of_loans_current_and_open',                    #ccr_number_of_loans,
#     'ccr_days_since_reported_income_previously_seen',          #ccr_days_since_inquiry_previously_seen
#     'ccr_days_since_previous_bank_account_previously_seen',    #ccr_days_since_inquiry_previously_seen
    
#     'srh_amount_loans_in_collections',                         #srh_loans_in_collections
#     'srh_days_with_open_loans_in_the_last_ninety_days',        #srh_loans_in_collections
#     'srh_days_with_open_loans_in_the_last_year'                #srh_loans_in_collections
# )

# test.numeric.cor.upper.removed = test.numeric.cor.upper %>% 
#     filter(
#         Var1 != Var2
#     ) %>% 
#     arrange(
#         desc(value)
#     ) %>% 
#     filter(
#         ! Var1 %in% correlated.features.numeric &
#         ! Var2 %in% correlated.features.numeric
#     ) %>% 
# #     group_by(
# #         Var1
# #     ) %>% 
# #     summarize(
# #         n = n(),
# #         total.cor = sum(value^2)
# #     ) %>% 
# #     ungroup() %>% 
#     arrange(
# #         total.cor %>% desc
#         value %>% desc
#     )

# test.numeric.cor.upper.removed %T>%
#     head() %>% 
#     ggplot(
#         mapping = aes(
#             x = Var1,
#             y = Var2,
#             fill = value
#         )
#     ) +
#     geom_tile(
#         color = 'white'
#     ) +
#     scale_fill_gradient2(
#         low = "blue",
#         high = "red",
#         mid = "white", 
#         midpoint = 0,
#         limit = c(-1,1),
#         space = "Lab", 
#         name="Pearson\nCorrelation"
#     ) +
#     theme_minimal() +
#     theme(
#         axis.text.x = element_text(
#             angle = -45,
#             hjust = 0
#         )
#     )

#### Boolean Collinearity

####  Calculate Correlation Matrix (Boolean)  ####
# Pairwise correlation of every column that is not one of the numeric features
# kept after processing (i.e. the logical / indicator columns).
test.logical.cor <- cor(
    test.impute %>%
        select(
            -c(setdiff(numeric.features, processed.features))
        )
)

# Blank the entries above the diagonal so each pair appears once, then reshape
# the remaining entries to long form, one row per (Var1, Var2, value).
test.logical.cor[upper.tri(test.logical.cor)] <- NA
test.logical.cor.upper <- melt(test.logical.cor, na.rm = TRUE)

# ####  Find / Remove Collinear Features (Logical)  ####
# correlated.features.logical = c(
#     'ccr_hit',                               # ccr_clarity_seen
#     'ccr_worst_payment_rating_plus',         # ccr_has_previous_loan_charged_off
#     'ccr_worst_payment_rating_null',         # ccr_has_previous_loan_opened  
#     'ccr_has_previous_loan_payment',         # ccr_has_previous_loan_opened  
#     'ccr_has_previous_ontime_payment',       # ccr_has_previous_loan_opened  
#     'ccr_has_previous_loan_paid_off',        # ccr_has_previous_loan_opened   
#     'ccr_has_previous_loan_charged_off'      # ccr_has_previous_loan_opened  
# )

# test.logical.cor.upper.removed = test.logical.cor.upper %>% 
#     filter(
#         Var1 != Var2
#     ) %>% 
#     arrange(
#         desc(value)
# #         value
#     ) %>% 
#     filter(
#         ! Var1 %in% correlated.features.logical &
#         ! Var2 %in% correlated.features.logical
# #     ) %>% 
# #     group_by(
# #         Var1
# #     ) %>% 
# #     summarize(
# #         n = n(),
# #         total.cor = sum(value^2)
# #     ) %>% 
# #     ungroup() %>% 
# #     arrange(
# #         total.cor %>% desc
#     )

# test.logical.cor.upper.removed

# test.logical.cor.upper.removed %>%
#     ggplot(
#         mapping = aes(
#             x = Var1,
#             y = Var2,
#             fill = value
#         )
#     ) +
#     geom_tile(
#         color = 'white'
#     ) +
#     scale_fill_gradient2(
#         low = "blue",
#         high = "red",
#         mid = "white", 
#         midpoint = 0,
#         limit = c(-1,1),
#         space = "Lab", 
#         name="Pearson\nCorrelation"
#     ) +
#     theme_minimal() +
#     theme(
#         axis.text.x = element_text(
#             angle = -45,
#             hjust = 0
#         )
#     )

# getClarityMapping = function () {
    
#     convertInquiryPurposeType = function (purpose.code) {
        
#         case_when(
#             purpose.code == 'AR' ~ 'New Credit',
#             purpose.code == 'AS' ~ 'New Credit Soft',
#             purpose.code == 'RA' ~ 'Account Review Soft',
#             purpose.code == 'RP' ~ 'Consumer Inquiry Soft',
#             purpose.code == 'CL' ~ 'Collection Inquiry',
#             purpose.code == 'PC' ~ 'Pre-check Soft',
#             purpose.code == 'MS' ~ 'Credit Monitor Soft',
#             purpose.code == 'CC' ~ 'Check Cash',
#             purpose.code == 'CS' ~ 'Collection Soft',
#             purpose.code == 'PS' ~ 'Pre-screen Soft',
#             purpose.code == 'IV' ~ 'Item Verification',
#             purpose.code == 'IS' ~ 'Item Verification Soft',
#             purpose.code == 'EH' ~ 'Employment',
#             purpose.code == 'ES' ~ 'Employment Soft',
#             purpose.code == 'LH' ~ 'Lease',
#             purpose.code == 'LS' ~ 'Lease Soft',
#             purpose.code == 'WS' ~ 'Written Authorization Soft',
#             purpose.code == 'WH' ~ 'Written Authorization - Hard',
#             purpose.code == 'PR' ~ 'Portfolio Review',
#             purpose.code == 'PA' ~ 'Portfolio Acquisition',
#             purpose.code == 'SP' ~ 'Subpoena',
#             TRUE ~ 'Other'
#         )
#     }
#     convertWorstPaymentRatingCCR = function (rating) {
        
#         case_when(
#             is.na(rating) ~ 0,
#             rating == '+' ~ 1,
#             rating == '0' ~ 2,
#             rating == '#' ~ 3,
#             rating == '@' ~ 4,
#             rating == 'X' ~ 5,
#             rating == '4' ~ 6,
#             rating == 'V' ~ 7,
#             rating == 'W' ~ 8,
#             rating == '1' ~ 9,
#             rating == '5' ~ 10,
#             rating == 'B' ~ 11,
#             rating == 'L' ~ 12,
#             rating == '7' ~ 13,
#             rating == '8' ~ 14,
#             rating == 'C' ~ 15,
#             rating == 'D' ~ 16,
#             rating == 'E' ~ 17,
#             rating == 'H' ~ 18,
#             rating == 'U' ~ 19,
#             rating == 'Y' ~ 20,
#             rating == 'Z' ~ 21,
#             TRUE ~ 22
#         )
#     }
#     convertDaysChargedOff = function (days) {
        
#         case_when(
#             is.na(days) ~ FALSE,
#             TRUE ~ TRUE
#         )
        
#     }
#     convertDaysPaidOff = function (days) {
        
#         case_when(
#             is.na(days) ~ FALSE,
#             TRUE ~ TRUE
#         )
        
#     }
#     convertDaysOntimePayment = function (days) {
        
#         case_when(
#             is.na(days) ~ FALSE,
#             TRUE ~ TRUE
#         )
        
#     }
#     convertDaysAnyPayment = function (days) {
        
#         case_when(
#             is.na(days) ~ FALSE,
#             TRUE ~ TRUE
#         )
        
#     }
#     convertDaysLoanOpened = function (days) {
        
#         case_when(
#             is.na(days) ~ FALSE,
#             TRUE ~ TRUE
#         )
        
#     }
    
#     list(
#         convertInquiryPurposeType = convertInquiryPurposeType,
#         convertWorstPaymentRatingCCR = convertWorstPaymentRatingCCR,
#         convertDaysChargedOff = convertDaysChargedOff,
#         convertDaysPaidOff = convertDaysPaidOff,
#         convertDaysOntimePayment = convertDaysOntimePayment,
#         convertDaysAnyPayment = convertDaysAnyPayment,
#         convertDaysLoanOpened = convertDaysLoanOpened
#     )
        
# }

}

In [None]:
getLeadsPerformance = function (admethod, timestart='2019-11-01', timeend='2020-02-01', limit=NA, write=FALSE) {
    
    getLeadsDF = function (admethod, timestart, timeend, limit = NA) {
    
        queryReporting(
            paste0(
    "
    select

        --  Identifiers --
        lde.lead_id
        , lde.leadofferid
        , lde.lead_time at time zone 'America/Chicago' as lead_time
        , extract(day from lde.lead_time at time zone 'America/Chicago') as lead_day
        , lde.partnerid

        --  Outcome  --
        , lde.accepted
        , lde.reason
        , lde.code

        --  Bank  --
        , lde.bankname
        , lde.abaroutingnumber
        , lde.accountnumber

        --  Income  --
        , lde.grossmonthlyincome
        , lde.incometype
        , lde.payrollfrequency
        , lde.payrolltype
        , lde.lastpayrolldate

        --  Identity  --
        , lde.dateofbirth
        , floor((lde.lead_time::date - lde.dateofbirth::date)::numeric/365) as age
        , lde.statecode

        --  Employment  --
        , lde.work_hiredate

        --  Offer  --
        , lde.offer_amount
        , lde.offer_interestrate
        , lde.offer_monthlypayment
        , (lde.raw_lead -> 'requestedLoanAmount')::numeric as requestedLoanAmount
        , lde.raw_lead -> 'campaign_id' as campaign_id

        --  Credit  --
        , case when lde.clarity_report notnull then TRUE else FALSE end as has_clarity
        , lde.clarity_report -> 'xml_response' -> 'inquiry' ->> 'inquiry_received_at' as report_time
        , lde.clarity_report -> 'xml_response' -> 'opploans' ->> 'received' as report_received
        , lde.clarity_report -> 'xml_response' -> 'opploans' ->> 'requested_file' as report_requested

        , lde.clarity_report -> 'xml_response' -> 'inquiry' ->> 'ofac_score' as ofac_score
        , lde.clarity_report -> 'xml_response' -> 'inquiry' ->> 'paycheck_direct_deposit' as paycheck_direct_deposit
        , lde.clarity_report -> 'xml_response' -> 'inquiry' ->> 'ssn_distinct_first_last_name_count' as ssn_distinct_first_last_name_count

        , lde.clarity_report -> 'xml_response' -> 'clear_credit_risk' ->> 'score' as ccr_score
        , lde.clarity_report -> 'xml_response' -> 'clear_credit_risk' ->> 'number_of_loans' as ccr_number_of_loans
        , lde.clarity_report -> 'xml_response' -> 'clear_credit_risk' ->> 'number_of_bank_accounts' as ccr_number_of_bank_accounts
        , lde.clarity_report -> 'xml_response' -> 'clear_credit_risk' ->> 'highest_number_of_days_past_due' as ccr_highest_number_of_days_past_due
        , lde.clarity_report -> 'xml_response' -> 'clear_credit_risk' ->> 'current_inquiry_cluster_position' as ccr_current_inquiry_cluster_position
        , lde.clarity_report -> 'xml_response' -> 'clear_credit_risk' ->> 'days_since_last_loan_charged_off' as ccr_days_since_last_loan_charged_off
        , lde.clarity_report -> 'xml_response' -> 'clear_credit_risk' ->> 'days_since_inquiry_previously_seen' as ccr_days_since_inquiry_previously_seen
        , lde.clarity_report -> 'xml_response' -> 'clear_credit_risk' ->> 'number_of_employers_last_six_months' as ccr_number_of_employers_last_six_months
        , lde.clarity_report -> 'xml_response' -> 'clear_credit_risk' ->> 'worst_payment_rating' as ccr_worst_payment_rating

        , lde.clarity_report -> 'xml_response' -> 'clear_recent_history' -> 'supplier_recent_history' -> 'summary_recent_history' ->> 'loans_in_collections' as srh_loans_in_collections
        , lde.clarity_report -> 'xml_response' -> 'clear_recent_history' -> 'supplier_recent_history' -> 'summary_recent_history' ->> 'spml_average_rollovers' as srh_spml_average_rollovers
        , lde.clarity_report -> 'xml_response' -> 'clear_recent_history' -> 'supplier_recent_history' -> 'summary_recent_history' ->> 'amount_loans_charged_off' as srh_amount_loans_charged_off
        , lde.clarity_report -> 'xml_response' -> 'clear_recent_history' -> 'supplier_recent_history' -> 'summary_recent_history' ->> 'online_loan_opened_in_the_last_year' as srh_online_loan_opened_in_the_last_year
        , lde.clarity_report -> 'xml_response' -> 'clear_recent_history' -> 'supplier_recent_history' -> 'summary_recent_history' ->> 'online_loan_inquiry_in_the_last_thirty_days' as srh_online_loan_inquiry_in_the_last_thirty_days

        , lde.clarity_report -> 'xml_response' -> 'clear_recent_history' -> 'inquiry_cluster_recent_history' -> 'total_inquiry_clusters_recent_history' ->> 'ninety_days_ago' as ticrh_ninety_days_ago
        , lde.clarity_report -> 'xml_response' -> 'clear_recent_history' -> 'inquiry_cluster_recent_history' -> 'total_inquiry_clusters_recent_history' ->> 'twentyfour_hours_ago' as ticrh_twentyfour_hours_ago

    from
        lde4.leads as lde
    inner join
        cloudlending.advertising_method as c_am
        on lde.partnerid = c_am.external_id
        and c_am.name = '", admethod, "' 
    where
        lde.accepted = TRUE
        and lde.lead_time >= '", timestart, "'::date
        and lde.lead_time < '", timeend, "'::date
    ", ifelse(is.na(limit), "", paste0("limit ", limit))
            )
        )
    }
    # Queries the application funnel for one advertising method and date window.
    #
    # Args:
    #   admethod:  advertising method name, matched against cloudlending.advertising_method.name.
    #   timestart: inclusive lower bound on the application createddate (America/Chicago).
    #   timeend:   exclusive upper bound on the application createddate (America/Chicago).
    #
    # Returns: whatever queryReporting() returns for the query below. The query yields one
    #   row per application with 0/1 stage flags (newentered, bizrulespassed, qualified,
    #   bankverified, passscorecardratecard, contractsigned, cs_decisioned, funded), each
    #   derived as max(case ...) over the application's status-history rows.
    #
    # NOTE(review): the three arguments are pasted into the SQL text unescaped; only call
    #   this with trusted values.
    # NOTE(review): inside the cs_decisioned_apps CTE the bare name `status` in the WHERE
    #   clause is spelled the same as the joined `status` CTE — confirm it resolves to the
    #   intended application status column.
    getFunnelDF = function (admethod, timestart, timeend) {
        # The SQL is assembled with paste0(); everything between the quotes is query text.
        queryReporting(paste0(
    "
    with status as
    (
        select
            c_app.id as application
            , c_app.lde4_lead_id as lead_id
            , c_app.createddate at time zone 'America/Chicago' as appldate
            , c_app.funded_amount
            , c_app.type_formula

            , max(case  when new_value = 'NEW - ENTERED'
                        then 1 else 0
                        end) as newentered
            , max(case  when old_value = 'NEW - ENTERED'
                        and new_value = 'BUSINESS RULES PASSED'
                        then 1 else 0
                        end) as bizrulespassed

            , max(case  when new_value = 'BUREAU APPROVED'
                        then 1 else 0
                        end) as qualified

            , max(case  when new_value in ('BANK VERIFICATION COMPLETED', 'NEW - SCORECARD GENERATED')
                        then 1 else 0
                        end) as bankverified

            , max(case  when c_ash.old_value = 'NEW - PRICING GENERATED'
                        and c_ash.new_value in ('CONTRACT SIGNED', 'WAITING ON STIPULATIONS')
                        then 1 else 0
                        end) as passscorecardratecard

            , max(case  when c_ash.old_value in ('NEW - PRICING GENERATED', 'WAITING ON STIPULATIONS')
                        and c_ash.new_value = 'CONTRACT SIGNED'
                        then 1 else 0 end) as contractsigned

            , max(case  when c_ash.new_value = 'LOAN APPROVED'
                        then 1 else 0
                        end) as funded
        from
            cloudlending.applications as c_app
            inner join
                cloudlending.advertising_method as c_am
                on c_app.advertising_method = c_am.id
                and c_am.name = '", admethod, "'
            inner join
                cloudlending.application_status_history as c_ash
                on c_app.id = c_ash.application
        where
            c_app.createddate at time zone 'America/Chicago' >= '", timestart, "'
            and c_app.createddate at time zone 'America/Chicago' < '", timeend, "'
        group by
            1,2,3,4,5
    )
    , cs_decisioned_apps as
    (
        select
            c_app.id as application
        from
            cloudlending.applications as c_app
            inner join
                status
                on c_app.id = status.application
                and status.contractsigned = 1
        where
            denialreason not in ('Time In Pending', 'Withdraw')
            or (denialreason isnull and status = 'LOAN APPROVED')
    )
    select
        status.application
        , status.lead_id
        , status.appldate
        , status.type_formula
        , status.funded_amount
        , status.newentered
        , status.bizrulespassed
        , status.qualified
        , status.bankverified
        , status.passscorecardratecard
        , status.contractsigned
        , case when cs_decisioned_apps.application notnull then 1 else 0 end as cs_decisioned
        , status.funded
    from
      status
      left join
        cs_decisioned_apps
        on status.application = cs_decisioned_apps.application
    "
    ))
    }
    
    leads = getLeadsDF(
        admethod = admethod,
        timestart = timestart,
        timeend = timeend,
        limit = limit
    )
    
    funnel = getFunnelDF(
        admethod = admethod,
        timestart = timestart,
        timeend = timeend
    )
    
    loan.performance <<- suppressWarnings({suppressMessages({
        read_csv("..\\data\\loan-performance-by-app.csv") %>% 
            select(
                application = applicationid,
                truefpd
            )    
    })})
    
    
    
    df = leads %>%
        left_join(
            funnel,
            by = 'lead_id'
        ) %>% 
        left_join(
            loan.performance,
            by = 'application'
        ) %>% 
        mutate_at(
            .vars = colnames(funnel %>% select(-application, -lead_id, -appldate, -funded_amount, -type_formula)),
            .funs = function (x) { x %>% replace_na(replace = 0) %>% as.logical() }
        )
    
    if (write)
        df %>% write.csv(paste0("..\\data\\df-lenderedge-", timestart, ".csv"))
    
    return(df)
    
}

In [None]:
# Casts the raw lead/Clarity columns to the types the rest of the pipeline expects.
#
# Args:
#   raw.df: data frame containing every column named below.
# Returns: raw.df with the listed columns cast via as.logical / as.numeric; all other
#   columns are left untouched.
convertDataTypes = function (raw.df) {

    # Flags that must end up as logical.
    boolean.columns = c(
        'paycheck_direct_deposit',
        'srh_online_loan_opened_in_the_last_year',
        'srh_online_loan_inquiry_in_the_last_thirty_days'
    )

    # Scores, counts and day-deltas that must end up as numeric.
    numeric.columns = c(
        'ofac_score',
        'ssn_distinct_first_last_name_count',
        'ccr_score',
        'ccr_number_of_loans',
        'ccr_number_of_bank_accounts',
        'ccr_highest_number_of_days_past_due',
        'ccr_current_inquiry_cluster_position',
        'ccr_days_since_last_loan_charged_off',
        'ccr_days_since_inquiry_previously_seen',
        'ccr_number_of_employers_last_six_months',
        'srh_loans_in_collections',
        'srh_spml_average_rollovers',
        'srh_amount_loans_charged_off',
        'ticrh_ninety_days_ago',
        'ticrh_twentyfour_hours_ago'
    )

    with.booleans = mutate_at(raw.df, .vars = boolean.columns, .funs = as.logical)
    with.numerics = mutate_at(with.booleans, .vars = numeric.columns, .funs = as.numeric)

    with.numerics

}

In [None]:
# Derives model-ready columns from the type-converted frame.
#
# - One-hot style flags for ccr_worst_payment_rating (null / '+' / '0' / '#' / anything else).
# - ccr_has_previous_loan_charged_off: TRUE when a days-since-charge-off value is present.
# - payrolltype: NA replaced with 'Missing'.
# - campaign_id: reduced to the text found between double quotes.
# The two source columns the flags are derived from are dropped at the end.
processColumns = function (converted.df) {

    # Returns capture group 1 of the quoted-text pattern for a single raw value.
    # NOTE(review): when nothing matches this yields character(0), which as.character()
    # below renders as the literal string "character(0)" — confirm that is acceptable.
    extractQuoted = function (raw.value) {
        hits = str_match_all(raw.value, "\\\"(.*)\\\"")[[1]]
        hits[, 2]
    }

    with.derived = mutate(
        converted.df,

        ccr_worst_payment_rating_null = is.na(ccr_worst_payment_rating),
        ccr_worst_payment_rating_plus = !ccr_worst_payment_rating_null & ccr_worst_payment_rating == '+',
        ccr_worst_payment_rating_zero = !ccr_worst_payment_rating_null & ccr_worst_payment_rating == '0',
        ccr_worst_payment_rating_hash = !ccr_worst_payment_rating_null & ccr_worst_payment_rating == '#',
        # "else" = none of the four flags above is set.
        ccr_worst_payment_rating_else =
            !ccr_worst_payment_rating_plus &
            !ccr_worst_payment_rating_zero &
            !ccr_worst_payment_rating_hash &
            !ccr_worst_payment_rating_null,

        ccr_has_previous_loan_charged_off = !is.na(ccr_days_since_last_loan_charged_off),

        payrolltype = replace_na(payrolltype, 'Missing'),
        campaign_id = as.character(map(campaign_id, extractQuoted))
    )

    select(
        with.derived,
        -ccr_worst_payment_rating,
        -ccr_days_since_last_loan_charged_off
    )
}

In [None]:
# Fills missing values column by column: median for one set of features, mean for another.
#
# Args:
#   processed.df: data frame containing every column named below.
# Returns: processed.df with NAs in those columns replaced by the column's own median / mean
#   (computed with na.rm = TRUE); paycheck_direct_deposit is cast back to logical afterwards.
imputeMissingValues = function (processed.df) {

    # Median-imputed features.
    # Disabled: ccr_days_since_previous_bank_account_previously_seen,
    #           ccr_days_since_reported_income_previously_seen
    median.filled = c(
        'ccr_days_since_inquiry_previously_seen',
        'ccr_highest_number_of_days_past_due',
        'paycheck_direct_deposit'
    )

    # Mean-imputed features.
    # Disabled: ccr_number_of_loans_paid_off, ccr_number_of_loans_past_due,
    #           ccr_number_of_loans_current_and_open
    mean.filled = c(
        'ccr_number_of_loans',
        'ccr_number_of_bank_accounts',
        'ccr_current_inquiry_cluster_position',
        'ccr_number_of_employers_last_six_months',
        'ccr_score'
    )

    # Builds a one-column imputer that replaces NA with statistic(column, na.rm = TRUE).
    fillWith = function (statistic) {
        function (column) {
            replace_na(column, replace = statistic(column, na.rm = TRUE))
        }
    }

    median.done = mutate_at(processed.df, .vars = median.filled, .funs = fillWith(median))
    mean.done = mutate_at(median.done, .vars = mean.filled, .funs = fillWith(mean))

    # The median fill can leave paycheck_direct_deposit non-logical, so cast it back.
    mutate(
        mean.done,
        paycheck_direct_deposit = as.logical(paycheck_direct_deposit)
    )

}

In [None]:
# Placeholder pipeline stage: currently hands the imputed frame back unchanged.
standardizeValues = function (imputed.df) {
    imputed.df
}

In [None]:
# Drops identifier, PII, bookkeeping and intermediate-funnel columns, then turns every
# remaining character or logical column into a factor.
#
# newentered and qualified are intentionally NOT dropped here.
keepModelFeatures = function (standardized.df) {

    features.only = select(
        standardized.df,
        # lead / report identifiers and metadata
        -lead_id, -leadofferid, -lead_time, -partnerid, -accepted, -reason, -code,
        -report_requested, -report_received,
        # bank and payroll details
        -bankname, -abaroutingnumber, -accountnumber, -payrollfrequency, -payrolltype,
        -work_hiredate, -lastpayrolldate, -dateofbirth,
        -report_time, -has_clarity, -appldate,
        # intermediate funnel stages
        -bizrulespassed, -bankverified, -passscorecardratecard, -contractsigned, -cs_decisioned,
        # offer terms and application key
        -offer_interestrate, -offer_monthlypayment, -application
    )

    # A column is never both character and logical, so one pass covers both conversions.
    mutate_if(
        features.only,
        .predicate = function (column) { is.character(column) || is.logical(column) },
        .funs = as.factor
    )

}

In [None]:
# Keeps only the rows of feature.df that contain no NA in any column.
#
# Uses the vectorized complete.cases() instead of a row-wise apply(): apply() first
# coerces the whole data frame to a (character) matrix and then calls an R function once
# per row, which is slow on large extracts and discards column types along the way.
#
# Args:
#   feature.df: data frame of atomic columns.
# Returns: feature.df restricted to rows with zero missing values.
removeMissingObservations = function (feature.df) {

    feature.df %>%
        filter(
            complete.cases(feature.df)
        )

}

### Get Training

In [None]:
# Pulls the training windows from the database, runs each through the full preparation
# pipeline, stacks them, and writes the result to "..\\data\\df-lenderedge.csv".
#
# getLeadsPerformance() is called with write = TRUE for every window.
# Returns: the stacked training data frame.
writeTraining = function () {

    # Extraction windows, in the order they are stacked (newest first).
    # February stops at 2020-02-23 (an alternative end of 2020-03-01 was disabled).
    # A 2019-11-01 -> 2019-12-01 window also existed but is disabled.
    windows = list(
        list(timestart = '2020-02-01', timeend = '2020-02-23'),
        list(timestart = '2020-01-01', timeend = '2020-02-01'),
        list(timestart = '2019-12-01', timeend = '2020-01-01')
    )

    # Query one window and push it through every preparation stage.
    prepareWindow = function (window) {
        getLeadsPerformance(
            admethod = 'LenderEdge 4',
            timestart = window$timestart,
            timeend = window$timeend,
            limit = NA,
            write = TRUE
        ) %>%
            convertDataTypes() %>%
            processColumns() %>%
            imputeMissingValues() %>%
            standardizeValues() %>%
            keepModelFeatures() %>%
            removeMissingObservations()
    }

    df = do.call(
        rbind,
        lapply(windows, prepareWindow)
    )

    write.csv(df, "..\\data\\df-lenderedge.csv")

    return(df)

}

In [None]:
# Rebuilds the training frame from the cached per-window CSV extracts (no database access).
#
# Each extract is type-converted, column-processed and reduced to model features.
# Imputation, standardization and NA-row removal are not applied on this path.
# Returns: the three prepared extracts stacked with rbind (Feb, Jan, Dec order).
readTraining = function () {

    cached.extracts = c(
        "..\\data\\df-lenderedge-2020-02-01.csv",
        "..\\data\\df-lenderedge-2020-01-01.csv",
        "..\\data\\df-lenderedge-2019-12-01.csv"
    )

    # Read one extract quietly, then prepare it.
    loadExtract = function (csv.path) {
        # X1 is presumably the unnamed row-name column written by write.csv — verify.
        raw.df = suppressMessages({suppressWarnings({
            read_csv(csv.path) %>% select(-X1)
        })})

        raw.df %>%
            convertDataTypes() %>%
            processColumns() %>%
            keepModelFeatures()
    }

    df = do.call(
        rbind,
        lapply(cached.extracts, loadExtract)
    )

    return(df)

}

In [None]:
# Training frame, rebuilt from the cached CSV extracts.
df.train = readTraining()

### Get Test

In [None]:
# Pulls the test window from the database, runs the full preparation pipeline, and writes
# the prepared frame to "..\\data\\df-lenderedge-test.csv".
#
# Args:
#   timestart, timeend: window bounds forwarded to getLeadsPerformance().
# Returns: the prepared test data frame.
writeTest = function (timestart = '2020-03-01', timeend = '2020-03-08') {

    raw.df = getLeadsPerformance(
        admethod = 'LenderEdge 4',
        timestart = timestart,
        timeend = timeend,
        limit = NA,
        write = TRUE
    )

    # Preparation stages, applied one at a time.
    typed.df = convertDataTypes(raw.df)
    processed.df = processColumns(typed.df)
    imputed.df = imputeMissingValues(processed.df)
    standardized.df = standardizeValues(imputed.df)
    feature.df = keepModelFeatures(standardized.df)
    df = removeMissingObservations(feature.df)

    write.csv(df, "..\\data\\df-lenderedge-test.csv")

    return(df)

}

In [None]:
# Rebuilds the test frame from the cached extract "df-lenderedge-2020-03-01.csv".
#
# To regenerate that extract from the database, run writeTest() first.
# Imputation, standardization and NA-row removal are not applied on this path.
# Returns: the prepared test data frame.
readTest = function () {

    # X1 is presumably the unnamed row-name column written by write.csv — verify.
    raw.df = suppressMessages({suppressWarnings({
        read_csv("..\\data\\df-lenderedge-2020-03-01.csv") %>% select(-X1)
    })})

    typed.df = convertDataTypes(raw.df)
    processed.df = processColumns(typed.df)
    df = keepModelFeatures(processed.df)

    return(df)

}

In [None]:
# Test frame, rebuilt from the cached CSV extract.
df.test = readTest()

### Get Validate

In [None]:
# Pulls the validation window from the database, runs the full preparation pipeline, and
# writes the prepared frame to "..\\data\\df-lenderedge-validate.csv".
#
# Args:
#   timestart, timeend: window bounds forwarded to getLeadsPerformance().
# Returns: the prepared validation data frame.
writeValidate = function (timestart = '2020-02-23', timeend = '2020-03-01') {
    
    df =
        getLeadsPerformance(
            admethod = 'LenderEdge 4',
            timestart = timestart,
            timeend = timeend,
            limit = NA,
            write = TRUE
        ) %>%
        convertDataTypes() %>%
        processColumns() %>%
        imputeMissingValues() %>%
        standardizeValues() %>% 
        keepModelFeatures() %>%
        removeMissingObservations()
    
    # Fix: this previously wrote to "df-lenderedge-test.csv" (copy-pasted from writeTest),
    # silently overwriting the prepared test set. Validation now gets its own file.
    df %>% write.csv("..\\data\\df-lenderedge-validate.csv")
    
    return(df)
    
}

In [None]:
# Rebuilds the validation frame from the cached extract "df-lenderedge-2020-02-23.csv".
#
# To regenerate that extract from the database, run writeValidate() first.
# Imputation, standardization and NA-row removal are not applied on this path.
# Returns: the prepared validation data frame.
readValidate = function () {

    # X1 is presumably the unnamed row-name column written by write.csv — verify.
    raw.df = suppressMessages({suppressWarnings({
        read_csv("..\\data\\df-lenderedge-2020-02-23.csv") %>% select(-X1)
    })})

    typed.df = convertDataTypes(raw.df)
    processed.df = processColumns(typed.df)
    df = keepModelFeatures(processed.df)

    return(df)

}

In [None]:
# Validation frame, rebuilt from the cached CSV extract.
df.validate = readValidate()

### Initial Feature Selection & Split Training, Test, Validation into 60 / 20 / 20

In [None]:
# Upsamples the minority class (funded == 'TRUE') by appending extra copies of its rows.
#
# Args:
#   df.train:          training data frame with a `funded` column.
#   upsample.multiple: number of additional copies of the funded rows to append.
#   quietly:           when FALSE, prints per-class row counts before and after.
# Returns: df.train followed by upsample.multiple copies of its funded rows.
upsampleImbalancedClassInTraining = function (df.train, upsample.multiple = 34, quietly = FALSE) {

    # Prints a heading, the row count per funded class, and a trailing blank line.
    printClassCounts = function (heading, frame) {
        cat(heading)
        print(frame %>% group_by(funded) %>% summarize(n()))
        cat('\n')
    }

    if (!quietly) {
        printClassCounts("Pre Upsample:\n", df.train)
    }

    train.funded = df.train %>% filter(funded == 'TRUE')

    # One list element per extra copy of the funded rows.
    funded.copies = replicate(
        n = upsample.multiple,
        expr = rbind(data.frame(), train.funded),
        simplify = FALSE
    )

    train.bal = rbind(
        df.train,
        do.call(rbind, funded.copies)
    )

    if (!quietly) {
        printClassCounts("\nPost Upsample:\n", train.bal)
    }

    return(train.bal)

}

In [None]:
# Drops rows of a hold-out frame whose categorical values never occur in the training frame,
# then prints how many rows were removed.
#
# Args:
#   df.generalize:   hold-out frame (test or validation) to filter.
#   df.train:        training frame supplying the set of known values per column.
#   generalize.name: optional label used in the printed message.
# Returns: the rows of df.generalize whose value in every checked column is seen in df.train.
removeNewClassesInGeneralization = function (df.generalize, df.train, generalize.name = NA) {

    # Columns checked against training. payrolltype is deliberately not checked.
    checked.columns = c(
        'incometype',
        'statecode',
        'campaign_id',
        'paycheck_direct_deposit',
        'srh_online_loan_opened_in_the_last_year',
        'srh_online_loan_inquiry_in_the_last_thirty_days',
        'ccr_worst_payment_rating_null',
        'ccr_worst_payment_rating_plus',
        'ccr_worst_payment_rating_zero',
        'ccr_worst_payment_rating_hash',
        'ccr_worst_payment_rating_else',
        'ccr_has_previous_loan_charged_off'
    )

    # A row survives only if it passes the membership test for every checked column.
    seen.in.training = rep(TRUE, nrow(df.generalize))
    for (column in checked.columns) {
        seen.in.training = seen.in.training &
            (df.generalize[[column]] %in% unique(df.train[[column]]))
    }

    df.similar = df.generalize %>% filter(seen.in.training)

    removed.count = nrow(df.generalize) - nrow(df.similar)
    message.tail = ifelse(
        is.na(generalize.name),
        " rows removed.",
        paste0(" rows removed from ", generalize.name, ".")
    )

    print(paste0(removed.count, message.tail))
    cat("\n")

    return(df.similar)

}

In [None]:
# Assembles the four modeling frames: train, test, validate and the upsampled train.bal.
#
# Test and validation rows with categorical values unseen in training are removed first.
# Every frame then has its non-numeric columns converted to factors and statecode dropped.
# Returns: list(train, test, validate, train.bal).
processAll = function (df.train, df.test, df.validate) {

    # Shared last step: factorize everything non-numeric, then drop statecode.
    finalizeFrame = function (frame) {
        frame %>%
            mutate_if(
                .predicate = function (column) { !is.numeric(column) },
                .funs = as.factor
            ) %>%
            select(
                -statecode
            )
    }

    # Built in this order so the printed diagnostics appear test, validate, upsample.
    train.final = finalizeFrame(df.train)

    test.final = finalizeFrame(
        removeNewClassesInGeneralization(df.test, df.train, "test set")
    )

    validate.final = finalizeFrame(
        removeNewClassesInGeneralization(df.validate, df.train, "validation set")
    )

    train.bal.final = finalizeFrame(
        upsampleImbalancedClassInTraining(df.train)
    )

    list(
        train = train.final,
        test = test.final,
        validate = validate.final,
        train.bal = train.bal.final
    )

}

In [None]:
# Build the list of modeling frames (train, test, validate, train.bal) used by every
# model-fitting cell below.
data.split = processAll(
    df.train,
    df.test,
    df.validate
)

In [None]:
# Inspect column types and factor levels of the upsampled training frame.
data.split$train.bal %>% str()

# Modeling

### Initial

In [None]:
# Trains a kernel SVM ('classif.ksvm', probability output) on the upsampled training frame
# and evaluates it on the test and validation frames.
#
# Args:
#   data.split: list with train.bal, test and validate data frames.
# Returns: getEvaluation() of the validation predictions.
# NOTE(review): the test-set evaluation is computed but its result is discarded, because
#   only the last expression is returned — confirm whether both should be returned.
fitSVM = function (data.split) {

    ####  Setup  ####

    # To list candidate learners:
    #   listLearners() %>% filter(type == 'classif' & name %>% str_detect('(?:svm|SVM|[Ss]upport)'))

    svm.model = train(
        learner = makeLearner(
            cl = 'classif.ksvm',
            id = 'svm.learner',
            predict.type = 'prob',
            fix.factors.prediction = FALSE
        ),
        task = makeClassifTask(
            id = 'svm.task',
            data = data.split$train.bal,
            target = 'funded',
            positive = 'TRUE'
        )
    )

    # Score both hold-out frames before evaluating either.
    holdout.predictions = lapply(
        list(test = data.split$test, validate = data.split$validate),
        function (holdout) {
            predict(object = svm.model, newdata = holdout)
        }
    )

    ####  Evaluate  ####

    getEvaluation(holdout.predictions$test)
    getEvaluation(holdout.predictions$validate)

}

In [None]:
# Trains a logistic regression ('classif.logreg') on the upsampled training frame and
# evaluates it on the test and validation frames.
#
# Args:
#   data.split: list with train.bal, test and validate data frames.
# Returns: getEvaluation() of the validation predictions.
# NOTE(review): the test-set evaluation is computed but its result is discarded, because
#   only the last expression is returned — confirm whether both should be returned.
fitLogisticRegression = function (data.split) {

    # To list candidate learners:
    #   listLearners() %>% filter(type == 'classif' & name %>% str_detect('[Ll]ogistic'))

    ####  Setup  ####

    logistic.model = train(
        learner = makeLearner(
            cl = 'classif.logreg',
            id = 'logistic.learner'
        ),
        # Swap in data.split$train to fit on the un-upsampled frame instead.
        task = makeClassifTask(
            id = 'logistic.task',
            data = data.split$train.bal,
            target = 'funded',
            positive = 'TRUE'
        )
    )

    # Score both hold-out frames before evaluating either.
    holdout.predictions = lapply(
        list(test = data.split$test, validate = data.split$validate),
        function (holdout) {
            predict(object = logistic.model, newdata = holdout)
        }
    )

    ####  Evaluate  ####

    getEvaluation(holdout.predictions$test)
    getEvaluation(holdout.predictions$validate)

}

### Decision Tree

In [None]:
# Summarizes a prediction object into a confusion table and four headline metrics.
#
# Args:
#   predict.object: object whose as.data.frame() form has `truth` and `response` columns.
# Returns: list(
#   detailed = row count per (truth, response) combination,
#   metrics  = data frame of prior, accuracy, recall and precision
# ).
getEvaluation = function (predict.object) {

    detailed = predict.object %>%
        as.data.frame(stringsAsFactors = FALSE) %>%
        group_by(truth, response) %>%
        summarize(n = n()) %>%
        ungroup()

    # Total observations in the confusion cells selected by `keep` (NA selections are ignored).
    countWhere = function (keep) {
        sum(detailed$n[which(keep)])
    }

    total = sum(detailed$n)
    actual.positive = detailed$truth == TRUE
    predicted.positive = detailed$response == TRUE
    true.positive = countWhere(actual.positive & predicted.positive)

    prior = countWhere(actual.positive) / total                          ## share of actual positives
    accuracy = countWhere(detailed$truth == detailed$response) / total   ## (TP + TN) / all
    recall = true.positive / countWhere(actual.positive)                 ## TP / (TP + FN)
    precision = true.positive / countWhere(predicted.positive)           ## TP / (TP + FP)

    metrics = data.frame(
        metric = c('prior', 'accuracy', 'recall', 'precision'),
        value = c(prior, accuracy, recall, precision),
        stringsAsFactors = FALSE
    )

    list(
        detailed = detailed,
        metrics = metrics
    )
}

##### Set Hyperparameters

In [None]:
# Fits an rpart classification tree on the upsampled training frame and renders it to a
# temporary PDF, printing the PDF path with cat().
#
# Args:
#   data.split: list with a train.bal data frame containing the target `funded` plus the
#               columns dropped below (newentered, qualified, funded_amount, type_formula,
#               truefpd).
#   evaluate:   currently unused; the hold-out evaluation code is commented out below.
#   xval:       number of cross validations (rpart).
#   minsplit:   minimum observations in a node for a split to be attempted.
#   minbucket:  minimum observations in a leaf node.
#   cp:         complexity parameter; minimum improvement for a split to be kept.
#
# Side effects: attaches the rattle package; writes a PDF under tempdir().
# Returns: the result of cat(), i.e. NULL invisibly.
getDecisionTree = function (data.split, evaluate = TRUE, xval = 0, minsplit = 1000, minbucket = 1000, cp = 0.002) {


    ####  Setup  ####

    library(rattle)
    # library(rpart)
    # suppressWarnings({
    #     listLearners() %>%
    #         filter(
    #             type == 'classif' &
    #             name %>% str_detect('[Tt]ree')
    #         )
    # })
    # getParamSet('classif.rpart')

    # loss.matrix = matrix(
    #     c(0,1,1.6,0),
    #     byrow = TRUE,
    #     nrow = 2
    # )

    # Downstream funnel outcomes and loan attributes are removed so the tree cannot split
    # on them. (A disabled variant additionally filtered to ccr_score < 543 &
    # ccr_number_of_bank_accounts >= 2.5.)
    rpart.task = makeClassifTask(
        id = 'rpart.task',
        data = data.split$train.bal %>%
            select(
                -newentered,
                -qualified,
                -funded_amount,
                -type_formula,
                -truefpd
            ),
        target = 'funded',
        positive = 'TRUE'
    )

    rpart.learner = makeLearner(
        cl = 'classif.rpart',
        id = 'rpart.learner',
        xval = xval,                  ##  Number of Cross Validations
        minsplit = minsplit,          ##  Number of Obs in Node for Split to be Attempted
        minbucket = minbucket,        ##  Minimum number of observations in Leaf Node
        cp = cp                       ##  Minimum information gain for split to execute.
    #     loss = loss.matrix
    )



    ####  Resampling  ####

    # rpart.resample = makeResampleDesc('CV', iters = 5, stratify = TRUE)

    # rpart.cv = resample(
    #     learner = rpart.learner,
    #     task = rpart.task,
    #     resampling = rpart.resample,
    #     measures = list(acc, mmce, fpr),
    #     show.info = FALSE
    # )

    # # rpart.cv %>% .$measures.test
    # # rpart.cv %>% .$aggr %>% as.data.frame()



    ####  Hypertuning  ####

    # rpart.params = makeParamSet(
    #     makeIntegerParam('minsplit', lower = 10, upper = 50),
    #     makeIntegerParam('minbucket', lower = 5, upper = 50),
    #     makeNumericParam('cp', lower = 0.0001, upper = 0.2)
    # )

    # rpart.search = makeTuneControlGrid()

    # rpart.tune = tuneParams(
    #     learner = rpart.learner,
    #     task = rpart.task,
    #     resampling = rpart.resample,
    #     par.set = rpart.params,
    #     control = rpart.search,
    #     measures = list(mmce, acc, fpr),
    #     show.info = FALSE
    # )

    # setHyperPars(
    #     learner = rpart.learner,
    #     par.vals = rpart.tune$x
    # )



    ####  Training  ####

    rpart.model = train(
        learner = rpart.learner,
        task = rpart.task
    )

#     rpart.predict = predict(
#         object = rpart.model,
# #         newdata = data.split$test %>% filter(qualified == 'TRUE') %>% select(-newentered, -qualified)
#         newdata = data.split$test %>% select(-newentered, -qualified)
#     )

#     rpart.validate = predict(
#         object = rpart.model,
#         newdata = data.split$validate %>% select(-newentered, -qualified)
#     )



    ####  Evaluation  ####

#     test.eval = rpart.predict %>% getEvaluation()
#     validate.eval = rpart.validate %>% getEvaluation()


#     rules = rpart.model$learner.model
#     rules.df = rpart.plot::rpart.rules(rpart.model$learner.model)
# #     rules.tree = rpart.plot::rpart.plot(rpart.model$learner.model, type = 4)
# #     fancyRpartPlot(rpart.model$learner.model)


    ####  Plot  ####

    tf = tempfile(fileext = ".pdf")
    pdf(tf)
    # Close the PDF device even when plotting fails; otherwise the device stays open and
    # later plots in the session are silently redirected into the abandoned file.
    tryCatch(
        fancyRpartPlot(rpart.model$learner.model),
        finally = dev.off()
    )
    cat(tf)

#     if (evaluate) {
#         list(
#             test.eval = test.eval,
#             validate.eval = validate.eval
#         )
#     }

}

In [None]:
# Estimates the funnel impact of excluding the leads matched by an exclusion predicate.
#
# Args:
#   data.split:                  list with train.bal, train, test and validate data frames.
#   nodes.to.exclude.logic.expr: unevaluated logical expression over the frame's columns;
#                                rows where it is TRUE are the ones proposed for exclusion.
# Returns: an unnamed list with one element per stage (train.bal, train, test, validate),
#   each list(change = <one-row data frame of formatted deltas, proposed vs observed>).
getDecisionTreeProposal = function (data.split, nodes.to.exclude.logic.expr) {


    ####  Formatting helpers  ####

    # part / whole as a percentage string with two decimals.
    formatShare = function (part, whole) {
        paste0(round(100*(part / whole), 2), '%')
    }

    # Relative change versus the observed value, as a percentage string with two decimals.
    formatPctChange = function (proposed.value, observed.value) {
        paste0(round(100*(proposed.value - observed.value)/observed.value, 2), '%')
    }

    # Absolute change in a rate, in whole basis points with an explicit '+' when >= 0.
    formatBpsChange = function (proposed.value, observed.value) {
        bps = round(10000 * (proposed.value - observed.value),0)
        paste0(ifelse(bps >= 0, '+', ''), bps)
    }


    ####  Funnel summary  ####

    # One-row funnel summary of a frame. Statement order matters: later entries read the
    # summary values created by earlier ones (e.g. funded.dollar must be computed before
    # `funded` is replaced by its count).
    # NOTE(review): new.fpd averages truefpd * (type_formula == 'New') over all rows with a
    # value, so non-'New' loans contribute zeros — confirm this dilution is intended.
    summarizeFunnel = function (frame) {
        frame %>%
            summarize(
                accepted = n(),
                app = sum(newentered == 'TRUE'),
                qualified = sum(qualified == 'TRUE'),
                funded.dollar = sum(
                    (funded == 'TRUE') *
                    funded_amount,
                    na.rm = TRUE
                ),
                new.funded.dollar = sum(
                    (funded == 'TRUE') *
                    (type_formula == 'New') *
                    funded_amount,
                    na.rm = TRUE
                ),
                funded = sum(funded == 'TRUE'),
                app.rate = app/accepted,
                qr = qualified/app,
                fr = funded/qualified,
                app.to.fund = funded/app,
                accept.to.fund = funded/accepted,
                fpd.mature = sum(!is.na(truefpd))/n(),
                fpd = mean(
                    truefpd,
                    na.rm = TRUE
                ),
                new.fpd = mean(
                    truefpd *
                    (type_formula == 'New'),
                    na.rm = TRUE
                )
            )
    }


    ####  Evaluation  ####

    evaluatePerformance = function (data.split, stage) {

        stage.df = data.split[[stage]]

        observed = summarizeFunnel(stage.df)

        proposed = stage.df %>%
            filter(
                !eval(nodes.to.exclude.logic.expr)
            ) %>%
            summarizeFunnel()

        # Size of the excluded segment, among all rows and among funded rows.
        # NOTE(review): funded.dollars.size is a share of funded ROW COUNT, not dollars.
        excluded.rows = stage.df %>% filter(eval(nodes.to.exclude.logic.expr)) %>% nrow()
        funded.excluded.rows = stage.df %>% filter(funded == TRUE & eval(nodes.to.exclude.logic.expr)) %>% nrow()
        funded.rows = stage.df %>% filter(funded == TRUE) %>% nrow()

        change = data.frame(
            'Size__' = '',
            accepts.size = formatShare(excluded.rows, stage.df %>% nrow()),
            funded.dollars.size = formatShare(funded.excluded.rows, funded.rows),
            'Volumes__' = '',
            accepted = formatPctChange(proposed$accepted, observed$accepted),
            app = formatPctChange(proposed$app, observed$app),
            qualified = formatPctChange(proposed$qualified, observed$qualified),
            funded = formatPctChange(proposed$funded, observed$funded),
            funded.dollar = formatPctChange(proposed$funded.dollar, observed$funded.dollar),
            new.funded.dollar = formatPctChange(proposed$new.funded.dollar, observed$new.funded.dollar),
            'Rates__' = '',
            app.rate = formatBpsChange(proposed$app.rate, observed$app.rate),
            qr = formatBpsChange(proposed$qr, observed$qr),
            fr = formatBpsChange(proposed$fr, observed$fr),
            app.to.fund = formatBpsChange(proposed$app.to.fund, observed$app.to.fund),
            accept.to.fund = formatBpsChange(proposed$accept.to.fund, observed$accept.to.fund),
            fpd.mature = formatBpsChange(proposed$fpd.mature, observed$fpd.mature),
            fpd = formatBpsChange(proposed$fpd, observed$fpd),
            new.fpd = formatBpsChange(proposed$new.fpd, observed$new.fpd),
            stringsAsFactors = FALSE
        )

        list(
            change = change
        )

    }

    lapply(
        c('train.bal', 'train', 'test', 'validate'),
        function (stage) {
            evaluatePerformance(data.split, stage)
        }
    )

}

##### Run Model

In [None]:
# Run getDecisionTree on data.split with evaluate = FALSE.
# Warnings and messages raised during the call are silenced.
suppressWarnings(suppressMessages(
    data.split %>% getDecisionTree(evaluate = FALSE)
))

##### Evaluate Model for Exclusion Nodes

In [None]:
# Row-level exclusion rule, captured unevaluated with expr() so it can be
# eval()'d later against the columns of a data frame.
# A row matches when EITHER
#   (a) ccr_score and ccr_number_of_bank_accounts are both missing, OR
#   (b) every score / bank-account / campaign / days-past-due condition in
#       the second clause holds.
exclude.logic <- expr(
    (
        is.na(ccr_score) & is.na(ccr_number_of_bank_accounts)
    ) |
    (
        ccr_score < 543 &
        ccr_number_of_bank_accounts >= 2.5 &
        campaign_id %in% c(
            '1716', '1724', '1726', '1730', '1731',
            '1732', '1734', '1744', '1745'
        ) &
        ccr_highest_number_of_days_past_due < 5.5
    )
)

##### Evaluate Exclusion Nodes Generalization - Aggregate

In [None]:
# Pass data.split and the exclusion rule to getDecisionTreeProposal.
# Warnings and messages raised during the call are silenced.
suppressWarnings(suppressMessages(
    data.split %>% getDecisionTreeProposal(exclude.logic)
))

##### Evaluate Exclusion Nodes Generalization - Time Series

In [None]:
# Read each dated df-lenderedge CSV extract and row-bind them into one frame.
# The 2019-11-01 extract is deliberately left out (commented) below.
df.ts <- do.call(
    rbind,
    lapply(
        c(
#             "..\\data\\df-lenderedge-2019-11-01.csv",
            "..\\data\\df-lenderedge-2019-12-01.csv",
            "..\\data\\df-lenderedge-2020-01-01.csv",
            "..\\data\\df-lenderedge-2020-02-01.csv",
            "..\\data\\df-lenderedge-2020-02-23.csv",
            "..\\data\\df-lenderedge-2020-03-01.csv"
        ),
        function (csv.path) {
            # Silence read_csv's warnings and messages for each file.
            suppressWarnings(suppressMessages(read_csv(csv.path)))
        }
    )
)

In [None]:
###  Mix  ###
# Per lead date: share of that day's rows falling in each value of the
# exclusion flag, drawn as one line per flag value.
# NOTE(review): rows where exclude.logic evaluates to NA would form their own
# (NA) group / series -- confirm whether that is intended.
df.ts %>%
    mutate(
        lead_date = as.Date(lead_time),
        exclude = eval(exclude.logic)
    ) %>%
    group_by(lead_date) %>%
    mutate(day.total = n()) %>%             # row count for the whole day
    group_by(lead_date, exclude) %>%        # replaces the per-day grouping
    summarise(p = n() / mean(day.total)) %>%
    ungroup() %>%
    ggplot(aes(x = lead_date, y = p, color = exclude)) +
    geom_line() +
    geom_point() +
    scale_y_continuous(labels = scales::percent) +
    labs(title = "Mix of Exclude") +
    theme_bw()

###  Conversion - FR  ###
# Per lead date and exclusion flag: sum(funded) divided by the group's row
# count, drawn as one line per flag value.
# Alternative conversion definitions kept for quick swapping:
#         conversion = sum(funded)/sum(qualified)
#         conversion = sum(qualified)/sum(newentered)
#         conversion = sum(newentered)/n()
#         conversion = sum(funded)/sum(newentered)
df.ts %>%
    mutate(
        lead_date = as.Date(lead_time),
        exclude = eval(exclude.logic)
    ) %>%
    group_by(lead_date, exclude) %>%
    summarise(conversion = sum(funded) / n()) %>%
    ungroup() %>%
    ggplot(aes(x = lead_date, y = conversion, color = exclude)) +
    geom_line() +
    geom_point() +
    scale_y_continuous(labels = scales::percent) +
    labs(title = "Conversion of Exclude") +
    theme_bw()

In [None]:
####  Mix Chart  ####

# top.campaigns = df.train %>% filter(lead_time >= '2020-02-01' %>% as.Date() & payrolltype == 'DirectDeposit') %>% group_by(campaign_id) %>% summarize(n = n()) %>% ungroup() %>% arrange(desc(n)) %>% head()

# df.train %>%
#     mutate(
#         lead.date = lead_time %>% cut("week") %>% as.Date()
#     ) %>% 
#     group_by(
#         lead.date
#     ) %>% 
#     mutate(
#         date.total = n()
#     ) %>% 
#     ungroup() %>% 
#     group_by(
#         lead.date,
#         campaign_id
#     ) %>% 
#     summarize(
#         mix = n()/min(date.total),
#         dd = sum(payrolltype == 'DirectDeposit') / min(date.total)
#     ) %>% 
#     ungroup() %>% 
#     filter(
#         ! campaign_id %in% c() &
#         lead.date < '2020-03-02' %>% as.Date()
#     ) %>% 
#     arrange(
#         desc(lead.date)
#     ) %>% 
#     ggplot(
#         mapping = aes(
#             x = lead.date,
# #             y = mix,
#             y = dd,
#             color = campaign_id
#         )
#     ) + 
#     geom_line() +
#     geom_point() +
#     theme_minimal()

# top.campaigns

In [None]:
####  DD Chart  ####

# df.train %>%
#     mutate(
#         lead.date = lead_time %>% cut("week") %>% as.Date()
#     ) %>% 
#     group_by(
#         lead.date
#     ) %>% 
#     mutate(
#         date.total = n()
#     ) %>% 
#     ungroup() %>% 
#     group_by(
#         lead.date,
#         payrolltype
#     ) %>% 
#     summarize(
#         mix = n()/min(date.total)
#     ) %>% 
#     ungroup() %>% 
#     filter(
# #         ! campaign_id %in% c() &
# #         lead.date < '2020-03-02' %>% as.Date()
#     ) %>% 
#     arrange(
#         desc(lead.date)
#     ) %>% 
#     ggplot(
#         mapping = aes(
#             x = lead.date,
#             y = mix,
#             color = payrolltype
#         )
#     ) + 
#     geom_line() +
#     geom_point() +
#     theme_minimal()

# df.train %>%
#     mutate(
#         lead.date = lead_time %>% cut("week") %>% as.Date()
#     ) %>% 
#     group_by(
#         lead.date
#     ) %>% 
#     mutate(
#         date.total = n()
#     ) %>% 
#     ungroup() %>% 
#     group_by(
#         lead.date,
#         payrolltype
#     ) %>% 
#     summarize(
#         volume = n(),
#         mix = n()/min(date.total),
#         fr = sum(funded)/sum(qualified)
#     ) %>% 
#     ungroup() %>% 
#     filter(
# #         ! campaign_id %in% c() &
# #         lead.date < '2020-03-02' %>% as.Date()
#     ) %>% 
#     arrange(
#         desc(lead.date)
#     )