# Import Packages

In [None]:
# Set the `java.parameters` option to "-Xmx8048m" (the JVM flag for a maximum
# heap of 8048 MB) before any package is attached.
# NOTE(review): presumably read by a Java-backed database driver loaded via
# opploansanalytics; this only takes effect if it runs before the JVM starts - confirm.
options(java.parameters = "-Xmx8048m")

In [None]:
library(devtools)
# install_github(
#     'jasonchang2018/opploansanalytics',
#     auth_token = Sys.getenv('GITHUB_PAT_OPPLOANSANALYTICS')
# )



library(opploansanalytics)
load.packages()

library(mlr)
library(pdp)
library(vip)
library(reshape2)
library(furrr)

library(rpart)   ##  decision trees
library(rattle)  ##  fancyrpartplot

# Read Raw Data
#### Acquire each admethod's data and attach to a list.

In [None]:
##  getDF: build (write = TRUE) or re-load (write = FALSE) the lead-level data set.
##
##  Arguments
##    timestart, timeend : pasted into the SQL as
##                         lead_time >= '<timestart>' and lead_time < '<timeend>';
##                         timestart is also pasted into the output file name.
##    limit              : when not NA, "limit <limit>" is appended to the query.
##    write              : TRUE  -> run the query, join loan performance, write a CSV.
##                         FALSE -> read "..\\data-v2\\clarity-df.csv" and return it.
##    admethod           : advertising-method name matched against c_adm.name in the
##                         query and used (lower-cased, whitespace removed) in the
##                         output file name. It has no default.
##
##  Value
##    write = TRUE  : the pipeline ends in write.csv(), so the function returns
##                    whatever write.csv() returns rather than the data.
##    write = FALSE : the data frame produced by read_csv().
##
##  NOTE(review): admethod, timestart, timeend and limit are pasted into the SQL
##  text without quoting/escaping - only call with trusted values.
##  NOTE(review): the write = FALSE branch ignores timestart, timeend, limit and
##  admethod and always reads the same fixed file - confirm this is intended.
getDF = function (timestart = '2020-06-01', timeend = '2020-07-01', limit = NA, write = FALSE, admethod) {
    
    if (write) {
        ##  Everything up to the closing "))" below is one SQL statement assembled with
        ##  paste0() and executed by querySnowflake() (defined outside this file).
        querySnowflake(paste0(
            "
            select distinct

                ----  LDE  ----

                leads.lead_id
                , case  when leads.accepted = TRUE
                        and  leads.apiversion = '5' 
                        then offers.lead_id 
                        when leads.accepted = FALSE
                        and  leads.apiversion = '5'
                        then leads.lead_id
                        else NULL
                        end  as lead_id_to_visine
                , leads.lead_time
                , extract(day from leads.lead_time) as lead_day

                --  Outcome  --
                , leads.apiversion
                , case when leads.accepted = TRUE then 1 else 0 end as accepted
                , leads.reason
                , leads.code

                --  Attributes  --
                , leads.partnerid
                , c_adm.name as admethod
                , json_extract_path_text(leads.raw_lead, 'request.requestedLoanAmount') as requested_loan_amount
                , coalesce(
                    json_extract_path_text(leads.raw_lead, 'request.campaignID'),
                    json_extract_path_text(leads.raw_lead, 'request.campaignId'),
                    json_extract_path_text(leads.raw_lead, 'request.campaignid'),
                    json_extract_path_text(leads.raw_lead, 'request.campaign_id'),
                    json_extract_path_text(leads.raw_lead, 'request.click_id'),

                    json_extract_path_text(leads.raw_lead, 'campaignID'),
                    json_extract_path_text(leads.raw_lead, 'campaignId'),
                    json_extract_path_text(leads.raw_lead, 'campaignid'),
                    json_extract_path_text(leads.raw_lead, 'campaign_id'),
                    json_extract_path_text(leads.raw_lead, 'click_id')

                ) as campaign_id

                --  Income  --
                , leads.grossmonthlyincome
                , leads.incometype
                , leads.payrollfrequency
                , leads.payrolltype
                , leads.lastpayrolldate

                --  Identity  --
                , leads.\"dateofbirth::text\" as dateofbirth
                , floor((leads.lead_time::date - leads.\"dateofbirth::text\"::date)::numeric/365) as age
                , leads.statecode

                --  Employment  --
                , leads.work_hiredate

                --  Offer  --
                , leads.leadofferid
                , leads.offer_amount
                , leads.offer_interestrate
                , leads.offer_monthlypayment




                ----  CLARITY - LEAD ----
                , usages.id as usage_id_lead
                , usages.created_at as usage_time_lead

                , reports.id as cached_report_id_lead
                , reports.created_at as report_time_lead
                , reports.report_name as report_name_lead
                , json_extract_path_text(coalesce(leads.clarity_report, reports.body), 'xml_response.inquiry.products_executed') as products_executed_lead

                --  Inquiry --
                , json_extract_path_text(coalesce(leads.clarity_report, reports.body), 'xml_response.inquiry.ofac_score') as ofac_score
                , json_extract_path_text(coalesce(leads.clarity_report, reports.body), 'xml_response.inquiry.paycheck_direct_deposit') as paycheck_direct_deposit
                , json_extract_path_text(coalesce(leads.clarity_report, reports.body), 'xml_response.inquiry.ssn_distinct_first_last_name_count') as ssn_distinct_first_last_name_count

                --  CCR --
                , json_extract_path_text(coalesce(leads.clarity_report, reports.body), 'xml_response.clear_credit_risk.score') as ccr_score
                , json_extract_path_text(coalesce(leads.clarity_report, reports.body), 'xml_response.clear_credit_risk.number_of_loans') as ccr_number_of_loans
                , json_extract_path_text(coalesce(leads.clarity_report, reports.body), 'xml_response.clear_credit_risk.number_of_bank_accounts') as ccr_number_of_bank_accounts
                , json_extract_path_text(coalesce(leads.clarity_report, reports.body), 'xml_response.clear_credit_risk.highest_number_of_days_past_due') as ccr_highest_number_of_days_past_due
                , json_extract_path_text(coalesce(leads.clarity_report, reports.body), 'xml_response.clear_credit_risk.current_inquiry_cluster_position') as ccr_current_inquiry_cluster_position
                , json_extract_path_text(coalesce(leads.clarity_report, reports.body), 'xml_response.clear_credit_risk.days_since_last_loan_charged_off') as ccr_days_since_last_loan_charged_off
                , json_extract_path_text(coalesce(leads.clarity_report, reports.body), 'xml_response.clear_credit_risk.days_since_inquiry_previously_seen') as ccr_days_since_inquiry_previously_seen
                , json_extract_path_text(coalesce(leads.clarity_report, reports.body), 'xml_response.clear_credit_risk.number_of_employers_last_six_months') as ccr_number_of_employers_last_six_months
                , json_extract_path_text(coalesce(leads.clarity_report, reports.body), 'xml_response.clear_credit_risk.worst_payment_rating') as ccr_worst_payment_rating

                --  CRH --
                , json_extract_path_text(coalesce(leads.clarity_report, reports.body), 'xml_response.clear_recent_history.supplier_recent_history.summary_recent_history.loans_in_collections') as srh_loans_in_collections
                , json_extract_path_text(coalesce(leads.clarity_report, reports.body), 'xml_response.clear_recent_history.supplier_recent_history.summary_recent_history.spml_average_rollovers') as srh_spml_average_rollovers
                , json_extract_path_text(coalesce(leads.clarity_report, reports.body), 'xml_response.clear_recent_history.supplier_recent_history.summary_recent_history.amount_loans_charged_off') as srh_amount_loans_charged_off
                , json_extract_path_text(coalesce(leads.clarity_report, reports.body), 'xml_response.clear_recent_history.supplier_recent_history.summary_recent_history.online_loan_opened_in_the_last_year') as srh_online_loan_opened_in_the_last_year
                , json_extract_path_text(coalesce(leads.clarity_report, reports.body), 'xml_response.clear_recent_history.supplier_recent_history.summary_recent_history.online_loan_inquiry_in_the_last_thirty_days') as srh_online_loan_inquiry_in_the_last_thirty_days

                , json_extract_path_text(coalesce(leads.clarity_report, reports.body), 'xml_response.clear_recent_history.inquiry_cluster_recent_history.total_inquiry_clusters_recent_history.ninety_days_ago') as ticrh_ninety_days_ago
                , json_extract_path_text(coalesce(leads.clarity_report, reports.body), 'xml_response.clear_recent_history.inquiry_cluster_recent_history.total_inquiry_clusters_recent_history.twentyfour_hours_ago') as ticrh_twentyfour_hours_ago




                ----  CLOUDLENDING  ----

                , bridge.app_id as application_bridge
                , details.loanid
                , details.application
                , details.origination_id

                --  Attributes --
                , details.appldate
                , details.adgrp

                --  Decision --
                , details.denygrp
                , details.denial_reason

                --  Funnel  --
                , coalesce(details.newentered, 0) as newentered
                , coalesce(details.bizrulespassed, 0) as bizrulespassed
                , coalesce(details.qualified, 0) as qualified
                , coalesce(details.bankverified, 0) as bankverified
                , coalesce(details.passscorecardratecard, 0) as passscorecardratecard
                , coalesce(details.contractsigned, 0) as contractsigned
                , coalesce(details.cs_decisioned, 0) as cs_decisioned
                , coalesce(details.funded, 0) as funded
                , coalesce(details.funded_amount, 0) as funded_amount

                , case  when  details.bankverified_pl = 1
                        then  0.47 + 0.48 * coalesce(plaid_items.item_count, 1)
                        else  0
                        end   +  --as pl_costs
                  case  when  details.bankverified_dl = 1
                        then  1.05
                        when  details.attempt_dl = 1
                        and   details.bankverified_dl = 0
                        then  0.52
                        else  0
                        end   +  --as dl_costs
                  case  when  details.bankverified_mb = 1
                        then  1.40
                        else  0
                        end     --as mb_costs
                        as    ibv_costs




                ----  CLARITY  ----
                , coalesce(products_executed_total.clearrecenthistory, 0) as clear_recent_history_app
                , coalesce(products_executed_total.clearcreditrisk, 0) as clear_credit_risk_app
                , coalesce(products_executed_total.clearbankbehavior, 0) as clear_bank_behavior_app
                , coalesce(products_executed_total.clearadvancedattributes, 0) as clear_advanced_attributes_app
                , coalesce(products_executed_total.clearfraudinsights, 0) as clear_fraud_insights_app


            from
                loan_dw_prod.lead.leads_reporting as leads
                inner join
                    loan_dw_prod.cloudlending.advertising_method as c_adm
                    on leads.partnerid = c_adm.external_id
                    and c_adm.name = '", admethod, "'
                left join
                    loan_dw_prod.leadzeppelin.offers as offers
                    on leads.lead_id = offers.id

                left join
                    loan_dw_prod.visine.report_usages as usages
                    on case when leads.accepted = TRUE
                            and  leads.apiversion = '5' 
                            then offers.lead_id 
                            when leads.accepted = FALSE
                            and  leads.apiversion = '5'
                            then leads.lead_id
                            else NULL
                            end = usages.lead_id
                left join
                    loan_dw_prod.visine.cached_reports as reports
                    on usages.cached_report_id = reports.id
                    and reports.report_name ilike '%leads01%'


                left join
                    periscope_de.periscope_views.apps_leads_bridge as bridge
                    on leads.lead_id = bridge.lead_id
                left join
                    periscope_de.periscope_views.bizops_application_details as details
                    on bridge.app_id = details.application

                left join
                    periscope_de.periscope_views.bizops_clarity_products_executed as products_executed_total
                    on details.origination_id = products_executed_total.origination_id

                left join
                    (
                        select
                            loanid
                            , count(distinct account) as item_count
                        from
                            periscope_de.periscope_views.bizops_temp_plaid_account_list
                        group by
                            1
                        having
                            count(distinct account) > 1
                    ) as plaid_items
                    on details.loanid = plaid_items.loanid


            where
                leads.lead_time >= '", timestart, "'
                and leads.lead_time < '", timeend, "'
            ",
            ##  Optional row cap: contributes "limit <limit>" only when limit is not NA.
            if_else(
                !is.na(limit),
                paste0("limit ", limit),
                ""
            )
        )) %>% 
        ##  Lower-case every column name of the query result.
        rename_all(
            .funs = ~ .x %>% str_to_lower
        ) %>% 
        ##  Attach truefpd from the loan-performance extract, matched on loanid.
        ##  Warnings and messages from read_csv() are deliberately silenced.
        left_join(
            y = suppressWarnings({suppressMessages({
                    read_csv("..\\data-v2\\loan-performance-2020-08-01.csv")
                })}) %>% 
                select(
                    loanid,
                    truefpd
                ),
            by = 'loanid'
        ) %>% 
        ##  Persist as "..\\data-v2\\<admethod lower-cased, whitespace removed>-df-<timestart>.csv".
        write.csv(
            paste0("..\\data-v2\\", admethod %>% str_to_lower() %>% str_remove_all("\\s"), "-df-", timestart, ".csv"),
            row.names = FALSE
        )
    } else {
        ##  No query: read a fixed CSV, silencing read_csv() warnings and messages.
        suppressWarnings({suppressMessages({
            read_csv("..\\data-v2\\clarity-df.csv")
        })})
    }
    
}

In [None]:
##  getAllDFs: run getDF(write = TRUE) once for every (month window) x (admethod) pair.
##
##  Takes no arguments; the month range and the admethod list are fixed below.
##  Returns the list produced by pmap(): one element per pull, holding either the
##  value of getDF() or the "try-error" object if that pull failed.
getAllDFs = function () {

    ##  Month boundaries: consecutive elements form one [start, end) window.
    month.vec = seq.Date(
        from = as.Date('2020-08-01'),
        to = as.Date('2020-09-01'),
        by = '1 month'
    )

    admethod = c('LenderEdge 4', 'Even Financial 4', 'Monevo', 'Quin Street 4', 'LeadGroup', 'LeapTheory 4')

    ##  The bounds are stored as ISO "YYYY-MM-DD" strings, not Date objects. They are
    ##  pasted into SQL text and a file name by getDF(), and iterating element-wise
    ##  over a Date vector is not guaranteed to keep the Date class, which would paste
    ##  the underlying day count instead of the date.
    date.windows = data.frame(
        start = as.character(month.vec[-length(month.vec)]),
        end = as.character(month.vec[-1]),
        stringsAsFactors = FALSE
    )

    ##  All combinations of (date window) x admethod; by = NULL makes the cross join explicit.
    all.pulls = merge(
        x = date.windows,
        y = data.frame(
            admethod = admethod,
            stringsAsFactors = FALSE
        ),
        by = NULL
    )

    ##  Run each pull inside try() so that one failing query does not abort the
    ##  remaining pulls; failures stay visible as "try-error" elements of the result.
    pmap(
        .l = list(all.pulls$start, all.pulls$end, all.pulls$admethod),
        .f = function (start, end, admethod) {
            try({
                getDF(
                    timestart = start,
                    timeend = end,
                    admethod = admethod,
                    write = TRUE
                )
            })
        }
    )

}

In [None]:
##  getBindedDFs: return a named list with one data frame per advertising method.
##
##  write = TRUE  : for each admethod, read every "<key>-df-YYYY-MM-DD.csv" file in
##                  ..\data-v2, row-bind them, write the result to "<key>.csv" and
##                  return the bound data frame.
##  write = FALSE : read the previously written "<key>.csv" for each admethod.
##
##  <key> is the admethod lower-cased with whitespace removed; the list names are
##  the keys with digits removed as well.
getBindedDFs = function (write = FALSE) {

    data.dir = "..\\data-v2\\"
    data.files = list.files("..\\data-v2")

    partner.labels = c(
        'LenderEdge 4',
        'Even Financial 4',
        'Monevo',
        'Quin Street 4',
        'LeadGroup',
        'LeapTheory 4'
    )
    admethod = str_remove_all(str_to_lower(partner.labels), '\\s')

    ##  read_csv() with its warnings and messages silenced.
    readQuietly = function (path) {
        suppressWarnings(suppressMessages(read_csv(path)))
    }

    ##  Stack all dated pulls for one key, save the stacked table, and hand it back.
    bindAndSave = function (key) {
        is.pull = str_detect(
            data.files,
            paste0(key, '\\-df\\-\\d{4}\\-\\d{2}\\-\\d{2}\\.csv$')
        )
        ##  str_c() returns a zero-length result when no file matched.
        pull.paths = str_c(data.dir, data.files[is.pull])

        bound = do.call(what = rbind, args = lapply(pull.paths, FUN = readQuietly))
        write.csv(
            bound,
            paste0(data.dir, key, ".csv"),
            row.names = FALSE
        )
        bound
    }

    ##  Re-load the table written by an earlier write = TRUE run.
    loadSaved = function (key) {
        readQuietly(paste0(data.dir, key, ".csv"))
    }

    per.partner = if (write) {
        map(admethod, .f = bindAndSave) ## 115s
    } else {
        map(admethod, .f = loadSaved)
    }

    setNames(per.partner, str_remove_all(admethod, '\\d'))

}

In [None]:
##  appendCutV1: add a column `is.cut.v1` to every data frame in `df.list`.
##
##  df.list : named list of data frames; the names select which v1 rule applies.
##  Returns a list with the same names, each element carrying the extra column.
appendCutV1 = function (df.list) {

    ##  Rule shared by most partners: both Clarity fields are missing.
    ##  Further criteria once considered for lenderedge (score / bank accounts /
    ##  campaign / days past due) and monevo (number of loans / requested amount)
    ##  are disabled and therefore not part of the rules below.
    no.clarity.hit = expr(
        is.na(ccr_score) &
        is.na(ccr_number_of_bank_accounts)
    )

    ##  One unevaluated rule per list name; leaptheory never cuts.
    v1.rules = list(
        lenderedge = no.clarity.hit,
        evenfinancial = no.clarity.hit,
        monevo = no.clarity.hit,
        quinstreet = no.clarity.hit,
        leadgroup = no.clarity.hit,
        leaptheory = expr(FALSE)
    )

    ##  Evaluate each partner's rule against that partner's own columns.
    partners = names(df.list)
    flagged = map(
        partners,
        .f = function (partner) {
            mutate(
                df.list[[partner]],
                is.cut.v1 = eval(v1.rules[[partner]])
            )
        }
    )

    setNames(flagged, nm = partners)

}

# Preliminary data cleaning.

In [None]:
##  cleanColumnValues: normalise placeholder values and extract the true campaign id.
##
##  df : data frame with at least the columns `admethod` and `campaign_id`.
##  Returns a data frame with the same rows and columns. With mixed column types
##  every column comes back as character, because the data passes through a matrix.
##
##    * Cells equal to '{}' become '-1', so they stay distinguishable from NA.
##    * campaign_id is reduced to the classifying part, per admethod (see below).
##      Rows whose admethod is not listed get NA.
cleanColumnValues = function (df) {

    ##  Work on the matrix form directly instead of apply(df, 2, ...). The coercion is
    ##  the same one apply() performs, but the row dimension is kept, so a data frame
    ##  with a single row (or none) no longer collapses into a plain vector.
    cell.matrix = as.matrix(df)

    ##  which() drops NA comparisons, leaving missing cells untouched. The guard
    ##  avoids coercing an all-numeric matrix to character when nothing matches.
    bracket.cells = which(cell.matrix == '{}')
    if (length(bracket.cells) > 0) {
        cell.matrix[bracket.cells] = '-1'
    }

    cell.matrix %>%
        as.data.frame(
            stringsAsFactors = FALSE
        ) %>%
        mutate(
            ##  Specifies any pattern to the admethod campaign from which to extract the true classifier.
            campaign_id = case_when(
                ##  Drop everything from the first '::' onwards.
                admethod == 'Even Financial 4' ~ str_remove_all(campaign_id, '::.*$'),
                ##  Keep the six digits found between two hyphens (NA when absent).
                admethod == 'LeadGroup' ~ str_match(campaign_id, '\\-(\\d{6})\\-')[, 2],
                ##  Drop everything from the first '-' onwards.
                admethod == 'LeapTheory 4' ~ str_remove_all(campaign_id, '-.*$'),
                admethod %in% c('LenderEdge 4', 'Monevo', 'Quin Street 4') ~ campaign_id
            )
        )

}

In [None]:
##  expandColumnValues: derive one flag per Clarity product from the
##  `products_executed_lead` text column.
##
##  Each new *_lead column is the result of str_detect() for that product's name,
##  so it is TRUE/FALSE, or NA where products_executed_lead is NA.
##  The application-level (*_app) flags are not derived here.
expandColumnValues = function (df) {

    mutate(
        df,
        clear_bank_behavior_lead = str_detect(products_executed_lead, 'Clear Bank Behavior'),
        clear_advanced_attributes_lead = str_detect(products_executed_lead, 'Clear Advanced Attributes'),
        clear_fraud_insights_lead = str_detect(products_executed_lead, 'Clear Fraud Insight'),
        clear_recent_history_lead = str_detect(products_executed_lead, 'Clear Recent History'),
        clear_credit_risk_lead = str_detect(products_executed_lead, 'Clear Credit Risk')
    )

}

In [None]:
##  standardizeColumnValues: placeholder step of the pre-processing chain.
##  It currently returns `df` exactly as received.
standardizeColumnValues = function (df) {

    identity(df)
}

In [None]:
##  convertColumnTypes: cast every known column to its working type.
##
##  df must contain every column named below (mutate_at() is given the names
##  directly). Conversions run in a fixed order because some columns are listed
##  twice on purpose:
##    1. char        -> as.character
##    2. logical     -> trimmed text to logical
##    3. logical.num -> numeric -> logical -> trimmed text ("TRUE"/"FALSE")
##    4. factor      -> upper-cased text to factor (includes the logical.num flags)
##    5. date        -> as.Date
##    6. num         -> as.numeric (includes the two srh_online_* logicals from step 2)
convertColumnTypes = function (df) {

    column.types = list(

        char = c(
            'lead_id',
            'lead_id_to_visine',
            'apiversion',
            'admethod',
            'reason',
            'leadofferid',
            'usage_id_lead',
            'cached_report_id_lead',
            'report_name_lead',
            'application_bridge',
            'loanid',
            'application',
            'origination_id',
            'adgrp',
            'denygrp',
            'denial_reason'
        ),

        factor = c(
            'campaign_id',
            'incometype',
            'payrollfrequency',
            'payrolltype',
            'statecode',
            'paycheck_direct_deposit',
            'ccr_worst_payment_rating',
            'accepted',
            'newentered',
            'bizrulespassed',
            'qualified',
            'bankverified',
            'passscorecardratecard',
            'contractsigned',
            'cs_decisioned',
            'funded',
            'clear_bank_behavior_lead',
            'clear_advanced_attributes_lead',
            'clear_fraud_insights_lead',
            'clear_recent_history_lead',
            'clear_credit_risk_lead',
            'clear_bank_behavior_app',
            'clear_advanced_attributes_app',
            'clear_fraud_insights_app',
            'clear_recent_history_app',
            'clear_credit_risk_app'
        ),

        ##  usage_time_lead and report_time_lead (epochs) and appldate are
        ##  intentionally left out of the date conversion.
        date = c(
            'lead_time',
            'lastpayrolldate',
            'dateofbirth',
            'work_hiredate'
        ),

        num = c(
            'lead_day',
            'code',
            'partnerid',
            'requested_loan_amount',
            'grossmonthlyincome',
            'age',
            'ofac_score',
            'ssn_distinct_first_last_name_count',
            'ccr_score',
            'ccr_number_of_loans',
            'ccr_number_of_bank_accounts',
            'ccr_highest_number_of_days_past_due',
            'ccr_current_inquiry_cluster_position',
            'ccr_days_since_last_loan_charged_off',
            'ccr_days_since_inquiry_previously_seen',
            'ccr_number_of_employers_last_six_months',
            'srh_loans_in_collections',
            'srh_spml_average_rollovers',
            'srh_amount_loans_charged_off',
            'srh_online_loan_opened_in_the_last_year',
            'srh_online_loan_inquiry_in_the_last_thirty_days',
            'ticrh_ninety_days_ago',
            'ticrh_twentyfour_hours_ago',
            'funded_amount',
            'ibv_costs'
        ),

        ##  The two srh_online_* columns are also in `num`: str -> logical -> num.
        logical = c(
            'srh_online_loan_opened_in_the_last_year',
            'srh_online_loan_inquiry_in_the_last_thirty_days',
            'is.cut.v1'
        ),

        logical.num = c(
            'accepted',
            'newentered',
            'bizrulespassed',
            'qualified',
            'bankverified',
            'passscorecardratecard',
            'contractsigned',
            'cs_decisioned',
            'funded',
            'clear_bank_behavior_lead',
            'clear_advanced_attributes_lead',
            'clear_fraud_insights_lead',
            'clear_recent_history_lead',
            'clear_credit_risk_lead',
            'clear_bank_behavior_app',
            'clear_advanced_attributes_app',
            'clear_fraud_insights_app',
            'clear_recent_history_app',
            'clear_credit_risk_app'
        )

    )

    ##  Per-type converters, named after the step they implement.
    textToLogical = function (x) {
        as.logical(str_trim(x, side = 'both'))
    }
    flagToText = function (x) {
        str_trim(as.character(as.logical(as.numeric(x))), side = 'both')
    }
    textToFactor = function (x) {
        as.factor(str_to_upper(x))
    }

    df %>%
        mutate_at(.vars = column.types$char, .funs = as.character) %>%
        mutate_at(.vars = column.types$logical, .funs = textToLogical) %>%
        mutate_at(.vars = column.types$logical.num, .funs = flagToText) %>%
        mutate_at(.vars = column.types$factor, .funs = textToFactor) %>%
        mutate_at(.vars = column.types$date, .funs = as.Date) %>%
        mutate_at(.vars = column.types$num, .funs = as.numeric)

}

In [None]:
##  imputeMissingValues: replace NA with a sentinel so missingness is an explicit value.
##
##    numeric columns   -> -10000
##    character columns -> '<blank>'
##  Columns of any other type (e.g. factors, dates) are left as they are.
imputeMissingValues = function (df) {

    fillNumeric = function (column) {
        replace_na(column, replace = -10000)
    }
    fillText = function (column) {
        replace_na(column, replace = '<blank>')
    }

    numerics.filled = mutate_if(df, .predicate = is.numeric, .funs = fillNumeric)

    mutate_if(numerics.filled, .predicate = is.character, .funs = fillText)

}

In [None]:
##  reduceFactorLevels: keep the 20 most frequent campaign_id values and pool the
##  remainder into the single level 'Else'. campaign_id is returned as a factor.
reduceFactorLevels = function (df) {

    ##  Row count per campaign, most frequent first.
    campaign.counts = summarize(group_by(df, campaign_id), n = n())
    ranked.campaigns = arrange(campaign.counts, desc(n))

    ##  The first 20 rows (fewer if there are not that many campaigns).
    top.levels = slice(ranked.campaigns, 1:20)$campaign_id

    mutate(
        df,
        campaign_id = as.factor(
            if_else(
                campaign_id %in% top.levels,
                as.character(campaign_id),
                'Else'
            )
        )
    )

} ##should move this to PreModel

In [None]:
##  completePreProcess: run the full cleaning chain for one advertising method.
##
##  df.list   : named list of data frames, keyed by the short admethod name.
##  admethod  : advertising-method label, e.g. 'LenderEdge 4'. It is lower-cased and
##              stripped of digits and whitespace to form the key into df.list.
##  timestart,
##  timeend   : optional bounds; when both are given only rows with
##              timestart <= lead_time < timeend are processed.
##
##  Returns the data frame after cleanColumnValues, expandColumnValues,
##  standardizeColumnValues, convertColumnTypes, imputeMissingValues and
##  reduceFactorLevels have been applied in that order.
##  Stops with an error if df.list has no element for the admethod.
completePreProcess = function (df.list, admethod, timestart = NA, timeend = NA) {

    admethod.short = admethod %>%
        str_to_lower() %>%
        str_remove_all(
            '\\d'
        ) %>%
        str_remove_all(
            '\\s'
        )

    partner.df = df.list[[admethod.short]]

    if (is.null(partner.df)) {
        stop(
            "No data frame named '", admethod.short, "' in df.list.",
            call. = FALSE
        )
    }

    ##  Filter by date if specified. This is a plain if-statement rather than a
    ##  `{ ... return(.) }()` pipe step, because return() inside a pipe expression
    ##  is an error from magrittr 2.0 onwards.
    if (!is.na(timestart) && !is.na(timeend)) {
        partner.df = partner.df %>%
            filter(
                lead_time >= timestart &
                lead_time <  timeend
            )
    }

    partner.df %>%
        cleanColumnValues() %>%
        expandColumnValues() %>%
        standardizeColumnValues() %>%
        convertColumnTypes() %>%
        imputeMissingValues() %>%
        reduceFactorLevels()

}

# Filter for rows and columns to be directly inserted into model.

- 0 = accepted
- 1 = newentered
- 2 = bizrulespassed
- 3 = qualified
- 4 = bankverified
- 5 = passscorecardratecard
- 6 = contractsigned
- 7 = cs_decisioned
- 8 = funded

In [None]:
##  columnsToKeep: names of the columns retained for modelling, as one character
##  vector in a fixed order.
##  incometype, payrollfrequency, payrolltype and age are deliberately excluded.
##  NOTE(review): the clear_* names at the end carry no _lead/_app suffix, unlike
##  the clear_* columns created elsewhere in this file - confirm they exist.
columnsToKeep = function () {

    categorical.inputs = c(
        'campaign_id',
        'paycheck_direct_deposit',
        'ccr_worst_payment_rating'
    )

    numeric.inputs = c(
        'lead_day',
        'requested_loan_amount',
        'grossmonthlyincome',
        'ofac_score',
        'ssn_distinct_first_last_name_count',
        'ccr_score',
        'ccr_number_of_loans',
        'ccr_number_of_bank_accounts',
        'ccr_highest_number_of_days_past_due',
        'ccr_current_inquiry_cluster_position',
        'ccr_days_since_last_loan_charged_off',
        'ccr_days_since_inquiry_previously_seen',
        'ccr_number_of_employers_last_six_months',
        'srh_loans_in_collections',
        'srh_spml_average_rollovers',
        'srh_amount_loans_charged_off',
        'srh_online_loan_opened_in_the_last_year',
        'srh_online_loan_inquiry_in_the_last_thirty_days',
        'ticrh_ninety_days_ago',
        'ticrh_twentyfour_hours_ago'
    )

    funnel.flags = c(
        'accepted',
        'newentered',
        'bizrulespassed',
        'qualified',
        'bankverified',
        'passscorecardratecard',
        'contractsigned',
        'cs_decisioned',
        'funded'
    )

    clarity.flags = c(
        'clear_bank_behavior',
        'clear_advanced_attributed',
        'clear_fraud_insight',
        'clear_recent_history',
        'clear_credit_risk',
        'clear_fraud'
    )

    c(categorical.inputs, numeric.inputs, funnel.flags, clarity.flags)
}

In [None]:
##  filterModelRows: keep only the rows that should enter the model.
##
##  df               : pre-processed data frame with columns is.cut.v1, campaign_id
##                     and lead_time.
##  denominator.quo  : quosure naming a flag column, or NA. When given, only rows
##                     where that column equals 'TRUE' are kept.
##  campaign_id_keep : vector of campaign ids to keep, or NA for no campaign filter.
##  timestart,
##  timeend          : bounds, or NA. When both are given, only rows with
##                     timestart <= lead_time < timeend are kept.
##
##  Rows flagged by is.cut.v1 are always removed first.
filterModelRows = function (df, denominator.quo, campaign_id_keep, timestart, timeend) {

    ##  An argument counts as "not specified" when it is NULL or a single NA. Testing
    ##  it this way avoids calling is.na() on a quosure, and avoids a length > 1
    ##  condition in if() when several campaign ids are supplied.
    isUnset = function (arg) {
        is.null(arg) || (is.atomic(arg) && length(arg) == 1 && is.na(arg))
    }

    kept = df %>%
        filter(
            !is.cut.v1
        )

    ##  The optional filters are plain if-statements: the previous
    ##  `{ ... return(.) }()` pipe steps fail under magrittr >= 2.0, where return()
    ##  inside a pipe expression is an error.

    ##  Filter by denominator if specified
    if (!isUnset(denominator.quo)) {
        kept = kept %>% filter((!!denominator.quo) == 'TRUE')
    }

    ##  Filter by campaign if specified
    if (!isUnset(campaign_id_keep)) {
        kept = kept %>% filter(campaign_id %in% campaign_id_keep)
    }

    ##  Filter by date if specified
    if (!isUnset(timestart) && !isUnset(timeend)) {
        kept = kept %>%
            filter(
                lead_time >= timestart &
                lead_time <  timeend
            )
    }

    kept

}

In [None]:
##  filterModelColumns: reduce `df` to the model columns, in the order listed.
##  The list is local to this function; `age` is deliberately not part of it.
filterModelColumns = function (df) {

    model.columns = c(
        ##  categorical inputs
        'campaign_id',
        'incometype',
        'payrollfrequency',
        'payrolltype',
        'paycheck_direct_deposit',
        'ccr_worst_payment_rating',
        ##  numeric inputs
        'lead_day',
        'requested_loan_amount',
        'grossmonthlyincome',
        'ofac_score',
        'ssn_distinct_first_last_name_count',
        'ccr_score',
        'ccr_number_of_loans',
        'ccr_number_of_bank_accounts',
        'ccr_highest_number_of_days_past_due',
        'ccr_current_inquiry_cluster_position',
        'ccr_days_since_last_loan_charged_off',
        'ccr_days_since_inquiry_previously_seen',
        'ccr_number_of_employers_last_six_months',
        'srh_loans_in_collections',
        'srh_spml_average_rollovers',
        'srh_amount_loans_charged_off',
        'srh_online_loan_opened_in_the_last_year',
        'srh_online_loan_inquiry_in_the_last_thirty_days',
        'ticrh_ninety_days_ago',
        'ticrh_twentyfour_hours_ago',
        ##  funnel flags
        'accepted',
        'newentered',
        'bizrulespassed',
        'qualified',
        'bankverified',
        'passscorecardratecard',
        'contractsigned',
        'cs_decisioned',
        'funded'
    )

    select_at(df, .vars = model.columns)

}

In [None]:
##  splitSets: randomly partition the rows of `df` into train / test / validate.
##
##  df         : data frame to split.
##  train.perc : fraction of rows for the training set (e.g. 0.8).
##  seed.num   : passed to set.seed() so the split is reproducible.
##
##  Returns a list with
##    original : df unchanged
##    train    : floor(train.perc * nrow(df)) rows
##    test     : ceiling((1 - train.perc) / 2 * nrow(df)) of the remaining rows
##    validate : every row in neither train nor test
##  The three subsets never share a row.
splitSets = function (df, train.perc, seed.num) {

    set.seed(seed.num)

    n.rows = nrow(df)
    all.rows = seq_len(n.rows)  ##  safe for 0 rows, unlike 1:nrow(df)

    ##  Train = train.perc of all rows.
    train.rows = all.rows[sample.int(
        n.rows,
        size = floor(train.perc * n.rows)
    )]

    ##  Test = half of the remainder (rounded up), drawn from the rows not in train.
    ##  Positions are drawn with sample.int() and then mapped back to row numbers.
    ##  sample(x = held.out) would treat a single remaining row number k as "sample
    ##  from 1:k" and could hand back a training row.
    held.out = setdiff(all.rows, train.rows)
    test.rows = held.out[sample.int(
        length(held.out),
        size = ceiling((1 - train.perc) / 2 * n.rows)
    )]

    ##  Validate = whatever is left.
    validate.rows = setdiff(all.rows, c(train.rows, test.rows))

    ##  Generate training/test/validate sets.
    list(
        original = df,
        train = slice(df, train.rows),
        test = slice(df, test.rows),
        validate = slice(df, validate.rows)
    )

}

In [None]:
removeUntrainedFactors = function (df.sets, numerator.quo) {
    
    ##  Extend df.sets with two elements, test.factored and validate.factored:
    ##  copies of df.sets$test / df.sets$validate restricted to rows whose
    ##  categorical values all occur somewhere in df.sets$train, and reduced to
    ##  the model columns plus the response named by numerator.quo.
    
    ##  Columns carried into the *.factored sets. incometype, payrollfrequency,
    ##  payrolltype and age are currently not carried forward (the first three
    ##  are still used for the row filter below).
    model.columns = c(
        'campaign_id',
        'paycheck_direct_deposit',
        'ccr_worst_payment_rating',
        'lead_day',
        'requested_loan_amount',
        'grossmonthlyincome',
        'ofac_score',
        'ssn_distinct_first_last_name_count',
        'ccr_score',
        'ccr_number_of_loans',
        'ccr_number_of_bank_accounts',
        'ccr_highest_number_of_days_past_due',
        'ccr_current_inquiry_cluster_position',
        'ccr_days_since_last_loan_charged_off',
        'ccr_days_since_inquiry_previously_seen',
        'ccr_number_of_employers_last_six_months',
        'srh_loans_in_collections',
        'srh_spml_average_rollovers',
        'srh_amount_loans_charged_off',
        'srh_online_loan_opened_in_the_last_year',
        'srh_online_loan_inquiry_in_the_last_thirty_days',
        'ticrh_ninety_days_ago',
        'ticrh_twentyfour_hours_ago',
        quo_name(numerator.quo)
    )
    
    ##  Rows of df.sets[[set.name]] whose categorical values were all seen in train.
    restrictToTrainedLevels = function (set.name) {
        
        kept.rows = filter(
            df.sets[[set.name]],
            incometype %in% df.sets$train$incometype,
            campaign_id %in% df.sets$train$campaign_id,
            payrollfrequency %in% df.sets$train$payrollfrequency,
            payrolltype %in% df.sets$train$payrolltype,
            paycheck_direct_deposit %in% df.sets$train$paycheck_direct_deposit,
            ccr_worst_payment_rating %in% df.sets$train$ccr_worst_payment_rating
        )
        
        select_at(kept.rows, .vars = model.columns)
        
    }
    
    c(
        df.sets,
        list(test.factored = restrictToTrainedLevels('test')),
        list(validate.factored = restrictToTrainedLevels('validate'))
    )
    
}

In [None]:
balanceResponseWeight = function (df.sets, numerator.quo) {
    
    ##  Append train.bal to df.sets: a copy of df.sets$train in which the rarer
    ##  response level is repeated a whole number of times so that both levels
    ##  end up with a similar row count, reduced to the model columns plus the
    ##  response named by numerator.quo.
    ##
    ##  df.sets        list of data frames; only df.sets$train is read.
    ##  numerator.quo  quoted response column; rows are matched against the
    ##                 values 'TRUE' and 'FALSE'.
    ##
    ##  Returns df.sets with the extra element train.bal. An empty train set, or
    ##  one with no 'TRUE'/'FALSE' responses at all, yields an empty train.bal.
    
    ##  incometype, payrollfrequency, payrolltype and age are currently not kept.
    model.columns = c(
        'campaign_id',
        'paycheck_direct_deposit',
        'ccr_worst_payment_rating',
        'lead_day',
        'requested_loan_amount',
        'grossmonthlyincome',
        'ofac_score',
        'ssn_distinct_first_last_name_count',
        'ccr_score',
        'ccr_number_of_loans',
        'ccr_number_of_bank_accounts',
        'ccr_highest_number_of_days_past_due',
        'ccr_current_inquiry_cluster_position',
        'ccr_days_since_last_loan_charged_off',
        'ccr_days_since_inquiry_previously_seen',
        'ccr_number_of_employers_last_six_months',
        'srh_loans_in_collections',
        'srh_spml_average_rollovers',
        'srh_amount_loans_charged_off',
        'srh_online_loan_opened_in_the_last_year',
        'srh_online_loan_inquiry_in_the_last_thirty_days',
        'ticrh_ninety_days_ago',
        'ticrh_twentyfour_hours_ago',
        quo_name(numerator.quo)
    )
    
    ##  How many times do we need each response level?
    ##  A level with 0 obs gets a factor of 1: this avoids dividing by zero,
    ##  including the 0/0 case, where max(NaN, 1) would be NaN and replicate()
    ##  would fail on it.
    replicationFactor = function (other.count, own.count) {
        if (own.count == 0) {
            return(1)
        }
        max(floor(other.count / own.count), 1)
    }
    
    ##  Stack n copies of df on top of each other.
    stackCopies = function (df, n) {
        do.call(  ##  Returns DF
            what = rbind,
            args = replicate(  ##  Returns list of length n of DFs
                n = n,
                expr = {rbind(data.frame(), df)},
                simplify = FALSE
            )
        )
    }
    
    ##  Fallback for an empty train set: the stacking below would error on nrow(x) == 0.
    train.bal = df.sets$train
    
    if (nrow(df.sets$train) > 0) {
        
        positive.response = df.sets$train %>% filter((!!numerator.quo) == 'TRUE')
        negative.response = df.sets$train %>% filter((!!numerator.quo) == 'FALSE')
        
        positive.count = nrow(positive.response)
        negative.count = nrow(negative.response)
        
        train.bal = rbind(
            stackCopies(
                positive.response,
                replicationFactor(other.count = negative.count, own.count = positive.count)
            ),
            stackCopies(
                negative.response,
                replicationFactor(other.count = positive.count, own.count = negative.count)
            )
        )
        
    }
    
    df.sets %>% 
        append(
            values = list(
                train.bal = train.bal %>% select_at(.vars = model.columns)
            )
        )
    
}

In [None]:
completePreModel = function (
    preprocessed.df, denominator.quo, numerator.quo,
    campaign_id_keep = NA,
    timestart = NA, timeend = NA,
    train.perc = 0.8, seed.num = 1
) {
    
    ##  Run the full pre-modelling pipeline on a preprocessed lead table:
    ##  row filter -> column filter -> train/test/validate split ->
    ##  drop rows with categorical values unseen in train -> balance the response.
    ##  Returns the list of data frames produced by balanceResponseWeight().
    
    rows.kept = filterModelRows(
        preprocessed.df,
        denominator.quo = denominator.quo,
        campaign_id_keep = campaign_id_keep,
        timestart = timestart,
        timeend = timeend
    )
    
    columns.kept = filterModelColumns(rows.kept)
    
    split.sets = splitSets(
        columns.kept,
        train.perc = train.perc,
        seed.num = seed.num
    )
    
    factored.sets = removeUntrainedFactors(
        split.sets,
        numerator.quo = numerator.quo
    )
    
    balanceResponseWeight(
        factored.sets,
        numerator.quo = numerator.quo
    )
    
}

# Generate Model

In [None]:
getDecisionTree = function (df.sets, numerator.quo, xval = 0, minsplit = 1000, minbucket = 1000, cp = 0.001) {
 
    ##  Fit an rpart classification tree (through mlr) on df.sets$train.bal and
    ##  draw it with fancyRpartPlot() into a temporary PDF file.
    ##
    ##  df.sets        list of data frames; only df.sets$train.bal is read.
    ##  numerator.quo  quoted response column; its name is the task target and
    ##                 'TRUE' is the positive class.
    ##  xval, minsplit, minbucket, cp
    ##                 forwarded to the 'classif.rpart' learner (see createLearner).
    ##
    ##  Prints the path of the PDF with cat(); the function's value is cat()'s NULL.

    ####  Setup  ####
    
    ##  Exploration helper (not called below): lists mlr's tree-based classifiers.
    getTreeFunction = function () {

        suppressWarnings({
            listLearners() %>%
                filter(
                    type == 'classif' &
                    name %>% str_detect('[Tt]ree')
                )
        })
#         getParamSet('classif.rpart')
    }
    createTask = function () {
       
        makeClassifTask(
            id = 'rpart.task',
            data = df.sets$train.bal,
            target = quo_name(numerator.quo),
            positive = 'TRUE'
        )
        
    }
    createLearner = function () {

        makeLearner(
            cl = 'classif.rpart',
            id = 'rpart.learner',
            xval = xval,                     ##  Number of Cross Validations
            minsplit = minsplit,             ##  Number of Obs in Node for Split to be Attempted
            minbucket = minbucket,           ##  Minimum number of observations in Leaf Node
            cp = cp                          ##  Minimum information gain for split to execute.
#             loss = matrix(
#                 c(0,1,1.6,0),
#                 byrow = TRUE,
#                 nrow = 2
#             )
        )
        
    }

    
    
    ####  Execution  ####
    
    ##  Placeholder (not called below): the body is entirely commented out, so it returns NULL.
    executeHyperparameterTuning = function (task.obj, learner.obj) {
        
        ####  Resampling  ####

        # rpart.resample = makeResampleDesc('CV', iters = 5, stratify = TRUE)

        # rpart.cv = resample(
        #     learner = rpart.learner,
        #     task = rpart.task,
        #     resampling = rpart.resample,
        #     measures = list(acc, mmce, fpr),
        #     show.info = FALSE
        # )

        # # rpart.cv %>% .$measures.test
        # # rpart.cv %>% .$aggr %>% as.data.frame()



        ####  Hypertuning  ####

        # rpart.params = makeParamSet(
        #     makeIntegerParam('minsplit', lower = 10, upper = 50),
        #     makeIntegerParam('minbucket', lower = 5, upper = 50),
        #     makeNumericParam('cp', lower = 0.0001, upper = 0.2)
        # )

        # rpart.search = makeTuneControlGrid()

        # rpart.tune = tuneParams(
        #     learner = rpart.learner,
        #     task = rpart.task,
        #     resampling = rpart.resample,
        #     par.set = rpart.params,
        #     control = rpart.search,
        #     measures = list(mmce, acc, fpr),
        #     show.info = FALSE
        # )

        # setHyperPars(
        #     learner = rpart.learner,
        #     par.vals = rpart.tune$x
        # ) 
        
    }
    executeTrainingModel = function (task.obj, learner.obj) {
        
        train(
            learner = learner.obj,
            task = task.obj
        )
        
    }
    
    ##  Not called below; only referenced by the commented-out prediction lines.
    executePrediction = function (model.obj, df) {
        
        predict(
            object = model.obj,
            newdata = df
        )
        
    }
    
    task = createTask()
    learner = createLearner()
    
    model = executeTrainingModel(
        task.obj = task,
        learner.obj = learner
    )
    
#     test.prediction = model %>% executePrediction(df.sets$test.factored)
#     validate.prediction = model %>% executePrediction(df.sets$validate.factored)


    
    ####  Plot  ####
    
    tf = tempfile(fileext = ".pdf")
    pdf(tf)
    plot.device = dev.cur()
    
    ##  Close the PDF device even when plotting fails, so an error does not
    ##  leave a dangling graphics device (and a half-written file) behind.
    tryCatch(
        fancyRpartPlot(
            model$learner.model
        ),
        finally = dev.off(which = plot.device)
    )
    
    cat(tf)
   
}

In [None]:
# lenderedge %>% 
#     filter(
#         campaign_id %>% as.character() %>% str_trim(side = 'both') %in% c('1730')
#     ) %>% 
#     completePreModel(
#         denominator.quo = quo(accepted),
#         numerator.quo = quo(funded)
#     ) %>% 
#     getDecisionTree(
#         numerator.quo = quo(funded),
#         minbucket = 10,
#         cp = 0.005,
#     )

# Evaluate Model

In [None]:
# getExclusions = function () {
    
#     list(
#         lenderedge = expr(
#             ( ##  yes
#                 campaign_id %in% c('1710','1732','1747','1758','1781','1782','1784','1785','1793','1794','1795') &
#                 ccr_highest_number_of_days_past_due < 89 &
#                 ccr_score < 540  
#             )
#         ),
#         evenfinancial = expr(
#             (
#                 campaign_id %in% c('05778644')
#             ) |
#             (
#                 campaign_id %in% c('1CAA4F4A')
#             ) |
#             (
#                 campaign_id %in% c('30E8F0F4') &
#                 ccr_score < 548
#             )
#         ),
#         monevo = expr(
# #             (
# #                 campaign_id %in% c('711', '829', '835', '840', '864', '870', '883', '907', '919','1033','1039')
# #             ) |
# #             (
# #                 ccr_score < 523
# #             )
#             (
#                 campaign_id %in% c('829','835','842','845','864','870','883','1033')
#             ) |
#             (
#                 campaign_id %in% c('690','711','846','867','907','919','940','1007') &
#                 ccr_score < 552
#             )
#         ),
#         quinstreet = expr(
#             (
#                 campaign_id %in% c('105644644','110359244','111370544','111448744','111606844','112726844','113048944','113492144') &
#                 ccr_score < 533
#             ) |
#             (
#                 campaign_id %in% c('105644644','110359244','111370544','111448744','111606844','112726844','113048944','113492144') &
#                 ccr_score >= 550 &
#                 ccr_number_of_loans < 1.5 &
#                 ccr_number_of_employers_last_six_months < 0.5
#             )
#         ),
#         leadgroup = expr(
#             ( ##  yes
#                 campaign_id %in% c('220026','220223','220257','220392','550022','550061','550065','550067','550102','550108') &
#                 ccr_score < 542
# #             ) |
# #             (
# #                 campaign_id %in% c('220205','220378','550041','550053') &
# #                 ccr_worst_payment_rating %in% c('{}', '-1')
#             )
#         ),
#         leaptheory = expr(
#             (
#                 ccr_score != -10000 &
#                 ccr_score < 520
#             )
#         )
#     )
    
# } ##FINAL!

In [None]:
# getExclusions = function () {
    
#     list(
#         lenderedge = expr(
#             ( ##  yes
#                 campaign_id %in% c('1710','1732','1747','1758','1781','1782','1784','1785','1793','1794','1795') &
#                 ccr_highest_number_of_days_past_due < 89 &
# #                 ccr_score < 530  ## least aggressive
#                 ccr_score < 540  
# #                 ccr_score < 550  ## most aggressive
#             ) |
#             (
#                 ccr_score < 546 &
#                 ccr_number_of_bank_accounts >= 1.5
#             )
#         ),
#         evenfinancial = expr(
#             ( ##  yes
#                 campaign_id %in% c('05778644')
#             ) |
#             ( ##  yes
#                 campaign_id %in% c('1CAA4F4A')
#             ) |
#             ( ##  yes
#                 campaign_id %in% c('30E8F0F4') &
#                 ccr_score < 548
#             ) |
#             (
#                 campaign_id %in% c('1267B561','345CE360','5EEAB848','6161C955','A86ADF41','ACCD153F','B285B235','EDAA9844','EE6F467F','EFAB46A6','F821F4FA','FE13A9E9') &
#                 ccr_worst_payment_rating %in% c('{}', '-1')
#             )
#         ),
#         monevo = expr(
#             (
#                 campaign_id %in% c('711', '829', '835', '840', '864', '870', '883', '907', '919','1033','1039')
#             ) |
#             (
#                 ccr_score < 523
#             )
#         ),
#         quinstreet = expr(
#             ( ##  yes
#                 campaign_id %in% c('105644644','110359244','111370544','111448744','111606844','112726844','113048944','113492144') &
#                 ccr_score < 533 ## less aggressive
# #                 ccr_score < 540 ## more aggressive
#             ) |
#             ( ##  yes
#                 campaign_id %in% c('105644644','110359244','111370544','111448744','111606844','112726844','113048944','113492144') &
# #                 ccr_score >= 533 & ## more aggressive
#                 ccr_score >= 550 &
# #                 ccr_score >= 600 & ## less aggressive
#                 ccr_number_of_loans < 1.5 &
#                 ccr_number_of_employers_last_six_months < 0.5
# #             ) |
# #             ( ##  maybe
# #                 campaign_id %in% c('105644644','110359244','111370544','111448744','111606844','112726844','113048944','113492144') &
# # #                 ccr_score >= 533 & ## more aggressive
# #                 ccr_score >= 560 &
# # #                 ccr_score >= 600 & ## less aggressive
# #                 ccr_number_of_loans < 1.5 &
# #                 ccr_days_since_inquiry_previously_seen < 50
#             )
#         ),
#         leadgroup = expr(
#             ( ##  yes
#                 campaign_id %in% c('220026','220223','220257','220392','550022','550061','550065','550067','550102','550108') &
#                 ccr_score < 542 ## less aggressive
# #                 ccr_score < 550 ## more aggressive
# #             ) |
# #             ( ##  maybe
# #                 ccr_number_of_bank_accounts >= 2.5 &
# #                 ccr_score < 542
#             ) |
#             (
#                 campaign_id %in% c('220205','220378','550041','550053') &
#                 ccr_worst_payment_rating %in% c('{}', '-1')
#             ) |
#             (
#                 campaign_id %in% c('220130','220178','550002') &
#                 ccr_score < 552
#             )
#         ),
#         leaptheory = expr(
# #             (
# #                 ccr_score == -10000 ##aka is.na(ccr_score) pre-imputation
# #             ) |
#             (
#                 ccr_score != -10000 &
#                 ccr_score < 520 ## less aggressive
# #                 ccr_score < 525
# #                 ccr_score < 529 ## most aggressive
#             ) |
#             (
#                 ccr_score >= 529 &
#                 campaign_id %in% c('101366','102959','211205','321098','43583','52270','53376','71605','81279','90384','90448') &
#                 ccr_number_of_bank_accounts >= 2.5
#             )
#         )
#     )
    
# } ##experimental-aggressive

In [None]:
getExclusions = function () {
    
    ##  Returns a named list of unevaluated filter conditions built with expr(),
    ##  one entry per lead source (lenderedge, evenfinancial, monevo, quinstreet,
    ##  leadgroup, leaptheory -- presumably the admethod names; confirm).
    ##
    ##  Each condition refers to columns of a lead table (campaign_id, ccr_score,
    ##  ccr_highest_number_of_days_past_due) and is only evaluated later, e.g. as
    ##  the exclude.lde argument of tagExclusion(), where rows for which it is
    ##  TRUE are flagged in is.v2.cut.
    ##
    ##  Commented-out lines inside an expr() are alternative rules that are
    ##  currently switched off.
    
    list(
        lenderedge = expr(
            ( ##  yes
                campaign_id %in% c('1710','1732','1747','1758','1781','1782','1784','1785','1793','1794','1795') &
                ccr_highest_number_of_days_past_due < 89 &
                ccr_score < 540  
            )
        ),
        evenfinancial = expr(
            (
                campaign_id %in% c('05778644') &
                ccr_score < 532
            ) |
            (
                campaign_id %in% c('1CAA4F4A')
            ) |
            (
                campaign_id %in% c('30E8F0F4') &
                ccr_score < 548
            )
        ),
        monevo = expr(
#             (
#                 campaign_id %in% c('711', '829', '835', '840', '864', '870', '883', '907', '919','1033','1039')
#             ) |
#             (
#                 ccr_score < 523
#             )
            (
                campaign_id %in% c('829','835','842','845','864','870','883','1033')
            ) |
            (
                campaign_id %in% c('690','711','846','867','907','919','940','1007') &
                ccr_score < 552
            )
        ),
        quinstreet = expr(
            (
                campaign_id %in% c('105644644','110359244','111370544','111448744','111606844','112726844','113048944','113492144') &
                ccr_score < 533
#             ) |
#             (
#                 campaign_id %in% c('105644644','110359244','111370544','111448744','111606844','112726844','113048944','113492144') &
#                 ccr_score >= 550 &
#                 ccr_number_of_loans < 1.5 &
#                 ccr_number_of_employers_last_six_months < 0.5
            )
        ),
        leadgroup = expr(
            ( ##  yes
                campaign_id %in% c('220026','220223','220257','220392','550022','550061','550065','550067','550102','550108') &
                ccr_score < 542
#             ) |
#             (
#                 campaign_id %in% c('220205','220378','550041','550053') &
#                 ccr_worst_payment_rating %in% c('{}', '-1')
            )
        ),
        leaptheory = expr(
            (
                ##  NOTE(review): -10000 looks like the placeholder for a missing
                ##  ccr_score after imputation -- confirm against the preprocessing step.
                ccr_score != -10000 &
                ccr_score < 520
            )
        )
    )
    
} ##FINAL!

In [None]:
getExclusionsPartnerSide = function () {
    
    ##  Returns a named list of unevaluated conditions built with expr(), with
    ##  the same names as the list returned by getExclusions().
    ##
    ##  These are meant to be passed to tagExclusion() as exclude.ptr, which
    ##  stores the evaluated result in is.v2.cut.ptr.
    ##
    ##  An entry of expr(FALSE) flags no rows. At present only evenfinancial has
    ##  an active rule; the commented-out lines are rules that are switched off.
    
    list(
        lenderedge = expr(
            FALSE
#             ( ##  yes
#                 campaign_id %in% c('1710','1732','1747','1758','1781','1782','1784','1785','1793','1794','1795')
#             )
        ),
        evenfinancial = expr(
#             ( ##  yes
#                 campaign_id %in% c('05778644')
#             ) |
            ( ##  yes
                campaign_id %in% c('1CAA4F4A')
#             ) |
#             ( ##  yes
#                 campaign_id %in% c('30E8F0F4')
            )
        ),
        monevo = expr(
            FALSE
#             ( ##  yes
#                 campaign_id %in% c('829','835','842','845','864','870','883','1033')
# #             ) |
#             ( ##  yes
#                 campaign_id %in% c('690','711','846','867','907','919','940','1007') &
#                 ccr_score < 552
#             )
        ),
        quinstreet = expr(
            FALSE
#             ( ##  yes
#                 campaign_id %in% c('105644644','110359244','111370544','111448744','111606844','112726844','113048944','113492144') &
# #                 ccr_score < 533 ## less aggressive
#                 ccr_score < 540 ## more aggressive
#             ) |
#             ( ##  yes
#                 campaign_id %in% c('105644644','110359244','111370544','111448744','111606844','112726844','113048944','113492144') &
#                 ccr_score >= 533 & ## more aggressive
# #                 ccr_score >= 550 &
# #                 ccr_score >= 600 & ## less aggressive
#                 ccr_number_of_loans < 1.5 &
#                 ccr_number_of_employers_last_six_months < 0.5
#             )
        ),
        leadgroup = expr(
            FALSE
#             ( ##  yes
#                 campaign_id %in% c('220026','220223','220257','220392','550022','550061','550065','550067','550102','550108') &
# #                 ccr_score < 542 ## less aggressive
#                 ccr_score < 550 ## more aggressive
#             )
        ),
        leaptheory = expr(
            FALSE
#             (
#                 ccr_score >= 529 &
#                 campaign_id %in% c('101366','102959','211205','321098','43583','52270','53376','71605','81279','90384','90448') &
#                 ccr_number_of_bank_accounts >= 2.5
#             )
        )
    )
    
}

In [None]:
# lenderedge %>%
#     filterModelRows(
#         denominator.quo = NA,
#         campaign_id_keep = NA,
#         timestart = '2020-05-01',
#         timeend = '2020-07-01'
#     ) %>%
#     tagExclusion(
#         exclude.lde = getExclusions()[['quinstreet']],
#         exclude.ptr = getExclusionsPartnerSide()[['quinstreet']]
#     ) %>%
#     evaluateHistorical(
#         timeslice = FALSE, format.output = TRUE
#     )

In [None]:
tagExclusion = function (df, exclude.lde, exclude.ptr) {
    
    ##  tagExclusion is meant to intake (independently)
    ##  1. Test
    ##  2. Validate
    ##  3. PROCESSED, UNFILTERED original
    ##
    ##  df           data frame holding the columns the two conditions refer to.
    ##  exclude.lde  quoted condition (e.g. an element of getExclusions());
    ##               its result is stored in is.v2.cut.
    ##  exclude.ptr  quoted condition (e.g. an element of getExclusionsPartnerSide());
    ##               its result is stored in is.v2.cut.ptr.
    ##
    ##  Returns df with the two flag columns added.

    ##  The conditions are spliced into mutate() with !! so they are evaluated
    ##  against df's columns by dplyr itself. This replaces a base eval() call
    ##  that depended on the calling frame happening to be the data mask, and it
    ##  also accepts quosures and plain constants such as FALSE.
    df %>% 
        mutate(
            is.v2.cut = !!exclude.lde,
            is.v2.cut.ptr = !!exclude.ptr
        )

}

In [None]:
evaluateTestValidate = function (df.tagged, numerator.quo) {
    
    ##  Intended inputs (each on its own): the tagged Test set or the tagged
    ##  Validate set, i.e. a data frame that already carries is.v2.cut.
    ##
    ##  For every value of the response column named by numerator.quo, returns
    ##  the number of rows flagged by is.v2.cut (column `cut`).

    by.response = group_by(df.tagged, !!numerator.quo)
    
    summarize(
        by.response,
        cut = sum(is.v2.cut)
    )

}

In [None]:
evaluateHistorical = function (df.tagged, timeslice = FALSE, format.output = TRUE) {
    
    ##  evaluateHistoricalAggregate is meant to intake (independently)
    ##  1. PROCESSED, UNFILTERED original

    calculateDiff = function (new, old) {
        (new - old)
    }
    calculateDiffPerc = function (new, old) {
        (new - old)/old
    }
    calculateConversion = function (numerator, denominator) {
        numerator/denominator
    }
    
    
    getCountsRaw = function (df.tagged) {
        
        df.tagged %>% 
            filter(
                !is.cut.v1
            ) %>% 
            {
                if (timeslice == TRUE) {

                    . %>% group_by(lead_time)

                } else {
                    return(.)
                }

            }() %>% 
            summarize(
#                 leads.actual = n(),
                
                accept.actual = sum(accepted == 'TRUE'),
                apps.actual = sum(newentered == 'TRUE'),
                brp.actual = sum(bizrulespassed == 'TRUE'),
                qualified.actual = sum(qualified == 'TRUE'),
                bankverified.actual = sum(bankverified == 'TRUE'),
                passscorecardratecard.actual = sum(passscorecardratecard == 'TRUE'),
                contractsigned.actual = sum(contractsigned == 'TRUE'),
                cs.decisioned.actual = sum(cs_decisioned == 'TRUE'),
                funded.actual = sum(funded == 'TRUE'),
                funded.dollar.actual = sum((funded == 'TRUE') * funded_amount),
                clarity.cost.total.actual = sum(
                    sum(clear_credit_risk_app == 'TRUE', na.rm = TRUE) * 0.65,
                    sum(clear_recent_history_app == 'TRUE', na.rm = TRUE) * 0.05,
                    sum(clear_bank_behavior_app == 'TRUE', na.rm = TRUE) * 0.95,
                    sum(clear_advanced_attributes_app == 'TRUE', na.rm = TRUE) * 0.45,
                    sum(clear_fraud_insights_app == 'TRUE', na.rm = TRUE) * 0.65,
                    sum(clear_credit_risk_lead == 'TRUE', na.rm = TRUE) * 0.65,
                    sum(clear_recent_history_lead == 'TRUE', na.rm = TRUE) * 0.05,
                    sum(clear_bank_behavior_lead == 'TRUE', na.rm = TRUE) * 0.95,
                    sum(clear_advanced_attributes_lead == 'TRUE', na.rm = TRUE) * 0.45,
                    sum(clear_fraud_insights_lead == 'TRUE', na.rm = TRUE) * 0.65
                ),
                ibv.cost.actual = sum(ibv_costs),

                
                accept.proposed = sum(accepted == 'TRUE' & !is.v2.cut),
                apps.proposed = sum(newentered == 'TRUE' & !is.v2.cut),
                brp.proposed = sum(bizrulespassed == 'TRUE' & !is.v2.cut),
                qualified.proposed = sum(qualified == 'TRUE' & !is.v2.cut),
                bankverified.proposed = sum(bankverified == 'TRUE' & !is.v2.cut),
                passscorecardratecard.proposed = sum(passscorecardratecard == 'TRUE' & !is.v2.cut),
                contractsigned.proposed = sum(contractsigned == 'TRUE' & !is.v2.cut),
                cs.decisioned.proposed = sum(cs_decisioned == 'TRUE' & !is.v2.cut),
                funded.proposed = sum(funded == 'TRUE' & !is.v2.cut),
                funded.dollar.proposed = sum((funded == 'TRUE' & !is.v2.cut) * funded_amount),
                clarity.cost.total.proposed = sum(
                    sum(clear_credit_risk_app == 'TRUE' & !is.v2.cut, na.rm = TRUE) * 0.65,
                    sum(clear_recent_history_app == 'TRUE' & !is.v2.cut, na.rm = TRUE) * 0.05,
                    sum(clear_bank_behavior_app == 'TRUE' & !is.v2.cut, na.rm = TRUE) * 0.95,
                    sum(clear_advanced_attributes_app == 'TRUE' & !is.v2.cut, na.rm = TRUE) * 0.45,
                    sum(clear_fraud_insights_app == 'TRUE' & !is.v2.cut, na.rm = TRUE) * 0.65,
                    sum(clear_credit_risk_lead == 'TRUE' & !is.v2.cut.ptr, na.rm = TRUE) * 0.65,
                    sum(clear_recent_history_lead == 'TRUE' & !is.v2.cut.ptr, na.rm = TRUE) * 0.05,
                    sum(clear_bank_behavior_lead == 'TRUE' & !is.v2.cut.ptr, na.rm = TRUE) * 0.95,
                    sum(clear_advanced_attributes_lead == 'TRUE' & !is.v2.cut.ptr, na.rm = TRUE) * 0.45,
                    sum(clear_fraud_insights_lead == 'TRUE' & !is.v2.cut.ptr, na.rm = TRUE) * 0.65
                ),
                ibv.cost.proposed = sum(ibv_costs * !is.v2.cut),
            )
        
    }
    
    getCountsActual = function (counts.raw) {
        
        counts.raw %>%
            select(
                matches("(?:\\.actual|lead_time)$")
            ) %>% 
            rename_all(
                .funs = ~ .x %>% str_remove_all('\\.(?:actual|proposed)$')
            )
        
    }
    getCountsProposed = function (counts.raw) {
        
        counts.raw %>%
            select(
                matches("(?:\\.proposed|lead_time)$")
            ) %>% 
            rename_all(
                .funs = ~ .x %>% str_remove_all('\\.(?:actual|proposed)$')
            )
        
    }
    getCountsDeltaAbsolute = function (counts.raw) {
        
        counts.raw %>% 
            {
                if (timeslice == TRUE) {

                    . %>% group_by(lead_time)

                } else {
                    return(.)
                }

            }() %>% 
            summarize(
                accept = calculateDiff(
                    accept.proposed,
                    accept.actual
                ),
                apps = calculateDiff(
                    apps.proposed,
                    apps.actual
                ),
                brp = calculateDiff(
                    brp.proposed,
                    brp.actual
                ),
                qualified = calculateDiff(
                    qualified.proposed,
                    qualified.actual
                ),
                bankverified = calculateDiff(
                    bankverified.proposed,
                    bankverified.actual
                ),
                passscorecardratecard = calculateDiff(
                    passscorecardratecard.proposed,
                    passscorecardratecard.actual
                ),
                contractsigned = calculateDiff(
                    contractsigned.proposed,
                    contractsigned.actual
                ),
                cs.decisioned = calculateDiff(
                    cs.decisioned.proposed,
                    cs.decisioned.actual
                ),
                funded = calculateDiff(
                    funded.proposed,
                    funded.actual
                ),
                funded.dollar = calculateDiff(
                    funded.dollar.proposed,
                    funded.dollar.actual
                ),
                clarity.cost.total = calculateDiff(
                    clarity.cost.total.proposed,
                    clarity.cost.total.actual
                ),
                ibv.cost.total = calculateDiff(
                    ibv.cost.proposed,
                    ibv.cost.actual
                )

            )
        
    }
    # Relative change of every funnel count / cost metric, proposed vs actual.
    #
    # Args:
    #   counts.raw: data frame with a `<metric>.proposed` and `<metric>.actual`
    #               column for each metric summarised below (and `lead_time`
    #               when time-slicing).
    #
    # Returns:
    #   summarize() output with one column per metric, each computed as
    #   calculateDiffPerc(proposed, actual). One row overall, or one row per
    #   `lead_time` when the free variable `timeslice` (taken from the
    #   enclosing scope) is TRUE.
    getCountsDeltaRelative = function (counts.raw) {

        # Group with a plain if/else instead of `{ ... return(.) }()` inside the
        # pipe: calling return() from a magrittr pipe step depended on
        # magrittr 1.x internals and is not supported by magrittr >= 2.0.
        counts.scoped = if (timeslice == TRUE) {
            counts.raw %>% group_by(lead_time)
        } else {
            counts.raw
        }

        counts.scoped %>%
            summarize(
                accept = calculateDiffPerc(
                    accept.proposed,
                    accept.actual
                ),
                apps = calculateDiffPerc(
                    apps.proposed,
                    apps.actual
                ),
                brp = calculateDiffPerc(
                    brp.proposed,
                    brp.actual
                ),
                qualified = calculateDiffPerc(
                    qualified.proposed,
                    qualified.actual
                ),
                bankverified = calculateDiffPerc(
                    bankverified.proposed,
                    bankverified.actual
                ),
                passscorecardratecard = calculateDiffPerc(
                    passscorecardratecard.proposed,
                    passscorecardratecard.actual
                ),
                contractsigned = calculateDiffPerc(
                    contractsigned.proposed,
                    contractsigned.actual
                ),
                cs.decisioned = calculateDiffPerc(
                    cs.decisioned.proposed,
                    cs.decisioned.actual
                ),
                funded = calculateDiffPerc(
                    funded.proposed,
                    funded.actual
                ),
                funded.dollar = calculateDiffPerc(
                    funded.dollar.proposed,
                    funded.dollar.actual
                ),
                clarity.cost.total = calculateDiffPerc(
                    clarity.cost.total.proposed,
                    clarity.cost.total.actual
                ),
                # Source columns are named ibv.cost.*, output is ibv.cost.total.
                ibv.cost.total = calculateDiffPerc(
                    ibv.cost.proposed,
                    ibv.cost.actual
                )
            )

    }
    
    # Funnel conversion rates computed from the *actual* counts.
    #
    # Args:
    #   counts.raw: data frame carrying the `<stage>.actual` count columns used
    #               below (and `lead_time` when time-slicing).
    #
    # Returns:
    #   summarize() output with one column per conversion, each computed as
    #   calculateConversion(numerator, denominator). One row overall, or one
    #   row per `lead_time` when the free variable `timeslice` (taken from the
    #   enclosing scope) is TRUE.
    getConversionsActual = function (counts.raw) {

        # Explicit if/else instead of `{ ... return(.) }()` in the pipe:
        # return() inside a magrittr pipe step depended on magrittr 1.x
        # internals and is not supported by magrittr >= 2.0.
        counts.scoped = if (timeslice == TRUE) {
            counts.raw %>% group_by(lead_time)
        } else {
            counts.raw
        }

        counts.scoped %>%
            summarize(
#                 accept.rate = calculateConversion(accept.actual, leads.actual),

                # Headline stage-to-stage rates.
                apply.rate = calculateConversion(apps.actual, accept.actual),
                qualified.rate = calculateConversion(qualified.actual, apps.actual),
                funding.rate = calculateConversion(funded.actual, qualified.actual),

                # End-to-end rates.
                accept.to.fund = calculateConversion(funded.actual, accept.actual),
                app.to.fund = calculateConversion(funded.actual, apps.actual),

                # Step-by-step rates between qualified and funded.
                bv.q = calculateConversion(bankverified.actual, qualified.actual),
                sc.bv = calculateConversion(passscorecardratecard.actual, bankverified.actual),
                cs.sc = calculateConversion(contractsigned.actual, passscorecardratecard.actual),
                dec.cs = calculateConversion(cs.decisioned.actual, contractsigned.actual),
                f.dec = calculateConversion(funded.actual, cs.decisioned.actual)
            )

    }
    # Funnel conversion rates computed from the *proposed* counts.
    #
    # Args:
    #   counts.raw: data frame carrying the `<stage>.proposed` count columns
    #               used below (and `lead_time` when time-slicing).
    #
    # Returns:
    #   summarize() output with one column per conversion, each computed as
    #   calculateConversion(numerator, denominator). One row overall, or one
    #   row per `lead_time` when the free variable `timeslice` (taken from the
    #   enclosing scope) is TRUE.
    getConversionsProposed = function (counts.raw) {

        # Explicit if/else instead of `{ ... return(.) }()` in the pipe:
        # return() inside a magrittr pipe step depended on magrittr 1.x
        # internals and is not supported by magrittr >= 2.0.
        counts.scoped = if (timeslice == TRUE) {
            counts.raw %>% group_by(lead_time)
        } else {
            counts.raw
        }

        counts.scoped %>%
            summarize(
#                 accept.rate = calculateConversion(accept.proposed, leads.actual),

                # Headline stage-to-stage rates.
                apply.rate = calculateConversion(apps.proposed, accept.proposed),
                qualified.rate = calculateConversion(qualified.proposed, apps.proposed),
                funding.rate = calculateConversion(funded.proposed, qualified.proposed),

                # End-to-end rates.
                accept.to.fund = calculateConversion(funded.proposed, accept.proposed),
                app.to.fund = calculateConversion(funded.proposed, apps.proposed),

                # Step-by-step rates between qualified and funded.
                bv.q = calculateConversion(bankverified.proposed, qualified.proposed),
                sc.bv = calculateConversion(passscorecardratecard.proposed, bankverified.proposed),
                cs.sc = calculateConversion(contractsigned.proposed, passscorecardratecard.proposed),
                dec.cs = calculateConversion(cs.decisioned.proposed, contractsigned.proposed),
                f.dec = calculateConversion(funded.proposed, cs.decisioned.proposed)
            )

    }
    # Difference between proposed and actual conversion rates, per metric,
    # as calculateDiff(proposed, actual) multiplied by 10,000.
    #
    # Args:
    #   conversions.proposed: conversion table for the proposed scenario.
    #   conversions.actual:   conversion table for the actual scenario.
    #
    # Returns:
    #   One row per `lead_time` when the free variable `timeslice` is TRUE
    #   (the two tables are inner-joined on `lead_time`); otherwise a one-off
    #   data.frame built straight from the two tables' columns.
    getConversionsDeltaAbsolute = function (conversions.proposed, conversions.actual) {

        # Same scaling for every metric.
        scaledDiff = function (proposed, actual) {
            10000 * calculateDiff(proposed, actual)
        }

        # Not time-sliced: compare the two tables column by column.
        if (!(timeslice == TRUE)) {
            return(
                data.frame(
#                     accept.rate = scaledDiff(conversions.proposed$accept.rate, conversions.actual$accept.rate),
                    apply.rate = scaledDiff(conversions.proposed$apply.rate, conversions.actual$apply.rate),
                    qualified.rate = scaledDiff(conversions.proposed$qualified.rate, conversions.actual$qualified.rate),
                    funding.rate = scaledDiff(conversions.proposed$funding.rate, conversions.actual$funding.rate),
                    accept.to.fund = scaledDiff(conversions.proposed$accept.to.fund, conversions.actual$accept.to.fund),
                    app.to.fund = scaledDiff(conversions.proposed$app.to.fund, conversions.actual$app.to.fund),
                    bv.q = scaledDiff(conversions.proposed$bv.q, conversions.actual$bv.q),
                    sc.bv = scaledDiff(conversions.proposed$sc.bv, conversions.actual$sc.bv),
                    cs.sc = scaledDiff(conversions.proposed$cs.sc, conversions.actual$cs.sc),
                    dec.cs = scaledDiff(conversions.proposed$dec.cs, conversions.actual$dec.cs),
                    f.dec = scaledDiff(conversions.proposed$f.dec, conversions.actual$f.dec),
                    stringsAsFactors = FALSE
                )
            )
        }

        # Time-sliced: line the two tables up by lead_time, then diff per slice.
        paired = inner_join(
            conversions.proposed,
            conversions.actual,
            by = 'lead_time',
            suffix = c('.proposed', '.actual')
        )

        summarize(
            group_by(paired, lead_time),
#             accept.rate = scaledDiff(accept.rate.proposed, accept.rate.actual),
            apply.rate = scaledDiff(apply.rate.proposed, apply.rate.actual),
            qualified.rate = scaledDiff(qualified.rate.proposed, qualified.rate.actual),
            funding.rate = scaledDiff(funding.rate.proposed, funding.rate.actual),
            accept.to.fund = scaledDiff(accept.to.fund.proposed, accept.to.fund.actual),
            app.to.fund = scaledDiff(app.to.fund.proposed, app.to.fund.actual),
            bv.q = scaledDiff(bv.q.proposed, bv.q.actual),
            sc.bv = scaledDiff(sc.bv.proposed, sc.bv.actual),
            cs.sc = scaledDiff(cs.sc.proposed, cs.sc.actual),
            dec.cs = scaledDiff(dec.cs.proposed, dec.cs.actual),
            f.dec = scaledDiff(f.dec.proposed, f.dec.actual)
        )

    }
    # Relative change between proposed and actual conversion rates, per metric,
    # as calculateDiffPerc(proposed, actual).
    #
    # Args:
    #   conversions.proposed: conversion table for the proposed scenario.
    #   conversions.actual:   conversion table for the actual scenario.
    #
    # Returns:
    #   One row per `lead_time` when the free variable `timeslice` is TRUE
    #   (the two tables are inner-joined on `lead_time`); otherwise a one-off
    #   data.frame built straight from the two tables' columns.
    getConversionsDeltaRelative = function (conversions.proposed, conversions.actual) {

        # Not time-sliced: compare the two tables column by column.
        if (!(timeslice == TRUE)) {
            proposed = conversions.proposed
            actual = conversions.actual

            return(
                data.frame(
#                     accept.rate = calculateDiffPerc(proposed$accept.rate, actual$accept.rate),
                    apply.rate = calculateDiffPerc(proposed$apply.rate, actual$apply.rate),
                    qualified.rate = calculateDiffPerc(proposed$qualified.rate, actual$qualified.rate),
                    funding.rate = calculateDiffPerc(proposed$funding.rate, actual$funding.rate),
                    accept.to.fund = calculateDiffPerc(proposed$accept.to.fund, actual$accept.to.fund),
                    app.to.fund = calculateDiffPerc(proposed$app.to.fund, actual$app.to.fund),
                    bv.q = calculateDiffPerc(proposed$bv.q, actual$bv.q),
                    sc.bv = calculateDiffPerc(proposed$sc.bv, actual$sc.bv),
                    cs.sc = calculateDiffPerc(proposed$cs.sc, actual$cs.sc),
                    dec.cs = calculateDiffPerc(proposed$dec.cs, actual$dec.cs),
                    f.dec = calculateDiffPerc(proposed$f.dec, actual$f.dec),
                    stringsAsFactors = FALSE
                )
            )
        }

        # Time-sliced: line the two tables up by lead_time, then diff per slice.
        paired = inner_join(
            conversions.proposed,
            conversions.actual,
            by = 'lead_time',
            suffix = c('.proposed', '.actual')
        )

        summarize(
            group_by(paired, lead_time),
#             accept.rate = calculateDiffPerc(accept.rate.proposed, accept.rate.actual),
            apply.rate = calculateDiffPerc(apply.rate.proposed, apply.rate.actual),
            qualified.rate = calculateDiffPerc(qualified.rate.proposed, qualified.rate.actual),
            funding.rate = calculateDiffPerc(funding.rate.proposed, funding.rate.actual),
            accept.to.fund = calculateDiffPerc(accept.to.fund.proposed, accept.to.fund.actual),
            app.to.fund = calculateDiffPerc(app.to.fund.proposed, app.to.fund.actual),
            bv.q = calculateDiffPerc(bv.q.proposed, bv.q.actual),
            sc.bv = calculateDiffPerc(sc.bv.proposed, sc.bv.actual),
            cs.sc = calculateDiffPerc(cs.sc.proposed, cs.sc.actual),
            dec.cs = calculateDiffPerc(dec.cs.proposed, dec.cs.actual),
            f.dec = calculateDiffPerc(f.dec.proposed, f.dec.actual)
        )

    }
    
    # Render every numeric column of `summary.df` as an integer-style string
    # with comma thousands separators (formatC, format = 'd').
    # Non-numeric columns are left untouched.
    formatNumber = function (summary.df) {

        mutate_if(
            summary.df,
            is.numeric,
            ~ formatC(.x, format = 'd', big.mark = ',')
        )

    }
    # Render every numeric column of `summary.df` as a percentage string
    # (0.1 accuracy, comma thousands separators). Non-numeric columns are left
    # untouched.
    formatPercent = function (summary.df) {

        # Build the labeller once; it is a plain function of a numeric vector.
        as.percent.label = scales::label_percent(
            accuracy = 0.1,
            big.mark = ','
        )

        mutate_if(summary.df, is.numeric, as.percent.label)

    }
    # Apply `format.function` to `df` only when `format.output` is TRUE;
    # otherwise hand `df` back unchanged.
    #
    # Args:
    #   df:              object to (optionally) format.
    #   format.function: one-argument function applied to `df`.
    #   format.output:   flag deciding whether formatting happens.
    formatExecutor = function (df, format.function, format.output) {

        if (format.output == TRUE) format.function(df) else df

    }
    
    
    counts.raw = df.tagged %>% getCountsRaw()

    # The conversion tables feed both the reported rates and the two delta
    # tables, so build each of them once instead of once per use.
    conversions.actual = counts.raw %>% getConversionsActual()
    conversions.proposed = counts.raw %>% getConversionsProposed()

    # Optional formatting goes through formatExecutor() rather than an inline
    # `{ if (...) {...} else {return(.)} }()` pipe step: return() inside a
    # magrittr pipe step depended on magrittr 1.x internals and is not
    # supported by magrittr >= 2.0.
    # Counts and the absolute conversion delta are shown as whole numbers;
    # relative deltas and conversion rates are shown as percentages.
    return(
        list(
            count.actual = counts.raw %>%
                getCountsActual() %>%
                formatExecutor(formatNumber, format.output),
            count.proposed = counts.raw %>%
                getCountsProposed() %>%
                formatExecutor(formatNumber, format.output),
            count.delta.abs = counts.raw %>%
                getCountsDeltaAbsolute() %>%
                formatExecutor(formatNumber, format.output),
            count.delta.perc = counts.raw %>%
                getCountsDeltaRelative() %>%
                formatExecutor(formatPercent, format.output),


            conv.actual = conversions.actual %>%
                formatExecutor(formatPercent, format.output),
            conv.proposed = conversions.proposed %>%
                formatExecutor(formatPercent, format.output),

            conv.delta.abs =
                getConversionsDeltaAbsolute(
                    conversions.proposed = conversions.proposed,
                    conversions.actual = conversions.actual
                ) %>%
                formatExecutor(formatNumber, format.output),
            conv.delta.perc =
                getConversionsDeltaRelative(
                    conversions.proposed = conversions.proposed,
                    conversions.actual = conversions.actual
                ) %>%
                formatExecutor(formatPercent, format.output)
        )
    )
    
}

In [None]:
# Line-and-point chart of one metric over `lead_time`.
#
# Args:
#   df.evaluation: data frame with a `lead_time` column and the metric column.
#   metric.quo:    quoted metric column, e.g. quo(funded); unquoted into aes().
#
# Returns: the ggplot object.
plotHistorical = function (df.evaluation, metric.quo) {

    metric.mapping = aes(
        x = lead_time,
        y = !!metric.quo
    )

    ggplot(df.evaluation, mapping = metric.mapping) +
        geom_line() +
        geom_point()

}

In [None]:
# Build the actual-vs-proposed impact tables for a single ad method.
#
# Args:
#   df:        lead-level data for the ad method (the callers in this file
#              pass the output of completePreProcess()).
#   admethod:  ad method display name, e.g. 'LenderEdge 4'. It is lower-cased
#              and stripped of whitespace and digits ('lenderedge') to look up
#              the exclusion rules and to label the output rows.
#   timestart, timeend:
#              evaluation window forwarded to filterModelRows(). The defaults
#              reproduce the window that used to be hard-coded here.
#
# Returns:
#   Named list with elements `counts` and `conversions`. Each is a data frame
#   with columns admethod, metric, actual, proposed, delta.absolute,
#   delta.relative (one row per metric; values are the formatted strings
#   produced by evaluateHistorical(format.output = TRUE)).
getFinalRecommendation = function (df, admethod, timestart = '2020-08-01', timeend = '2020-09-01') {

    admethod.short = admethod %>%
        str_to_lower() %>%
        str_remove_all(
            '\\s'
        ) %>%
        str_remove_all(
            '\\d'
        )

    aggregates = df %>%
        filterModelRows(
#             denominator.quo = quo(accepted),
            denominator.quo = NA,
            campaign_id_keep = NA,
            timestart = timestart,
            timeend = timeend
        ) %>%
        tagExclusion(
            exclude.lde = getExclusions()[[admethod.short]],
            exclude.ptr = getExclusionsPartnerSide()[[admethod.short]]
        ) %>%
        evaluateHistorical(
            timeslice = FALSE, format.output = TRUE
        )

    # Which evaluateHistorical() tables make up each output table, listed in
    # the same order as `output.columns`. Selecting by name (instead of by
    # whatever order the result list happens to have) keeps the column labels
    # attached to the right table.
    output.columns = c('actual', 'proposed', 'delta.absolute', 'delta.relative')
    table.parts = list(
        counts = c('count.actual', 'count.proposed', 'count.delta.abs', 'count.delta.perc'),
        conversions = c('conv.actual', 'conv.proposed', 'conv.delta.abs', 'conv.delta.perc')
    )

    table.parts %>%
        map(
            .f = function (part.names) {

                missing.parts = setdiff(part.names, names(aggregates))
                if (length(missing.parts) > 0) {
                    stop(
                        'evaluateHistorical() result is missing: ',
                        paste0(missing.parts, collapse = ', '),
                        call. = FALSE
                    )
                }

                # Each part is a one-row summary; transpose so metrics become
                # rows, then bind the four parts side by side.
                aggregates[part.names] %>%
                    map(
                        .f = t
                    ) %>%
                    do.call(
                        what = cbind,
                        args = .
                    ) %>%
                    as.data.frame(
                        stringsAsFactors = FALSE
                    ) %>%
                    setNames(
                        output.columns
                    ) %>%
                    rownames_to_column(
                        var = 'metric'
                    ) %>%
                    mutate(
                        admethod = admethod.short
                    ) %>%
                    select(
                        admethod,
                        metric,
                        actual,
                        proposed,
                        delta.absolute,
                        delta.relative
                    )
            }
        )




#     df.filtered = df %>%
#         completePreModel(
#             denominator.quo = quo(accepted),
#             numerator.quo = quo(funded),
#             campaign_id_keep = NA,
#             timestart = '2020-05-01',
#             timeend = '2020-07-01',
#             train.perc = 0.8,
#             seed.num = 1
#         )

#     timeslice =
#         leaptheory %>%
#         filterModelRows(
#             denominator.quo = quo(accepted),
#             campaign_id_keep = NA,
#             timestart = '2020-05-01',
#             timeend = '2020-07-01'
#         ) %>%
#         tagExclusion(getExclusions()[[admethod.short]]) %>%
#         evaluateHistorical(timeslice = TRUE, format.output = FALSE)

#     df.filtered %>%
#         getDecisionTree(
#             numerator.quo = quo(funded),
#             minsplit = 1000,
#             minbucket = 1000,
#             cp = 0.001
#         )

#     df.filtered[c('test', 'validate', 'train', 'original')] %>%
#         map(
#             .f = ~ .x %>%
#                     nodes.to.exclude.logic.expr = getExclusions()[[admethod.short]]
#                 ) %>%
#                 evaluateTestValidate(
#                     numerator.quo = quo(funded)
#                 )
#         )

#     timeslice$count.delta.perc %>% plotHistorical(quo(accept))
#     timeslice$count.delta.perc %>% plotHistorical(quo(funded))

}

# Execution

In [None]:
# getAllDFs()

In [None]:
# Load the combined data without re-writing it (write = FALSE), then run it
# through appendCutV1().
df.list = appendCutV1(getBindedDFs(write = FALSE))

In [None]:
# Pre-processing window shared by every ad method below. It is defined once so
# the six calls cannot drift apart.
# NOTE(review): getFinalRecommendation() filters on its own window; keep the
# two in sync when changing these dates.
preprocess.timestart = '2020-08-01'
preprocess.timeend = '2020-09-01'

lenderedge = df.list %>% completePreProcess(admethod = 'LenderEdge 4', timestart = preprocess.timestart, timeend = preprocess.timeend)
# lenderedge %>% getFinalRecommendation(admethod = 'LenderEdge 4') %>% do.call(rbind, .) %>% rownames_to_column('type')

evenfinancial = df.list %>% completePreProcess(admethod = 'Even Financial 4', timestart = preprocess.timestart, timeend = preprocess.timeend)
# evenfinancial %>% getFinalRecommendation(admethod = 'Even Financial 4') %>% do.call(rbind, .) %>% rownames_to_column('type')

monevo = df.list %>% completePreProcess(admethod = 'Monevo', timestart = preprocess.timestart, timeend = preprocess.timeend)
# monevo %>% getFinalRecommendation(admethod = 'Monevo') %>% do.call(rbind, .) %>% rownames_to_column('type')

quinstreet = df.list %>% completePreProcess(admethod = 'Quin Street 4', timestart = preprocess.timestart, timeend = preprocess.timeend)
# quinstreet %>% getFinalRecommendation(admethod = 'Quin Street 4') %>% do.call(rbind, .) %>% rownames_to_column('type')

leadgroup = df.list %>% completePreProcess(admethod = 'LeadGroup', timestart = preprocess.timestart, timeend = preprocess.timeend)
# leadgroup %>% getFinalRecommendation(admethod = 'LeadGroup') %>% do.call(rbind, .) %>% rownames_to_column('type')

leaptheory = df.list %>% completePreProcess(admethod = 'LeapTheory 4', timestart = preprocess.timestart, timeend = preprocess.timeend)
# leaptheory %>% getFinalRecommendation(admethod = 'LeapTheory 4') %>% do.call(rbind, .) %>% rownames_to_column('type')

In [None]:
# Build the impact tables for every ad method, stack them into one data frame
# and export it as CSV.
# NOTE(review): the file name says q12020 while the evaluation window used in
# this notebook is Aug 2020 - confirm the intended export name.
Map(
    # One ad method: stack its counts/conversions tables and keep the stacked
    # row names (e.g. 'counts.1') as the `type` column.
    function (df, admethod) {
        df %>%
            getFinalRecommendation(admethod = admethod) %>%
            do.call(rbind, .) %>%
            rownames_to_column('type')
    },
    list(
        lenderedge,
        evenfinancial,
        monevo,
        quinstreet,
        leadgroup,
        leaptheory
    ),
    c(
        'LenderEdge 4',
        'Even Financial 4',
        'Monevo',
        'Quin Street 4',
        'LeadGroup',
        'LeapTheory 4'
    )
) %>%
    do.call(
        what = rbind,
        args = .
    ) %>%
    write.csv(
        # file.path() instead of a literal '..\\docs\\...' so the export also
        # works outside Windows.
        file.path('..', 'docs', 'v2-impact-export-q12020.csv'),
        row.names = FALSE
    )

In [None]:
# Daily share of accepted Even Financial leads hit by the campaign cut:
#   cut.full.perc     - accepted leads whose campaign_id matches '05778644::'
#   cut.with.ccr.perc - the same leads, further restricted to ccr_score < 540
# Both are plotted as a percentage of that day's accepted leads.
df.list$evenfinancial %>%
    mutate(
        date = lead_time %>% as.Date(),
        # Evaluate the campaign match once and reuse it for both cut variants.
        in.cut.campaign = accepted & str_detect(campaign_id, '05778644::')
    ) %>%
    group_by(
        date
    ) %>%
    summarize(
        accept = sum(accepted),
        # na.rm = TRUE: a missing campaign_id or ccr_score makes the condition
        # NA for that lead, and a single NA would otherwise turn the whole
        # day's sum into NA. Such leads are counted as not cut.
        cut = sum(
            in.cut.campaign,
            na.rm = TRUE
        ),
        cut.with.ccr = sum(
            in.cut.campaign &
            ccr_score < 540,
            na.rm = TRUE
        ),

        cut.full.perc = cut/accept,
        cut.with.ccr.perc = cut.with.ccr/accept
    ) %>%
    pivot_longer(
        cols = !date,
        names_to = 'key',
        values_to = 'value'
    ) %>%
    # Only the two percentage series are plotted.
    filter(
        key %>% str_detect('perc')
    ) %>%
    ggplot(
        mapping = aes(
            x = date,
            y = value,
            color = key,
            fill = key
        )
    ) +
    geom_point() +
    geom_line() +
    scale_y_continuous(
        labels = scales::percent
    )

### Random Forest

In [None]:
# Build a filter expression (as a string) that keeps only rows without missing
# values in any column that contains at least one NA.
#
# Args:
#   data.split: list with a `train.bal` data frame. The columns truefpd,
#               type_formula and funded_amount are ignored (dropped if
#               present).
#
# Returns:
#   A single string such as '!is.na(a) & !is.na(b)', meant to be parsed and
#   evaluated inside a filter. When no column has missing values the result is
#   'TRUE', so the parsed expression still keeps every row (previously this
#   case produced an empty string, which is not a usable filter expression).
getRemoveNALogic = function (data.split) {

    train = data.split$train.bal
    ignored = c('truefpd', 'type_formula', 'funded_amount')
    candidates = train[, setdiff(names(train), ignored), drop = FALSE]

    # Share of missing values per column. Computed column by column, so the
    # data frame is never coerced to a matrix the way apply() would.
    na.share = vapply(
        candidates,
        function (column) mean(is.na(column)),
        numeric(1)
    )

    fields.with.na = names(na.share)[na.share > 0]

    if (length(fields.with.na) == 0) {
        return('TRUE')
    }

    paste0(
        '!is.na(',
        fields.with.na,
        ')',
        collapse = ' & '
    )

}

In [None]:
# Train an mlr random-forest classifier for `funded` on the balanced training
# split.
#
# Args:
#   data.split: list with a `train.bal` data frame containing the target
#               `funded` plus the feature columns. newentered, qualified,
#               funded_amount, type_formula and truefpd are removed before
#               training.
#   evaluate:   currently unused (the evaluation code below is commented out);
#               kept so existing calls keep working.
#   ntree:      number of trees.
#   mtry:       variables tried per split; defaults to
#               ceiling(sqrt(ncol(train.bal))).
#   replace:    sample with replacement?
#   oob.prox:   forwarded to the learner (it used to be ignored and was always
#               FALSE).
#
# Returns: the trained mlr model (result of train()).
getRandomForest = function (
    data.split,
    evaluate = FALSE,
    ntree = 24,
    mtry = data.split$train.bal %>% ncol() %>% sqrt() %>% ceiling(),
    replace = TRUE,
#     cutoff = 1/2,
#     sampsize = nrow(data.split$train.bal),
    #     nodesize = 5,
    oob.prox = FALSE
) {


    ####  Setup  ####

    suppressWarnings({suppressMessages({
        library(randomForest)
    })})
#     suppressWarnings({
#         listLearners() %>%
#             filter(
#                 type == 'classif' &
#                 name %>% str_detect('[Ff]orest')
#             )
#     })
#     getParamSet('classif.randomForest')


    ####  Task + Learner = Train  ####

    # Keep only complete rows of the modelling columns. This replaces an
    # eval(parse(...)) filter whose expression was built from train.bal
    # *before* newentered/qualified were removed, so it could reference
    # columns that no longer exist at filter time.
    train.complete = data.split$train.bal %>%
        select(
            -newentered,
            -qualified,
            -funded_amount,
            -type_formula,
            -truefpd
        ) %>%
        tidyr::drop_na() %>%
        as.data.frame()

    rf.task = makeClassifTask(
        id = 'rf.task',
        data = train.complete,
        target = 'funded',
        positive = 'TRUE'
    )

    rf.learner = makeLearner(
        cl = 'classif.randomForest',
        id = 'rf.learner',
        ntree = ntree,
        mtry = mtry,
        replace = replace,
#         cutoff = cutoff,
#         sampsize = sampsize,
#         nodesize = nodesize,
        oob.prox = oob.prox
    )



    ####  Hypertuning  ####

#     rf.resample = makeResampleDesc('CV', iters = 5, stratify = TRUE)

#     rf.params = makeParamSet(
#     #     makeIntegerParam('mtry', lower = 4, upper = 12),
#     #     makeNumericParam('nodesize', lower = 10, upper = 11)
#     )

#     rf.search = makeTuneControlGrid()

#     rf.tune = tuneParams(
#         task = rf.task,
#         learner = rf.learner,
#         resampling = rf.resample,
#         par.set = rf.params,
#         control = rf.search,
#         measures = list(mmce, acc, fpr),
#         show.info = FALSE
#     )

#     rf.tune$x

#     # setHyperPars(
#     #     learner = rf.learner,
#     #     par.vals = rf.tune$x
#     # )



    ####  Resampling  ####

    # rf.cv = resample(
    #     learner = rf.learner,
    #     task = rf.task,
    #     resampling = rf.resample,
    #     measures = list(acc, mmce, fpr),
    #     show.info = FALSE
    # )

    # rf.cv %>% .$measures.test
    # rf.cv %>% .$aggr %>% as.data.frame()



    ####  Training  ####

    rf.model = train(
        learner = rf.learner,
        task = rf.task
    )



    ####  Evaluation  ####

#     rf.predict = predict(
#         object = rf.model,
#         newdata = data.split$test %>%
#             select(
#                 -newentered,
#                 -qualified,
#                 -funded_amount,
#                 -type_formula,
#                 -truefpd
#             )
#     )

#     rf.validate = predict(
#         object = rf.model,
#         newdata = data.split$validate %>%
#             select(
#                 -newentered,
#                 -qualified,
#                 -funded_amount,
#                 -type_formula,
#                 -truefpd
#             )
#     )

#     test.eval = rf.predict %>% getEvaluation()
#     validate.eval = rf.validate %>% getEvaluation()

#     if (evaluate) {
#         list(
#             test.eval = test.eval,
#             validate.eval = validate.eval
#         )
#     }



    ####  Outputs  ####

    ##    Model    ##
    return(rf.model)

}

In [None]:
# Variable importance of a trained random-forest model.
#
# Args:
#   rf.model: trained model whose `learner.model` element is the fitted forest.
#
# Returns:
#   list(df, plot): `df` holds the importance table with a `variable` column,
#   sorted by MeanDecreaseGini (descending); `plot` is whatever varImpPlot()
#   returns.
#   NOTE(review): varImpPlot() draws as a side effect; per the randomForest
#   docs its return value is the importance values rather than a plot object -
#   confirm before relying on `plot`.
getVIP = function (rf.model) {

    forest = rf.model$learner.model

    ranked = forest %>%
        importance() %>%
        as.data.frame() %>%
        rownames_to_column(var = 'variable') %>%
        arrange(desc(MeanDecreaseGini))

    list(
        df = ranked,
        plot = varImpPlot(forest)
    )
}

In [None]:
# Fit the forest with the default hyper-parameters.
# NOTE(review): `data.split` must already exist in the session.
rf = getRandomForest(data.split)

In [None]:
# Show the importance table and draw the importance plot for the fitted forest.
getVIP(rf)

In [None]:
#### Clarity Field Analysis ####
clarityAnalysis = function () {

# ####  Clarity Report Type Validation  ####
# test %>%
#     filter(
#         ! report_received %>% str_detect('(?:FWB)?Leads01.*')
#     ) %>% 
#     transmute(
#         lead_id,
#         report_received,
#         report_requested,
#         lead_time,
#         report_time,
#         lead.date = lead_time %>% as.Date(),
#         report.date = report_time %>% as.Date(),
#         diff = lead.date - report.date
#     ) %>% 
#     arrange(
#         diff
#     )
# #     ) %T>%
# #     write.csv("..\\docs\\received-not-leads01.csv")

# test %>% group_by(report_received) %>% summarize(n = n()) %>% ungroup() %>% arrange(desc(n))

#### Get Clarity Data

####  Existing Fields  ####
getClarityFields = function () {

    a <- queryReporting(
    "
    select
        *
    from
        lde4.leads
    where
        --lead_time >= now()::date - '5 days'::interval
        lead_id = '99f418da-8b66-471e-9586-f4112718ed21'
    limit 100
    "
    ) %>%
        select(
            lead_id,
            clarity_report,
            accepted
        )

    b <- a %>%
        filter(
            !is.na(clarity_report)
        ) %>% 
        mutate(
            json.df = clarity_report %>% 
                map(
                    .f = ~ .x %>%
                        fromJSON() %>%
                        .$xml_response %>% 
                        unlist() %>% 
                        as.data.frame(
                            stringsAsFactors = FALSE
                        ) %>% 
                        t()
                )
        )

    all.fields <<- b %>%
        filter(
            lead_id == '99f418da-8b66-471e-9586-f4112718ed21'
        ) %>%
        .$clarity_report %>%
        fromJSON(
        ) %>% 
        .$xml_response %>% 
        unlist(
        ) %>% 
        as.data.frame(
            stringsAsFactors = FALSE
        ) %>% 
        rownames_to_column(
            var = 'key'
        ) %>% 
        rename(
            value = "."
        )

    inquiry.fields <<-
        all.fields %>%
            filter(
                key %>% str_detect('^inquiry\\..*') &
                value != '' &
                !is.na(value)
            ) %>% 
            filter(
                key %in% paste0(
                    'inquiry.',
                    c(
                        'ofac_match',
                        'ofac_score',
                        'social_security_valid',
                        'social_security_deceased',
                        'ssn_distinct_first_last_name_count',
                        'paycheck_direct_deposit',
                        'bank_routing_valid',
                        'inquiry_purpose_type'
                    )
                )
            )

    ccr.fields <<-
        all.fields %>%
            filter(
                key %>% str_detect('^clear_credit_risk\\..*') &
                value != '' &
                !is.na(value)
            ) %>% 
            filter(
                !key %>% str_detect('inquiries\\.member_id') &
                !key %>% str_detect('inquiry_received_at') &
                !key %>% str_detect('inquiry_purpose_type') &
                !key %>% str_detect('inquiry_tradeline_type') &
                !key %>% str_detect('tradelines\\..*') &
                !key %>% str_detect('stabilities\\..*') &
                !key %>% str_detect('experian_attribute\\..*') &
                !key %>% str_detect('description') &
                !key %>% str_detect('full_name') &
                !key %>% str_detect('code') &
                !key %>% str_detect('date') &
                !key %>% str_detect('first') &
                !key %>% str_detect('48')
#                 !key %>% str_detect('inquiry_purpose_type') & #keep
#                 !key %>% str_detect('inquiry_tradeline_type') & #keep
#                 !key %>% str_detect('tradelines\\.account_opened') & #keep
#                 !key %>% str_detect('tradelines\\.highest_credit') & #keep
#                 !key %>% str_detect('tradelines\\.amount_past_due') & #keep
#                 !key %>% str_detect('tradelines\\.current_balance') & #keep
            )

    crh.fields <<-
        all.fields %>%
            filter(
                key %>% str_detect('^clear_recent_history\\..*') &
                value != '' &
                !is.na(value)
            ) %>% 
            filter(
                !key %>% str_detect('tradeline_stabilities') &
                !key %>% str_detect('date') &
                !key %>% str_detect('name') &
                !key %>% str_detect('\\d+')
            )

    rbind(
        inquiry.fields,
        ccr.fields,
        crh.fields
    ) %>% .$key
    
}

####  Pull Test Clarity Report  ####
test = queryReporting(
"
select

    --  Identifiers --
    lde.lead_id
    , lde.leadofferid
    , lde.passthru_lead_offer_id
    , lde.lead_time at time zone 'America/Chicago' as lead_time
    , lde.partnerid

    --  Credit  --
    , case when lde.clarity_report notnull then TRUE else FALSE end as has_clarity
    , lde.clarity_report -> 'xml_response' -> 'inquiry' ->> 'inquiry_received_at' as report_time
    , lde.clarity_report -> 'xml_response' -> 'opploans' ->> 'received' as report_received
    , lde.clarity_report -> 'xml_response' -> 'opploans' ->> 'requested_file' as report_requested

    , lde.clarity_report -> 'xml_response' -> 'inquiry' ->> 'ofac_score' as ofac_score
    , lde.clarity_report -> 'xml_response' -> 'inquiry' ->> 'paycheck_direct_deposit' as paycheck_direct_deposit
    , lde.clarity_report -> 'xml_response' -> 'inquiry' ->> 'ssn_distinct_first_last_name_count' as ssn_distinct_first_last_name_count

    , lde.clarity_report -> 'xml_response' -> 'clear_credit_risk' ->> 'score' as ccr_score
    , lde.clarity_report -> 'xml_response' -> 'clear_credit_risk' ->> 'clarity_seen' as ccr_clarity_seen
    , lde.clarity_report -> 'xml_response' -> 'clear_credit_risk' ->> 'number_of_loans' as ccr_number_of_loans
    , lde.clarity_report -> 'xml_response' -> 'clear_credit_risk' ->> 'number_of_bank_accounts' as ccr_number_of_bank_accounts
    , lde.clarity_report -> 'xml_response' -> 'clear_credit_risk' ->> 'highest_number_of_days_past_due' as ccr_highest_number_of_days_past_due
    , lde.clarity_report -> 'xml_response' -> 'clear_credit_risk' ->> 'current_inquiry_cluster_position' as ccr_current_inquiry_cluster_position
    , lde.clarity_report -> 'xml_response' -> 'clear_credit_risk' ->> 'days_since_last_loan_charged_off' as ccr_days_since_last_loan_charged_off
    , lde.clarity_report -> 'xml_response' -> 'clear_credit_risk' ->> 'days_since_inquiry_previously_seen' as ccr_days_since_inquiry_previously_seen
    , lde.clarity_report -> 'xml_response' -> 'clear_credit_risk' ->> 'number_of_employers_last_six_months' as ccr_number_of_employers_last_six_months

    , lde.clarity_report -> 'xml_response' -> 'clear_recent_history' -> 'supplier_recent_history' -> 'summary_recent_history' ->> 'loans_in_collections' as srh_loans_in_collections
    , lde.clarity_report -> 'xml_response' -> 'clear_recent_history' -> 'supplier_recent_history' -> 'summary_recent_history' ->> 'spml_average_rollovers' as srh_spml_average_rollovers
    , lde.clarity_report -> 'xml_response' -> 'clear_recent_history' -> 'supplier_recent_history' -> 'summary_recent_history' ->> 'amount_loans_charged_off' as srh_amount_loans_charged_off
    , lde.clarity_report -> 'xml_response' -> 'clear_recent_history' -> 'supplier_recent_history' -> 'summary_recent_history' ->> 'online_loan_opened_in_the_last_year' as srh_online_loan_opened_in_the_last_year
    , lde.clarity_report -> 'xml_response' -> 'clear_recent_history' -> 'supplier_recent_history' -> 'summary_recent_history' ->> 'online_loan_inquiry_in_the_last_thirty_days' as srh_online_loan_inquiry_in_the_last_thirty_days

    , lde.clarity_report -> 'xml_response' -> 'clear_recent_history' -> 'inquiry_cluster_recent_history' -> 'total_inquiry_clusters_recent_history' ->> 'ninety_days_ago' as ticrh_ninety_days_ago
    , lde.clarity_report -> 'xml_response' -> 'clear_recent_history' -> 'inquiry_cluster_recent_history' -> 'total_inquiry_clusters_recent_history' ->> 'twentyfour_hours_ago' as ticrh_twentyfour_hours_ago


    from
        lde4.leads as lde
    inner join
        cloudlending.advertising_method as c_am
        on lde.partnerid = c_am.external_id
        and c_am.name = 'LenderEdge 4' 
    where
        lde.accepted = TRUE
        and lde.lead_time >= '2020-03-09'::date
    limit 1000
"
)

####  Identify Data Types  ####
# Columns that are not model features: they are dropped from the raw pull
# during type conversion and are excluded when numeric.features is derived.
not.features <- c(
  "lead_id",
  "leadofferid",
  "passthru_lead_offer_id",
  "lead_time",
  "partnerid",
  "has_clarity",
  "report_time",
  "report_received",
  "report_requested"
)

# Columns that the type-conversion step casts with as.logical().
boolean.features <- c(
  "paycheck_direct_deposit",
  "ccr_hit",
  "ccr_clarity_seen",
  "srh_online_loan_opened_in_the_last_year",
  "srh_online_loan_inquiry_in_the_last_thirty_days"
)

# Every column of `test` that has not been classified elsewhere: anything that
# is not a boolean feature, not an excluded column, and not one of the
# worst-payment-rating fields (the raw rating or its derived flags).
numeric.features <- colnames(test)[
  !colnames(test) %in% c(
    not.features,
    boolean.features,
    "ccr_worst_payment_rating",
    "ccr_worst_payment_rating_null",
    "ccr_worst_payment_rating_plus",
    "ccr_worst_payment_rating_zero",
    "ccr_worst_payment_rating_hash",
    "ccr_worst_payment_rating_else"
  )
]

# Raw columns that are replaced by derived indicator columns; they are dropped
# at the end of the type-conversion step and left out of the correlation input.
processed.features <- c(
  "ccr_worst_payment_rating",
  "ccr_days_since_last_loan_charged_off",
  "ccr_days_since_last_loan_paid_off",
  "ccr_days_since_last_ontime_payment",
  "ccr_days_since_last_loan_payment",
  "ccr_days_since_last_loan_opened"
)

# Columns whose missing values are filled with the column median during
# imputation (names also listed in correlated.features.numeric are skipped
# there).
impute.median <- c(
  "ccr_days_since_previous_bank_account_previously_seen",
  "ccr_days_since_reported_income_previously_seen",
  "ccr_days_since_inquiry_previously_seen",
  "ccr_highest_number_of_days_past_due",
  "paycheck_direct_deposit"
)

# Columns whose missing values are filled with the column mean during
# imputation (names also listed in correlated.features.numeric are skipped
# there). Each column is listed exactly once.
impute.mean = c(
    'ccr_number_of_loans',
    'ccr_number_of_bank_accounts',
    'ccr_number_of_loans_paid_off',
    'ccr_number_of_loans_past_due',
    'ccr_current_inquiry_cluster_position',
    'ccr_number_of_loans_current_and_open',
    'ccr_number_of_employers_last_six_months',
    'ccr_score'
)

# Numeric columns set aside as correlated; the type-conversion and imputation
# steps skip every name listed here. Each group is labelled with the feature
# its members are paired with (presumably the collinear feature that is kept;
# confirm against the correlation matrix).
correlated.features.numeric <- c(
  # paired with: ccr_current_inquiry_cluster_position
  "icrh_ten_minutes_ago",
  "icrh_twenty_minutes_ago",
  "icrh_thirty_minutes_ago",
  "icrh_one_hour_ago",
  "icrh_twentyfour_hours_ago",
  "icrh_seven_days_ago",
  "icrh_thirty_days_ago",
  "icrh_ninety_days_ago",
  "icrh_recent_history_current_inquiry_cluster_position",

  # paired with: ticrh_twentyfour_hours_ago
  "ticrh_seven_days_ago",
  "ticrh_thirty_days_ago",

  # paired with: ccr_number_of_loans
  "ccr_number_of_loans_paid_off",
  "ccr_number_of_loans_past_due",
  "ccr_number_of_loans_current_and_open",

  # paired with: ccr_days_since_inquiry_previously_seen
  "ccr_days_since_reported_income_previously_seen",
  "ccr_days_since_previous_bank_account_previously_seen",

  # paired with: srh_loans_in_collections
  "srh_amount_loans_in_collections",
  "srh_days_with_open_loans_in_the_last_ninety_days",
  "srh_days_with_open_loans_in_the_last_year"
)

# Logical columns set aside as correlated. Each group is labelled with the
# feature its members are paired with (presumably the collinear feature that
# is kept; confirm against the correlation matrix).
correlated.features.logical <- c(
  # paired with: ccr_clarity_seen
  "ccr_hit",

  # paired with: ccr_has_previous_loan_charged_off
  "ccr_worst_payment_rating_plus",

  # paired with: ccr_has_previous_loan_opened
  "ccr_worst_payment_rating_null",
  "ccr_has_previous_loan_payment",
  "ccr_has_previous_ontime_payment",
  "ccr_has_previous_loan_paid_off",
  "ccr_has_previous_loan_charged_off"
)

#### Convert Data Types

####  Convert Data Types  ####
# Builds test.clean from the raw `test` pull:
#   1. drop the columns listed in not.features;
#   2. cast boolean.features to logical and numeric.features to numeric,
#      skipping any name listed in correlated.features.numeric;
#   3. expand ccr_worst_payment_rating into one logical flag per category
#      (missing, "+", "0", "#", anything else);
#   4. derive the ccr_has_previous_* columns by passing each
#      ccr_days_since_last_* column through the matching getClarityMapping()
#      converter;
#   5. drop the raw columns listed in processed.features.
test.clean <- test %>%
  select(-not.features) %>%
  mutate_at(
    .vars = boolean.features[!boolean.features %in% correlated.features.numeric],
    .funs = as.logical
  ) %>%
  mutate_at(
    .vars = numeric.features[!numeric.features %in% correlated.features.numeric],
    .funs = as.numeric
  ) %>%
  mutate(
    # The `!..._null &` guard keeps each flag FALSE (not NA) for missing ratings.
    ccr_worst_payment_rating_null = is.na(ccr_worst_payment_rating),
    ccr_worst_payment_rating_plus = !ccr_worst_payment_rating_null & ccr_worst_payment_rating == '+',
    ccr_worst_payment_rating_zero = !ccr_worst_payment_rating_null & ccr_worst_payment_rating == '0',
    ccr_worst_payment_rating_hash = !ccr_worst_payment_rating_null & ccr_worst_payment_rating == '#',
    # TRUE only when none of the four flags above is set.
    ccr_worst_payment_rating_else =
      !ccr_worst_payment_rating_plus &
      !ccr_worst_payment_rating_zero &
      !ccr_worst_payment_rating_hash &
      !ccr_worst_payment_rating_null,

    ccr_has_previous_loan_charged_off =
      getClarityMapping()$convertDaysChargedOff(ccr_days_since_last_loan_charged_off),
    ccr_has_previous_loan_paid_off =
      getClarityMapping()$convertDaysPaidOff(ccr_days_since_last_loan_paid_off),
    ccr_has_previous_ontime_payment =
      getClarityMapping()$convertDaysOntimePayment(ccr_days_since_last_ontime_payment),
    ccr_has_previous_loan_payment =
      getClarityMapping()$convertDaysAnyPayment(ccr_days_since_last_loan_payment),
    ccr_has_previous_loan_opened =
      getClarityMapping()$convertDaysLoanOpened(ccr_days_since_last_loan_opened)
  ) %>%
  select(-processed.features)
# test.clean %>% str()

#### Impute

####  Examine Values in Field  ####
# Ad-hoc inspection of a single column: change `field` to look at another one.
field <- quo(paycheck_direct_deposit)

# Median and mean of the converted column (missing values ignored).
median(test.clean[[quo_name(field)]], na.rm = TRUE)
mean(test.clean[[quo_name(field)]], na.rm = TRUE)

# Bar chart of how often each raw value occurs in `test`.
# Optional tweaks while exploring:
#   - group on `!!field %>% as.numeric` instead of the raw value
#   - add filter(!is.na(var)) or filter(var < 100) after ungroup()
#   - sort with arrange(desc(n)) instead of arrange(var)
ggplot(
  data = test %>%
    group_by(var = !!field) %>%
    summarize(n = n()) %>%
    ungroup() %>%
    arrange(var),
  mapping = aes(x = var, y = n)
) +
  geom_bar(stat = 'identity')

####  Impute and/or Remove Missing Values  ####
# Step 1 (test.impute.value): fill missing values column by column.
#   - impute.median columns get the column median, impute.mean columns get the
#     column mean; names listed in correlated.features.numeric are skipped.
#   - paycheck_direct_deposit is cast back to logical afterwards.
# Step 2 (test.impute): keep only the rows with no remaining NA in any column.
test.impute.value = test.clean %>%
    mutate_at(
        .vars = impute.median[ !impute.median %in% correlated.features.numeric ],
        # Impute on a numeric copy of the column. median() of a logical column
        # (paycheck_direct_deposit) is 0.5 when TRUE/FALSE counts tie, and
        # replace_na() in tidyr >= 1.2.0 refuses a replacement that would change
        # the column type. as.numeric() is a no-op for columns already numeric.
        .funs = ~ replace_na(
            as.numeric(.x),
            median(as.numeric(.x), na.rm = TRUE)
        )
    ) %>%
    mutate_at(
        .vars = impute.mean[ !impute.mean %in% correlated.features.numeric ],
        .funs = ~ replace_na(
            .x,
            mean(.x, na.rm = TRUE)
        )
    ) %>%
    mutate(
        # Any non-zero imputed value (including a 0.5 tie) becomes TRUE.
        paycheck_direct_deposit = as.logical(paycheck_direct_deposit)
    )

# complete.cases() flags rows without any NA directly, instead of coercing the
# whole data frame to a matrix and counting NAs row by row with apply().
test.impute = test.impute.value %>%
    filter(
        complete.cases(test.impute.value)
    )

#### Numeric Collinearity

####  Calculate Correlation Matrix (Numeric)  ####
# Pairwise correlations between the numeric features that survive processing
# (numeric.features minus processed.features), computed on the imputed data.
test.numeric.cor <- cor(
  select(
    test.impute,
    numeric.features[!numeric.features %in% processed.features]
  )
)

# Blank the entries above the diagonal so each pair is reported once, then
# reshape to long form; melt(na.rm = TRUE) drops the blanked cells.
test.numeric.cor[upper.tri(test.numeric.cor)] <- NA
test.numeric.cor.upper <- melt(test.numeric.cor, na.rm = TRUE)

# ####  Sum Missing (NA) Values for Numeric  ####
# apply(
# #     X = test.clean %>%
#     X = test.impute %>%
#         select(
#             numeric.features[ which(!numeric.features %in% c(processed.features, correlated.features.numeric)) ]
#         ),
#     FUN = function (x) { is.na(x) %>% sum() },
#     MARGIN = 2
# ) %>% 
# as.data.frame() %>% select(n = '.') %>% rownames_to_column('field') %>% arrange(desc(n))

# ####  Find / Remove Collinear Features (Numeric)  ####
# correlated.features.numeric = c(
#     'icrh_ten_minutes_ago',                                    #ccr_current_inquiry_cluster_position
#     'icrh_twenty_minutes_ago',                                 #ccr_current_inquiry_cluster_position
#     'icrh_thirty_minutes_ago',                                 #ccr_current_inquiry_cluster_position
#     'icrh_one_hour_ago',                                       #ccr_current_inquiry_cluster_position
#     'icrh_twentyfour_hours_ago',                               #ccr_current_inquiry_cluster_position
#     'icrh_seven_days_ago',                                     #ccr_current_inquiry_cluster_position
#     'icrh_thirty_days_ago',                                    #ccr_current_inquiry_cluster_position
#     'icrh_ninety_days_ago',                                    #ccr_current_inquiry_cluster_position
#     'icrh_recent_history_current_inquiry_cluster_position',    #ccr_current_inquiry_cluster_position
    
#     'ticrh_seven_days_ago',                                    #ticrh_twentyfour_hours_ago
#     'ticrh_thirty_days_ago',                                   #ticrh_twentyfour_hours_ago
    
#     'ccr_number_of_loans_paid_off',                            #ccr_number_of_loans
#     'ccr_number_of_loans_past_due',                            #ccr_number_of_loans
#     'ccr_number_of_loans_current_and_open',                    #ccr_number_of_loans,
#     'ccr_days_since_reported_income_previously_seen',          #ccr_days_since_inquiry_previously_seen
#     'ccr_days_since_previous_bank_account_previously_seen',    #ccr_days_since_inquiry_previously_seen
    
#     'srh_amount_loans_in_collections',                         #srh_loans_in_collections
#     'srh_days_with_open_loans_in_the_last_ninety_days',        #srh_loans_in_collections
#     'srh_days_with_open_loans_in_the_last_year'                #srh_loans_in_collections
# )

# test.numeric.cor.upper.removed = test.numeric.cor.upper %>% 
#     filter(
#         Var1 != Var2
#     ) %>% 
#     arrange(
#         desc(value)
#     ) %>% 
#     filter(
#         ! Var1 %in% correlated.features.numeric &
#         ! Var2 %in% correlated.features.numeric
#     ) %>% 
# #     group_by(
# #         Var1
# #     ) %>% 
# #     summarize(
# #         n = n(),
# #         total.cor = sum(value^2)
# #     ) %>% 
# #     ungroup() %>% 
#     arrange(
# #         total.cor %>% desc
#         value %>% desc
#     )

# test.numeric.cor.upper.removed %T>%
#     head() %>% 
#     ggplot(
#         mapping = aes(
#             x = Var1,
#             y = Var2,
#             fill = value
#         )
#     ) +
#     geom_tile(
#         color = 'white'
#     ) +
#     scale_fill_gradient2(
#         low = "blue",
#         high = "red",
#         mid = "white", 
#         midpoint = 0,
#         limit = c(-1,1),
#         space = "Lab", 
#         name="Pearson\nCorrelation"
#     ) +
#     theme_minimal() +
#     theme(
#         axis.text.x = element_text(
#             angle = -45,
#             hjust = 0
#         )
#     )

#### Boolean Collinearity

####  Calculate Correlation Matrix (Boolean)  ####
# Pairwise correlations between the columns left in the imputed data once the
# surviving numeric features (numeric.features minus processed.features) are
# removed.
test.logical.cor <- cor(
  select(
    test.impute,
    -c(numeric.features[!numeric.features %in% processed.features])
  )
)

# Blank the entries above the diagonal so each pair is reported once, then
# reshape to long form; melt(na.rm = TRUE) drops the blanked cells.
test.logical.cor[upper.tri(test.logical.cor)] <- NA
test.logical.cor.upper <- melt(test.logical.cor, na.rm = TRUE)

# ####  Find / Remove Collinear Features (Logical)  ####
# correlated.features.logical = c(
#     'ccr_hit',                               # ccr_clarity_seen
#     'ccr_worst_payment_rating_plus',         # ccr_has_previous_loan_charged_off
#     'ccr_worst_payment_rating_null',         # ccr_has_previous_loan_opened  
#     'ccr_has_previous_loan_payment',         # ccr_has_previous_loan_opened  
#     'ccr_has_previous_ontime_payment',       # ccr_has_previous_loan_opened  
#     'ccr_has_previous_loan_paid_off',        # ccr_has_previous_loan_opened   
#     'ccr_has_previous_loan_charged_off'      # ccr_has_previous_loan_opened  
# )

# test.logical.cor.upper.removed = test.logical.cor.upper %>% 
#     filter(
#         Var1 != Var2
#     ) %>% 
#     arrange(
#         desc(value)
# #         value
#     ) %>% 
#     filter(
#         ! Var1 %in% correlated.features.logical &
#         ! Var2 %in% correlated.features.logical
# #     ) %>% 
# #     group_by(
# #         Var1
# #     ) %>% 
# #     summarize(
# #         n = n(),
# #         total.cor = sum(value^2)
# #     ) %>% 
# #     ungroup() %>% 
# #     arrange(
# #         total.cor %>% desc
#     )

# test.logical.cor.upper.removed

# test.logical.cor.upper.removed %>%
#     ggplot(
#         mapping = aes(
#             x = Var1,
#             y = Var2,
#             fill = value
#         )
#     ) +
#     geom_tile(
#         color = 'white'
#     ) +
#     scale_fill_gradient2(
#         low = "blue",
#         high = "red",
#         mid = "white", 
#         midpoint = 0,
#         limit = c(-1,1),
#         space = "Lab", 
#         name="Pearson\nCorrelation"
#     ) +
#     theme_minimal() +
#     theme(
#         axis.text.x = element_text(
#             angle = -45,
#             hjust = 0
#         )
#     )

# getClarityMapping = function () {
    
#     convertInquiryPurposeType = function (purpose.code) {
        
#         case_when(
#             purpose.code == 'AR' ~ 'New Credit',
#             purpose.code == 'AS' ~ 'New Credit Soft',
#             purpose.code == 'RA' ~ 'Account Review Soft',
#             purpose.code == 'RP' ~ 'Consumer Inquiry Soft',
#             purpose.code == 'CL' ~ 'Collection Inquiry',
#             purpose.code == 'PC' ~ 'Pre-check Soft',
#             purpose.code == 'MS' ~ 'Credit Monitor Soft',
#             purpose.code == 'CC' ~ 'Check Cash',
#             purpose.code == 'CS' ~ 'Collection Soft',
#             purpose.code == 'PS' ~ 'Pre-screen Soft',
#             purpose.code == 'IV' ~ 'Item Verification',
#             purpose.code == 'IS' ~ 'Item Verification Soft',
#             purpose.code == 'EH' ~ 'Employment',
#             purpose.code == 'ES' ~ 'Employment Soft',
#             purpose.code == 'LH' ~ 'Lease',
#             purpose.code == 'LS' ~ 'Lease Soft',
#             purpose.code == 'WS' ~ 'Written Authorization Soft',
#             purpose.code == 'WH' ~ 'Written Authorization - Hard',
#             purpose.code == 'PR' ~ 'Portfolio Review',
#             purpose.code == 'PA' ~ 'Portfolio Acquisition',
#             purpose.code == 'SP' ~ 'Subpoena',
#             TRUE ~ 'Other'
#         )
#     }
#     convertWorstPaymentRatingCCR = function (rating) {
        
#         case_when(
#             is.na(rating) ~ 0,
#             rating == '+' ~ 1,
#             rating == '0' ~ 2,
#             rating == '#' ~ 3,
#             rating == '@' ~ 4,
#             rating == 'X' ~ 5,
#             rating == '4' ~ 6,
#             rating == 'V' ~ 7,
#             rating == 'W' ~ 8,
#             rating == '1' ~ 9,
#             rating == '5' ~ 10,
#             rating == 'B' ~ 11,
#             rating == 'L' ~ 12,
#             rating == '7' ~ 13,
#             rating == '8' ~ 14,
#             rating == 'C' ~ 15,
#             rating == 'D' ~ 16,
#             rating == 'E' ~ 17,
#             rating == 'H' ~ 18,
#             rating == 'U' ~ 19,
#             rating == 'Y' ~ 20,
#             rating == 'Z' ~ 21,
#             TRUE ~ 22
#         )
#     }
#     convertDaysChargedOff = function (days) {
        
#         case_when(
#             is.na(days) ~ FALSE,
#             TRUE ~ TRUE
#         )
        
#     }
#     convertDaysPaidOff = function (days) {
        
#         case_when(
#             is.na(days) ~ FALSE,
#             TRUE ~ TRUE
#         )
        
#     }
#     convertDaysOntimePayment = function (days) {
        
#         case_when(
#             is.na(days) ~ FALSE,
#             TRUE ~ TRUE
#         )
        
#     }
#     convertDaysAnyPayment = function (days) {
        
#         case_when(
#             is.na(days) ~ FALSE,
#             TRUE ~ TRUE
#         )
        
#     }
#     convertDaysLoanOpened = function (days) {
        
#         case_when(
#             is.na(days) ~ FALSE,
#             TRUE ~ TRUE
#         )
        
#     }
    
#     list(
#         convertInquiryPurposeType = convertInquiryPurposeType,
#         convertWorstPaymentRatingCCR = convertWorstPaymentRatingCCR,
#         convertDaysChargedOff = convertDaysChargedOff,
#         convertDaysPaidOff = convertDaysPaidOff,
#         convertDaysOntimePayment = convertDaysOntimePayment,
#         convertDaysAnyPayment = convertDaysAnyPayment,
#         convertDaysLoanOpened = convertDaysLoanOpened
#     )
        
# }

}

In [None]:
# ## `delimited` doesn't give what is expected - I think the sort order of the columns passed via .l in pmap and of the names assigned in .f is out of sync
# getWaterfall = function () {
    
#     querySnowflake(
#     "
#     select
#         *
#     from
#         periscope_de.periscope_views.bizops_application_waterfall
#     limit 10
#     "
#     )
    
# }
# attachDelimitedWaterfall = function (waterfall) {

#     waterfall %>% 

#         ##  I need to make the column names easily referenced as variable, i.e., syntax that wouldn't require ``  ##


#         ##  1. I remove spaces from colnames.
#         rename_all(
#             .funs = ~ .x %>% str_to_lower() %>% str_remove_all('\\s')
#         ) %>% 
#         ##  2. I consolidate duplicate columns
#         mutate(
#             `05-nodirectdeposit` = max(`05-nodirectdeposit`, `21-nodirectdeposit`),
#             `06-bankduration` = max(`06-bankduration`, `15-bankduration`)
#         ) %>% 
#         select(
#             -`21-nodirectdeposit`,
#             -`15-bankduration`
#         ) %>% 
#         ##  3. I convert character to logical
#         mutate_at(
#             .vars = vars(matches('^\\d{2}')),
#             .funs = as.logical
#         ) %>% 
#         ##  4. I remove numbers and hyphens
#         rename_all(
#             .funs = ~ .x %>% 
#                 str_replace_all(
#                     pattern = '\\d{2}-',
#                     replacement = ''
#                 ) %>%
#                 str_remove_all('-')
#         ) %>% 


#         ##  I need to make a new column that lists the UNCLEANED waterfall name when value is true  ##
#         ##  Approach:
#         ##  1. Map over each of the waterfall variables.
#         ##  2. For each row, determine which variables are true.
#         ##    - Transform variables into long-wise list.
#         ##    - Name each list element (which is the T/F value) with the original column name.
#         ##  3. Create the delimited list. Take the original column name when the value is TRUE.

#         mutate(

#             ##  I need to make a new column that lists the UNCLEANED waterfall name when value is true  ##
#             ##  This part -- https://community.rstudio.com/t/how-to-select-columns-when-using-pmap-inside-of-mutate-for-rowwise-operation/23086/2
#             ##  Needs to reflect the cleaning above before the mutate.
#             delimited = pmap(

#                 .l = list(
#                     ##  parse_exprs creates a list of variables based on the string vector returned
#                     ##  !!! converts EACH element in the list to an argument passed into function (which is list())
#                     !!!rlang::parse_exprs(
#                         waterfall %>% 
#                             rename_all(
#                                 .funs = ~ .x %>% str_to_lower() %>% str_remove_all('\\s')
#                             ) %>% 
#                             mutate(
#                                 `05-nodirectdeposit` = max(`05-nodirectdeposit`, `21-nodirectdeposit`),
#                                 `06-bankduration` = max(`06-bankduration`, `15-bankduration`)
#                             ) %>% 
#                             select(
#                                 -`21-nodirectdeposit`,
#                                 -`15-bankduration`
#                             ) %>% 
#                             select_at(
#                                 .vars = vars(matches('^\\d{2}'))
#                             ) %>% 
#                             rename_all(
#                                 .funs = ~ .x %>% 
#                                     str_replace_all(
#                                         pattern = '\\d{2}-',
#                                         replacement = ''
#                                     ) %>%
#                                     str_remove_all('-')
#                             ) %>% 
#                             colnames() %>% 
#                             sort()
#                     )
#                 ), 

#                 .f = function (...) {


#                     ##  All the variables passed into above pmap is now a list
#                     vars = list(...)


#                     ##  Naming each list element (which is the T/F value) provides a way to link the T/F value to the column name.
#                     names(vars) = waterfall %>% 
#                         rename_all(
#                             .funs = ~ .x %>% str_to_lower
#                         ) %>% 
#                         select_at(
#                             .vars = vars(matches('^\\d{2}'))
#                         ) %>% 
#                         select(
#                             -`21-no direct deposit`,
#                             -`15-bank duration`
#                         ) %>% 
#                         colnames() %>% 
#                         sort()
                    

#                     ##  More easily converted into a data frame
#                     data.frame(
#                         key = vars %>% names(),
#                         value = vars %>% unlist(),
#                         stringsAsFactors = FALSE
#                     ) %>% 
#                     mutate(
#                         to.delimit = case_when(
#                             value ~ key,
#                             TRUE ~ ''
#                         )
#                     ) %>% 
#                     ##  Delimit cases that are TRUE
#                     .$to.delimit %>% 
#                     paste(
#                         collapse = ' '
#                     ) %>% 
#                     str_replace_all(
#                         pattern = '(?<=.)\\s',
#                         replacement = ', '
#                     ) %>% 
#                     str_trim(
#                         side = 'both'
#                     )

#                 }
#             )
#         ) %>% 
#         filter(
#             loanid == 'APP-0003866618'
#         ) %>%
#         select(
#             loanid,
#             reasongrp,
#             clarityearlyscreen,
#             delimited
#         )
    
# }
# # waterfall = getWaterfall()
# # waterfall %>% attachDelimitedWaterfall()

In [None]:
# getLoanPerformance = function () {
    
#     queryReporting(
#     "
#     select
#         applicationid as application
#         , truefpd
#     from
#         tableau_reporting.tbl_pd_rate_loan_level
#     where
#         appldate >= '2019-10-01'::date
#     "
#     )
    
# }