# Import Packages

In [None]:
# library(devtools)
# install_github(
#     'jasonchang2018/opploansanalytics',
#     auth_token = Sys.getenv('GITHUB_PAT_OPPLOANSANALYTICS')
# )

library(opploansanalytics)
load.packages()

library(mlr)
library(pdp)
library(vip)
library(reshape2)

# Prepare Data

In [None]:
#### Clarity Field Analysis ####
clarityAnalysis = function () {

# ####  Clarity Report Type Validation  ####
# test %>%
#     filter(
#         ! report_received %>% str_detect('(?:FWB)?Leads01.*')
#     ) %>% 
#     transmute(
#         lead_id,
#         report_received,
#         report_requested,
#         lead_time,
#         report_time,
#         lead.date = lead_time %>% as.Date(),
#         report.date = report_time %>% as.Date(),
#         diff = lead.date - report.date
#     ) %>% 
#     arrange(
#         diff
#     )
# #     ) %T>%
# #     write.csv("..\\docs\\received-not-leads01.csv")

# test %>% group_by(report_received) %>% summarize(n = n()) %>% ungroup() %>% arrange(desc(n))

#### Get Clarity Data

####  Existing Fields  ####
# Pull one lead's Clarity report, flatten it into key/value pairs, and return
# the keys of the fields kept as candidates.
#
# Args:
#   lead.id: lead_id (UUID string) of the lead whose report is flattened.
#            Defaults to the lead id that was previously hard-coded here.
#
# Returns:
#   Character vector of flattened keys: the whitelisted inquiry fields, then
#   the retained clear_credit_risk fields, then the retained
#   clear_recent_history fields.
#
# Side effects:
#   Publishes all.fields, inquiry.fields, ccr.fields and crh.fields with `<<-`,
#   exactly as before, so code that reads those objects keeps working.
getClarityFields = function (lead.id = '99f418da-8b66-471e-9586-f4112718ed21') {

    # The id is spliced into the SQL text below, so accept nothing but a single
    # UUID-shaped string.
    stopifnot(
        is.character(lead.id),
        length(lead.id) == 1,
        grepl('^[0-9a-fA-F]{8}(-[0-9a-fA-F]{4}){3}-[0-9a-fA-F]{12}$', lead.id)
    )

    leads = queryReporting(
        paste0(
    "
    select
        *
    from
        lde4.leads
    where
        --lead_time >= now()::date - '5 days'::interval
        lead_id = '", lead.id, "'
    limit 100
    "
        )
    ) %>%
        select(
            lead_id,
            clarity_report,
            accepted
        )

    reports = leads %>%
        filter(
            !is.na(clarity_report),
            lead_id == lead.id
        )

    if (nrow(reports) == 0) {
        stop('No Clarity report found for lead_id ', lead.id, call. = FALSE)
    }

    # Keep this as one pipe chain: as.data.frame() names its single column "."
    # only because the piped value is passed as `.`, and rename() relies on it.
    fields = reports %>%
        .$clarity_report %>%
        fromJSON() %>%
        .$xml_response %>%
        unlist() %>%
        as.data.frame(
            stringsAsFactors = FALSE
        ) %>%
        rownames_to_column(
            var = 'key'
        ) %>%
        rename(
            value = "."
        )

    # Only fields that actually carry a value are candidates.
    populated = fields %>%
        filter(
            value != '',
            !is.na(value)
        )

    # Inquiry section: explicit whitelist.
    inquiry.keep = paste0(
        'inquiry.',
        c(
            'ofac_match',
            'ofac_score',
            'social_security_valid',
            'social_security_deceased',
            'ssn_distinct_first_last_name_count',
            'paycheck_direct_deposit',
            'bank_routing_valid',
            'inquiry_purpose_type'
        )
    )
    inquiry = populated %>%
        filter(
            key %in% inquiry.keep
        )

    # Clear Credit Risk section: drop any key matching one of these patterns.
    # (Commented-out alternates in an earlier revision tagged
    # inquiry_purpose_type, inquiry_tradeline_type and
    # tradelines.{account_opened, highest_credit, amount_past_due,
    # current_balance} with "#keep"; they are still excluded here.)
    ccr.drop = c(
        'inquiries\\.member_id',
        'inquiry_received_at',
        'inquiry_purpose_type',
        'inquiry_tradeline_type',
        'tradelines\\..*',
        'stabilities\\..*',
        'experian_attribute\\..*',
        'description',
        'full_name',
        'code',
        'date',
        'first',
        '48'
    )
    ccr = populated %>%
        filter(
            str_detect(key, '^clear_credit_risk\\..*'),
            !str_detect(key, paste(ccr.drop, collapse = '|'))
        )

    # Clear Recent History section: same approach; '\\d+' drops every key that
    # contains a digit.
    crh.drop = c(
        'tradeline_stabilities',
        'date',
        'name',
        '\\d+'
    )
    crh = populated %>%
        filter(
            str_detect(key, '^clear_recent_history\\..*'),
            !str_detect(key, paste(crh.drop, collapse = '|'))
        )

    # Publish the intermediate tables outside the function (pre-existing
    # behaviour).
    all.fields <<- fields
    inquiry.fields <<- inquiry
    ccr.fields <<- ccr
    crh.fields <<- crh

    c(inquiry$key, ccr$key, crh$key)

}

####  Pull Test Clarity Report  ####
# Pull up to 1000 accepted 'LenderEdge 4' leads received on or after 2020-03-09,
# together with the Clarity report fields examined in this analysis. Every
# report field is extracted with `->>`, so it arrives as text.
#
# worst_payment_rating and the days_since_last_loan_{paid_off, ontime_payment,
# payment, opened} fields are selected because the type-conversion step reads
# ccr_worst_payment_rating and the matching ccr_days_since_last_* columns and
# then drops them via processed.features; without them that step fails on a
# missing column.
# NOTE(review): the JSON key names of the four added days_since_last_* fields
# are assumed from the `ccr_<key>` aliasing used by the other columns - confirm
# against a real report. `->>` yields NULL for an absent key, so a wrong key
# shows up as an all-NA column rather than a query error.
test = queryReporting(
"
select

    --  Identifiers --
    lde.lead_id
    , lde.leadofferid
    , lde.passthru_lead_offer_id
    , lde.lead_time at time zone 'America/Chicago' as lead_time
    , lde.partnerid

    --  Credit  --
    , case when lde.clarity_report notnull then TRUE else FALSE end as has_clarity
    , lde.clarity_report -> 'xml_response' -> 'inquiry' ->> 'inquiry_received_at' as report_time
    , lde.clarity_report -> 'xml_response' -> 'opploans' ->> 'received' as report_received
    , lde.clarity_report -> 'xml_response' -> 'opploans' ->> 'requested_file' as report_requested

    , lde.clarity_report -> 'xml_response' -> 'inquiry' ->> 'ofac_score' as ofac_score
    , lde.clarity_report -> 'xml_response' -> 'inquiry' ->> 'paycheck_direct_deposit' as paycheck_direct_deposit
    , lde.clarity_report -> 'xml_response' -> 'inquiry' ->> 'ssn_distinct_first_last_name_count' as ssn_distinct_first_last_name_count

    , lde.clarity_report -> 'xml_response' -> 'clear_credit_risk' ->> 'score' as ccr_score
    , lde.clarity_report -> 'xml_response' -> 'clear_credit_risk' ->> 'clarity_seen' as ccr_clarity_seen
    , lde.clarity_report -> 'xml_response' -> 'clear_credit_risk' ->> 'number_of_loans' as ccr_number_of_loans
    , lde.clarity_report -> 'xml_response' -> 'clear_credit_risk' ->> 'number_of_bank_accounts' as ccr_number_of_bank_accounts
    , lde.clarity_report -> 'xml_response' -> 'clear_credit_risk' ->> 'highest_number_of_days_past_due' as ccr_highest_number_of_days_past_due
    , lde.clarity_report -> 'xml_response' -> 'clear_credit_risk' ->> 'current_inquiry_cluster_position' as ccr_current_inquiry_cluster_position
    , lde.clarity_report -> 'xml_response' -> 'clear_credit_risk' ->> 'days_since_last_loan_charged_off' as ccr_days_since_last_loan_charged_off
    , lde.clarity_report -> 'xml_response' -> 'clear_credit_risk' ->> 'days_since_inquiry_previously_seen' as ccr_days_since_inquiry_previously_seen
    , lde.clarity_report -> 'xml_response' -> 'clear_credit_risk' ->> 'number_of_employers_last_six_months' as ccr_number_of_employers_last_six_months
    , lde.clarity_report -> 'xml_response' -> 'clear_credit_risk' ->> 'worst_payment_rating' as ccr_worst_payment_rating
    , lde.clarity_report -> 'xml_response' -> 'clear_credit_risk' ->> 'days_since_last_loan_paid_off' as ccr_days_since_last_loan_paid_off
    , lde.clarity_report -> 'xml_response' -> 'clear_credit_risk' ->> 'days_since_last_ontime_payment' as ccr_days_since_last_ontime_payment
    , lde.clarity_report -> 'xml_response' -> 'clear_credit_risk' ->> 'days_since_last_loan_payment' as ccr_days_since_last_loan_payment
    , lde.clarity_report -> 'xml_response' -> 'clear_credit_risk' ->> 'days_since_last_loan_opened' as ccr_days_since_last_loan_opened

    , lde.clarity_report -> 'xml_response' -> 'clear_recent_history' -> 'supplier_recent_history' -> 'summary_recent_history' ->> 'loans_in_collections' as srh_loans_in_collections
    , lde.clarity_report -> 'xml_response' -> 'clear_recent_history' -> 'supplier_recent_history' -> 'summary_recent_history' ->> 'spml_average_rollovers' as srh_spml_average_rollovers
    , lde.clarity_report -> 'xml_response' -> 'clear_recent_history' -> 'supplier_recent_history' -> 'summary_recent_history' ->> 'amount_loans_charged_off' as srh_amount_loans_charged_off
    , lde.clarity_report -> 'xml_response' -> 'clear_recent_history' -> 'supplier_recent_history' -> 'summary_recent_history' ->> 'online_loan_opened_in_the_last_year' as srh_online_loan_opened_in_the_last_year
    , lde.clarity_report -> 'xml_response' -> 'clear_recent_history' -> 'supplier_recent_history' -> 'summary_recent_history' ->> 'online_loan_inquiry_in_the_last_thirty_days' as srh_online_loan_inquiry_in_the_last_thirty_days

    , lde.clarity_report -> 'xml_response' -> 'clear_recent_history' -> 'inquiry_cluster_recent_history' -> 'total_inquiry_clusters_recent_history' ->> 'ninety_days_ago' as ticrh_ninety_days_ago
    , lde.clarity_report -> 'xml_response' -> 'clear_recent_history' -> 'inquiry_cluster_recent_history' -> 'total_inquiry_clusters_recent_history' ->> 'twentyfour_hours_ago' as ticrh_twentyfour_hours_ago


    from
        lde4.leads as lde
    inner join
        cloudlending.advertising_method as c_am
        on lde.partnerid = c_am.external_id
        and c_am.name = 'LenderEdge 4' 
    where
        lde.accepted = TRUE
        and lde.lead_time >= '2020-03-09'::date
    limit 1000
"
)

####  Identify Data Types  ####
# Columns of `test` that identify a lead or describe the report itself; they
# are dropped before any modelling step.
not.features = c(
    'lead_id',
    'leadofferid',
    'passthru_lead_offer_id',
    'lead_time',
    'partnerid',
    'has_clarity',
    'report_time',
    'report_received',
    'report_requested'
)

# Columns converted with as.logical().
# NOTE(review): 'ccr_hit' is not among the columns selected by the `test` query
# in this section.
boolean.features = c(
    'paycheck_direct_deposit',
    'ccr_hit',
    'ccr_clarity_seen',
    'srh_online_loan_opened_in_the_last_year',
    'srh_online_loan_inquiry_in_the_last_thirty_days'
)

# Everything else in `test` is treated as numeric, except the raw worst payment
# rating and the indicator columns derived from it.
numeric.features = colnames(test)[
    !colnames(test) %in% c(
        boolean.features,
        not.features,
        'ccr_worst_payment_rating_null',
        'ccr_worst_payment_rating_plus',
        'ccr_worst_payment_rating_zero',
        'ccr_worst_payment_rating_hash',
        'ccr_worst_payment_rating_else',
        'ccr_worst_payment_rating'
    )
]

# Raw columns that are replaced by derived indicator columns and then removed.
processed.features = c(
    'ccr_worst_payment_rating',
    'ccr_days_since_last_loan_charged_off',
    'ccr_days_since_last_loan_paid_off',
    'ccr_days_since_last_ontime_payment',
    'ccr_days_since_last_loan_payment',
    'ccr_days_since_last_loan_opened'
)

# Columns whose missing values are filled with the column median.
impute.median = c(
    'ccr_days_since_previous_bank_account_previously_seen',
    'ccr_days_since_reported_income_previously_seen',
    'ccr_days_since_inquiry_previously_seen',
    'ccr_highest_number_of_days_past_due',
    'paycheck_direct_deposit'
)

# Columns whose missing values are filled with the column mean.
# ('ccr_number_of_loans_paid_off' used to be listed twice; once is enough.)
impute.mean = c(
    'ccr_number_of_loans',
    'ccr_number_of_bank_accounts',
    'ccr_number_of_loans_paid_off',
    'ccr_number_of_loans_past_due',
    'ccr_current_inquiry_cluster_position',
    'ccr_number_of_loans_current_and_open',
    'ccr_number_of_employers_last_six_months',
    'ccr_score'
)

# Numeric features excluded from type conversion and imputation. The trailing
# comment on each line presumably names the retained feature it is collinear
# with.
correlated.features.numeric = c(
    'icrh_ten_minutes_ago',                                    #ccr_current_inquiry_cluster_position
    'icrh_twenty_minutes_ago',                                 #ccr_current_inquiry_cluster_position
    'icrh_thirty_minutes_ago',                                 #ccr_current_inquiry_cluster_position
    'icrh_one_hour_ago',                                       #ccr_current_inquiry_cluster_position
    'icrh_twentyfour_hours_ago',                               #ccr_current_inquiry_cluster_position
    'icrh_seven_days_ago',                                     #ccr_current_inquiry_cluster_position
    'icrh_thirty_days_ago',                                    #ccr_current_inquiry_cluster_position
    'icrh_ninety_days_ago',                                    #ccr_current_inquiry_cluster_position
    'icrh_recent_history_current_inquiry_cluster_position',    #ccr_current_inquiry_cluster_position

    'ticrh_seven_days_ago',                                    #ticrh_twentyfour_hours_ago
    'ticrh_thirty_days_ago',                                   #ticrh_twentyfour_hours_ago

    'ccr_number_of_loans_paid_off',                            #ccr_number_of_loans
    'ccr_number_of_loans_past_due',                            #ccr_number_of_loans
    'ccr_number_of_loans_current_and_open',                    #ccr_number_of_loans
    'ccr_days_since_reported_income_previously_seen',          #ccr_days_since_inquiry_previously_seen
    'ccr_days_since_previous_bank_account_previously_seen',    #ccr_days_since_inquiry_previously_seen

    'srh_amount_loans_in_collections',                         #srh_loans_in_collections
    'srh_days_with_open_loans_in_the_last_ninety_days',        #srh_loans_in_collections
    'srh_days_with_open_loans_in_the_last_year'                #srh_loans_in_collections
)

# Logical counterpart of the list above, annotated the same way.
correlated.features.logical = c(
    'ccr_hit',                                                 #ccr_clarity_seen
    'ccr_worst_payment_rating_plus',                           #ccr_has_previous_loan_charged_off
    'ccr_worst_payment_rating_null',                           #ccr_has_previous_loan_opened
    'ccr_has_previous_loan_payment',                           #ccr_has_previous_loan_opened
    'ccr_has_previous_ontime_payment',                         #ccr_has_previous_loan_opened
    'ccr_has_previous_loan_paid_off',                          #ccr_has_previous_loan_opened
    'ccr_has_previous_loan_charged_off'                        #ccr_has_previous_loan_opened
)

#### Convert Data Types

####  Convert Data Types  ####
# Build the typed analysis frame from the raw `test` pull:
#   1. drop the identifier / report-metadata columns,
#   2. convert boolean and numeric features from text,
#   3. expand the worst payment rating into indicator columns,
#   4. turn each "days since last ..." column into a has-previous flag,
#   5. drop the raw columns those indicators were derived from.

# Resolve the mapping helpers once rather than once per derived column.
clarity.mapping = getClarityMapping()

test.clean = test %>%
    select(
        -not.features
    ) %>%
    mutate_at(
        # Boolean features are screened against the *logical* collinearity
        # list. Screening them against the numeric list removed nothing, so
        # 'ccr_hit' was still requested here.
        .vars = setdiff(boolean.features, correlated.features.logical),
        .funs = as.logical
    ) %>%
    mutate_at(
        .vars = setdiff(numeric.features, correlated.features.numeric),
        .funs = as.numeric
    ) %>%
    mutate(
        # `%in%` is FALSE for NA, so a missing rating is flagged by _null only.
        ccr_worst_payment_rating_null = is.na(ccr_worst_payment_rating),
        ccr_worst_payment_rating_plus = ccr_worst_payment_rating %in% '+',
        ccr_worst_payment_rating_zero = ccr_worst_payment_rating %in% '0',
        ccr_worst_payment_rating_hash = ccr_worst_payment_rating %in% '#',
        ccr_worst_payment_rating_else = !(
            ccr_worst_payment_rating_plus |
            ccr_worst_payment_rating_zero |
            ccr_worst_payment_rating_hash |
            ccr_worst_payment_rating_null
        ),

        ccr_has_previous_loan_charged_off =
            clarity.mapping$convertDaysChargedOff(ccr_days_since_last_loan_charged_off),
        ccr_has_previous_loan_paid_off =
            clarity.mapping$convertDaysPaidOff(ccr_days_since_last_loan_paid_off),
        ccr_has_previous_ontime_payment =
            clarity.mapping$convertDaysOntimePayment(ccr_days_since_last_ontime_payment),
        ccr_has_previous_loan_payment =
            clarity.mapping$convertDaysAnyPayment(ccr_days_since_last_loan_payment),
        ccr_has_previous_loan_opened =
            clarity.mapping$convertDaysLoanOpened(ccr_days_since_last_loan_opened)
    ) %>%
    select(
        -processed.features
    )
# test.clean %>% str()

#### Impute

####  Examine Values in Field  ####
# Inspect one field at a time: its median and mean in the cleaned data, and a
# bar chart of how often each raw value occurs in `test`. Point `field` at
# another column to inspect it instead.
field = quo(paycheck_direct_deposit)
field.name = quo_name(field)

median(test.clean[[field.name]], na.rm = TRUE)
mean(test.clean[[field.name]], na.rm = TRUE)

# One row per distinct raw value, ordered by that value.
field.counts = test %>%
    group_by(var = !!field) %>%
    summarize(n = n()) %>%
    ungroup() %>%
    arrange(var)

ggplot(field.counts, aes(x = var, y = n)) +
    geom_bar(stat = 'identity')

####  Impute and/or Remove Missing Values  ####
# Fill missing values: column median for the impute.median features and column
# mean for the impute.mean features, skipping any feature listed in
# correlated.features.numeric.
test.impute.value = test.clean %>%
    mutate_at(
        .vars = setdiff(impute.median, correlated.features.numeric),
        .funs = ~ replace_na(.x, replace = median(.x, na.rm = TRUE))
    ) %>%
    mutate_at(
        .vars = setdiff(impute.mean, correlated.features.numeric),
        .funs = ~ replace_na(.x, replace = mean(.x, na.rm = TRUE))
    ) %>%
    mutate(
        # The median of a logical vector can come back numeric, so restore the
        # column type after imputation.
        paycheck_direct_deposit = as.logical(paycheck_direct_deposit)
    )

# Keep only rows with no missing value left in any column. complete.cases()
# tests the columns directly, so the data frame is not coerced to a matrix and
# scanned row by row.
test.impute = test.impute.value %>%
    filter(
        complete.cases(test.impute.value)
    )

#### Numeric Collinearity

####  Calculate Correlation Matrix (Numeric)  ####
# Pairwise correlation of the numeric features that survive processing. The
# upper triangle is blanked so that the melted long table keeps only the
# diagonal and the lower triangle.
test.numeric.cor = cor(
    select(
        test.impute,
        numeric.features[!numeric.features %in% processed.features]
    )
)

test.numeric.cor[upper.tri(test.numeric.cor)] = NA
test.numeric.cor.upper = melt(test.numeric.cor, na.rm = TRUE)

# ####  Sum Missing (NA) Values for Numeric  ####
# apply(
# #     X = test.clean %>%
#     X = test.impute %>%
#         select(
#             numeric.features[ which(!numeric.features %in% c(processed.features, correlated.features.numeric)) ]
#         ),
#     FUN = function (x) { is.na(x) %>% sum() },
#     MARGIN = 2
# ) %>% 
# as.data.frame() %>% select(n = '.') %>% rownames_to_column('field') %>% arrange(desc(n))

# ####  Find / Remove Collinear Features (Numeric)  ####
# correlated.features.numeric = c(
#     'icrh_ten_minutes_ago',                                    #ccr_current_inquiry_cluster_position
#     'icrh_twenty_minutes_ago',                                 #ccr_current_inquiry_cluster_position
#     'icrh_thirty_minutes_ago',                                 #ccr_current_inquiry_cluster_position
#     'icrh_one_hour_ago',                                       #ccr_current_inquiry_cluster_position
#     'icrh_twentyfour_hours_ago',                               #ccr_current_inquiry_cluster_position
#     'icrh_seven_days_ago',                                     #ccr_current_inquiry_cluster_position
#     'icrh_thirty_days_ago',                                    #ccr_current_inquiry_cluster_position
#     'icrh_ninety_days_ago',                                    #ccr_current_inquiry_cluster_position
#     'icrh_recent_history_current_inquiry_cluster_position',    #ccr_current_inquiry_cluster_position
    
#     'ticrh_seven_days_ago',                                    #ticrh_twentyfour_hours_ago
#     'ticrh_thirty_days_ago',                                   #ticrh_twentyfour_hours_ago
    
#     'ccr_number_of_loans_paid_off',                            #ccr_number_of_loans
#     'ccr_number_of_loans_past_due',                            #ccr_number_of_loans
#     'ccr_number_of_loans_current_and_open',                    #ccr_number_of_loans,
#     'ccr_days_since_reported_income_previously_seen',          #ccr_days_since_inquiry_previously_seen
#     'ccr_days_since_previous_bank_account_previously_seen',    #ccr_days_since_inquiry_previously_seen
    
#     'srh_amount_loans_in_collections',                         #srh_loans_in_collections
#     'srh_days_with_open_loans_in_the_last_ninety_days',        #srh_loans_in_collections
#     'srh_days_with_open_loans_in_the_last_year'                #srh_loans_in_collections
# )

# test.numeric.cor.upper.removed = test.numeric.cor.upper %>% 
#     filter(
#         Var1 != Var2
#     ) %>% 
#     arrange(
#         desc(value)
#     ) %>% 
#     filter(
#         ! Var1 %in% correlated.features.numeric &
#         ! Var2 %in% correlated.features.numeric
#     ) %>% 
# #     group_by(
# #         Var1
# #     ) %>% 
# #     summarize(
# #         n = n(),
# #         total.cor = sum(value^2)
# #     ) %>% 
# #     ungroup() %>% 
#     arrange(
# #         total.cor %>% desc
#         value %>% desc
#     )

# test.numeric.cor.upper.removed %T>%
#     head() %>% 
#     ggplot(
#         mapping = aes(
#             x = Var1,
#             y = Var2,
#             fill = value
#         )
#     ) +
#     geom_tile(
#         color = 'white'
#     ) +
#     scale_fill_gradient2(
#         low = "blue",
#         high = "red",
#         mid = "white", 
#         midpoint = 0,
#         limit = c(-1,1),
#         space = "Lab", 
#         name="Pearson\nCorrelation"
#     ) +
#     theme_minimal() +
#     theme(
#         axis.text.x = element_text(
#             angle = -45,
#             hjust = 0
#         )
#     )

#### Boolean Collinearity

####  Calculate Correlation Matrix (Boolean)  ####
# Same correlation table for the remaining columns, i.e. everything in
# test.impute that is not one of the retained numeric features.
test.logical.cor = cor(
    select(
        test.impute,
        -c(numeric.features[!numeric.features %in% processed.features])
    )
)

test.logical.cor[upper.tri(test.logical.cor)] = NA
test.logical.cor.upper = melt(test.logical.cor, na.rm = TRUE)

# ####  Find / Remove Collinear Features (Logical)  ####
# correlated.features.logical = c(
#     'ccr_hit',                               # ccr_clarity_seen
#     'ccr_worst_payment_rating_plus',         # ccr_has_previous_loan_charged_off
#     'ccr_worst_payment_rating_null',         # ccr_has_previous_loan_opened  
#     'ccr_has_previous_loan_payment',         # ccr_has_previous_loan_opened  
#     'ccr_has_previous_ontime_payment',       # ccr_has_previous_loan_opened  
#     'ccr_has_previous_loan_paid_off',        # ccr_has_previous_loan_opened   
#     'ccr_has_previous_loan_charged_off'      # ccr_has_previous_loan_opened  
# )

# test.logical.cor.upper.removed = test.logical.cor.upper %>% 
#     filter(
#         Var1 != Var2
#     ) %>% 
#     arrange(
#         desc(value)
# #         value
#     ) %>% 
#     filter(
#         ! Var1 %in% correlated.features.logical &
#         ! Var2 %in% correlated.features.logical
# #     ) %>% 
# #     group_by(
# #         Var1
# #     ) %>% 
# #     summarize(
# #         n = n(),
# #         total.cor = sum(value^2)
# #     ) %>% 
# #     ungroup() %>% 
# #     arrange(
# #         total.cor %>% desc
#     )

# test.logical.cor.upper.removed

# test.logical.cor.upper.removed %>%
#     ggplot(
#         mapping = aes(
#             x = Var1,
#             y = Var2,
#             fill = value
#         )
#     ) +
#     geom_tile(
#         color = 'white'
#     ) +
#     scale_fill_gradient2(
#         low = "blue",
#         high = "red",
#         mid = "white", 
#         midpoint = 0,
#         limit = c(-1,1),
#         space = "Lab", 
#         name="Pearson\nCorrelation"
#     ) +
#     theme_minimal() +
#     theme(
#         axis.text.x = element_text(
#             angle = -45,
#             hjust = 0
#         )
#     )

# getClarityMapping = function () {
    
#     convertInquiryPurposeType = function (purpose.code) {
        
#         case_when(
#             purpose.code == 'AR' ~ 'New Credit',
#             purpose.code == 'AS' ~ 'New Credit Soft',
#             purpose.code == 'RA' ~ 'Account Review Soft',
#             purpose.code == 'RP' ~ 'Consumer Inquiry Soft',
#             purpose.code == 'CL' ~ 'Collection Inquiry',
#             purpose.code == 'PC' ~ 'Pre-check Soft',
#             purpose.code == 'MS' ~ 'Credit Monitor Soft',
#             purpose.code == 'CC' ~ 'Check Cash',
#             purpose.code == 'CS' ~ 'Collection Soft',
#             purpose.code == 'PS' ~ 'Pre-screen Soft',
#             purpose.code == 'IV' ~ 'Item Verification',
#             purpose.code == 'IS' ~ 'Item Verification Soft',
#             purpose.code == 'EH' ~ 'Employment',
#             purpose.code == 'ES' ~ 'Employment Soft',
#             purpose.code == 'LH' ~ 'Lease',
#             purpose.code == 'LS' ~ 'Lease Soft',
#             purpose.code == 'WS' ~ 'Written Authorization Soft',
#             purpose.code == 'WH' ~ 'Written Authorization - Hard',
#             purpose.code == 'PR' ~ 'Portfolio Review',
#             purpose.code == 'PA' ~ 'Portfolio Acquisition',
#             purpose.code == 'SP' ~ 'Subpoena',
#             TRUE ~ 'Other'
#         )
#     }
#     convertWorstPaymentRatingCCR = function (rating) {
        
#         case_when(
#             is.na(rating) ~ 0,
#             rating == '+' ~ 1,
#             rating == '0' ~ 2,
#             rating == '#' ~ 3,
#             rating == '@' ~ 4,
#             rating == 'X' ~ 5,
#             rating == '4' ~ 6,
#             rating == 'V' ~ 7,
#             rating == 'W' ~ 8,
#             rating == '1' ~ 9,
#             rating == '5' ~ 10,
#             rating == 'B' ~ 11,
#             rating == 'L' ~ 12,
#             rating == '7' ~ 13,
#             rating == '8' ~ 14,
#             rating == 'C' ~ 15,
#             rating == 'D' ~ 16,
#             rating == 'E' ~ 17,
#             rating == 'H' ~ 18,
#             rating == 'U' ~ 19,
#             rating == 'Y' ~ 20,
#             rating == 'Z' ~ 21,
#             TRUE ~ 22
#         )
#     }
#     convertDaysChargedOff = function (days) {
        
#         case_when(
#             is.na(days) ~ FALSE,
#             TRUE ~ TRUE
#         )
        
#     }
#     convertDaysPaidOff = function (days) {
        
#         case_when(
#             is.na(days) ~ FALSE,
#             TRUE ~ TRUE
#         )
        
#     }
#     convertDaysOntimePayment = function (days) {
        
#         case_when(
#             is.na(days) ~ FALSE,
#             TRUE ~ TRUE
#         )
        
#     }
#     convertDaysAnyPayment = function (days) {
        
#         case_when(
#             is.na(days) ~ FALSE,
#             TRUE ~ TRUE
#         )
        
#     }
#     convertDaysLoanOpened = function (days) {
        
#         case_when(
#             is.na(days) ~ FALSE,
#             TRUE ~ TRUE
#         )
        
#     }
    
#     list(
#         convertInquiryPurposeType = convertInquiryPurposeType,
#         convertWorstPaymentRatingCCR = convertWorstPaymentRatingCCR,
#         convertDaysChargedOff = convertDaysChargedOff,
#         convertDaysPaidOff = convertDaysPaidOff,
#         convertDaysOntimePayment = convertDaysOntimePayment,
#         convertDaysAnyPayment = convertDaysAnyPayment,
#         convertDaysLoanOpened = convertDaysLoanOpened
#     )
        
# }

}

In [None]:
# Turn an advertising-method name into a short lower-case token: lower-case
# it, drop a trailing " 4" (whitespace followed by the digit 4 at the very
# end), then remove every remaining space.
#
# Args:
#   admethod: character vector of advertising-method names.
#
# Returns:
#   Character vector of the same length as `admethod`. Earlier revisions
#   returned only the first element's token and failed on zero-length input;
#   single-name calls give the same result as before.
#
# Example: 'LenderEdge 4' -> 'lenderedge'
getShortenedAdmethodName = function (admethod) {

    lowered = tolower(admethod)

    # Only a " 4" at the very end is a version suffix; "Route 44" keeps both
    # digits.
    without.version = sub('[[:space:]]4$', '', lowered)

    gsub(' ', '', without.version, fixed = TRUE)

}

In [None]:
getLeadsPerformance = function (admethod, timestart='2019-11-01', timeend='2020-02-01', limit=NA, write=FALSE) {
    
    getLeadsDF = function (admethod, timestart, timeend, limit = NA) {
    
        queryReporting(
            paste0(
    "
    select

        --  Identifiers --
        lde.lead_id
        , lde.leadofferid
        , lde.lead_time at time zone 'America/Chicago' as lead_time
        , extract(day from lde.lead_time at time zone 'America/Chicago') as lead_day
        , lde.partnerid
        , lde.email

        --  Outcome  --
        , lde.accepted
        , lde.reason
        , lde.code

        --  Bank  --
        , lde.bankname
        , lde.abaroutingnumber
        , lde.accountnumber

        --  Income  --
        , lde.grossmonthlyincome
        , lde.incometype
        , lde.payrollfrequency
        , lde.payrolltype
        , lde.lastpayrolldate

        --  Identity  --
        , lde.dateofbirth
        , floor((lde.lead_time::date - lde.dateofbirth::date)::numeric/365) as age
        , lde.statecode

        --  Employment  --
        , lde.work_hiredate

        --  Offer  --
        , lde.offer_amount
        , lde.offer_interestrate
        , lde.offer_monthlypayment
        , (lde.raw_lead ->> 'requestedLoanAmount')::numeric as requestedLoanAmount
        , lde.raw_lead -> 'campaign_id' as campaign_id

        --  Credit  --
        , case when lde.clarity_report notnull then TRUE else FALSE end as has_clarity
        , lde.clarity_report -> 'xml_response' -> 'inquiry' ->> 'inquiry_received_at' as report_time
        , lde.clarity_report -> 'xml_response' -> 'opploans' ->> 'received' as report_received
        , lde.clarity_report -> 'xml_response' -> 'opploans' ->> 'requested_file' as report_requested

        , lde.clarity_report -> 'xml_response' -> 'inquiry' ->> 'ofac_score' as ofac_score
        , lde.clarity_report -> 'xml_response' -> 'inquiry' ->> 'paycheck_direct_deposit' as paycheck_direct_deposit
        , lde.clarity_report -> 'xml_response' -> 'inquiry' ->> 'ssn_distinct_first_last_name_count' as ssn_distinct_first_last_name_count

        , lde.clarity_report -> 'xml_response' -> 'clear_credit_risk' ->> 'score' as ccr_score
        , lde.clarity_report -> 'xml_response' -> 'clear_credit_risk' ->> 'number_of_loans' as ccr_number_of_loans
        , lde.clarity_report -> 'xml_response' -> 'clear_credit_risk' ->> 'number_of_bank_accounts' as ccr_number_of_bank_accounts
        , lde.clarity_report -> 'xml_response' -> 'clear_credit_risk' ->> 'highest_number_of_days_past_due' as ccr_highest_number_of_days_past_due
        , lde.clarity_report -> 'xml_response' -> 'clear_credit_risk' ->> 'current_inquiry_cluster_position' as ccr_current_inquiry_cluster_position
        , lde.clarity_report -> 'xml_response' -> 'clear_credit_risk' ->> 'days_since_last_loan_charged_off' as ccr_days_since_last_loan_charged_off
        , lde.clarity_report -> 'xml_response' -> 'clear_credit_risk' ->> 'days_since_inquiry_previously_seen' as ccr_days_since_inquiry_previously_seen
        , lde.clarity_report -> 'xml_response' -> 'clear_credit_risk' ->> 'number_of_employers_last_six_months' as ccr_number_of_employers_last_six_months
        , lde.clarity_report -> 'xml_response' -> 'clear_credit_risk' ->> 'worst_payment_rating' as ccr_worst_payment_rating

        , lde.clarity_report -> 'xml_response' -> 'clear_recent_history' -> 'supplier_recent_history' -> 'summary_recent_history' ->> 'loans_in_collections' as srh_loans_in_collections
        , lde.clarity_report -> 'xml_response' -> 'clear_recent_history' -> 'supplier_recent_history' -> 'summary_recent_history' ->> 'spml_average_rollovers' as srh_spml_average_rollovers
        , lde.clarity_report -> 'xml_response' -> 'clear_recent_history' -> 'supplier_recent_history' -> 'summary_recent_history' ->> 'amount_loans_charged_off' as srh_amount_loans_charged_off
        , lde.clarity_report -> 'xml_response' -> 'clear_recent_history' -> 'supplier_recent_history' -> 'summary_recent_history' ->> 'online_loan_opened_in_the_last_year' as srh_online_loan_opened_in_the_last_year
        , lde.clarity_report -> 'xml_response' -> 'clear_recent_history' -> 'supplier_recent_history' -> 'summary_recent_history' ->> 'online_loan_inquiry_in_the_last_thirty_days' as srh_online_loan_inquiry_in_the_last_thirty_days

        , lde.clarity_report -> 'xml_response' -> 'clear_recent_history' -> 'inquiry_cluster_recent_history' -> 'total_inquiry_clusters_recent_history' ->> 'ninety_days_ago' as ticrh_ninety_days_ago
        , lde.clarity_report -> 'xml_response' -> 'clear_recent_history' -> 'inquiry_cluster_recent_history' -> 'total_inquiry_clusters_recent_history' ->> 'twentyfour_hours_ago' as ticrh_twentyfour_hours_ago

    from
        lde4.leads as lde
    inner join
        cloudlending.advertising_method as c_am
        on lde.partnerid = c_am.external_id
        and c_am.name = '", admethod, "' 
    where
        lde.accepted = TRUE
        and lde.lead_time >= '", timestart, "'::date
        and lde.lead_time < '", timeend, "'::date
    ", ifelse(is.na(limit), "", paste0("limit ", limit))
            )
        )
    }
    # Query the application funnel for one advertising method and date window.
    #
    # admethod:  advertising method name; matched against c_am.name.
    # timestart: inclusive lower bound on the application created date
    #            (compared after conversion to America/Chicago).
    # timeend:   exclusive upper bound on the same converted created date.
    #
    # Returns whatever queryReporting() returns for the query: the rows of the
    # `status` CTE (grouped by application id, lead id, created date, funded
    # amount and type_formula) carrying 0/1 stage flags derived from the status
    # history, plus a cs_decisioned flag from the left join.
    #
    # NOTE(review): the three arguments are pasted straight into the SQL text,
    # so they must come from trusted code, not from user input.
    # NOTE(review): in cs_decisioned_apps the unqualified `status` in the WHERE
    # clause shares its name with the `status` CTE joined there; presumably the
    # application's own status column is intended -- confirm.
    getFunnelDF = function (admethod, timestart, timeend) {
        # The whole query is a single string; only the admethod and the two
        # date bounds are spliced in by paste0().
        queryReporting(paste0(
    "
    with status as
    (
        select
            c_app.id as application
            , c_app.lde4_lead_id as lead_id
            , c_app.createddate at time zone 'America/Chicago' as appldate
            , c_app.funded_amount
            , c_app.type_formula

            , max(case  when new_value = 'NEW - ENTERED'
                        then 1 else 0
                        end) as newentered
            , max(case  when old_value = 'NEW - ENTERED'
                        and new_value = 'BUSINESS RULES PASSED'
                        then 1 else 0
                        end) as bizrulespassed

            , max(case  when new_value = 'BUREAU APPROVED'
                        then 1 else 0
                        end) as qualified

            , max(case  when new_value in ('BANK VERIFICATION COMPLETED', 'NEW - SCORECARD GENERATED')
                        then 1 else 0
                        end) as bankverified

            , max(case  when c_ash.old_value = 'NEW - PRICING GENERATED'
                        and c_ash.new_value in ('CONTRACT SIGNED', 'WAITING ON STIPULATIONS')
                        then 1 else 0
                        end) as passscorecardratecard

            , max(case  when c_ash.old_value in ('NEW - PRICING GENERATED', 'WAITING ON STIPULATIONS')
                        and c_ash.new_value = 'CONTRACT SIGNED'
                        then 1 else 0 end) as contractsigned

            , max(case  when c_ash.new_value = 'LOAN APPROVED'
                        then 1 else 0
                        end) as funded
        from
            cloudlending.applications as c_app
            inner join
                cloudlending.advertising_method as c_am
                on c_app.advertising_method = c_am.id
                and c_am.name = '", admethod, "'
            inner join
                cloudlending.application_status_history as c_ash
                on c_app.id = c_ash.application
        where
            c_app.createddate at time zone 'America/Chicago' >= '", timestart, "'
            and c_app.createddate at time zone 'America/Chicago' < '", timeend, "'
        group by
            1,2,3,4,5
    )
    , cs_decisioned_apps as
    (
        select
            c_app.id as application
        from
            cloudlending.applications as c_app
            inner join
                status
                on c_app.id = status.application
                and status.contractsigned = 1
        where
            denialreason not in ('Time In Pending', 'Withdraw')
            or (denialreason isnull and status = 'LOAN APPROVED')
    )
    select
        status.application
        , status.lead_id
        , status.appldate
        , status.type_formula
        , status.funded_amount
        , status.newentered
        , status.bizrulespassed
        , status.qualified
        , status.bankverified
        , status.passscorecardratecard
        , status.contractsigned
        , case when cs_decisioned_apps.application notnull then 1 else 0 end as cs_decisioned
        , status.funded
    from
      status
      left join
        cs_decisioned_apps
        on status.application = cs_decisioned_apps.application
    "
    ))
    }
    
    leads <<- getLeadsDF(
        admethod = admethod,
        timestart = timestart,
        timeend = timeend,
        limit = limit
    )
    
    funnel <<- getFunnelDF(
        admethod = admethod,
        timestart = timestart,
        timeend = timeend
    )
    
    loan.performance <<- queryReporting(
    "
    select
        applicationid as application
        , truefpd
    from
        tableau_reporting.tbl_pd_rate_loan_level
    where
        appldate >= '2019-10-01'::date
    "
    )
    
    
    
    df = leads %>%
        left_join(
            funnel,
            by = 'lead_id'
        ) %>% 
        left_join(
            loan.performance,
            by = 'application'
        ) %>% 
        mutate_at(
            .vars = colnames(funnel %>% select(-application, -lead_id, -appldate, -funded_amount, -type_formula)),
            .funs = function (x) { x %>% replace_na(replace = 0) %>% as.logical() }
        )
    
    
    
    if (write) {
        df %>%
            write.csv(
                paste0(
                    "..\\data\\df-",
                    admethod %>% getShortenedAdmethodName(),
                    "-",
                    timestart,
                    ".csv"
                )
            )
    }
    
    return(df)
    
}

In [None]:
# Coerce the raw lead columns to the types used by the later pipeline steps.
#
# raw.df: data frame holding every column named below (the names are handed
#         to mutate_at() directly).
# Returns raw.df with the boolean columns passed through as.logical() and,
# afterwards, the numeric columns passed through as.numeric().
convertDataTypes = function (raw.df) {

    boolean.cols <- c(
        'paycheck_direct_deposit',
        'srh_online_loan_opened_in_the_last_year',
        'srh_online_loan_inquiry_in_the_last_thirty_days'
    )

    # 'paycheck_direct_deposit' appears in both vectors, so it is converted to
    # logical first and then to numeric.
    numeric.cols <- c(
        'paycheck_direct_deposit',
        'ofac_score',
        'ssn_distinct_first_last_name_count',
        'ccr_score',
        'ccr_number_of_loans',
        'ccr_number_of_bank_accounts',
        'ccr_highest_number_of_days_past_due',
        'ccr_current_inquiry_cluster_position',
        'ccr_days_since_last_loan_charged_off',
        'ccr_days_since_inquiry_previously_seen',
        'ccr_number_of_employers_last_six_months',
        'srh_loans_in_collections',
        'srh_spml_average_rollovers',
        'srh_amount_loans_charged_off',
        'ticrh_ninety_days_ago',
        'ticrh_twentyfour_hours_ago'
    )

    with.booleans <- raw.df %>%
        mutate_at(.vars = boolean.cols, .funs = as.logical)

    with.booleans %>%
        mutate_at(.vars = numeric.cols, .funs = as.numeric)

}

In [None]:
# Derive model-ready columns from the type-converted lead data.
#
# converted.df: data frame with ccr_worst_payment_rating,
#               ccr_days_since_last_loan_charged_off, payrolltype and
#               campaign_id columns.
# Returns the data frame with five ccr_worst_payment_rating_* indicator
# columns, a ccr_has_previous_loan_charged_off flag, NA payrolltype replaced
# by 'Missing', campaign_id reduced to its quoted content, and the two source
# columns (rating, days since charge-off) dropped.
processColumns = function (converted.df) {

    # TRUE only where the rating is present and equal to `symbol`.
    ratingIs <- function (rating, symbol) {
        !is.na(rating) & rating == symbol
    }

    # Capture group of the quoted-text pattern for a single raw campaign value.
    extractQuoted <- function (raw.id) {
        matches <- str_match_all(raw.id, "\\\"(.*)\\\"")[[1]]
        matches[,2]
    }

    with.derived <- converted.df %>%
        mutate(
            ccr_worst_payment_rating_null = is.na(ccr_worst_payment_rating),
            ccr_worst_payment_rating_plus = ratingIs(ccr_worst_payment_rating, '+'),
            ccr_worst_payment_rating_zero = ratingIs(ccr_worst_payment_rating, '0'),
            ccr_worst_payment_rating_hash = ratingIs(ccr_worst_payment_rating, '#'),
            # "else" = a rating is present but is none of '+', '0', '#'.
            ccr_worst_payment_rating_else =
                !ccr_worst_payment_rating_plus &
                !ccr_worst_payment_rating_zero &
                !ccr_worst_payment_rating_hash &
                !ccr_worst_payment_rating_null,

            ccr_has_previous_loan_charged_off = !is.na(ccr_days_since_last_loan_charged_off),

            payrolltype = replace_na(payrolltype, 'Missing'),
            campaign_id = as.character(map(campaign_id, extractQuoted))
        )

    with.derived %>%
        select(
            -ccr_worst_payment_rating,
            -ccr_days_since_last_loan_charged_off
        )
}

In [None]:
# Replace missing values in selected columns with the sentinel 10000.
#
# processed.df: data frame containing every column named below.
# Returns processed.df with NA in those columns replaced by 10000; all other
# columns are left untouched.
imputeMissingValues = function (processed.df) {

    sentinel.cols <- c(
        'ccr_days_since_inquiry_previously_seen',
        'ccr_number_of_loans',
        'ccr_number_of_bank_accounts',
        'ccr_highest_number_of_days_past_due',
        'ccr_current_inquiry_cluster_position',
        'ccr_number_of_employers_last_six_months',
        'ccr_score',
        'paycheck_direct_deposit'
    )

    fillWithSentinel <- function (column) {
        replace_na(column, replace = 10000)
    }

    mutate_at(
        processed.df,
        .vars = sentinel.cols,
        .funs = fillWithSentinel
    )

}

In [None]:
# Placeholder for a standardization step: currently hands its input back
# unchanged.
standardizeValues = function (imputed.df) {
    imputed.df
}

In [None]:
# Drop identifier, bookkeeping and downstream-funnel columns, then turn every
# remaining character or logical column into a factor.
#
# standardized.df: data frame containing all of the columns dropped below.
# Returns the reduced data frame. `newentered` and `qualified` are
# deliberately NOT in the drop list (they were commented out of it).
keepModelFeatures = function (standardized.df) {

    features.only <- standardized.df %>%
        select(
            -c(
                lead_id,
                leadofferid,
                lead_time,
                partnerid,
                accepted,
                reason,
                code,
                campaign_id,
                bankname,
                abaroutingnumber,
                accountnumber,
                incometype,
                payrollfrequency,
                payrolltype,
                work_hiredate,
                lastpayrolldate,
                dateofbirth,
                report_time,
                has_clarity,
                report_requested,
                report_received,
                appldate,
                bizrulespassed,
                bankverified,
                passscorecardratecard,
                contractsigned,
                cs_decisioned,
                offer_interestrate,
                offer_monthlypayment,
                offer_amount,
                application
            )
        )

    features.only %>%
        mutate_if(
            .predicate = ~ is.character(.x) | is.logical(.x),
            .funs = as.factor
        )

}

In [None]:
# Keep only the rows that have no missing value in any column.
#
# feature.df: data frame to filter.
# Returns feature.df restricted to its complete rows.
#
# complete.cases() computes the per-row "no NA anywhere" mask directly; the
# previous row-wise apply() coerced the whole data frame to a matrix and
# called an R function once per row to obtain the same mask.
removeMissingObservations = function (feature.df) {

    feature.df %>%
        filter(
            complete.cases(feature.df)
        )

}

In [None]:
# Share of missing values per column, largest first.
#
# df: data frame to inspect.
# Returns a data frame with one row per column of df: `column` (the column
# name) and `perc.na` (fraction of NA entries), sorted by perc.na descending.
getColPercNA = function (df) {

    na.share <- apply(
        X = df,
        MARGIN = 2,
        FUN = function (values) mean(is.na(values))
    )

    # as.data.frame() names the single column after the variable; the vector's
    # names become row names and are then promoted to the `column` field.
    as.data.frame(na.share) %>%
        select(perc.na = 'na.share') %>%
        rownames_to_column(var = 'column') %>%
        arrange(desc(perc.na))

}

### Get Training

In [None]:
# Pull the training windows from the database, write them to disk and return
# them stacked.
#
# admethod: advertising method name forwarded to getLeadsPerformance().
# Each window is fetched with write = TRUE; the combined frame is additionally
# written to "..\data\df-<short admethod>.csv".
# Returns the row-bound data frame (February, then January, then December).
writeTraining = function (admethod = 'LeapTheory 4') {

    # Windows in the order they are queried and stacked: c(start, end).
    windows <- list(
        c('2020-02-01', '2020-02-23'),
        c('2020-01-01', '2020-02-01'),
        c('2019-12-01', '2020-01-01')
    )

    pulls <- lapply(
        X = windows,
        FUN = function (window) {
            getLeadsPerformance(
                admethod = admethod,
                timestart = window[1],
                timeend = window[2],
                limit = NA,
                write = TRUE
            )
        }
    )

    df <- do.call(rbind, pulls)

    write.csv(
        df,
        paste0(
            "..\\data\\df-",
            getShortenedAdmethodName(admethod),
            ".csv"
        )
    )

    df

}

In [None]:
# Load the three monthly training extracts from disk, run each through the
# preparation pipeline and stack them.
#
# admethod: advertising method whose files are read
#           ("..\data\df-<short admethod>-<month start>.csv").
# write:    if TRUE, regenerate the files first via writeTraining().
# Returns the row-bound, model-ready data frame (Feb, Jan, Dec order).
readTraining = function (admethod = 'LeapTheory 4', write = FALSE) {

    # Refresh for the requested admethod. Previously writeTraining() was called
    # with no argument, so a non-default admethod regenerated the default
    # method's files and then read stale (or missing) ones.
    if (write) {
        writeTraining(admethod = admethod)
    }

    # Read one monthly extract and apply the shared preparation steps.
    # standardizeValues() and removeMissingObservations() are intentionally
    # not part of the pipeline.
    readMonth <- function (month.start) {
        suppressMessages({suppressWarnings({
            read_csv(
                paste0(
                    "..\\data\\df-",
                    admethod %>% getShortenedAdmethodName(),
                    "-",
                    month.start,
                    ".csv"
                )
            ) %>%
                select(-X1)     # X1 is the row-name column added by write.csv
        })}) %>%
        convertDataTypes() %>%
        processColumns() %>%
        imputeMissingValues() %>%
        keepModelFeatures()
    }

    df = do.call(
        rbind,
        lapply(
            X = c('2020-02-01', '2020-01-01', '2019-12-01'),
            FUN = readMonth
        )
    )

    return(df)

}

In [None]:
# Training set, read from the CSV extracts already on disk (write = FALSE).
df.train <- readTraining()

### Get Test

In [None]:
# Pull the test window, write it to disk and return it.
#
# admethod, timestart, timeend: forwarded to getLeadsPerformance(), which is
# called with write = TRUE. The result is additionally written to
# "..\data\df-<short admethod>-test.csv".
# Returns the data frame from getLeadsPerformance().
writeTest = function (admethod = 'LeapTheory 4', timestart = '2020-03-01', timeend = '2020-03-08') {

    test.df <- getLeadsPerformance(
        admethod = admethod,
        timestart = timestart,
        timeend = timeend,
        limit = NA,
        write = TRUE
    )

    write.csv(
        test.df,
        paste0(
            "..\\data\\df-",
            getShortenedAdmethodName(admethod),
            "-test.csv"
        )
    )

    test.df

}

In [None]:
# Load the test extract from disk and run it through the preparation pipeline.
#
# admethod: advertising method whose file is read
#           ("..\data\df-<short admethod>-2020-03-01.csv").
# write:    if TRUE, regenerate the file first via writeTest(); its default
#           timestart ('2020-03-01') matches the file name read here.
# Returns the model-ready data frame.
readTest = function (admethod = 'LeapTheory 4', write = FALSE) {

    # Refresh for the requested admethod. Previously writeTest() was called
    # with no argument and always regenerated the default method's file.
    if (write) {
        writeTest(admethod = admethod)
    }

    suppressMessages({suppressWarnings({
        read_csv(
            paste0(
                "..\\data\\df-",
                admethod %>% getShortenedAdmethodName(),
                "-2020-03-01.csv"
            )
        ) %>%
            select(-X1)     # X1 is the row-name column added by write.csv
    })}) %>%
    convertDataTypes() %>%
    processColumns() %>%
    imputeMissingValues() %>%
#     standardizeValues() %>%
    keepModelFeatures()
#     removeMissingObservations()

}

In [None]:
# Test set, read from the CSV extract already on disk (write = FALSE).
df.test <- readTest()

### Get Validate

In [None]:
# Pull the validation window, write it to disk and return it.
#
# admethod, timestart, timeend: forwarded to getLeadsPerformance(), which is
# called with write = TRUE. The result is additionally written to
# "..\data\df-<short admethod>-validate.csv".
# Returns the data frame from getLeadsPerformance().
writeValidate = function (admethod = 'LeapTheory 4', timestart = '2020-02-23', timeend = '2020-03-01') {

    validate.df <- getLeadsPerformance(
        admethod = admethod,
        timestart = timestart,
        timeend = timeend,
        limit = NA,
        write = TRUE
    )

    write.csv(
        validate.df,
        paste0(
            "..\\data\\df-",
            getShortenedAdmethodName(admethod),
            "-validate.csv"
        )
    )

    validate.df

}

In [None]:
# Load the validation extract from disk and run it through the preparation
# pipeline.
#
# admethod: advertising method whose file is read
#           ("..\data\df-<short admethod>-2020-02-23.csv").
# write:    if TRUE, regenerate the file first via writeValidate(); its
#           default timestart ('2020-02-23') matches the file name read here.
# Returns the model-ready data frame.
readValidate = function (admethod = 'LeapTheory 4', write = FALSE) {

    # Refresh for the requested admethod. Previously writeValidate() was called
    # with no argument and always regenerated the default method's file.
    if (write) {
        writeValidate(admethod = admethod)
    }

    suppressMessages({suppressWarnings({
        read_csv(
            paste0(
                "..\\data\\df-",
                admethod %>% getShortenedAdmethodName(),
                "-2020-02-23.csv"
            )
        ) %>%
            select(-X1)     # X1 is the row-name column added by write.csv
    })}) %>%
    convertDataTypes() %>%
    processColumns() %>%
    imputeMissingValues() %>%
#     standardizeValues() %>%
    keepModelFeatures()
#     removeMissingObservations()

}

In [None]:
# Validation set, read from the CSV extract already on disk (write = FALSE).
df.validate <- readValidate()

### Get Additional Validates

In [None]:
# Pull the second validation window, write it to disk and return it.
#
# admethod, timestart, timeend: forwarded to getLeadsPerformance(), which is
# called with write = TRUE. The result is additionally written to
# "..\data\df-<short admethod>-validate2.csv".
# Returns the data frame from getLeadsPerformance().
writeValidate2 = function (admethod = 'LeapTheory 4', timestart = '2020-03-08', timeend = '2020-03-15') {

    validate2.df <- getLeadsPerformance(
        admethod = admethod,
        timestart = timestart,
        timeend = timeend,
        limit = NA,
        write = TRUE
    )

    write.csv(
        validate2.df,
        paste0(
            "..\\data\\df-",
            getShortenedAdmethodName(admethod),
            "-validate2.csv"
        )
    )

    validate2.df

}

In [None]:
# Load the second validation extract from disk and run it through the
# preparation pipeline.
#
# admethod: advertising method whose file is read
#           ("..\data\df-<short admethod>-2020-03-08.csv").
# write:    if TRUE, regenerate the file first via writeValidate2(); its
#           default timestart ('2020-03-08') matches the file name read here.
# Returns the model-ready data frame.
readValidate2 = function (admethod = 'LeapTheory 4', write = FALSE) {

    # Refresh for the requested admethod. Previously writeValidate2() was
    # called with no argument and always regenerated the default method's file.
    if (write) {
        writeValidate2(admethod = admethod)
    }

    suppressMessages({suppressWarnings({
        read_csv(
            paste0(
                "..\\data\\df-",
                admethod %>% getShortenedAdmethodName(),
                "-2020-03-08.csv"
            )
        ) %>%
            select(-X1)     # X1 is the row-name column added by write.csv
    })}) %>%
    convertDataTypes() %>%
    processColumns() %>%
    imputeMissingValues() %>%
#     standardizeValues() %>%
    keepModelFeatures()
#     removeMissingObservations()

}

In [None]:
# Second validation set, read from the CSV extract already on disk.
df.validate2 <- readValidate2()

### Data Processing

In [None]:
# Upsample the minority class (funded == 'TRUE') of the training data.
#
# df.train:         training data frame with a `funded` column.
# upsample.multiple: number of extra copies of the funded rows to append, so
#                   each funded row ends up present upsample.multiple + 1 times.
# quietly:          if FALSE, print per-class row counts before and after.
# Returns df.train with the replicated funded rows appended at the end.
upsampleImbalancedClassInTraining = function (df.train, upsample.multiple = 23, quietly = FALSE) {

    # Print a header followed by the row count of each `funded` class.
    reportClassCounts <- function (header, df) {
        cat(header)
        print(df %>% group_by(funded) %>% summarize(n()))
        cat('\n')
    }

    if (!quietly) {
        reportClassCounts("Pre Upsample:\n", df.train)
    }

    minority <- df.train %>% filter(funded == 'TRUE')

    # One copy of the funded rows per requested multiple, stacked together.
    copies <- lapply(
        X = seq_len(upsample.multiple),
        FUN = function (i) {
            rbind(data.frame(), minority)
        }
    )
    minority.stack <- do.call(rbind, copies)

    train.bal <- rbind(
        df.train,
        minority.stack
    )

    if (!quietly) {
        reportClassCounts("\nPost Upsample:\n", train.bal)
    }

    train.bal

}

In [None]:
# Drop rows of a hold-out set whose categorical values never occur in the
# training set.
#
# df.generalize:   hold-out data frame to filter.
# df.train:        training data frame providing the known values.
# generalize.name: optional label used in the printed message.
# Prints how many rows were removed and returns the filtered data frame.
removeNewClassesInGeneralization = function (df.generalize, df.train, generalize.name = NA) {

    # Columns that must only carry values seen in training. incometype,
    # payrolltype and campaign_id are deliberately not checked (they were
    # commented out of the original condition).
    checked.cols <- c(
        'statecode',
        'paycheck_direct_deposit',
        'srh_online_loan_opened_in_the_last_year',
        'srh_online_loan_inquiry_in_the_last_thirty_days',
        'ccr_worst_payment_rating_null',
        'ccr_worst_payment_rating_plus',
        'ccr_worst_payment_rating_zero',
        'ccr_worst_payment_rating_hash',
        'ccr_worst_payment_rating_else',
        'ccr_has_previous_loan_charged_off'
    )

    valueSeenInTraining <- function (col) {
        df.generalize[[col]] %in% unique(df.train[[col]])
    }

    # A row survives only if every checked column holds a known value.
    row.seen.in.training <- Reduce(`&`, lapply(checked.cols, valueSeenInTraining))

    df.similar <- df.generalize %>%
        filter(row.seen.in.training)

    removed.suffix <- ifelse(
        is.na(generalize.name),
        " rows removed.",
        paste0(" rows removed from ", generalize.name, ".")
    )
    print(paste0(nrow(df.generalize) - nrow(df.similar), removed.suffix))
    cat("\n")

    df.similar

}

In [None]:
# Build the final modeling splits.
#
# df.train, df.test, df.validate, df.validate2: prepared data frames.
# Returns a list with elements train, test, validate, validate2 and train.bal.
# Every element has its non-numeric columns converted to factors and
# `statecode` removed; the three hold-out sets are first restricted to values
# seen in training, and train.bal is the upsampled training set.
processAll = function (df.train, df.test, df.validate, df.validate2) {

    # Factor-encode everything that is not numeric, then drop statecode.
    finalize <- function (df) {
        df %>%
            mutate_if(
                .predicate = ~ !is.numeric(.x),
                .funs = as.factor
            ) %>%
            select(-statecode)
    }

    # Remove rows with values unseen in training (needs statecode, hence
    # before finalize()), then finalize.
    alignToTraining <- function (df, label) {
        df %>%
            removeNewClassesInGeneralization(df.train, label) %>%
            finalize()
    }

    list(
        train = finalize(df.train),
        test = alignToTraining(df.test, "test set"),
        validate = alignToTraining(df.validate, "validation set"),
        validate2 = alignToTraining(df.validate2, "validation2 set"),
        train.bal = df.train %>%
            upsampleImbalancedClassInTraining() %>%
            finalize()
    )

}

In [None]:
# All modeling splits (train, test, validate, validate2, train.bal) in one list.
data.split <- processAll(
    df.train = df.train,
    df.test = df.test,
    df.validate = df.validate,
    df.validate2 = df.validate2
)

In [None]:
# Inspect the structure of the balanced training set.
str(data.split$train.bal)

# Modeling

### Decision Tree

In [None]:
# Summarize a prediction object as a confusion table plus headline metrics.
#
# predict.object: object convertible with as.data.frame() into a frame that
#                 has `truth` and `response` columns.
# Returns a list with
#   detailed: row count per (truth, response) combination;
#   metrics:  data frame of prior, accuracy, recall and precision.
getEvaluation = function (predict.object) {

    detailed <- predict.object %>%
        as.data.frame(
            stringsAsFactors = FALSE
        ) %>%
        group_by(
            truth,
            response
        ) %>%
        summarize(
            n = n()
        ) %>%
        ungroup()

    # Total count over the confusion cells where `mask` is TRUE
    # (cells where it is NA are ignored).
    countWhere <- function (mask) {
        sum(detailed$n[which(mask)])
    }

    total <- sum(detailed$n)
    actual.positive <- countWhere(detailed$truth == TRUE)
    predicted.positive <- countWhere(detailed$response == TRUE)
    true.positive <- countWhere(detailed$truth == TRUE & detailed$response == TRUE)
    correct <- countWhere(detailed$truth == detailed$response)

    prior <- actual.positive / total            # share of actual positives
    accuracy <- correct / total                 # (TP + TN) / all
    recall <- true.positive / actual.positive   # TP / (TP + FN)
    precision <- true.positive / predicted.positive     # TP / (TP + FP)

    metrics <- data.frame(
        metric = c('prior', 'accuracy', 'recall', 'precision'),
        value = c(prior, accuracy, recall, precision),
        stringsAsFactors = FALSE
    )

    list(
        detailed = detailed,
        metrics = metrics
    )
}

In [None]:
# Fit an rpart decision tree (through mlr) on the balanced training set and
# render it to a temporary PDF.
#
# data.split: list with a `train.bal` data frame.
# evaluate:   currently unused -- the evaluation code below is commented out.
# xval, minsplit, minbucket, cp: hyperparameters handed to 'classif.rpart'.
#
# Side effects: attaches rattle, writes the tree plot to a temp PDF and prints
# that file's path with cat(). Returns the value of cat() (NULL, invisibly).
getDecisionTree = function (data.split, evaluate = TRUE, xval = 0, minsplit = 1000, minbucket = 1000, cp = 0.00005) {
 

    ####  Setup  ####

    library(rattle)
    # library(rpart)
    # suppressWarnings({
    #     listLearners() %>%
    #         filter(
    #             type == 'classif' &
    #             name %>% str_detect('[Tt]ree')
    #         )
    # })
    # getParamSet('classif.rpart')

    # loss.matrix = matrix(
    #     c(0,1,1.6,0),
    #     byrow = TRUE,
    #     nrow = 2
    # )

    rpart.task = makeClassifTask(
        id = 'rpart.task',
        data = data.split$train.bal %>%
            # First drop the funnel/outcome columns, then keep the hand-picked
            # predictors plus the target.
            select(
                -newentered,
                -qualified,
                -funded_amount,
                -type_formula,
                -truefpd
            ) %>% 
            select(
#                 age,
                ccr_score,
#                 lead_day,
                grossmonthlyincome,
                requestedloanamount,
                ccr_days_since_inquiry_previously_seen,
                ccr_number_of_bank_accounts,
                ticrh_ninety_days_ago,
                funded
            ),
        target = 'funded',
        positive = 'TRUE'
    )

    rpart.learner = makeLearner(
        cl = 'classif.rpart',
        id = 'rpart.learner',
        xval = xval,                  ##  Number of Cross Validations
        minsplit = minsplit,             ##  Number of Obs in Node for Split to be Attempted
        minbucket = minbucket,             ##  Minimum number of observations in Leaf Node
        cp = cp                ##  Minimum information gain for split to execute.
    #     loss = loss.matrix
    )



    ####  Resampling  ####

    # rpart.resample = makeResampleDesc('CV', iters = 5, stratify = TRUE)

    # rpart.cv = resample(
    #     learner = rpart.learner,
    #     task = rpart.task,
    #     resampling = rpart.resample,
    #     measures = list(acc, mmce, fpr),
    #     show.info = FALSE
    # )

    # # rpart.cv %>% .$measures.test
    # # rpart.cv %>% .$aggr %>% as.data.frame()



    ####  Hypertuning  ####

    # rpart.params = makeParamSet(
    #     makeIntegerParam('minsplit', lower = 10, upper = 50),
    #     makeIntegerParam('minbucket', lower = 5, upper = 50),
    #     makeNumericParam('cp', lower = 0.0001, upper = 0.2)
    # )

    # rpart.search = makeTuneControlGrid()

    # rpart.tune = tuneParams(
    #     learner = rpart.learner,
    #     task = rpart.task,
    #     resampling = rpart.resample,
    #     par.set = rpart.params,
    #     control = rpart.search,
    #     measures = list(mmce, acc, fpr),
    #     show.info = FALSE
    # )

    # setHyperPars(
    #     learner = rpart.learner,
    #     par.vals = rpart.tune$x
    # ) 



    ####  Training  ####

    rpart.model = train(
        learner = rpart.learner,
        task = rpart.task
    )



    ####  Evaluation  ####

#     rpart.predict = predict(
#         object = rpart.model,
#         newdata = data.split$test %>% select(-newentered, -qualified)
#     )

#     rpart.validate = predict(
#         object = rpart.model,
#         newdata = data.split$validate %>% select(-newentered, -qualified)
#     )

#     test.eval = rpart.predict %>% getEvaluation()
#     validate.eval = rpart.validate %>% getEvaluation()

#     if (evaluate) {
#         list(
#             test.eval = test.eval,
#             validate.eval = validate.eval
#         )
#     }


#     rules = rpart.model$learner.model
#     rules.df = rpart.plot::rpart.rules(rpart.model$learner.model)
# #     rules.tree = rpart.plot::rpart.plot(rpart.model$learner.model, type = 4)
# #     fancyRpartPlot(rpart.model$learner.model)



    ####  Outputs  ####
    
    ##    Tree    ##
    tf <- tempfile(
        fileext = ".pdf"
    )
    pdf(tf)
    # Close the PDF device even if plotting fails; otherwise the device stays
    # open and later plots in the session are silently sent to this file.
    tryCatch(
        fancyRpartPlot(
            rpart.model$learner.model
        ),
        finally = dev.off()
    )
    cat(tf)
   
}

In [None]:
# Fit and plot the tree with warnings and messages silenced.
suppressWarnings(
    suppressMessages(
        getDecisionTree(data.split, evaluate = FALSE)
    )
)

##### Evaluate Model

In [None]:
# Compare funnel metrics with and without the leads matched by an exclusion
# rule, for each data split.
#
# data.split:                  list with train, test, validate and validate2
#                              data frames.
# nodes.to.exclude.logic.expr: unevaluated expression over the split's
#                              columns; rows where it is TRUE are removed in
#                              the "proposed" scenario.
# loantype:                    loan type to keep (compared case-insensitively
#                              with type_formula; rows with missing
#                              type_formula are kept too). NA keeps all rows.
# Returns a list with one element per split (train, test, validate,
# validate2), each a list holding a one-row `change` data frame.
getDecisionTreeProposal = function (data.split, nodes.to.exclude.logic.expr, loantype = 'NEW') {    

    # Funnel sizes, conversion rates and first-payment-default figures for one
    # set of leads. Shared by the observed and the proposed scenario.
    summarizeFunnel = function (leads.df) {
        leads.df %>%
            summarize(
                accept.size = n(),
                app.size = sum(newentered == 'TRUE'),
                qual.size = sum(qualified == 'TRUE'),
                funded.size = sum(funded == 'TRUE'),
                dollar.size = sum(
                    (funded == 'TRUE') *
                    funded_amount,
                    na.rm = TRUE
                ),
                
                apply.rate = app.size/accept.size,
                qr = qual.size/app.size,
                fr = funded.size/qual.size,
                app.to.fund = funded.size/app.size,
                accept.to.fund = funded.size/accept.size,
                
                fpd.mature = sum(funded == 'TRUE' & !is.na(truefpd)) / funded.size,
                fpd = sum(funded == 'TRUE' & !is.na(truefpd) & truefpd == 1) / sum(funded == 'TRUE' & !is.na(truefpd))
            )
    }
    
    ####  Evaluation  ####
    evaluatePerformance = function (data.split, stage, loantype.inner = loantype) {
        
        # Relative change of a size metric, formatted as a percentage.
        calculateChangeSize = function (metric.string, observed.metrics = observed, proposed.metrics = proposed) {
            
            (
                100 *
                ( proposed.metrics[[metric.string]] - observed.metrics[[metric.string]] ) /
                observed.metrics[[metric.string]]
            ) %>%
            round(2) %>% 
            paste0('%')
            
        }
        # Absolute change of a rate metric times 10000 (basis points), signed.
        calculateChangeConversion = function (metric.string, observed.metrics = observed, proposed.metrics = proposed) {
            
            paste0(
                ifelse(
                    proposed.metrics[[metric.string]] - observed.metrics[[metric.string]] >= 0,
                    '+',
                    ''
                ),
                round(
                    10000 * (proposed.metrics[[metric.string]] - observed.metrics[[metric.string]]),
                    0
                )
            )
            
        }
        
        # Loan-type restriction written as a plain logical condition instead of
        # a string run through parse()/eval(): with loantype.inner = NA the
        # first term is TRUE for every row, otherwise rows match on loan type
        # or have a missing type_formula.
        in.scope = data.split[[stage]] %>%
            filter(
                is.na(loantype.inner) |
                str_to_upper(type_formula) == str_to_upper(loantype.inner) |
                is.na(type_formula)
            )
        
        observed = in.scope %>%
            summarizeFunnel()

        # The exclusion rule is an unevaluated expression; it is evaluated
        # against the columns of the data being filtered.
        proposed = in.scope %>%
            filter(
                !eval(nodes.to.exclude.logic.expr)
            ) %>% 
            summarizeFunnel()

        change = data.frame(
            'Size__' = '',
            accepted = 'accept.size' %>% calculateChangeSize(),
            app = 'app.size' %>% calculateChangeSize(),
            qualified = 'qual.size' %>% calculateChangeSize(),
            funded = 'funded.size' %>% calculateChangeSize(),
            funded.dollar = 'dollar.size' %>% calculateChangeSize(),
            
            'Rates__' = '',
            apply.rate = 'apply.rate' %>% calculateChangeConversion(),
            qr = 'qr' %>% calculateChangeConversion(),
            fr = 'fr' %>% calculateChangeConversion(),
            app.to.fund = 'app.to.fund' %>% calculateChangeConversion(),
            accept.to.fund = 'accept.to.fund' %>% calculateChangeConversion(),
            
            # Share of observed funded loans that already have a truefpd value.
            fpd.mature = paste0(round(100*observed$fpd.mature, 2), '%'),
            fpd = 'fpd' %>% calculateChangeConversion(),
            
            stringsAsFactors = FALSE
        )
        
        list(
#             observed = observed,
#             proposed = proposed,
            change = change
        )
        
    }
    
    list(
#         data.split %>% evaluatePerformance('train.bal'),
        data.split %>% evaluatePerformance('train'),
        data.split %>% evaluatePerformance('test'),
        data.split %>% evaluatePerformance('validate'),
        data.split %>% evaluatePerformance('validate2')
    )
    
}

In [None]:
# Exclusion rule under evaluation, kept unevaluated so it can be applied to any
# split. 10000 is the value imputeMissingValues() writes for missing entries,
# so this matches leads where both fields were missing.
exclude.logic <- expr(
    (
        ccr_score == 10000 &
        ccr_number_of_bank_accounts == 10000
#     ) |
#     (
#         requestedloanamount < 1450 &
# #         ccr_number_of_bank_accounts >= 5.5
#         ccr_number_of_bank_accounts == 10000
    )
)

In [None]:
# Evaluate the exclusion rule on every split with warnings/messages silenced.
suppressWarnings(
    suppressMessages(
        getDecisionTreeProposal(data.split, exclude.logic)
    )
)

##### Evaluate Exclusion Nodes Generalization - Time Series

In [None]:
# Plot, per lead date, how large the excluded segment is and how it converts.
#
# admethod:           advertising method whose CSV extracts are read from
#                     "..\data\df-<short admethod>-<date>.csv".
# exclude.logic.expr: unevaluated expression over the extract's columns that
#                     marks a lead as excluded.
# Returns a list of two ggplot objects: the daily mix of excluded vs. kept
# leads, and the daily funded-per-lead conversion of each group.
plotTimeSeries = function (admethod, exclude.logic.expr) {

    extract.dates <- c(
        "2019-12-01",
        "2020-01-01",
        "2020-02-01",
        "2020-02-23",
        "2020-03-01"
    )
    csv.paths <- paste0(
        "..\\data\\df-",
        admethod %>% getShortenedAdmethodName,
        "-",
        extract.dates,
        ".csv"
    )

    readQuietly <- function (path) {
        suppressWarnings({suppressMessages({read_csv(path)})})
    }

    df.ts <- do.call(
        what = rbind,
        args = lapply(X = csv.paths, FUN = readQuietly)
    )

    # Layers shared by both charts: line + points, percent y axis, title, theme.
    percentLineLayers <- function (title) {
        list(
            geom_line(),
            geom_point(),
            scale_y_continuous(
                labels = scales::percent
            ),
            labs(
                title = title
            ),
            theme_bw()
        )
    }

    ###  Mix  ###
    # Share of each day's leads that falls in / out of the exclusion rule.
    mix.data <- df.ts %>%
        group_by(
            lead_date = lead_time %>% as.Date()
        ) %>%
        mutate(
            day.total = n()
        ) %>%
        ungroup() %>%
        group_by(
            lead_date,
            exclude = eval(exclude.logic.expr)
        ) %>%
        summarize(
            p = n()/mean(day.total)
        ) %>%
        ungroup()

    mix.plot <- ggplot(
        mix.data,
        mapping = aes(
            x = lead_date,
            y = p,
            color = exclude
        )
    ) +
        percentLineLayers("Mix of Exclude")

    ###  Conversion - FR  ###
    # Funded leads per lead, by day and exclusion group. Alternatives tried:
    #         conversion = sum(funded)/sum(qualified)
    #         conversion = sum(qualified)/sum(newentered)
    #         conversion = sum(newentered)/n()
    #         conversion = sum(funded)/sum(newentered)
    conversion.data <- df.ts %>%
        group_by(
            lead_date = lead_time %>% as.Date(),
            exclude = eval(exclude.logic.expr)
        ) %>%
        summarize(
            conversion = sum(funded)/n()
        ) %>%
        ungroup()

    conversion.plot <- ggplot(
        conversion.data,
        mapping = aes(
            x = lead_date,
            y = conversion,
            color = exclude
        )
    ) +
        percentLineLayers("Conversion of Exclude")

    list(
        mix.plot,
        conversion.plot
    )

}

In [None]:
# 'CreditKarma4' %>% plotTimeSeries(exclude.logic.expr = exclude.logic)

### Random Forest

In [None]:
# Build the text of a filter expression that keeps only complete rows.
#
# Looks at data.split$train.bal without truefpd, type_formula and
# funded_amount, finds every remaining column that contains at least one NA,
# and returns a string such as "!is.na(a) & !is.na(b)" covering those columns
# (in column order). The string is meant to be parsed and evaluated inside
# filter().
#
# Args:
#   data.split: list with a `train.bal` data frame.
#
# Returns:
#   A length-one character vector. When no column contains an NA the result
#   is 'TRUE' (keep every row) instead of an empty string, which would not
#   evaluate to a usable filter condition.
getRemoveNALogic = function (data.split) {

    ignored = c('truefpd', 'type_formula', 'funded_amount')
    train = data.split$train.bal

    # Same contract as select(-col): a missing column is an error.
    absent = setdiff(ignored, names(train))
    if (length(absent) > 0) {
        stop(
            'train.bal is missing expected column(s): ',
            paste0(absent, collapse = ', '),
            call. = FALSE
        )
    }

    candidates = train[setdiff(names(train), ignored)]

    # Share of NA per column, computed column-wise so the frame is never
    # coerced to a single-typed matrix.
    na.share = vapply(
        X = candidates,
        FUN = function (x) {mean(is.na(x))},
        FUN.VALUE = numeric(1)
    )

    fields = names(na.share)[!is.na(na.share) & na.share > 0]

    if (length(fields) == 0) {
        return('TRUE')
    }

    # Back-quote names that are not syntactically valid so the text parses.
    needs.quote = make.names(fields) != fields
    fields[needs.quote] = paste0('`', fields[needs.quote], '`')

    paste0(
        '!is.na(',
        fields,
        ')',
        collapse = ' & '
    )

}

In [None]:
# Train a random forest classifier (mlr 'classif.randomForest') that predicts
# `funded` from the balanced training split.
#
# Rows of data.split$train.bal that fail the NA filter built by
# getRemoveNALogic() are removed, and newentered, qualified, funded_amount,
# type_formula and truefpd are dropped before training.
#
# Args:
#   data.split: list with a `train.bal` data frame that contains the target
#               column `funded`.
#   evaluate:   currently unused; the evaluation section below is commented
#               out. Kept so existing calls keep working.
#   ntree:      number of trees, passed to the learner.
#   mtry:       variables tried at each split, passed to the learner.
#               NOTE(review): the default is derived from every column of
#               train.bal, including the target and the columns dropped
#               before training -- confirm that this is intended.
#   replace:    passed to the learner.
#   oob.prox:   passed to the learner. Previously this argument was accepted
#               but a hard-coded FALSE was sent to the learner instead.
#
# Returns:
#   The trained model returned by mlr's train().
getRandomForest = function (
    data.split,
    evaluate = FALSE,
    ntree = 24,
    mtry = data.split$train.bal %>% ncol() %>% sqrt() %>% ceiling(),
    replace = TRUE,
#     cutoff = 1/2,
#     sampsize = nrow(data.split$train.bal),
    #     nodesize = 5,
    oob.prox = FALSE
) {


    ####  Setup  ####

    suppressWarnings({suppressMessages({
        library(randomForest)
    })})
#     suppressWarnings({
#         listLearners() %>%
#             filter(
#                 type == 'classif' &
#                 name %>% str_detect('[Ff]orest')
#             )
#     })
#     getParamSet('classif.randomForest')


    ####  Task + Learner = Train  ####

    rf.task = makeClassifTask(
        id = 'rf.task',
        data = data.split$train.bal %>%
            select(
                -newentered,
                -qualified,
                -funded_amount,
                -type_formula,
                -truefpd
            ) %>% 
            filter(
                # getRemoveNALogic() returns the filter as text, so it has to
                # be parsed and evaluated against the columns here.
                parse(
                    text = getRemoveNALogic(data.split = data.split)
                ) %>%
                eval()
            ) %>% as.data.frame(),
        target = 'funded',
        positive = 'TRUE'
    )

    rf.learner = makeLearner(
        cl = 'classif.randomForest',
        id = 'rf.learner',
        ntree = ntree,
        mtry = mtry,
        replace = replace,
#         cutoff = cutoff,
#         sampsize = sampsize,
#         nodesize = nodesize,
        oob.prox = oob.prox
    )



    ####  Hypertuning  ####

#     rf.resample = makeResampleDesc('CV', iters = 5, stratify = TRUE)

#     rf.params = makeParamSet(
#     #     makeIntegerParam('mtry', lower = 4, upper = 12),
#     #     makeNumericParam('nodesize', lower = 10, upper = 11)
#     )

#     rf.search = makeTuneControlGrid()

#     rf.tune = tuneParams(
#         task = rf.task,
#         learner = rf.learner,
#         resampling = rf.resample,
#         par.set = rf.params,
#         control = rf.search,
#         measures = list(mmce, acc, fpr),
#         show.info = FALSE
#     )

#     rf.tune$x

#     # setHyperPars(
#     #     learner = rf.learner,
#     #     par.vals = rf.tune$x
#     # ) 



    ####  Resampling  ####

    # rf.cv = resample(
    #     learner = rf.learner,
    #     task = rf.task,
    #     resampling = rf.resample,
    #     measures = list(acc, mmce, fpr),
    #     show.info = FALSE
    # )

    # rf.cv %>% .$measures.test
    # rf.cv %>% .$aggr %>% as.data.frame()



    ####  Training  ####

    rf.model = train(
        learner = rf.learner,
        task = rf.task
    )



    ####  Evaluation  ####

#     rf.predict = predict(
#         object = rf.model,
#         newdata = data.split$test %>%
#             select(
#                 -newentered,
#                 -qualified,
#                 -funded_amount,
#                 -type_formula,
#                 -truefpd
#             )
#     )

#     rf.validate = predict(
#         object = rf.model,
#         newdata = data.split$validate %>%
#             select(
#                 -newentered,
#                 -qualified,
#                 -funded_amount,
#                 -type_formula,
#                 -truefpd
#             )
#     )
    
#     test.eval = rf.predict %>% getEvaluation()
#     validate.eval = rf.validate %>% getEvaluation()

#     if (evaluate) {
#         list(
#             test.eval = test.eval,
#             validate.eval = validate.eval
#         )
#     }



    ####  Outputs  ####
    
    ##    Model    ##
    return(rf.model)
   
}

In [None]:
# Summarise variable importance for a trained random forest.
#
# Args:
#   rf.model: trained model whose `learner.model` element is the fitted
#             forest (e.g. the value returned by getRandomForest()).
#
# Returns:
#   A list with
#     df   - importance table, one row per variable (name in `variable`),
#            sorted by MeanDecreaseGini in descending order
#     plot - whatever varImpPlot() returns; the chart itself is drawn as a
#            side effect of that call
getVIP = function (rf.model) {

    forest = rf.model$learner.model

    # Importance matrix -> data frame, with the variable names as a column.
    importance.table = as.data.frame(importance(forest))
    importance.table = rownames_to_column(importance.table, var = 'variable')
    importance.table = arrange(importance.table, desc(MeanDecreaseGini))

    importance.plot = varImpPlot(forest)

    list(
        df = importance.table,
        plot = importance.plot
    )
}

In [None]:
# Train the random forest on data.split with the function's default settings.
rf <- getRandomForest(data.split)

In [None]:
# Show the variable-importance table and chart for the trained forest.
getVIP(rf)