In [0]:
import json
import logging

# from jsonpath_ng import jsonpath, parse
# from jsonpath import jsonpath
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# from utils import policy_dict, policy_data_dict, find_non_zero_riskweight_rules, find_zero_riskweight_rules
from pandasql import sqldf
from pysnowflake import Session
from tqdm import tqdm

# Notebook-wide pandas display settings: more rows/columns, floats shown with 3 decimals.
pd.options.display.max_rows = 300
pd.options.display.max_columns = 100
pd.options.display.float_format = lambda value: '%.3f' % value


def run_query(query):
    """Run a SQL string through pandasql, resolving table names against this notebook's globals."""
    return sqldf(query, globals())




In [0]:
# Open the Snowflake session used by every later cell (`conn`).
# USER_NAME drives both the personal database name and the login e-mail below.
USER_NAME = 'jobyg'  # replace with your LDAP name
sess = Session(
   connection_override_args={
       'autocommit': True,
       # 'externalbrowser' authenticator: presumably opens a browser SSO prompt — confirm against pysnowflake docs
       'authenticator': 'externalbrowser',
       'account': 'square',
       'database': f'PERSONAL_{USER_NAME.upper()}',
       'user': f'{USER_NAME}@squareup.com'
   }
   
)
# `conn` is the handle the rest of the notebook calls .execute() / .download() on.
conn = sess.open()



In [0]:
# NOTE(review): `timedelta` is imported here but not used in the visible notebook.
from datetime import date, timedelta
# Analysis window. Only start_date is interpolated into the query below;
# end_date (today, formatted YYYY-MM-DD) is not referenced in the visible code.
start_date = '2024-01-01'
end_date = date.today().strftime("%Y-%m-%d")


# Select the warehouse / database / schema for the statements that follow.
conn.execute('use warehouse ADHOC__XLARGE')
conn.execute('use database AP_CUR_FRDRISK_G')
conn.execute('use schema public')

# Build the base temp table SUP_attempts:
#   - feature datamart rows (a) left-joined to loss tagging (b) on order_token
#   - GB region, par_process_date >= start_date, checkpoint CHECKOUT_CONFIRM, dedup = 1
#   - order channel is SINGLE_USE_CARD or b.sup_yn = 1
#   - DAYS_SINCE_FIRST_ORDER_DATE < 15, with NULL treated as 0
#   - `alert` = 1 when b.sup_yn = 1 but the order channel is not SINGLE_USE_CARD
conn.execute(f'''create or replace temp table SUP_attempts as 
(select a.*, b.p2_overdue_d0_local, b.p2_due_local, b.sup_yn,
 case when b.sup_yn = 1 and in_flight_order_channel != 'SINGLE_USE_CARD' then 1 else 0 end as alert   from ap_cur_r_frdrisk.curated_fraud_risk_red.unified_feature_datamart_base__jobyg_dsl3_sv a
 left join ap_cur_riskbi_g.curated_risk_bi_green.dwm_order_loss_tagging b
 on a.order_token = b.order_token
 where (a.par_Region ilike 'GB' and par_process_date >='{start_date}')  
and (a.in_flight_order_channel = 'SINGLE_USE_CARD' or b.sup_yn = 1)
and a.checkpoint = 'CHECKOUT_CONFIRM'
AND COALESCE(a.DAYS_SINCE_FIRST_ORDER_DATE,0) <15
and a.dedup = 1);

''')

In [0]:
# Features to slice SUP_attempts by; one aggregate frame is stored per feature.

feat_list = [
    'IN_FLIGHT_ORDER_MERCHANT_NAME',
    'CONSUMER_ACCOUNT_LINKING_TYPE',
    'CONSUMER_IS_FIRST_ORDER',
    'CONSUMER_CONTACT_ADDRESS_POSTCODE',
]

# 'aggregate' is filled here; 'time' is filled by a later cell.
df_dict = {"aggregate": {}, 'time': {}}

for var in feat_list:
    # All-time totals per feature value: control-group and portfolio P2 D0 figures, GPV and counts.
    var_query = f'''select 
    --date_trunc('week',par_process_date) as week_dt,
        {var},
        sum(case when is_in_attempt_control_group = 1 then p2_overdue_d0_local end) as p2_overdue,
            sum(case when is_in_attempt_control_group = 1 then p2_due_local end) as p2_due,
            div0(sum(case when is_in_attempt_control_group =1 then p2_overdue_d0_local end), sum(case when is_in_attempt_control_group =1 then p2_due_local end)) as ctrl_grp_p2_d0_rate,

            sum(order_amount_local) as gpv,
            count(distinct(case when is_in_attempt_control_group=1 then order_token end)) as ctrl_token_ct,

        sum(p2_overdue_d0_local ) as portfolio_p2_overdue,
        sum(p2_due_local ) as portfolio_p2_due,
        div0(sum( p2_overdue_d0_local ), sum(p2_due_local )) as portfolio_p2_d0_rate,
        count(order_token) as token_ct,

    from SUP_attempts  
    group by 1
    order by 1;
    '''
    temp = conn.download(var_query)
    df_dict['aggregate'][var] = temp

    # Alert rule: control-group P2 D0 rate of at least 35% on more than 500 of GPV.
    alerts = temp.loc[temp['ctrl_grp_p2_d0_rate'].ge(.35) & temp['gpv'].gt(500)]

    # Postcode alerts are computed but never printed.
    if not alerts.empty and var != 'CONSUMER_CONTACT_ADDRESS_POSTCODE':
        # The downloaded frame is indexed with the lower-cased feature name.
        str_var = str(var).lower()
        for idx, row in alerts.iterrows():
            print(f"- {row[str_var]} with GPV ${row['gpv']:,.2f} has fraud rate of {row['portfolio_p2_d0_rate']:.1%}")


    

In [0]:
# For every feature in feat_list, build a month-by-feature-value frame and store it
# in df_dict['time'][var]. Only rows with a non-null order_token_loss are included.
for var in feat_list:
    # Step 1 (Snowflake): aggregate per calendar month (YYYY-MM) and feature value.
    time_query = f'''select 
        to_varchar(par_process_date, 'YYYY-MM') as month_dt,
            {var},
            sum(case when is_in_attempt_control_group = 1 then p2_overdue_d0_local end) as p2_overdue,
                sum(case when is_in_attempt_control_group = 1 then p2_due_local end) as p2_due,
                div0(sum(case when is_in_attempt_control_group =1 then p2_overdue_d0_local end), sum(case when is_in_attempt_control_group =1 then p2_due_local end)) as ctrl_grp_p2_d0_rate,

                sum(case when order_token_loss is not null then order_amount_local end) as gpv,
                count(order_token) as token_ct,

                count(distinct(case when is_in_attempt_control_group=1 then order_token end)) as ctrl_token_ct,
            sum(p2_overdue_d0_local ) as portfolio_p2_overdue,
            sum(p2_due_local ) as portfolio_p2_due,
            div0(sum( p2_overdue_d0_local ), sum(p2_due_local )) as portfolio_p2_d0_rate,
        from SUP_attempts  
        where order_token_loss is not null
        group by 1,2
        order by 1,2;
        '''
    temp = conn.download(time_query)
    # Step 2 (pandasql, local): collapse month/value rows whose gpv is not above 10000
    # into a single 'Other' bucket and re-aggregate. run_query looks `temp` up through
    # globals(), so `temp` must stay a notebook-level name.
    # NOTE(review): the grouped column is aliased `merchant_name` for every feature,
    # not only for the merchant-name feature.
    temp_fixed = run_query(f'''select 
                        month_dt 
                        ,case when gpv > 10000 then {var} else 'Other' end as merchant_name
                        ,sum(p2_overdue) as ctrl_group_p2_overdue
                        ,sum(p2_due) as ctrl_group_p2_due
                        ,sum(p2_overdue)/nullif(sum(p2_due),0) as ctrl_group_p2_d0_rate
                        ,sum(gpv) as gpv
                        ,sum(token_ct) as token_ct
                        ,sum(ctrl_token_ct) as ctrl_token_ct
                        ,sum(portfolio_p2_overdue ) as portfolio_p2_overdue
                        ,sum(portfolio_p2_due ) as portfolio_p2_due
                        ,sum(portfolio_p2_overdue)/nullif(sum(portfolio_p2_due),0) as portfolio_p2_d0_rate
                        from temp
                        group by 1,2
                        order by 1,2
                         ''')
    # temp_fixed['ctrl_group_p2_d0_rate'] = temp_fixed['portfolio_p2_d0_rate']
    df_dict['time'][var] = temp_fixed



# Graphs to help make my demonstration

GPV by merchant over time (Stacked bar)

P2 Overdue by merchant over time (Stacked bar)



In [0]:
# Merchant-level aggregate, largest GPV first.
df_dict['aggregate']['IN_FLIGHT_ORDER_MERCHANT_NAME'].sort_values(by='gpv', ascending=False)

In [0]:
# Share of monthly GPV per merchant, drawn as a 100% stacked column chart.
gpv_df = df_dict['time']['IN_FLIGHT_ORDER_MERCHANT_NAME'].copy()
df_pivot = gpv_df.pivot_table(index='month_dt', columns='merchant_name', values='gpv', aggfunc='sum')

# Normalise each month (row) so the merchant shares add up to 100.
df_pivot_percentage = df_pivot.div(df_pivot.sum(axis=1), axis=0) * 100

# One distinct hue per merchant column. The previous "Set2" palette has only 8 colours
# and cycles when more are requested, so different merchants ended up sharing a colour.
palette = sns.color_palette("husl", n_colors=df_pivot_percentage.shape[1])

# Explicit figure/axes so the labels below are applied to exactly this chart.
fig, ax = plt.subplots(figsize=(20, 16))
df_pivot_percentage.plot(kind='bar', stacked=True, color=palette, ax=ax)

# Set the plot labels and title
ax.set_xlabel('Month')
ax.set_ylabel('Percentage (%)')
ax.set_title('100% Stacked Column Chart: Sales by Merchant')
# Legend sits below the axes so it does not cover the bars.
ax.legend(title="Merchants", loc="upper center", bbox_to_anchor=(0.5, -0.1), ncol=3, fontsize=9)

# Show the plot
plt.show()

In [0]:
# Share of monthly portfolio P2 overdue per merchant, drawn as a 100% stacked column chart.
gpv_df = df_dict['time']['IN_FLIGHT_ORDER_MERCHANT_NAME'].copy()
df_pivot = gpv_df.pivot_table(index='month_dt', columns='merchant_name', values='portfolio_p2_overdue', aggfunc='sum')

# Normalise each month (row) so the merchant shares add up to 100.
df_pivot_percentage = df_pivot.div(df_pivot.sum(axis=1), axis=0) * 100

# One distinct hue per merchant column. The previous "Set2" palette has only 8 colours
# and cycles when more are requested, so different merchants ended up sharing a colour.
palette = sns.color_palette("husl", n_colors=df_pivot_percentage.shape[1])

# Explicit figure/axes so the labels below are applied to exactly this chart.
fig, ax = plt.subplots(figsize=(20, 16))
df_pivot_percentage.plot(kind='bar', stacked=True, color=palette, ax=ax)

# Set the plot labels and title
ax.set_xlabel('Month')
ax.set_ylabel('Percentage (%)')
ax.set_title('100% Stacked Column Chart: Portfolio P2D0 by Merchant')
# Legend sits below the axes so it does not cover the bars.
ax.legend(title="Merchants", loc="upper center", bbox_to_anchor=(0.5, -0.1), ncol=3, fontsize=9)

# Show the plot
plt.show()


# Analyzing Tesco and Currys

In [0]:
# Merchant-specific temp tables derived from SUP_attempts.
# `ilike` is used without wildcards, so only the exact merchant name (any casing) matches.
# The Python names `tesco` / `currys` hold whatever conn.execute returns; later cells
# refer to the Snowflake tables of the same name, not to these variables.
tesco = conn.execute('''create or replace temp table tesco as (select * from SUP_attempts where in_flight_order_merchant_name ilike 'tesco'); ''')
# currys additionally carries IDV / Ekata / LexisNexis fields joined on consumer id.
# NOTE(review): the subquery keeps only the latest IDV row per consumer (QUALIFY ... = 1)
# and the date condition is applied afterwards in the join, so an order placed before
# that latest row gets no IDV fields even if an earlier IDV row exists — confirm intended.
currys = conn.execute('''create or replace temp table currys as (select a.*, idv_check_result ,ekata_record_phone_Valid, ekata_record_phone_line_type,LEXISNEXIS_RECORD_SMARTSCORE from SUP_attempts a 
                      left join (select key_consumer_uuid, GDP_PROCESSED_DATETIME, IDV_CHECK_RESULT, ekata_record_phone_Valid, ekata_record_phone_line_type, LEXISNEXIS_RECORD_SMARTSCORE  from ap_raw_green.green.raw_c_e_idv_uksoft_id 
QUALIFY ROW_NUMBER() OVER (PARTITION BY  key_consumer_uuid ORDER BY GDP_PROCESSED_DATETIME DESC) = 1
)d
on a.consumer_id = d.key_consumer_uuid
and a.par_process_date >= d.GDP_PROCESSED_DATETIME 
                      
                      where in_flight_order_merchant_name ilike 'currys'); ''')

In [0]:
# Monthly breakdown of the tesco table by a small set of candidate features.
# One downloaded frame per feature is appended to df_list, in feat_list order.
# NOTE(review): this overwrites the notebook-level `feat_list` defined earlier, so
# re-running the earlier loops after this cell iterates over this shorter list.
df_list = []
# NOTE(review): the second entry has a leading space; it is interpolated into SQL,
# where the extra whitespace is harmless, but it looks unintentional.
feat_list=['CONSUMER_ACCOUNT_LINKING_TYPE'
,' CONSUMER_IS_FIRST_ORDER']
for var in feat_list:
    # Same metric set as the SUP_attempts monthly query, restricted to tesco rows
    # with a non-null order_token_loss.
    time_query = f'''select 
        to_varchar(par_process_date, 'YYYY-MM') as month_dt,
            {var},
            sum(case when is_in_attempt_control_group = 1 then p2_overdue_d0_local end) as p2_overdue,
                sum(case when is_in_attempt_control_group = 1 then p2_due_local end) as p2_due,
                div0(sum(case when is_in_attempt_control_group =1 then p2_overdue_d0_local end), sum(case when is_in_attempt_control_group =1 then p2_due_local end)) as ctrl_grp_p2_d0_rate,

                sum(case when order_token_loss is not null then order_amount_local end) as gpv,
                count(order_token) as token_ct,

                count(distinct(case when is_in_attempt_control_group=1 then order_token end)) as ctrl_token_ct,
            sum(p2_overdue_d0_local ) as portfolio_p2_overdue,
            sum(p2_due_local ) as portfolio_p2_due,
            div0(sum( p2_overdue_d0_local ), sum(p2_due_local )) as portfolio_p2_d0_rate,
        from tesco  
        where order_token_loss is not null
        group by 1,2
        order by 1,2;
        '''
    df_list.append(conn.download(time_query))
        
    

In [0]:
b = conn.download('''select 
                    CASE 
    WHEN in_flight_order_amount < 50 THEN '01. 0-50'
    WHEN in_flight_order_amount < 100 THEN '02. 50-100'
    WHEN in_flight_order_amount < 150 THEN '03. 100-150'
    WHEN in_flight_order_amount < 200 THEN '04. 150-200'
    WHEN in_flight_order_amount < 250 THEN '05. 200-250'
    WHEN in_flight_order_amount < 300 THEN '06. 250-300'
    ELSE '07. 300+' end as in_flight_order_bin,
    
     sum(case when is_in_attempt_control_group = 1 then p2_overdue_d0_local end) as p2_overdue,
                sum(case when is_in_attempt_control_group = 1 then p2_due_local end) as p2_due,
                div0(sum(case when is_in_attempt_control_group =1 then p2_overdue_d0_local end), sum(case when is_in_attempt_control_group =1 then p2_due_local end)) as ctrl_grp_p2_d0_rate,

                sum(case when order_token_loss is not null then order_amount_local end) as gpv,
                count(order_token) as token_ct,

                count(distinct(case when is_in_attempt_control_group=1 then order_token end)) as ctrl_token_ct,
            sum(p2_overdue_d0_local ) as portfolio_p2_overdue,
            sum(p2_due_local ) as portfolio_p2_due,
            div0(sum( p2_overdue_d0_local ), sum(p2_due_local )) as portfolio_p2_d0_rate,
        from tesco  
        where order_token_loss is not null
        group by 1
        order by 1;                
                    ''')

In [0]:
b.sort_values(by=['in_flight_order_bin']) #looks like 150 is a good order bin for tesco

In [0]:
# Currys orders with a non-null order_token_loss, cross-tabulated by order-amount bin
# (50-wide steps up to 300) and by the consumer's active order number bin.
# NOTE(review): the active-order BETWEEN ranges share their edges (2 and 6 appear in two
# ranges each). CASE takes the first match, so 2 lands in '01. 0-2' and 6 in '02. 2-6';
# the labels overstate where the later bins start. Values matching no range, including
# NULL, fall into '10+', which also lacks the numeric prefix the other labels have.
# NOTE(review): the alias is written `active_ordeR_number_bin`; the following cell sorts by
# 'active_order_number_bin', which presumably relies on lower-cased column names — confirm.
c = conn.download('''select 
                    CASE 
    WHEN in_flight_order_amount < 50 THEN '01. 0-50'
    WHEN in_flight_order_amount < 100 THEN '02. 50-100'
    WHEN in_flight_order_amount < 150 THEN '03. 100-150'
    WHEN in_flight_order_amount < 200 THEN '04. 150-200'
    WHEN in_flight_order_amount < 250 THEN '05. 200-250'
    WHEN in_flight_order_amount < 300 THEN '06. 250-300'
    ELSE '07. 300+' end as in_flight_order_bin,
    case when consumer_Active_order_number between 0 and 2 then '01. 0-2'
         when consumer_Active_order_number between 2 and 6 then '02. 2-6'
         when consumer_Active_order_number between 6 and 10 then '03. 6-10'
         else '10+' end as active_ordeR_number_bin,
        
     sum(case when is_in_attempt_control_group = 1 then p2_overdue_d0_local end) as p2_overdue,
                sum(case when is_in_attempt_control_group = 1 then p2_due_local end) as p2_due,
                div0(sum(case when is_in_attempt_control_group =1 then p2_overdue_d0_local end), sum(case when is_in_attempt_control_group =1 then p2_due_local end)) as ctrl_grp_p2_d0_rate,

                sum(case when order_token_loss is not null then order_amount_local end) as gpv,
                count(order_token) as token_ct,

                count(distinct(case when is_in_attempt_control_group=1 then order_token end)) as ctrl_token_ct,
            sum(p2_overdue_d0_local ) as portfolio_p2_overdue,
            sum(p2_due_local ) as portfolio_p2_due,
            div0(sum( p2_overdue_d0_local ), sum(p2_due_local )) as portfolio_p2_d0_rate,
        from currys  
        where order_token_loss is not null
        group by 1,2
        order by 1,2;                
                    ''')

In [0]:
c.sort_values(by=['in_flight_order_bin', 'active_order_number_bin']) #nothing for currys

In [0]:
e = conn.download('''select 
    consumer_account_linking_type,
     sum(case when is_in_attempt_control_group = 1 then p2_overdue_d0_local end) as p2_overdue,
                sum(case when is_in_attempt_control_group = 1 then p2_due_local end) as p2_due,
                div0(sum(case when is_in_attempt_control_group =1 then p2_overdue_d0_local end), sum(case when is_in_attempt_control_group =1 then p2_due_local end)) as ctrl_grp_p2_d0_rate,

                sum(case when order_token_loss is not null then order_amount_local end) as gpv,
                count(order_token) as token_ct,

                count(distinct(case when is_in_attempt_control_group=1 then order_token end)) as ctrl_token_ct,
            sum(p2_overdue_d0_local ) as portfolio_p2_overdue,
            sum(p2_due_local ) as portfolio_p2_due,
            div0(sum( p2_overdue_d0_local ), sum(p2_due_local )) as portfolio_p2_d0_rate,
        from currys  
        where order_token_loss is not null
        group by 1
        order by 1;                
                    ''')

In [0]:
e

In [0]:
# what features do i think could be important here
df_list = []
feat_list=['IDV_CHECK_RESULT', 'ekata_record_phone_Valid', 'ekata_record_phone_line_type']
for var in feat_list:
    time_query = f'''select 
            {var},
            sum(case when is_in_attempt_control_group = 1 then p2_overdue_d0_local end) as p2_overdue,
                sum(case when is_in_attempt_control_group = 1 then p2_due_local end) as p2_due,
                div0(sum(case when is_in_attempt_control_group =1 then p2_overdue_d0_local end), sum(case when is_in_attempt_control_group =1 then p2_due_local end)) as ctrl_grp_p2_d0_rate,

                sum(case when order_token_loss is not null then order_amount_local end) as gpv,
                count(order_token) as token_ct,

                count(distinct(case when is_in_attempt_control_group=1 then order_token end)) as ctrl_token_ct,
            sum(p2_overdue_d0_local ) as portfolio_p2_overdue,
            sum(p2_due_local ) as portfolio_p2_due,
            div0(sum( p2_overdue_d0_local ), sum(p2_due_local )) as portfolio_p2_d0_rate,
        from currys  
        where order_token_loss is not null
        group by 1
        order by 1;
        '''
    print(time_query)
    df_list.append(conn.download(time_query))
        
    

In [0]:
df_list[0]

In [0]:
df_list[1]

In [0]:
df_list[2]

In [0]:
12269.700/(12269.700+22648.801)

In [0]:
872/(1726+872)

In [0]:
def _parse_threshold(raw_value):
    """Return raw_value as an int, tolerating a leading comparison operator such as '<=14'."""
    if isinstance(raw_value, int):
        return int(raw_value)
    text = str(raw_value).strip().lstrip('<>=! ').strip()
    number = float(text)  # raises ValueError for non-numeric text
    if not number.is_integer():
        raise ValueError(f"threshold must be a whole number, got {raw_value!r}")
    return int(number)


def generate_case_when(column_name, thresholds_dict):
    """
    Generate a SQL CASE WHEN statement that bins an integer column.

    Each non-'=' entry contributes an inclusive upper edge. Consecutive edges form
    the ranges ``previous_edge + 1 .. edge``, and everything else falls into a
    final ``>last_edge`` bucket.

    Args:
        column_name (str): The name of the column to use in the CASE statement
        thresholds_dict (dict): Operator -> threshold. The '=' key (optional) is
            emitted as its own exact-match bin and the first range starts right
            after it. Any other key only marks its value as a range edge; the
            operator itself is not interpreted ('<' and '<=' behave the same).
            Thresholds are expected to be whole numbers above the '=' value.

    Returns:
        str: Formatted CASE WHEN statement (no trailing alias)

    Raises:
        ValueError: if a threshold is not a whole number
    """
    thresholds = []
    exact_match = None

    for op, val in thresholds_dict.items():
        if op == '=':
            exact_match = val
        else:
            # Parse numerically: filtering digits turned 14.0 into 140 and -5 into 5.
            thresholds.append(_parse_threshold(val))

    thresholds.sort()

    case_stmt = ["CASE"]

    if exact_match is not None:
        case_stmt.append(f"    WHEN {column_name} = {exact_match} THEN '{exact_match}'")

    previous_edge = exact_match
    for end in thresholds:
        if previous_edge is None:
            # No exact-match bin: leave the first bin open-ended so that 0 and
            # negative values are not pushed into the '>last' bucket.
            case_stmt.append(f"    WHEN {column_name} <= {end} THEN '<={end}'")
        else:
            start = previous_edge + 1
            case_stmt.append(f"    WHEN {column_name} BETWEEN {start} AND {end} THEN '{start}-{end}'")
        previous_edge = end

    # NOTE(review): NULL column values also match this ELSE and get the '>last' label.
    if thresholds:
        case_stmt.append(f"    ELSE '>{thresholds[-1]}'")

    case_stmt.append("END")

    return '\n'.join(case_stmt)
test_dict = {'=': 0, '<=': 14, '<': 30, '>': 365}
generate_case_when('whitepages_primary_email_address_checks_email_first_seen_days', test_dict)

In [0]:
d  = conn.download('''select 
   CASE\n    WHEN whitepages_primary_email_address_checks_email_first_seen_days = 0 THEN '0'\n    WHEN whitepages_primary_email_address_checks_email_first_seen_days BETWEEN 1 AND 14 THEN '1-14'\n    WHEN whitepages_primary_email_address_checks_email_first_seen_days BETWEEN 15 AND 30 THEN '15-30'\n    WHEN whitepages_primary_email_address_checks_email_first_seen_days BETWEEN 31 AND 365 THEN '31-365'\n    ELSE '>365'\nEND as whitepages_bin,
   
     sum(case when is_in_attempt_control_group = 1 then p2_overdue_d0_local end) as p2_overdue,
                sum(case when is_in_attempt_control_group = 1 then p2_due_local end) as p2_due,
                div0(sum(case when is_in_attempt_control_group =1 then p2_overdue_d0_local end), sum(case when is_in_attempt_control_group =1 then p2_due_local end)) as ctrl_grp_p2_d0_rate,

                sum(case when order_token_loss is not null then order_amount_local end) as gpv,
                count(order_token) as token_ct,

                count(distinct(case when is_in_attempt_control_group=1 then order_token end)) as ctrl_token_ct,
            sum(p2_overdue_d0_local ) as portfolio_p2_overdue,
            sum(p2_due_local ) as portfolio_p2_due,
            div0(sum( p2_overdue_d0_local ), sum(p2_due_local )) as portfolio_p2_d0_rate,
        from currys  
        where order_token_loss is not null
        group by 1
        order by 1;                
                    ''')

In [0]:
d

In [0]:
def create_decile_bins(df, column_name):
    """
    Creates a SQL CASE expression that bins a numeric column by its deciles.

    The 10th..100th percentiles of ``df[column_name]`` are computed and rounded to
    3 decimals. The 10th..90th percentile values are the bin edges; the 100th is
    only printed for reference. Labels are prefixed ``(0)``..``(9)``. Rows where
    the column is NULL match no WHEN branch, so their bin is NULL.

    Args:
        df: pandas DataFrame holding the sample the cut points are derived from
        column_name: name of the numeric column; also used verbatim as the SQL
            column reference and as the prefix of the ``<column_name>_bin`` alias

    Returns:
        str: SQL CASE statement for binning, ending in ``END AS <column_name>_bin``

    Raises:
        ValueError: if the deciles cannot be computed (empty or all-null column)
    """
    # Deciles from 10% to 100% in steps of 10
    deciles = df[column_name].quantile([i/10 for i in range(1, 11)]).round(3)
    if deciles.isna().any():
        # Otherwise the literal text 'nan' would be written into the SQL.
        raise ValueError(f"cannot compute deciles for {column_name!r}: no non-null values")

    # Positional list of the 10 values; avoids looking up float index labels.
    edges = deciles.tolist()

    case_stmt = "CASE\n"

    # First bin (up to and including the 10th percentile)
    case_stmt += f"  WHEN {column_name} <= {edges[0]} THEN '(0) <={edges[0]}'\n"

    # Middle bins (10th to 90th percentile), each open below and closed above
    for i in range(1, 9):
        lower = edges[i - 1]
        upper = edges[i]
        case_stmt += f"  WHEN {column_name} > {lower} AND {column_name} <= {upper} THEN '({i}) {lower}-{upper}'\n"

    # Last bin (above the 90th percentile)
    case_stmt += f"  WHEN {column_name} > {edges[8]} THEN '(9) >{edges[8]}'\n"

    case_stmt += f"END AS {column_name}_bin"

    # Print decile values for reference
    print("Decile values:")
    for i, value in enumerate(edges, 1):
        print(f"{i*10}th percentile: {value}")

    return case_stmt

In [0]:
# Pull every row and column of the SUP_attempts temp table into pandas; the following
# cell passes this frame to create_decile_bins to derive the decile cut points.
sup_Attempts = conn.download('select * from SUP_attempts')

In [0]:
# Model-score columns, written upper-case here and lower-cased to match the
# column names of the downloaded frame.
add_cols = list(map(str.lower, [
    'BP_C_BATCH_CONSUMER_BATCH_MODEL_V1',
    'MODEL_ONLINE_OD_ABUSIVE_GLOBAL_APRIL_2024_SCORE',
    'MODEL_ONLINE_OD_GLOBAL_PAYBACK_SUP_MODEL_AUGUST_2023_SCORE',
    'MODEL_ONLINE_OD_PAYBACK_NON_US_APRIL_2024_SCORE',
]))

# Whitepages scores first, then the model scores.
numeric_cols = [
    'whitepages_identity_check_score',
    'whitepages_identity_network_score',
] + add_cols

# '<column>_bin' -> decile CASE expression built from the SUP_attempts sample.
numeric_case_when_dict = {}
for col in numeric_cols:
    numeric_case_when_dict[f'{col}_bin'] = create_decile_bins(sup_Attempts, col)

In [0]:
# Materialise `final_driver`: every currys column plus one decile-bin column per entry
# of numeric_case_when_dict. categorical_cols collects the bin column names for the
# screening loops further down.
# NOTE(review): unlike the other tables in this notebook, this statement has no `temp`
# keyword, so it creates a regular table in the current database/schema — confirm intended.
# NOTE(review): the decile cut points were computed on all of SUP_attempts but are
# applied here to the currys subset only.
categorical_cols = []
query = f''' create or replace  table final_driver as (select 
    
a.*'''

# Append each CASE expression (already aliased `<col>_bin`) to the select list.
for  key, case_when in numeric_case_when_dict.items():
    query += f', {case_when}'
    categorical_cols.append(key)

query+=" from currys a);"
# Echo the generated SQL before running it.
print(query)
conn.execute(query)

In [0]:
 return_dict = {}
 for column in categorical_cols:
     print(f'working on {column}')
     output = conn.download(f'''  select 
 {column},
 
    sum(case when is_in_attempt_control_group = 1 then p2_overdue_d0_local end) as p2_overdue,
                sum(case when is_in_attempt_control_group = 1 then p2_due_local end) as p2_due,
                div0(sum(case when is_in_attempt_control_group =1 then p2_overdue_d0_local end), sum(case when is_in_attempt_control_group =1 then p2_due_local end)) as ctrl_grp_p2_d0_rate,

                sum(case when order_token_loss is not null then order_amount_local end) as gpv,
                count(order_token) as token_ct,

                count(distinct(case when is_in_attempt_control_group=1 then order_token end)) as ctrl_token_ct,
            sum(p2_overdue_d0_local ) as portfolio_p2_overdue,
            sum(p2_due_local ) as portfolio_p2_due,
            div0(sum( p2_overdue_d0_local ), sum(p2_due_local )) as portfolio_p2_d0_rate,
    
from final_driver a 

where 1=1
-- and is_in_attempt_control_group = 1 

group by 1
order by 1
;
''')
     if output['ctrl_grp_p2_d0_rate'].max() > .35:
         return_dict[column] = output
         print(f'{column} had a segment >= .35 p2d0')
         

In [0]:
return_dict['whitepages_identity_check_score_bin'].sort_values(by='whitepages_identity_check_score_bin')

In [0]:
return_dict['whitepages_identity_check_score_bin']['token_ct'].sum()

In [0]:
()

In [0]:
(1691+1013+271+34)/(1354+1698+1940+2187+2054+1795+1691+1013+271+34) #44% of transactions jfc 

In [0]:
return_dict['whitepages_identity_network_score_bin'].sort_values(by='whitepages_identity_network_score_bin')

In [0]:
return_dict['bp_c_batch_consumer_batch_model_v1_bin'].sort_values(by='bp_c_batch_consumer_batch_model_v1_bin') #plausible

In [0]:
token_denom = return_dict['bp_c_batch_consumer_batch_model_v1_bin'].token_ct.sum()

In [0]:
ctrl_denom = return_dict['bp_c_batch_consumer_batch_model_v1_bin'].p2_overdue.sum()

In [0]:
(151.8+255.2+375.8+130+236)/ctrl_denom

In [0]:
(255.2+375.8+130+236)/ctrl_denom

In [0]:
(1388+1440+1689+2053+2845)/token_denom
(255.2+375.8+130+236)/ctrl_denom

In [0]:
(1440+1689+2053+2845)/token_denom


# Trying again with the consumer model


In [0]:
 return_dict = {}
 for column in categorical_cols:
     print(f'working on {column}')
     output = conn.download(f'''  select 
 {column},
 
    sum(case when is_in_attempt_control_group = 1 then p2_overdue_d0_local end) as p2_overdue,
                sum(case when is_in_attempt_control_group = 1 then p2_due_local end) as p2_due,
                div0(sum(case when is_in_attempt_control_group =1 then p2_overdue_d0_local end), sum(case when is_in_attempt_control_group =1 then p2_due_local end)) as ctrl_grp_p2_d0_rate,

                sum(case when order_token_loss is not null then order_amount_local end) as gpv,
                count(order_token) as token_ct,

                count(distinct(case when is_in_attempt_control_group=1 then order_token end)) as ctrl_token_ct,
            sum(p2_overdue_d0_local ) as portfolio_p2_overdue,
            sum(p2_due_local ) as portfolio_p2_due,
            div0(sum( p2_overdue_d0_local ), sum(p2_due_local )) as portfolio_p2_d0_rate,
    
from final_driver a 

where 1=1
-- and is_in_attempt_control_group = 1 
-- first segment to use
and bp_c_batch_consumer_batch_model_v1 >= 650
--second segment to use 
and model_online_od_payback_non_us_april_2024_score >= 455
--
group by 1
order by 1
;
''')
     if output['ctrl_grp_p2_d0_rate'].max() > .35:
         return_dict[column] = output
         print(f'{column} had a segment >= .35 p2d0')
         

In [0]:
return_dict['model_online_od_global_payback_sup_model_august_2023_score_bin'].sort_values(by='model_online_od_global_payback_sup_model_august_2023_score_bin')

In [0]:
return_dict['model_online_od_abusive_global_april_2024_score_bin'].sort_values(by='model_online_od_abusive_global_april_2024_score_bin')

In [0]:
return_dict['model_online_od_payback_non_us_april_2024_score_bin'].sort_values(by='model_online_od_payback_non_us_april_2024_score_bin')

In [0]:
(1212+1843+2612)/token_denom


In [0]:
(411.490+181.5+130+126.74)/ctrl_denom

In [0]:
return_dict['whitepages_identity_network_score_bin'].sort_values(by='whitepages_identity_network_score_bin')

In [0]:
return_dict['whitepages_identity_check_score_bin'].sort_values(by='whitepages_identity_check_score_bin')

In [0]:
return_dict['model_online_od_global_payback_sup_model_august_2023_score_bin'].sort_values(by='model_online_od_global_payback_sup_model_august_2023_score_bin') #plausible

In [0]:
return_dict['model_online_od_global_payback_sup_model_august_2023_score_bin'].token_ct.sum()

In [0]:
return_dict['model_online_od_global_payback_sup_model_august_2023_score_bin'].portfolio_p2_overdue.sum()

In [0]:
(2693.100+2579.690+2060.630+1235.980+1135.100)/34963.5 #how is this possible, how does 44% of transactions only eliminate 27.7% of p2d0 

In [0]:
(818+925+1132+1369+1978)/14037 #44% of transactions jfc 


# repeating the above but with a filter on whitepages identity score

In [0]:
 return_dict = {}
 for column in categorical_cols:
     print(f'working on {column}')
     output = conn.download(f'''  select 
 {column},
 
    sum(case when is_in_attempt_control_group = 1 then p2_overdue_d0_local end) as p2_overdue,
                sum(case when is_in_attempt_control_group = 1 then p2_due_local end) as p2_due,
                div0(sum(case when is_in_attempt_control_group =1 then p2_overdue_d0_local end), sum(case when is_in_attempt_control_group =1 then p2_due_local end)) as ctrl_grp_p2_d0_rate,

                sum(case when order_token_loss is not null then order_amount_local end) as gpv,
                count(order_token) as token_ct,

                count(distinct(case when is_in_attempt_control_group=1 then order_token end)) as ctrl_token_ct,
            sum(p2_overdue_d0_local ) as portfolio_p2_overdue,
            sum(p2_due_local ) as portfolio_p2_due,
            div0(sum( p2_overdue_d0_local ), sum(p2_due_local )) as portfolio_p2_d0_rate,
    
from final_driver a 

where 1=1
-- and is_in_attempt_control_group = 1 
    and whitepages_identity_check_score >= 255
group by 1
order by 1
;
''')
     if output['ctrl_grp_p2_d0_rate'].max() > .35:
         return_dict[column] = output
         print(f'{column} had a segment >= .35 p2d0')
         

In [0]:
return_dict['bp_c_batch_consumer_batch_model_v1_bin'].sort_values(by='bp_c_batch_consumer_batch_model_v1_bin') #plausible

In [0]:
test = conn.download('select * from currys where p2_overdue_d0_local > 0')

In [0]:
test

In [0]:
list_A = ["400538723", "400570637", "400575272", "400568202", "400504709", "400535118", "400551784", "400527714", "400515891", "400546925", "400527604", "400562359", "400474538", "400571508", "400515894", "400569331", "400572942", "400547364", "400474394", "400527759", "400575150", "400562345", "400570636", "400527737", "400527047", "400474401", "400475455", "400527639", "400527593", "400517492", "400574444", "400461110", "400488351", "400507556", "400527756", "400569337", "400527058", "400570624", "400510052", "400510042", "400515896", "400575275", "400527704", "400514646", "400527833", "400522690", "400527601", "400562315", "400579004", "400527066", "400507557", "400509583", "400507558"]

list_B = ["400538723", "400570637", "400575272", "400568202", "400504709", "400535118", "400551784", "400527714", "400515891", "400546925", "400527604", "400562359", "400474538", "400571508", "400515894", "400569331", "400572942", "400547364", "400474394", "400527759", "400575150", "400562345", "400570636", "400527737", "400527047", "400474401", "400475455", "400527639", "400527593", "400517492", "400574444", "400461110", "400488351", "400507556", "400527756", "400569337", "400527058", "400570624", "400510052", "400510042", "400515896", "400575275"]
print(len(list_A), len(list_B))