# Data Monitoring Automation

This notebook documents the code and progress as I work my way towards setting up an automated process to monitor the performance of the fraud model(s), across partners and products. 

In [2]:
# Library Imports

import pandas as pd
import numpy as np
import trellis
import os
from avant_python_utils.email import send_email
from datalaketools.connectors.presto_db import PrestoDB
presto = PrestoDB()
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, f1_score,recall_score,precision_score, average_precision_score
from datetime import date, timedelta, datetime

## Constants with Column Names

Lots of columns that are used in multiple functions throughout this document. I'll define those columns here so I won't have to change them in multiple places later if there's a change

In [3]:
# TODO - Define these as global variables (store them in a config.py file)

SCORE_COL = 'score_5'
YPRED_COL = 'prediction'
YTRUE_COL = 'suspected_fraud'
TIME_COL = 'loan_processing_start_time'
AMOUNT_COL = 'loan_amount'
THRESHOLD = 0.05
LOAN_WINDOW = 'week'
MODEL_START_DATE = '2018-09-15'





## Base Data Creation

In [4]:
#trellis.start()
# fraud = trellis.connect('us_fraud_follower')
#parent_dir_path = os.path.dirname(os.path.abspath(__file__)) - REMOVE COMMENT IN PYTHON SCRIPT
parent_dir_path = os.getcwd()
subject = 'Avant Model Monitor Weekly Report (Data Only)'
credentials = {'username': trellis.keys('automate_email')['email'], 'password': trellis.keys('automate_email')['pw']}


In [5]:
# TD prod connection
td_connector = trellis.connect('td_prod.follower')

Query to obtain data at a loan ID level. 

In [61]:
df_raw = presto.execute_df('''
SELECT
  l.id as loan_id
, l.loan_processing_start_time
, date_trunc('{LOAN_WINDOW}', l.loan_processing_start_time) as entered_lp_week
, l.status
, case when l.status in ('current','late','paid_off','charged_off') then 1 else 0 end as issued
, case when c.high_confidence_fraud_indicator=true or cfl.id is not null then 1 else 0 end as high_confidence_fraud_indicator
, case when cfr.customer_id is not null then 1 else 0 end as suspected_fraud 
--, cfrt.name as fraud_reason
, cast(fd.score_4 as double) as score_4
, cast(fd.score_5 as double) as score_5
, coalesce(cast(fd.score_5 as double), cast(fd.score_4 as double)) as hard_score
, l.state
, l.payment_method
, l.loan_amount
, ca.product_type
, vrdt.risk_summary_identity_high
, vrdt.risk_summary_identity_medium
, vrdt.risk_summary_identity_low
FROM avant.dw.customer_applications ca
LEFT JOIN avant.dw.loans l on l.customer_application_id = ca.id
JOIN avant.dw.customers c
  ON c.id = l.customer_id
  
  -- getting dependent variable
  
LEFT JOIN (
select customer_id 
from avant.avant_basic.customer_fraud_reasons cfr 
group by 1
) cfr on c.id = cfr.customer_id
  
 -- LEFT JOIN avant.avant_basic.customer_fraud_reason_types cfrt on cfr.customer_fraud_reason_type_id = cfrt.id
  
  -- getting fraud scores
LEFT JOIN (
  SELECT
    l.id as loan_id
  , json_extract_scalar(fd.model_scores, '$["fraud/en-US/4.1.0"]["score"]') as score_4
  , json_extract_scalar(fd.model_scores, '$["fraud/en-US/5.0.0"]["score"]') as score_5
  , fd.id as fraud_decision_id
  , row_number() over (partition by l.id order by fd.created_at desc) as row_num
  FROM avant.dw.loans l
  JOIN avant.avant_basic.fraud_decisions fd
    ON fd.customer_application_id = l.customer_application_id
   -- AND fd.created_at AT TIME ZONE 'America/Chicago' >= l.loan_processing_start_time
WHERE l.loan_processing_start_time > date '{START_DATE}'
) fd 
  ON fd.loan_id = l.id 
  AND fd.row_num=1
  -- getting fraud indicator
LEFT JOIN avant.avant_basic.confirmed_fraud_logs cfl 
  ON cfl.customer_id = c.id
  
    -- filtering for valid loans to evaluate performance on
  -- JOIN avant.dw.loan_performance_by_installment lp 
  -- ON lp.loan_id = l.id 
  -- AND lp.installment_number = 1
  -- AND lp.installment_date <= date_add('day', -64, current_timestamp)

  
  -- adding identity tier a loan was assigned to and fraud_review flag
  LEFT JOIN avant.dw_temp_newver.verifications_risks_decisions_test vrdt
  on ca.id = vrdt.customer_application_id and vrdt.row_num_recent = 1
  
  
WHERE l.loan_processing_start_time > date '{START_DATE}'
'''.format(LOAN_WINDOW = loan_window, START_DATE = MODEL_START_DATE), , td_connector)


In [63]:
#Remove records with no fraud score
df = df_raw[df_raw.score_5.notnull()]

In [64]:
max(df['loan_processing_start_time'])

'2020-08-26 10:45:39.325'

In [7]:
df[YPRED_COL] = np.where(df[SCORE_COL] > THRESHOLD, 1, 0)
#df['prediction'] = [1 if x > 0.05 else 0 for x in df['score_5']]
#df['prediction'] = list(np.where(df['score_5'] > 0.05, 1, 0)) 
# All three solutions raise the settingwithcopywarining

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


## Model Evaluation Pipeline

In [8]:
def weeklyEvaluator(dframe, ytrue = YTRUE_COL, ypred = YPRED_COL, scores = SCORE_COL):
    true_positives = dframe[ytrue] * dframe[ypred]
    false_positives = (1-dframe[ytrue]) * dframe[ypred]
    false_negatives =  dframe[ytrue] * (1-dframe[ypred])
    
    #calculating multiple metrics
    precision = precision_score(y_true = dframe[ytrue], y_pred = dframe[ypred], pos_label = 1)
    recall = recall_score(y_true = dframe[ytrue], y_pred = dframe[ypred], pos_label = 1)
    f1score = f1_score(y_true = dframe[ytrue], y_pred = dframe[ypred], pos_label = 1)
    auc_pr = average_precision_score(y_true = dframe[ytrue], y_score = dframe[scores], pos_label=1)
    auc_roc = roc_auc_score(y_true = dframe[ytrue], y_score = dframe[scores])
    fraud_rate = dframe[ytrue].sum()/len(dframe.index)
    avg_score = dframe[scores].sum()/len(dframe.index)
    
    
    
    return pd.Series({'precision': precision, 'recall': recall, 'f1score': f1score, 'auc_pr':auc_pr, 'auc_roc':auc_roc,
                     'fraud_rate': fraud_rate, 'avg_score': avg_score})

In [9]:
byWeek_stats = df.groupby('entered_lp_week', as_index = False).apply(weeklyEvaluator)

## Connecting to Google Sheets

In [10]:
#Importing the module
import gspread
from df2gspread import df2gspread as d2g
from oauth2client.service_account import ServiceAccountCredentials

In [49]:
#The scope is always look like this so we did not need to change anything
scope = [
   'https://spreadsheets.google.com/feeds',
         'https://www.googleapis.com/auth/drive']
#Name of our Service Account Key
google_key_file = 'service_key.json'
credentials = ServiceAccountCredentials.from_json_keyfile_name(google_key_file, scope)
gc = gspread.authorize(credentials)

In [54]:
#This is the Worksheet ID
wb = gc.open_by_key('14ROlpuOP9IkixM5-nn1Pc0ux6kWgmj7c62NzdDl-5hU')
#This is the sheet name
wks_name = 'Raw Data'
ws = wb.worksheet(wks_name)

#clear the existing data in worksheet
ws.clear()

#update new data to worksheet (first list out columns, and then add values for each column)
ws.update([byWeek_stats.columns.values.tolist()] + byWeek_stats.values.tolist())

{'spreadsheetId': '14ROlpuOP9IkixM5-nn1Pc0ux6kWgmj7c62NzdDl-5hU',
 'updatedRange': "'Raw Data'!A1:H104",
 'updatedRows': 104,
 'updatedColumns': 8,
 'updatedCells': 832}

In [55]:
ws.format("A2:A1000", { "numberFormat": { "type": ('DATE') }})

{'spreadsheetId': '14ROlpuOP9IkixM5-nn1Pc0ux6kWgmj7c62NzdDl-5hU',
 'replies': [{}]}

## Filling in Table Values

For each metric, I need the following 

- Value since the date model was trained
- Compare last 30 days to previous 30 days (with scope for excluding the last 2 months)
- Absolute difference between value when model was trained, and current value

In [48]:
datetime.strptime(a, "%Y-%m-%d") + timedelta(days = 30)

datetime.datetime(2018, 10, 15, 0, 0)

In [27]:
test = df.query('loan_processing_start_time > "2019-09-15" & loan_processing_start_time < "2019-10-30"')

In [52]:
today_date

'2020-10-05'

In [58]:
today_date = date.today().strftime("%Y-%m-%d")
prev30_date = (date.today() - timedelta(days = 30)).strftime("%Y-%m-%d")
prev60_date = (date.today() - timedelta(days = 60)).strftime("%Y-%m-%d")
#a = datetime.strptime(MODEL_START_DATE, "%Y-%m-%d")
#b = (a + timedelta(days = 30)).strftime("%Y-%m-%d")

#creating different datasets for the different time periods

#dataset 1 - 30 days after model was trained


#test = df.query('{0} > @MODEL_START_DATE & {0} < @b'.format(TIME_COL))
data_last30 = df.query('loan_processing_start_time > @prev30_date & loan_processing_start_time < @today_date')
#data_prev30 = dframe.query('@timecol > @prev60_date & @timecol < @prev30_date')   

In [59]:
data_last30

Unnamed: 0,loan_id,loan_processing_start_time,entered_lp_week,status,issued,high_confidence_fraud_indicator,suspected_fraud,score_4,score_5,hard_score,state,payment_method,loan_amount,product_type,risk_summary_identity_high,risk_summary_identity_medium,risk_summary_identity_low


In [49]:
average_precision_score(y_true = data_last30[YTRUE_COL], y_score = data_last30[SCORE_COL], pos_label = 1)

IndexError: cannot do a non-empty take from an empty axes.

In [43]:
test = values_for_cells(df)

IndexError: cannot do a non-empty take from an empty axes.

In [42]:
#function to return a metric for a specified time period

def values_for_cells(dframe, ytrue = YTRUE_COL, ypred = YPRED_COL, scores = SCORE_COL, timecol = TIME_COL, amount = AMOUNT_COL):
   
    #Setting up variables with different date values
    today_date = date.today().strftime("%Y-%m-%d")
    prev30_date = (date.today() - timedelta(days = 30)).strftime("%Y-%m-%d")
    prev60_date = (date.today() - timedelta(days = 60)).strftime("%Y-%m-%d")
    modeltrain_date_start = datetime.strptime(MODEL_START_DATE, "%Y-%m-%d")
    modeltrain_date_end = (modeltrain_date_start + timedelta(days = 30)).strftime("%Y-%m-%d")
    
    #creating different datasets for the different time periods
    
    #dataset 1 - 30 days after model was trained

    
    data_first30 = dframe.query('{0} > @MODEL_START_DATE & {0} < @modeltrain_date_end'.format(TIME_COL))
    data_last30 = dframe.query('{0} > @prev30_date & {0} < @today_date'.format(TIME_COL))
    data_prev30 = dframe.query('{0} > @prev60_date & {0} < @prev30_date'.format(TIME_COL))   

    #PRECISION
    precision_current = precision_score(y_true = data_last30[ytrue], y_pred = data_last30[ypred], pos_label = 1)
    precision_initial = precision_score(y_true = data_first30[ytrue], y_pred = data_first30[ypred], pos_label = 1)
    precision_prev30 = precision_score(y_true = data_prev30[ytrue], y_pred = data_prev30[ypred], pos_label = 1)

    #recall values
    recall_current = recall_score(y_true = data_last30[ytrue], y_pred = data_last30[ypred], pos_label = 1)
    recall_initial = recall_score(y_true = data_first30[ytrue], y_pred = data_first30[ypred], pos_label = 1)
    recall_prev30 = recall_score(y_true = data_prev30[ytrue], y_pred = data_prev30[ypred], pos_label = 1) 

    #F1 score
    f1_current = f1_score(y_true = data_last30[ytrue], y_pred = data_last30[ypred], pos_label = 1)
    f1_initial = f1_score(y_true = data_first30[ytrue], y_pred = data_first30[ypred], pos_label = 1)
    f1_prev30 = f1_score(y_true = data_prev30[ytrue], y_pred = data_prev30[ypred], pos_label = 1) 

    #auc pr
    aucpr_current = average_precision_score(y_true = data_last30[ytrue], y_score = data_last30[scores], pos_label = 1)
    aucpr_initial = average_precision_score(y_true = data_first30[ytrue], y_score = data_first30[scores], pos_label = 1)
    aucpr_prev30 = average_precision_score(y_true = data_prev30[ytrue], y_score = data_prev30[scores], pos_label = 1) 

    #auc roc
    aucroc_current = roc_auc_score(y_true = data_last30[ytrue], y_score = data_last30[scores], pos_label = 1)
    aucroc_initial = roc_auc_score(y_true = data_first30[ytrue], y_score = data_first30[scores], pos_label = 1)
    aucroc_prev30 = roc_auc_score(y_true = data_prev30[ytrue], y_score = data_prev30[scores], pos_label = 1) 

    #TODO - Confirm fraud rate definition
    #fraud rate
    fraudrate_current = data_last30[ytrue].sum()/len(data_last30.index)
    fraudrate_initial = data_first30[ytrue].sum()/len(data_first30.index)
    fraudrate_prev30 = data_prev30[ytrue].sum()/len(data_prev30.index)
    
    #avg score
    avgscore_current = data_last30[scores].sum()/len(data_last30.index)
    avgscore_initial = data_first30[scores].sum()/len(data_first30.index)
    avgscore_prev30 = data_prev30[scores].sum()/len(data_prev30.index)


    #TODO - Confirm fraud missed definition
    #fraud rate with dollar values
    fraudrate_dollar_current = (data_last30[amount]*data_last30[YTRUE_COL]).sum()/data_last30[amount].sum()
    fraudrate_dollar_initial = (data_first30[amount]*data_first30[YTRUE_COL]).sum()/data_first30[amount].sum()
    fraudrate_dollar_prev30 = (data_prev30[amount]*data_prev30[YTRUE_COL]).sum()/data_prev30[amount].sum()

    #$ value of fraud missed
    fraudmissed_dollar_current = data_last30[scores].sum()/len(data_last30.index)
    fraudmissed_dollar_initial = data_first30[scores].sum()/len(data_first30.index)
    fraudmissed_dollar_prev30 = data_prev30[scores].sum()/len(data_prev30.index)

    output = {"metric": ['precision', 'recall','f1score', 'auc_pr', 'auc_roc', 'fraudrate', 'avg_score', 'fraudrate_dollar', 'fraudmissed_dollar'],
             "current_values":[precision_current, recall_current, f1_current, aucpr_current, aucroc_current, fraudrate_current, avgscore_current, fraudrate_dollar_current, fraudmissed_dollar_current],
             "initial_values":[precision_initial, recall_initial, f1_initial, aucpr_initial, aucroc_initial, fraudrate_initial, avgscore_initial, fraudrate_dollar_initial, fraudmissed_dollar_initial],
             "prev30_values":[precision_prev30, recall_prev30, f1_prev30, aucpr_prev30, aucroc_prev30, fraudrate_prev30, avgscore_prev30, fraudrate_dollar_prev30, fraudmissed_dollar_prev30]}    
        

    
        

'02/08/2020'

In [33]:
Style 1

'2020-09-01'

In [1]:
#Style 1

d = {"metric":['recall', 'precision',''], "current_val":[0.5, 0.3]}

In [2]:
d

{'metric': ['recall', 'precision'], 'current_val': [0.5, 0.3]}

In [5]:
pd.DataFrame.from_dict(d)

Unnamed: 0,metric,current_val
0,recall,0.5
1,precision,0.3


In [6]:
# Style 2

d = {"recall":[0.5, 0.7, 1], "precision":[0.9,0.8,0.3]}

In [8]:
pd.DataFrame.from_dict(d)

Unnamed: 0,recall,precision
0,0.5,0.9
1,0.7,0.8
2,1.0,0.3
