# Data Monitoring Automation

This notebook documents the code and progress as I work my way towards setting up an automated process to monitor the performance of the fraud model(s), across partners and products. 

In [1]:
# Library Imports

import pandas as pd
import numpy as np
import trellis
import os
from avant_python_utils.email import send_email
from datalaketools.connectors.presto_db import PrestoDB
presto = PrestoDB()
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, f1_score,recall_score,precision_score, average_precision_score
###

## Base Data Creation

In [2]:
#trellis.start()
# fraud = trellis.connect('us_fraud_follower')
#parent_dir_path = os.path.dirname(os.path.abspath(__file__)) - REMOVE COMMENT IN PYTHON SCRIPT
parent_dir_path = os.getcwd()
subject = 'Avant Model Monitor Weekly Report (Data Only)'
credentials = {'username': trellis.keys('automate_email')['email'], 'password': trellis.keys('automate_email')['pw']}


In [3]:
#SQL query parameters
loan_window = 'week'

Query to obtain data at a loan ID level. 

In [4]:
df_raw = presto.execute_df('''
SELECT
  l.id as loan_id
, l.loan_processing_start_time
, date_trunc('{LOAN_WINDOW}', l.loan_processing_start_time) as entered_lp_week
, l.status
, case when l.status in ('current','late','paid_off','charged_off') then 1 else 0 end as issued
, case when c.high_confidence_fraud_indicator=true or cfl.id is not null then 1 else 0 end as high_confidence_fraud_indicator
, cast(fd.score_4 as double) as score_4
, cast(fd.score_5 as double) as score_5
, coalesce(cast(fd.score_5 as double), cast(fd.score_4 as double)) as hard_score
, l.state
, l.payment_method
, l.loan_amount
, ca.product_type
, vrdt.risk_summary_identity_high
, vrdt.risk_summary_identity_medium
, vrdt.risk_summary_identity_low
FROM avant.dw.customer_applications ca
LEFT JOIN avant.dw.loans l on l.customer_application_id = ca.id
JOIN avant.dw.customers c
  ON c.id = l.customer_id
  
  -- getting fraud scores
LEFT JOIN (
  SELECT
    l.id as loan_id
  , json_extract_scalar(fd.model_scores, '$["fraud/en-US/4.1.0"]["score"]') as score_4
  , json_extract_scalar(fd.model_scores, '$["fraud/en-US/5.0.0"]["score"]') as score_5
  , fd.id as fraud_decision_id
  , row_number() over (partition by l.id order by fd.created_at desc) as row_num
  FROM avant.dw.loans l
  JOIN avant.avant_basic.fraud_decisions fd
    ON fd.customer_application_id = l.customer_application_id
    AND fd.created_at AT TIME ZONE 'America/Chicago' >= l.loan_processing_start_time
WHERE l.loan_processing_start_time > date '2019-06-30'
) fd 
  ON fd.loan_id = l.id 
  AND fd.row_num=1
  -- getting fraud indicator
LEFT JOIN avant.avant_basic.confirmed_fraud_logs cfl 
  ON cfl.customer_id = c.id
  
    -- filtering for valid loans to evaluate performance on
  -- JOIN avant.dw.loan_performance_by_installment lp 
  -- ON lp.loan_id = l.id 
  -- AND lp.installment_number = 1
  -- AND lp.installment_date <= date_add('day', -64, current_timestamp)

  
  -- adding identity tier a loan was assigned to and fraud_review flag
  LEFT JOIN avant.dw_temp_newver.verifications_risks_decisions_test vrdt
  on ca.id = vrdt.customer_application_id and vrdt.row_num_recent = 1
WHERE l.loan_processing_start_time > date '2019-06-30'
'''.format(LOAN_WINDOW = loan_window))


In [5]:
#Remove records with no fraud score
df = df_raw[df_raw.score_5.notnull()]

In [6]:
df['prediction'] = np.where(df['score_5'] > 0.05, 1, 0)
#df['prediction'] = [1 if x > 0.05 else 0 for x in df['score_5']]
#df['prediction'] = list(np.where(df['score_5'] > 0.05, 1, 0)) 
# All three solutions raise the settingwithcopywarining

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


## Model Evaluation Pipeline

In [7]:
def evaluator(dframe, ytrue = 'high_confidence_fraud_indicator', ypred = 'prediction', scores = 'score_5'):
    tp = dframe[ytrue] * dframe[ypred]
    fp = (1-dframe[ytrue]) * dframe[ypred]
    fn =  dframe[ytrue] * (1-dframe[ypred])
    
    #calculating multiple metrics
    precision = tp.sum()/(tp.sum() + fp.sum())
    recall = tp.sum()/(tp.sum() + fn.sum())
    f1score = f1_score(y_true = dframe[ytrue], y_pred = dframe[ypred], pos_label = 1)
    auc_pr = average_precision_score(y_true = dframe[ytrue], y_score = dframe[scores], pos_label=1)
    auc_roc = roc_auc_score(y_true = dframe[ytrue], y_score = dframe[scores])
    
    return pd.Series({'precision': precision, 'recall': recall, 'f1score': f1score, 'auc_pr':auc_pr, 'auc_roc':auc_roc})

In [8]:
byWeek_stats = df.groupby('entered_lp_week', as_index = False).apply(evaluator)

## Connecting to Google Sheets

In [9]:
#Importing the module
import gspread
from df2gspread import df2gspread as d2g
from oauth2client.service_account import ServiceAccountCredentials

In [13]:
#The scope is always look like this so we did not need to change anything
scope = [
   'https://spreadsheets.google.com/feeds',
         'https://www.googleapis.com/auth/drive']
#Name of our Service Account Key
google_key_file = 'service_key.json'
credentials = ServiceAccountCredentials.from_json_keyfile_name(google_key_file, scope)
gc = gspread.authorize(credentials)

In [55]:
#This is the Worksheet ID
spreadsheet_key = '14ROlpuOP9IkixM5-nn1Pc0ux6kWgmj7c62NzdDl-5hU'
#This is the sheet name
wks_name = 'raw_data'
#We upload the tips data to our Google Sheet. Setting the row_names to False if you did not want the index to be included
d2g.upload(byWeek_stats, spreadsheet_key, wks_name, credentials=credentials, row_names=False)

<Worksheet 'test_sheet' id:0>

In [18]:
#This is the Worksheet ID
wb = gc.open_by_key('14ROlpuOP9IkixM5-nn1Pc0ux6kWgmj7c62NzdDl-5hU')
#This is the sheet name
wks_name = 'raw_data'
ws = wb.worksheet(wks_name)

#clear the existing data in worksheet
ws.clear()

#update new data to worksheet (first list out columns, and then add values for each column)
ws.update([byWeek_stats.columns.values.tolist()] + byWeek_stats.values.tolist())