# Data Monitoring Automation

This notebook documents the code and progress as I work my way towards setting up an automated process to monitor the performance of the fraud model(s), across partners and products. 

In [4]:
GARDEN_SERVER = 'garden.amount.com'

In [5]:
# Library Imports

import pandas as pd
import numpy as np
import trellis
import os
from avant_python_utils.email import send_email
from datalaketools.connectors.presto_db import PrestoDB
presto = PrestoDB()
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, f1_score,recall_score,precision_score, average_precision_score
from datetime import date, timedelta, datetime

## Constants with Column Names

Lots of columns that are used in multiple functions throughout this document. I'll define those columns here so I won't have to change them in multiple places later if there's a change

In [6]:
# TODO - Define these as global variables (store them in a config.py file)

SCORE_COL = 'score_5'
YPRED_COL = 'prediction'
YTRUE_COL = 'suspected_fraud'
TIME_COL = 'loan_processing_start_time'
WEEKSTART_COL = 'entered_lp_week'
#TIME_COL = 'created_at'
AMOUNT_COL = 'loan_amount'
THRESHOLD = 0.05
MODEL_START_DATE = '2018-09-15'


## Base Data Creation

In [3]:
#trellis.start()
# fraud = trellis.connect('us_fraud_follower')
#parent_dir_path = os.path.dirname(os.path.abspath(__file__)) - REMOVE COMMENT IN PYTHON SCRIPT
parent_dir_path = os.getcwd()
subject = 'Avant Model Monitor Weekly Report (Data Only)'
credentials = {'username': trellis.keys('automate_email')['email'], 'password': trellis.keys('automate_email')['pw']}


In [7]:
# include this line if the GARDEN_SERVER variable is not equal to https://garden.amount.com
%env GARDEN_SERVER https://garden.amount.com
# trellis needs to be reloaded in order to use new GARDEN_SERVER value
import trellis
from importlib import reload
reload(trellis)

env: GARDEN_SERVER=https://garden.amount.com


<module 'trellis' from '/home/jovyan/.conda-envs/gkrishna_env/lib/python3.6/site-packages/trellis/__init__.py'>

In [8]:
# TD prod connection
td_connector = trellis.connect('td_prod.follower')

OperationalError: could not connect to server: Connection timed out
	Is the server running on host "prd-td-postgres-basic.cluster-ro-c2ti4xtnot4p.us-east-2.rds.amazonaws.com" (10.195.162.226) and accepting
	TCP/IP connections on port 5432?


In [None]:
df_raw = pd.read_sql('''
SELECT
  l.id as loan_id
, l.created_at
, date_trunc('week', l.created_at) as entered_lp_week
, l.status
, case when l.status in ('current','late','paid_off','charged_off') then 1 else 0 end as issued
--, case when c.high_confidence_fraud_indicator=true or cfl.id is not null then 1 else 0 end as high_confidence_fraud_indicator
, case when cfr.customer_id is not null then 1 else 0 end as suspected_fraud 
--, cfrt.name as fraud_reason
, cast(fd.score_5_old as float8) as score_5_old
, cast(fd.score_5_new as float8) as score_5_new
, coalesce(cast(fd.score_5_old as float8), cast(fd.score_5_new as float8)) as score_5
, l.state
, l.payment_method
, (l.amount_cents/100) as loan_amount
, ca.product_type
FROM customer_applications ca
LEFT JOIN loans l on l.customer_application_id = ca.id
JOIN customers c
  ON c.id = l.customer_id
  
  -- getting dependent variable
  
LEFT JOIN (
select customer_id 
from customer_fraud_reasons cfr 
group by 1
) cfr on c.id = cfr.customer_id
  
 -- LEFT JOIN avant.avant_basic.customer_fraud_reason_types cfrt on cfr.customer_fraud_reason_type_id = cfrt.id
  
  -- getting fraud scores
LEFT JOIN (
  SELECT
    l.id as loan_id
  , fd.model_scores -> 'fraud/en-US/4.1.0' ->> 'score' as score_4
  , fd.model_scores -> 'fraud/en-US/5.0.0' ->> 'score' as score_5_old
  , fd.model_scores -> 'fraud/en-US/5.0.0/avant' ->> 'score' as score_5_new
  , fd.id as fraud_decision_id
  , row_number() over (partition by l.id order by fd.created_at desc) as row_num
  FROM loans l
  JOIN fraud_decisions fd
    ON fd.customer_application_id = l.customer_application_id
    AND fd.created_at AT TIME ZONE 'America/Chicago' >= l.created_at
WHERE l.created_at > date '{START_DATE}'
) fd 
  ON fd.loan_id = l.id 
  AND fd.row_num=1
  -- getting fraud indicator
LEFT JOIN confirmed_fraud_logs cfl 
  ON cfl.customer_id = c.id
  
    -- filtering for valid loans to evaluate performance on
  -- JOIN avant.dw.loan_performance_by_installment lp 
  -- ON lp.loan_id = l.id 
  -- AND lp.installment_number = 1
  -- AND lp.installment_date <= date_add('day', -64, current_timestamp)
  
WHERE l.created_at > date '{START_DATE}'
'''.format(START_DATE = MODEL_START_DATE), td_connector)


In [None]:
#Remove records with no fraud score
df = df_raw[df_raw.score_5.notnull()]

In [7]:
max(df[TIME_COL])

'2020-10-07 21:46:17.937'

In [8]:
df

Unnamed: 0,loan_id,loan_processing_start_time,entered_lp_week,status,issued,high_confidence_fraud_indicator,suspected_fraud,score_5_old,score_5_new,score_5,state,payment_method,loan_amount,product_type,risk_summary_identity_high,risk_summary_identity_medium,risk_summary_identity_low
0,3725947,2019-09-09 08:37:27.287,2019-09-09 00:00:00.000,rejected,0,0,0,0.007947,,0.007947,OH,ach,3400.00,installment,True,False,False
1,3569707,2019-06-01 14:53:33.840,2019-05-27 00:00:00.000,cancelled,0,0,0,0.923066,,0.923066,NY,ach,2100.00,installment,True,False,False
2,3672702,2019-08-03 10:35:47.987,2019-07-29 00:00:00.000,rejected,0,0,0,0.007495,,0.007495,SC,ach,8553.69,refinance,False,False,True
3,3858077,2019-12-31 13:18:48.127,2019-12-30 00:00:00.000,rejected,0,0,0,0.015310,,0.015310,KS,ach,3200.00,installment,True,False,False
4,3156066,2018-09-19 15:50:18.018,2018-09-17 00:00:00.000,cancelled,0,0,0,0.004678,,0.004678,SC,ach,2000.00,installment,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
502656,3504087,2019-04-22 18:26:32.161,2019-04-22 00:00:00.000,current,1,0,0,0.006523,,0.006523,NJ,ach,10547.26,refinance,False,False,True
502657,3644025,2019-07-16 21:03:02.335,2019-07-15 00:00:00.000,charged_off,1,0,0,0.006476,,0.006476,FL,ach,4400.00,installment,False,False,True
502658,3899587,2020-02-18 17:08:42.044,2020-02-17 00:00:00.000,current,1,0,0,0.008027,,0.008027,CT,ach,5605.18,refinance,False,False,True
502659,3163165,2018-09-24 08:25:08.489,2018-09-24 00:00:00.000,cancelled,0,0,0,0.004640,,0.004640,MD,ach,4000.00,installment,False,False,True


In [7]:
df[YPRED_COL] = np.where(df[SCORE_COL] > THRESHOLD, 1, 0)
#df['prediction'] = [1 if x > 0.05 else 0 for x in df['score_5']]
#df['prediction'] = list(np.where(df['score_5'] > 0.05, 1, 0)) 
# All three solutions raise the settingwithcopywarining

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


## Model Evaluation Pipeline

In [8]:
def weekly_evaluator(dframe, ytrue = YTRUE_COL, ypred = YPRED_COL, scores = SCORE_COL):
    true_positives = dframe[ytrue] * dframe[ypred]
    false_positives = (1-dframe[ytrue]) * dframe[ypred]
    false_negatives =  dframe[ytrue] * (1-dframe[ypred])
    
    #calculating multiple metrics
    precision = precision_score(y_true = dframe[ytrue], y_pred = dframe[ypred], pos_label = 1)
    recall = recall_score(y_true = dframe[ytrue], y_pred = dframe[ypred], pos_label = 1)
    f1score = f1_score(y_true = dframe[ytrue], y_pred = dframe[ypred], pos_label = 1)
    auc_pr = average_precision_score(y_true = dframe[ytrue], y_score = dframe[scores], pos_label=1)
    auc_roc = roc_auc_score(y_true = dframe[ytrue], y_score = dframe[scores])
    fraud_rate = dframe[ytrue].sum()/len(dframe.index)
    avg_score = dframe[scores].sum()/len(dframe.index)
    
    
    
    return pd.Series({'precision': precision, 'recall': recall, 'f1score': f1score, 'auc_pr':auc_pr, 'auc_roc':auc_roc,
                     'fraud_rate': fraud_rate, 'avg_score': avg_score})

Figuring out the date from which we have both fraud and not fraud in the same week, so that we can group by that week. 

In [9]:
#unique fraud values per week

test = pd.DataFrame(df.groupby('entered_lp_week', as_index = False)['suspected_fraud'].nunique())

In [10]:
test.query('suspected_fraud == 1')

Unnamed: 0,entered_lp_week,suspected_fraud


In [11]:
byWeek_stats = df.groupby('entered_lp_week', as_index = False).apply(weekly_evaluator)

## Connecting to Google Sheets

In [12]:
#Importing the module
import gspread
from df2gspread import df2gspread as d2g
from oauth2client.service_account import ServiceAccountCredentials

In [13]:
#The scope is always look like this so we did not need to change anything
scope = [
   'https://spreadsheets.google.com/feeds',
         'https://www.googleapis.com/auth/drive']
#Name of our Service Account Key
google_key_file = 'service_key.json'
credentials = ServiceAccountCredentials.from_json_keyfile_name(google_key_file, scope)
gc = gspread.authorize(credentials)

In [54]:
#This is the Worksheet ID
workbook = gc.open_by_key('14ROlpuOP9IkixM5-nn1Pc0ux6kWgmj7c62NzdDl-5hU')
#This is the sheet name for week by week data
rawdata_worksheet_name = 'Raw Data'
rawdata_worksheet = workbook.worksheet(rawdata_worksheet_name)

#clear the existing data in worksheet
rawdata_worksheet.clear()

#update new data to worksheet (first list out columns, and then add values for each column)
rawdata_worksheet.update([byWeek_stats.columns.values.tolist()] + byWeek_stats.values.tolist())

{'spreadsheetId': '14ROlpuOP9IkixM5-nn1Pc0ux6kWgmj7c62NzdDl-5hU',
 'updatedRange': "'Raw Data'!A1:H104",
 'updatedRows': 104,
 'updatedColumns': 8,
 'updatedCells': 832}

In [22]:
#This is the sheet name for tables data
workbook = gc.open_by_key('14ROlpuOP9IkixM5-nn1Pc0ux6kWgmj7c62NzdDl-5hU')

tablesdata_worksheet_name = 'Tables Data'
tablesdata_worksheet = workbook.worksheet(tablesdata_worksheet_name)

#clear the existing data in worksheet
tablesdata_worksheet.clear()

#update new data to worksheet (first list out columns, and then add values for each column)
tablesdata_worksheet.update([tables_data.columns.values.tolist()] + tables_data.values.tolist())

{'spreadsheetId': '14ROlpuOP9IkixM5-nn1Pc0ux6kWgmj7c62NzdDl-5hU',
 'updatedRange': "'Tables Data'!A1:D10",
 'updatedRows': 10,
 'updatedColumns': 4,
 'updatedCells': 40}

In [26]:
#This is the sheet name for tables data
workbook = gc.open_by_key('14ROlpuOP9IkixM5-nn1Pc0ux6kWgmj7c62NzdDl-5hU')

baselinedata_worksheet_name = 'Baselines Data'
baselinedata_worksheet = workbook.worksheet(baselinedata_worksheet_name)

#clear the existing data in worksheet
baselinedata_worksheet.clear()

#update new data to worksheet (first list out columns, and then add values for each column)
baselinedata_worksheet.update([baseline_data.columns.values.tolist()] + baseline_data.values.tolist())

{'spreadsheetId': '14ROlpuOP9IkixM5-nn1Pc0ux6kWgmj7c62NzdDl-5hU',
 'updatedRange': "'Baselines Data'!A1:J110",
 'updatedRows': 110,
 'updatedColumns': 10,
 'updatedCells': 1100}

In [55]:
ws.format("A2:A1000", { "numberFormat": { "type": ('DATE') }})

{'spreadsheetId': '14ROlpuOP9IkixM5-nn1Pc0ux6kWgmj7c62NzdDl-5hU',
 'replies': [{}]}

## Filling in Table Values

For each metric, I need the following 

- Value since the date model was trained
- Compare last 30 days to previous 30 days (with scope for excluding the last 2 months)
- Absolute difference between value when model was trained, and current value

In [48]:
datetime.strptime(a, "%Y-%m-%d") + timedelta(days = 30)

datetime.datetime(2018, 10, 15, 0, 0)

In [27]:
test = df.query('loan_processing_start_time > "2019-09-15" & loan_processing_start_time < "2019-10-30"')

In [52]:
today_date

'2020-10-05'

In [58]:
today_date = date.today().strftime("%Y-%m-%d")
prev30_date = (date.today() - timedelta(days = 30)).strftime("%Y-%m-%d")
prev60_date = (date.today() - timedelta(days = 60)).strftime("%Y-%m-%d")
a = datetime.strptime(MODEL_START_DATE, "%Y-%m-%d")
b = (a + timedelta(days = 30)).strftime("%Y-%m-%d")

#creating different datasets for the different time periods

#dataset 1 - 30 days after model was trained


data_first30 = df.query('{0} > @MODEL_START_DATE & {0} < @b'.format(TIME_COL))
data_last30 = df.query('loan_processing_start_time > @prev30_date & loan_processing_start_time < @today_date')
#data_prev30 = dframe.query('@timecol > @prev60_date & @timecol < @prev30_date')   

In [59]:
data_first30

Unnamed: 0,loan_id,loan_processing_start_time,entered_lp_week,status,issued,high_confidence_fraud_indicator,suspected_fraud,score_5_old,score_5_new,score_5,state,payment_method,loan_amount,product_type,risk_summary_identity_high,risk_summary_identity_medium,risk_summary_identity_low,prediction
4,3156066,2018-09-19 15:50:18.018,2018-09-17 00:00:00.000,cancelled,0,0,0,0.004678,,0.004678,SC,ach,2000.0,installment,True,False,False,0
61,3188808,2018-10-08 20:09:51.096,2018-10-08 00:00:00.000,paid_off,1,0,0,0.002656,,0.002656,FL,ach,3400.0,installment,False,False,True,0
111,3187785,2018-10-08 11:38:45.465,2018-10-08 00:00:00.000,paid_off,1,0,0,0.019646,,0.019646,MN,ach,7900.0,refinance,,,,0
140,3169142,2018-09-27 08:42:47.307,2018-09-24 00:00:00.000,paid_off,1,0,0,0.003395,,0.003395,OH,ach,12500.0,installment,True,False,False,0
161,3178944,2018-10-02 21:38:24.323,2018-10-01 00:00:00.000,cancelled,0,0,0,0.049705,,0.049705,TX,ach,9300.0,installment,True,False,False,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
502569,3157148,2018-09-20 08:37:07.494,2018-09-17 00:00:00.000,rejected,0,0,0,0.003565,,0.003565,CA,remotely_created_check,25000.0,installment,False,True,False,0
502634,3187499,2018-10-08 09:26:10.299,2018-10-08 00:00:00.000,charged_off,1,0,0,0.018444,,0.018444,MI,ach,2400.0,installment,False,False,True,0
502635,3177090,2018-10-02 05:18:50.992,2018-10-01 00:00:00.000,current,1,0,0,0.005252,,0.005252,NC,ach,3000.0,installment,False,False,True,0
502640,3157809,2018-09-21 15:26:42.783,2018-09-17 00:00:00.000,late,1,0,0,0.010588,,0.010588,IL,ach,3200.0,installment,False,False,True,0


In [32]:
average_precision_score(y_true = data_last30[YTRUE_COL], y_score = data_last30[SCORE_COL], pos_label = 1)

0.332906862189837

In [14]:
#function to return a metric for a specified time period

def values_for_cells(dframe, ytrue = YTRUE_COL, ypred = YPRED_COL, scores = SCORE_COL, timecol = TIME_COL, amount = AMOUNT_COL):
   
    #Setting up variables with different date values
    today_date = date.today().strftime("%Y-%m-%d")
    prev30_date = (date.today() - timedelta(days = 30)).strftime("%Y-%m-%d")
    prev60_date = (date.today() - timedelta(days = 60)).strftime("%Y-%m-%d")
    modeltrain_date_start = datetime.strptime(MODEL_START_DATE, "%Y-%m-%d")
    modeltrain_date_end = (modeltrain_date_start + timedelta(days = 30)).strftime("%Y-%m-%d")
    
    #creating different datasets for the different time periods
    
    #dataset 1 - 30 days after model was trained

    
    data_first30 = dframe.query('{0} > @MODEL_START_DATE & {0} < @modeltrain_date_end'.format(TIME_COL))
    data_last30 = dframe.query('{0} > @prev30_date & {0} < @today_date'.format(TIME_COL))
    data_prev30 = dframe.query('{0} > @prev60_date & {0} < @prev30_date'.format(TIME_COL))   

    #PRECISION
    precision_current = precision_score(y_true = data_last30[ytrue], y_pred = data_last30[ypred], pos_label = 1)
    precision_initial = precision_score(y_true = data_first30[ytrue], y_pred = data_first30[ypred], pos_label = 1)
    precision_prev30 = precision_score(y_true = data_prev30[ytrue], y_pred = data_prev30[ypred], pos_label = 1)

    #recall values
    recall_current = recall_score(y_true = data_last30[ytrue], y_pred = data_last30[ypred], pos_label = 1)
    recall_initial = recall_score(y_true = data_first30[ytrue], y_pred = data_first30[ypred], pos_label = 1)
    recall_prev30 = recall_score(y_true = data_prev30[ytrue], y_pred = data_prev30[ypred], pos_label = 1) 

    #F1 score
    f1_current = f1_score(y_true = data_last30[ytrue], y_pred = data_last30[ypred], pos_label = 1)
    f1_initial = f1_score(y_true = data_first30[ytrue], y_pred = data_first30[ypred], pos_label = 1)
    f1_prev30 = f1_score(y_true = data_prev30[ytrue], y_pred = data_prev30[ypred], pos_label = 1) 

    #auc pr
    aucpr_current = average_precision_score(y_true = data_last30[ytrue], y_score = data_last30[scores], pos_label = 1)
    aucpr_initial = average_precision_score(y_true = data_first30[ytrue], y_score = data_first30[scores], pos_label = 1)
    aucpr_prev30 = average_precision_score(y_true = data_prev30[ytrue], y_score = data_prev30[scores], pos_label = 1) 

    #auc roc
    aucroc_current = roc_auc_score(y_true = data_last30[ytrue], y_score = data_last30[scores])
    aucroc_initial = roc_auc_score(y_true = data_first30[ytrue], y_score = data_first30[scores])
    aucroc_prev30 = roc_auc_score(y_true = data_prev30[ytrue], y_score = data_prev30[scores]) 

    #TODO - Confirm fraud rate definition
    #fraud rate
    fraudrate_current = data_last30[ytrue].sum()/len(data_last30.index)
    fraudrate_initial = data_first30[ytrue].sum()/len(data_first30.index)
    fraudrate_prev30 = data_prev30[ytrue].sum()/len(data_prev30.index)
    
    #avg score
    avgscore_current = data_last30[scores].sum()/len(data_last30.index)
    avgscore_initial = data_first30[scores].sum()/len(data_first30.index)
    avgscore_prev30 = data_prev30[scores].sum()/len(data_prev30.index)


    #TODO - Confirm fraud missed definition
    #fraud rate with dollar values
    fraudrate_dollar_current = (data_last30[amount]*data_last30[YTRUE_COL]).sum()/data_last30[amount].sum()
    fraudrate_dollar_initial = (data_first30[amount]*data_first30[YTRUE_COL]).sum()/data_first30[amount].sum()
    fraudrate_dollar_prev30 = (data_prev30[amount]*data_prev30[YTRUE_COL]).sum()/data_prev30[amount].sum()

    #$ value of fraud missed
    fraudmissed_dollar_current = data_last30[scores].sum()/len(data_last30.index)
    fraudmissed_dollar_initial = data_first30[scores].sum()/len(data_first30.index)
    fraudmissed_dollar_prev30 = data_prev30[scores].sum()/len(data_prev30.index)

    output = {"metric": ['precision', 'recall','f1score', 'auc_pr', 'auc_roc', 'fraudrate', 'avg_score', 'fraudrate_dollar', 'fraudmissed_dollar'],
             "current_values":[precision_current, recall_current, f1_current, aucpr_current, aucroc_current, fraudrate_current, avgscore_current, fraudrate_dollar_current, fraudmissed_dollar_current],
             "initial_values":[precision_initial, recall_initial, f1_initial, aucpr_initial, aucroc_initial, fraudrate_initial, avgscore_initial, fraudrate_dollar_initial, fraudmissed_dollar_initial],
             "prev30_values":[precision_prev30, recall_prev30, f1_prev30, aucpr_prev30, aucroc_prev30, fraudrate_prev30, avgscore_prev30, fraudrate_dollar_prev30, fraudmissed_dollar_prev30]}    
        
    return output
    
        

In [19]:
tables_data = pd.DataFrame.from_dict(values_for_cells(df))



In [22]:
def create_baseline_data(dframe, ytrue = YTRUE_COL, ypred = YPRED_COL, scores = SCORE_COL, timecol = TIME_COL, amount = AMOUNT_COL):
    #Setting up variables with different date values
    modeltrain_date_start = datetime.strptime(MODEL_START_DATE, "%Y-%m-%d")
    modeltrain_date_end = (modeltrain_date_start + timedelta(days = 30)).strftime("%Y-%m-%d")
    
    #creating different datasets for the different time periods
    
    #dataset 1 - 30 days after model was trained
    data_first30 = dframe.query('{0} > @MODEL_START_DATE & {0} < @modeltrain_date_end'.format(TIME_COL))
    
    #PRECISION
    precision_initial = precision_score(y_true = data_first30[ytrue], y_pred = data_first30[ypred], pos_label = 1)
    
    #recall values
    recall_initial = recall_score(y_true = data_first30[ytrue], y_pred = data_first30[ypred], pos_label = 1)
    
    #F1 score
    f1_initial = f1_score(y_true = data_first30[ytrue], y_pred = data_first30[ypred], pos_label = 1)
    
    #auc pr
    aucpr_initial = average_precision_score(y_true = data_first30[ytrue], y_score = data_first30[scores], pos_label = 1)
    
    #auc roc
    aucroc_initial = roc_auc_score(y_true = data_first30[ytrue], y_score = data_first30[scores])
    
    #TODO - Confirm fraud rate definition
    #fraud rate
    fraudrate_initial = data_first30[ytrue].sum()/len(data_first30.index)
    
    #avg score
    avgscore_initial = data_first30[scores].sum()/len(data_first30.index)

    #TODO - Confirm fraud missed definition
    #fraud rate with dollar values
    fraudrate_dollar_initial = (data_first30[amount]*data_first30[YTRUE_COL]).sum()/data_first30[amount].sum()

    #$ value of fraud missed
    fraudmissed_dollar_initial = data_first30[scores].sum()/len(data_first30.index)
    
    #creating grouped by data frame with needed weeks
    baseline_dataframe = pd.DataFrame(dframe[WEEKSTART_COL].unique()).rename(columns={0: WEEKSTART_COL}).sort_values(by = WEEKSTART_COL)
    baseline_dataframe = baseline_dataframe.assign(precision_baseline = precision_initial,
                              recall_baseline = recall_initial, 
                              f1_baseline = f1_initial, 
                              aucpr_baseline = aucpr_initial,
                              aucroc_baseline = aucroc_initial,
                              fraudrate_baseline = fraudrate_initial,
                              avgscore_baseline = avgscore_initial,
                              fraudrate_dollar_baseline = fraudrate_dollar_initial,
                              fraudmissed_dollar_baseline = fraudmissed_dollar_initial
                              )
    
    return baseline_dataframe
    
    

In [56]:
test = pd.DataFrame(df[WEEKSTART_COL].unique()).rename(columns={0: WEEKSTART_COL}).sort_values(by = WEEKSTART_COL)

In [25]:
baseline_data = create_baseline_data(df)

In [24]:
test

Unnamed: 0,entered_lp_week,precision_baseline,recall_baseline,f1_baseline,aucpr_baseline,aucroc_baseline,fraudrate_baseline,avgscore_baseline,fraudrate_dollar_baseline,fraudmissed_dollar_baseline
108,2018-09-10 00:00:00.000,0.123867,0.464151,0.195548,0.187151,0.772568,0.020116,0.025807,0.022414,0.025807
69,2018-09-17 00:00:00.000,0.123867,0.464151,0.195548,0.187151,0.772568,0.020116,0.025807,0.022414,0.025807
52,2018-09-24 00:00:00.000,0.123867,0.464151,0.195548,0.187151,0.772568,0.020116,0.025807,0.022414,0.025807
72,2018-10-01 00:00:00.000,0.123867,0.464151,0.195548,0.187151,0.772568,0.020116,0.025807,0.022414,0.025807
20,2018-10-08 00:00:00.000,0.123867,0.464151,0.195548,0.187151,0.772568,0.020116,0.025807,0.022414,0.025807
...,...,...,...,...,...,...,...,...,...,...
70,2020-09-07 00:00:00.000,0.123867,0.464151,0.195548,0.187151,0.772568,0.020116,0.025807,0.022414,0.025807
91,2020-09-14 00:00:00.000,0.123867,0.464151,0.195548,0.187151,0.772568,0.020116,0.025807,0.022414,0.025807
46,2020-09-21 00:00:00.000,0.123867,0.464151,0.195548,0.187151,0.772568,0.020116,0.025807,0.022414,0.025807
106,2020-09-28 00:00:00.000,0.123867,0.464151,0.195548,0.187151,0.772568,0.020116,0.025807,0.022414,0.025807


In [None]:
precision_initial = precision_score(y_true = data_first30[YTRUE_COL], y_pred = data_first30[ypred], pos_label = 1)
    
    #recall values
recall_initial = recall_score(y_true = data_first30[ytrue], y_pred = data_first30[ypred], pos_label = 1)


In [57]:
test.assign(precision_baseline = precision_initial)

df.assign(column_new_1=np.nan, column_new_2='dogs', column_new_3=3)

Unnamed: 0,entered_lp_week
106,2018-09-10 00:00:00.000
4,2018-09-17 00:00:00.000
78,2018-09-24 00:00:00.000
83,2018-10-01 00:00:00.000
48,2018-10-08 00:00:00.000
...,...
75,2020-09-07 00:00:00.000
14,2020-09-14 00:00:00.000
8,2020-09-21 00:00:00.000
66,2020-09-28 00:00:00.000
