In [22]:
import os, sys, inspect
# sys.path.append('../..')

import pandas as pd
from pathlib import Path
import glob as glob
import os
import numpy as np
import re
import json
from datetime import datetime, timedelta, date
from connection import SCORE_DB_CONN, ANALYTICS_DB_CONN
from avay_bq import AvayBQServiceAcc
from da_utils.client.bigquery_client import BigQueryClient
from da_utils.client.google_sheets_client import GoogleSheetsClient
from da_utils.repository.google_sheets.google_sheets_repository import GoogleSheetsRepository
from da_utils.repository.bigquery.bigquery_repository import BigqueryRepository



pd.set_option("display.max_row", 100)

In [23]:
adc_path = '/home/linhnguyen/.config/application_default_credentials.json'
avay_bq_acc = AvayBQServiceAcc()
bq_analytic = BigQueryClient(billing_project_id='prj-ts-p-analytic-8057', adc_file_path_str=adc_path)
bq_repo = BigqueryRepository(bq_analytic)

client = GoogleSheetsClient()
repo = GoogleSheetsRepository(client)
workbook = repo.open_spreadsheet_by_key("1VTTN0Z0O8Grorki1YYo0LBeO4i8UR8S5XNCj7R5uAGs")



In [24]:
lead_phone_infos = avay_bq_acc.client.query(f"""
with rangedate as (
    select
date('2023-02-01') as sdate,
date('2023-05-31') as edate
)
,qualified as(
    select 
        lead_id
        ,case when (utm_source = 'accesstrade' or source = 'vaycucde.vn') THEN '1. AT'
            when utm_source like '%acebook%' or utm_source like '%fb%' or utm_source like '%hatfuel%' THEN '2. Facebook'
            when source in ('prod.vpp.viettel', 'prod.vtp.viettel') THEN '3. VDS'
            when utm_source = 'google_ads' then '4. Google'
            else '5. Others' end as lead_source
        ,case when telco_score between 0 and 599 then '4. Very low (< 600)'
            when telco_score between 600 and 642 then '3. Low (600 - 642)'
            when telco_score between 643 and 749 then '2. Medium (643 - 749)'
            when telco_score >= 750 then '1. High (750+)'
        else '5. Non-scorable' end as score_range
        ,case when telco_code = 'viettel' then '1.viettel'
when telco_code = 'vinaphone' then '2.vinaphone'
when telco_code = 'mobifone' then '3.mobifone'
end as telco_code
        ,loan_date
        ,case when bank_id in (1) then '1. FE'
when bank_id in (16) then '3. Cross-sell'
else '2. Non-FE' end as product
        ,cast(unit_cost as numeric) as cost
    from `avay-a9925.datamart.avay_compound`, rangedate
    where loan_date between sdate and edate
        and (loan_status in ('accepted'))
)

select * from qualified
        """  ).result().to_arrow().to_pandas()

lead_phone_infos = lead_phone_infos.fillna(0)

In [25]:
disbursed1 = avay_bq_acc.client.query(f"""
with rangedate as (
    select
date('2023-02-01') as sdate,
date('2023-05-31') as edate
),
disbursed as
(
    select 
        lead_id
        ,case when bank_code = 'lotte' then batch_date else disbursed_date end as disbursed_date
        ,cast(commission as numeric) as gross_revenue
    from `avay-a9925.dwh.disbursals`, rangedate
    where batch_date between sdate and edate

)

select * from disbursed
        """).result().to_arrow().to_pandas()

disbursed1 = disbursed1.fillna(0)


In [26]:
disbursed2 = bq_repo.get_data_from_query_into_pandas(f"""
with rangedate as (
    select
date('2023-02-01') as sdate,
date('2023-05-31') as edate
),
disbursed as
(select 
  lead_id, 
  case when bank_code = 'lotte' then batch_date else disbursed_date end as disbursed_date,
  cast(commission as numeric) as gross_revenue
from `prj-ts-p-analytic-8057.da.disbursals`, rangedate
where true
and batch_date between sdate and edate
)

select * from disbursed
        """)

disbursed2 = disbursed2.fillna(0)


In [27]:
df00 = pd.read_excel('/home/linhnguyen/TS 31.5.xlsx', sheet_name = 'DLT3')
df00 = df00[['ID đối tác','App Code']]
df00 = df00.rename(columns={"ID đối tác": "lead_id", "App Code": "app_code"})
df00 = df00.fillna(0)
df00['lead_id'] = df00['lead_id'].astype(int)

df01 = pd.read_excel('/home/linhnguyen/TS 31.5.xlsx', sheet_name = 'DLT5')
df01 = df01[['ID đối tác','App Code']]
df01 = df01.rename(columns={"ID đối tác": "lead_id", "App Code": "app_code"})
df01 = df01.fillna(0)
df01['lead_id'] = df01['lead_id'].astype(int)

df02 = pd.read_excel('/home/linhnguyen/TS 31.5.xlsx', sheet_name = 'DLT4')
df02 = df02[['ID đối tác','App Code']]
df02 = df02.rename(columns={"ID đối tác": "lead_id", "App Code": "app_code"})
df02 = df02.fillna(0)
df02['lead_id'] = df02['lead_id'].astype(int)

df1 = pd.concat([df02, df01, df00])

df2 = pd.read_excel('/home/linhnguyen/TS 31.5.xlsx', sheet_name = 'SỐ CASE LOAN')
df2 = df2[{'Mã hồ sơ', 'Sản phẩm', 'Số tiền được duyệt', 'Ngày giải ngân'}]
df2 = df2.rename(columns={"Sản phẩm": "product", "Mã hồ sơ": "app_code", "Số tiền được duyệt":"loan_amount", "Ngày giải ngân":"disbursed_date"})
df2['disbursed_date'] = pd.to_datetime(df2['disbursed_date'], format='%d/%m/%Y').dt.normalize()
fee_rate = pd.DataFrame(
    [
        {"product": "TRS01", "fee_rate": 0.055},
        {"product": "TRS02", "fee_rate": 0.06},
        {"product": "TSL01", "fee_rate": 0.055},
    ]
) 
df = df1.merge(df2, how = 'left').merge(fee_rate).assign(gross_revenue = lambda x: x.loan_amount * x.fee_rate)
disbursed3 = df[['lead_id','disbursed_date','loan_amount','gross_revenue']]
disbursed3['lead_id'] = disbursed3['lead_id'].astype(int)
disbursed3

  warn("""Cannot parse header or footer so it will be ignored""")
  df2 = df2[{'Mã hồ sơ', 'Sản phẩm', 'Số tiền được duyệt', 'Ngày giải ngân'}]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  disbursed3['lead_id'] = disbursed3['lead_id'].astype(int)


Unnamed: 0,lead_id,disbursed_date,loan_amount,gross_revenue
0,14010944,2023-05-05,15000000.0,825000.0
1,14007311,2023-04-28,31000000.0,1705000.0
2,14006666,2023-04-28,30000000.0,1650000.0
3,14005340,2023-05-17,20000000.0,1100000.0
4,14000350,2023-04-26,35000000.0,1925000.0
...,...,...,...,...
564,13750828,2023-03-15,20000000.0,1100000.0
565,13745138,2023-03-07,30000000.0,1650000.0
566,13744195,2023-03-04,30000000.0,1650000.0
567,13737785,2023-03-02,40000000.0,2200000.0


In [28]:
# disbursed = disbursed1.combine_first(disbursed2).combine_first(disbursed3)

disbursed = pd.concat([disbursed1, disbursed2, disbursed3])
disbursed = disbursed.drop_duplicates(subset=['lead_id'])
disbursed = disbursed.fillna(0)
disbursed['gross_revenue'] = disbursed['gross_revenue'].astype('float')



In [29]:
leads_with_disbursement = lead_phone_infos.merge(disbursed, how = 'left').assign(
    lead_sent_day=lambda x: x.loan_date.astype(str).str[8:10].astype(int),
    lead_sent_month=lambda x: x.loan_date.astype(str).str[0:7],
    loan_sent_month=lambda x: x.disbursed_date.astype(str).str[0:7],)


leads_with_disbursement["loan"] = np.where((leads_with_disbursement["disbursed_date"].notnull()),1,0)
leads_with_disbursement["commission"] = np.where((leads_with_disbursement["disbursed_date"].notnull()),leads_with_disbursement["gross_revenue"],0)
leads_with_disbursement["commission"] = leads_with_disbursement["commission"].astype('Float64')
leads_with_disbursement["cost"] = leads_with_disbursement["cost"].astype('Float64')
leads_with_disbursement.fillna(0)


Unnamed: 0,lead_id,lead_source,score_range,telco_code,loan_date,product,cost,disbursed_date,gross_revenue,loan_amount,lead_sent_day,lead_sent_month,loan_sent_month,loan,commission
0,13730674,5. Others,4. Very low (< 600),1.viettel,2023-02-28,3. Cross-sell,0.0,0,0.0,0.0,28,2023-02,,0,0.0
1,13720144,5. Others,5. Non-scorable,3.mobifone,2023-02-26,1. FE,1147856.0,0,0.0,0.0,26,2023-02,,0,0.0
2,13629332,5. Others,4. Very low (< 600),1.viettel,2023-02-04,3. Cross-sell,4959.0,0,0.0,0.0,4,2023-02,,0,0.0
3,13640007,5. Others,2. Medium (643 - 749),1.viettel,2023-02-07,1. FE,1051.0,0,0.0,0.0,7,2023-02,,0,0.0
4,13711765,5. Others,5. Non-scorable,3.mobifone,2023-02-24,1. FE,258.0,0,0.0,0.0,24,2023-02,,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
147744,13989615,3. VDS,1. High (750+),1.viettel,2023-04-23,2. Non-FE,0.0,0,0.0,0.0,23,2023-04,,0,0.0
147745,14104879,3. VDS,1. High (750+),1.viettel,2023-05-24,2. Non-FE,0.0,0,0.0,0.0,24,2023-05,,0,0.0
147746,14098027,3. VDS,1. High (750+),1.viettel,2023-05-22,2. Non-FE,0.0,0,0.0,0.0,22,2023-05,,0,0.0
147747,14113203,3. VDS,1. High (750+),1.viettel,2023-05-28,2. Non-FE,0.0,0,0.0,0.0,28,2023-05,,0,0.0


In [37]:
lead = leads_with_disbursement.groupby(["lead_sent_month", "score_range", "lead_source", "product"]).agg({"lead_id":"count","cost":"sum"}).reset_index()
lead.rename(columns = {'lead_sent_month':'month'}, inplace = True)
loan = leads_with_disbursement.groupby(["loan_sent_month","score_range", "lead_source", "product"]).agg({"loan":"sum","commission":"mean"}).reset_index()
loan.rename(columns = {'loan_sent_month':'month'}, inplace = True)
cf = lead.merge(loan, on = ['month','score_range', 'lead_source', 'product'], how = 'outer')
cf
repo.write_df_to_sheet(workbook.worksheet("Sheet21"),cf, starting_cell = 'A1')