In [1]:
import os, sys, inspect
# sys.path.append('../..')

import pandas as pd
from pathlib import Path
import glob as glob
import os
import numpy as np
import re
import json
from datetime import datetime, timedelta, date
from connection import SCORE_DB_CONN, ANALYTICS_DB_CONN
from avay_bq import AvayBQServiceAcc
from da_utils.client.bigquery_client import BigqueryClient
from da_utils.client.google_sheets_client import GoogleSheetsClient
from da_utils.repository.google_sheets.google_sheets_repository import GoogleSheetsService
from da_utils.repository.bigquery.bigquery_repository import BigqueryService



pd.set_option("display.max_row", 100)

In [2]:
adc_path = Path('/home/linhnguyen/.config/gcloud/application_default_credentials.json')
avay_bq_acc = AvayBQServiceAcc()
bq_client = BigqueryClient(billing_project_id='prj-ts-p-analytic-8057', adc_file_path=adc_path)
bq_repo = BigqueryService(bq_client)

client = GoogleSheetsClient()
repo = GoogleSheetsService(client)
workbook = repo.open_spreadsheet("1U9QDxArDL7MxYMs2H2-xVRbwfApTEJbahaKZoN1lY9s")
workbook2 = repo.open_spreadsheet("19D57H_pstFiff_dt_F5ewjqGnAOkSnE1Q9m2lFJCFsg")

In [3]:
lead_phone_infos = avay_bq_acc.client.query(f"""
with rangedate as (
    select
date('2021-01-01') as sdate,
date('2023-07-31') as edate
)
,qualified as(
    select 
        lead_id
        ,case when lead_source in ('Accesstrade', 'Vaycucde') then 'Accesstrade'
            when lead_source_group = 'Facebook' then 'Facebook'
            when lead_source_group = 'Google ads' then 'Google'
            when lead_source in ('Viettel Pay', 'Viettel Pay Pro') then 'VTP'
        else 'Others' end as lead_source
        ,case when telco_code in('viettel') then telco_code else 'others' end as telco_code
        ,concat(
            case when telco_code in ('viettel') then 'VT' else 'OTHERS' end,
            '_',
            case when bank_name ='SHBFinance' then 'SHB' else bank_name end,
            '_',
            case
            when
                offer_code IN  ('944', '1085', '1141', '1199', '1308', '1542', '1544', '1567', '1771', '1769', '1768', '1766', '1767', '1758', '1874')  then '1.High'
            when
                offer_code IN  ('945', '1086', '1142', '1200', '1309', '1568', '1772', '1770', '1747', '1748','1875')  then '2.Med'
            when
                offer_code IN  ('946', '1087', '1143', '1201', '1310', '1569', '1818')  then '3.Low'
            when offer_code in ('FECEOO_NONVT','FECEOO_VT_LOW') then 'CEOO' 
            else offer_code end) as Code

        ,loan_date
        ,bank_name
        ,offer_code
        ,cast(unit_cost as numeric) as cost
    from `avay-a9925.datamart.avay_compound`, rangedate
    where loan_date between sdate and edate
        and (loan_status in ('accepted'))
)

select * from qualified
        """  ).result().to_arrow().to_pandas()

lead_phone_infos = lead_phone_infos.fillna(0)

In [4]:
disbursed1 = avay_bq_acc.client.query(f"""
with rangedate as (
    select
date('2021-01-01') as sdate,
date('2023-07-31') as edate
),
disbursed as
(
    select 
        lead_id
        ,case when bank_code = 'lotte' then batch_date else disbursed_date end as disbursed_date
        ,cast(commission as numeric) as gross_revenue
    from `avay-a9925.dwh.disbursals`, rangedate
    where batch_date between sdate and edate

)

select * from disbursed
        """).result().to_arrow().to_pandas()

disbursed1 = disbursed1.fillna(0)


In [5]:
disbursed2 = bq_repo.get_data_from_query_into_pandas(f"""
with rangedate as (
    select
date('2021-01-01') as sdate,
date('2023-07-31') as edate
),
disbursed as
(select 
  lead_id, 
  case when bank_code = 'lotte' then batch_date else disbursed_date end as disbursed_date,
  cast(commission as numeric) as gross_revenue
from `prj-ts-p-analytic-8057.da.disbursals`, rangedate
where true
and batch_date between sdate and edate
)

select * from disbursed
        """)

disbursed2 = disbursed2.fillna(0)


In [6]:
df00 = pd.read_excel('/home/linhnguyen/TS 28.6.xlsx', sheet_name = 'DLT4')
df00 = df00[['ID đối tác','App Code']]
df00 = df00.rename(columns={"ID đối tác": "lead_id", "App Code": "app_code"})
df00 = df00.fillna(0)
df00['lead_id'] = df00['lead_id'].astype(int)

df01 = pd.read_excel('/home/linhnguyen/TS 28.6.xlsx', sheet_name = 'DLT5')
df01 = df01[['ID đối tác','App Code']]
df01 = df01.rename(columns={"ID đối tác": "lead_id", "App Code": "app_code"})
df01 = df01.fillna(0)
df01['lead_id'] = df01['lead_id'].astype(int)

df02 = pd.read_excel('/home/linhnguyen/TS 28.6.xlsx', sheet_name = 'DLT6')
df02 = df02[['ID đối tác','App Code']]
df02 = df02.rename(columns={"ID đối tác": "lead_id", "App Code": "app_code"})
df02 = df02.fillna(0)
df02['lead_id'] = df02['lead_id'].astype(int)

df1 = pd.concat([df02, df01, df00])

df2 = pd.read_excel('/home/linhnguyen/TS 28.6.xlsx', sheet_name = 'SỐ CASE LOAN')
df2 = df2[{'Mã hồ sơ', 'Sản phẩm', 'Số tiền được duyệt', 'Ngày giải ngân'}]
df2 = df2.rename(columns={"Sản phẩm": "product", "Mã hồ sơ": "app_code", "Số tiền được duyệt":"loan_amount", "Ngày giải ngân":"disbursed_date"})
df2['disbursed_date'] = pd.to_datetime(df2['disbursed_date'], format='%d/%m/%Y').dt.normalize()
fee_rate = pd.DataFrame(
    [
        {"product": "TRS01", "fee_rate": 0.055},
        {"product": "TRS02", "fee_rate": 0.06},
        {"product": "TSL01", "fee_rate": 0.055},
    ]
) 
df = df1.merge(df2, how = 'left').merge(fee_rate).assign(gross_revenue = lambda x: x.loan_amount * x.fee_rate)
disbursed3 = df[['lead_id','disbursed_date','loan_amount','gross_revenue']]
disbursed3['lead_id'] = disbursed3['lead_id'].astype(int)
disbursed3

  warn("""Cannot parse header or footer so it will be ignored""")
  df2 = df2[{'Mã hồ sơ', 'Sản phẩm', 'Số tiền được duyệt', 'Ngày giải ngân'}]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  disbursed3['lead_id'] = disbursed3['lead_id'].astype(int)


Unnamed: 0,lead_id,disbursed_date,loan_amount,gross_revenue
0,14205128,2023-06-28,15000000.0,900000.0
1,14202845,2023-06-27,15000000.0,900000.0
2,14198341,2023-06-28,30000000.0,1800000.0
3,14192242,2023-06-24,30000000.0,1800000.0
4,14185284,2023-06-22,10000000.0,600000.0
...,...,...,...,...
421,13923634,2023-04-07,30000000.0,1650000.0
422,13911137,2023-04-05,20000000.0,1100000.0
423,13906215,2023-04-06,15000000.0,825000.0
424,13901330,2023-04-15,10000000.0,550000.0


In [7]:
# ceoo = pd.read_csv('/home/linhnguyen/2023-05 - FE CEOO - AVAY_detail_lead.csv')
# ceoo = ceoo[~ceoo['disbursaldate'].isnull()]
# ceoo.rename(columns = {'disbursaldate': 'disbursed_date', 'loan_amt': 'loan_amount'}, inplace = True)
# ceoo['loan_amount'] = ceoo['loan_amount'].str.replace(',', '').astype(float)
# ceoo['gross_revenue'] = ceoo['loan_amount']*0.03
# ceoo = ceoo[['lead_id', 'disbursed_date', 'loan_amount', 'gross_revenue']]
# ceoo

ceoo = pd.read_csv('/home/linhnguyen/CEOO_daily.csv')
ceoo = ceoo[~ceoo['DISBURSED_DATE'].isnull()]
ceoo.rename(columns = {'DISBURSED_DATE': 'disbursed_date', 'APPROVED_LOAN_AMOUNT': 'loan_amount', 'ts_lead_id': 'lead_id'}, inplace = True)
ceoo['loan_amount'] = ceoo['loan_amount'].replace(',', '').astype(float)
ceoo['gross_revenue'] = ceoo['loan_amount']*0.03
ceoo = ceoo[['lead_id', 'disbursed_date', 'loan_amount', 'gross_revenue']]
ceoo

Unnamed: 0,lead_id,disbursed_date,loan_amount,gross_revenue
1081,13968672,2023-04-18,13574000.0,407220.0
1082,13968672,2023-04-18,13574000.0,407220.0
1083,13968672,2023-04-18,13574000.0,407220.0
1084,13968672,2023-04-18,13574000.0,407220.0
1085,13968672,2023-04-18,13574000.0,407220.0
...,...,...,...,...
2096536,13579667,2023-02-02,10000000.0,300000.0
2096588,13579326,2023-02-10,10000000.0,300000.0
2096592,13583094,2023-02-06,10000000.0,300000.0
2096650,13589331,2023-02-09,10000000.0,300000.0


In [8]:
# disbursed = disbursed1.combine_first(disbursed2).combine_first(disbursed3)

disbursed = pd.concat([disbursed1, disbursed2, disbursed3, ceoo])
disbursed = disbursed.drop_duplicates(subset=['lead_id'])
disbursed = disbursed.fillna(0)
disbursed['gross_revenue'] = disbursed['gross_revenue'].astype('float')
disbursed[disbursed['lead_id']==13733718]


Unnamed: 0,lead_id,disbursed_date,gross_revenue,loan_amount
18668,13733718.0,2023-03-01,825000.0,0.0


In [9]:
leads_with_disbursement = lead_phone_infos.merge(disbursed, how = 'left').assign(
    lead_sent_day=lambda x: x.loan_date.astype(str).str[8:10].astype(int),
    lead_sent_month=lambda x: x.loan_date.astype(str).str[0:7],
    loan_sent_month=lambda x: x.disbursed_date.astype(str).str[0:7],)


leads_with_disbursement["loan"] = np.where((leads_with_disbursement["disbursed_date"].notnull()),1,0)
leads_with_disbursement["loan_in_month"] = np.where((leads_with_disbursement["disbursed_date"].notnull())&(leads_with_disbursement["loan_sent_month"] == leads_with_disbursement["lead_sent_month"]) ,1,0)
leads_with_disbursement["commission"] = np.where((leads_with_disbursement["disbursed_date"].notnull()),leads_with_disbursement["gross_revenue"],0)
leads_with_disbursement["commission"] = leads_with_disbursement["commission"].astype('Float64')

# Define the conditions
condition1 = leads_with_disbursement['lead_source'] == 'VTP'
condition2 = (leads_with_disbursement['telco_code'] == 'viettel') & (leads_with_disbursement['bank_name'] != 'F88')

# Define the outputs for each condition
output1 = leads_with_disbursement['commission']*0.775
output2 = leads_with_disbursement['commission']*0.175


# Apply the conditions and outputs using np.select()

leads_with_disbursement["telco_share"] =  np.select([condition1, condition2], [output1, output2], default=0)
leads_with_disbursement.fillna(0)


Unnamed: 0,lead_id,lead_source,telco_code,Code,loan_date,bank_name,offer_code,cost,disbursed_date,gross_revenue,loan_amount,lead_sent_day,lead_sent_month,loan_sent_month,loan,loan_in_month,commission,telco_share
0,14015980,Others,others,OTHERS_Lotte Finance_LOTTE00,2023-05-01,Lotte Finance,LOTTE00,0,0,0.0,0.0,1,2023-05,,0,0,0.0,0.0
1,14016240,Others,others,OTHERS_Lotte Finance_LOTTE00,2023-05-01,Lotte Finance,LOTTE00,0,0,0.0,0.0,1,2023-05,,0,0,0.0,0.0
2,14016530,Others,others,OTHERS_Lotte Finance_LOTTE00,2023-05-01,Lotte Finance,LOTTE00,0,0,0.0,0.0,1,2023-05,,0,0,0.0,0.0
3,14016351,VTP,viettel,VT_FE Credit_3.Low,2023-05-01,FE Credit,1569,0,0,0.0,0.0,1,2023-05,,0,0,0.0,0.0
4,14018097,VTP,viettel,VT_FE Credit_3.Low,2023-05-01,FE Credit,1569,0,0,0.0,0.0,1,2023-05,,0,0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1657023,13643853,VTP,viettel,VT_FE Credit_CEOO,2023-02-08,FE Credit,FECEOO_VT_LOW,0,0,0.0,0.0,8,2023-02,,0,0,0.0,0.0
1657024,13789162,VTP,viettel,VT_FE Credit_CEOO,2023-03-10,FE Credit,FECEOO_VT_LOW,0,0,0.0,0.0,10,2023-03,,0,0,0.0,0.0
1657025,13720045,VTP,viettel,VT_FE Credit_CEOO,2023-02-26,FE Credit,FECEOO_VT_LOW,0,0,0.0,0.0,26,2023-02,,0,0,0.0,0.0
1657026,13472936,VTP,viettel,VT_FE Credit_CEOO,2022-12-28,FE Credit,FECEOO_VT_LOW,0,0,0.0,0.0,28,2022-12,,0,0,0.0,0.0


In [9]:
lead = leads_with_disbursement.groupby(["lead_sent_month", "Code", "bank_name", "lead_source"]).agg({"lead_id":"count","loan":"sum", "commission":"sum", "telco_share":"sum", "cost":"sum", "loan_in_month":"sum"}).reset_index()
lead['cr'] = lead['loan']/lead['lead_id']
lead['net revenue'] = lead['commission'] - lead['telco_share']
lead['commission per loan'] = lead['commission'] / lead['loan']
lead['revenue per lead'] = lead['net revenue']/lead['lead_id']
lead['cost per lead'] = lead['cost']/lead['lead_id']
lead['pct_loan_in_month'] = lead['loan_in_month']/lead['loan']

lead = lead[["lead_sent_month","Code","bank_name","lead_source","lead_id","loan","commission", "telco_share", "cost", "cr", 
"net revenue", "commission per loan", "revenue per lead", "cost per lead", "pct_loan_in_month" ]]
# repo.write_df_to_sheet(workbook.worksheet("note"),lead, starting_cell = 'A1')
#repo.write_df_to_sheet(workbook.worksheet("data_act&est"),lead, starting_cell = 'A1')

In [None]:
# loan = leads_with_disbursement.groupby(["loan_sent_month", "Code", "lead_source"]).agg({"loan":"sum", "commission":"sum", "telco_share":"sum"}).reset_index().rename(columns={'loan_sent_month': 'month', 'commission':'revenue'})

# output = lead.merge(loan, how = 'left')
# output['profit'] = output['revenue'] - output['telco_share'] - output['cost']
# output = output[['month','Code','lead_source', 'qualified_leads', 'loan', 'revenue', 'telco_share','cost', 'profit']]
# output
# repo.write_df_to_sheet(workbook.worksheet("data_act&est"),output, starting_cell = 'A1')


In [None]:
# pivot = output.pivot(index=['Code','lead_source'], columns='month', values=['qualified_leads','loan','revenue','telco_share','cost','profit']).reset_index()
# pivot

In [None]:
# repo.write_df_to_sheet(workbook.worksheet("MOM"),pivot, starting_cell = 'A1')
