In [None]:
# Notebook parameters (presumably overridden by the job runner — TODO confirm).
# ym: target month as a 'YYYYMM' string; get_lastmonth parses it with '%Y%m'.
ym = '202108'
# table_id: name of the source table to export. It must match one of the
# branches in get_query; any other value yields an empty query string.
table_id = 'user_profile_pivot_monthly'
# mode: interpolated into the output bucket name 'mls-profile-{mode}'.
mode = 'stg'

In [None]:
# Tables that are exported for the month *before* `ym`: when table_id is in
# this list, the query month is shifted back by one with get_lastmonth.
lastmonth_tables = [
    'mmkt_svc_bas_f',
    'life_locationfeature_monthly',
    'topic_modeling_monthly_v2',
    'user_profile_pivot_monthly',
]

In [None]:


def get_lastmonth(ym):
    """Return the calendar month that precedes ``ym``.

    Args:
        ym: Month as a ``'YYYYMM'`` string (parsed with the ``'%Y%m'`` format).

    Returns:
        The previous month, formatted as ``'YYYYMM'``.

    Raises:
        ValueError: If ``ym`` cannot be parsed with ``'%Y%m'``.
    """
    from datetime import datetime
    from dateutil.relativedelta import relativedelta

    current = datetime.strptime(ym, '%Y%m')
    previous = current - relativedelta(months=1)
    return previous.strftime('%Y%m')

In [None]:
def get_query(feature, ym):
    """Build the BigQuery SQL that extracts one month of a feature table.

    ``ym`` is interpolated directly into the SQL text, so callers must pass a
    trusted value.

    Args:
        feature: Table name selecting the query template. Recognised values
            are 'life_locationfeature_monthly', 'mmkt_svc_bas_f',
            'topic_modeling_monthly_v2' and 'user_profile_pivot_monthly'.
        ym: Month as a 'YYYYMM' string. For 'topic_modeling_monthly_v2' it is
            turned into the first day of that month ('YYYY-MM-01').

    Returns:
        The SQL string for ``feature``, or '' when ``feature`` is not one of
        the recognised names.
    """
    if feature == 'life_locationfeature_monthly':
        # Everything except the partition columns, for rows that have a key.
        return f"""
            SELECT
              * 
            FROM (
              SELECT
                *
                EXCEPT (ym, dt)
              FROM
                cpm.life_locationfeature_monthly
              WHERE
                ym = cast('{ym}' as INT64)
                and svc_mgmt_num is not null
            )
        """

    if feature == 'mmkt_svc_bas_f':
        # mth_age is re-emitted as INT64: the 'BBB' and '#' markers and NULL
        # are all mapped to -1.
        return f"""
            SELECT
              * 
            FROM (
              SELECT
                svc_mgmt_num,
                IFNULL(CAST(REPLACE(REPLACE(mth_age,'BBB','-1'),'#','-1') AS INT64), -1) AS mth_age,
                * 
                EXCEPT (svc_mgmt_num, mth_age)
              FROM
                comm.mmkt_svc_bas_f
              WHERE
                strd_ym = cast({ym} as int64)
                and svc_mgmt_num is not null
            )
        """

    if feature == 'topic_modeling_monthly_v2':
        # This table is filtered by a DATE column: use the first day of the month.
        first_day = ym[:4] + '-' + ym[4:6] + '-01'
        return f"""
            SELECT
                svc_mgmt_num,
                * 
                EXCEPT(
                    svc_mgmt_num, dt
                )
              FROM
                comm.topic_modeling_monthly_v2
              WHERE
                dt = cast('{first_day}' as DATE)
                and svc_mgmt_num is not null
        """

    if feature == 'user_profile_pivot_monthly':
        # The key column first, then every column not listed in EXCEPT.
        return f"""
            SELECT
                svc_mgmt_num,
                * EXCEPT(svc_mgmt_num,
                  sex_cd,
                  eqp_mdl_cd,
                  fee_prod_id,
                  mbr_use_cnt,
                  eqp_out_prc,
                  svc_scrb_dt,
                  t_agr_t_eqpal_scrb_mth_cnt,
                  op_sale_chnl_cl_cd,
                  cncl_aply_bf_equip_chg_dt,
                  othr_co_icall_call,
                  allot_prn_amt,
                  cncl_aply_last_equip_chg_dt,
                  job_cd,
                  nm_cust_num,
                  pmth_inv_bamt,
                  mbr_card_gr_cd,
                  svc_gr_cd,
                  mng_nice_cb_scr,
                  emart_icall_call,
                  t_agr_t_eqpal_scrb_yn,
                  mng_nice_cb_grd,
                  scrb_sale_chnl_cl_cd,
                  allot_mth_cnt,
                  cust_birth_dt,
                  equip_chg_mth_cnt,
                  t_agr_t_eqpal_scrb_ym)
            FROM
                comm.user_profile_pivot_monthly
            WHERE
                ym = cast({ym} as int64)
        """

    # Unrecognised feature name: no query.
    return ''

In [None]:
# Derive the working values under new names instead of rebinding the notebook
# parameters `ym` and `table_id`. Rebinding made this cell non-idempotent: a
# second run shifted the month back again and, because table_id had already
# been hyphenated, it no longer matched lastmonth_tables or any get_query
# branch (so both queries came back empty).
target_ym = get_lastmonth(ym) if table_id in lastmonth_tables else ym

# Snapshot to publish, and the snapshot one month earlier that it is compared
# against to find user_ids that disappeared.
query = get_query(table_id, target_ym)
month_ago_query = get_query(table_id, get_lastmonth(target_ym))

# Output paths use the hyphenated form of the table id.
table_slug = table_id.replace('_', '-')
output = f's3-prd://mls-profile-{mode}/user-profile-{table_slug}-nogzip/ym={target_ym}/op=put'
del_output = f's3-prd://mls-profile-{mode}/user-profile-{table_slug}-nogzip/ym={target_ym}/op=delete'

print(f'input query : {query}')
print(f'delete query : {month_ago_query}')
print(f'output : {output}')
print(f'delete output : {del_output}')

In [None]:
import json

from pyspark.sql import Row


def to_json_obj(row):
    """Serialize a row to a single-field ``Row`` holding a JSON object string.

    Only null (``None``) values are omitted from the JSON object. Falsy but
    meaningful values such as ``0``, ``0.0``, ``False`` and ``''`` are kept;
    the previous truthiness test silently dropped them even though the intent
    was to remove nulls only.

    Args:
        row: Object exposing ``asDict()`` (a pyspark ``Row``).

    Returns:
        A ``Row`` with one positional field: the JSON text of the non-null
        columns.

    Raises:
        TypeError: If a value is not JSON serializable by ``json.dumps``.
    """
    non_null = {
        key: value
        for key, value in row.asDict().items()
        if value is not None
    }
    return Row(json.dumps(non_null))

In [None]:
from pydatafabric.gcp import bq_to_df

# Run both queries (bq_to_df presumably returns a Spark DataFrame — TODO confirm)
# and expose the key column svc_mgmt_num under the name user_id.
df = bq_to_df(query).withColumnRenamed("svc_mgmt_num", "user_id")
month_ago_df = bq_to_df(month_ago_query).withColumnRenamed("svc_mgmt_num", "user_id")

# Key-only projections of the current and the previous-month snapshots.
keys_dt = df.select('user_id')
keys_last_dt = month_ago_df.select('user_id')

# left_anti keeps the previous-month user_ids that have no match in the
# current snapshot, i.e. the keys to emit as deletions.
deleted = keys_last_dt.join(keys_dt, keys_last_dt.user_id == keys_dt.user_id, how='left_anti')

# Cached for the write step; df_del is both counted and written there.
df_put = df.cache()
df_del = deleted.cache()

### Write
##### 전달과 비교해서 delete에도 써야함 (delete는 user_id만 write)

In [None]:
def _write_json_lines(frame, path):
    """Write ``frame`` to ``path`` as de-duplicated JSON text lines.

    Each row is converted with to_json_obj, duplicates are removed, and the
    result is written as text in 8 partitions, overwriting ``path``.
    """
    json_rows = frame.rdd.map(to_json_obj).toDF()
    writer = json_rows.distinct().repartition(8).write.mode('overwrite')
    writer.text(path)


# The delete output is only written when at least one user_id disappeared;
# the put output is always written.
has_deletions = df_del.count() > 0
if has_deletions:
    _write_json_lines(df_del, del_output)
_write_json_lines(df_put, output)