In [10]:
import copy
import json
import os
import re
import urllib.request
from datetime import datetime, timezone
from pprint import pprint
from typing import List

import gcsfs
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from google.cloud import storage
from sqlalchemy import create_engine, text
from sqlalchemy.engine import URL

In [11]:
# Connection settings for the staging Postgres instance.
# The password is read from the environment so that no secret lives in the
# notebook (or its git history); export FINDY_PG_PASSWORD before starting the
# kernel. The other fields default to the staging values and can be overridden
# the same way. An unset password yields "", which makes the connection fail
# with an authentication error instead of silently using a baked-in secret.
# NOTE(review): the password that was previously committed here should be rotated.
postgres_config = {
    "host": os.environ.get(
        "FINDY_PG_HOST",
        "findy-medium-stage.czmgcqkw4ett.ap-southeast-1.rds.amazonaws.com",
    ),
    "database": os.environ.get("FINDY_PG_DATABASE", "findy_medium_stage"),
    "user": os.environ.get("FINDY_PG_USER", "postgres"),
    "password": os.environ.get("FINDY_PG_PASSWORD", ""),
    "port": os.environ.get("FINDY_PG_PORT", "5432"),
}

In [12]:
def build_upsert_query(cols: List[str],
                       table_name: str,
                       unique_key: List[str] = None,
                       cols_not_for_update: List[str] = None) -> str:
    """
    Builds a postgres upsert (INSERT ... ON CONFLICT) query template.

    The returned string keeps a single %s placeholder after VALUES for the
    caller to fill in. Column names are wrapped in double quotes; table_name
    is inserted verbatim. The input lists are never modified.

    Note: In the absence of unique_key, this will be just an insert query.
    Note: If every inserted column is part of unique_key or listed in
    cols_not_for_update, nothing is left to update and the conflict clause
    becomes DO NOTHING.

    Example : build_upsert_query(
        ['col1', 'col2', 'col3', 'col4'],
        "my_table",
        ['col1'],
        ['col2']
    ) ->
    INSERT INTO my_table ("col1", "col2", "col3", "col4") VALUES %s
    ON CONFLICT ("col1") DO UPDATE SET ("col3", "col4") = (EXCLUDED."col3", EXCLUDED."col4") ;

    :param cols: the postgres table columns required in the
        insert part of the query.
    :param table_name: the postgres table name.
    :param unique_key: unique_key of the postgres table for checking
        unique constraint violations. None or empty means plain insert.
    :param cols_not_for_update: columns in cols which are not required in
        the update part of upsert query.
    :return: Upsert query as per input arguments.
    """
    quoted_cols = [f'"{col}"' for col in cols]
    insert_query = " INSERT INTO %s (%s) VALUES %%s " % (
        table_name, ', '.join(quoted_cols)
    )

    # Work on copies: extending the caller's list in place would leak the key
    # columns into it and change the result of a later call with the same list.
    key_cols = list(unique_key) if unique_key is not None else []
    if not key_cols:
        return insert_query
    skipped_cols = list(cols_not_for_update) if cols_not_for_update is not None else []

    # Key columns are never updated: they are what the conflict is detected on.
    excluded = {f'"{col}"' for col in skipped_cols + key_cols}
    unique_key_str = ', '.join(f'"{col}"' for col in key_cols)

    update_cols = [col for col in quoted_cols if col not in excluded]
    if not update_cols:
        # "DO UPDATE SET  = " is a syntax error; with nothing to update the
        # conflicting row is simply kept as it is.
        return insert_query + f" ON CONFLICT ({unique_key_str}) DO NOTHING ;"

    targets = ', '.join(update_cols)
    sources = ', '.join(f'EXCLUDED.{col}' for col in update_cols)
    if len(update_cols) > 1:
        # The row-constructor form is only used for two or more columns.
        set_clause = f"({targets}) = ({sources})"
    else:
        set_clause = f"{targets} = {sources}"
    return insert_query + f" ON CONFLICT ({unique_key_str}) DO UPDATE SET {set_clause} ;"

In [13]:
# Run parameters: which API's staged output to load, and where it goes.
api_name = "get_author_link_info"      # substring used to select staged blobs
table_name = "douyin_influencer"       # destination Postgres table
bucket_name = "3_staging_area"         # GCS bucket holding the staged parquet files

# Timestamp of this run (naive, local time); used when recording processed files.
today = datetime.now()

# GCS client bound to the project that owns the buckets.
storage_client = storage.Client(project="ytone-430507")

In [14]:
# Build the connection URL from its components instead of str.format():
# URL.create escapes the credentials, so a password containing characters
# such as "@", "/" or ":" can no longer corrupt the URL.
engine = create_engine(
    URL.create(
        drivername="postgresql+psycopg2",
        username=postgres_config["user"],
        password=postgres_config["password"],
        host=postgres_config["host"],
        port=int(postgres_config["port"]),  # config stores the port as a string
        database=postgres_config["database"],
    )
)

# Current content of the destination table; it is the left side of the merge
# that attaches existing DB columns to the freshly staged rows.
l_df = pd.read_sql_table(table_name, con=engine)
l_df

Unnamed: 0,id,user_id,short_id,nickname,image_url,fans_count,city,province,level,order_cnt,...,total_favour_cnt,cooperate_index,cp_index,growth_index,shopping_index,spread_index,top_score,deleted,tags_ids,updated_hf
0,385502,7033195813941542920,,曼曼说漫,https://p26.douyinpic.com/aweme/1080x1080/awem...,276054.0,青浦区,上海市,,,...,,,,,,,,False,[95],NaT
1,299346,6837785038658994183,,爱尚北京,https://p3.douyinpic.com/aweme/1080x1080/aweme...,520544.0,东城区,北京市,,,...,,,,,,,,False,[48],NaT
2,281468,7051583381032665124,,小强,https://p3.douyinpic.com/aweme/1080x1080/aweme...,164938.0,合肥,,,,...,,,,,,,,False,[36],NaT
3,320496,6727459456277282829,,健身皓叔,https://p3.douyinpic.com/aweme/1080x1080/aweme...,276893.0,无锡市,江苏省,,,...,,,,,,,,False,[60],NaT
4,385451,7251096853150695461,,🍍蜜,https://p11.douyinpic.com/aweme/1080x1080/awem...,180023.0,九江,,,,...,,,,,,,,False,[95],NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161393,440400,7394282565030379561,,小妞说车,https://p11.douyinpic.com/aweme/1080x1080/awem...,798483.0,合肥,,,,...,,,,,,,,False,[31],NaT
161394,440401,7267752033296318523,,丰田队长,https://p3.douyinpic.com/aweme/1080x1080/aweme...,164515.0,沧州,,,,...,,,,,,,,False,[31],NaT
161395,440403,6869671499301650445,,帮帮说车,https://p26.douyinpic.com/aweme/1080x1080/awem...,112048.0,,,,,...,,,,,,,,False,[31],NaT
161396,440404,6870159996282208270,,中国交通安全栏,https://p11.douyinpic.com/aweme/1080x1080/awem...,481993.0,北京,,,,...,,,,,,,,False,[31],NaT


In [15]:
def _describe_blob(blob):
    """Return the blob together with the date and batch parsed from its path.

    The date is the third "/"-separated path component and the batch is the
    integer after the last "_" of the file name (".parquet" removed).
    """
    path_parts = blob.name.split('/')
    file_stem = path_parts[-1].replace(".parquet", "")
    return {
        "blob": blob,
        "date": path_parts[2],
        "batch": int(file_stem.split("_")[-1]),
    }


# Staged files for this API. The bucket comes from bucket_name rather than a
# repeated literal so the listing and the later reads cannot drift apart.
processing_blobs = [
    _describe_blob(blob)
    for blob in storage_client.list_blobs(bucket_name, prefix="1_xingtu/douyin_influencer/")
    if api_name in blob.name
]
processing_blobs

[{'blob': <Blob: 3_staging_area, 1_xingtu/douyin_influencer/2024-09-11/get_author_link_info_240920_0.parquet, 1726817967674340>,
  'date': '2024-09-11',
  'batch': 0},
 {'blob': <Blob: 3_staging_area, 1_xingtu/douyin_influencer/2024-09-17/get_author_link_info_240920_0.parquet, 1726818222494826>,
  'date': '2024-09-17',
  'batch': 0},
 {'blob': <Blob: 3_staging_area, 1_xingtu/douyin_influencer/2024-09-18/get_author_link_info_240920_0.parquet, 1726818224144416>,
  'date': '2024-09-18',
  'batch': 0},
 {'blob': <Blob: 3_staging_area, 1_xingtu/douyin_influencer/2024-09-19/get_author_link_info_240920_0.parquet, 1726818225742625>,
  'date': '2024-09-19',
  'batch': 0}]

In [16]:
# meta.json is the bookkeeping file listing every file that was already loaded
# (entries carry 'file_path', 'date' and 'batch').
bucket = storage_client.get_bucket(bucket_name)
meta_blob = bucket.blob("1_xingtu/douyin_influencer/meta.json")

# Materialise the decoded JSON as a fresh list; new entries are appended to it
# and written back once a file has been loaded.
processed_blobs = list(json.loads(meta_blob.download_as_string()))
processed_blobs


[{'file_path': 'gs://0_raw_data/2_xingtu/2024-09-08/xingtu_detail_240908_0.json',
  'date': '2024-09-08',
  'batch': 0},
 {'file_path': 'gs://0_raw_data/2_xingtu/2024-09-08/xingtu_detail_240908_1.json',
  'date': '2024-09-08',
  'batch': 1},
 {'file_path': 'gs://3_staging_area/1_xingtu/douyin_influencer/get_author_base_info_240927_0.parquet',
  'date': '2024-09-11',
  'batch': 0},
 {'file_path': 'gs://3_staging_area/1_xingtu/douyin_influencer/get_author_base_info_240927_0.parquet',
  'date': '2024-09-17',
  'batch': 0},
 {'file_path': 'gs://3_staging_area/1_xingtu/douyin_influencer/get_author_base_info_240927_0.parquet',
  'date': '2024-09-18',
  'batch': 0},
 {'file_path': 'gs://3_staging_area/1_xingtu/douyin_influencer/get_author_base_info_240927_0.parquet',
  'date': '2024-09-19',
  'batch': 0},
 {'file_path': 'gs://3_staging_area/1_xingtu/douyin_influencer/get_authors_ranking_in_240927_0.parquet',
  'date': '2024-09-11',
  'batch': 0},
 {'file_path': 'gs://3_staging_area/1_xingtu/d

In [17]:
# Select the staged files that still need loading: dated on/after the cutoff
# and not yet recorded in meta.json for this API with the same date and batch.
cutoff_date = datetime(2024, 9, 8)

# (date, batch) pairs already recorded for this API.
already_done = {
    (entry["date"], entry["batch"])
    for entry in processed_blobs
    if api_name in entry["file_path"]
}

to_process = []
for candidate in processing_blobs:
    if datetime.strptime(candidate["date"], "%Y-%m-%d") < cutoff_date:
        continue
    if (candidate["date"], candidate["batch"]) in already_done:
        continue
    to_process.append(candidate)

pprint(to_process)
print(len(to_process))

[]
0


In [9]:
# Load every pending staged file into Postgres and record it in meta.json.
#
# Values are sent as bound parameters. Splicing str(record) into the SQL text
# breaks on apostrophes, "%" and ":name" patterns in scraped text, and is an
# injection risk.

UPSERT_CHUNK_ROWS = 1000  # rows per INSERT statement, to bound statement size


def _to_db_value(value):
    """Convert one DataFrame cell into a value the DB driver can bind."""
    if isinstance(value, str):
        # The previous string-built query turned empty strings into NULL; keep that.
        return value or None
    if isinstance(value, np.ndarray):
        # List-valued parquet cells may come back as arrays; bind them as lists.
        return value.tolist()
    if isinstance(value, (list, tuple, dict)):
        return value
    missing = pd.isna(value)
    if isinstance(missing, (bool, np.bool_)) and missing:
        return None  # NaN / NaT / pd.NA -> NULL
    if isinstance(value, np.generic):
        return value.item()  # numpy scalar -> plain Python scalar
    return value


def _upsert_rows(conn, query_template, rows):
    """Execute the upsert in chunks of multi-row VALUES with bound parameters."""
    for start in range(0, len(rows), UPSERT_CHUNK_ROWS):
        params = {}
        value_groups = []
        for row_no, row in enumerate(rows[start:start + UPSERT_CHUNK_ROWS]):
            names = [f"r{row_no}c{col_no}" for col_no in range(len(row))]
            value_groups.append("(" + ", ".join(f":{name}" for name in names) + ")")
            params.update(zip(names, row))
        # query_template carries one %s after VALUES (see build_upsert_query).
        conn.execute(text(query_template % ", ".join(value_groups)), params)


for item in to_process:
    r_df = pd.read_parquet("gs://" + bucket_name + "/" + item["blob"].name)
    r_df["deleted"] = False

    # Empty right-hand suffix: columns present on both sides keep the staged
    # (right) value under the original name and the DB copy gets "_x". This
    # replaces stripping "_y" from every column name, which also mangled any
    # name that merely contained "_y".
    df = pd.merge(l_df, r_df, on="user_id", how="right", suffixes=("_x", ""))

    # dict.fromkeys de-duplicates while keeping order, in case the staged file
    # itself carries one of the two leading columns.
    upsert_cols = list(dict.fromkeys(["core_user_id", "is_star"] + list(r_df.columns)))
    upsert_df = df[upsert_cols]

    query = build_upsert_query(upsert_cols, table_name, ["core_user_id"])
    rows = [
        tuple(_to_db_value(cell) for cell in row)
        for row in upsert_df.itertuples(index=False, name=None)
    ]

    # engine.begin() commits on success and rolls back if any chunk fails, so
    # a file is never half-loaded. An empty file executes no statement.
    with engine.begin() as conn:
        _upsert_rows(conn, query, rows)

    # Record the file only after its rows are committed, and persist the list
    # after every file so an interrupted run resumes where it stopped.
    # NOTE(review): file_path is synthesised from api_name and today's date,
    # not the blob's real path; the selection step only checks that api_name
    # occurs in it, so the format is kept unchanged.
    processed_blobs.append({
        'file_path': "gs://" + bucket_name + "/1_xingtu/douyin_influencer/" + api_name + "_" + today.strftime("%y%m%d") + "_" + str(item["batch"]) + ".parquet",
        'date': item["date"],
        'batch': item["batch"]
    })
    meta_blob.upload_from_string(json.dumps(processed_blobs))