In [1]:
from bs4 import BeautifulSoup
import urllib.request
import requests
from pprint import pprint
import pandas as pd
import numpy as np
import json
import copy
from datetime import datetime, timezone
from google.cloud import storage
import gcsfs
import re
from sqlalchemy import create_engine, text
from typing import List

pd.set_option('display.max_columns', None)

In [2]:
postgres_config = {
    "host": "findy-medium-stage.czmgcqkw4ett.ap-southeast-1.rds.amazonaws.com",
    "database": "findy_medium_stage",
    "user": "postgres",
    "password": "F!nDy!Med!umStage2o24",
    "port": "5432"
}

In [3]:
def build_upsert_query(cols: List[str],
                       table_name: str,
                       unique_key: List[str]=[],
                       cols_not_for_update: List[str] = None) -> str:
    """
    Builds postgres upsert query using input arguments.
    Note: In the absence of unique_key, this will be just an insert query.
    Example : build_upsert_query(
        ['col1', 'col2', 'col3', 'col4'],
        "my_table",
        ['col1'],
        ['col2']
    ) ->
    INSERT INTO my_table (col1, col2, col3, col4) VALUES %s
    ON CONFLICT (col1) DO UPDATE SET (col3, col4) = (EXCLUDED.col3, EXCLUDED.col4) ;
    :param cols: the postgres table columns required in the
        insert part of the query.
    :param table_name: the postgres table name.
    :param unique_key: unique_key of the postgres table for checking
        unique constraint violations.
    :param cols_not_for_update: columns in cols which are not required in
        the update part of upsert query.
    :return: Upsert query as per input arguments.
    """
    cols = [f'"{col}"' for col in cols]
    cols_str = ', '.join(cols)
    insert_query = """ INSERT INTO %s (%s) VALUES %%s """ % (
        table_name, cols_str
    )
    if cols_not_for_update is not None:
        cols_not_for_update.extend(unique_key)
    else:
        cols_not_for_update = [col for col in unique_key]
    cols_not_for_update = [f'"{col}"' for col in cols_not_for_update]
    unique_key = [f'"{col}"' for col in unique_key]
    unique_key_str = ', '.join(unique_key)

    update_cols = [f"{col}" for col in cols if col not in cols_not_for_update]
    update_cols_str = ', '.join(update_cols)
    update_cols_with_excluded_markers = [f'EXCLUDED.{col}' for col in update_cols]
    update_cols_with_excluded_markers_str = ', '.join(
        update_cols_with_excluded_markers
    )
    if len(update_cols) > 1:
        equality_clause = "(%s) = (%s)"
    else:
        equality_clause = "%s = %s"

    on_conflict_clause = f""" ON CONFLICT (%s) DO UPDATE SET {equality_clause} ;"""
    on_conflict_clause = on_conflict_clause % (
        unique_key_str,
        update_cols_str,
        update_cols_with_excluded_markers_str
    )
    if len(unique_key) == 0:
        return insert_query
    return insert_query + on_conflict_clause

In [4]:
engine = create_engine("postgresql+psycopg2://{user}:{password}@{host}:{port}/{database}".format(**postgres_config))
df = pd.read_sql_table("douyin_influencer",con=engine)
df

Unnamed: 0,id,user_id,short_id,nickname,image_url,fans_count,city,province,level,order_cnt,engage_rate,expected_cpm,expected_play_num,sex,lowest_price,create_time,modify_time,is_star,e_commerce_enable,updated,avg_play,tags_ids_level_two,core_user_id,order_complete_rate,unique_id,middle_play,order_avg_time_cost,order_complete_cnt,total_favour_cnt,cooperate_index,cp_index,growth_index,shopping_index,spread_index,top_score,deleted,tags_ids,updated_hf
0,745,7067890156870565925,92630927447,搞个小知识,https://p3.douyinpic.com/aweme/1080x1080/aweme...,192981.0,郑州市,河南省,,,,,15629.0,1.0,0.0,2022-02-23 20:03:34+00:00,2024-09-08 18:57:02+00:00,False,False,2024-09-27 17:03:54.857606+00:00,0.0,[89],1293475888497507,,92630927447,,,2.0,,771700.0,750900.0,,891500.0,813900.0,788400.0,False,[87],NaT
1,733,7068534992799793192,155199886,山城百货,https://p3.douyinpic.com/aweme/1080x1080/aweme...,129771.0,遵义,,,,,,250.0,1.0,0.0,2022-03-13 11:06:30+00:00,2024-09-08 18:43:37+00:00,False,False,2024-09-27 17:03:54.857606+00:00,0.0,[61],76929752190,,155199886,,,0.0,,760300.0,787300.0,,362500.0,523900.0,646100.0,False,[60],NaT
2,585,7070160089897762823,70958977730,元气阿琪🍓,https://p3.douyinpic.com/aweme/1080x1080/aweme...,324587.0,杭州市,浙江省,,,,,27651.0,2.0,0.0,2022-03-01 22:52:04+00:00,2024-09-08 18:44:08+00:00,False,False,2024-09-27 17:03:54.857606+00:00,15177.0,"[19, 52]",572212131404632,,70958977730,,,0.0,,878800.0,804500.0,,656600.0,734900.0,802500.0,False,"[15, 48]",NaT
3,587,7071534354526109727,59257587642,初·见,https://p26.douyinpic.com/aweme/1080x1080/awem...,90366.0,商洛市,陕西省,,,,,,1.0,0.0,2022-03-05 15:44:55+00:00,2024-09-08 18:47:09+00:00,False,False,2024-09-27 17:03:54.857606+00:00,0.0,[101],2080697123092592,,BINSHAO202488,,,0.0,,760500.0,136900.0,,93500.0,123200.0,156500.0,False,[100],NaT
4,2238,7072628401739137032,2321574917,哈哈蕾,https://p11.douyinpic.com/aweme/1080x1080/awem...,790793.0,北京,,,,,,46491.0,2.0,0.0,2022-03-08 14:30:23+00:00,2024-09-18 05:06:59+00:00,False,False,2024-09-27 17:03:54.857606+00:00,0.0,[50],1244239607105891,,hxlstudio,,,0.0,,760300.0,641300.0,,658400.0,629500.0,630000.0,False,[48],NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36011,49196,7394354270025465893,,,,,,,,,,,13447.0,,,NaT,NaT,False,,NaT,,,1732429576407486,,,,,0.0,,570300.0,556700.0,,662900.0,594300.0,633500.0,False,,NaT
36012,39509,7394501308713025563,,,,,,,,,,,85369.0,,,NaT,NaT,False,,NaT,,,1622531211534028,,,,,2.0,,761400.0,706100.0,,573400.0,670400.0,642800.0,False,,NaT
36013,32564,7394706079147360266,,,,,,,,,,,105460.0,,,NaT,NaT,False,,NaT,,,417423506548387,,,,,1.0,,570000.0,685699.0,,678300.0,638000.0,642800.0,False,,NaT
36014,33403,7395241053396762662,,,,,,,5.0,,,,426059.0,,,NaT,NaT,False,,NaT,,,86936406689,,,,,23.0,,809500.0,686000.0,,715500.0,723300.0,723100.0,False,,NaT


In [5]:
df.columns

Index(['id', 'user_id', 'short_id', 'nickname', 'image_url', 'fans_count',
       'city', 'province', 'level', 'order_cnt', 'engage_rate', 'expected_cpm',
       'expected_play_num', 'sex', 'lowest_price', 'create_time',
       'modify_time', 'is_star', 'e_commerce_enable', 'updated', 'avg_play',
       'tags_ids_level_two', 'core_user_id', 'order_complete_rate',
       'unique_id', 'middle_play', 'order_avg_time_cost', 'order_complete_cnt',
       'total_favour_cnt', 'cooperate_index', 'cp_index', 'growth_index',
       'shopping_index', 'spread_index', 'top_score', 'deleted', 'tags_ids',
       'updated_hf'],
      dtype='object')

In [6]:
get_author_base_info = ["avg_play", "city", "e_commerce_enable", "lowest_price", "province", "short_id", "tags_ids", "tags_ids_level_two", "unique_id", "nickname", "image_url", "fans_count", "sex", "create_time", "modify_time"]

In [7]:
get_author_base_info_check = df[get_author_base_info].any(axis=1)
get_author_base_info_check

  get_author_base_info_check = df[get_author_base_info].any(axis=1)
  get_author_base_info_check = df[get_author_base_info].any(axis=1)


0         True
1         True
2         True
3         True
4         True
         ...  
36011    False
36012    False
36013    False
36014    False
36015    False
Length: 36016, dtype: bool

In [8]:
get_authors_ranking_in = ["level"]

In [9]:
get_authors_ranking_in_check = df[get_authors_ranking_in].any(axis=1)
get_authors_ranking_in_check

0        False
1        False
2        False
3        False
4        False
         ...  
36011    False
36012    False
36013    False
36014     True
36015    False
Length: 36016, dtype: bool

In [10]:
handler_post = ["order_complete_cnt", "cooperate_index", "shopping_index", "spread_index", "top_score", "cp_index"]

In [11]:
handler_post_check = df[handler_post].any(axis=1)
handler_post_check

0        True
1        True
2        True
3        True
4        True
         ... 
36011    True
36012    True
36013    True
36014    True
36015    True
Length: 36016, dtype: bool

In [12]:
df = df[pd.concat([get_author_base_info_check, handler_post_check], axis=1).all(axis=1)]
df

Unnamed: 0,id,user_id,short_id,nickname,image_url,fans_count,city,province,level,order_cnt,engage_rate,expected_cpm,expected_play_num,sex,lowest_price,create_time,modify_time,is_star,e_commerce_enable,updated,avg_play,tags_ids_level_two,core_user_id,order_complete_rate,unique_id,middle_play,order_avg_time_cost,order_complete_cnt,total_favour_cnt,cooperate_index,cp_index,growth_index,shopping_index,spread_index,top_score,deleted,tags_ids,updated_hf
0,745,7067890156870565925,92630927447,搞个小知识,https://p3.douyinpic.com/aweme/1080x1080/aweme...,192981.0,郑州市,河南省,,,,,15629.0,1.0,0.0,2022-02-23 20:03:34+00:00,2024-09-08 18:57:02+00:00,False,False,2024-09-27 17:03:54.857606+00:00,0.0,[89],1293475888497507,,92630927447,,,2.0,,771700.0,750900.0,,891500.0,813900.0,788400.0,False,[87],NaT
1,733,7068534992799793192,155199886,山城百货,https://p3.douyinpic.com/aweme/1080x1080/aweme...,129771.0,遵义,,,,,,250.0,1.0,0.0,2022-03-13 11:06:30+00:00,2024-09-08 18:43:37+00:00,False,False,2024-09-27 17:03:54.857606+00:00,0.0,[61],76929752190,,155199886,,,0.0,,760300.0,787300.0,,362500.0,523900.0,646100.0,False,[60],NaT
2,585,7070160089897762823,70958977730,元气阿琪🍓,https://p3.douyinpic.com/aweme/1080x1080/aweme...,324587.0,杭州市,浙江省,,,,,27651.0,2.0,0.0,2022-03-01 22:52:04+00:00,2024-09-08 18:44:08+00:00,False,False,2024-09-27 17:03:54.857606+00:00,15177.0,"[19, 52]",572212131404632,,70958977730,,,0.0,,878800.0,804500.0,,656600.0,734900.0,802500.0,False,"[15, 48]",NaT
3,587,7071534354526109727,59257587642,初·见,https://p26.douyinpic.com/aweme/1080x1080/awem...,90366.0,商洛市,陕西省,,,,,,1.0,0.0,2022-03-05 15:44:55+00:00,2024-09-08 18:47:09+00:00,False,False,2024-09-27 17:03:54.857606+00:00,0.0,[101],2080697123092592,,BINSHAO202488,,,0.0,,760500.0,136900.0,,93500.0,123200.0,156500.0,False,[100],NaT
4,2238,7072628401739137032,2321574917,哈哈蕾,https://p11.douyinpic.com/aweme/1080x1080/awem...,790793.0,北京,,,,,,46491.0,2.0,0.0,2022-03-08 14:30:23+00:00,2024-09-18 05:06:59+00:00,False,False,2024-09-27 17:03:54.857606+00:00,0.0,[50],1244239607105891,,hxlstudio,,,0.0,,760300.0,641300.0,,658400.0,629500.0,630000.0,False,[48],NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35806,831,6999676764494495774,3919055889,垫底辣孩,https://p11.douyinpic.com/aweme/1080x1080/awem...,12527065.0,淮南市,安徽省,20.0,,,,5504929.0,1.0,0.0,2021-08-24 00:20:46+00:00,2024-09-08 18:42:03+00:00,False,False,2024-09-27 17:03:54.857606+00:00,0.0,[74],1424563588829655,,diandilahai5,,,1.0,,759800.0,701400.0,,874400.0,902300.0,836600.0,False,[72],NaT
35836,809,7002242135076372512,98840874020,神仙岭的鸿哥（带徒弟）,https://p11.douyinpic.com/aweme/1080x1080/awem...,142271.0,深圳,,,,,,693.0,1.0,0.0,2021-10-26 18:06:43+00:00,2024-09-08 18:42:45+00:00,False,False,2024-09-27 17:03:54.857606+00:00,0.0,[38],1179144618776104,,jzhm131417,,,0.0,,760500.0,592400.0,,315800.0,331599.0,474900.0,False,[36],NaT
35865,2309,7040394258922078245,949140436,灯塔破壁人李晨（狸花猫电玩）,https://p3.douyinpic.com/aweme/1080x1080/aweme...,235177.0,扬州市,江苏省,,,,,36033.0,1.0,0.0,2021-12-11 17:45:26+00:00,2024-09-18 17:52:43+00:00,False,False,2024-09-27 17:03:54.857606+00:00,4838.0,[38],20370553497,,pbrlichen,,,0.0,,772600.0,865500.0,,815000.0,835699.0,832700.0,False,[36],NaT
35883,2310,7063866057437478925,3637227704,小艾科普,https://p26.douyinpic.com/aweme/1080x1080/awem...,257074.0,陇南市,甘肃省,,,,,9355.0,2.0,0.0,2022-02-12 23:48:00+00:00,2024-09-18 17:38:35+00:00,False,False,2024-09-27 17:03:54.857606+00:00,0.0,[90],4283287359463452,,dyqusezbbm9u,,,1.0,,772700.0,730300.0,,876900.0,849500.0,768600.0,False,[87],NaT


In [13]:
today = datetime.now()

In [14]:
df = df[~df[["updated"]].any(axis=1)]

  df = df[~df[["updated"]].any(axis=1)]


In [15]:
upsert_df = df[["user_id", "core_user_id", "is_star", "deleted"]]
upsert_df

Unnamed: 0,user_id,core_user_id,is_star,deleted


In [16]:
if len(upsert_df):
    upsert_df["updated"] = str(today)
    query = build_upsert_query(upsert_df.columns, "douyin_influencer", ["core_user_id"])
    value = ", ".join([str(record).replace("''", "NULL") for record in upsert_df.fillna("").to_records(index=False)])
    query = query % value
    with engine.connect() as conn:
        result = conn.execute(text(query))
        conn.commit()