In [9]:
from bs4 import BeautifulSoup
import urllib.request
import requests
from pprint import pprint
import pandas as pd
import json
import copy
from datetime import datetime, timezone
from google.cloud import storage
import gcsfs
import re

In [10]:
# Naming pieces shared by the cells below: the staging table folder, the API
# name used as the parquet file prefix, and the raw-file name fragment.
table_name = "douyin_influencer"
api_name = "search_for_author_square"
file_name = "xingtu_basic"

# Timestamp of this run (naive local time); later used to stamp output file names.
today = datetime.now()

# Google Cloud Storage handles: project client and the staging bucket.
storage_client = storage.Client(project="ytone-430507")
bucket = storage_client.bucket("3_staging_area")

In [11]:
def _raw_blob_entry(raw_blob):
    """Describe one raw JSON blob: the blob itself, its folder date and batch number.

    The date is parsed from the second path segment (``%Y-%m-%d``) and the
    batch number from the last ``_``-separated token of the file name once
    ``.json`` has been removed.
    """
    leaf_name = raw_blob.name.split("/")[-1]
    return {
        "blob": raw_blob,
        "date": datetime.strptime(raw_blob.name.split("/")[1], "%Y-%m-%d"),
        "batch_number": int(leaf_name.replace(".json", "").split("_")[-1]),
    }


# Every blob under 0_raw_data/2_xingtu/ whose name contains `file_name`.
processing_blobs = [
    _raw_blob_entry(raw_blob)
    for raw_blob in storage_client.list_blobs("0_raw_data", prefix="2_xingtu/")
    if file_name in raw_blob.name
]
processing_blobs

[{'blob': <Blob: 0_raw_data, 2_xingtu/2024-09-07/xingtu_basic_240907_0.json, 1725791792735542>,
  'date': datetime.datetime(2024, 9, 7, 0, 0),
  'batch_number': 0},
 {'blob': <Blob: 0_raw_data, 2_xingtu/2024-09-08/xingtu_basic_240908_0.json, 1725800931531139>,
  'date': datetime.datetime(2024, 9, 8, 0, 0),
  'batch_number': 0},
 {'blob': <Blob: 0_raw_data, 2_xingtu/2024-09-08/xingtu_basic_240908_1.json, 1725800934868643>,
  'date': datetime.datetime(2024, 9, 8, 0, 0),
  'batch_number': 1},
 {'blob': <Blob: 0_raw_data, 2_xingtu/2024-09-08/xingtu_basic_240908_2.json, 1725800937650829>,
  'date': datetime.datetime(2024, 9, 8, 0, 0),
  'batch_number': 2},
 {'blob': <Blob: 0_raw_data, 2_xingtu/2024-09-08/xingtu_basic_240908_3.json, 1725800941091533>,
  'date': datetime.datetime(2024, 9, 8, 0, 0),
  'batch_number': 3},
 {'blob': <Blob: 0_raw_data, 2_xingtu/2024-09-08/xingtu_basic_240908_4.json, 1725800945109037>,
  'date': datetime.datetime(2024, 9, 8, 0, 0),
  'batch_number': 4},
 {'blob': 

In [12]:
def _is_staged_batch(staged_blob):
    """True for ``1_xingtu/<table_name>/<date>/<file>`` blobs whose name contains ``api_name``."""
    segments = staged_blob.name.split("/")
    return (
        table_name == segments[1]
        and api_name in staged_blob.name
        and len(segments) == 4
    )


def _staged_blob_entry(staged_blob):
    """Describe one staged parquet blob: the blob, its folder date and batch number.

    The date comes from the third path segment (``%Y-%m-%d``); the batch
    number is the last ``_``-separated token of the file name with
    ``.parquet`` removed.
    """
    leaf_name = staged_blob.name.split("/")[-1]
    return {
        "blob": staged_blob,
        "date": datetime.strptime(staged_blob.name.split("/")[2], "%Y-%m-%d"),
        "batch_number": int(leaf_name.replace(".parquet", "").split("_")[-1]),
    }


# Batches that already exist in the staging bucket for this table / API.
processed_blobs = [
    _staged_blob_entry(staged_blob)
    for staged_blob in storage_client.list_blobs("3_staging_area", prefix="1_xingtu/")
    if _is_staged_batch(staged_blob)
]
processed_blobs

[{'blob': <Blob: 3_staging_area, 1_xingtu/douyin_influencer/2024-09-08/search_for_author_square_241004_0.parquet, 1728031831447964>,
  'date': datetime.datetime(2024, 9, 8, 0, 0),
  'batch_number': 0},
 {'blob': <Blob: 3_staging_area, 1_xingtu/douyin_influencer/2024-10-02/search_for_author_square_241007_0.parquet, 1728296556790576>,
  'date': datetime.datetime(2024, 10, 2, 0, 0),
  'batch_number': 0},
 {'blob': <Blob: 3_staging_area, 1_xingtu/douyin_influencer/2024-10-02/search_for_author_square_241007_1.parquet, 1728296876872193>,
  'date': datetime.datetime(2024, 10, 2, 0, 0),
  'batch_number': 1},
 {'blob': <Blob: 3_staging_area, 1_xingtu/douyin_influencer/2024-10-02/search_for_author_square_241007_10.parquet, 1728296883690546>,
  'date': datetime.datetime(2024, 10, 2, 0, 0),
  'batch_number': 10},
 {'blob': <Blob: 3_staging_area, 1_xingtu/douyin_influencer/2024-10-02/search_for_author_square_241007_11.parquet, 1728296890322348>,
  'date': datetime.datetime(2024, 10, 2, 0, 0),
  'ba

In [13]:
# Raw blobs dated before this cutoff are never (re)processed. The value is the
# one hard-coded in the original cell; the reason for it is not recorded here.
cutoff_date = datetime(2024, 10, 2)

# (date, batch_number) pairs that already have a staged parquet file. Built
# once so each raw blob is checked with a single set lookup instead of
# rebuilding a list of staged batch numbers on every iteration.
staged_keys = {
    (staged["date"], staged["batch_number"]) for staged in processed_blobs
}

# Raw blobs on/after the cutoff that have no staged counterpart yet.
to_process = [
    candidate
    for candidate in processing_blobs
    if candidate["date"] >= cutoff_date
    and (candidate["date"], candidate["batch_number"]) not in staged_keys
]
pprint(to_process)
print(len(to_process))

[]
0


In [14]:
def get_basic_raw_data(item):
    """Download one raw blob and return every author's ``attribute_datas``.

    Args:
        item: Mapping with a ``"blob"`` entry exposing ``download_as_bytes()``
            that returns a JSON document. The document is iterated as a
            sequence of objects, each holding an ``"authors"`` list whose
            elements carry an ``"attribute_datas"`` entry.

    Returns:
        A flat list of the ``attribute_datas`` values, in document order.

    Raises:
        KeyError: If an entry lacks ``"authors"`` or ``"attribute_datas"``.
    """
    # download_as_bytes() replaces the deprecated download_as_string() alias;
    # json.loads accepts the UTF-8 bytes directly.
    pages = json.loads(item["blob"].download_as_bytes())
    return [
        author["attribute_datas"]
        for page in pages
        for author in page["authors"]
    ]

In [15]:
# Load the author filter options JSON kept in the staging bucket.
blob = bucket.blob("1_xingtu/author_options.json")
# download_as_bytes() replaces the deprecated download_as_string() alias;
# json.loads accepts the returned bytes directly.
author_options = json.loads(blob.download_as_bytes())
pprint(author_options)

{'code': 0,
 'data': {'author_gpm': [{'author_gpm__ge': 150},
                         {'author_gpm__ge': 100, 'author_gpm__le': 150},
                         {'author_gpm__ge': 50, 'author_gpm__le': 100},
                         {'author_gpm__ge': 30, 'author_gpm__le': 50},
                         {'author_gpm__ge': 10, 'author_gpm__le': 30},
                         {'author_gpm__ge': 5, 'author_gpm__le': 10},
                         {'author_gpm__le': 5}],
          'avg_sale_amount': [{'avg_sale_amount__ge': 100000000},
                              {'avg_sale_amount__ge': 50000000,
                               'avg_sale_amount__le': 100000000},
                              {'avg_sale_amount__ge': 10000000,
                               'avg_sale_amount__le': 50000000},
                              {'avg_sale_amount__ge': 5000000,
                               'avg_sale_amount__le': 10000000},
                              {'avg_sale_amount__ge': 1000000,
                

In [16]:
# Tag taxonomy taken from the author options payload. As the printed output
# shows, it is a list of entries with a 'first' mapping (tag id -> tag name)
# and a 'second' list of such single-pair mappings. It is read as a
# notebook-level global by get_search_for_author_square_data.
tags = author_options["data"]["tag"]
pprint(tags)

[{'first': {'1': '美妆'},
  'second': [{'2': '美妆教程'}, {'3': '妆容展示'}, {'4': '护肤保养'}, {'5': '美妆测评种草'}]},
 {'first': {'6': '时尚'},
  'second': [{'7': '穿搭'}, {'8': '街拍'}, {'10': '造型'}, {'135': '时尚媒体'}]},
 {'first': {'11': '萌宠'},
  'second': [{'12': '日常宠物'}, {'13': '特别宠物'}, {'14': '宠物周边'}]},
 {'first': {'15': '测评'},
  'second': [{'16': '美妆测评'},
             {'17': '3C数码测评'},
             {'18': '汽车测评'},
             {'19': '美食产品测评'},
             {'20': '母婴产品测评'},
             {'21': '综合测评'},
             {'132': '酒店测评'}]},
 {'first': {'23': '游戏'},
  'second': [{'121': '游戏剧情'},
             {'122': '游戏解说'},
             {'123': '游戏资讯'},
             {'124': '游戏其他'},
             {'440': '游戏录屏'},
             {'441': '游戏集锦'}]},
 {'first': {'25': '二次元'},
  'second': [{'125': '二次元真人'},
             {'126': '动画漫画'},
             {'127': '配音声优'},
             {'128': '宅物手办'}]},
 {'first': {'27': '旅行'},
  'second': [{'28': '旅行记录'}, {'29': '旅行攻略'}, {'30': '旅行推荐'}, {'442': '户外生活'}]},
 {'first': {'31':

In [17]:
def _lowest_price(data):
    """Return the smallest truthy value among the three price fields, or None."""
    quoted = [
        data.get(field)
        for field in ("price_1_20", "price_20_60", "price_60")
    ]
    # Missing, None and 0 prices are ignored, exactly as a truthiness check would.
    quoted = [price for price in quoted if price]
    return min(quoted) if quoted else None


def _match_tag_ids(tags_relation, tag_options):
    """Map tag names in ``tags_relation`` to (first-level ids, second-level ids).

    ``tags_relation`` maps a first-level tag name to an iterable of
    second-level tag names. Second-level names are only looked up inside the
    options entry whose first-level name matched. Ids are collected in the
    order of ``tags_relation`` first and ``tag_options`` second.
    """
    first_ids = []
    second_ids = []
    for first_name, second_names in tags_relation.items():
        for option in tag_options:
            for first_id, first_label in option["first"].items():
                if first_name != first_label:
                    continue
                first_ids.append(first_id)
                second_options = option["second"]
                # Tolerate a null/empty second-level list instead of raising.
                for second_name in second_names or ():
                    second_ids.extend(
                        second_id
                        for entry in second_options
                        for second_id, second_label in entry.items()
                        if second_name == second_label
                    )
    return first_ids, second_ids


def get_search_for_author_square_data(raw_data, tag_options=None):
    """Flatten raw author attribute dicts into staging records.

    Args:
        raw_data: Iterable of dicts (one per author) read with ``.get`` for
            ``id``, ``nick_name``, ``avatar_uri``, ``follower``, ``city``,
            ``province``, ``expected_play_num``, ``gender``, the three
            ``price_*`` fields, ``e_commerce_enable``, ``tags_relation``
            (a JSON string) and ``core_user_id``.
        tag_options: Optional tag taxonomy: a list of entries with a
            ``"first"`` mapping (id -> name) and a ``"second"`` list of such
            mappings. When omitted, the notebook-level ``tags`` variable is
            used, which keeps existing single-argument calls working.

    Returns:
        A list of dicts, one per input element. ``lowest_price`` is present
        only when at least one price field is truthy; ``tags_ids`` and
        ``tags_ids_level_two`` (formatted as ``{id,id,...}``) are present only
        when ``tags_relation`` is truthy.
    """
    records = []
    for data in raw_data:
        record = {
            "user_id": data.get("id"),
            "nickname": data.get("nick_name"),
            "image_url": data.get("avatar_uri"),
            "fans_count": data.get("follower"),
            "city": data.get("city"),
            "province": data.get("province"),
            "expected_play_num": data.get("expected_play_num"),
            "sex": data.get("gender"),
        }

        lowest_price = _lowest_price(data)
        if lowest_price is not None:
            record["lowest_price"] = lowest_price

        record["e_commerce_enable"] = data.get("e_commerce_enable")

        raw_relation = data.get("tags_relation")
        if raw_relation:
            # Resolve the fallback lazily so the global is only needed when a
            # record actually carries tags.
            options = tags if tag_options is None else tag_options
            first_ids, second_ids = _match_tag_ids(json.loads(raw_relation), options)
            record["tags_ids"] = "{" + ",".join(first_ids) + "}"
            record["tags_ids_level_two"] = "{" + ",".join(second_ids) + "}"

        record["core_user_id"] = data.get("core_user_id")
        records.append(record)
    return records

In [18]:
# Convert every pending raw batch and write it to the staging bucket as parquet.
for item in to_process:
    raw_data = get_basic_raw_data(item)
    search_for_author_square_data = get_search_for_author_square_data(raw_data)
    df = pd.DataFrame(search_for_author_square_data)

    snapshot_day = item["date"].date()      # folder name: date of the raw snapshot
    run_stamp = today.strftime("%y%m%d")    # file-name stamp: date of this run
    batch_number = item["batch_number"]
    # The table folder comes from `table_name` (not a repeated literal) so the
    # written path always matches the one the processed-blob scan filters on.
    output_path = (
        f"gs://3_staging_area/1_xingtu/{table_name}/{snapshot_day}/"
        f"{api_name}_{run_stamp}_{batch_number}.parquet"
    )
    df.to_parquet(output_path)