In [12]:
from bs4 import BeautifulSoup
import urllib.request
import requests
from pprint import pprint
import pandas as pd
import json
import copy
from datetime import datetime, timezone
from google.cloud import storage
import gcsfs
import re

In [13]:
# Google Cloud Storage client bound to the project that owns the buckets listed below.
storage_client = storage.Client(project="ytone-430507")
# Time of this run (naive value from datetime.now()); formatted into output file names.
today = datetime.now()
# Staging-area folder name; staged parquet files are matched against this path segment.
table_name = "douyin_video"
# API name; raw blobs are selected when their path contains this string.
api_name = "search_for_star_hot_item"

In [14]:
# Raw JSON blobs for this API. Paths look like
# "2_xingtu/<YYYY-MM-DD>/<api_name>_<yymmdd>_<batch>.json" (see the output below):
# the date comes from the second path segment and the batch number from the
# trailing "_<n>" of the file name.
processing_blobs = [
    {
        "blob": blob,
        "date": datetime.strptime(parts[1], "%Y-%m-%d"),
        "batch_number": int(parts[-1].replace(".json", "").split("_")[-1]),
    }
    for blob, parts in (
        # Split each path once and carry the segments alongside the blob.
        (candidate, candidate.name.split('/'))
        for candidate in storage_client.list_blobs("0_raw_data", prefix="2_xingtu/")
    )
    if api_name in blob.name
]
processing_blobs

[{'blob': <Blob: 0_raw_data, 2_xingtu/2024-10-02/search_for_star_hot_item_241002_0.json, 1727845379748570>,
  'date': datetime.datetime(2024, 10, 2, 0, 0),
  'batch_number': 0},
 {'blob': <Blob: 0_raw_data, 2_xingtu/2024-10-06/search_for_star_hot_item_241006_0.json, 1728189969156322>,
  'date': datetime.datetime(2024, 10, 6, 0, 0),
  'batch_number': 0}]

In [15]:
# Parquet files already written to the staging area for this table. Only paths
# with exactly four segments ("1_xingtu/<table>/<YYYY-MM-DD>/<file>.parquet")
# whose second segment equals table_name are kept.
processed_blobs = [
    {
        "blob": blob,
        "date": datetime.strptime(parts[2], "%Y-%m-%d"),
        "batch_number": int(parts[-1].replace(".parquet", "").split("_")[-1]),
    }
    for blob, parts in (
        # Split each path once and carry the segments alongside the blob.
        (candidate, candidate.name.split("/"))
        for candidate in storage_client.list_blobs("3_staging_area", prefix="1_xingtu/")
    )
    if table_name == parts[1] and len(parts) == 4
]
processed_blobs

[{'blob': <Blob: 3_staging_area, 1_xingtu/douyin_video/2024-09-30/search_for_star_hot_item_241001_0.parquet, 1727772493257816>,
  'date': datetime.datetime(2024, 9, 30, 0, 0),
  'batch_number': 0},
 {'blob': <Blob: 3_staging_area, 1_xingtu/douyin_video/2024-10-02/search_for_star_hot_item_241002_0.parquet, 1727845663822959>,
  'date': datetime.datetime(2024, 10, 2, 0, 0),
  'batch_number': 0},
 {'blob': <Blob: 3_staging_area, 1_xingtu/douyin_video/2024-10-06/search_for_star_hot_item_241007_0.parquet, 1728274957271450>,
  'date': datetime.datetime(2024, 10, 6, 0, 0),
  'batch_number': 0}]

In [16]:
# Index the staged files by (date, batch_number) once, so each raw blob needs a
# single set lookup instead of rebuilding a list of batch numbers per blob.
processed_keys = {
    (processed_blob["date"], processed_blob["batch_number"])
    for processed_blob in processed_blobs
}

# A raw blob is still pending when no staged file shares its date and batch number.
to_process = [
    processing_blob
    for processing_blob in processing_blobs
    if (processing_blob["date"], processing_blob["batch_number"]) not in processed_keys
]
pprint(to_process)
print(len(to_process))

[]
0


In [6]:
# Industry label -> numeric id, used when building the "industry_ids" field.
# Ids are written out explicitly so reordering the pairs never changes them.
_industry_pairs = [
    ("美妆", 1),
    ("3C及电器", 2),
    ("零售", 3),
    ("日化", 4),
    ("游戏", 5),
    ("食品饮料", 6),
    ("工具类软件", 7),
    ("母婴宠物", 8),
    ("服装配饰", 9),
    ("汽车", 10),
]
industry_mapping = dict(_industry_pairs)
industry_mapping

{'美妆': 1,
 '3C及电器': 2,
 '零售': 3,
 '日化': 4,
 '游戏': 5,
 '食品饮料': 6,
 '工具类软件': 7,
 '母婴宠物': 8,
 '服装配饰': 9,
 '汽车': 10}

In [7]:
# Content-category label -> numeric id, used when building the "category_ids" field.
# Ids are written out explicitly so reordering the pairs never changes them.
_category_pairs = [
    ("时尚", 1),
    ("美食", 2),
    ("生活记录", 3),
    ("生活家居", 4),
    ("亲子", 5),
    ("母婴", 6),
    ("游戏", 7),
    ("教育校园", 8),
    ("汽车", 9),
    ("体育", 10),
    ("科技", 11),
    ("科普", 12),
    ("剧情", 13),
    ("旅行", 14),
    ("动物", 15),
]
category_mapping = dict(_category_pairs)
category_mapping

{'时尚': 1,
 '美食': 2,
 '生活记录': 3,
 '生活家居': 4,
 '亲子': 5,
 '母婴': 6,
 '游戏': 7,
 '教育校园': 8,
 '汽车': 9,
 '体育': 10,
 '科技': 11,
 '科普': 12,
 '剧情': 13,
 '旅行': 14,
 '动物': 15}

In [8]:
def get_search_for_star_hot_item_data(raw_data):
    """Flatten raw ``search_for_star_hot_item`` entries into one dict per item.

    Args:
        raw_data: iterable of dicts. Entries whose ``"data"`` value is missing
            or falsy are skipped. The remaining entries must provide
            ``"item_list_type"``, ``"industry"`` and ``"content"``.

    Returns:
        A list of record dicts in input order. ``"title"`` is present only
        when the item carries a non-empty ``"item_title"``.

    Raises:
        KeyError: if a required field is absent, or if the entry's industry or
            content label is not in ``industry_mapping`` / ``category_mapping``.
    """

    def _unquote(text):
        # Remove every double-quote character from the value.
        return text.replace('"', "")

    records = []
    for entry in raw_data:
        hot_items = entry.get("data")
        if not hot_items:
            continue
        for hot_item in hot_items:
            attrs = hot_item.get("attribute_datas")
            row = {
                "video_id": _unquote(attrs["video_id"]),
                "item_id": hot_item["item_id"],
            }
            # The title is optional; it is inserted here so the key order
            # matches the order in which the fields are listed below.
            title = attrs.get("item_title")
            if title:
                row["title"] = _unquote(title)
            row.update({
                "image_url": _unquote(attrs["item_cover_url"]),
                "like": _unquote(attrs["like"]),
                "play": _unquote(attrs["play"]),
                "influencer_id": hot_item["star_id"],
                "component_convert_count": _unquote(attrs["component_convert_count"]),
                "sale_amount": _unquote(attrs["sale_amount"]),
                "interaction_rate": attrs["interact_rate"],
                "completion_rate": attrs["finish_play_rate"],
                # Stored one lower than the source item_list_type.
                "xt_ranking_type": entry["item_list_type"] - 1,
                # Single id wrapped in braces, e.g. "{3}".
                "industry_ids": "{" + str(industry_mapping[entry["industry"]]) + "}",
                "category_ids": "{" + str(category_mapping[entry["content"]]) + "}",
            })
            records.append(row)
    return records

In [11]:
# Convert every pending raw blob into a parquet file in the staging area.
for item in to_process:
    # download_as_bytes() replaces the deprecated download_as_string();
    # json.loads accepts bytes directly.
    raw_data = json.loads(item["blob"].download_as_bytes())
    search_for_star_hot_item_data = get_search_for_star_hot_item_data(raw_data)
    df = pd.DataFrame(search_for_star_hot_item_data)
    # Build the path from the same table_name / api_name constants that the
    # discovery cells filter on, so the written file is found by the
    # "already processed" check on the next run. The folder is the raw blob's
    # date; the file name carries today's date and the batch number.
    output_path = (
        f"gs://3_staging_area/1_xingtu/{table_name}/{item['date'].date()}/"
        f"{api_name}_{today.strftime('%y%m%d')}_{item['batch_number']}.parquet"
    )
    df.to_parquet(output_path)