In [18]:
from bs4 import BeautifulSoup
import urllib.request
import requests
from pprint import pprint
import pandas as pd
import json
import copy
from datetime import datetime, timezone
from google.cloud import storage
import gcsfs
import re

In [19]:
storage_client = storage.Client(project="ytone-430507")
today = datetime.now()
table_name = "douyin_fanstat"
api_name = "handler_post"

In [20]:
processing_blobs = [
{
    "blob": blob,
    "date": datetime.strptime(blob.name.split('/')[1], "%Y-%m-%d"),
    "batch_number": int(blob.name.split('/')[-1].replace(".json", "").split("_")[-1]),
} for blob in storage_client.list_blobs("0_raw_data",prefix="2_xingtu/") if api_name in blob.name]
processing_blobs

[{'blob': <Blob: 0_raw_data, 2_xingtu/2024-09-11/handler_post_0911_0.json, 1726037777115120>,
  'date': datetime.datetime(2024, 9, 11, 0, 0),
  'batch_number': 0},
 {'blob': <Blob: 0_raw_data, 2_xingtu/2024-09-12/handler_post_0912_0.json, 1726117857045627>,
  'date': datetime.datetime(2024, 9, 12, 0, 0),
  'batch_number': 0},
 {'blob': <Blob: 0_raw_data, 2_xingtu/2024-09-12/handler_post_0912_1.json, 1726117860404297>,
  'date': datetime.datetime(2024, 9, 12, 0, 0),
  'batch_number': 1},
 {'blob': <Blob: 0_raw_data, 2_xingtu/2024-09-12/handler_post_0912_10.json, 1726124693780255>,
  'date': datetime.datetime(2024, 9, 12, 0, 0),
  'batch_number': 10},
 {'blob': <Blob: 0_raw_data, 2_xingtu/2024-09-12/handler_post_0912_2.json, 1726117863832561>,
  'date': datetime.datetime(2024, 9, 12, 0, 0),
  'batch_number': 2},
 {'blob': <Blob: 0_raw_data, 2_xingtu/2024-09-12/handler_post_0912_3.json, 1726117867242406>,
  'date': datetime.datetime(2024, 9, 12, 0, 0),
  'batch_number': 3},
 {'blob': <Blo

In [21]:
processed_blobs = [{
    "blob": blob,
    "date": datetime.strptime(blob.name.split('/')[2], "%Y-%m-%d"),
    "batch_number": int(blob.name.split('/')[-1].replace(".parquet", "").split("_")[-1]),
} for blob in storage_client.list_blobs("3_staging_area",prefix="1_xingtu/") if table_name == blob.name.split("/")[1] and api_name in blob.name and len(blob.name.split("/")) == 4]
processed_blobs

[{'blob': <Blob: 3_staging_area, 1_xingtu/douyin_fanstat/2024-09-11/handler_post_241003_0.parquet, 1727927160719102>,
  'date': datetime.datetime(2024, 9, 11, 0, 0),
  'batch_number': 0},
 {'blob': <Blob: 3_staging_area, 1_xingtu/douyin_fanstat/2024-09-12/handler_post_241003_0.parquet, 1727928034491272>,
  'date': datetime.datetime(2024, 9, 12, 0, 0),
  'batch_number': 0},
 {'blob': <Blob: 3_staging_area, 1_xingtu/douyin_fanstat/2024-09-12/handler_post_241003_1.parquet, 1727928038509840>,
  'date': datetime.datetime(2024, 9, 12, 0, 0),
  'batch_number': 1},
 {'blob': <Blob: 3_staging_area, 1_xingtu/douyin_fanstat/2024-09-12/handler_post_241003_10.parquet, 1727928041861735>,
  'date': datetime.datetime(2024, 9, 12, 0, 0),
  'batch_number': 10},
 {'blob': <Blob: 3_staging_area, 1_xingtu/douyin_fanstat/2024-09-12/handler_post_241003_2.parquet, 1727928047873687>,
  'date': datetime.datetime(2024, 9, 12, 0, 0),
  'batch_number': 2},
 {'blob': <Blob: 3_staging_area, 1_xingtu/douyin_fanstat/2

In [22]:
to_process = []
for processing_blob in processing_blobs:
    if processing_blob["date"] >= datetime(2024, 10, 6):
        if processing_blob["batch_number"] not in [processed_blob["batch_number"] for processed_blob in processed_blobs if processing_blob["date"] == processed_blob["date"]] and processing_blob["scrape_date"] not in [processed_blob["scrape_date"] for processed_blob in processed_blobs if processing_blob["date"] == processed_blob["date"]]:
            to_process.append(processing_blob)
pprint(to_process)
print(len(to_process))

[{'batch_number': 0,
  'blob': <Blob: 0_raw_data, 2_xingtu/2024-10-06/handler_post_241002_0.json, 1728374750060460>,
  'date': datetime.datetime(2024, 10, 6, 0, 0)}]
1


In [7]:
def get_handler_post_data(data):
    result = []
    profile_id = data["profile_id"]
    handler_post = data.get("handler_post")
    if handler_post:
        fans_distribution = handler_post.get("fans_distribution")
        if fans_distribution:
            for distribution in fans_distribution:
                if "desc" in distribution:
                    texts = fans_distribution[distribution]
                    sum_values = 0
                    for text in texts:
                        label, value = text.split(": ")
                        record = {}
                        record["category"] = distribution.split("_")[1]
                        record["name"] = label
                        record["value"] = float(value)
                        record["influencer_id"] = profile_id
                        result.append(record)
                        sum_values += float(value)
                        print(label, value)
                    if sum_values != 100:
                        record = {}
                        record["category"] = distribution.split("_")[1]
                        record["name"] = "other"
                        record["value"] = 100 - sum_values
                        record["influencer_id"] = profile_id
                        result.append(record)
                        print("other", 100 - sum_values)
    return result

In [8]:
for item in to_process:
    handler_post_data = []
    raw_data = json.loads(item["blob"].download_as_string())
    for data in raw_data:
        result = get_handler_post_data(data)
        if result:
            handler_post_data.extend(result)
    df = pd.DataFrame(handler_post_data)
    df.to_parquet("gs://3_staging_area/1_xingtu/" + table_name + "/" + str(item["date"].date()) + "/" + api_name + "_" + today.strftime("%y%m%d") + "_" + str(item["batch_number"]) + ".parquet")

广东 12.48
江苏 7.33
山东 6.50
other 73.69
iPhone 37.97
华为 18.37
其他 9.62
other 34.03999999999999
31-40 41.37
24-30 32.92
18-23 12.10
other 13.610000000000014
男性 54.86
女性 45.14
随拍 55.89
亲子 12.25
美食 8.52
other 23.340000000000003
山东 9.13
河南 8.94
广东 7.24
other 74.69
华为 21.68
iPhone 19.51
vivo 13.17
other 45.64
31-40 41.64
50+ 25.27
41-50 24.56
other 8.530000000000001
女性 66.51
男性 33.49
随拍 67.49
亲子 11.00
社会时政 6.22
other 15.290000000000006
浙江 15.95
广东 11.58
江苏 9.47
other 63.0
iPhone 35.69
华为 16.69
其他 11.58
other 36.040000000000006
31-40 46.09
24-30 24.47
41-50 13.85
other 15.590000000000003
男性 78.63
女性 21.37
随拍 72.66
美食 6.46
社会时政 6.15
other 14.730000000000004
广东 16.14
河南 9.04
山东 5.78
other 69.03999999999999
oppo 18.58
vivo 18.37
华为 14.34
other 48.709999999999994
50+ 39.41
31-40 27.50
41-50 24.94
other 8.150000000000006
女性 52.69
男性 47.31
随拍 80.08
亲子 4.90
生活家居 4.50
other 10.519999999999996
广东 11.35
江苏 7.71
山东 7.29
other 73.65
iPhone 34.53
其他 16.03
华为 13.26
other 36.18
18-23 28.03
24-30 25.93
31-40 24