In [1]:
from bs4 import BeautifulSoup
import urllib.request
import requests
from pprint import pprint
import pandas as pd
import json
import copy
from datetime import datetime, timezone
from google.cloud import storage
import gcsfs
import re
from typing import List

In [2]:
# Name of the API whose dumps this notebook handles; blob names are filtered
# by this substring and it also appears in the output file names.
api_name = "douyin_user_post"

# Moment this run started (naive local time, since no tz is passed); later
# formatted as yymmdd into the names of the parquet files that get written.
today = datetime.now()

# Google Cloud Storage client bound to the project that owns the buckets.
storage_client = storage.Client(project="ytone-430507")

In [3]:
def _raw_blob_entry(blob):
    """Describe one raw JSON dump: the blob, its folder date and batch number.

    The date is the second path segment of the blob name and the batch number
    is the last underscore-separated token of the file name, e.g.
    ``3_douyin/2024-10-13/douyin_user_post_1014_0.json`` -> 2024-10-13, "0".
    """
    path_parts = blob.name.split('/')
    file_stem = path_parts[-1].replace(".json", "")
    return {
        "blob": blob,
        "date": datetime.strptime(path_parts[1], "%Y-%m-%d"),
        "batch_number": file_stem.split("_")[-1],
    }


# Every raw dump under 3_douyin/ whose name mentions the API being processed.
processing_blobs = [
    _raw_blob_entry(raw_blob)
    for raw_blob in storage_client.list_blobs("0_raw_data", prefix="3_douyin/")
    if api_name in raw_blob.name
]
processing_blobs

[{'blob': <Blob: 0_raw_data, 3_douyin/2024-10-13/douyin_user_post_1014_0.json, 1728841476150899>,
  'date': datetime.datetime(2024, 10, 13, 0, 0),
  'batch_number': '0'},
 {'blob': <Blob: 0_raw_data, 3_douyin/2024-10-13/douyin_user_post_1014_1.json, 1728841491073016>,
  'date': datetime.datetime(2024, 10, 13, 0, 0),
  'batch_number': '1'},
 {'blob': <Blob: 0_raw_data, 3_douyin/2024-10-13/douyin_user_post_1014_2.json, 1728841505683513>,
  'date': datetime.datetime(2024, 10, 13, 0, 0),
  'batch_number': '2'},
 {'blob': <Blob: 0_raw_data, 3_douyin/2024-10-13/douyin_user_post_1014_3.json, 1728841520263091>,
  'date': datetime.datetime(2024, 10, 13, 0, 0),
  'batch_number': '3'},
 {'blob': <Blob: 0_raw_data, 3_douyin/2024-10-13/douyin_user_post_1014_4.json, 1728841535667722>,
  'date': datetime.datetime(2024, 10, 13, 0, 0),
  'batch_number': '4'},
 {'blob': <Blob: 0_raw_data, 3_douyin/2024-10-13/douyin_user_post_1014_5.json, 1728841554833509>,
  'date': datetime.datetime(2024, 10, 13, 0, 0)

In [5]:
def _staged_blob_entry(blob):
    """Describe one staged parquet file: the blob, its folder date and batch number.

    The date is the third path segment of the blob name and the batch number
    is the last underscore-separated token of the file name, e.g.
    ``2_douyin/douyin_user_post/2024-10-13/douyin_user_post_241014_0.parquet``
    -> 2024-10-13, "0".
    """
    path_parts = blob.name.split('/')
    file_stem = path_parts[-1].replace(".parquet", "")
    return {
        "blob": blob,
        "date": datetime.strptime(path_parts[2], "%Y-%m-%d"),
        "batch_number": file_stem.split("_")[-1],
    }


# Every parquet file already written to the staging area for this API.
processed_blobs = [
    _staged_blob_entry(staged_blob)
    for staged_blob in storage_client.list_blobs("3_staging_area", prefix="2_douyin/")
    if api_name in staged_blob.name
]
processed_blobs

[{'blob': <Blob: 3_staging_area, 2_douyin/douyin_user_post/2024-10-13/douyin_user_post_241014_0.parquet, 1728893176080756>,
  'date': datetime.datetime(2024, 10, 13, 0, 0),
  'batch_number': '0'},
 {'blob': <Blob: 3_staging_area, 2_douyin/douyin_user_post/2024-10-13/douyin_user_post_241014_1.parquet, 1728893187747009>,
  'date': datetime.datetime(2024, 10, 13, 0, 0),
  'batch_number': '1'},
 {'blob': <Blob: 3_staging_area, 2_douyin/douyin_user_post/2024-10-13/douyin_user_post_241014_2.parquet, 1728893196673185>,
  'date': datetime.datetime(2024, 10, 13, 0, 0),
  'batch_number': '2'},
 {'blob': <Blob: 3_staging_area, 2_douyin/douyin_user_post/2024-10-13/douyin_user_post_241014_3.parquet, 1728893205850248>,
  'date': datetime.datetime(2024, 10, 13, 0, 0),
  'batch_number': '3'},
 {'blob': <Blob: 3_staging_area, 2_douyin/douyin_user_post/2024-10-13/douyin_user_post_241014_4.parquet, 1728893216586452>,
  'date': datetime.datetime(2024, 10, 13, 0, 0),
  'batch_number': '4'},
 {'blob': <Blob

In [6]:
# Raw dumps dated before this day are never considered for conversion.
MIN_PROCESS_DATE = datetime(2024, 9, 17)

# (date, batch_number) pairs that already have a staged parquet file. Built
# once so each raw dump is checked with a single set lookup instead of
# re-scanning processed_blobs for every candidate.
processed_keys = {
    (processed_blob["date"], processed_blob["batch_number"])
    for processed_blob in processed_blobs
}

# Keep the raw dumps that are recent enough and have no staged counterpart
# for the same date and batch number.
to_process = [
    processing_blob
    for processing_blob in processing_blobs
    if processing_blob["date"] >= MIN_PROCESS_DATE
    and (processing_blob["date"], processing_blob["batch_number"]) not in processed_keys
]
pprint(to_process)
print(len(to_process))

[]
0


In [40]:
def get_douyin_user_post_data(raw_data):
    """Flatten raw user-post payloads into one flat record per post.

    Args:
        raw_data: Iterable of dicts. Each dict may carry a "user_post_data"
            dict whose "aweme_list" holds the individual posts. Entries
            without either are skipped. ``None`` is treated as empty.

    Returns:
        A list of dicts, one per post. Always present: "aweme_id", "caption",
        "create_time", "author_user_id" and "note_interactions". Present only
        when the source has them: "play_addr" and "dynamic_cover" (the "uri"
        of the matching "video" entries), the four counters copied from
        "statistics", and "video_duration" (read from "music").

    Raises:
        KeyError: If a post lacks "aweme_id", "caption", "create_time" or
            "author_user_id", or if a present "statistics", "music",
            "play_addr" or "dynamic_cover" lacks the field read from it.
    """
    stat_fields = ("comment_count", "digg_count", "share_count", "collect_count")

    result = []
    for data in raw_data or []:
        user_post_data = data.get("user_post_data") or {}
        for aweme in user_post_data.get("aweme_list") or []:
            record = {}

            # Keys are inserted in a fixed order so the column order of a
            # DataFrame built from the records stays stable.
            video = aweme.get("video") or {}
            play_addr = video.get("play_addr")
            if play_addr:
                record["play_addr"] = play_addr["uri"]
            dynamic_cover = video.get("dynamic_cover")
            if dynamic_cover:
                record["dynamic_cover"] = dynamic_cover["uri"]

            record["aweme_id"] = aweme["aweme_id"]
            record["caption"] = aweme["caption"]

            statistics = aweme.get("statistics")
            if statistics:
                for field in stat_fields:
                    record[field] = statistics[field]

            record["create_time"] = aweme["create_time"]
            record["author_user_id"] = aweme["author_user_id"]

            music = aweme.get("music")
            if music:
                record["video_duration"] = music["video_duration"]

            # A post without statistics has no counter keys at all; count a
            # missing or None counter as 0 instead of failing on the lookup.
            record["note_interactions"] = sum(
                record.get(field) or 0 for field in stat_fields
            )
            result.append(record)
    return result

In [44]:
# Convert every pending raw dump into a parquet file in the staging area.
for item in to_process:
    # download_as_bytes replaces the deprecated download_as_string alias;
    # json.loads accepts the raw bytes directly.
    raw_data = json.loads(item["blob"].download_as_bytes())
    result = get_douyin_user_post_data(raw_data)
    df = pd.DataFrame(result)

    # Target: <staging prefix>/<dump date>/douyin_user_post_<run yymmdd>_<batch>.parquet
    output_path = (
        "gs://3_staging_area/2_douyin/douyin_user_post/"
        f"{item['date'].date()}"
        "/douyin_user_post_"
        f"{today.strftime('%y%m%d')}"
        f"_{item['batch_number']}.parquet"
    )
    df.to_parquet(output_path)