In [1]:
from bs4 import BeautifulSoup
import urllib.request
import requests
from pprint import pprint
import pandas as pd
import json
import copy
from datetime import datetime, timezone
from google.cloud import storage
import gcsfs
import re
from typing import List

In [2]:
# Google Cloud Storage client shared by the cells that list and download blobs.
storage_client = storage.Client()
# Bucket name and object-name prefix handed to `storage_client.list_blobs` by later cells.
bucket_name = "0_raw_data"
prefix = "2_xingtu/2024-09-08/"

In [38]:
def get_basic_data():
    """Collect author attribute records from every "basic" blob under the configured prefix.

    Lists the blobs of ``bucket_name`` under ``prefix``. For each blob whose name
    contains ``"basic"`` the name is printed, the blob is parsed as JSON and every
    ``page["authors"][i]["attribute_datas"]`` dict is flattened into one list.

    Returns:
        list[dict]: one dict per author, made of a ``"file_name"`` key (the source
        blob name) followed by the author's ``attribute_datas`` entries. A
        ``file_name`` entry inside ``attribute_datas`` would override the blob name.
    """
    rows = []
    for blob in storage_client.list_blobs(bucket_name, prefix=prefix):
        source_name = blob.name
        if "basic" not in source_name:
            continue
        print(source_name)
        pages = json.loads(blob.download_as_string())
        rows.extend(
            {"file_name": source_name, **author["attribute_datas"]}
            for page in pages
            for author in page["authors"]
        )
    return rows

# Materialise the flattened author records once; later cells read `basic_data`.
basic_data = get_basic_data()

2_xingtu/2024-09-08/xingtu_basic_240908_0.json
2_xingtu/2024-09-08/xingtu_basic_240908_1.json
2_xingtu/2024-09-08/xingtu_basic_240908_2.json
2_xingtu/2024-09-08/xingtu_basic_240908_3.json
2_xingtu/2024-09-08/xingtu_basic_240908_4.json
2_xingtu/2024-09-08/xingtu_basic_240908_5.json


In [39]:
# Sanity check: number of author records, then a pretty-printed peek at the first ten.
print(len(basic_data))
pprint(basic_data[:10])

37526
[{'author_avatar_frame_icon': '0',
  'author_status': '1',
  'author_type': '1',
  'avatar_uri': 'https://p26.douyinpic.com/aweme/1080x1080/aweme-avatar/tos-cn-i-0813_c8f2145620494cecab8d6b4790922b48.jpeg?from=4010531038',
  'brand_boost_vv': '1000000',
  'city': '成都',
  'core_user_id': '1706893960807544',
  'e_commerce_enable': '1',
  'ecom_video_product_num_30d': '0',
  'expected_cpa3_level': '1',
  'expected_natural_play_num': '903580',
  'expected_play_num': '1115542',
  'fans_increment_rate_within_15d': '0.0768',
  'fans_increment_within_15d': '10093',
  'file_name': '2_xingtu/2024-09-08/xingtu_basic_240908_0.json',
  'follower': '141669',
  'game_type': '',
  'gender': '2',
  'grade': '0',
  'id': '7135272451914596383',
  'interact_rate_within_30d': '0.006',
  'is_black_horse_author': 'false',
  'is_cocreate_author': 'false',
  'link_convert_index': '74.76',
  'link_convert_index_by_industry': '74.76',
  'link_i_cnt_by_industry': '11106196',
  'link_k_cnt_by_industry': '129

In [53]:
# Project every basic record onto the handful of fields used by the duplicate check.
preview_fields = ("id", "core_user_id", "nick_name", "city", "follower")
test_data = []
for data in basic_data:
    record = {field: data[field] for field in preview_fields}
    test_data.append(record)

In [54]:
# Drop exact duplicate rows, then keep only authors whose (id, core_user_id) pair still
# occurs more than once, i.e. the same author captured with differing field values.
df = (
    pd.DataFrame(test_data)
    .drop_duplicates()
    .loc[lambda frame: frame.duplicated(["id", "core_user_id"], keep=False)]
)
df.sort_values("id").tail(10)


Unnamed: 0,id,core_user_id,nick_name,city,follower
10768,7391702456586993691,3441927638224732,虾仁不是老六,,175488
308,7391702456586993691,3441927638224732,虾仁不是老六,,176431
10164,7391732058281738250,3243976018115580,山有虎,成都,156600
245,7391732058281738250,3243976018115580,山有虎,成都,158305
11361,7395975644032532517,3620069812476045,逗缺沙雕动画,,120009
482,7395975644032532517,3620069812476045,逗缺沙雕动画,,120637
2681,7406718859041374217,594161900201837,海子,,565498
12454,7406718859041374217,594161900201837,海子,,565332
491,7408452451734388762,2773400655768524,李有理评测,重庆,123878
11427,7408452451734388762,2773400655768524,李有理评测,重庆,122385


In [6]:
# One accumulator list (dict records) and one column-order list per output table.
# The `*_columns` lists are passed as `columns=` when the lists are turned into DataFrames;
# `influencer_columns` is additionally used by the extractor functions as the whitelist of
# payload keys that are copied verbatim.
influencer_data = []
influencer_columns = [
    'user_id', 'short_id', 'nickname', 'image_url', 'fans_count', 'city',
    'province', 'level', 'order_cnt', 'engage_rate', 'expected_cpm',
    'expected_play_num', 'sex', 'lowest_price', 'create_time',
    'modify_time', 'is_star', 'e_commerce_enable', 'updated', 'avg_play',
    'tags_ids_level_two', 'core_user_id', 'order_complete_rate',
    'unique_id', 'middle_play', 'order_avg_time_cost', 'order_complete_cnt',
    'total_favour_cnt', 'cooperate_index', 'cp_index', 'growth_index',
    'shopping_index', 'spread_index', 'top_score', 'deleted', 'tags_ids',
    'updated_hf'
]

# Hot comment tokens per influencer.
hotcomment_data = []
hotcomment_columns = ["rate", "content", "influencer_id"]

# Daily follower counts per influencer.
fantracker_data = []
fantracker_columns = ["date_key", "fans_count", "influencer_id"]

# Per-metric video statistics per influencer.
videostat_data = []
videostat_columns = ["name", "rate", "compare_avg", "compare_author", "influencer_id"]

# Engagement counters per video, keyed by the video's creation date.
videotracker_data = []
videotracker_columns = ["date_key", "comment", "like", "share", "play", "video_id"]

# Video metadata per influencer.
video_data = []
video_columns = ["video_id", "item_id", "title", "image_url", "comment", "like", "share", "play", "create_time", "duration", "influencer_id"]

# Price entries per influencer.
price_data = []
price_columns = ["video_type", "settlement_type", "is_open", "origin_price", "price", "has_discount", "influencer_id"]

# Audience distribution entries (category / name / value) per influencer.
fanstat_data = []
fanstat_columns = ["category", "name", "value", "influencer_id"]

In [153]:
def get_author_base_info(data):
    """Build the influencer fields taken from the ``get_author_base_info`` payload.

    Args:
        data: one detail record (dict); the payload is read from
            ``data["get_author_base_info"]``.

    Returns:
        dict: empty when the payload is missing or falsy. Otherwise every payload
        key that is also listed in ``influencer_columns`` is copied verbatim, then
        the renamed fields ``nickname``, ``image_url``, ``fans_count`` and ``sex``
        are added, and ``create_time`` / ``modify_time`` are stored as
        timezone-aware UTC datetimes.

    Raises:
        KeyError: if a present payload lacks one of the renamed or timestamp keys.
    """
    base_info = data.get("get_author_base_info")
    if not base_info:
        return {}

    record = {key: value for key, value in base_info.items() if key in influencer_columns}
    record["nickname"] = base_info["nick_name"]
    record["image_url"] = base_info["avatar_uri"]
    record["fans_count"] = base_info["follower"]
    record["sex"] = base_info["gender"]
    # Pass tz= so the epoch seconds are interpreted as UTC. fromtimestamp() without tz
    # returns local wall-clock time, and a following replace(tzinfo=utc) only relabels it,
    # shifting the instant by the machine's UTC offset.
    record["create_time"] = datetime.fromtimestamp(int(base_info["create_time"]), tz=timezone.utc)
    record["modify_time"] = datetime.fromtimestamp(int(base_info["modify_time"]), tz=timezone.utc)
    return record

In [61]:
def get_authors_ranking_in(data, profile_id):
    """Pick the influencer fields out of the author's first ranking entry.

    Args:
        data: one detail record (dict); the payload is read from
            ``data["get_authors_ranking_in"]["author_rank_infos"]``, a mapping from
            author id to a list of rank entries.
        profile_id: key used to look the author up in ``author_rank_infos``.

    Returns:
        dict: the entries of the author's first rank entry whose keys are listed in
        ``influencer_columns``; empty when the payload, the mapping, the author's key
        or the author's list is missing or empty.
    """
    payload = data.get("get_authors_ranking_in")
    if not payload:
        return {}
    rank_infos = payload.get("author_rank_infos")
    if not rank_infos:
        return {}
    # An author that is absent from the mapping, or mapped to an empty list, yields no
    # fields instead of raising KeyError/IndexError and aborting the caller's batch loop.
    entries = rank_infos.get(profile_id)
    if not entries:
        return {}
    first_entry = entries[0]
    return {key: value for key, value in first_entry.items() if key in influencer_columns}

In [62]:
def get_author_link_info(data):
    """Extract the four index ranks from the ``get_author_link_info`` payload.

    Args:
        data: one detail record (dict); the payload is read from
            ``data["get_author_link_info"]``.

    Returns:
        dict: empty when the payload is missing or falsy; otherwise the ``"rank"``
        value of each index under the keys ``cooperate_index``, ``cp_index``,
        ``shopping_index`` and ``spread_index`` (in that order).

    Raises:
        KeyError: if a present payload lacks one of the four indexes or its ``rank``.
    """
    link_info = data.get("get_author_link_info")
    if not link_info:
        return {}
    # (output field, payload key); the last two payload keys carry a "link_" prefix.
    rank_sources = (
        ("cooperate_index", "cooperate_index"),
        ("cp_index", "cp_index"),
        ("shopping_index", "link_shopping_index"),
        ("spread_index", "link_spread_index"),
    )
    return {field: link_info[source]["rank"] for field, source in rank_sources}

In [63]:
def get_author_hot_comment_tokens(data, profile_id) -> List:
    """Turn the author's hot comment tokens into hotcomment records.

    Args:
        data: one detail record (dict); the payload is read from
            ``data["get_author_hot_comment_tokens"]["hot_comment_tokens"]``.
        profile_id: stored on every record as ``influencer_id``.

    Returns:
        list[dict]: one ``{"rate", "content", "influencer_id"}`` dict per token.
        Empty when the payload or its token list is missing or empty, matching the
        sibling extractors, which also return an empty result for an absent payload.

    Raises:
        KeyError: if a token lacks ``hot_rate`` or ``comment_token``.
    """
    payload = data.get("get_author_hot_comment_tokens")
    if not payload:
        return []
    tokens = payload.get("hot_comment_tokens") or []
    return [
        {
            "rate": token["hot_rate"],
            "content": token["comment_token"],
            "influencer_id": profile_id,
        }
        for token in tokens
    ]

In [None]:
# Load a locally saved audience-distribution dump into `raw_data` and show its first record.
# NOTE(review): absolute, machine-specific path — the cell only runs where this file exists;
# consider a configurable data directory.
with open("/Users/lenamlinhtugiam/Documents/Python Data/result/author_audience_distribution_0913_list300.json", "r") as f:
    raw_data = json.load(f)
pprint(raw_data[0])

In [2]:
# Maps the `type_display` labels of the audience-distribution payload to the short
# category names stored on the fanstat records.
category_mapping = {
    "省份分布": "province",  # province distribution
    "年龄分布": "age",  # age distribution
    "设备品牌分布": "device",  # device brand distribution
    "性别分布": "gender",  # gender distribution
}

In [None]:
# Flatten each profile's audience distributions into fanstat records, keeping only the
# distribution types listed in `category_mapping`. The raw payload and its distribution
# list are printed for every profile that has one.
for data in raw_data:
    profile_id = data["profile_id"]
    author_audience_distribution = data.get("author_audience_distribution")
    if not author_audience_distribution:
        continue
    print(author_audience_distribution)
    distributions = author_audience_distribution["distributions"]
    pprint(distributions)
    for distribution in distributions:
        type_display = distribution["type_display"]
        if type_display not in category_mapping:
            continue
        fanstat_data.extend(
            {
                "category": category_mapping[type_display],
                "name": item["distribution_key"],
                "value": item["distribution_value"],
                "influencer_id": profile_id,
            }
            for item in distribution["distribution_list"]
        )

In [None]:
# Write the fanstat records collected so far to the dated transform folder on GCS.
# `today` is assigned here because the cells that otherwise define it sit further down the
# notebook, so a top-to-bottom run would raise NameError on this line.
today = datetime.now().date()
pd.DataFrame(fanstat_data, columns=fanstat_columns).to_parquet(
    f"gs://1_transform_data/2_xingtu/{today}/douyin_fanstat/douyin_fanstat_{today:%y%m%d}_0.parquet"
)

In [8]:
# Categories recognised inside the keys of a `fans_distribution` payload; the first one
# (in this order) that occurs as a substring of a key decides the record's category.
category_list = ["province", "age", "device", "gender", "interest"]

for data in raw_data:
    profile_id = data["profile_id"]
    fans_distribution = data.get("fans_distribution")
    if not fans_distribution:
        continue
    for key, distributions in fans_distribution.items():
        # Keys containing "desc" are skipped.
        if "desc" in key:
            continue
        category = next((name for name in category_list if name in key), None)
        if category is None:
            continue
        for distribution in distributions:
            record = {
                "category": category,
                "name": distribution["distribution_key"],
                "value": distribution["distribution_value"],
                "influencer_id": profile_id,
            }
            print(record)
            fanstat_data.append(record)

In [None]:
# Run date; used to build the dated output paths of the parquet writes.
today = datetime.now().date()
today

In [15]:
# Persist the accumulated fanstat records as one parquet file under today's folder.
fanstat_df = pd.DataFrame(fanstat_data, columns=fanstat_columns)
fanstat_path = f"gs://1_transform_data/2_xingtu/{today}/douyin_fanstat/douyin_fanstat_{today:%y%m%d}_0.parquet"
fanstat_df.to_parquet(fanstat_path)

In [191]:
# Transform every "detail" blob into the flat record lists declared earlier
# (influencer, hotcomment, fantracker, videostat, videotracker, video, price).
#
# NOTE(review): the record lists are module-level and are never cleared between blobs, so
# with the (currently commented-out) per-batch writes re-enabled, each batch file would
# also contain every earlier batch — confirm whether that is intended.

# Index the basic records by author id once instead of rescanning all of `basic_data` for
# every profile. Later duplicates overwrite earlier ones, i.e. the last match still wins.
basic_by_id = {basic["id"]: basic for basic in basic_data}

batch_count = 0
for blob in storage_client.list_blobs(bucket_name, prefix=prefix):
    if "detail" not in blob.name:
        continue
    raw_data = json.loads(blob.download_as_string())
    for data in raw_data:
        profile_id = data.get("profile_id")
        if not profile_id:
            continue

        ### Douyin Influencer
        record = {"user_id": profile_id}
        ## get_author_base_info
        record.update(get_author_base_info(data))
        ## get_authors_ranking_in
        record.update(get_authors_ranking_in(data, profile_id))
        ## get_author_link_info
        record.update(get_author_link_info(data))
        ## search_for_author_square (Basic)
        basic = basic_by_id.get(profile_id)
        if basic is not None:
            record["expected_play_num"] = basic["expected_play_num"]
        influencer_data.append(record)

        ### Douyin Hotcomment
        hotcomment_data.extend(get_author_hot_comment_tokens(data, profile_id))

        ### Douyin Fantracker
        daily_fans = data["get_author_daily_fans"]["daily"]
        for daily_fan in daily_fans:
            fantracker_data.append({
                "date_key": daily_fan["date"],
                "fans_count": daily_fan["fans_cnt"],
                "influencer_id": profile_id,
            })

        ### Douyin Videostat
        show_items = data["get_author_show_items_v2"]
        data_description = show_items["data_description"]
        for description, stats in data_description.items():
            # Metrics whose entry is empty/None carry no figures and are skipped.
            if not stats:
                continue
            videostat_data.append({
                "name": description,
                "rate": stats["rate"],
                "compare_avg": stats["compare_avg"],
                "compare_author": stats["compare_author"],
                "influencer_id": profile_id,
            })

        ### Douyin Videotracker + Douyin Video (both are built from the same item list)
        items = show_items["latest_star_item_info"] + show_items["latest_item_info"]
        for item in items:
            # tz= interprets the epoch seconds as UTC; fromtimestamp() without it returns
            # local time, which replace(tzinfo=utc) would merely relabel.
            created_at = datetime.fromtimestamp(item["create_time"], tz=timezone.utc)
            videotracker_data.append({
                "date_key": created_at.date(),
                "comment": item["comment"],
                "like": item["like"],
                "share": item["share"],
                "play": item["play"],
                "video_id": item["video_id"],
            })
            video_data.append({
                "video_id": item["video_id"],
                "item_id": item["item_id"],
                "title": item["title"],
                "image_url": item["item_cover"],
                "comment": item["comment"],
                "like": item["like"],
                "share": item["share"],
                "play": item["play"],
                "create_time": created_at.strftime('%Y-%m-%d %H:%M:%S%z'),
                "duration": item["duration"],
                "influencer_id": profile_id,
            })

        ### Douyin Price
        price_info = data["get_author_marketing_info"]["price_info"]
        for price in price_info:
            record = {
                "video_type": price["video_type"],
                "settlement_type": price["settlement_type"],
                "is_open": price["is_open"],
            }
            # origin_price / price are optional on the source entries.
            if "origin_price" in price:
                record["origin_price"] = price["origin_price"]
            if "price" in price:
                record["price"] = price["price"]
            record["has_discount"] = price["has_discount"]
            record["influencer_id"] = profile_id
            price_data.append(record)

    ### Calculated Metrics (once per blob, over everything accumulated so far; running
    ### this for every profile recomputed the same values over and over)
    ## Douyin Influencer: expected_cpm = video_type 2 price per 1000 average plays.
    video_type_2_price = [price for price in price_data if price["video_type"] == 2]
    # Last priced video_type 2 entry per influencer; entries without "price" are ignored
    # instead of raising KeyError.
    type_2_price_by_influencer = {
        price["influencer_id"]: price["price"]
        for price in video_type_2_price
        if "price" in price
    }
    for influencer in influencer_data:
        unit_price = type_2_price_by_influencer.get(influencer["user_id"])
        avg_play = influencer.get("avg_play")
        # Influencers without a type 2 price or without avg_play keep expected_cpm unset.
        if unit_price is None or avg_play is None:
            continue
        influencer["expected_cpm"] = 0 if avg_play == 0 else (unit_price * 1000) / avg_play

    # data fixing
    influencer_df = pd.DataFrame(influencer_data, columns=influencer_columns)
    # regex=False states the literal intent explicitly, independent of the pandas
    # version's default for `regex`.
    for tag_column in ("tags_ids_level_two", "tags_ids"):
        influencer_df[tag_column] = (
            influencer_df[tag_column]
            .str.replace("[", "{", regex=False)
            .str.replace("]", "}", regex=False)
        )
    influencer_df["deleted"] = False
    influencer_df["updated"] = datetime.now()

    today = datetime.now().date()
    # influencer_df.to_parquet("gs://1_transform_data/2_xingtu/" + str(today) + "/douyin_influencer/douyin_influencer_" + today.strftime("%y%m%d") + "_" + str(batch_count) + ".parquet")
    # pd.DataFrame(hotcomment_data, columns=hotcomment_columns).to_parquet("gs://1_transform_data/2_xingtu/" + str(today) + "/douyin_hotcomment/douyin_hotcomment_" + today.strftime("%y%m%d") + "_" + str(batch_count) + ".parquet")
    # pd.DataFrame(fantracker_data, columns=fantracker_columns).to_parquet("gs://1_transform_data/2_xingtu/" + str(today) + "/douyin_fantracker/douyin_fantracker_" + today.strftime("%y%m%d") + "_" + str(batch_count) + ".parquet")
    # pd.DataFrame(videostat_data, columns=videostat_columns).to_parquet("gs://1_transform_data/2_xingtu/" + str(today) + "/douyin_videostat/douyin_videostat_" + today.strftime("%y%m%d") + "_" + str(batch_count) + ".parquet")
    # pd.DataFrame(videotracker_data, columns=videotracker_columns).to_parquet("gs://1_transform_data/2_xingtu/" + str(today) + "/douyin_videotracker/douyin_videotracker_" + today.strftime("%y%m%d") + "_" + str(batch_count) + ".parquet")
    # pd.DataFrame(video_data, columns=video_columns).to_parquet("gs://1_transform_data/2_xingtu/" + str(today) + "/douyin_video/douyin_video_" + today.strftime("%y%m%d") + "_" + str(batch_count) + ".parquet")
    # pd.DataFrame(price_data, columns=price_columns).to_parquet("gs://1_transform_data/2_xingtu/" + str(today) + "/douyin_price/douyin_price_" + today.strftime("%y%m%d") + "_" + str(batch_count) + ".parquet")
    batch_count += 1

In [None]:
# Inspect the accumulated hot-comment records as a table.
pd.DataFrame(hotcomment_data, columns=hotcomment_columns)

In [None]:
# Re-download the "detail" blobs; `raw_data` is overwritten on every iteration, so it ends
# up holding only the last blob parsed.
for blob in storage_client.list_blobs(bucket_name, prefix=prefix):
    if "detail" not in blob.name:
        continue
    raw_data = json.loads(blob.download_as_string())

In [None]:
# Show the first record of the most recently loaded `raw_data`.
print(raw_data[0])

In [None]:
# Price records for video_type 2 only; these feed the expected_cpm calculation.
video_type_2_price = [entry for entry in price_data if entry["video_type"] == 2]
video_type_2_price

In [371]:
# expected_cpm = video_type 2 price per 1000 average plays, written onto every influencer
# record whose user_id matches the price's influencer_id.
for price in video_type_2_price:
    # Price records are built with "price" only when the source entry had one; an
    # unpriced record cannot produce a CPM and used to raise KeyError here.
    if "price" not in price:
        continue
    for influencer in influencer_data:
        if price["influencer_id"] != influencer["user_id"]:
            continue
        # Influencer records without base info have no avg_play; leave expected_cpm unset.
        avg_play = influencer.get("avg_play")
        if avg_play is None:
            continue
        expected_cpm = 0 if avg_play == 0 else (price["price"] * 1000) / avg_play
        influencer["expected_cpm"] = expected_cpm

In [None]:
# Display the full list of influencer records (can be very large).
influencer_data

In [331]:
# Sample inputs for the upsert-statement scratch cells that follow: all columns, the
# target table, the conflict key, and the columns that must not be overwritten on conflict.
cols = ['col1', 'col2', 'col3', 'col4']
table_name = "my_table"
unique_key = ['col1']
cols_not_for_update = ['col2']

In [None]:
# Double-quote every column name and assemble the INSERT statement; the trailing "%s" is
# kept in the text as the VALUES placeholder.
# NOTE: `cols` is rebound to the quoted names, so re-running this cell quotes them again.
cols = ['"{}"'.format(col) for col in cols]
cols_str = ', '.join(cols)
insert_query = f" INSERT INTO {table_name} ({cols_str}) VALUES %s "

insert_query

In [None]:
# Columns excluded from the DO UPDATE list: the caller-supplied ones (if any) followed by
# the unique-key columns, all double-quoted.
protected_cols = (
    list(unique_key)
    if cols_not_for_update is None
    else [*cols_not_for_update, *unique_key]
)
cols_not_for_update = ['"{}"'.format(col) for col in protected_cols]
cols_not_for_update

In [None]:
# Build the "ON CONFLICT ..." tail of the upsert statement.
# The quoted key list gets its own name so that `unique_key` keeps the raw column names
# and re-running this cell does not wrap them in a second pair of quotes.
quoted_unique_key = [f'"{col}"' for col in unique_key]
unique_key_str = ', '.join(quoted_unique_key)

# `cols` and `cols_not_for_update` are expected to hold already-quoted names here.
update_cols = [col for col in cols if col not in cols_not_for_update]
update_cols_str = ', '.join(update_cols)
update_cols_with_excluded_markers = [f'EXCLUDED.{col}' for col in update_cols]
update_cols_with_excluded_markers_str = ', '.join(
    update_cols_with_excluded_markers
)

if not update_cols:
    # Nothing left to update: "DO UPDATE SET  = " would be invalid SQL.
    on_conflict_clause = f" ON CONFLICT ({unique_key_str}) DO NOTHING ;"
elif len(update_cols) > 1:
    # Several columns use the row-value form: (a, b) = (EXCLUDED.a, EXCLUDED.b).
    on_conflict_clause = (
        f" ON CONFLICT ({unique_key_str}) DO UPDATE SET "
        f"({update_cols_str}) = ({update_cols_with_excluded_markers_str}) ;"
    )
else:
    on_conflict_clause = (
        f" ON CONFLICT ({unique_key_str}) DO UPDATE SET "
        f"{update_cols_str} = {update_cols_with_excluded_markers_str} ;"
    )
on_conflict_clause

In [None]:
# Final statement: the plain INSERT when there is no unique key to conflict on, otherwise
# the INSERT followed by the ON CONFLICT clause. A bare expression inside an `if` body is
# evaluated and discarded, so the branch has to be part of the displayed expression.
insert_query if len(unique_key) == 0 else insert_query + on_conflict_clause

In [None]:
# Inspect the accumulated price records as a table.
pd.DataFrame(price_data, columns=price_columns)

In [175]:
# Rebuild the videostat record for each entry of `data_description`. Nothing is appended:
# the record only lives in `record`, so after the loop it reflects the last entry.
for description, stats in data_description.items():
    record = {
        "name": description,
        "rate": stats["rate"],
        "compare_avg": stats["compare_avg"],
        "compare_author": stats["compare_author"],
    }

In [None]:
# Inspect the show-items payload of whichever record `data` currently refers to.
data["get_author_show_items_v2"]

In [None]:
# Inspect the accumulated daily follower records as a table.
pd.DataFrame(fantracker_data, columns=fantracker_columns)

In [None]:
# Print the dated GCS folder that the influencer output would be written to.
print("gs://1_transform_data/2_xingtu/" + str(datetime.now().date()) + "/douyin_influencer")

In [None]:
# NOTE(review): at module level `attribute_datas` is only assigned by the report-building
# loop further down (the one inside get_basic_data is local), so on a fresh top-to-bottom
# run this cell raises NameError — confirm what it was meant to measure.
len(attribute_datas)

In [None]:
# NOTE(review): `author_rank` is not assigned anywhere in the visible notebook; this cell
# presumably depends on a deleted cell or leftover kernel state — confirm or remove.
author_rank

In [31]:
# Endpoint, headers and JSON body for the author-search request sent by the next cell.
url = 'https://www.xingtu.cn/gw/api/gsearch/search_for_author_square'
# NOTE(review): the CSRF tokens below are hard-coded session values; consider loading them
# from the environment, and confirm whether the fixed "Content-Length" / "Host" headers
# are needed at all.
headers = {
    "Accept": "application/json, text/plain, */*",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "en-GB,en-US;q=0.9,en;q=0.8",
    "Agw-Js-Conv": "str",
    "Connection": "keep-alive",
    "Content-Length": "368",
    "Content-Type": "application/json",
    "Host": "www.xingtu.cn",
    "Origin": "https://www.xingtu.cn",
    "Referer": "https://www.xingtu.cn/ad/creator/market",
    "Sec-Fetch-Dest": "empty",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "same-origin",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.5 Safari/605.1.15",
    "X-CSRFToken": "zol2LPRTWRc3HJZYdpf9oLxPAphkJiad",
    "x-login-source": "1",
    "x-secsdk-csrf-token": "0001000000011b9a319287438e55d17219ac90e115771a0198ff342441808a598fef59034b0117eae3e24f59abeb",
}

# Request body: first page of 20 results, sorted on the "score" field, with one attribute
# filter on "price_by_video_type__ge".
# NOTE(review): the key "seach_type" is spelled this way in the source — presumably it
# mirrors the API's own field name; verify before "correcting" it.
json_data = {
    "scene_param": {
        "platform_source":1,"search_scene":1,"display_scene":1,"marketing_target":1,"task_category":1,"first_industry_id":0
    },
    "search_param": {"seach_type":2},
    "sort_param": {"sort_type":2,"sort_field":{"field_name":"score"}},
    "page_param": {"page":1,"limit":20},
    "attribute_filter": [{
        "field":{
            "field_name":"price_by_video_type__ge",
            "rel_id":"2"
        },
        "field_value":"0"
    }]
}

In [None]:
# Send the author search and keep the decoded JSON response in `data`.
# NOTE(review): `cookies` is not defined anywhere in the visible notebook — it must come
# from a deleted cell or leftover kernel state; confirm where it is supposed to be set.
r = requests.post(url, json=json_data, headers=headers, cookies=cookies)
data = r.json()
data

In [None]:
# One-off probe of the base-info endpoint for the first author of the search response.
author_id = data["authors"][0]["attribute_datas"]["id"]
# The id is interpolated (not concatenated) so that a non-string id cannot raise
# TypeError, in line with the other request cells, which wrap it in str().
url = (
    'https://www.xingtu.cn/gw/api/author/get_author_base_info'
    f'?o_author_id={author_id}&platform_source=1&platform_channel=1'
    '&recommend=true&search_session_id=&need_sec_uid=true'
)
r = requests.get(url, cookies=cookies)
print(r)
pprint(r.json())

In [16]:
# Merge the JSON of the most recent response `r` into the first author's attribute dict.
data["authors"][0]["attribute_datas"].update(r.json())

In [33]:
# Enrich every author of the search response with their base-info payload.
for author in data["authors"]:
    author_id = author["attribute_datas"]["id"]
    url = (
        'https://www.xingtu.cn/gw/api/author/get_author_base_info'
        f'?o_author_id={author_id}&platform_source=1&platform_channel=1'
        '&recommend=true&search_session_id=&need_sec_uid=true'
    )
    try:
        # requests waits indefinitely by default, so a single stalled call would hang the
        # whole loop; 30 s is an arbitrary upper bound.
        r = requests.get(url, cookies=cookies, timeout=30)
        author["attribute_datas"].update(r.json())
    except Exception as e:
        # Best-effort enrichment: report which author failed and carry on with the rest.
        print(f"get_author_base_info failed for author {author_id}: {e}")

In [None]:
# Display the search response after the base-info enrichment.
data

In [None]:
# One-off probe of the link-info endpoint for the first author of the search response.
author_id = data["authors"][0]["attribute_datas"]["id"]
url = (
    'https://www.xingtu.cn/gw/api/data_sp/get_author_link_info'
    f'?o_author_id={author_id}&platform_source=1&platform_channel=1&industy_tag=0'
)
r = requests.get(url, cookies=cookies)
print(r)
pprint(r.json())

In [35]:
# Enrich every author of the search response with their link-info payload.
for author in data["authors"]:
    author_id = author["attribute_datas"]["id"]
    url = (
        'https://www.xingtu.cn/gw/api/data_sp/get_author_link_info'
        f'?o_author_id={author_id}&platform_source=1&platform_channel=1&industy_tag=0'
    )
    try:
        # requests waits indefinitely by default, so a single stalled call would hang the
        # whole loop; 30 s is an arbitrary upper bound.
        r = requests.get(url, cookies=cookies, timeout=30)
        author["attribute_datas"].update(r.json())
    except Exception as e:
        # Best-effort enrichment: report which author failed and carry on with the rest.
        print(f"get_author_link_info failed for author {author_id}: {e}")

In [None]:
# NOTE(review): looks like an unfinished lookup — unless the response has an empty-string
# key this presumably raises KeyError; confirm the intended key or remove the cell.
data[""]

In [None]:
# One-off probe of the ranking endpoint for the third author of the search response.
author_id = data["authors"][2]["attribute_datas"]["id"]
url = (
    'https://www.xingtu.cn/gw/api/gsearch/get_authors_ranking_in'
    f'?author_ids={author_id}&platform_channel=1&biz_scene=douyin_video_author_ranks'
)
r = requests.get(url, cookies=cookies)
print(r)
pprint(r.json())

In [None]:
# Same ranking probe, for whichever author the module-level `attribute_datas` refers to.
# NOTE(review): `attribute_datas` is only assigned at module level by a cell further down,
# so this depends on execution order — confirm.
author_id = attribute_datas["id"]
url = (
    'https://www.xingtu.cn/gw/api/gsearch/get_authors_ranking_in'
    f'?author_ids={author_id}&platform_channel=1&biz_scene=douyin_video_author_ranks'
)
r = requests.get(url, cookies=cookies)
print(r)
pprint(r.json())

In [None]:
# Display the attribute dict currently bound to `attribute_datas`.
attribute_datas

In [37]:
# Enrich every author of the search response with their ranking payload.
for author in data["authors"]:
    author_id = author["attribute_datas"]["id"]
    url = (
        'https://www.xingtu.cn/gw/api/gsearch/get_authors_ranking_in'
        f'?author_ids={author_id}&platform_channel=1&biz_scene=douyin_video_author_ranks'
    )
    try:
        # requests waits indefinitely by default, so a single stalled call would hang the
        # whole loop; 30 s is an arbitrary upper bound.
        r = requests.get(url, cookies=cookies, timeout=30)
        author["attribute_datas"].update(r.json())
    except Exception as e:
        # Best-effort enrichment: report which author failed and carry on with the rest.
        print(f"get_authors_ranking_in failed for author {author_id}: {e}")

In [None]:
# Display the search response after the ranking enrichment.
data

In [None]:
# Pretty-print the merged rank info of every author in the search response.
for author in data["authors"]:
    pprint(author["attribute_datas"]["author_rank_infos"])

In [118]:
# Copy the first author's rank level out of the nested rank info (keyed by the author id
# as a string) onto a top-level "level" attribute.
author = data["authors"][0]
author_id = author["attribute_datas"]["id"]
author["attribute_datas"]["level"] = author["attribute_datas"]["author_rank_infos"][str(author_id)][0]["level"]

In [17]:
# Keep a handle on the first author of the search response for ad-hoc inspection.
author = data["authors"][0]

In [53]:
# Build one flat report row per author of the enriched search response. The key order of
# each row follows the influencer table layout.
reports = []


def _epoch_to_utc_text(epoch_seconds):
    """Format epoch seconds as a 'YYYY-MM-DD HH:MM:SS+0000' UTC timestamp string."""
    # tz= interprets the epoch as UTC; fromtimestamp() without it returns local time,
    # which replace(tzinfo=utc) would merely relabel.
    return datetime.fromtimestamp(epoch_seconds, tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S%z')


for author in data["authors"]:
    attribute_datas = author["attribute_datas"]
    user_id = attribute_datas["id"]

    # The rank info is merged in by a best-effort request whose failures are only printed,
    # so the key may be absent. "level" is always emitted (None when unknown) so that
    # every row has the same keys in the same order.
    rank_infos = attribute_datas.get("author_rank_infos")
    level = rank_infos[str(user_id)][0]["level"] if rank_infos else None

    report = {
        "user_id": user_id,
        "short_id": attribute_datas["short_id"],
        "nickname": attribute_datas["nick_name"],
        "image_url": attribute_datas["avatar_uri"],
        "fans_count": attribute_datas["follower"],
        "city": attribute_datas["city"],
        "province": attribute_datas["province"],
        "level": level,
        "order_cnt": None,
        "engage_rate": None,
        "expected_cpm": None,
        "expected_play_num": attribute_datas["expected_play_num"],
        "sex": attribute_datas["gender"],
        "lowest_price": attribute_datas["lowest_price"],
        "create_time": _epoch_to_utc_text(attribute_datas["create_time"]),
        "modify_time": _epoch_to_utc_text(attribute_datas["modify_time"]),
        "is_star": attribute_datas["is_star"],
        "e_commerce_enable": attribute_datas["e_commerce_enable"],
        "updated": pd.NA,
        "avg_play": attribute_datas["avg_play"],
        "tags_ids_level_two": attribute_datas["tags_ids_level_two"],
        "core_user_id": attribute_datas["core_user_id"],
        "order_complete_rate": pd.NA,
        "unique_id": attribute_datas["unique_id"],
        "middle_play": pd.NA,
        "order_avg_time_cost": pd.NA,
        "order_complete_cnt": pd.NA,
        "total_favour_cnt": pd.NA,
        "cooperate_index": attribute_datas["cooperate_index"]["rank"],
        "cp_index": attribute_datas["cp_index"]["rank"],
        "growth_index": pd.NA,
        "shopping_index": attribute_datas["link_shopping_index"]["rank"],
        "spread_index": attribute_datas["link_spread_index"]["rank"],
        "top_score": pd.NA,
        "deleted": pd.NA,
        "tags_ids": pd.NA,
        "updated_hf": pd.NA,
    }
    reports.append(report)

In [None]:
# Turn the report rows into a DataFrame (rebinding `df`, which an earlier cell used for the
# duplicate check) and display it.
df = pd.DataFrame(reports)
df


In [55]:
# Save the report table as a parquet file in the current working directory.
df.to_parquet("Douyin Findy first 20 KOL.parquet")

In [None]:
# Read the saved report table back to check the round trip.
df = pd.read_parquet("Douyin Findy first 20 KOL.parquet")
df