In [1]:

import os
from pyspark.sql import Row, SparkSession
from typing import Dict, Any
import json
import uuid
from functools import partial
from typing import Iterable, Dict, Any
# create spark session
from pyspark.sql import Row, DataFrame
from loguru import logger
from xinghe.spark import *
from app.common.json_util import *

from xinghe.s3 import *


import uuid
import traceback
from datetime import datetime
from llm_web_kit.input.datajson import DataJson
from func_timeout import FunctionTimedOut, func_timeout

#| track_id                  | uuid                           | 全局唯一的ID                                                                                                        | 是              |
#| dataset_name              | str                            | 数据集的名字（全局唯一），这个名字是管理员输入的，然后做索引的时候带到index里来                                     | 是              |
#| data_source_category      | str                            | 这一行数据代表的是HTML，PDF，EBOOK,CC,labCC类型                                                                     | 是，此处是 HTML |
#| html                      | 字符串                         | 以UTF-8为编码的HTML文件全文                                                                                         | 是              |
#| url                       | 字符串                         | 这个文件的来源网址                                                                                                  | 是              |
#| file_bytes                | 整数                           | 文件的size, 单位是byte                                                                                              | 是              |
#| meta_info                 | 字典                           | 存放关于文件的元信息:如果能从文件里获取到作者，制作日期等信息。或者数据本身就带有一些其他的信息都放入到这个字段里。 | 是              |
#| meta_info->input_datetime | 其格式为 `yyyy-mm-dd HH:MM:SS` | 生成这个json索引文件这一条数据的时间，可以不用那么精确                                                              | 是              |


def process_platform_data(spark: SparkSession, config, platform: str, input_paths: list, version: str = "001"):
    """Run the read -> transform -> extract -> write pipeline for one platform.

    The SparkSession is received as an argument rather than being owned here.

    Args:
        spark: Active SparkSession used for reading and for broadcasting.
        config: Job configuration forwarded to ``read_any_path`` and
            ``write_any_path``.
        platform: Key looked up in the module-level ``platform_configs`` dict.
        input_paths: Input locations; joined with "," before being read.
        version: Kept for interface compatibility; it is not read inside this
            function (the output location comes from ``output_template``).

    Returns:
        None. Results are written to ``platform_config["output_template"]``.
        When the input contains no rows the function returns early without
        writing anything.

    Raises:
        ValueError: If ``platform`` has no entry in ``platform_configs``.
    """
    platform_config = platform_configs.get(platform)
    if not platform_config:
        raise ValueError(f"Unsupported platform: {platform}")

    # Driver side: load the raw input.
    input_df = read_any_path(spark, ",".join(input_paths), config)
    print(f"读取数据结束")
    # Settings needed on the workers.
    worker_config = {
        "field_mappings": platform_config.get("field_mappings"),
        "extractor_config": platform_config.get("extractor_config")
    }
    # Broadcast the two settings separately.
    broadcast_field_mappings = spark.sparkContext.broadcast(worker_config["field_mappings"])
    broadcast_extractor_config_path = spark.sparkContext.broadcast(worker_config["extractor_config"])

    input_rdd = None
    try:
        print("原始数据json结构")
        input_rdd = input_df.rdd.map(
            lambda x: Row(**{**json.loads(x.value), "filename": x.filename})
        ).cache()
        # take(1) returns an empty list for empty input; indexing it blindly
        # would raise IndexError.
        sample = input_rdd.take(1)
        if not sample:
            print("输入数据为空，跳过处理")
            return
        print(sample[0].asDict().keys())

        # Field normalisation (uses only the field mappings).
        transformed_rdd = input_rdd.map(
            lambda row: transform_row(row, broadcast_field_mappings.value)
        )
        print("formatter数据结束")

        # Content extraction (uses only the extractor config path).
        processed_rdd = transformed_rdd.mapPartitions(
            lambda x: extract_data(
                x, broadcast_extractor_config_path=broadcast_extractor_config_path.value
            )
        )
        print("extractor数据结束")
        print("写入数据中")

        # Serialise every row to a single JSON string column and write it out.
        write_any_path(
            processed_rdd.map(lambda x: Row(value=json.dumps(x.asDict()))).toDF(),
            platform_config["output_template"],
            config,
        )
        print("写入数据结束")
    finally:
        # Release the cached RDD and the broadcast variables even when a step
        # above fails, so repeated notebook runs do not accumulate them.
        if input_rdd is not None:
            input_rdd.unpersist()
        broadcast_field_mappings.unpersist()
        broadcast_extractor_config_path.unpersist()



def transform_row(row, config: dict) -> Row:
    """Map one raw input row onto the normalised output schema.

    Args:
        row: Input record; source fields are read with ``getattr`` using the
            attribute names configured in ``config``, and ``row.filename`` is
            read directly.
        config: Field mappings for the platform (source attribute names, the
            layout-type lookup table, dataset name and data source category).

    Returns:
        A Row with track_id, url, html, page_layout_type, domain,
        dataset_name, data_source_category and meta_info.
    """
    mappings = config

    # A fresh UUID is used when the source row carries no track id.
    track_id = getattr(row, mappings["track_id"], str(uuid.uuid4()))
    url = getattr(row, mappings["url"], '')
    html = getattr(row, mappings["html"], '')

    # Unknown or missing layout labels fall back to "article".
    layout_lookup = mappings.get("page_layout_type_map")
    page_layout_type = layout_lookup.get(getattr(row, mappings["layout_field"], ''), "article")

    domain = extract_domain_info(url)['domain']
    dataset_name = mappings["dataset_name"]
    data_source_category = mappings["data_source_category"]
    meta_info = {"filename": row.filename}

    return Row(
        track_id=track_id,
        url=url,
        html=html,
        page_layout_type=page_layout_type,
        domain=domain,
        dataset_name=dataset_name,
        data_source_category=data_source_category,
        meta_info=meta_info,
    )




def extract_data(partition, broadcast_extractor_config_path):
    """Run the extractor chain over every row of one partition.

    Each row is handled independently: a timeout or failure on one row yields
    that row with an ``__error`` field and processing continues with the next
    row, so a single bad record no longer drops the rest of the partition.

    Args:
        partition: Iterable of rows exposing ``asDict()``.
        broadcast_extractor_config_path: Path of the extractor config file.
            When empty/None the previously hard-coded default path is used.

    Yields:
        Row: The extracted record on success, otherwise the input fields plus
        an ``__error`` dict (error_type, error_message, traceback).
    """
    from llm_web_kit.extractor.extractor_chain import ExtractSimpleFactory
    from loguru import logger

    config_path = (
        broadcast_extractor_config_path
        or '/share/jiangwenhao/notebooks/定向子集专项/subset-spliz.jsonc'
    )
    # Build the chain once per partition rather than once per row.
    extractor_chain = ExtractSimpleFactory.create(config_path)
    timeout_seconds = 10

    for row in partition:
        # Bound before the try so the handlers can always attach the error,
        # even when row.asDict() itself is what failed.
        d = {}
        try:
            d = row.asDict()
            input_data = DataJson(d)
            data_e: DataJson = func_timeout(timeout_seconds, extractor_chain.extract, args=(input_data,))
            result = Row(**data_e.to_dict())
        except FunctionTimedOut:
            # Timeouts get their own clause, separate from ordinary failures.
            d['__error'] = {
                "error_type": "TIMEOUT",
                "error_message": "extract function timeout",
                "traceback": "TIMEOUT"
            }
            result = Row(**d)
        except Exception as e:
            # Keep the detailed failure information on the row itself.
            error_info = {
                "error_type": type(e).__name__,
                "error_message": str(e),
                "traceback": traceback.format_exc(),
            }
            logger.error(error_info)
            d['__error'] = error_info
            result = Row(**d)
        yield result
           


def _safe_extract(data: Dict, extractor, timeout: int = 10) -> Dict:
    """Extract a single record and return a plain dict (never a generator).

    Args:
        data: Input record; either a mapping or an object exposing
            ``asDict()``.
        extractor: Object whose ``extract`` method is applied to the record
            wrapped in a ``DataJson``.
        timeout: Maximum number of seconds allowed for ``extractor.extract``.

    Returns:
        The extracted record as a dict on success. On timeout or failure, a
        copy of the input fields with an added ``__error`` dict
        (error_type, error_message, traceback). The input is not mutated by
        this function.
    """
    record = {}
    try:
        record = data.asDict() if hasattr(data, "asDict") else dict(data)
        input_data = DataJson(record)
        data_e: DataJson = func_timeout(timeout, extractor.extract, args=(input_data,))
        return data_e.to_dict()
    except FunctionTimedOut:
        # Same timeout marker as the partition-level extractor in this notebook.
        record['__error'] = {
            "error_type": "TIMEOUT",
            "error_message": "extract function timeout",
            "traceback": "TIMEOUT"
        }
        return record
    except Exception as e:
        # Keep the detailed failure information on the returned record.
        error_info = {
            "error_type": type(e).__name__,
            "error_message": str(e),
            "traceback": traceback.format_exc(),
        }
        logger.error(error_info)
        record['__error'] = error_info
        return record


from urllib.parse import urlparse

def extract_domain_info(url: str) -> dict:
    """Break a URL down into its network location and domain parts.

    Args:
        url: URL string to analyse.

    Returns:
        dict with four keys: ``full_url`` (the input), ``netloc`` (as parsed
        by ``urlparse``), ``domain`` (the netloc text before the first ":")
        and ``root_domain`` (the last two dot-separated labels of ``domain``,
        or ``domain`` itself when it has fewer than two labels).
    """
    netloc = urlparse(url).netloc
    # Text before the first ":" — this is what removes a trailing port.
    domain = netloc.partition(":")[0]

    labels = domain.split(".")
    if len(labels) >= 2:
        root_domain = ".".join(labels[-2:])
    else:
        root_domain = domain

    return dict(
        full_url=url,
        netloc=netloc,
        domain=domain,
        root_domain=root_domain,
    )
        
def handle_error(row: Dict, error: Exception) -> Dict:
    """Return the row's fields plus a uniform ``__error`` description.

    Args:
        row: The record being processed; either a mapping or an object
            exposing ``asDict()``.
        error: The exception raised while processing ``row``.

    Returns:
        A new dict holding every field of ``row`` and an ``__error`` entry
        with the exception's type name, message and formatted traceback.
    """
    # Unpack the dict form: a Row is a tuple and cannot be **-unpacked.
    row_dict = row.asDict() if hasattr(row, "asDict") else dict(row)
    return {
        **row_dict,
        "__error": {
            "type": type(error).__name__,
            "message": str(error),
            # Format the exception that was passed in, so the result is also
            # meaningful when called outside an active ``except`` block.
            "traceback": "".join(
                traceback.format_exception(type(error), error, error.__traceback__)
            ),
        },
    }

    
def extract_platform_from_s3_path(s3_path: str) -> str:
    """
    Return the platform name encoded in an S3 path: the first directory
    after the bucket.

    Examples:
    - "s3://private-cooperate-data/zh-web-baijiahao/20241218_p1/"
      gives "zh-web-baijiahao"
    - "s3://private-cooperate-data/DouBan/"
      gives "DouBan"

    Raises:
        ValueError: If the path has fewer than three non-blank "/"-separated
            segments (scheme, bucket, platform).
    """
    # Blank segments (from "//" or a trailing "/") are discarded.
    segments = list(filter(lambda piece: piece.strip() != "", s3_path.split("/")))

    # segments[0] is the scheme, segments[1] the bucket, segments[2] the platform.
    if len(segments) >= 3:
        return segments[2]

    raise ValueError(f"无效的 S3 路径格式: {s3_path}")

#
#
# config = {
#     "spark_conf_name": "spark_4",
#     "skip_success_check": True,
#     "spark.executor.memory":"8g",
#     # 根据个人路径进行替换1
#     "spark.executorEnv.LLM_WEB_KIT_CFG_PATH": "/share/jiangwenhao/.llm-web-kit.jsonc",
#     "spark.dynamicAllocation.maxExecutors": "400"
# }
# paths = [
# #"s3://private-cooperate-data/zh-web-baijiahao/",
# #"s3://private-crawl-data/zh-web-netease/20241218_p1/",
#    "s3://private-crawl-data/zh-web-tencent/20241218_p1/",
#     "s3://private-crawl-data/zh-web-sohu/20241218_p1/",
#  #   "s3://private-crawl-data/zh-web-sina/20241218_p1/",
#    # "s3://crawl-data/blog_sina_com_cn/gz_file/1729501052/",
#    # "s3://private-cooperate-data/DouBan/"
#
#
# ]


# spark = new_spark_session("llm_kit_cc", config)
# version="008"
# for path in paths:
#     platform = extract_platform_from_s3_path(path)
#     print(f"路径: {path} → 平台: {platform}")
#     platform_configs = {
#
#     "zh-web-baijiahao": {
#
#         "field_mappings": {
#             "track_id":"track_id",
#             "page_layout_type_map" :{
#                                 "":"article",
#                                 "文章":"article",
#                                 "网易":'article',
#                                 "视频":"video"
#                             },
#         "dataset_name": "baijiahao",
#         "url":"url",
#         "html":"content",
#         "layout_field": "channel",
#         "data_source_category":"JSON"
#         },
#         "extractor_config": "/share/jiangwenhao/notebooks/定向子集专项/subset-spliz.jsonc",
#         "output_template": f"s3://llm-users-phdd2/jiangwenhao/article/{platform}/v{version}/"
#
#     },
#     "zh-web-netease": {
#
#         "field_mappings": {
#              "track_id":"track_id",
#             "page_layout_type_map" :{
#                                 "":"article",
#                                 "文章":"article",
#                                 "网易":'article',
#                                 "视频":"video"
#                             },
#         "dataset_name": "net-ease",
#         "url":"url",
#         "html":"content",
#         "layout_field": "f_name",
#         "data_source_category":"HTML"
#         },
#         "extractor_config": "/share/jiangwenhao/notebooks/定向子集专项/subset-spliz.jsonc",
#         "output_template": f"s3://llm-users-phdd2/jiangwenhao/article/{platform}/v{version}/"
#     },
#     "zh-web-tencent": {
#
#         "field_mappings": {
#              "track_id":"track_id",
#             "page_layout_type_map" :{
#                                 "":"article",
#                                 "文章":"article",
#                                 "网易":'article',
#                                 "视频":"video",
#                                 "腾讯网":"article"
#                             },
#         "dataset_name": "tencent",
#         "url":"url",
#         "html":"content",
#         "layout_field": "f_name",
#         "data_source_category":"HTML"
#         },
#         "extractor_config": "/share/jiangwenhao/notebooks/定向子集专项/subset-spliz.jsonc",
#         "output_template": f"s3://llm-users-phdd2/jiangwenhao/article/{platform}/v{version}/"
#     },
#      "zh-web-sohu": {
#
#         "field_mappings": {
#              "track_id":"track_id",
#             "page_layout_type_map" :{
#                                 "":"article",
#                                 "文章":"article",
#                                 "网易":'article',
#                                 "视频":"video",
#                                 "搜狐网":"article"
#                             },
#         "dataset_name": "souhu",
#         "url":"url",
#         "html":"content",
#         "layout_field": "f_name",
#         "data_source_category":"HTML"
#         },
#         "extractor_config": "/share/jiangwenhao/notebooks/定向子集专项/subset-spliz.jsonc",
#         "output_template": f"s3://llm-users-phdd2/jiangwenhao/article/{platform}/v{version}/"
#     },
#     "zh-web-sina": {
#
#         "field_mappings": {
#              "track_id":"track_id",
#             "page_layout_type_map" :{
#                                 "":"article",
#                                 "文章":"article",
#                                 "网易":'article',
#                                 "视频":"video",
#                                 "搜狐网":"article",
#                                 "黑猫投诉":"forum"
#                             },
#         "dataset_name": "sina",
#         "url":"url",
#         "html":"content",
#         "layout_field": "f_name",
#         "data_source_category":"HTML"
#         },
#         "extractor_config": "/share/jiangwenhao/notebooks/定向子集专项/subset-spliz.jsonc",
#         "output_template": f"s3://llm-users-phdd2/jiangwenhao/article/{platform}/v{version}/"
#     },
#
#     "blog_sina_com_cn": {
#
#         "field_mappings": {
#              "track_id":"track_id",
#             "page_layout_type_map" :{
#                                 "":"article",
#                                 "文章":"article",
#                                 "网易":'article',
#                                 "视频":"video",
#                                 "搜狐网":"article",
#                                 "黑猫投诉":"forum"
#                             },
#         "dataset_name": "sina",
#         "url":"url",
#         "html":"html",
#         "layout_field": "f_name",
#         "data_source_category":"HTML"
#         },
#         "extractor_config": "/share/jiangwenhao/notebooks/定向子集专项/subset-spliz.jsonc",
#         "output_template": f"s3://llm-users-phdd2/jiangwenhao/article/{platform}/v{version}/"
#     },
#     "DouBan": {
#
#         "field_mappings": {
#              "track_id":"track_id",
#             "page_layout_type_map" :{
#                                 "":"article",
#                                 "文章":"article",
#                                 "网易":'article',
#                                 "视频":"video",
#                                 "搜狐网":"article",
#                                 "黑猫投诉":"forum",
#                                 "豆瓣网":""
#                             },
#         "dataset_name": "sina",
#         "url":"url",
#         "html":"html",
#         "layout_field": "f_name",
#         "data_source_category":"HTML"
#         },
#         "extractor_config": "/share/jiangwenhao/notebooks/定向子集专项/subset-spliz.jsonc",
#         "output_template": f"s3://llm-users-phdd2/jiangwenhao/article/{platform}/v{version}/"
#     },
#}
    # 处理正式数据
 #  process_platform_data(
 #      spark=spark,
 #      config=config,
 #      platform=platform,
 #      input_paths=[path],
 #      version=version
 # )


    # 处理其他平台数据
    # processor.process_platform_data("other_platform", [...])

In [2]:

import os
from pyspark.sql import Row, SparkSession
from typing import Dict, Any
import json
import uuid
from functools import partial
from typing import Iterable, Dict, Any
# create spark session
from pyspark.sql import Row, DataFrame
from loguru import logger
from xinghe.spark import *
from app.common.json_util import *

from xinghe.s3 import *


import uuid
import traceback
from datetime import datetime
from llm_web_kit.input.datajson import DataJson
from func_timeout import FunctionTimedOut, func_timeout

In [4]:
version = "001"
sub_dir = "202401"
input_paths = ['s3://crawl-data/blog_sina_com_cn/gz_file/1729501052/']
platform = extract_platform_from_s3_path(input_paths[0])

# Values shared by the platform entries below.
_extractor_config = "/share/jiangwenhao/notebooks/定向子集专项/subset-spliz.jsonc"
_default_output = f"s3://llm-users-phdd2/jiangwenhao/article/{platform}/v{version}/"


def _platform_entry(dataset_name, html_field, layout_field, category,
                    extra_layouts=None, output_template=None):
    """Build one platform configuration entry.

    Args:
        dataset_name: Value stored under ``dataset_name``.
        html_field: Source attribute that holds the HTML.
        layout_field: Source attribute that holds the layout label.
        category: Value stored under ``data_source_category``.
        extra_layouts: Layout labels appended after the four common ones.
        output_template: Output location; defaults to the path without
            ``sub_dir``.
    """
    # A new dict per entry, so entries never share a mutable lookup table.
    layout_map = {
        "": "article",
        "文章": "article",
        "网易": 'article',
        "视频": "video",
    }
    if extra_layouts:
        layout_map.update(extra_layouts)
    return {
        "field_mappings": {
            "track_id": "track_id",
            "page_layout_type_map": layout_map,
            "dataset_name": dataset_name,
            "url": "url",
            "html": html_field,
            "layout_field": layout_field,
            "data_source_category": category,
        },
        "extractor_config": _extractor_config,
        "output_template": _default_output if output_template is None else output_template,
    }


platform_configs = {
    # Only this entry places its output under ``sub_dir``.
    "zh-web-baijiahao": _platform_entry(
        "baijiahao", "content", "channel", "JSON",
        output_template=f"s3://llm-users-phdd2/jiangwenhao/article/{platform}/{sub_dir}/v{version}/",
    ),
    "zh-web-netease": _platform_entry("net-ease", "content", "f_name", "HTML"),
    "zh-web-tencent": _platform_entry(
        "tencent", "content", "f_name", "HTML",
        extra_layouts={"腾讯网": "article"},
    ),
    "zh-web-sohu": _platform_entry(
        "souhu", "content", "f_name", "HTML",
        extra_layouts={"搜狐网": "article"},
    ),
    "zh-web-sina": _platform_entry(
        "sina", "content", "f_name", "HTML",
        extra_layouts={"搜狐网": "article", "黑猫投诉": "forum", "新浪网": "article"},
    ),
    "blog_sina_com_cn": _platform_entry(
        "sina", "html", "f_name", "HTML",
        extra_layouts={"搜狐网": "article", "黑猫投诉": "forum"},
    ),
    # NOTE(review): dataset_name is "sina" here as well and "豆瓣网" maps to an
    # empty layout type — both look copy-pasted; confirm before running DouBan.
    "DouBan": _platform_entry(
        "sina", "html", "f_name", "HTML",
        extra_layouts={"搜狐网": "article", "黑猫投诉": "forum", "豆瓣网": ""},
    ),
}
platform_config = platform_configs.get(platform)
platform_config


{'field_mappings': {'track_id': 'track_id',
  'page_layout_type_map': {'': 'article',
   '文章': 'article',
   '网易': 'article',
   '视频': 'video',
   '搜狐网': 'article',
   '黑猫投诉': 'forum'},
  'dataset_name': 'sina',
  'url': 'url',
  'html': 'html',
  'layout_field': 'f_name',
  'data_source_category': 'HTML'},
 'extractor_config': '/share/jiangwenhao/notebooks/定向子集专项/subset-spliz.jsonc',
 'output_template': 's3://llm-users-phdd2/jiangwenhao/article/blog_sina_com_cn/v001/'}

In [7]:
# Display the `spark` object; the name is assigned by new_spark_session(...) in
# another cell, so that cell must have been run first.
spark

In [7]:
# input_paths =[f's3://zhuanxiang-hw60p/article/sina/v006/']





# Driver 端操作
# input_df = read_any_path(spark, ",".join(input_paths), config)


In [9]:
#def is_error__(row):
#    if "__error__" in row:  # 先检查键是否存在
#        return True
#    else:
#        return False
#
#def del_error__(row):
#    row.pop("__error__", None)  # 若键不存在，返回 None 且不报错
#    return row
#final_df = input_df.rdd.map(lambda x: json.loads(x.value)).filter(is_error__)
#final_df.take(1)[0]



                                                                                

IndexError: list index out of range

In [53]:
# NOTE(review): `final_df` is only assigned in a commented-out cell of this
# notebook, so this call depends on leftover kernel state and will fail on a
# fresh kernel.
write_any_path(final_df, "s3://zhuanxiang-hw60p/article/sina/v006/")

                                                                                

{'rows': 180601564,
 'bytes': {'sum': 1711075727829,
  'min': 516,
  'max': 14546436,
  'cnt': 180601564,
  'avg': 9474.313},
 'files': 2228,
 'sub_paths': {}}

In [6]:

# Spark job settings handed to new_spark_session / read_any_path.
config = {
    "spark_conf_name": "spark_4",
    "skip_success_check": True,
    # Replace with your own path.
    "spark.executorEnv.LLM_WEB_KIT_CFG_PATH": "/share/jiangwenhao/.llm-web-kit.jsonc",
    "spark.dynamicAllocation.maxExecutors": "400",
    "spark.executor.cores": "20"
}
spark = new_spark_session("llm_kit_cc", config)


print(f"路径: {input_paths[0]} → 平台: {platform}")


# The bare string below is a leftover docstring from process_platform_data; as a
# standalone expression statement it has no effect.
"""接收 spark 作为参数而不是持有它"""
platform_config = platform_configs.get(platform)
if not platform_config:
    raise ValueError(f"Unsupported platform: {platform}")

# Driver side: load the raw input.
input_df = read_any_path(spark, ",".join(input_paths), config)
print(f"读取数据结束")
# Settings needed on the workers.
worker_config = {
    "field_mappings": platform_config.get("field_mappings"),
    "extractor_config": platform_config.get("extractor_config")
}
# Despite the `broadcast_` prefix these are plain Python values taken from
# worker_config, not Spark broadcast variables.
broadcast_field_mappings = worker_config["field_mappings"]
broadcast_extractor_config_path = worker_config["extractor_config"] # extractor config path


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


路径: s3://crawl-data/blog_sina_com_cn/gz_file/1729501052/ → 平台: blog_sina_com_cn
读取数据结束


In [91]:
# NOTE(review): the displayed result of this call contains the endpoint together
# with the access key and secret key; clear the cell output before sharing.
get_s3_config("s3://web-parse-hw60p/xuchao/zx-html-error/")


{'endpoint': 'http://10.140.104.11',
 'ak': '<redacted>',
 'sk': '<redacted>'}

                                                                                

{'value': '{"track_id":"3edfc1c9-c87b-480f-a26e-b90a7d54c24a","url":"https://blog.sina.com.cn/s/article_archive_1884248122_201206_1.html","status":200,"html":"<!DOCTYPE html PUBLIC \\"-//W3C//DTD XHTML 1.0 Transitional//EN\\" \\"//www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\\"><html xmlns=\\"//www.w3.org/1999/xhtml\\"><head><script id=\\"sinaads-ck-script\\" charset=\\"utf-8\\" src=\\"//d4.sina.com.cn/litong/zhitou/sinaads/src/spec/sinaads_ck.js\\"></script><script src=\\"https://d8.sina.com.cn/litong/zhitou/wenjing28/js/postMan.js\\"></script>\\n<meta http-equiv=\\"Content-Type\\" content=\\"text/html; charset=utf-8\\">\\n<title>归档_安徽文学_新浪博客</title>\\n<meta http-equiv=\\"X-UA-Compatible\\" content=\\"IE=EmulateIE8,chrome=1\\">\\n<meta name=\\"renderer\\" content=\\"webkit\\">\\n<meta name=\\"keywords\\" content=\\"\\">\\n<meta name=\\"description\\" content=\\"\\">\\n<meta content=\\"always\\" name=\\"referrer\\">\\n<!--–[if lte IE 6]-->\\n<script type=\\"text/javascript\\">\\ntr

In [8]:
# input_df = read_any_path(spark, ",".join(['s3a://private-crawl-data/zh-web-tencent/20241218_p1/2020/1733715370691.json.gz']), config)
# input_df.show()
# Peek at the first input row; indexing [0] raises IndexError when the input is empty.
input_df.take(1)[0]

25/03/14 11:28:27 WARN YarnScheduler: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
                                                                                

28174242

In [12]:

from pyspark.sql.functions import col
from pyspark.sql.functions import udf
from pyspark.sql.types import BooleanType
# Safe parse check: reports whether a value is valid JSON instead of raising.
def safe_json_loads(s):
    """Return True when ``s`` can be parsed by ``json.loads``, otherwise False.

    Only ``Exception`` subclasses are treated as "not valid JSON" (this covers
    malformed text, non-string input such as None, and overly deep nesting).
    KeyboardInterrupt and SystemExit are allowed to propagate so the check
    cannot swallow an interrupt.

    Args:
        s: Value to test, normally a JSON string.

    Returns:
        bool: Whether parsing succeeded.
    """
    try:
        json.loads(s)
    except Exception:
        return False
    return True
# Register the Python function as a Spark UDF that returns a boolean.
safe_json_udf = udf(safe_json_loads, BooleanType())

# Use the Column expression produced by the UDF inside filter, keeping only the
# rows for which safe_json_loads(value) is True.
df_filtered = input_df.filter(safe_json_udf(col("value")))

# Display the field mappings currently in use.
broadcast_field_mappings

{'track_id': 'track_id',
 'page_layout_type_map': {'': 'article',
  '文章': 'article',
  '网易': 'article',
  '视频': 'video',
  '搜狐网': 'article',
  '黑猫投诉': 'forum'},
 'dataset_name': 'sina',
 'url': 'url',
 'html': 'html',
 'layout_field': 'f_name',
 'data_source_category': 'HTML'}

In [13]:
# Count the rows that passed the JSON check; the recorded run of this cell was
# interrupted from the keyboard before it finished.
df_filtered.count()

ERROR:root:KeyboardInterrupt while sending command.            (49 + 188) / 237]
Traceback (most recent call last):
  File "/share/jiangwenhao/envs/code_clean_venv/lib/python3.10/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/share/jiangwenhao/envs/code_clean_venv/lib/python3.10/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/share/python/3.10.9/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [None]:


# Parse each row's `value` JSON string and build a Row from its keys, adding the
# source `filename` as an extra field.
input_rdd = df_filtered.rdd.map(lambda x: Row(**{**json.loads(x.value), "filename": x.filename}))





In [None]:

def transform_row(row, config: dict) -> Row:
    """Map one raw input row onto the normalised output schema.

    Fields are collected one by one so that, when a step fails, the row
    returned still carries every field computed so far plus an ``__error``
    entry, instead of failing again inside the error handler.

    Args:
        row: Input record; source fields are read with ``getattr`` using the
            attribute names configured in ``config``, and ``row.filename`` is
            read directly.
        config: Field mappings for the platform (source attribute names, the
            layout-type lookup table, dataset name and data source category).

    Returns:
        Row: On success, a Row with track_id, url, html, page_layout_type,
        domain, dataset_name, data_source_category and meta_info. On failure,
        a Row with the fields gathered before the failure and an ``__error``
        dict (error_type, error_message, traceback).
    """
    from loguru import logger

    mappings = config
    fields = {}
    try:
        # A fresh UUID is used when the source row carries no track id.
        fields["track_id"] = getattr(row, mappings["track_id"], str(uuid.uuid4()))
        fields["url"] = getattr(row, mappings["url"], '')
        fields["html"] = getattr(row, mappings["html"], '')
        # Unknown or missing layout labels fall back to "article".
        fields["page_layout_type"] = mappings.get("page_layout_type_map").get(
            getattr(row, mappings["layout_field"], ''),
            "article"
        )
        fields["domain"] = extract_domain_info(fields["url"])['domain']
        fields["dataset_name"] = mappings["dataset_name"]
        fields["data_source_category"] = mappings["data_source_category"]
        fields["meta_info"] = {"filename": row.filename}
        return Row(**fields)
    except Exception as e:
        # The partially built fields stay on the row itself, so the error
        # record does not need a second copy of them.
        error_info = {
            "error_type": type(e).__name__,
            "error_message": str(e),
            "traceback": traceback.format_exc(),
        }
        logger.error(error_info)
        fields['__error'] = error_info
        return Row(**fields)
   
    
 
#  pandas_df = input_rdd.toDF().toPandas()
#  print(pandas_df)

# Data transformation (uses only the field mappings); the result is
# redistributed into 6000 partitions.
transformed_rdd = input_rdd.map(
    lambda row: transform_row(row, broadcast_field_mappings)
).repartition(6000)


In [None]:
# Count the transformed rows.
transformed_rdd.count()

In [None]:
# Keep only the rows for which `"__error" in row` is false, then count them.
transformed_not_error_rdd = transformed_rdd.filter(lambda x:"__error" not in x)
transformed_not_error_rdd.count()



In [None]:
# Drop rows whose html is the empty string; the result is marked for caching
# because it is counted here and reused by the extraction step.
transformed_not_empty_rdd = transformed_not_error_rdd.filter(lambda x :x.html !='').cache()
transformed_not_empty_rdd.count()

## 测试 To_Main_HTML

In [None]:


#timeout_seconds = 10
#extractor_chain = ExtractSimpleFactory.create('/share/jiangwenhao/notebooks/定向子集专项/subset-spliz.jsonc')
#data_e: DataJson = func_timeout(timeout_seconds, extractor_chain.extract, args=(input_data,))
#d = json.loads(data_e.to_json())
#d['to_main_html'] = data_e.get_content_list().to_main_html()
#d

In [52]:
# transformed_rdd_1 = transformed_rdd.filter(lambda x: "<!--VIDEO_0-->"  not in x.html)
# transformed_rdd_2 = transformed_rdd_1.filter(lambda x: "<!--MUSIC_0-->" not in x.html)
# transformed_rdd_2 = transformed_rdd_2.cache()
# NOTE(review): this RDD is already created with .cache() in the cell that
# defines it, so this second call looks redundant.
transformed_not_empty_rdd.cache()
# 18660
# 62
# 50000-(18660+62)=31278


MapPartitionsRDD[54] at coalesce at NativeMethodAccessorImpl.java:0

## Extractor

In [None]:


def extract_data(partition, broadcast_extractor_config_path):
    """Run the extractor chain over every row of one partition.

    Each row is handled independently: a timeout or failure yields that row
    with an ``__error`` field and processing continues with the next row.

    Args:
        partition: Iterable of rows exposing ``asDict()``.
        broadcast_extractor_config_path: Path of the extractor config file.
            When empty/None the previously hard-coded default path is used.

    Yields:
        Row: The extracted record on success, otherwise the input fields plus
        an ``__error`` dict.
    """
    from llm_web_kit.extractor.extractor_chain import ExtractSimpleFactory
    from loguru import logger

    # Honour the path passed by the caller instead of ignoring it.
    config_path = (
        broadcast_extractor_config_path
        or '/share/jiangwenhao/notebooks/定向子集专项/subset-spliz.jsonc'
    )
    # Build the chain once per partition rather than once per row.
    extractor_chain = ExtractSimpleFactory.create(config_path)
    timeout_seconds = 10

    for row in partition:
        # Bound before the try so the handlers can always attach the error,
        # even when row.asDict() itself is what failed.
        d = {}
        try:
            d = row.asDict()
            input_data = DataJson(d)
            data_e: DataJson = func_timeout(timeout_seconds, extractor_chain.extract, args=(input_data,))
            result = Row(**data_e.to_dict())
        except FunctionTimedOut:
            # Timeouts get their own clause, separate from ordinary failures.
            d['__error'] = {
                "error_type": "TIMEOUT",
                "error_message": "extract function timeout",
                "traceback": "TIMEOUT"
            }
            result = Row(**d)
        except Exception as e:
            # Keep the detailed failure information, including a snapshot of
            # the input fields, on the row itself.
            error_info = {
                "error_data": Row(**d),
                "error_type": type(e).__name__,
                "error_message": str(e),
                "traceback": traceback.format_exc(),
            }
            logger.error(error_info)
            d['__error'] = error_info
            result = Row(**d)
        yield result
           


# Data extraction (uses only the extractor config path), applied partition by
# partition to the non-empty, error-free rows.
processed_rdd = transformed_not_empty_rdd.mapPartitions(
     lambda x:extract_data(
        x,broadcast_extractor_config_path = broadcast_extractor_config_path
    )
)



In [31]:

# The bare `spark` expression is not the last statement of the cell, so it
# displays nothing; only the first processed row below is shown.
spark
processed_rdd.take(1)[0]

DEBUG:Command to send: c
o511
partitions
e

DEBUG:Answer received: !ylo703
DEBUG:Command to send: c
o703
size
e

DEBUG:Answer received: !yi6000
DEBUG:Command to send: r
u
PythonRDD
rj
e

DEBUG:Answer received: !ycorg.apache.spark.api.python.PythonRDD
DEBUG:Command to send: r
m
org.apache.spark.api.python.PythonRDD
runJob
e

DEBUG:Answer received: !ym
DEBUG:Command to send: c
o421
sc
e

DEBUG:Answer received: !yro704
DEBUG:Command to send: r
u
PythonUtils
rj
e

DEBUG:Answer received: !ycorg.apache.spark.api.python.PythonUtils
DEBUG:Command to send: r
m
org.apache.spark.api.python.PythonUtils
getBroadcastThreshold
e

DEBUG:Answer received: !ym
DEBUG:Command to send: c
z:org.apache.spark.api.python.PythonUtils
getBroadcastThreshold
ro421
e

DEBUG:Answer received: !yL1048576
DEBUG:Command to send: r
u
SimplePythonFunction
rj
e

DEBUG:Answer received: !ycorg.apache.spark.api.python.SimplePythonFunction
DEBUG:Command to send: i
java.util.HashMap
e

DEBUG:Answer received: !yao705
DEBUG:Comman

Row(track_id='c5825dc9-dd61-4ef3-9517-89bab95dfdae', url='https://tousu.sina.com.cn/complaint/view/17358758620#gsdata_cmd5=60d4284ce0d5ebbf73a11d28c5d298db', html='<div class="ts-d-item"> \n <!-- 底部处理流程头像部分 --> \n <div class="ts-d-user clearfix"> \n  <img src="" alt="" class="avatar"> \n  <span class="u-name">黑猫消费者服务平台</span> \n  <span class="u-status">商家处理中</span> \n </div> \n <div class="ts-d-cont"> \n  <p>已分配商家 京东客服</p> \n  <br> \n  <p></p> \n </div> \n</div>', page_layout_type='forum', domain='tousu.sina.com.cn', dataset_name='sina', data_source_category='HTML', meta_info={'filename': 's3://private-crawl-data/zh-web-sina/20241218_p1/2022/1732884424577.json.gz', 'statics': {'paragraph': 3, 'paragraph.text': 3}}, content_list=[[{'type': 'paragraph', 'raw_content': '<div class="ts-d-item"><div class="ts-d-user clearfix"><span class="u-name">黑猫消费者服务平台</span></div></div>', 'content': [{'c': '黑猫消费者服务平台', 't': 'text'}]}, {'type': 'paragraph', 'raw_content': '<div class="ts-d-item"><div cl

DEBUG:Command to send: m
d
o712
e

DEBUG:Answer received: !yv
DEBUG:Command to send: m
d
o713
e

DEBUG:Answer received: !yv


## 排除error 数据

In [None]:
# Keep the rows for which `"__error" in row` is false and mark the result for
# persistence, since it is both counted and written out later.
processed_rdd_filter = processed_rdd.filter(lambda x:"__error" not in x).persist()


In [None]:
# Count the rows that were extracted without an error.
processed_rdd_filter.count()



In [98]:
# Select the rows that carry an "__error" field and write them, one JSON string
# per row, to a separate error location.
# NOTE(review): `json_dumps` is not defined in this notebook; presumably it
# comes from the star import of app.common.json_util — confirm.
processed_rdd_error_filter = processed_rdd.filter(lambda x:"__error"  in x)
write_any_path(processed_rdd_error_filter.map(lambda x: Row(value=json_dumps(x.asDict()))).toDF(),"s3://web-parse-hw60p/xuchao/zx-html-error/sina_blog/v001")

25/03/13 18:34:53 WARN TaskSetManager: Lost task 1628.0 in stage 23.0 (TID 46535) (host-10-140-92-226 executor 218): TaskKilled (Stage cancelled)
25/03/13 18:35:01 WARN TaskSetManager: Lost task 5177.0 in stage 23.0 (TID 46045) (host-10-140-92-136 executor 183): TaskKilled (Stage cancelled)
25/03/13 18:35:03 WARN TaskSetManager: Lost task 2117.0 in stage 23.0 (TID 44120) (host-10-140-92-90 executor 140): TaskKilled (Stage cancelled)
25/03/13 18:35:10 WARN TaskSetManager: Lost task 1091.0 in stage 23.0 (TID 43811) (host-10-140-92-228 executor 193): TaskKilled (Stage cancelled)
                                                                                

{'rows': 167733,
 'bytes': {'sum': 5138783050,
  'min': 1816,
  'max': 2334641,
  'cnt': 167733,
  'avg': 30636.685},
 'files': 6000,
 'sub_paths': {}}

In [88]:
# Show where the results will be written.
platform_config["output_template"]

's3://llm-users-phdd2/jiangwenhao/article/zh-web-sina/v001/'

In [89]:
# Serialise every error-free row to a single JSON string column.
result_df = processed_rdd_filter.map(lambda x: Row(value=json_dumps(x.asDict()))).toDF()

# Earlier variant that wrote all processed rows (kept commented out):
#write_any_path(processed_rdd.map(lambda x: Row(value=json.dumps(x.asDict()))).toDF(), platform_config["output_template"],config)

# Display the platform configuration currently in use.
platform_config

                                                                                

{'field_mappings': {'track_id': 'track_id',
  'page_layout_type_map': {'': 'article',
   '文章': 'article',
   '网易': 'article',
   '视频': 'video',
   '搜狐网': 'article',
   '黑猫投诉': 'forum',
   '新浪网': 'article'},
  'dataset_name': 'sina',
  'url': 'url',
  'html': 'content',
  'layout_field': 'f_name',
  'data_source_category': 'HTML'},
 'extractor_config': '/share/jiangwenhao/notebooks/定向子集专项/subset-spliz.jsonc',
 'output_template': 's3://llm-users-phdd2/jiangwenhao/article/zh-web-sina/v001/'}

In [95]:
# Re-check the output path right before writing (it is used as the write target below).
platform_config["output_template"]

's3://llm-users-phdd2/jiangwenhao/article/zh-web-sina/v002/'

25/03/13 17:29:13 WARN TaskSetManager: Lost task 211.0 in stage 17.0 (TID 30427) (host-10-140-93-45 executor 241): TaskKilled (Stage cancelled)
25/03/13 17:29:14 WARN TaskSetManager: Lost task 4973.0 in stage 17.0 (TID 34145) (host-10-140-93-29 executor 116): TaskKilled (Stage cancelled)
25/03/13 17:29:14 WARN TaskSetManager: Lost task 4052.0 in stage 17.0 (TID 34681) (host-10-140-92-123 executor 34): TaskKilled (Stage cancelled)
25/03/13 17:29:14 WARN TaskSetManager: Lost task 1117.0 in stage 17.0 (TID 35156) (host-10-140-93-63 executor 87): TaskKilled (Stage cancelled)
25/03/13 17:29:14 WARN TaskSetManager: Lost task 522.0 in stage 17.0 (TID 30992) (host-10-140-92-28 executor 250): TaskKilled (Stage cancelled)
25/03/13 17:29:14 WARN TaskSetManager: Lost task 521.0 in stage 17.0 (TID 30742) (host-10-140-92-28 executor 250): TaskKilled (Stage cancelled)
25/03/13 17:29:14 WARN TaskSetManager: Lost task 649.0 in stage 17.0 (TID 32503) (host-10-140-92-28 executor 249): TaskKilled (Stage c

In [96]:
# Write the JSON-line DataFrame to the configured output path.
write_any_path(result_df, platform_config["output_template"])



                                                                                

{'rows': 229402632,
 'bytes': {'sum': 1764715225792,
  'min': 380,
  'max': 14546436,
  'cnt': 229402632,
  'avg': 7692.655},
 'files': 6000,
 'sub_paths': {}}

In [None]:
# List the objects found under the tencent article prefix.
x = "s3://llm-users-phdd2/jiangwenhao/article/zh-web-tencent/"
s3 = get_s3_client(x)
# Bind the listing to a real name: assigning to `_` shadows IPython's
# last-output variable and stops it from being updated for later cells.
tencent_objects = list(list_s3_objects(client=s3, path=x))
tencent_objects

In [None]:
# Inspect the S3 configuration resolved for this path.
# NOTE(review): the returned config may contain credentials — confirm, and clear
# this cell's output before sharing the notebook.
get_s3_config("s3://llm-users-phdd2/jiangwenhao/article/zh-web-tencent/2019/v017/")

In [None]:
# Clean up the broadcast variables.
for broadcast_var in (broadcast_field_mappings, broadcast_extractor_config_path):
    broadcast_var.unpersist()

In [None]:
from xinghe.spark import *
from app.common.json_util import *

from xinghe.s3 import *

# WARNING: destructive cell. It is invoked with dry_run=False, so it presumably
# removes the objects under the target prefix for real — double-check the path
# before running.
delete_path = "s3://llm-users-phdd2/jiangwenhao/"
client = get_s3_client(delete_path)
target_prefix = f"{delete_path}article/zh-web-baijiahao/v004"
delete_s3_object(target_prefix, client=client, dry_run=False)

In [None]:
# Non-recursive listing of the baijiahao article prefix; reuses the `client`
# created in an earlier cell.
list(list_s3_objects('s3://llm-users-phdd2/jiangwenhao/article/zh-web-baijiahao/', client = client, recursive=False))

In [None]:
   def process_platform_data(self, platform: str, input_paths: list, version: str = "001",
                             num_partitions: int = 6000):
        """Run the read -> transform -> extract -> write pipeline for one platform.

        Args:
            platform: Key looked up in ``self.platform_configs``.
            input_paths: Input locations; joined with "," and handed to
                ``read_any_path``.
            version: Output version label; left-padded with zeros to three
                characters before being substituted into the output template.
            num_partitions: Number of partitions the transformed RDD is
                repartitioned into before extraction. Defaults to 6000, the
                value that used to be hard-coded.

        Raises:
            ValueError: If ``platform`` has no (non-empty) configuration, or
                if ``input_paths`` is empty.
        """
        # Look up the platform configuration.
        config = self.platform_configs.get(platform)
        if not config:
            raise ValueError(f"Unsupported platform: {platform}")

        # An empty list would be joined into "" and handed to the reader as a
        # path; fail fast with a clear message instead.
        if not input_paths:
            raise ValueError(f"No input paths given for platform: {platform}")

        # Read the raw data.
        input_df = read_any_path(self.spark, ",".join(input_paths), config)

        # Transform each row, then spread the work over `num_partitions` partitions.
        transformed_rdd = input_df.rdd.map(
            partial(self._transform_row, platform=platform)
        ).repartition(num_partitions)

        # Run the extraction partition by partition.
        processed_rdd = transformed_rdd.mapPartitions(
            partial(self._extract_data, platform=platform)
        )

        # Build the output path and write one JSON string per row.
        output_path = config["output_template"].format(
            platform=platform,
            version=version.zfill(3)
        )
        write_any_path(
            processed_rdd.map(lambda x: Row(value=json.dumps(x))).toDF(),
            output_path,
            {"skip_output_check": True}
        )

In [None]:
from pyspark.sql import Row, SparkSession
from typing import Dict, Any
import json
import uuid
from functools import partial
from typing import Iterable, Dict, Any
# create spark session
from pyspark.sql import Row, DataFrame

from xinghe.spark import *
from app.common.json_util import *

from xinghe.s3 import *


# Spark session settings for this run.
config = {
    "spark_conf_name": "spark_4",
    "skip_success_check": True,
    # Replace with the path to your own llm-web-kit config file.
    "spark.executorEnv.LLM_WEB_KIT_CFG_PATH": "/share/jiangwenhao/.llm-web-kit.jsonc",
    "spark.dynamicAllocation.maxExecutors": "400",
}
spark = new_spark_session("llm_kit_cc", config)
processor = WebDataProcessor(spark)

# Run parameters: which platform to process, where its raw data lives,
# and the output version label.
platform, version = "netease", "002"
input_paths = ["s3://private-crawl-data/zh-web-netease/20241218_p1/"]


In [None]:
# Step through the start of process_platform_data interactively. `self` is
# aliased to the processor so method-body snippets can be pasted unchanged.
self = processor

# Look up the platform configuration (this rebinds `config`).
config = self.platform_configs.get(platform)
if not config:
    raise ValueError(f"Unsupported platform: {platform}")

# Read the raw data.
joined_input_paths = ",".join(input_paths)
input_df = read_any_path(self.spark, joined_input_paths, config)


In [None]:
# Count the raw input rows.
input_df.count()

In [None]:
# Fraction of the input to keep; 1.0 means "use everything, skip sampling".
sample_fraction = 0.1
# Fixed seed so reruns over the same input draw the same subset.
SAMPLE_SEED = 42
if sample_fraction == 1.0:
    df_sample = input_df
else:
    df_sample = input_df.sample(fraction=sample_fraction, seed=SAMPLE_SEED)

In [None]:
# Shrink the sample to at most one row for a quick smoke test (rebinds df_sample).
df_sample = df_sample.limit(1)
type(df_sample)

In [None]:
# Apply the processor's row transform to the sample, then count the result.
transform_for_platform = partial(self._transform_row, platform=platform)
transformed_rdd = df_sample.rdd.map(transform_for_platform)
transformed_rdd.count()

In [None]:
# Show the configuration registered for the selected platform.
processor.platform_configs.get(platform)