In [27]:

import os
from pyspark.sql import Row, SparkSession
from typing import Dict, Any
import json
import uuid
from functools import partial
from typing import Iterable, Dict, Any
# create spark session
from pyspark.sql import Row, DataFrame
from loguru import logger
from xinghe.spark import *
from app.common.json_util import *
from llm_web_kit.extractor.extractor_chain import ExtractSimpleFactory
from xinghe.s3 import *


import uuid
import traceback
from datetime import datetime
from llm_web_kit.input.datajson import DataJson
from func_timeout import FunctionTimedOut, func_timeout

#| track_id                  | uuid                           | 全局唯一的ID                                                                                                        | 是              |
#| dataset_name              | str                            | 数据集的名字（全局唯一），这个名字是管理员输入的，然后做索引的时候带到index里来                                     | 是              |
#| data_source_category      | str                            | 这一行数据代表的是HTML，PDF，EBOOK,CC,labCC类型                                                                     | 是，此处是 HTML |
#| html                      | 字符串                         | 以UTF-8为编码的HTML文件全文                                                                                         | 是              |
#| url                       | 字符串                         | 这个文件的来源网址                                                                                                  | 是              |
#| file_bytes                | 整数                           | 文件的size, 单位是byte                                                                                              | 是              |
#| meta_info                 | 字典                           | 存放关于文件的元信息:如果能从文件里获取到作者，制作日期等信息。或者数据本身就带有一些其他的信息都放入到这个字段里。 | 是              |
#| meta_info->input_datetime | 其格式为 `yyyy-mm-dd HH:MM:SS` | 生成这个json索引文件这一条数据的时间，可以不用那么精确                                                              | 是              |


def process_platform_data(spark: SparkSession, config, platform: str, input_paths: list, version: str = "001"):
    """Run the read -> transform -> extract -> write pipeline for one platform.

    The Spark session is received as an argument rather than owned here.

    Args:
        spark: Session used to read the input and to create broadcast variables.
        config: Job settings forwarded to ``read_any_path`` and ``write_any_path``.
        platform: Key looked up in the module-level ``platform_configs`` mapping.
        input_paths: Input locations; they are joined with "," before reading.
        version: Not used by this function (the output location comes from the
            platform's ``output_template``); kept so existing callers still work.

    Raises:
        ValueError: If ``platform`` has no entry in ``platform_configs``.
    """
    platform_config = platform_configs.get(platform)
    if not platform_config:
        raise ValueError(f"Unsupported platform: {platform}")

    # Driver side: read the raw input.
    input_df = read_any_path(spark, ",".join(input_paths), config)
    print("读取数据结束")

    # Worker side: broadcast the field mappings and the extractor config path separately.
    broadcast_field_mappings = spark.sparkContext.broadcast(platform_config.get("field_mappings"))
    broadcast_extractor_config_path = spark.sparkContext.broadcast(platform_config.get("extractor_config"))

    input_rdd = None
    try:
        print("原始数据json结构")
        input_rdd = input_df.rdd.map(
            lambda x: Row(**{**json.loads(x.value), "filename": x.filename})
        ).cache()

        # take(1) returns an empty list for empty input; indexing it blindly
        # used to raise IndexError before anything useful happened.
        sample = input_rdd.take(1)
        if not sample:
            print("Input is empty; nothing to process")
            return
        print(sample[0].asDict().keys())

        # Transformation step (uses only the field mappings).
        transformed_rdd = input_rdd.map(
            lambda row: transform_row(row, broadcast_field_mappings.value)
        )
        print("formatter数据结束")

        # Extraction step (uses only the extractor config path).
        processed_rdd = transformed_rdd.mapPartitions(
            lambda x: extract_data(
                x, broadcast_extractor_config_path=broadcast_extractor_config_path.value
            )
        )
        print("extractor数据结束")
        print("写入数据中")

        # Serialize every row to a single JSON string column and write it out.
        write_any_path(
            processed_rdd.map(lambda x: Row(value=json.dumps(x.asDict()))).toDF(),
            platform_config["output_template"],
            config,
        )
        print("写入数据结束")
    finally:
        # Release cached data and broadcast variables even when a step fails.
        if input_rdd is not None:
            input_rdd.unpersist()
        broadcast_field_mappings.unpersist()
        broadcast_extractor_config_path.unpersist()



def transform_row(row, config: dict) -> Row:
    """Map a raw input row onto the common output fields.

    ``config`` is the platform's field-mapping dict: it names the source
    attributes for the track id, URL, HTML and layout, and carries the
    constant ``dataset_name`` / ``data_source_category`` values.
    """
    # Fallback id, used only when the row has no track-id attribute.
    fallback_id = str(uuid.uuid4())
    track_id = getattr(row, config["track_id"], fallback_id)
    source_url = getattr(row, config["url"], '')
    html_text = getattr(row, config["html"], '')

    # Layout values missing from the map default to "article".
    layout_type = config.get("page_layout_type_map").get(
        getattr(row, config["layout_field"], ''), "article"
    )
    domain = extract_domain_info(source_url)['domain']

    fields = {
        "track_id": track_id,
        "url": source_url,
        "html": html_text,
        "page_layout_type": layout_type,
        "domain": domain,
        "dataset_name": config["dataset_name"],
        "data_source_category": config["data_source_category"],
        "meta_info": {"filename": row.filename},
    }
    return Row(**fields)




def extract_data(partition, broadcast_extractor_config_path):
    """Run the extractor chain over every row of one partition.

    Each row is handled independently: a failure or timeout on one row yields
    that row with an ``__error`` entry and processing continues with the next
    row. (Previously the ``try`` wrapped the whole loop, so the first failure
    ended the generator and dropped the remaining rows.)

    Args:
        partition: Iterable of rows exposing ``asDict()``.
        broadcast_extractor_config_path: Extractor config path, or a broadcast
            wrapper exposing it as ``.value``. When empty, the historical
            hard-coded path is used.

    Yields:
        Row: The extracted record, or the input fields plus ``__error``.
    """
    from loguru import logger

    default_config_path = '/share/jiangwenhao/notebooks/定向子集专项/subset-spliz.jsonc'
    config_path = getattr(broadcast_extractor_config_path, "value", broadcast_extractor_config_path)
    extractor_chain = ExtractSimpleFactory.create(config_path or default_config_path)
    timeout_seconds = 10

    for row in partition:
        # Bound before the try so the error handlers always see this row's data.
        d = row.asDict()
        try:
            input_data = DataJson(d)
            data_e: DataJson = func_timeout(timeout_seconds, extractor_chain.extract, args=(input_data,))
            result = Row(**data_e.to_dict())
        except FunctionTimedOut:
            d['__error'] = {
                "error_type": "TIMEOUT",
                "error_message": "extract function timeout",
                "traceback": "TIMEOUT"
            }
            result = Row(**d)
        except Exception as e:
            # Record the failure details alongside the original fields.
            error_info = {
                "error_type": type(e).__name__,
                "error_message": str(e),
                "traceback": traceback.format_exc(),
            }
            logger.error(error_info)
            d['__error'] = error_info
            result = Row(**d)
        yield result
           


def _safe_extract(data: Dict, extractor, timeout: int = 10) -> Dict:
    """Extract a single record and return a plain dict (never a generator).

    Args:
        data: Input record; either a mapping or an object exposing ``asDict()``.
        extractor: Object whose ``extract`` method is run on the record.
        timeout: Seconds allowed for one ``extract`` call.

    Returns:
        The extractor's ``to_dict()`` result on success; otherwise the input
        fields plus an ``__error`` entry describing the timeout or exception.
    """
    # Work on a plain dict so the error entry can be attached to it.
    record = data.asDict() if hasattr(data, "asDict") else dict(data)
    try:
        input_data = DataJson(record)
        data_e: DataJson = func_timeout(timeout, extractor.extract, args=(input_data,))
        return data_e.to_dict()
    except FunctionTimedOut:
        # Handled separately, as in extract_data, so timeouts get their own marker.
        record['__error'] = {
            "error_type": "TIMEOUT",
            "error_message": "extract function timeout",
            "traceback": "TIMEOUT"
        }
        return record
    except Exception as e:
        # Record the failure details alongside the original fields.
        error_info = {
            "error_type": type(e).__name__,
            "error_message": str(e),
            "traceback": traceback.format_exc(),
        }
        logger.error(error_info)
        record['__error'] = error_info
        return record


from urllib.parse import urlparse

def extract_domain_info(url: str) -> dict:
    """Split a URL into its network-location parts.

    Args:
        url: The URL to inspect. Non-string values (for example ``None``) and
            URLs that ``urlparse`` rejects are treated as having no host.

    Returns:
        dict with keys:
            ``full_url``: the value passed in, unchanged.
            ``netloc``: the parsed network location ("" when unavailable).
            ``domain``: the host without userinfo, port or IPv6 brackets.
            ``root_domain``: the last two dot-separated labels of ``domain``
            (or ``domain`` itself when it has fewer than two labels).
    """
    text = url if isinstance(url, str) else ""
    try:
        netloc = urlparse(text).netloc
    except ValueError:
        # urlparse raises for malformed bracketed hosts such as "http://[::1".
        netloc = ""

    # Drop any "user:password@" prefix before looking for the port separator.
    host_port = netloc.rpartition("@")[2]
    if host_port.startswith("["):
        # IPv6 literal: the host is the text between the brackets.
        closing = host_port.find("]")
        domain = host_port[1:closing] if closing != -1 else host_port[1:]
    else:
        domain = host_port.partition(":")[0]  # strip the port number

    root_domain = ".".join(domain.split(".")[-2:])

    return {
        "full_url": url,
        "netloc": netloc,
        "domain": domain,
        "root_domain": root_domain
    }
        
def handle_error(row: Dict, error: Exception) -> Dict:
    """Return the row's fields plus an ``__error`` entry describing ``error``.

    Args:
        row: A mapping, or an object exposing ``asDict()``.
        error: The exception to describe.

    Returns:
        A new dict holding the row's fields and ``__error`` with the exception
        type name, message and formatted traceback.
    """
    # The original unpacked ``row`` itself and left the asDict() result unused.
    row_dict = row.asDict() if hasattr(row, "asDict") else dict(row)
    # Format from the exception object so the result does not depend on being
    # called from inside an ``except`` block.
    formatted = "".join(
        traceback.format_exception(type(error), error, error.__traceback__)
    )
    return {
        **row_dict,
        "__error": {
            "type": type(error).__name__,
            "message": str(error),
            "traceback": formatted,
        },
    }

    
def extract_platform_from_s3_path(s3_path: str) -> str:
    """Return the platform name: the first directory after the bucket.

    Examples:
        "s3://private-cooperate-data/zh-web-baijiahao/20241218_p1/" -> "zh-web-baijiahao"
        "s3://private-cooperate-data/DouBan/" -> "DouBan"

    Raises:
        ValueError: If the path has fewer than three non-blank segments.
    """
    # Keep only the segments that are not empty or whitespace.
    segments = list(filter(lambda piece: piece.strip(), s3_path.split("/")))

    # Scheme, bucket and platform directory must all be present.
    if len(segments) <= 2:
        raise ValueError(f"无效的 S3 路径格式: {s3_path}")

    _scheme, _bucket, platform_name = segments[:3]
    return platform_name



# Spark job settings for this notebook.
config = {
    "spark_conf_name": "spark_4",
    "skip_success_check": True,
    "spark.executor.memory":"8g",
    # Replace with your own path.
    "spark.executorEnv.LLM_WEB_KIT_CFG_PATH": "/share/jiangwenhao/.llm-web-kit.jsonc",
    "spark.dynamicAllocation.maxExecutors": "400"
}
# Input locations to process; the commented-out entries are currently disabled.
paths = [
#"s3://private-cooperate-data/zh-web-baijiahao/",
#"s3://private-crawl-data/zh-web-netease/20241218_p1/",
   "s3://private-crawl-data/zh-web-tencent/20241218_p1/",
    "s3://private-crawl-data/zh-web-sohu/20241218_p1/",
 #   "s3://private-crawl-data/zh-web-sina/20241218_p1/",
   # "s3://crawl-data/blog_sina_com_cn/gz_file/1729501052/",
   # "s3://private-cooperate-data/DouBan/"
    

]


# spark = new_spark_session("llm_kit_cc", config)
# version="008"
# for path in paths:
#     platform = extract_platform_from_s3_path(path)
#     print(f"路径: {path} → 平台: {platform}")
#     platform_configs = {
#
#     "zh-web-baijiahao": {
#
#         "field_mappings": {
#             "track_id":"track_id",
#             "page_layout_type_map" :{
#                                 "":"article",
#                                 "文章":"article",
#                                 "网易":'article',
#                                 "视频":"video"
#                             },
#         "dataset_name": "baijiahao",
#         "url":"url",
#         "html":"content",
#         "layout_field": "channel",
#         "data_source_category":"JSON"
#         },
#         "extractor_config": "/share/jiangwenhao/notebooks/定向子集专项/subset-spliz.jsonc",
#         "output_template": f"s3://llm-users-phdd2/jiangwenhao/article/{platform}/v{version}/"
#
#     },
#     "zh-web-netease": {
#
#         "field_mappings": {
#              "track_id":"track_id",
#             "page_layout_type_map" :{
#                                 "":"article",
#                                 "文章":"article",
#                                 "网易":'article',
#                                 "视频":"video"
#                             },
#         "dataset_name": "net-ease",
#         "url":"url",
#         "html":"content",
#         "layout_field": "f_name",
#         "data_source_category":"HTML"
#         },
#         "extractor_config": "/share/jiangwenhao/notebooks/定向子集专项/subset-spliz.jsonc",
#         "output_template": f"s3://llm-users-phdd2/jiangwenhao/article/{platform}/v{version}/"
#     },
#     "zh-web-tencent": {
#
#         "field_mappings": {
#              "track_id":"track_id",
#             "page_layout_type_map" :{
#                                 "":"article",
#                                 "文章":"article",
#                                 "网易":'article',
#                                 "视频":"video",
#                                 "腾讯网":"article"
#                             },
#         "dataset_name": "tencent",
#         "url":"url",
#         "html":"content",
#         "layout_field": "f_name",
#         "data_source_category":"HTML"
#         },
#         "extractor_config": "/share/jiangwenhao/notebooks/定向子集专项/subset-spliz.jsonc",
#         "output_template": f"s3://llm-users-phdd2/jiangwenhao/article/{platform}/v{version}/"
#     },
#      "zh-web-sohu": {
#
#         "field_mappings": {
#              "track_id":"track_id",
#             "page_layout_type_map" :{
#                                 "":"article",
#                                 "文章":"article",
#                                 "网易":'article',
#                                 "视频":"video",
#                                 "搜狐网":"article"
#                             },
#         "dataset_name": "souhu",
#         "url":"url",
#         "html":"content",
#         "layout_field": "f_name",
#         "data_source_category":"HTML"
#         },
#         "extractor_config": "/share/jiangwenhao/notebooks/定向子集专项/subset-spliz.jsonc",
#         "output_template": f"s3://llm-users-phdd2/jiangwenhao/article/{platform}/v{version}/"
#     },
#     "zh-web-sina": {
#
#         "field_mappings": {
#              "track_id":"track_id",
#             "page_layout_type_map" :{
#                                 "":"article",
#                                 "文章":"article",
#                                 "网易":'article',
#                                 "视频":"video",
#                                 "搜狐网":"article",
#                                 "黑猫投诉":"forum"
#                             },
#         "dataset_name": "sina",
#         "url":"url",
#         "html":"content",
#         "layout_field": "f_name",
#         "data_source_category":"HTML"
#         },
#         "extractor_config": "/share/jiangwenhao/notebooks/定向子集专项/subset-spliz.jsonc",
#         "output_template": f"s3://llm-users-phdd2/jiangwenhao/article/{platform}/v{version}/"
#     },
#
#     "blog_sina_com_cn": {
#
#         "field_mappings": {
#              "track_id":"track_id",
#             "page_layout_type_map" :{
#                                 "":"article",
#                                 "文章":"article",
#                                 "网易":'article',
#                                 "视频":"video",
#                                 "搜狐网":"article",
#                                 "黑猫投诉":"forum"
#                             },
#         "dataset_name": "sina",
#         "url":"url",
#         "html":"html",
#         "layout_field": "f_name",
#         "data_source_category":"HTML"
#         },
#         "extractor_config": "/share/jiangwenhao/notebooks/定向子集专项/subset-spliz.jsonc",
#         "output_template": f"s3://llm-users-phdd2/jiangwenhao/article/{platform}/v{version}/"
#     },
#     "DouBan": {
#
#         "field_mappings": {
#              "track_id":"track_id",
#             "page_layout_type_map" :{
#                                 "":"article",
#                                 "文章":"article",
#                                 "网易":'article',
#                                 "视频":"video",
#                                 "搜狐网":"article",
#                                 "黑猫投诉":"forum",
#                                 "豆瓣网":""
#                             },
#         "dataset_name": "sina",
#         "url":"url",
#         "html":"html",
#         "layout_field": "f_name",
#         "data_source_category":"HTML"
#         },
#         "extractor_config": "/share/jiangwenhao/notebooks/定向子集专项/subset-spliz.jsonc",
#         "output_template": f"s3://llm-users-phdd2/jiangwenhao/article/{platform}/v{version}/"
#     },
#}
    # 处理正式数据
 #  process_platform_data(
 #      spark=spark,
 #      config=config,
 #      platform=platform,
 #      input_paths=[path],
 #      version=version
 # )


    # 处理其他平台数据
    # processor.process_platform_data("other_platform", [...])

In [29]:

import os
from pyspark.sql import Row, SparkSession
from typing import Dict, Any
import json
import uuid
from functools import partial
from typing import Iterable, Dict, Any
# create spark session
from pyspark.sql import Row, DataFrame
from loguru import logger
from xinghe.spark import *
from app.common.json_util import *

from xinghe.s3 import *


import uuid
import traceback
from datetime import datetime
from llm_web_kit.input.datajson import DataJson
from func_timeout import FunctionTimedOut, func_timeout

In [30]:
version = "018"
input_paths = ['s3://private-crawl-data/zh-web-tencent/20241218_p1/']
platform = extract_platform_from_s3_path(input_paths[0])

# Extractor chain configuration shared by every platform.
_EXTRACTOR_CONFIG = "/share/jiangwenhao/notebooks/定向子集专项/subset-spliz.jsonc"

# Output location. It is built from the platform selected above, so every entry
# carries the same value. The baijiahao entry used to embed an extra `sub_dir`
# component whose only definition was commented out, which raised NameError on
# a fresh kernel; it now uses the same template as the other platforms.
_OUTPUT_TEMPLATE = f"s3://llm-users-phdd2/jiangwenhao/article/{platform}/v{version}/"

# Layout labels common to all platforms; per-platform additions are appended.
_BASE_LAYOUT_MAP = {
    "": "article",
    "文章": "article",
    "网易": "article",
    "视频": "video",
}


def _platform_entry(dataset_name, html_field, layout_field, data_source_category, extra_layouts=None):
    """Build one platform's config dict with the same shape as the hand-written entries."""
    return {
        "field_mappings": {
            "track_id": "track_id",
            "page_layout_type_map": {**_BASE_LAYOUT_MAP, **(extra_layouts or {})},
            "dataset_name": dataset_name,
            "url": "url",
            "html": html_field,
            "layout_field": layout_field,
            "data_source_category": data_source_category,
        },
        "extractor_config": _EXTRACTOR_CONFIG,
        "output_template": _OUTPUT_TEMPLATE,
    }


platform_configs = {
    "zh-web-baijiahao": _platform_entry("baijiahao", "content", "channel", "JSON"),
    "zh-web-netease": _platform_entry("net-ease", "content", "f_name", "HTML"),
    "zh-web-tencent": _platform_entry(
        "tencent", "content", "f_name", "HTML",
        extra_layouts={"腾讯网": "article"},
    ),
    "zh-web-sohu": _platform_entry(
        "souhu", "content", "f_name", "HTML",
        extra_layouts={"搜狐网": "article"},
    ),
    "zh-web-sina": _platform_entry(
        "sina", "content", "f_name", "HTML",
        extra_layouts={"搜狐网": "article", "黑猫投诉": "forum"},
    ),
    "blog_sina_com_cn": _platform_entry(
        "sina", "html", "f_name", "HTML",
        extra_layouts={"搜狐网": "article", "黑猫投诉": "forum"},
    ),
    # NOTE(review): dataset_name "sina" and the empty layout for "豆瓣网" are
    # carried over unchanged from the original entry - confirm they are intended.
    "DouBan": _platform_entry(
        "sina", "html", "f_name", "HTML",
        extra_layouts={"搜狐网": "article", "黑猫投诉": "forum", "豆瓣网": ""},
    ),
}
platform_config = platform_configs.get(platform)
platform_config


{'field_mappings': {'track_id': 'track_id',
  'page_layout_type_map': {'': 'article',
   '文章': 'article',
   '网易': 'article',
   '视频': 'video',
   '腾讯网': 'article'},
  'dataset_name': 'tencent',
  'url': 'url',
  'html': 'content',
  'layout_field': 'f_name',
  'data_source_category': 'HTML'},
 'extractor_config': '/share/jiangwenhao/notebooks/定向子集专项/subset-spliz.jsonc',
 'output_template': 's3://llm-users-phdd2/jiangwenhao/article/zh-web-tencent/v018/'}

In [31]:

# Spark job settings for this run.
config = {
    "spark_conf_name": "spark_4",
    "skip_success_check": True,
    # Replace with your own path.
    "spark.executorEnv.LLM_WEB_KIT_CFG_PATH": "/share/jiangwenhao/.llm-web-kit.jsonc",
    "spark.dynamicAllocation.maxExecutors": "400",
}
spark = new_spark_session("llm_kit_cc", config)


print(f"路径: {input_paths[0]} → 平台: {platform}")


# The string below is a bare expression statement (the same text is the
# docstring of process_platform_data); it has no effect at cell level.
"""接收 spark 作为参数而不是持有它"""
platform_config = platform_configs.get(platform)
if not platform_config:
    raise ValueError(f"Unsupported platform: {platform}")

# Driver side: read the input.
input_df = read_any_path(spark, ",".join(input_paths), config)
print(f"读取数据结束")
# Prepare the worker-side configuration.
worker_config = {
    "field_mappings": platform_config.get("field_mappings"),
    "extractor_config": platform_config.get("extractor_config")
}
# Broadcast the two pieces of configuration separately.
broadcast_field_mappings = spark.sparkContext.broadcast(worker_config["field_mappings"])
broadcast_extractor_config_path = spark.sparkContext.broadcast(worker_config["extractor_config"])  # broadcast the path


路径: s3://private-crawl-data/zh-web-tencent/20241218_p1/ → 平台: zh-web-tencent
读取数据结束


In [32]:
# NOTE(review): the rendered result of this call contains the endpoint and
# access keys ('ak'/'sk'); clear the cell output before sharing the notebook.
get_s3_config("s3://private-crawl-data/zh-web-tencent/20241218_p1/")


{'endpoint': 'http://10.140.97.42',
 'ak': 'OA8AZVFH6110XW4A51NX',
 'sk': 'aZ5XfNpTZ8xSAa9lcs7MhYAy7wGr3WAzhzO5AfR8'}

In [33]:
from pyspark.sql.functions import col
from pyspark.sql.functions import udf
from pyspark.sql.types import BooleanType
#安全解析函数（包含异常捕获）
def safe_json_loads(s):
    """Return True only when ``s`` parses as a JSON object.

    Used as a row filter: the rows that pass are later expanded with
    ``**json.loads(...)``, which needs a dict, so valid JSON of any other
    type (list, string, number, null) is rejected here as well.
    """
    try:
        parsed = json.loads(s)
    except (TypeError, ValueError, RecursionError):
        # TypeError: non-string input such as None; ValueError: malformed
        # JSON (JSONDecodeError is a subclass); RecursionError: nesting too deep.
        return False
    return isinstance(parsed, dict)
# Wrap the Python function as a Spark UDF returning a boolean.
safe_json_udf = udf(safe_json_loads, BooleanType())

# Keep only the rows whose "value" column passes the UDF check.
df_filtered = input_df.filter(safe_json_udf(col("value")))



In [34]:

# Parse each row's JSON "value" into a Row, carry the source filename along
# as an extra field, and mark the resulting RDD for caching.
input_rdd = df_filtered.rdd.map(lambda x: Row(**{**json.loads(x.value), "filename": x.filename})).cache()


In [35]:

def transform_row(row, config: dict) -> Row:
    """Map a raw input row onto the common output fields.

    ``config`` is the platform's field-mapping dict: it names the source
    attributes for the track id, URL, HTML and layout, and carries the
    constant ``dataset_name`` / ``data_source_category`` values.
    """
    # Fallback id, used only when the row has no track-id attribute.
    fallback_id = str(uuid.uuid4())
    track_id = getattr(row, config["track_id"], fallback_id)
    source_url = getattr(row, config["url"], '')
    html_text = getattr(row, config["html"], '')

    # Layout values missing from the map default to "article".
    layout_type = config.get("page_layout_type_map").get(
        getattr(row, config["layout_field"], ''), "article"
    )
    domain = extract_domain_info(source_url)['domain']

    fields = {
        "track_id": track_id,
        "url": source_url,
        "html": html_text,
        "page_layout_type": layout_type,
        "domain": domain,
        "dataset_name": config["dataset_name"],
        "data_source_category": config["data_source_category"],
        "meta_info": {"filename": row.filename},
    }
    return Row(**fields)
#  pandas_df = input_rdd.toDF().toPandas()
#  print(pandas_df)

# Transformation step (uses only the field mappings), then repartition
# into 1000 partitions.
transformed_rdd = input_rdd.map(
    lambda row: transform_row(row, broadcast_field_mappings.value)
).repartition(1000)


In [36]:
# transformed_rdd_1 = transformed_rdd.filter(lambda x: "<!--VIDEO_0-->"  not in x.html)
# transformed_rdd_2 = transformed_rdd_1.filter(lambda x: "<!--MUSIC_0-->" not in x.html)
# transformed_rdd_2 = transformed_rdd_2.cache()
# Mark the transformed RDD for caching so later steps can reuse it.
transformed_rdd.cache()
# 18660
# 62
# 50000-(18660+62)=31278

MapPartitionsRDD[16] at coalesce at NativeMethodAccessorImpl.java:0

In [37]:


def extract_data(partition, broadcast_extractor_config_path):
    """Run the extractor chain over every row of one partition.

    Each row is handled independently: a failure or timeout yields the input
    fields plus an ``__error`` entry and processing continues with the next row.

    Args:
        partition: Iterable of rows exposing ``asDict()``.
        broadcast_extractor_config_path: Extractor config path, or a broadcast
            wrapper exposing it as ``.value``. When empty, the historical
            hard-coded path is used (previously the argument was ignored).

    Yields:
        Row: The extracted record, or the input fields plus ``__error``.
    """
    from loguru import logger

    default_config_path = '/share/jiangwenhao/notebooks/定向子集专项/subset-spliz.jsonc'
    config_path = getattr(broadcast_extractor_config_path, "value", broadcast_extractor_config_path)
    extractor_chain = ExtractSimpleFactory.create(config_path or default_config_path)
    timeout_seconds = 10

    for row in partition:
        # Bound before the try so the handlers never see a previous row's dict
        # (or an unbound name) when something fails early.
        d = row.asDict()
        try:
            input_data = DataJson(d)
            data_e: DataJson = func_timeout(timeout_seconds, extractor_chain.extract, args=(input_data,))
            result = Row(**data_e.to_dict())
        except FunctionTimedOut:
            d['__error'] = {
                "error_type": "TIMEOUT",
                "error_message": "extract function timeout",
                "traceback": "TIMEOUT"
            }
            result = Row(**d)
        except Exception as e:
            # Record the failure details, including a snapshot of the row.
            error_info = {
                "error_data": Row(**d),
                "error_type": type(e).__name__,
                "error_message": str(e),
                "traceback": traceback.format_exc(),
            }
            logger.error(error_info)
            d['__error'] = error_info
            result = Row(**d)
        yield result
           


# Extraction step (uses only the extractor config path); the broadcast value
# is passed to extract_data for each partition.
processed_rdd = transformed_rdd.mapPartitions(
     lambda x:extract_data(
        x,broadcast_extractor_config_path = broadcast_extractor_config_path.value
    )
)



In [38]:
# Drop failed rows BEFORE serializing. The previous order filtered rows that
# were already wrapped as Row(value=<json string>), whose only field is
# "value", so the "__error" test never matched and nothing was removed.
result_df = (
    processed_rdd
    .filter(lambda x: "__error" not in x.asDict())
    .map(lambda x: Row(value=json.dumps(x.asDict())))
    .toDF()
)
# 输出结果Row(value=json.dumps(x.asDict()))).toDF()
#write_any_path(processed_rdd.map(lambda x: Row(value=json.dumps(x.asDict()))).toDF(), platform_config["output_template"],config)

platform_config

                                                                                

{'field_mappings': {'track_id': 'track_id',
  'page_layout_type_map': {'': 'article',
   '文章': 'article',
   '网易': 'article',
   '视频': 'video',
   '腾讯网': 'article'},
  'dataset_name': 'tencent',
  'url': 'url',
  'html': 'content',
  'layout_field': 'f_name',
  'data_source_category': 'HTML'},
 'extractor_config': '/share/jiangwenhao/notebooks/定向子集专项/subset-spliz.jsonc',
 'output_template': 's3://llm-users-phdd2/jiangwenhao/article/zh-web-tencent/v018/'}

In [None]:
# Write the serialized rows to the platform's output location.
# NOTE(review): the call inside process_platform_data also passes `config`
# as a third argument - confirm it is optional here.
write_any_path(result_df, platform_config["output_template"])





In [None]:
# List the objects under the output location and display them.
x = "s3://llm-users-phdd2/jiangwenhao/article/zh-web-tencent/v018/"
s3 = get_s3_client(x)
_ = list(list_s3_objects(client=s3, path = x))
_

In [3]:
# NOTE(review): the rendered result of this call contains the endpoint and
# access keys ('ak'/'sk'); clear the cell output before sharing the notebook.
get_s3_config("s3://llm-users-phdd2/jiangwenhao/article/zh-web-tencent/2019/v017/")

{'endpoint': 'http://10.135.0.241',
 'ak': 'L5F6OE3EQEK00V4MV0E4',
 'sk': 'oAwAZurrQps6VbOXKHpG9XjRtmjAG7ROOjfzquwC'}

In [None]:
# Release the broadcast variables.
broadcast_field_mappings.unpersist()
broadcast_extractor_config_path.unpersist()

In [23]:
from xinghe.spark import *
from app.common.json_util import *

from xinghe.s3 import *

# Remove one output version under the user's prefix.
# NOTE(review): dry_run=False presumably performs the deletion for real -
# double-check the target path before running this cell.
delete_path = "s3://llm-users-phdd2/jiangwenhao/"
client = get_s3_client(delete_path)
delete_s3_object(delete_path +'article/zh-web-baijiahao/v004', client = client, dry_run = False)

In [24]:
# Non-recursive listing of the entries directly under the baijiahao output prefix.
list(list_s3_objects('s3://llm-users-phdd2/jiangwenhao/article/zh-web-baijiahao/', client = client, recursive=False))

['s3://llm-users-phdd2/jiangwenhao/article/zh-web-baijiahao/v002/',
 's3://llm-users-phdd2/jiangwenhao/article/zh-web-baijiahao/v003/',
 's3://llm-users-phdd2/jiangwenhao/article/zh-web-baijiahao/v004/',
 's3://llm-users-phdd2/jiangwenhao/article/zh-web-baijiahao/v005/']

In [None]:
   def process_platform_data(self, platform: str, input_paths: list, version: str = "001"):
        """Main flow for one platform: read, transform, extract, write.

        Raises:
            ValueError: If ``platform`` has no entry in ``self.platform_configs``.
        """
        # Look up the platform configuration.
        platform_cfg = self.platform_configs.get(platform)
        if not platform_cfg:
            raise ValueError(f"Unsupported platform: {platform}")

        # Read the raw input.
        joined_paths = ",".join(input_paths)
        raw_df = read_any_path(self.spark, joined_paths, platform_cfg)

        # Per-row transformation, spread over 6000 partitions.
        to_common_row = partial(self._transform_row, platform=platform)
        rows_rdd = raw_df.rdd.map(to_common_row).repartition(6000)

        # Per-partition extraction.
        run_extractor = partial(self._extract_data, platform=platform)
        extracted_rdd = rows_rdd.mapPartitions(run_extractor)

        # Resolve the output location (version is left-padded to three digits).
        padded_version = version.zfill(3)
        destination = platform_cfg["output_template"].format(
            platform=platform, version=padded_version
        )

        # Serialize each record to one JSON string column and write it out.
        json_df = extracted_rdd.map(lambda x: Row(value=json.dumps(x))).toDF()
        write_any_path(json_df, destination, {"skip_output_check": True})

In [4]:
from pyspark.sql import Row, SparkSession
from typing import Dict, Any
import json
import uuid
from functools import partial
from typing import Iterable, Dict, Any
# create spark session
from pyspark.sql import Row, DataFrame

from xinghe.spark import *
from app.common.json_util import *

from xinghe.s3 import *


# Settings handed to new_spark_session below.
config = {
    "spark_conf_name": "spark_4",
    "skip_success_check": True,
}
# Replace this path with your own.
config["spark.executorEnv.LLM_WEB_KIT_CFG_PATH"] = "/share/jiangwenhao/.llm-web-kit.jsonc"
config["spark.dynamicAllocation.maxExecutors"] = "400"

spark = new_spark_session("llm_kit_cc", config)
processor = WebDataProcessor(spark)

# Pipeline parameters used by the cells that follow.
platform, version = "netease", "002"
input_paths = ["s3://private-crawl-data/zh-web-netease/20241218_p1/"]


In [7]:
# Alias the processor as `self` so method-body snippets can be run as-is in the notebook.
self = processor

# Look up the platform configuration; stop if there is none.
config = self.platform_configs.get(platform)
if not config:
    raise ValueError(f"Unsupported platform: {platform}")

# Read the raw data from the comma-joined input paths.
joined_input_paths = ",".join(input_paths)
input_df = read_any_path(self.spark, joined_input_paths, config)


In [21]:
# Trigger a Spark job that counts the rows of the input DataFrame.
# NOTE(review): the recorded run aborted with java.io.EOFException
# ("Unexpected end of input stream") inside the Hadoop decompressor —
# presumably one of the compressed input files is truncated; confirm before rerunning.
input_df.count()

25/02/27 15:05:42 WARN TaskSetManager: Lost task 120.0 in stage 13.0 (TID 694) (host-10-140-92-29 executor 1): java.io.EOFException: Unexpected end of input stream
	at org.apache.hadoop.io.compress.DecompressorStream.decompress(DecompressorStream.java:165)
	at org.apache.hadoop.io.compress.DecompressorStream.read(DecompressorStream.java:105)
	at java.io.InputStream.read(InputStream.java:101)
	at org.apache.hadoop.util.LineReader.fillBuffer(LineReader.java:191)
	at org.apache.hadoop.util.LineReader.readDefaultLine(LineReader.java:227)
	at org.apache.hadoop.util.LineReader.readLine(LineReader.java:185)
	at org.apache.hadoop.mapreduce.lib.input.LineRecordReader.nextKeyValue(LineRecordReader.java:200)
	at org.apache.spark.sql.execution.datasources.RecordReaderIterator.hasNext(RecordReaderIterator.scala:39)
	at org.apache.spark.sql.execution.datasources.HadoopFileLinesReader.hasNext(HadoopFileLinesReader.scala:67)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.

Py4JJavaError: An error occurred while calling o1376.count.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 120 in stage 13.0 failed 4 times, most recent failure: Lost task 120.3 in stage 13.0 (TID 697) (host-10-140-92-130 executor 2): java.io.EOFException: Unexpected end of input stream
	at org.apache.hadoop.io.compress.DecompressorStream.decompress(DecompressorStream.java:165)
	at org.apache.hadoop.io.compress.DecompressorStream.read(DecompressorStream.java:105)
	at java.io.InputStream.read(InputStream.java:101)
	at org.apache.hadoop.util.LineReader.fillBuffer(LineReader.java:191)
	at org.apache.hadoop.util.LineReader.readDefaultLine(LineReader.java:227)
	at org.apache.hadoop.util.LineReader.readLine(LineReader.java:185)
	at org.apache.hadoop.mapreduce.lib.input.LineRecordReader.nextKeyValue(LineRecordReader.java:200)
	at org.apache.spark.sql.execution.datasources.RecordReaderIterator.hasNext(RecordReaderIterator.scala:39)
	at org.apache.spark.sql.execution.datasources.HadoopFileLinesReader.hasNext(HadoopFileLinesReader.scala:67)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:125)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.hashAgg_doAggregateWithoutKey_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:140)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:101)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
	at org.apache.spark.scheduler.Task.run(Task.scala:139)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:554)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1529)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:557)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2785)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2721)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2720)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2720)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1206)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1206)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1206)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2984)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2923)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2912)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
Caused by: java.io.EOFException: Unexpected end of input stream
	at org.apache.hadoop.io.compress.DecompressorStream.decompress(DecompressorStream.java:165)
	at org.apache.hadoop.io.compress.DecompressorStream.read(DecompressorStream.java:105)
	at java.io.InputStream.read(InputStream.java:101)
	at org.apache.hadoop.util.LineReader.fillBuffer(LineReader.java:191)
	at org.apache.hadoop.util.LineReader.readDefaultLine(LineReader.java:227)
	at org.apache.hadoop.util.LineReader.readLine(LineReader.java:185)
	at org.apache.hadoop.mapreduce.lib.input.LineRecordReader.nextKeyValue(LineRecordReader.java:200)
	at org.apache.spark.sql.execution.datasources.RecordReaderIterator.hasNext(RecordReaderIterator.scala:39)
	at org.apache.spark.sql.execution.datasources.HadoopFileLinesReader.hasNext(HadoopFileLinesReader.scala:67)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:125)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.hashAgg_doAggregateWithoutKey_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:140)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:101)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
	at org.apache.spark.scheduler.Task.run(Task.scala:139)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:554)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1529)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:557)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)


In [24]:
# Fraction of rows to sample; exactly 1.0 means use the input DataFrame as-is.
sample_fraction = 0.1
df_sample = (
    input_df
    if sample_fraction == 1.0
    else input_df.sample(fraction=sample_fraction)
)

In [32]:
# Cut the sample down to at most one row, then display the resulting object's type.
df_sample = df_sample.limit(1)
type(df_sample)

pyspark.sql.dataframe.DataFrame

In [35]:
# Bind the platform once, then apply the row transform to every sampled row.
# NOTE(review): the recorded run failed on the executors with
# "TypeError: WebDataProcessor._transform_row() missing 1 required positional
# argument: 'row'" — check how _transform_row is declared on the class.
row_transform = partial(self._transform_row, platform=platform)
transformed_rdd = df_sample.rdd.map(row_transform)
transformed_rdd.count()

25/02/27 15:09:16 WARN TaskSetManager: Lost task 0.0 in stage 33.0 (TID 1197) (host-10-140-92-130 executor 2): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/data/nm-local-dir/usercache/jiangwenhao/appcache/application_1738517174793_0504/container_e02_1738517174793_0504_01_000003/pyspark.zip/pyspark/worker.py", line 830, in main
    process()
  File "/data/nm-local-dir/usercache/jiangwenhao/appcache/application_1738517174793_0504/container_e02_1738517174793_0504_01_000003/pyspark.zip/pyspark/worker.py", line 820, in process
    out_iter = func(split_index, iterator)
  File "/share/jiangwenhao/envs/code_clean_venv/lib/python3.10/site-packages/pyspark/rdd.py", line 5405, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/share/jiangwenhao/envs/code_clean_venv/lib/python3.10/site-packages/pyspark/rdd.py", line 5405, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/share/jiangwenhao/envs/code_cl

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 33.0 failed 4 times, most recent failure: Lost task 0.3 in stage 33.0 (TID 1200) (host-10-140-92-130 executor 2): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/data/nm-local-dir/usercache/jiangwenhao/appcache/application_1738517174793_0504/container_e02_1738517174793_0504_01_000003/pyspark.zip/pyspark/worker.py", line 830, in main
    process()
  File "/data/nm-local-dir/usercache/jiangwenhao/appcache/application_1738517174793_0504/container_e02_1738517174793_0504_01_000003/pyspark.zip/pyspark/worker.py", line 820, in process
    out_iter = func(split_index, iterator)
  File "/share/jiangwenhao/envs/code_clean_venv/lib/python3.10/site-packages/pyspark/rdd.py", line 5405, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/share/jiangwenhao/envs/code_clean_venv/lib/python3.10/site-packages/pyspark/rdd.py", line 5405, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/share/jiangwenhao/envs/code_clean_venv/lib/python3.10/site-packages/pyspark/rdd.py", line 5405, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/share/jiangwenhao/envs/code_clean_venv/lib/python3.10/site-packages/pyspark/rdd.py", line 828, in func
    return f(iterator)
  File "/share/jiangwenhao/envs/code_clean_venv/lib/python3.10/site-packages/pyspark/rdd.py", line 2297, in <lambda>
    return self.mapPartitions(lambda i: [sum(1 for _ in i)]).sum()
  File "/share/jiangwenhao/envs/code_clean_venv/lib/python3.10/site-packages/pyspark/rdd.py", line 2297, in <genexpr>
    return self.mapPartitions(lambda i: [sum(1 for _ in i)]).sum()
  File "/data/nm-local-dir/usercache/jiangwenhao/appcache/application_1738517174793_0504/container_e02_1738517174793_0504_01_000003/pyspark.zip/pyspark/util.py", line 81, in wrapper
    return f(*args, **kwargs)
TypeError: WebDataProcessor._transform_row() missing 1 required positional argument: 'row'

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:561)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:767)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:749)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:514)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:358)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:358)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:345)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.rdd.RDD.$anonfun$collect$2(RDD.scala:1019)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2303)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:92)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
	at org.apache.spark.scheduler.Task.run(Task.scala:139)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:554)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1529)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:557)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2785)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2721)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2720)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2720)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1206)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1206)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1206)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2984)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2923)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2912)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:971)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2263)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2284)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2303)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2328)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1019)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:405)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1018)
	at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:193)
	at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/data/nm-local-dir/usercache/jiangwenhao/appcache/application_1738517174793_0504/container_e02_1738517174793_0504_01_000003/pyspark.zip/pyspark/worker.py", line 830, in main
    process()
  File "/data/nm-local-dir/usercache/jiangwenhao/appcache/application_1738517174793_0504/container_e02_1738517174793_0504_01_000003/pyspark.zip/pyspark/worker.py", line 820, in process
    out_iter = func(split_index, iterator)
  File "/share/jiangwenhao/envs/code_clean_venv/lib/python3.10/site-packages/pyspark/rdd.py", line 5405, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/share/jiangwenhao/envs/code_clean_venv/lib/python3.10/site-packages/pyspark/rdd.py", line 5405, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/share/jiangwenhao/envs/code_clean_venv/lib/python3.10/site-packages/pyspark/rdd.py", line 5405, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/share/jiangwenhao/envs/code_clean_venv/lib/python3.10/site-packages/pyspark/rdd.py", line 828, in func
    return f(iterator)
  File "/share/jiangwenhao/envs/code_clean_venv/lib/python3.10/site-packages/pyspark/rdd.py", line 2297, in <lambda>
    return self.mapPartitions(lambda i: [sum(1 for _ in i)]).sum()
  File "/share/jiangwenhao/envs/code_clean_venv/lib/python3.10/site-packages/pyspark/rdd.py", line 2297, in <genexpr>
    return self.mapPartitions(lambda i: [sum(1 for _ in i)]).sum()
  File "/data/nm-local-dir/usercache/jiangwenhao/appcache/application_1738517174793_0504/container_e02_1738517174793_0504_01_000003/pyspark.zip/pyspark/util.py", line 81, in wrapper
    return f(*args, **kwargs)
TypeError: WebDataProcessor._transform_row() missing 1 required positional argument: 'row'

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:561)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:767)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:749)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:514)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:358)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:358)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:345)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.rdd.RDD.$anonfun$collect$2(RDD.scala:1019)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2303)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:92)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
	at org.apache.spark.scheduler.Task.run(Task.scala:139)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:554)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1529)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:557)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


In [None]:
# Display the configuration entry registered for the current `platform` key.
processor.platform_configs.get(platform)