In [1]:

import os
from pyspark.sql import Row, SparkSession
from typing import Dict, Any
import json
import uuid
from functools import partial
from typing import Iterable, Dict, Any
# create spark session
from pyspark.sql import Row, DataFrame
from loguru import logger
from xinghe.spark import *
from app.common.json_util import *

from xinghe.s3 import *


import uuid
import traceback
from datetime import datetime
from llm_web_kit.input.datajson import DataJson
from func_timeout import FunctionTimedOut, func_timeout

#| track_id                  | uuid                           | 全局唯一的ID                                                                                                        | 是              |
#| dataset_name              | str                            | 数据集的名字（全局唯一），这个名字是管理员输入的，然后做索引的时候带到index里来                                     | 是              |
#| data_source_category      | str                            | 这一行数据代表的是HTML，PDF，EBOOK,CC,labCC类型                                                                     | 是，此处是 HTML |
#| html                      | 字符串                         | 以UTF-8为编码的HTML文件全文                                                                                         | 是              |
#| url                       | 字符串                         | 这个文件的来源网址                                                                                                  | 是              |
#| file_bytes                | 整数                           | 文件的size, 单位是byte                                                                                              | 是              |
#| meta_info                 | 字典                           | 存放关于文件的元信息:如果能从文件里获取到作者，制作日期等信息。或者数据本身就带有一些其他的信息都放入到这个字段里。 | 是              |
#| meta_info->input_datetime | 其格式为 `yyyy-mm-dd HH:MM:SS` | 生成这个json索引文件这一条数据的时间，可以不用那么精确                                                              | 是              |


def process_platform_data(spark: SparkSession, config,platform: str, input_paths: list, version: str = "001"):
    """接收 spark 作为参数而不是持有它"""
    platform_config = platform_configs.get(platform)
    if not platform_config:
        raise ValueError(f"Unsupported platform: {platform}")

    # Driver 端操作
    input_df = read_any_path(spark, ",".join(input_paths), config)
    print(f"读取数据结束")
    # 准备 Worker 端配置
    worker_config = {
        "field_mappings": platform_config.get("field_mappings"),
        "extractor_config": platform_config.get("extractor_config")
    }
    # 分别广播不同配置
    broadcast_field_mappings = spark.sparkContext.broadcast(worker_config["field_mappings"])
    broadcast_extractor_config_path = spark.sparkContext.broadcast(worker_config["extractor_config"])  # 广播路径

    print("原始数据json结构")
    input_rdd = input_df.rdd.map(lambda x: Row(**{**json.loads(x.value), "filename": x.filename})).cache()
  #  pandas_df = input_rdd.toDF().toPandas()
  #  print(pandas_df)
    print(input_rdd.take(1)[0].asDict().keys())
    # 数据转换（仅用字段映射）
    transformed_rdd = input_rdd.map(
        lambda row: transform_row(row, broadcast_field_mappings.value)
    )
   
    print("formatter数据结束")
    # 数据抽取（仅用抽取器配置路径）
    processed_rdd = transformed_rdd.mapPartitions(
         lambda x:extract_data(
            x,broadcast_extractor_config_path = broadcast_extractor_config_path.value
        )
    )
    print("extractor数据结束")
    print("写入数据中")
   
    # 输出结果Row(value=json.dumps(x.asDict()))).toDF()
    write_any_path(processed_rdd.map(lambda x: Row(value=json.dumps(x.asDict()))).toDF(), platform_config["output_template"],config)
    print("写入数据结束")

    # 清理广播变量
    broadcast_field_mappings.unpersist()
    broadcast_extractor_config_path.unpersist()



def transform_row(row, config: dict) -> Row:
    """根据平台配置转换行数据"""
    mappings = config

    return Row(
        track_id=getattr(row, mappings["track_id"],str(uuid.uuid4())),
        url=getattr(row, mappings["url"], ''),
        html=getattr(row, mappings["html"], ''),
        page_layout_type=mappings.get("page_layout_type_map").get(
            getattr(row, mappings["layout_field"], ''),
            "article"
        ),
        domain=extract_domain_info(getattr(row, mappings["url"], ''))['domain'],
        dataset_name=mappings["dataset_name"],
        data_source_category=mappings["data_source_category"],
        meta_info={"filename":row.filename}
    )




def extract_data(partition, broadcast_extractor_config_path):
    from loguru import logger
    extractor_chain = ExtractSimpleFactory.create('/share/jiangwenhao/notebooks/定向子集专项/subset-spliz.jsonc')
    timeout_seconds = 10
    # 为每个分区创建唯一的错误日志文件
    # partition_id = str(uuid.uuid4())
    # current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
    # error_log_path = f"s3://xyz-llm-users/xyz-users/yujia/CC-MAIN-2024-33/output/v002/error_logs/{current_time}_{partition_id}.json"
    # s3_doc_writer = S3DocWriter(path=error_log_path)
    try:
        for row in partition:
            d = row.asDict()
           
            input_data = DataJson(d)
            data_e: DataJson = func_timeout(timeout_seconds, extractor_chain.extract, args=(input_data,))
            #data_e: DataJson = extractor_chain.extract(input_data)
            yield Row(**data_e.to_dict())
    except FunctionTimedOut as e1:
            d['__error'] = {
                "error_type":"TIMEOUT",
                "error_message": "extract function timeout",
                "traceback":"TIMEOUT"
            }
            yield Row(**d)
    except Exception as e:
            # 记录更详细的错误信息
            error_info = {
                "error_type": type(e).__name__,
                "error_message": str(e),
                "traceback": traceback.format_exc(),
            }
            logger.error(error_info)
            # s3_doc_writer.write(error_info)
            d['__error'] = error_info
            yield Row(**d)
           


def _safe_extract(data: Dict, extractor, timeout: int = 10) -> Dict:
  
    """直接返回字典，避免生成器"""
    try:
        timeout_seconds = 10
        input_data = DataJson(data.asDict())
        print(input_data)
        data_e: DataJson = func_timeout(timeout_seconds, extractor.extract, \
                                                args=(input_data,))

        print(Row(**data_e.to_dict()))
        return data_e.to_dict() 
    except Exception as e:
        # 记录更详细的错误信息
        error_info = {
            "error_type": type(e).__name__,
            "error_message": str(e),
            "traceback": traceback.format_exc(),
        }
        logger.error(error_info)
        # s3_doc_writer.write(error_info)
        data['__error'] = error_info
        yield Row(**d)


from urllib.parse import urlparse

def extract_domain_info(url: str) -> dict:
    """从 URL 中提取完整的域名信息"""
    parsed = urlparse(url)
    netloc = parsed.netloc
    domain_parts = netloc.split(":")
    domain = domain_parts[0]  # 去除端口号
    root_domain = ".".join(domain.split(".")[-2:]) if len(domain.split(".")) >= 2 else domain

    return {
        "full_url": url,
        "netloc": netloc,
        "domain": domain,
        "root_domain": root_domain
    }
        
def handle_error(row: Dict, error: Exception) -> Dict:
    """统一错误处理"""
    row_dict = row.asDict()
    return {
     ** row,
    "__error": {
        "type": type(error).__name__,
        "message": str(error),
        "traceback": traceback.format_exc()
    }
    }

    
def extract_platform_from_s3_path(s3_path: str) -> str:
    """
    从 S3 路径中提取平台名称（存储桶后的第一个目录）
    
    示例输入: 
    - "s3://private-cooperate-data/zh-web-baijiahao/20241218_p1/"
    输出: "zh-web-baijiahao"
    
    - "s3://private-cooperate-data/DouBan/"
    输出: "DouBan"
    """
    # 分割路径并过滤空字符串
    parts = [p for p in s3_path.split("/") if p.strip() != ""]
    
    # 验证路径格式
    if len(parts) < 3:
        raise ValueError(f"无效的 S3 路径格式: {s3_path}")
    
    # 平台名称是存储桶后的第一个目录
    return parts[2]


In [2]:

import os
from pyspark.sql import Row, SparkSession
from typing import Dict, Any
import json
import uuid
from functools import partial
from typing import Iterable, Dict, Any
# create spark session
from pyspark.sql import Row, DataFrame
from loguru import logger
from xinghe.spark import *
from app.common.json_util import *

from xinghe.s3 import *


import uuid
import traceback
from datetime import datetime
from llm_web_kit.input.datajson import DataJson
from func_timeout import FunctionTimedOut, func_timeout

## 抽取规则

In [5]:
version="002"
sub_dir=''
input_paths =[f's3://private-crawl-data/zh-web-tencent/20241218_p1/']
platform = extract_platform_from_s3_path(input_paths[0])
platform_configs = {

    "zh-web-baijiahao": {

        "field_mappings": {
            "track_id":"track_id",
            "page_layout_type_map" :{
                                "":"article",
                                "文章":"article",
                                "网易":'article',
                                "视频":"video"
                            },
        "dataset_name": "baijiahao",
        "url":"url",
        "html":"content",
        "layout_field": "channel",
        "data_source_category":"JSON"
        },
        "extractor_config": "/share/jiangwenhao/notebooks/定向子集专项/subset-spliz.jsonc",
        "output_template": f"s3://llm-users-phdd2/jiangwenhao/article/{platform}/{sub_dir}/v{version}/"

    },
    "zh-web-netease": {

        "field_mappings": {
             "track_id":"track_id",
            "page_layout_type_map" :{
                                "":"article",
                                "文章":"article",
                                "网易":'article',
                                "视频":"video"
                            },
        "dataset_name": "net-ease",
        "url":"url",
        "html":"content",
        "layout_field": "f_name",
        "data_source_category":"HTML"
        },
        "extractor_config": "/share/jiangwenhao/notebooks/定向子集专项/subset-spliz.jsonc",
        "output_template": f"s3://llm-users-phdd2/jiangwenhao/article/{platform}/v{version}/"
    },
    "zh-web-tencent": {

        "field_mappings": {
             "track_id":"track_id",
            "page_layout_type_map" :{
                                "":"article",
                                "文章":"article",
                                "网易":'article',
                                "视频":"video",
                                "腾讯网":"article"
                            },
        "dataset_name": "tencent",
        "url":"url",
        "html":"content",
        "layout_field": "f_name",
        "data_source_category":"HTML"
        },
        "extractor_config": "/share/jiangwenhao/notebooks/定向子集专项/subset-spliz.jsonc",
        "output_template": f"s3://llm-users-phdd2/jiangwenhao/article/{platform}/v{version}/"
    },
     "zh-web-sohu": {

        "field_mappings": {
             "track_id":"track_id",
            "page_layout_type_map" :{
                                "":"article",
                                "文章":"article",
                                "网易":'article',
                                "视频":"video",
                                "搜狐网":"article"
                            },
        "dataset_name": "souhu",
        "url":"url",
        "html":"content",
        "layout_field": "f_name",
        "data_source_category":"HTML"
        },
        "extractor_config": "/share/jiangwenhao/notebooks/定向子集专项/subset-spliz.jsonc",
        "output_template": f"s3://llm-users-phdd2/jiangwenhao/article/{platform}/v{version}/"
    },
    "zh-web-sina": {

        "field_mappings": {
             "track_id":"track_id",
            "page_layout_type_map" :{
                                "":"article",
                                "文章":"article",
                                "网易":'article',
                                "视频":"video",
                                "搜狐网":"article",
                                "黑猫投诉":"forum",
                                "新浪网":"article"
                            },
        "dataset_name": "sina",
        "url":"url",
        "html":"content",
        "layout_field": "f_name",
        "data_source_category":"HTML"
        },
        "extractor_config": "/share/jiangwenhao/notebooks/定向子集专项/subset-spliz.jsonc",
        "output_template": f"s3://llm-users-phdd2/jiangwenhao/article/{platform}/v{version}/"
    },

    "blog_sina_com_cn": {

        "field_mappings": {
             "track_id":"track_id",
            "page_layout_type_map" :{
                                "":"article",
                                "文章":"article",
                                "网易":'article',
                                "视频":"video",
                                "搜狐网":"article",
                                "黑猫投诉":"forum",
                                
                            },
        "dataset_name": "sina_blog",
        "url":"url",
        "html":"html",
        "layout_field": "f_name",
        "data_source_category":"HTML"
        },
        "extractor_config": "/share/jiangwenhao/notebooks/定向子集专项/subset-spliz.jsonc",
        "output_template": f"s3://llm-users-phdd2/jiangwenhao/article/{platform}/v{version}/"
    },
    "DouBan": {
       
        "field_mappings": {
             "track_id":"track_id",
            "page_layout_type_map" :{
                                "":"article",
                                "文章":"article",
                                "网易":'article',
                                "视频":"video",
                                "搜狐网":"article",
                                "黑猫投诉":"forum",
                                "豆瓣网":""
                            },
        "dataset_name": "sina",
        "url":"url",
        "html":"html",
        "layout_field": "f_name",
        "data_source_category":"HTML"
        },
        "extractor_config": "/share/jiangwenhao/notebooks/定向子集专项/subset-spliz.jsonc",
        "output_template": f"s3://llm-users-phdd2/jiangwenhao/article/{platform}/v{version}/"
    },
}
platform_config = platform_configs.get(platform)
platform_config


{'field_mappings': {'track_id': 'track_id',
  'page_layout_type_map': {'': 'article',
   '文章': 'article',
   '网易': 'article',
   '视频': 'video',
   '腾讯网': 'article'},
  'dataset_name': 'tencent',
  'url': 'url',
  'html': 'content',
  'layout_field': 'f_name',
  'data_source_category': 'HTML'},
 'extractor_config': '/share/jiangwenhao/notebooks/定向子集专项/subset-spliz.jsonc',
 'output_template': 's3://llm-users-phdd2/jiangwenhao/article/zh-web-tencent/v002/'}

In [6]:
# input_paths =[f's3://zhuanxiang-hw60p/article/sina/v006/']





# Driver 端操作
# input_df = read_any_path(spark, ",".join(input_paths), config)


In [7]:
#def is_error__(row):
#    if "__error__" in row:  # 先检查键是否存在
#        return True
#    else:
#        return False
#
#def del_error__(row):
#    row.pop("__error__", None)  # 若键不存在，返回 None 且不报错
#    return row
#final_df = input_df.rdd.map(lambda x: json.loads(x.value)).filter(is_error__)
#final_df.take(1)[0]



In [8]:
config = {
    "spark_conf_name": "spark_4",
    "skip_success_check": True,
    # 根据个人路径进行替换1
    "spark.executorEnv.LLM_WEB_KIT_CFG_PATH": "/share/jiangwenhao/.llm-web-kit.jsonc",
    "spark.dynamicAllocation.maxExecutors": "400",
    "spark.executor.cores": "50",
    "spark.executor.memory": "60g"
}
spark = new_spark_session("llm_kit_cc", config)
spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [9]:




print(f"路径: {input_paths[0]} → 平台: {platform}")


"""接收 spark 作为参数而不是持有它"""
platform_config = platform_configs.get(platform)
if not platform_config:
    raise ValueError(f"Unsupported platform: {platform}")

# Driver 端操作
input_df = read_any_path(spark, ",".join(input_paths), config)
print(f"读取数据结束")
# 准备 Worker 端配置
worker_config = {
    "field_mappings": platform_config.get("field_mappings"),
    "extractor_config": platform_config.get("extractor_config")
}
# 分别广播不同配置
broadcast_field_mappings = worker_config["field_mappings"]
broadcast_extractor_config_path = worker_config["extractor_config"] # 广播路径


路径: s3://private-crawl-data/zh-web-tencent/20241218_p1/ → 平台: zh-web-tencent
读取数据结束


In [10]:
get_s3_config(input_paths[0])


{'endpoint': 'http://10.140.97.21',
 'ak': 'OA8AZVFH6110XW4A51NX',
 'sk': 'aZ5XfNpTZ8xSAa9lcs7MhYAy7wGr3WAzhzO5AfR8'}

In [11]:
broadcast_field_mappings

{'track_id': 'track_id',
 'page_layout_type_map': {'': 'article',
  '文章': 'article',
  '网易': 'article',
  '视频': 'video',
  '腾讯网': 'article'},
 'dataset_name': 'tencent',
 'url': 'url',
 'html': 'content',
 'layout_field': 'f_name',
 'data_source_category': 'HTML'}

In [11]:
# input_df = read_any_path(spark, ",".join(['s3a://private-crawl-data/zh-web-tencent/20241218_p1/2020/1733715370691.json.gz']), config)
# input_df.show()
input_df.take(1)[0]

                                                                                

Row(value='{"track_id":"3edfc1c9-c87b-480f-a26e-b90a7d54c24a","url":"https://blog.sina.com.cn/s/article_archive_1884248122_201206_1.html","status":200,"html":"<!DOCTYPE html PUBLIC \\"-//W3C//DTD XHTML 1.0 Transitional//EN\\" \\"//www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\\"><html xmlns=\\"//www.w3.org/1999/xhtml\\"><head><script id=\\"sinaads-ck-script\\" charset=\\"utf-8\\" src=\\"//d4.sina.com.cn/litong/zhitou/sinaads/src/spec/sinaads_ck.js\\"></script><script src=\\"https://d8.sina.com.cn/litong/zhitou/wenjing28/js/postMan.js\\"></script>\\n<meta http-equiv=\\"Content-Type\\" content=\\"text/html; charset=utf-8\\">\\n<title>归档_安徽文学_新浪博客</title>\\n<meta http-equiv=\\"X-UA-Compatible\\" content=\\"IE=EmulateIE8,chrome=1\\">\\n<meta name=\\"renderer\\" content=\\"webkit\\">\\n<meta name=\\"keywords\\" content=\\"\\">\\n<meta name=\\"description\\" content=\\"\\">\\n<meta content=\\"always\\" name=\\"referrer\\">\\n<!--–[if lte IE 6]-->\\n<script type=\\"text/javascript\\">\\ntr

In [12]:

from pyspark.sql.functions import col
from pyspark.sql.functions import udf
from pyspark.sql.types import BooleanType
#安全解析函数（包含异常捕获）
def safe_json_loads(s):
    try:
        json.loads(s)
        return True
    except:
        return False
# 将Python函数注册为Spark UDF
safe_json_udf = udf(safe_json_loads, BooleanType())

# 在filter中使用UDF生成的Column表达式
df_filtered = input_df.filter(safe_json_udf(col("value")))

broadcast_field_mappings

{'track_id': 'track_id',
 'page_layout_type_map': {'': 'article',
  '文章': 'article',
  '网易': 'article',
  '视频': 'video',
  '腾讯网': 'article'},
 'dataset_name': 'tencent',
 'url': 'url',
 'html': 'content',
 'layout_field': 'f_name',
 'data_source_category': 'HTML'}

In [7]:
df_filtered.count()

AttributeError: 'DataFrame' object has no attribute 'rePartitions'

In [13]:


input_rdd = df_filtered.rdd.map(lambda x: Row(**{**json.loads(x.value), "filename": x.filename}))

print(input_rdd.getNumPartitions()) 



288


In [31]:
row_data = input_rdd.take(1)[0]

                                                                                

In [32]:
transform_row(row_data,broadcast_field_mappings).asDict()

['posttime', 'n_content_county', 'm_identity', 'n_origin', 'n_headimg_url', 'f_domain_sec', 'title', 'uuid', 'n_contain_img', 'c_nav', 'n_img_urls', 'n_is_origin', 'n_content_city', 'n_video_urls', 'footplate', 'n_origin_author_name', 'n_contain_video', 'm_is_verified', 'n_origin_url', 'n_origin_author_uid', 'c_level', 'n_content_field', 'm_name', 'c_name', 'n_content_province', 'n_origin_posttime']


{'track_id': 'cfaa59a9-488e-485b-a8bd-a3b3848bdedc',
 'url': 'https://new.qq.com/omn/20241006/20241006A019G500',
 'html': '<body> \n <div class="rich_media_content autoTypeSetting24psection"> \n  <!--NO_AD_ERROR_2--> \n  <section style="line-height: 1.75em"> \n   <span style="color: rgb(64, 118, 0); font-size: 18px">为庆祝中华人民共和国75周年华诞，中国环境APP青山Life研究所面向社会征集生态环保主题少儿绘画，得到少年儿童的积极响应。孩子们用一幅幅主题突出、形式新颖、富有时代气息和生活情趣的作品，展现出对生态环境保护与绿色低碳生活方式的思考与行动，对美丽中国的期盼与向往。现将投稿作品予以选登，与大家共赏。</span> \n  </section> \n  <p><img src="https://inews.gtimg.com/om_bt/OKkFbuaxesP2NUMNlDB2v_Zz1F9emTdkEufvZTiX2zmvQAA/641"></p> \n  <p><img src="https://inews.gtimg.com/om_bt/O6kWJTdnl0xJXVrLTIpmjMkBdRolB8F55Td5KHobBICUQAA/641"></p> \n  <p><img src="https://inews.gtimg.com/om_bt/Onhkv-5GoDyfGmDZf-6gPGyDOcxa8bN-SYMfB_OcSMC8UAA/641"></p> \n  <p><img src="https://inews.gtimg.com/om_bt/O_v907d7XVoPz8yExCAErm1aw63EEJ6QreEE1WSPhCxC8AA/641"></p> \n  <p><img src="https://inews.gtimg.com/om_bt/Oa6FLEf-piLAtC_8hUbIEm54Cz4cnASRcU3qAyfEHPu

## Transform

In [14]:

def transform_row(row, config: dict) -> Row:

    """根据平台配置转换行数据"""
    mappings = config
    from loguru import logger
    try:

         # 获取已使用的字段名
        used_field_names = {
            mappings["track_id"],
            mappings["url"],
            mappings["html"],
            mappings["layout_field"],
            "filename"  # 显式使用的filename字段
        }

        # 获取原始行的所有字段名
        row_field_names = row.__fields__  # 适用于pyspark.sql.Row对象

        # 收集未被使用的字段
        remaining_fields = [f for f in row_field_names if f not in used_field_names]
        print(remaining_fields)
        # 构建meta_info字典
        meta_info = {"filename": getattr(row, "filename", "")}
        for field in remaining_fields:
            meta_info[field] = getattr(row, field, None)

        d = Row(
                track_id=getattr(row, mappings["track_id"],str(uuid.uuid4())),
                url=getattr(row, mappings["url"], ''),
                html=getattr(row, mappings["html"], ''),
                page_layout_type=mappings.get("page_layout_type_map").get(
                    getattr(row, mappings["layout_field"], ''),
                    "article"
                ),
                domain=extract_domain_info(getattr(row, mappings["url"], ''))['domain'],
                dataset_name=mappings["dataset_name"],
                data_source_category=mappings["data_source_category"],
                meta_info=meta_info
            )
        return d 
    except Exception as e:
        # 记录更详细的错误信息
        error_info = {
            "error_data":Row(**d),
            "error_type": type(e).__name__,
            "error_message": str(e),
            "traceback": traceback.format_exc(),
        }
        # s3_doc_writer.write(error_info)
        d['__error'] = error_info
        logger.error(error_info)
        return Row(**d)
   
    
 
#  pandas_df = input_rdd.toDF().toPandas()
#  print(pandas_df)

# 数据转换（仅用字段映射）
transformed_rdd = input_rdd.map(
    lambda row: transform_row(row, broadcast_field_mappings)
).repartition(6000)


In [15]:
broadcast_field_mappings

{'track_id': 'track_id',
 'page_layout_type_map': {'': 'article',
  '文章': 'article',
  '网易': 'article',
  '视频': 'video',
  '腾讯网': 'article'},
 'dataset_name': 'tencent',
 'url': 'url',
 'html': 'content',
 'layout_field': 'f_name',
 'data_source_category': 'HTML'}

In [16]:
transformed_not_error_rdd = transformed_rdd.filter(lambda x:"__error" not in x)
# transformed_not_error_rdd.count()



In [17]:
transformed_not_empty_rdd = transformed_not_error_rdd.filter(lambda x :x.html !='').cache()
# transformed_not_empty_rdd.count()

## 测试 To_Main_HTML

In [18]:


#timeout_seconds = 10
#extractor_chain = ExtractSimpleFactory.create('/share/jiangwenhao/notebooks/定向子集专项/subset-spliz.jsonc')
#data_e: DataJson = func_timeout(timeout_seconds, extractor_chain.extract, args=(input_data,))
#d = json.loads(data_e.to_json())
#d['to_main_html'] = data_e.get_content_list().to_main_html()
#d

In [19]:
# transformed_rdd_1 = transformed_rdd.filter(lambda x: "<!--VIDEO_0-->"  not in x.html)
# transformed_rdd_2 = transformed_rdd_1.filter(lambda x: "<!--MUSIC_0-->" not in x.html)
# transformed_rdd_2 = transformed_rdd_2.cache()
transformed_not_empty_rdd.cache()
# 18660
# 62
# 50000-(18660+62)=31278


PythonRDD[16] at RDD at PythonRDD.scala:53

## Extractor

In [21]:


def extract_data(partition, broadcast_extractor_config_path):
    from llm_web_kit.extractor.extractor_chain import ExtractSimpleFactory
    from loguru import logger

    extractor_chain = ExtractSimpleFactory.create('/share/jiangwenhao/notebooks/定向子集专项/subset-spliz.jsonc')
    timeout_seconds = 10
    # 为每个分区创建唯一的错误日志文件
    # partition_id = str(uuid.uuid4())
    # current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
    # error_log_path = f"s3://xyz-llm-users/xyz-users/yujia/CC-MAIN-2024-33/output/v002/error_logs/{current_time}_{partition_id}.json"
    # s3_doc_writer = S3DocWriter(path=error_log_path)
    for row in partition:
        try:

            d = row.asDict()
            input_data = DataJson(d)
            data_e: DataJson = func_timeout(timeout_seconds, extractor_chain.extract, args=(input_data,))
            #data_e: DataJson = extractor_chain.extract(input_data)
            
    
            yield Row(**data_e.to_dict())
        except FunctionTimedOut as e1:
                d['__error'] = {
                    "error_type":"TIMEOUT",
                    "error_message": "extract function timeout",
                    "traceback":"TIMEOUT"
                }

                yield Row(**d)
        except Exception as e:
                # 记录更详细的错误信息
                error_info = {
                    "error_data":Row(**d),
                    "error_type": type(e).__name__,
                    "error_message": str(e),
                    "traceback": traceback.format_exc(),
                }
                logger.error(error_info)
                # s3_doc_writer.write(error_info)
                d['__error'] = error_info

                yield Row(**d)
           


# 数据抽取（仅用抽取器配置路径）
processed_rdd = transformed_not_empty_rdd.mapPartitions(
     lambda x:extract_data(
        x,broadcast_extractor_config_path = broadcast_extractor_config_path
    )
)
print(processed_rdd.getNumPartitions())


6000


In [38]:


processed_rdd.take(1)[0].asDict()

                                                                                

{'track_id': '6905e20c-e25f-471f-bfd3-8ff6cb95fce1',
 'url': 'https://new.qq.com/omn/20241006/20241006A019G500',
 'html': '<body> \n <div class="rich_media_content autoTypeSetting24psection"> \n  <!--NO_AD_ERROR_2--> \n  <section style="line-height: 1.75em"> \n   <span style="color: rgb(64, 118, 0); font-size: 18px">为庆祝中华人民共和国75周年华诞，中国环境APP青山Life研究所面向社会征集生态环保主题少儿绘画，得到少年儿童的积极响应。孩子们用一幅幅主题突出、形式新颖、富有时代气息和生活情趣的作品，展现出对生态环境保护与绿色低碳生活方式的思考与行动，对美丽中国的期盼与向往。现将投稿作品予以选登，与大家共赏。</span> \n  </section> \n  <p><img src="https://inews.gtimg.com/om_bt/OKkFbuaxesP2NUMNlDB2v_Zz1F9emTdkEufvZTiX2zmvQAA/641"></p> \n  <p><img src="https://inews.gtimg.com/om_bt/O6kWJTdnl0xJXVrLTIpmjMkBdRolB8F55Td5KHobBICUQAA/641"></p> \n  <p><img src="https://inews.gtimg.com/om_bt/Onhkv-5GoDyfGmDZf-6gPGyDOcxa8bN-SYMfB_OcSMC8UAA/641"></p> \n  <p><img src="https://inews.gtimg.com/om_bt/O_v907d7XVoPz8yExCAErm1aw63EEJ6QreEE1WSPhCxC8AA/641"></p> \n  <p><img src="https://inews.gtimg.com/om_bt/Oa6FLEf-piLAtC_8hUbIEm54Cz4cnASRcU3qAyfEHPu

## 排除error 数据

In [22]:
processed_rdd_filter = processed_rdd.filter(lambda x:"__error" not in x).persist()


In [41]:
processed_rdd_filter.count()

                                                                                

162000446

In [24]:
processed_rdd_error_filter = processed_rdd.filter(lambda x:"__error"  in x)

write_any_path(processed_rdd_error_filter.map(lambda x: Row(value=json_dumps(x.asDict()))).toDF(),"s3://web-parse-hw60p/xuchao/zx-html-error/tencent_blog/v002")

                                                                                

{'rows': 27728735,
 'bytes': {'sum': 167641958787,
  'min': 2505,
  'max': 7528591,
  'cnt': 27728735,
  'avg': 6045.785},
 'files': 6000,
 'sub_paths': {}}

In [25]:

result_df = processed_rdd_filter.map(lambda x: Row(value=json_dumps(x.asDict()))).toDF()


                                                                                

In [26]:
platform_config["output_template"]

's3://llm-users-phdd2/jiangwenhao/article/zh-web-tencent/v002/'

In [27]:
write_any_path(result_df, platform_config["output_template"])



                                                                                

{'rows': 162000445,
 'bytes': {'sum': 2577225612411,
  'min': 700,
  'max': 14486280,
  'cnt': 162000445,
  'avg': 15908.756},
 'files': 6000,
 'sub_paths': {}}

In [None]:
x = "s3://llm-users-phdd2/jiangwenhao/article/zh-web-tencent/"
s3 = get_s3_client(x)
_ = list(list_s3_objects(client=s3, path = x))
_

In [None]:
get_s3_config("s3://llm-users-phdd2/jiangwenhao/article/zh-web-tencent/2019/v017/")

In [None]:
from xinghe.spark import *
from app.common.json_util import *

from xinghe.s3 import *

delete_path = "s3://llm-users-phdd2/jiangwenhao/"
client = get_s3_client(delete_path)
delete_s3_object(delete_path +'article/zh-web-baijiahao/v004', client = client, dry_run = False)