In [1]:

import os
from pyspark.sql import Row, SparkSession
from typing import Dict, Any
import json
import uuid
from functools import partial
from typing import Iterable, Dict, Any
# create spark session
from pyspark.sql import Row, DataFrame
from loguru import logger
from xinghe.spark import *
from app.common.json_util import *

from xinghe.s3 import *


import uuid
import traceback
from datetime import datetime
from llm_web_kit.input.datajson import DataJson
from func_timeout import FunctionTimedOut, func_timeout
# Settings handed to new_spark_session (a project helper, presumably from xinghe.spark).
config = {
    "spark_conf_name": "spark_4",
    "skip_success_check": True,
    # Replace with your own path (1)
    "spark.executorEnv.LLM_WEB_KIT_CFG_PATH": "/share/jiangwenhao/.llm-web-kit.jsonc",
    "spark.dynamicAllocation.maxExecutors": "400",
    "spark.yarn.queue": "root.clean_exp"
}
spark = new_spark_session("llm_kit_cc", config)
# Last expression of the cell: displays the session object.
spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [2]:
# Input location(s); the list is joined with "," before being passed to read_any_path.
input_path = ["s3://llm-users-phdd2/jiangwenhao/article/zh-web-tencent/v002/"]
# input_path = ["s3://llm-users-phdd2/jiangwenhao/article/zh-web-tencent/2019/v017/part-67c9306eee8b-000002.jsonl"]

In [3]:
# Read all input paths into one DataFrame (read_any_path is a project helper).
input_df = read_any_path(spark, ",".join(input_path), config)

In [4]:
# Decode the JSON text held in each row's "value" column.
input_rdd = input_df.rdd.map(lambda x :json.loads(x.value))


In [5]:
from pyspark.sql.functions import udf
from pyspark.sql.types import BooleanType
import re



# 定义判断是否为纯文本
def is_plain_text(row):
    """Return True when the row's "html" field contains no HTML-like tag.

    A tag is any ``<...>`` span with at least one character between the
    brackets. A missing or ``None`` "html" value is treated as an empty
    string and therefore counts as plain text.
    """
    html = row.get("html") or ""
    return re.search(r'<[^>]+>', html) is None
    
# 定义判断content_list是否为空
def is_empty_content_list(row):
    """Return True when the row has no usable "content_list".

    The row counts as empty when "content_list" is missing, ``None``, has no
    pages at all, or contains at least one page of length zero.
    """
    content_list = row.get("content_list")
    if not content_list:
        # Missing key, None, or a list with no pages.
        return True
    return any(len(page) == 0 for page in content_list)


    

# filter_json_rdd =filter_json_rdd.filter(is_empty_content_list)
# 筛选不含HTML标签的行


## 筛选异常框架处理异常数据集

In [6]:
# Subset whose "html" field is plain/JSON text, i.e. contains no HTML tags.

filter_json_rdd = input_rdd.filter(is_plain_text)

In [7]:
# Subset that is HTML but whose content_list is empty.

filter_html_empty_rdd = input_rdd.filter(lambda x : is_empty_content_list(x) and not is_plain_text(x))
# filter_html_empty_rdd.count()


In [8]:
## Subset that is HTML with a non-empty content_list ==> the normally extracted rows.
filter_html_rdd = input_rdd.filter(lambda x : not is_empty_content_list(x) and not is_plain_text(x))
# filter_html_rdd.count()

In [9]:
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, MapType, StringType
import re


def parse_text_paragraphs(row):
    """Rebuild "content_list" from the raw text stored in the "html" field.

    The text is split on newlines; every non-blank line becomes one
    ``{"c": <stripped line>, "t": "text"}`` item. All items are wrapped in a
    single "paragraph" element on a single page.

    Args:
        row: Mutable mapping for one record. A missing or ``None`` "html"
            value is treated as an empty string.

    Returns:
        The same ``row`` object, mutated in place so every other field is kept.
    """
    html_content = row.get("html") or ""
    paragraphs = [
        {"c": line.strip(), "t": "text"}
        for line in html_content.split('\n')
        if line.strip()
    ]
    # Add or overwrite content_list: one page holding one paragraph element.
    row["content_list"] = [[{
        "type": "paragraph",
        "bbox": None,
        "raw_content": None,
        "content": paragraphs,
    }]]
    return row
   
    
from pyspark.sql import functions as F

# Plain-text rows: build content_list from the text lines of "html".
filter_json_add_content_list_rdd = filter_json_rdd.map(lambda x : parse_text_paragraphs(x))

# HTML rows with an empty content_list: apply the same line-based rebuild.
filter_html_add_content_list_rdd = filter_html_empty_rdd.map(parse_text_paragraphs)



## 分开之后第一次合并，进行下一步清洗

In [10]:

# Merge the three subsets back into a single RDD and cache it.
union_rdd = filter_html_rdd.union(filter_html_add_content_list_rdd).union(filter_json_add_content_list_rdd).cache()



## 检验content_list异常值

In [11]:
import re,regex
from typing import List, Dict

def is_empty_content(text: str) -> bool:
    """Tell whether ``text`` is falsy or made up solely of whitespace."""
    if not text:
        return True
    return len(text.strip()) == 0

def has_excessive_whitespace(text: str) -> bool:
    """Flag text that is empty or dominated by newlines / long blank runs.

    True when the text is empty, when newlines make up more than 25% of its
    characters, or when it contains 500 consecutive spaces or 8 consecutive
    newlines.
    """
    length = len(text)
    if length == 0:
        return True

    # Share of newline characters in the whole text.
    if text.count('\n') / length > 0.25:
        return True

    long_space_run = ' ' * 500 in text
    long_newline_run = '\n' * 8 in text
    return long_space_run or long_newline_run

# Character-class patterns applied one after another by clean_special_whitespace.
# Line-break characters; replaced by "\n".
line_breaks_re = r"[\n\v\f\r\x85\u2028\u2029]"
# Visible space characters; replaced by a plain space.
visible_spaces_re = r"[\x20\xa0\u2000-\u200a\u202f\u205f\u3000]"
# Zero-width spaces/joiners and BOM; removed.
invisible_spaces_re = r"[\u200b-\u200d\u2060\ufeff]"
# Other invisible characters; removed.
invisible_chars_re = r"[\xad\ufffc\u2061-\u2063]"
# C0/C1 control characters except the line breaks and tab handled elsewhere; removed.
other_controls_re = r"[\x00-\x08\x0e-\x1f\x7f-\x84\x86-\x9f]"
# Bidirectional-text control characters; removed.
direction_controls_re = r"[\u061c\u200e\u200f\u202a-\u202e\u2066-\u2069]"
head_view_invisible_spaces_re = r"^[\x20\xa0\u2000-\u200a\u202f\u205f\u3000 ]"
# Unicode Private Use Area code points; removed by clean_special_whitespace.
private_use_area_pattern = (
    r"[\uE000-\uF8FF]"  # BMP Private Use Area
    r"|[\U000F0000-\U000FFFFD]"  # Supplementary Private Use Area-A
    r"|[\U00100000-\U0010FFFD]"  # Supplementary Private Use Area-B
)
# NOTE(review): the two ar_* patterns are not referenced in the visible cells — confirm they are still needed.
ar_invisible_spaces_re = r"[\u2060\ufeff]"
ar_direction_controls_re = r"[\u061c\u202c\u2066-\u2069]"
others = r"[\u2063\x00-\x1F\x7F-\x9F�\u200B-\u200D\uFEFF\u206a\u206e\u206f\u00AD\u200c\xa0\u3000\u2003\u2002\u200e\u00A0\u200e\u25A1\xa0]|&nbsp|\\? :|xa0|&ldquo;|&rdquo;|&rdquo|�|□|&amp|&lt|&gt"

def clean_special_whitespace(s):
    """Normalize whitespace and strip invisible/control characters from ``s``.

    Args:
        s: Value to clean; non-string values are converted with ``str``.

    Returns:
        The cleaned string. ``None`` yields an empty string (it used to
        return ``True``, which leaked a boolean into text fields when this
        function is used as the default replacer of ``update_value``).
    """
    if s is None:
        return ""
    s = str(s)
    # Order matters: line breaks and visible spaces are normalized first so
    # the later removals operate on already-normalized text.
    s = re.sub(line_breaks_re, "\n", s)
    s = re.sub(visible_spaces_re, " ", s)
    s = re.sub(head_view_invisible_spaces_re, "", s)
    s = re.sub(invisible_spaces_re, "", s)
    s = re.sub(direction_controls_re, "", s)
    s = re.sub(invisible_chars_re, "", s)
    s = re.sub(other_controls_re, "", s)
    s = re.sub(private_use_area_pattern, "", s)
    s = re.sub(others,"",s,flags=re.UNICODE)
    # Collapse any run of Unicode separator characters (category Z) into one space.
    s = regex.sub(r'\p{Z}+', ' ', s, flags=regex.UNICODE)
    return s


def clean_html_tag(s):
    """Strip HTML tags from ``s`` while keeping the text between them.

    Addresses the QUALITY_BAD_EFFECTIVENESS class of data-quality issues.
    """
    # Only the "clean" mode is implemented; "filter" is reserved for strict filtering.
    mode = "clean"

    # Act only when something tag-like is present at all.
    contains_tag = re.search(r'<[^>]+>', s) is not None
    if contains_tag and mode == "clean":
        # Remove opening/closing tags whose name starts with a letter.
        return re.sub(r'<\/?[a-z][^>]*>', '', s, flags=re.IGNORECASE)
    return s



    
def is_url(text: str) -> bool:
    """Return True when ``text`` is a complete http(s)/ftp(s) URL.

    The host may be an IPv4 address or a domain name. The two host forms are
    wrapped in one group so the scheme prefix and the end-of-string anchor
    apply to both. Previously the top-level ``|`` split the pattern in two,
    which rejected ``http://example.com`` and accepted
    ``http://1.2.3.4 trailing text``.
    """
    url_pattern = re.compile(
        r'^(?:http|ftp)s?://'          # scheme: http / https / ftp / ftps
        r'(?:\S+(?::\S*)?@)?'          # optional user[:password]@
        r'(?:'
        r'(?:\d{1,3}\.){3}\d{1,3}'     # IPv4 address
        r'|'                           # ... or ...
        r'(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6}'  # domain name
        r')'
        r'(?::\d+)?'                   # optional port
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)
    return url_pattern.match(text) is not None


def is_only_url(text: str) -> bool:
    """Check that the stripped string is exactly one complete URL and nothing else."""
    pattern_parts = (
        r'^',                            # start of string
        r'(https?|ftp)://',              # scheme
        r'(?:\S+(?::\S*)?@)?',           # optional user[:password]@
        r'([a-z0-9\-]+\.)+[a-z]{2,}',    # domain
        r'(?::\d+)?',                    # optional port
        r'(?:/[^\s?#]*)?',               # optional path
        r'(?:\?[^\s#]*)?',               # optional query string
        r'(?:#[^\s]*)?',                 # optional fragment
        r'$',                            # end of string
    )
    candidate = text.strip()
    matched = re.fullmatch(''.join(pattern_parts), candidate, re.IGNORECASE)
    return matched is not None

def is_too_short(text: str) -> bool:
    """Report whether ``text`` has 21 characters or fewer."""
    return not len(text) > 21

def has_abnormal_chars(text: str) -> bool:
    """Detect abnormal characters (e.g. invisible characters or the replacement character).

    Returns True when ``text`` contains at least one character from the class
    below. Note the class spans \\x00-\\x1F, so tab and newline also count.
    """
    abnormal_pattern = re.compile(r'[\x00-\x1F\x7F-\x9F�\u200B-\u200d\uFEFF\u00AD\u200c\xa0\u3000\u2003\u2002\u200e]')
    return bool(abnormal_pattern.search(text))

    


def clean_repeat_chars(text, max_repeat=3):
    """Trim runs of one repeated character down to ``max_repeat`` copies.

    :param text: original text
    :param max_repeat: largest number of consecutive repeats to keep
        (3-5 is suggested for Chinese text)
    :return: text with longer runs shortened
    """
    # One character followed by at least max_repeat copies of itself,
    # i.e. a run longer than the allowed maximum (any character, incl. full-width).
    pattern = rf"([\w\W])\1{{{max_repeat},}}"
    return re.sub(pattern, lambda run: run.group(1) * max_repeat, text)


## URL数据判断

In [12]:
import re
from typing import List, Dict

def is_single_url(text: str) -> bool:
    """Strictly test for a single URL; the scheme is optional (e.g. www.example.com)."""
    pattern_parts = (
        r'^',                        # start of string
        r'(https?://)?',             # optional scheme (http:// or https://)
        r'(?:www\.)?',               # optional www prefix
        r'(?:[a-zA-Z0-9-]+\.)+',     # domain labels (e.g. "codeproject.")
        r'[a-zA-Z]{2,}',             # top-level domain (e.g. com, org)
        r'(?::\d+)?',                # optional port (e.g. :8080)
        r'(?:/[\w#!:.?+=&%@!-/]*)?', # path, query string, etc.
        r'$',                        # end of string
    )
    candidate = text.strip()
    matched = re.fullmatch(''.join(pattern_parts), candidate, re.IGNORECASE)
    return matched is not None

def is_url_only_content(content: str) -> bool:
    """Tell whether the content consists of nothing but URLs."""
    stripped = content.strip()
    if not stripped:
        return False

    # Split on newlines, runs of whitespace, and single spaces.
    for piece in re.split(r'\n+|\s{2,}| ', stripped):
        if not piece.strip():
            # Blank segment produced by the split; ignore it.
            continue
        if not is_single_url(piece):
            return False
    return True




In [13]:
import re
from typing import Dict, List

import traceback
def is_valid_content_element(processed: Dict) -> Dict:
    """Validate and clean a single content element in place.

    The text is read from key 'c'; when 'c' is absent and 'title_content'
    is present, that key is used instead. Empty or whitespace-heavy text is
    blanked and the element type forced to 'text'; otherwise the text is
    cleaned and the original type (default 'text') is written back.

    Returns the same (mutated) dict.
    """
    if 'c' not in processed and 'title_content' in processed:
        content_key = 'title_content'
    else:
        content_key = 'c'

    raw_content = processed.get(content_key, '')
    elem_type = processed.get('t', 'text')

    # Rule 1 (empty / whitespace only) and rule 2 (too much whitespace):
    # blank the content and force the type to text.
    if is_empty_content(raw_content) or has_excessive_whitespace(raw_content):
        processed[content_key] = ''
        processed['t'] = 'text'
        return processed

    # Rule 3 (URL-only content) and the per-type checks (minimum text length,
    # inline-equation format) are currently disabled.

    # --- cleaning ---
    without_odd_whitespace = clean_special_whitespace(raw_content)
    without_repeats = clean_repeat_chars(without_odd_whitespace)
    processed[content_key] = clean_html_tag(without_repeats)
    processed['t'] = elem_type  # keep the original type or the default
    return processed

In [14]:
from typing import List, Dict

def filter_content(content: List[Dict]) -> List[Dict]:
    """Clean every element of ``content`` and keep those that still carry text."""
    kept = []
    for elem in content:
        # Step 1: validate / clean the single element.
        cleaned = is_valid_content_element(elem)

        # Step 2: pick the content key ('c' first, then 'title_content').
        if 'c' in cleaned:
            key = 'c'
        elif 'title_content' in cleaned:
            key = 'title_content'
        else:
            continue  # no content key at all

        # Step 3: drop elements whose content is empty or whitespace only.
        if cleaned.get(key, '').strip() == '':
            continue
        kept.append(cleaned)

    return kept

def update_value(data, target_key, condition = lambda x :True, new_value =clean_special_whitespace):
    """
    Walk an arbitrarily nested structure of dicts and lists and rewrite the
    value stored under ``target_key`` wherever ``condition`` accepts it.

    :param data: structure to walk (dict or list); anything else is ignored
    :param target_key: key whose value is to be updated
    :param condition: predicate on the current value deciding whether to update
    :param new_value: replacement value, or a callable mapping old value to new value
    """
    if isinstance(data, list):
        for item in data:
            update_value(item, target_key, condition, new_value)
        return
    if not isinstance(data, dict):
        return

    for key, value in data.items():
        if key != target_key or not condition(value):
            # Not a match: keep descending into the value.
            update_value(value, target_key, condition, new_value)
            continue
        data[key] = new_value(value) if callable(new_value) else new_value
   
        
def filter_content_list(row):
    """Clean the title / paragraph / list elements of every page in "content_list".

    Elements that are not dicts, or whose type is not one of the three
    handled types, are kept untouched. Titles and paragraphs whose content
    is invalid or ends up empty are dropped. Each page list is updated in
    place. On any exception the row gets an "__error" entry and is returned.
    """
    handled_types = ('title', 'paragraph','list')
    try:
        for page in row.get("content_list", []):
            kept = []
            for elem in page:
                is_target = isinstance(elem, dict) and elem.get('type') in handled_types
                if not is_target:
                    kept.append(elem)
                    continue

                kind = elem['type']
                payload = elem.get('content')

                if kind == 'title':
                    # Title content must be a dict carrying title_content.
                    if isinstance(payload, dict) and 'title_content' in payload:
                        cleaned = filter_content([payload])  # wrapped in a list
                        if cleaned and isinstance(cleaned, list) and len(cleaned) > 0:
                            title = cleaned[0]
                            # Keep the title only when text is left.
                            if title.get('title_content', '').strip():
                                elem['content'] = title
                                kept.append(elem)
                elif kind == 'paragraph':
                    # Paragraph content must be a list.
                    if isinstance(payload, list):
                        cleaned = filter_content(payload)
                        if cleaned and len(cleaned) > 0:
                            elem['content'] = cleaned
                            kept.append(elem)
                elif kind == 'list':
                    # List elements are always kept; their 'c' values are cleaned in place.
                    update_value(payload,'c')
                    elem['content'] = payload
                    kept.append(elem)

            # Update the page in place.
            page[:] = kept
        return row
    except Exception as e:
        row['__error'] = {
            'code': -1,
            'msg': f"Content filtering failed: {str(e)}",
            'trace': traceback.format_exc()
        }
        return row
                

In [15]:
# Apply element-level cleaning to every row and cache the result for the later cells.
para_cleaning_rdd = union_rdd.map(filter_content_list).cache()


In [16]:
# input_rdd.filter(lambda x : x['track_id'] == 'cd30d78d-9ab1-4577-af00-92e3dfb65fd8').take(1)[0]

In [17]:
 # union_rdd.filter(lambda x :x['track_id'] == '09e0dcee-0768-4609-9d0a-9850c860a6b6').take(1)[0]
# Display the active Spark session object.
spark

In [97]:
# Debug lookup: fetch one cleaned row by track_id.
para_cleaning_rdd.filter(lambda x :x['track_id'] == '6542abaf-fb50-4cef-a88b-a93db820871e').take(1)[0]

                                                                                

{'track_id': '6542abaf-fb50-4cef-a88b-a93db820871e',
 'url': 'https://blog.sina.com.cn/s/blog_411b4d310100hw0j.html',
 'html': '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "//www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"><html xmlns="//www.w3.org/1999/xhtml"><head><script id="sinaads-ck-script" charset="utf-8" src="//d3.sina.com.cn/litong/zhitou/sinaads/src/spec/sinaads_ck.js"></script><script src="https://d4.sina.com.cn/litong/zhitou/wenjing28/js/postMan.js"></script>\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8">\n<title>爱情保鲜剂_平乐郭氏正骨医院_新浪博客</title>\n<meta http-equiv="X-UA-Compatible" content="IE=EmulateIE8,chrome=1">\n<meta name="renderer" content="webkit">\n<meta name="keywords" content="爱情保鲜剂_平乐郭氏正骨医院_新浪博客,平乐郭氏正骨医院,杂谈">\n<meta name="description" content="爱情保鲜剂_平乐郭氏正骨医院_新浪博客,平乐郭氏正骨医院,">\n<meta content="always" name="referrer">\n<meta http-equiv="mobile-agent" content="format=html5; url=http://blog.sina.cn/dpool/blog/s/blog_411b4d310100hw0

In [120]:
# Debug lookup. NOTE(review): make_content_md is defined in a later cell (In [18]); this cell fails on a fresh top-to-bottom run.
para_cleaning_rdd.filter(lambda x : x['track_id'] == '6a3e406a-0aa2-433b-ae75-9873d2436788').map(lambda x : make_content_md(x)).take(1)[0]

Traceback (most recent call last):
  File "/share/jiangwenhao/envs/code_clean_venv/lib/python3.10/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/share/jiangwenhao/envs/code_clean_venv/lib/python3.10/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/share/python/3.10.9/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [190]:
# Debug lookup. NOTE(review): result_rdd is defined in a later cell (In [19]); this cell fails on a fresh top-to-bottom run.
result_rdd.filter(lambda x : x['track_id'] == '7a4055b5-2f5f-4cf1-9786-c002f7ea9b88').take(1)[0]


25/03/11 15:08:56 WARN TaskSetManager: Lost task 1761.0 in stage 55.0 (TID 7903) (host-10-140-93-52 executor 37): TaskKilled (Stage cancelled)
25/03/11 15:08:56 WARN TaskSetManager: Lost task 1875.0 in stage 55.0 (TID 8017) (host-10-140-93-52 executor 53): TaskKilled (Stage cancelled)


[Stage 55:>                                                     (6 + 10) / 6000]

## Markdown处理

In [18]:
from llm_web_kit.input.datajson import DataJson

def make_content_md(row):
    """Render the row's content list to text and store it under "content_qa".

    The text comes from ``DataJson(row).get_content_list().to_nlp_md()``
    (presumably NLP-oriented Markdown). On any exception "content_qa" is set
    to an empty string and "__error" records ``{"code": -1, "msg": <traceback>}``.
    The error key used to be ``f"{row}msg"``, which embedded the whole row
    in the key name.

    Returns the same (mutated) row in both cases.
    """
    try:
        row['content_qa'] = DataJson(row).get_content_list().to_nlp_md()
    except Exception:
        row['content_qa'] = ''
        row['__error'] = {"code": -1, "msg": traceback.format_exc()}
    return row

def del_content_qa(row):
    """Remove the "content_qa" entry from ``row`` and hand the row back.

    Raises KeyError when the entry is absent.
    """
    row.pop('content_qa')
    return row

from llm_web_kit.input.datajson import DataJson, StructureMapper
def clist_filter_factory(self, field_you_want = ('paragraph',)):
    """Convert the content list to Markdown, keeping only selected node types.

    Intended to be attached to ``StructureMapper`` as a method, which is why
    it reads that class's name-mangled private attributes from ``self``.

    Args:
        field_you_want: Node types to keep. A single string is accepted and
            treated as one type name. The old default ``('paragraph')`` was a
            plain string, so ``in`` did substring matching and let types such
            as ``'para'`` or ``''`` through.

    Returns:
        str: Markdown text of the selected nodes, ending with the mapper's
        configured text terminator.
    """
    if isinstance(field_you_want, str):
        field_you_want = (field_you_want,)

    md_blocks = []  # one text block per converted content-list node
    content_lst = self._get_data()
    for page in content_lst:
        for content_lst_node in page:
            if content_lst_node['type'] in field_you_want:
                txt_content = self._StructureMapper__content_lst_node_2_md(content_lst_node)
                if txt_content:
                    md_blocks.append(txt_content)
    md = self._StructureMapper__md_para_splitter.join(md_blocks)
    md = md.strip() + self._StructureMapper__text_end  # append the trailing terminator
    return md
def clist_to_c(d_iter):
    """Yield one ``{track_id, prompt, content_qa}`` dict per input record.

    ``content_qa`` is produced by ``clist_filter``, which is installed on
    ``StructureMapper`` (monkey-patch) before the first record is processed.
    ``prompt`` is always an empty string. Exceptions propagate to the caller.
    """
    StructureMapper.clist_filter = clist_filter_factory
    for record in d_iter:
        yield {
            'track_id': record['track_id'],
            'prompt': '',
            'content_qa': DataJson(record).get_content_list().clist_filter(),
        }
# para_cleaning_rdd.filter( lambda x : not is_empty_content_list(x)).map(make_content_md).filter(lambda x: not  is_too_short(x['content_qa'])).take(1)[0]


In [19]:
# Final filter chain: drop rows with an empty content_list, render content_qa,
# then drop rows whose content_qa is too short, whitespace-heavy, or URL-only.
result_rdd = para_cleaning_rdd. \
filter( lambda x : not is_empty_content_list(x)).map(make_content_md). \
filter(lambda x: not  is_too_short(x['content_qa'])). \
filter(lambda x : not has_excessive_whitespace(x['content_qa'])). \
filter(lambda x : not is_url_only_content(x['content_qa'])) 
# Peek at one surviving row.
result_rdd.take(1)[0]
# union_rdd.filter(lambda x : x['track_id'] == 'ea0eb1bd-9cd2-441c-a231-12d754eb1f23').take(1)[0]

                                                                                

{'track_id': 'f67c4f9c-1687-4d32-b009-a9575a9b8817',
 'url': 'https://new.qq.com/omn/20220404/20220404A014TC00',
 'html': '<body>\n <p>部分埃及媒体报道称，埃及同数个阿拉伯国家正试图邀请俄罗斯和乌克兰代表团前来进行下一轮和谈。据悉，这些国家已就通过政治途径解决俄乌冲突提出了若干建议，其中包括一份关于俄乌实现停火的倡议。（央视新闻）</p>\n <p>\n  <!--IMG_0--></p>\n <img src="http://qqpublic.qpic.cn/qq_public/0/28-720443001-5A4375547DE7C39F82C739BF9831F128/0?fmt=png&amp;size=25&amp;h=330&amp;w=450&amp;ppv=1">\n</body>',
 'page_layout_type': 'article',
 'domain': 'new.qq.com',
 'dataset_name': 'tencent',
 'data_source_category': 'HTML',
 'meta_info': {'filename': 's3://private-crawl-data/zh-web-tencent/20241218_p1/2022/1732885639667.json.gz',
  'posttime': '2022-04-04 07:15:30',
  'n_content_county': [],
  'm_identity': '19598956',
  'n_origin': '',
  'n_headimg_url': 'http://qqpublic.qpic.cn/qq_public/0/28-720443001-5A4375547DE7C39F82C739BF9831F128/0?fmt=png&size=25&h=330&w=450&ppv=1',
  'f_domain_sec': 'new.qq.com',
  'title': '埃及同部分阿拉伯国家正尝试就俄乌停火进行斡旋',
  'uuid': '56abad99c2f99ba5522467

In [20]:
# Serialize each row into a single "value" column (json_dumps is a project helper, presumably from app.common.json_util).
result_df = result_rdd.map(lambda x: Row(value=json_dumps(x))).toDF()

                                                                                

In [173]:
# Delivery dataset: same filter chain as result_rdd, after which content_qa is removed again.

final_rdd = para_cleaning_rdd.filter( lambda x : not is_empty_content_list(x)).map(make_content_md).filter(lambda x: not  is_too_short(x['content_qa'])). \
filter(lambda x : not has_excessive_whitespace(x['content_qa'])).filter(lambda x : not is_url_only_content(x['content_qa'])).map(del_content_qa)
final_df = final_rdd.map(lambda x: Row(value=json_dumps(x))).toDF()

                                                                                

In [121]:
# Peek at one row of the result set.
result_rdd.take(1)[0]

{'track_id': 'bc45da8d-6169-4713-bf67-227c8e2b1678',
 'url': 'https://tousu.sina.com.cn/complaint/view/17364110592',
 'html': '<div class="ts-d-question"> \n <ul class="ts-q-list"> \n  <li><label>投诉编号：</label> 17364110592</li> \n  <li><label>投诉对象：</label> <a suda-uatrack="key=complaint_company" class="c_link" href="//tousu.sina.com.cn/company/view/?couid=6020086612" target="_blank" data-sudaclick="complaint_company">抖音</a> </li> \n  <li><label>投诉问题：</label> 客服不处理/处理不当,不发货</li> \n  <li><label>投诉要求：</label> 赔偿,解释,作出处罚,道歉,改善服务</li> \n  <li><label>涉诉金额：</label> 10元</li> \n  <li><label>投诉进度：</label><b>已回复</b></li> \n </ul> \n <!-- 律所帮助 --> \n <div class="law_help_div"></div> \n</div>',
 'page_layout_type': 'forum',
 'domain': 'tousu.sina.com.cn',
 'dataset_name': 'sina',
 'data_source_category': 'HTML',
 'meta_info': {'filename': 's3://private-crawl-data/zh-web-sina/20241218_p1/2022/1732934230360.json.gz',
  'statics': {'list': 1, 'list.text': 5}},
 'content_list': [[{'type': 'list',
    'r

In [None]:
# Data for machine quality inspection
write_any_path(result_df, "s3://llm-users-phdd2/jiangwenhao/article/zh-web-tencent/v003/")

[Stage 2:>                                                      (1 + 30) / 9000]

In [174]:
# Peek at one row of the delivery DataFrame.
final_df.take(1)[0]

                                                                                

Row(value='{"track_id":"bc45da8d-6169-4713-bf67-227c8e2b1678","url":"https://tousu.sina.com.cn/complaint/view/17364110592","html":"<div class=\\"ts-d-question\\"> \\n <ul class=\\"ts-q-list\\"> \\n  <li><label>投诉编号：</label> 17364110592</li> \\n  <li><label>投诉对象：</label> <a suda-uatrack=\\"key=complaint_company\\" class=\\"c_link\\" href=\\"//tousu.sina.com.cn/company/view/?couid=6020086612\\" target=\\"_blank\\" data-sudaclick=\\"complaint_company\\">抖音</a> </li> \\n  <li><label>投诉问题：</label> 客服不处理/处理不当,不发货</li> \\n  <li><label>投诉要求：</label> 赔偿,解释,作出处罚,道歉,改善服务</li> \\n  <li><label>涉诉金额：</label> 10元</li> \\n  <li><label>投诉进度：</label><b>已回复</b></li> \\n </ul> \\n <!-- 律所帮助 --> \\n <div class=\\"law_help_div\\"></div> \\n</div>","page_layout_type":"forum","domain":"tousu.sina.com.cn","dataset_name":"sina","data_source_category":"HTML","meta_info":{"filename":"s3://private-crawl-data/zh-web-sina/20241218_p1/2022/1732934230360.json.gz","statics":{"list":1,"list.text":5}},"content_list":[[{"

In [175]:

# Acceptance (delivery) data
# Tencent
#write_any_path(final_df, "s3://zhuanxiang-hw60p/article/tencent/final/v003/")
# Sohu
#write_any_path(final_df, "s3://zhuanxiang-hw60p/article/souhu/final/v002/")
# Sina
# NOTE(review): both writes below are active and send the same final_df to two different dataset paths — confirm only the intended one should run.
write_any_path(final_df, "s3://zhuanxiang-hw60p/article/sina/v008/")
# Sina blog
write_any_path(final_df, "s3://zhuanxiang-hw60p/article/blog_sina_com_cn/final/v001/")


                                                                                

{'rows': 174705947,
 'bytes': {'sum': 1700901954349,
  'min': 525,
  'max': 14546397,
  'cnt': 174705947,
  'avg': 9735.799},
 'files': 17382,
 'sub_paths': {}}

In [51]:
# result_rdd.filter(lambda x: "__error" in x).count()
# NOTE(review): the return value of get_s3_config is neither stored nor displayed here.
get_s3_config("s3://zhuanxiang-hw60p/article/tencent/v001/")
# Write the quality-inspection rows for the Sohu dataset.
write_any_path(result_df, "s3://llm-users-phdd2/jiangwenhao/article/zh-web-souhu/v001/")


                                                                                

{'rows': 139738535,
 'bytes': {'sum': 1263435365117,
  'min': 390,
  'max': 3305885,
  'cnt': 139738535,
  'avg': 9041.424},
 'files': 18000,
 'sub_paths': {}}

In [2]:
# Inspect one row that carries an "__error" entry after content_qa rendering.
para_cleaning_rdd.filter( lambda x : not is_empty_content_list(x)).map(make_content_md).filter(lambda x: "__error" in x).take(1)[0]

## 数据集预处理

In [35]:

import os
from pyspark.sql import Row, SparkSession
from typing import Dict, Any
import json
import uuid
from functools import partial
from typing import Iterable, Dict, Any
# create spark session
from pyspark.sql import Row, DataFrame
from loguru import logger
from xinghe.spark import *
from app.common.json_util import *

from xinghe.s3 import *


import uuid
import traceback
from datetime import datetime
from llm_web_kit.input.datajson import DataJson
from func_timeout import FunctionTimedOut, func_timeout

#| track_id                  | uuid                           | 全局唯一的ID                                                                                                        | 是              |
#| dataset_name              | str                            | 数据集的名字（全局唯一），这个名字是管理员输入的，然后做索引的时候带到index里来                                     | 是              |
#| data_source_category      | str                            | 这一行数据代表的是HTML，PDF，EBOOK,CC,labCC类型                                                                     | 是，此处是 HTML |
#| html                      | 字符串                         | 以UTF-8为编码的HTML文件全文                                                                                         | 是              |
#| url                       | 字符串                         | 这个文件的来源网址                                                                                                  | 是              |
#| file_bytes                | 整数                           | 文件的size, 单位是byte                                                                                              | 是              |
#| meta_info                 | 字典                           | 存放关于文件的元信息:如果能从文件里获取到作者，制作日期等信息。或者数据本身就带有一些其他的信息都放入到这个字段里。 | 是              |
#| meta_info->input_datetime | 其格式为 `yyyy-mm-dd HH:MM:SS` | 生成这个json索引文件这一条数据的时间，可以不用那么精确                                                              | 是              |


def process_platform_data(spark: SparkSession, config,platform: str, input_paths: list, version: str = "001"):
    """Run the read -> transform -> extract -> write pipeline for one platform.

    Takes the SparkSession as an argument instead of holding it.

    Args:
        spark: Active Spark session.
        config: Settings forwarded to read_any_path / write_any_path.
        platform: Key into the module-level ``platform_configs`` dict.
        input_paths: Input locations; joined with "," before reading.
        version: Not referenced inside this function.

    Raises:
        ValueError: If ``platform`` has no entry in ``platform_configs``.
    """
    platform_config = platform_configs.get(platform)
    if not platform_config:
        raise ValueError(f"Unsupported platform: {platform}")

    # Driver-side work
    input_df = read_any_path(spark, ",".join(input_paths), config)
    print(f"读取数据结束")
    # Prepare the worker-side configuration
    worker_config = {
        "field_mappings": platform_config.get("field_mappings"),
        "extractor_config": platform_config.get("extractor_config")
    }
    # Broadcast each configuration separately
    broadcast_field_mappings = spark.sparkContext.broadcast(worker_config["field_mappings"])
    broadcast_extractor_config_path = spark.sparkContext.broadcast(worker_config["extractor_config"])  # broadcast the path

    print("原始数据json结构")
    # Each input row becomes a Row holding the decoded JSON fields plus the source filename.
    input_rdd = input_df.rdd.map(lambda x: Row(**{**json.loads(x.value), "filename": x.filename})).cache()
  #  pandas_df = input_rdd.toDF().toPandas()
  #  print(pandas_df)
    print(input_rdd.take(1)[0].asDict().keys())
    # Transform rows (uses only the field mappings)
    transformed_rdd = input_rdd.map(
        lambda row: transform_row(row, broadcast_field_mappings.value)
    )
   
    print("formatter数据结束")
    # Extract data (uses only the extractor config path)
    processed_rdd = transformed_rdd.mapPartitions(
         lambda x:extract_data(
            x,broadcast_extractor_config_path = broadcast_extractor_config_path.value
        )
    )
    print("extractor数据结束")
    print("写入数据中")
   
    # Write the result as Row(value=json.dumps(x.asDict())).toDF()
    write_any_path(processed_rdd.map(lambda x: Row(value=json.dumps(x.asDict()))).toDF(), platform_config["output_template"],config)
    print("写入数据结束")

    # Release the broadcast variables
    broadcast_field_mappings.unpersist()
    broadcast_extractor_config_path.unpersist()



def transform_row(row, config: dict) -> Row:
    """Map a raw platform row onto the common schema using the field mappings.

    ``config`` holds the platform's field mappings: the names of the source
    attributes for track_id / url / html / layout, the layout-to-page-type
    map, and the constant dataset_name / data_source_category values.
    A missing track_id attribute is replaced by a fresh UUID; an unknown
    layout value maps to "article".
    """
    mappings = config
    fields = {}

    fields["track_id"] = getattr(row, mappings["track_id"], str(uuid.uuid4()))
    source_url = getattr(row, mappings["url"], '')
    fields["url"] = source_url
    fields["html"] = getattr(row, mappings["html"], '')

    layout_map = mappings.get("page_layout_type_map")
    layout_value = getattr(row, mappings["layout_field"], '')
    fields["page_layout_type"] = layout_map.get(layout_value, "article")

    fields["domain"] = extract_domain_info(source_url)['domain']
    fields["dataset_name"] = mappings["dataset_name"]
    fields["data_source_category"] = mappings["data_source_category"]
    fields["meta_info"] = {"filename":row.filename}

    return Row(**fields)




def extract_data(partition, broadcast_extractor_config_path):
    """Run the extractor chain over every row of a partition.

    Each row is handled independently. A timeout or any other failure marks
    only that row with an "__error" entry and processing continues with the
    next row. Previously the ``try`` wrapped the whole loop, so the first
    failure ended the generator and silently dropped the rest of the partition.

    Args:
        partition: Iterable of rows exposing ``asDict()``.
        broadcast_extractor_config_path: Extractor config path supplied by
            the caller. NOTE(review): currently unused — the path below is
            hard-coded; confirm whether the argument should be honoured.

    Yields:
        Row: the extracted record, or the input record plus "__error".
    """
    from loguru import logger
    extractor_chain = ExtractSimpleFactory.create('/share/jiangwenhao/notebooks/定向子集专项/subset-spliz.jsonc')
    timeout_seconds = 10

    for row in partition:
        d = row.asDict()
        try:
            input_data = DataJson(d)
            data_e: DataJson = func_timeout(timeout_seconds, extractor_chain.extract, args=(input_data,))
            result = Row(**data_e.to_dict())
        except FunctionTimedOut:
            # Needs its own clause: FunctionTimedOut does not derive from Exception.
            d['__error'] = {
                "error_type":"TIMEOUT",
                "error_message": "extract function timeout",
                "traceback":"TIMEOUT"
            }
            result = Row(**d)
        except Exception as e:
            # Record detailed error information on the failing row.
            error_info = {
                "error_type": type(e).__name__,
                "error_message": str(e),
                "traceback": traceback.format_exc(),
            }
            logger.error(error_info)
            d['__error'] = error_info
            result = Row(**d)
        # Yield outside the try so exceptions raised by the consumer are not
        # misattributed to this row.
        yield result
           


def _safe_extract(data: Dict, extractor, timeout: int = 10) -> Dict:
    """Extract one record and return a plain dict (not a generator).

    Fixes over the previous version: a stray ``yield`` turned the function
    into a generator, the error path referenced an undefined name, the
    ``timeout`` argument was ignored, and debug prints ran for every record.

    Args:
        data: Input record; either a mapping or an object exposing ``asDict()``.
        extractor: Object whose ``extract`` method is run on a ``DataJson``.
        timeout: Seconds allowed for ``extractor.extract``.

    Returns:
        The extracted record as a dict, or a copy of the input record with an
        added "__error" entry when extraction times out or fails.
    """
    record = data.asDict() if hasattr(data, "asDict") else dict(data)
    try:
        input_data = DataJson(record)
        data_e: DataJson = func_timeout(timeout, extractor.extract, args=(input_data,))
        return data_e.to_dict()
    except FunctionTimedOut:
        # Needs its own clause: FunctionTimedOut does not derive from Exception.
        record['__error'] = {
            "error_type": "TIMEOUT",
            "error_message": "extract function timeout",
            "traceback": "TIMEOUT",
        }
    except Exception as e:
        # Record detailed error information.
        error_info = {
            "error_type": type(e).__name__,
            "error_message": str(e),
            "traceback": traceback.format_exc(),
        }
        logger.error(error_info)
        record['__error'] = error_info
    return record


from urllib.parse import urlparse

def extract_domain_info(url: str) -> dict:
    """Extract domain information from a URL.

    Args:
        url: URL to analyse; ``None`` is treated as an empty string.

    Returns:
        dict with
        - "full_url": the argument as given,
        - "netloc": the network-location part of the URL,
        - "domain": the host without ``user:password@`` and without port,
        - "root_domain": the last two dot-separated labels of the domain
          (so "sina.com.cn" gives "com.cn"; no public-suffix handling).
    """
    parsed = urlparse(url or "")
    netloc = parsed.netloc
    # Drop any "user:password@" prefix before cutting off the port.
    host = netloc.rpartition("@")[2]
    domain = host.split(":")[0]
    labels = domain.split(".")
    root_domain = ".".join(labels[-2:]) if len(labels) >= 2 else domain

    return {
        "full_url": url,
        "netloc": netloc,
        "domain": domain,
        "root_domain": root_domain
    }
        
def handle_error(row: Dict, error: Exception) -> Dict:
    """Return the row's fields plus a uniform "__error" description.

    Args:
        row: Record as a mapping or an object exposing ``asDict()``.
        error: The exception to describe.

    Returns:
        A new dict with every field of the row and an "__error" entry holding
        the exception's type name, message and formatted traceback. The old
        code unpacked ``row`` itself rather than its dict form, which fails
        for objects that are not mappings.
    """
    row_dict = row.asDict() if hasattr(row, "asDict") else dict(row)
    return {
        **row_dict,
        "__error": {
            "type": type(error).__name__,
            "message": str(error),
            # Format the given exception itself so the result does not depend
            # on being called from inside an ``except`` block.
            "traceback": "".join(
                traceback.format_exception(type(error), error, error.__traceback__)
            ),
        },
    }

    
def extract_platform_from_s3_path(s3_path: str) -> str:
    """
    Pull the platform name out of an S3 path: the first directory after the bucket.

    Examples:
    - "s3://private-cooperate-data/zh-web-baijiahao/20241218_p1/" -> "zh-web-baijiahao"
    - "s3://private-cooperate-data/DouBan/" -> "DouBan"

    Raises ValueError when the path has fewer than three non-blank segments.
    """
    # Non-blank segments: scheme ("s3:"), bucket, platform, ...
    segments = [seg for seg in s3_path.split("/") if seg.strip() != ""]

    if len(segments) >= 3:
        return segments[2]
    raise ValueError(f"无效的 S3 路径格式: {s3_path}")
import os
from pyspark.sql import Row, SparkSession
from typing import Dict, Any
import json
import uuid
from functools import partial
from typing import Iterable, Dict, Any
# create spark session
from pyspark.sql import Row, DataFrame
from loguru import logger
from xinghe.spark import *
from app.common.json_util import *

from xinghe.s3 import *


import uuid
import traceback
from datetime import datetime
from llm_web_kit.input.datajson import DataJson
from func_timeout import FunctionTimedOut, func_timeout
# Values interpolated into the output locations rendered below.
version = "001"
sub_dir = "202401"
input_paths = ['s3://crawl-data/blog_sina_com_cn/gz_file/1729501052/']
platform = extract_platform_from_s3_path(input_paths[0])

# Pieces shared by every platform entry.
_EXTRACTOR_CONFIG = "/share/jiangwenhao/notebooks/定向子集专项/subset-spliz.jsonc"
_ARTICLE_ROOT = "s3://llm-users-phdd2/jiangwenhao/article"
_DEFAULT_OUTPUT = f"{_ARTICLE_ROOT}/{platform}/v{version}/"
# Layout labels every platform maps the same way (order is preserved).
_COMMON_LAYOUTS = (
    ("", "article"),
    ("文章", "article"),
    ("网易", "article"),
    ("视频", "video"),
)


def _platform_entry(dataset_name, html_field, layout_field, source_category,
                    extra_layouts=(), output_template=_DEFAULT_OUTPUT):
    """Assemble the configuration dict for one platform.

    Args:
        dataset_name: Value stored under ``field_mappings["dataset_name"]``.
        html_field: Name of the input field that carries the page body.
        layout_field: Name of the input field looked up in the layout map.
        source_category: Value stored under ``data_source_category``.
        extra_layouts: ``(label, layout_type)`` pairs appended after the
            common layout labels.
        output_template: Output location; defaults to
            ``<article root>/<platform>/v<version>/``.

    Returns:
        A dict with ``field_mappings``, ``extractor_config`` and
        ``output_template`` keys, each call getting its own layout map.
    """
    return {
        "field_mappings": {
            "track_id": "track_id",
            "page_layout_type_map": dict(_COMMON_LAYOUTS + tuple(extra_layouts)),
            "dataset_name": dataset_name,
            "url": "url",
            "html": html_field,
            "layout_field": layout_field,
            "data_source_category": source_category,
        },
        "extractor_config": _EXTRACTOR_CONFIG,
        "output_template": output_template,
    }


platform_configs = {
    # Only this platform nests its output under sub_dir.
    "zh-web-baijiahao": _platform_entry(
        "baijiahao", "content", "channel", "JSON",
        output_template=f"{_ARTICLE_ROOT}/{platform}/{sub_dir}/v{version}/",
    ),
    "zh-web-netease": _platform_entry("net-ease", "content", "f_name", "HTML"),
    "zh-web-tencent": _platform_entry(
        "tencent", "content", "f_name", "HTML",
        extra_layouts=(("腾讯网", "article"),),
    ),
    "zh-web-sohu": _platform_entry(
        "souhu", "content", "f_name", "HTML",
        extra_layouts=(("搜狐网", "article"),),
    ),
    "zh-web-sina": _platform_entry(
        "sina", "content", "f_name", "HTML",
        extra_layouts=(
            ("搜狐网", "article"),
            ("黑猫投诉", "forum"),
            ("新浪网", "article"),
        ),
    ),
    "blog_sina_com_cn": _platform_entry(
        "sina_blog", "html", "f_name", "HTML",
        extra_layouts=(
            ("搜狐网", "article"),
            ("黑猫投诉", "forum"),
        ),
    ),
    # NOTE(review): dataset_name "sina" and the empty layout type for "豆瓣网"
    # look like copy-paste leftovers - confirm the intended values.
    "DouBan": _platform_entry(
        "sina", "html", "f_name", "HTML",
        extra_layouts=(
            ("搜狐网", "article"),
            ("黑猫投诉", "forum"),
            ("豆瓣网", ""),
        ),
    ),
}

# None when the platform has no entry above.
platform_config = platform_configs.get(platform)
platform_config


{'field_mappings': {'track_id': 'track_id',
  'page_layout_type_map': {'': 'article',
   '文章': 'article',
   '网易': 'article',
   '视频': 'video',
   '搜狐网': 'article',
   '黑猫投诉': 'forum'},
  'dataset_name': 'sina_blog',
  'url': 'url',
  'html': 'html',
  'layout_field': 'f_name',
  'data_source_category': 'HTML'},
 'extractor_config': '/share/jiangwenhao/notebooks/定向子集专项/subset-spliz.jsonc',
 'output_template': 's3://llm-users-phdd2/jiangwenhao/article/blog_sina_com_cn/v001/'}

In [36]:


# JSON validity check (parse errors are caught, not raised)
def safe_json_loads(s):
    """Report whether ``s`` can be parsed by ``json.loads``.

    Despite the name, the parsed value is discarded; only a flag is returned,
    which makes the function usable as a filter predicate.

    Args:
        s: Candidate JSON document (``str``, ``bytes`` or ``bytearray``).

    Returns:
        ``True`` if ``json.loads(s)`` succeeds, ``False`` otherwise
        (including when ``s`` is ``None`` or another unsupported type).
    """
    try:
        json.loads(s)
        return True
    except (TypeError, ValueError, RecursionError):
        # TypeError: unsupported input type; ValueError: malformed JSON
        # (JSONDecodeError is a subclass); RecursionError: excessive nesting.
        # Deliberately not a bare ``except`` so KeyboardInterrupt and
        # SystemExit still propagate.
        return False


        
def transform_row(row, config: dict) -> Row:
    """Map one raw input row onto the common extraction schema.

    Args:
        row: Input record with attribute access and ``asDict()`` (a Spark
            ``Row``); must expose a ``filename`` attribute.
        config: The platform's ``field_mappings`` dict. Keys read here:
            ``track_id``, ``url``, ``html``, ``layout_field``,
            ``page_layout_type_map``, ``dataset_name`` and
            ``data_source_category``.

    Returns:
        On success, a ``Row`` with ``track_id``, ``url``, ``html``,
        ``page_layout_type``, ``domain``, ``dataset_name``,
        ``data_source_category`` and ``meta_info``.
        On failure, a ``Row`` holding the original fields plus an ``__error``
        entry, so one bad record does not abort the whole task.
    """
    from loguru import logger

    mappings = config
    # Snapshot taken before the try block so the error path always has data
    # to report, even when building the output Row is what failed.
    raw = row.asDict() if hasattr(row, "asDict") else {}
    try:
        url = getattr(row, mappings["url"], '')
        return Row(
            # A random id is used only when the row has no track-id field.
            track_id=getattr(row, mappings["track_id"], str(uuid.uuid4())),
            url=url,
            html=getattr(row, mappings["html"], ''),
            # Unknown layout labels fall back to "article".
            page_layout_type=mappings.get("page_layout_type_map").get(
                getattr(row, mappings["layout_field"], ''),
                "article"
            ),
            domain=extract_domain_info(url)['domain'],
            dataset_name=mappings["dataset_name"],
            data_source_category=mappings["data_source_category"],
            meta_info={"filename": row.filename}
        )
    except Exception as e:
        # Record detailed diagnostics alongside the offending data.
        error_info = {
            "error_data": raw,
            "error_type": type(e).__name__,
            "error_message": str(e),
            "traceback": traceback.format_exc(),
        }
        logger.error(error_info)
        # Rows are immutable, so build a new one rather than assigning into it.
        return Row(**{**raw, "__error": error_info})
   
    

def extract_data(partition, broadcast_extractor_config_path, timeout_seconds: int = 10):
    """Run the llm_web_kit extractor chain over every row of one partition.

    Intended for ``rdd.mapPartitions`` so the extractor chain is built once
    per partition rather than once per row.

    Args:
        partition: Iterable of records exposing ``asDict()``.
        broadcast_extractor_config_path: Path of the extractor chain config
            file. When falsy, the previously hard-coded path is used.
        timeout_seconds: Maximum time allowed for a single extraction.

    Yields:
        One ``Row`` per input row: the extracted record on success, otherwise
        the input fields plus an ``__error`` entry (``error_type`` is
        ``"TIMEOUT"`` when the extraction exceeded ``timeout_seconds``).
    """
    from llm_web_kit.extractor.extractor_chain import ExtractSimpleFactory
    from loguru import logger

    config_path = (
        broadcast_extractor_config_path
        or '/share/jiangwenhao/notebooks/定向子集专项/subset-spliz.jsonc'
    )
    extractor_chain = ExtractSimpleFactory.create(config_path)

    for row in partition:
        # Built outside the try block so both handlers can rely on it.
        d = row.asDict()
        try:
            input_data = DataJson(d)
            data_e: DataJson = func_timeout(timeout_seconds, extractor_chain.extract, args=(input_data,))
            result = Row(**data_e.to_dict())
        except FunctionTimedOut:
            # Handled separately from ``Exception``: func_timeout's timeout
            # error is not guaranteed to be an ``Exception`` subclass.
            d['__error'] = {
                "error_type": "TIMEOUT",
                "error_message": "extract function timeout",
                "traceback": "TIMEOUT"
            }
            result = Row(**d)
        except Exception as e:
            # Record detailed diagnostics alongside the offending data.
            error_info = {
                "error_data": Row(**d),
                "error_type": type(e).__name__,
                "error_message": str(e),
                "traceback": traceback.format_exc(),
            }
            logger.error(error_info)
            d['__error'] = error_info
            result = Row(**d)
        # Yield outside the try block so errors raised by the consumer are
        # never mistaken for extraction failures.
        yield result
from pyspark.sql.functions import col
from pyspark.sql.functions import udf
from pyspark.sql.types import BooleanType         


# Echo the Spark session object so the cell displays it.
spark

In [49]:

from pyspark.sql.functions import col
from pyspark.sql.functions import udf
from pyspark.sql.types import BooleanType
# Run parameters. The output_template strings inside platform_configs were
# rendered when that dict was built, so re-assigning these here does not
# change where the results are written.
version = "001"
sub_dir = "202401"
# Number of partitions used for the (CPU-heavy) extraction stage.
NUM_PARTITIONS = 6000

platform = extract_platform_from_s3_path(input_paths[0])
print(f"路径: {input_paths[0]} → 平台: {platform}")

platform_config = platform_configs.get(platform)
if not platform_config:
    raise ValueError(f"Unsupported platform: {platform}")

# Driver-side read of the raw input.
input_df = read_any_path(spark, ",".join(input_paths), config)
print("读取数据结束")

# Plain Python values captured by the closures below (despite the names,
# these are not Spark broadcast variables).
broadcast_field_mappings = platform_config.get("field_mappings")
broadcast_extractor_config_path = platform_config.get("extractor_config")

# NOTE(review): this cell used `df_filtered` without defining it anywhere, so
# it failed with NameError on a fresh kernel. It is rebuilt here on the
# assumption that the lost step dropped rows whose `value` is not valid JSON -
# confirm against the original intent.
is_valid_json = udf(safe_json_loads, BooleanType())
df_filtered = input_df.filter(is_valid_json(col("value")))

# Parse each JSON line and carry the source file name along with it.
input_rdd = df_filtered.rdd.map(lambda x: Row(**{**json.loads(x.value), "filename": x.filename}))

transformed_rdd = input_rdd.map(
    lambda row: transform_row(row, broadcast_field_mappings)
).repartition(NUM_PARTITIONS)

print(transformed_rdd.getNumPartitions())

# Extraction stage: only the extractor config path is handed to the workers.
processed_rdd = transformed_rdd.mapPartitions(
    lambda x: extract_data(
        x, broadcast_extractor_config_path=broadcast_extractor_config_path
    )
)

# Triggers the job; the count includes rows that carry an `__error` field.
processed_rdd.count()
# processed_rdd_filter.count()

路径: s3://crawl-data/blog_sina_com_cn/gz_file/1729501052/ → 平台: blog_sina_com_cn
读取数据结束
6000


                                                                                

28174242

In [50]:
def _has_no_error(record):
    """Keep only records without an ``__error`` marker."""
    # Relies on the record's ``in`` operator, exactly as the inline test did.
    return "__error" not in record


def _to_json_row(record):
    """Wrap the record's JSON serialization in a single-column ``value`` Row."""
    return Row(value=json_dumps(record.asDict()))


# Persisted so later actions on the filtered records can reuse them.
processed_rdd_filter = processed_rdd.filter(_has_no_error).persist()
result_df = processed_rdd_filter.map(_to_json_row).toDF()

                                                                                

In [52]:
# Write the serialized records to the selected platform's output location;
# the call's return value is what the cell displays.
write_any_path(result_df, platform_config["output_template"])

                                                                                

{'rows': 28174230,
 'bytes': {'sum': 1509192879853,
  'min': 471,
  'max': 4047415,
  'cnt': 28174230,
  'avg': 53566.429},
 'files': 6000,
 'sub_paths': {}}

In [108]:
# Overrides the output location in place on the shared platform_config dict.
# NOTE(review): this cell's execution count (108) is far later than the write
# cell's (52), and it points at zh-web-tencent while the run above resolved the
# platform to blog_sina_com_cn. Looks like leftover state from another run;
# confirm before Restart & Run All.
platform_config["output_template"]= 's3://llm-users-phdd2/jiangwenhao/article/zh-web-tencent/v001/'

In [80]:
# Peek at the first output record as a dict.
# NOTE(review): `value` holds the whole serialized record (including the html),
# so the displayed output is very large; consider truncating before sharing.
result_df.take(1)[0].asDict()

                                                                                

{'value': '{"track_id":"45ded449-03b8-43e2-83aa-6ce108ea3d13","url":"https://new.qq.com/omn/20240103/20240103A020Q400","html":"<body> \\n <p>2023年12月28日，广东省学前教育领域2个国家级实验区和5个省级实验区正式启动，广东省各地学前教育工作者齐聚广州越秀，共同论道和擘画学前教育的高质量发展蓝图。期间，越秀区11所公办园邀请全省幼教从业者“浸润式”现场观摩，探索如何“敞开大门”办好园。</p> \\n <p><img src=\\"https://inews.gtimg.com/om_bt/OGlavVXX5r8gSaQKhkofOnszR8QRrMOF0NsHkN_6iU-FcAA/641\\"></p> \\n <p>“浸润式”观摩遇见教育美好</p> \\n <p>在日前举行的学前教育高质量发展交流研讨会中，越秀区11所公办幼儿园敞开大门，开启分论坛活动和“浸润式”现场观摩，邀请全省其他地市的教研员、幼教从业者零距离走进孩子们的一日生活，感受越秀区幼儿园的实力。</p> \\n <p><img src=\\"https://inews.gtimg.com/om_bt/OhXfil4P3yZMVYPqe-EJ-cBd4aZAM37XtuIzKv-6mpaj0AA/641\\"></p> \\n <p>从2011年起，经过实施三期“学前教育行动计划”，越秀学前教育实现了从“幼有所育”到“幼有优育”的跨越式发展。近三年，越秀区几乎以一年一个实验区的争创速度，继先后获得“广东省学前教育改革发展实验区”“广东省学前教育高质量发展实验区”后，2022年，越秀区又成功创建国家级“幼儿园保育教育质量提升实验区”，高素质保教队伍数量逐年提高，“名师”“名园长”不断涌现，已初步建成了广覆盖、保基本、有质量、布局合理、公益普惠的学前教育公共服务体系。</p> \\n <p>越秀区幼儿园到底好在哪里？越秀区打开大门供人参观的底气何在？羊城晚报记者走进各大幼儿园——</p> \\n <p>上月29日上午，在广州市越秀区东方红幼儿园内，近百位幼教观摩者用一个小时时间感受了这所名园的方方面面。在园内水池边，几个孩子专注地用塑料瓶子组建“火箭”和“

25/03/12 20:09:00 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_52_2643 !
25/03/12 20:09:00 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_53_15920 !
25/03/12 20:09:00 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_150_3472 !
25/03/12 20:09:00 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_121_157 !
25/03/12 20:09:00 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_150_5943 !
25/03/12 20:09:00 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_52_3686 !
25/03/12 20:09:00 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_52_12327 !
25/03/12 20:09:00 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_150_5847 !
25/03/12 20:09:00 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_52_3861 !
25/03/12 20:09:00 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_121_1993 !
25/03/12 20:09:00 WARN BlockManage

's3://llm-users-phdd2/jiangwenhao/article/blog_sina_com_cn/v001/'