In [9]:
# create spark session
import uuid
import time

from pyspark.sql import Row, DataFrame
from pyspark.sql.functions import explode, count,col, format_number
from pyspark.sql.types import StructType, StructField, StringType, BooleanType, ArrayType

from app.common.spark_ext import *
from app.common.json_util import *
from app.common.s3 import *
from app.common.json_util import *

config = {
    "spark_conf_name": "spark_2",
    "skip_success_check": True,
    # "spark.dynamicAllocation.maxExecutors":120,
    "spark.executorEnv.HOME": "/share/shijin",
}

spark = new_spark_session("dingo-puyu3-hf", config)

# 抽取数据

In [3]:
# read data
input_paths = [
    "s3://qa-huawei/shijin/15T/pjcc/sample_data/zh/"
]
input_df = read_any_path(spark, ",".join(input_paths), config)

In [5]:
input_df_total=input_df.count()
input_df_total

                                                                                

11748

In [17]:
# sample data
sample_df = input_df.sample(fraction=160/input_df_total).rdd.toDF().limit(150)
sample_df.count()

                                                                                

150

In [None]:
output_acc = S3UploadAcc(spark.sparkContext)
output_path = "s3://qa-huawei/shijin/15T/pjcc/human_check/zh/"

sample_df.repartition(1).foreachPartition(upload_to_s3(output_path, "jsonl", output_acc, 0))

In [4]:
# process data
def process_data(row) -> Row:
    data = json_loads(row.value)
    content_qa = Doc(data).content
    data['content_qa'] = content_qa
    return Row(value=json_dumps(data))

process_df = sample_df.rdd.map(process_data).toDF()

                                                                                

In [5]:
# write data
output_path = "s3://qa-huawei/shijin/15T/pjcc/sample_data/en"
write_any_path(process_df, output_path, config)

                                                                                

{'rows': 12003,
 'bytes': {'sum': 595619325,
  'min': 3060,
  'max': 5073613,
  'cnt': 12003,
  'avg': 49622.538},
 'files': 1554,
 'sub_paths': {'en/en-pj-cc': {'rows': 12003,
   'bytes': {'sum': 595619325,
    'min': 3060,
    'max': 5073613,
    'cnt': 12003,
    'avg': 49622.538},
   'files': 1554}}}

# 机器质检

In [10]:
input_paths = [
    "s3://llm-users-phdd2/jiangwenhao/article/zh-web-sina/result/v005/",
]
input_df = read_any_path(spark, ",".join(input_paths), config)


In [19]:
input_review_rdd = input_df.rdd.map(lambda x: {**json.loads(x.value), "filename": x.filename})

In [21]:
input_review_rdd.filter(lambda x : x['track_id'] == '0e932ccc-4f8f-4dfd-aa59-fd9bf4fec27d').take(1)[0]

                                                                                

{'track_id': '0e932ccc-4f8f-4dfd-aa59-fd9bf4fec27d',
 'url': 'http://finance.sina.com.cn/jjxw/2023-08-21/doc-imzhycmz2479902.shtml',
 'html': '<div class="article" id="artibody"> \n <!-- 秒拍begin --> \n <!-- 秒拍end --> \n <!-- 视频播放器start --> \n <!-- 视频播放器end --> \n <!-- 行情图begin --> \n <!-- 行情图end --> \n <p cms-style="font-L"></p> \n <p cms-style="font-L"></p> \n <div class="img_wrapper"> \n  <img src="http://n.sinaimg.cn/spider20230821/27/w1245h382/20230821/048a-75ac73e4bc20bc804c48b28954290fc1.png" id="0"> \n  <span class="img_descr"></span> \n </div> \n <p cms-style="font-L">中国质量新闻网讯 近日，天津市宝坻区市场监督管理局发布关于43批次食品抽样检验情况的通告（2023年第9期）。本期监督抽检涉及食用<span id="stock_sz000061"><a href="https://finance.sina.com.cn/realstock/company/sz000061/nc.shtml" class="keyword" target="_blank" data-sudaclick="content_marketkeywords_p">农产品</a></span><span id="quote_sz000061"></span>，糖果制品，饮料，方便食品和调味品共5大类，共计43批次样品，全部合格。</p> \n <p cms-style="font-L align-Center"><font cms-style="font-L strong-Bold align-Center">产品

In [11]:
from dingo.io import MetaData

input_rdd = input_df.rdd
input_rdd_format = input_rdd.map(lambda x: MetaData(
    data_id= str(json.loads(x.value)['track_id']),
    prompt='',
    content=json.loads(x.value)['content_qa'],
    # image=json.loads(x.value)['image'],
    # raw_data = json.loads(x.value)
))
input_rdd_format.take(1)[0]

                                                                                

MetaData(data_id='bc45da8d-6169-4713-bf67-227c8e2b1678', prompt='', content='- 17364110592\n- 客服不处理/处理不当,不发货\n- 赔偿,解释,作出处罚,道歉,改善服务\n- 10元\n- 已回复\n', image=None, raw_data={})

In [12]:
from pyspark import SparkConf

from dingo.model.model import Model
from dingo.io import InputArgs
from dingo.exec import Executor

input_data = {
    "eval_group": "qa_standard_v1",
    'input_path': 'redpajama',
    'save_data': True,
    # 'save_raw': True
}
# input_data = {
#     "eval_group": "test",
#     "input_path": "redpajama",  # local filesystem dataset
#     "save_data": True,
#     "custom_config":
#         {
#             "rule_list": ["CommonSpecialCharacter", "CommonColonEnd"],
#             "prompt_list": ["LLMRepeat"],
#             "llm_config":
#                 {
#                     "detect_text_quality":
#                         {
#                             "key": "sk-proj-CwgXjGGBKq9vBI0Lue3jT3BlbkFJ5GnKDvQtrCmlwdHoea04",
#                             "api_url": "http://10.140.54.48:26666/v1",
#                         }
#                 }
#         }
# }
# Model.apply_config(input_data['custom_config_path'])
input_args = InputArgs(**input_data)
executor = Executor.exec_map["spark"](input_args, spark_session=spark, spark_rdd=input_rdd_format)
result = executor.execute()
result = result[0].to_dict()
print(result)





{'task_id': '47253e5e-00aa-11f0-a5b2-e8ebd34ed3cc', 'task_name': 'dingo', 'eval_group': 'qa_standard_v1', 'input_path': '', 'output_path': '', 'create_time': '20250314_155319', 'finish_time': '20250314_155946', 'score': 95.69, 'num_good': 172817833, 'num_bad': 7783731, 'total': 180601564, 'type_ratio': {'QUALITY_BAD_EFFECTIVENESS': 0.043099}, 'name_ratio': {'QUALITY_BAD_EFFECTIVENESS-RuleAbnormalChar': 0.041416, 'QUALITY_BAD_EFFECTIVENESS-RuleAbnormalHtml': 0.001965}}


                                                                                

In [13]:
bad_info_list = executor.bad_info_list
bad_info_list.count()

                                                                                

7783731

In [17]:
# 4622
bad_info_list.take(5)

[{'data_id': '0e932ccc-4f8f-4dfd-aa59-fd9bf4fec27d',
  'prompt': '',
  'content': '中国质量新闻网讯 近日，天津市宝坻区市场监督管理局发布关于43批次食品抽样检验情况的通告（2023年第9期）。本期监督抽检涉及食用 农产品 ，糖果制品，饮料，方便食品和调味品共5大类，共计43批次样品，全部合格。\n\n产品合格信息\n\n（声明：以下信息仅指本次抽检标称的生产企业相关产品的生产日期/批号和所检项目）\n\n| 抽样编号 | 序号 | 标称生产企业名称 | 标称生产企业地址 | 被抽样单位名称 | 被抽样单位所在省份 | 食品名称 | 规格型号 | 生产日期/批号 | 分类 | 公告号 | 公告日期 | 任务来源/项目名称 | 备注 |\n|---|---|---|---|---|---|---|---|---|---|---|---|---|---|\n| DBJ23120115111035213 | 1 | / | / | 天津市宝坻区宜品生活超市 | 天津 | 尖椒 | / | 2023.6.26 （购进日期） | 食用农产品 | 2023年第9期 | 2023.8.18 | 区抽/\xa0\xa0\xa0 监督抽检 | / |\n| DBJ23120115111035214 | 2 | / | / | 天津市宝坻区宜品生活超市 | 天津 | 姜 | / | 2023.6.23 （购进日期） | 食用农产品 | 2023年第9期 | 2023.8.18 | 区抽/\xa0\xa0\xa0 监督抽检 | / |\n| DBJ23120115111035225 | 3 | / | / | 天津市宝坻区福满家水果经营店 | 天津 | 芹菜 | / | 2023.6.26 （购进日期） | 食用农产品 | 2023年第9期 | 2023.8.18 | 区抽/\xa0\xa0\xa0 监督抽检 | / |\n| DBJ23120115111035226 | 4 | / | / | 天津市宝坻区福满家水果经营店 | 天津 | 生姜 | / | 2023.6.26 （购进日期） | 食用农产品 | 2023年第9期 | 2023.8.18 | 区抽/\xa0\xa0\xa0 监督抽检 | / |

In [3]:
spark

In [169]:
# QUALITY_BAD_EFFECTIVENESS-RuleInvisibleChar :1759  list code
# QUALITY_BAD_EFFECTIVENESS-RuleContentShort :  932 解决
# QUALITY_BAD_EFFECTIVENESS-RuleHtmlEntity :1722 table
# QUALITY_BAD_EFFECTIVENESS-RuleOnlyUrl : 181 已解决
# QUALITY_BAD_EFFECTIVENESS-RuleSpecialCharacter : 6 未发现
bad_info_list.filter(lambda x :x['name_list'] ==['QUALITY_BAD_EFFECTIVENESS-RuleOnlyUrl']).take(5)

[{'data_id': '09f8f69f-b138-4c96-a2e0-175b2e0cc77e',
  'prompt': '',
  'content': 'https://qoqi.nju.edu.cn/zw/yjfx/index.html\n\nhttp://www.iop.cas.cn/xshd/cqjz/202005/t20200514_5579644.html\n',
  'error_status': True,
  'type_list': ['QUALITY_BAD_EFFECTIVENESS'],
  'name_list': ['QUALITY_BAD_EFFECTIVENESS-RuleOnlyUrl'],
  'reason_list': ['Content is only an url link.'],
  'raw_data': {}},
 {'data_id': 'c383760e-e26a-419b-9a73-a36f0fb7ed6d',
  'prompt': '',
  'content': 'https://m.tb.cn/h.ULpFVjg?tk=q8MmdiEVrHy\n\nhttps://m.tb.cn/h.ULpvrNq?tk=kYjjdiE4Hkn\n',
  'error_status': True,
  'type_list': ['QUALITY_BAD_EFFECTIVENESS'],
  'name_list': ['QUALITY_BAD_EFFECTIVENESS-RuleOnlyUrl'],
  'reason_list': ['Content is only an url link.'],
  'raw_data': {}},
 {'data_id': 'c745638d-1ac9-494d-9f34-27642ca1e8c7',
  'prompt': '',
  'content': 'https://recordcdn.quklive.com/broadcast/activity/1703846303065389/record.m3u8\n\nhttps://oss-kbw.hbjt.com.cn/video_cover/2023/12/c2dc1a39-904f-4968-b0ff-2

In [107]:
summary_path = "s3://llm-users/qa/hf/redpajama/summary_1121/"
write_any_path(result, summary_path, config)

Exception: s3 dir [s3://llm-users/qa/hf/redpajama/summary_1121/] is already existed.

In [6]:
bad_info_list = executor.bad_info_list

output_df = bad_info_list.map(lambda x: {"value":json_dumps(x)})
output_df = output_df.toDF()

# output_path = "s3://llm-users/qa/m10ap/error_info/"+ds
# write_any_path(output_df, output_path, config)

rule_name_list = Model.get_rules_by_group('qa_standard_v1')
for r in rule_name_list:
    # filter data
    def filter_data(row) -> bool:
        # true, keep
        return r in json_loads(row.value)["name_list"]
    try:
        filter_df = output_df.rdd.filter(filter_data).toDF()
    except ValueError:
        continue

    output_path = f"s3://llm-users/qa/hf/redpajama/error_info_1121/{r}"
    write_any_path(filter_df, output_path, config)

24/11/21 18:02:50 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_9_0 !
24/11/21 18:02:50 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_9_1 !


In [15]:
bad_info_list = executor.bad_info_list
list_rdd = bad_info_list.flatMap(lambda row: row['type_list'])
unique_list = list_rdd.distinct().collect()
unique_list

                                                                                

['QUALITY_INEFFECTIVENESS']

In [14]:
list_rdd = self.bad_info_list.flatMap(lambda row: row['type_list'])
unique_list = list_rdd.distinct().collect()

NameError: name 'self' is not defined