In [49]:
from boto3.session import Session
import os
import zipfile
from io import BytesIO
import json

## S3
class OssClient:
    """
    S3 connection client
    """

    def __init__(
        self, aws_access_key_id, aws_secret_access_key, endpoint_url, region_name
    ):
        self.aws_access_key_id = aws_access_key_id
        self.aws_secret_access_key = aws_secret_access_key
        self.endpoint_url = endpoint_url
        self.region_name = region_name

        self.session = Session(aws_access_key_id, aws_secret_access_key)
        self.s3_client = self.session.client(
            "s3", endpoint_url=endpoint_url, region_name=region_name, use_ssl=False
        )

    def list_buckets(self):
        """
        List all the bucket names
        @return: A list of bucket names
        """
        response = dict(self.s3_client.list_buckets())
        bucket_list = [x['Name'] for x in response['Buckets']]
        return bucket_list

    def list_objects(self, bucket, prefix, delimiter='', size=None):
        """
        List all objects in a given bucket start with a given prefix
        @param bucket:
        @param prefix:
        @param delimiter:
        @return:
        """
        obj_list = []
        paginator = self.s3_client.get_paginator('list_objects')
        pages = paginator.paginate(Bucket=bucket,
                                   Prefix=prefix,
                                   Delimiter=delimiter)

        try:
            for page in pages:
                if 'Contents' in page.keys():
                    for obj in page['Contents']:
                        obj_list.append(obj)
                        if size:
                            if len(obj_list) >= size:
                                raise Exception("exceed the max size")
        except Exception as e:
            print(e)
        return obj_list


    def upload_file(self, local_file, bucket, remote_file=None):
        """
        upload a local file to s3
        :param local_file: file to upload
        :param bucket: bucket to upload to
        :param remote_file: destination file name to be saved
        :return:
        """
        if remote_file is None:
            remote_file = local_file
        try:
            self.s3_client.upload_file(local_file, bucket, remote_file)
        except Exception as e:
            #logger.error(e)
            print(e)
            return False
        return True

    def download_file(self, bucket, remote_file, local_file):
        """
        download a file from s3
        :param bucket: bucket name
        :param remote_file: remote file name
        :param local_file: local file name
        :return:
        """
        try:
            if not os.path.exists(local_file):
                local_file_path = local_file.rsplit('/', 1)[0]
                if not os.path.exists(local_file_path):
                    os.makedirs(local_file_path, exist_ok=True)
                self.s3_client.download_file(bucket, remote_file, local_file)
            # logger.info(f"Already exists {local_file_path}")
            return True
        except Exception as e:
            print(e)
            return False

    def delete_file(self, bucket, remote_file):
        """
        Delete a file from s3
        :param bucket: bucket name
        :param remote_file: remote file name
        :return:
        """
        try:
            self.s3_client.delete_object(Bucket=bucket, Key=remote_file)
            return True
        except Exception as e:
            print(e)
            return False

    def delete_files_in_path(self, bucket, remote_path):
        """
        Delete all files from a specific path in s3
        :param bucket: bucket name
        :param remote_path: remote directory path to delete files from
        :return:
        """
        try:
            # List all objects within the given path
            response = self.s3_client.list_objects(Bucket=bucket, Prefix=remote_path)

            if 'Contents' in response:
                for obj in response['Contents']:
                    key = obj['Key']
                    self.s3_client.delete_object(Bucket=bucket, Key=key)

            return True
        except Exception as e:
            print(e)
            return False
if __name__ == '__main__':
    pperf_client = OssClient("C68F0C5011E8E9C10F64", "5HaI9RymQ3poL/BIC8g2ifksOwYAAAGVEejpySCi", "http://10.140.104.11",None)
    print(pperf_client.list_buckets())
    print(pperf_client.list_objects("zhuanxiang-hw60p","article/souhu/v005/",size=10))

    # pperf_client.download_file("llm-pipeline", "cc/ehtml@002/CC-MAIN-2013-20__segments__1368696381249__warc__CC-MAIN-20130516092621-00000-ip-10-60-113-184.ec2.internal.jsonl.gz", "./cc-check.jsonl.gz")
    # pperf_client.upload_file("/share/sufutao/全流程/数学书/mathbook-49.jsonl", "data-warehouse", "sufutao/jumble/mathbook-49.jsonl")

['annotate-data', 'cn-common-crawl', 'crawl-app-data', 'crawldata-temp', 'media-file-index', 'pdf-hw60p', 'private-hcorpus', 'web-parse-hw60p', 'zhuanxiang-hw60p']
exceed the max size
[{'Key': 'article/souhu/v005/_SUMMARY_rows_139753046_bytes_2868127539447_files_18000', 'LastModified': datetime.datetime(2025, 3, 19, 3, 57, 7, 390000, tzinfo=tzutc()), 'ETag': '"deb4ecdbd2e20d9b329d0a55f009a97d"', 'Size': 133, 'StorageClass': 'STANDARD', 'Owner': {'DisplayName': 'bigdata', 'ID': '00000194F3863CF19413831D51B868ED'}}, {'Key': 'article/souhu/v005/part-67da3e50b3c5-000000.jsonl', 'LastModified': datetime.datetime(2025, 3, 19, 3, 47, 47, 164000, tzinfo=tzutc()), 'ETag': '"900cdf1f7b6b9c6e065ba26b6c46f82a-27"', 'Size': 437342648, 'StorageClass': 'STANDARD', 'Owner': {'DisplayName': 'bigdata', 'ID': '00000194F3863CF19413831D51B868ED'}}, {'Key': 'article/souhu/v005/part-67da3e50b3c5-000001.jsonl', 'LastModified': datetime.datetime(2025, 3, 19, 3, 47, 47, 76000, tzinfo=tzutc()), 'ETag': '"7d51409

In [1]:

import app.common_clean.core.definitions as core_def
import app.common_clean.basic_functions as bfuncs
import re
from typing import Tuple, Dict, Callable, Union, List
import app.common_clean.basic_functions as bfuncs
from app.common_clean.basic_functions.sentence import jieba_lcut
from collections import defaultdict
from itertools import islice
import app.common_clean.functions.detectors as detectors
import app.common_clean.functions.heuristic_statistics as hstatistic
from app.common_clean.core.content import get_text_content,get_nlp_content
from app.common_clean.functions import clean_funcs as clean_funcs_module

from pyspark.sql import Row, DataFrame

from xinghe.spark import *
from app.common.json_util import *
from xinghe.s3 import *
import re
from app.datasets.base import *
from app.content.base import Process
from app.content.handler import collapse_dup_newlines
from app.common.html2md import html2md, md2md
from app.common.get_word_list import get_words
from app.common.lxml_string_json import html_to_lxml, html_to_string
from bs4 import BeautifulSoup
import app.common.runtime as runtime
from app.common.html2md import md_remove_link
from app.content.base import Process
from app.content.handler import (
    mask_bank_account,
    mask_email,
    mask_id_card,
    mask_phone_number,
)
from app.content.handler import *
from app.snippets.doc_dedup import snippet_handle as dedup
HEADERS = ["h1", "h2", "h3", "h4"]

config = {
    "spark_conf_name": "spark_2", # another value is "spark_2"
    "skip_success_check": True,
#     "input_split_ratio": 0.125,
    "spark.executor.memory": "10g",
    "spark.executor.cores": "2",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.hudi.catalog.HoodieCatalog",
    "spark.sql.extensions": "org.apache.spark.sql.hudi.HoodieSparkSessionExtension",
    "spark.kryo.registrator": "org.apache.spark.HoodieSparkKryoRegistrar",
    "spark.sql.catalog.spark_catalog.warehouse": "s3a://data-warehouse/hudi/",
    "output_compression": "gz",
    # "spark.yarn.queue": "root.clean_exp"
    # "spark.hadoop.net.topology.script.file.name": "/share/dataproc/code-clean/conf/hadoop_4/topology.py"
}

spark = new_spark_session("nan_check", config)
sc = spark.sparkContext
sc

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [2]:
# read data


file_paths = ['s3://private-crawl-data/zh-web-sina/20241218_p1/']
    # 's3://data-warehouse/sufutao/llm_anno/content_only/textbook_content_20pages_25w/']

df_toComplete = read_any_path(spark, ",".join(file_paths), config)
df_toComplete.printSchema()

root
 |-- value: string (nullable = true)
 |-- filename: string (nullable = true)



In [None]:
# view data
input_df.printSchema()
input_df.take(1)