# Goals
This notebook samples another 0.06% of 刑事一审 (first-instance criminal) cases for each of the four causes of action, in order to assemble a fine-tuning training dataset for an LLM so that it can better extract information from judgments.

# assemble folder paths

In [None]:
# Root directory under which the per-year judgment folders live.
base_path = "/mnt/blob_berkeley_account"

# One input folder per year; both bounds are inclusive.
first_year, last_year = 2010, 2021
folder_names = [
    f"{base_path}/{year}_judgment_data"
    for year in range(first_year, last_year + 1)
]

def list_csv_files(path):
    """
    Collect the paths of all ``.csv`` files under ``path``, descending into
    sub-directories.

    Entries are visited in the order returned by ``dbutils.fs.ls``; the files
    found inside a sub-directory are inserted at that directory's position.

    Returns a list of path strings (empty when nothing matches).
    """
    found = []
    for entry in dbutils.fs.ls(path):
        if entry.path.endswith(".csv"):
            found.append(entry.path)
            continue
        if entry.isDir():
            # Recurse and splice the nested results in place.
            found += list_csv_files(entry.path)
    return found


Select the relevant columns and rename them to English.

In [None]:
# (source CSV header, English column name) pairs, in the order the columns
# should appear after selection.
_column_pairs = [
    ("原始链接", "OriginalLink"),      # original link
    ("案号", "CaseNumber"),            # case number
    ("案件名称", "CaseName"),          # case name
    ("法院", "Court"),                 # court
    ("所属地区", "Location"),          # location
    ("案件类型", "CaseType"),          # case type
    ("审理程序", "TrialProcedure"),    # trial procedure
    ("裁判日期", "JudgmentDate"),      # judgment date
    ("公开日期", "PublicationDate"),   # publication date
    ("当事人", "PartiesInvolved"),     # parties involved
    ("案由", "CausesofAction"),        # cause of action
    ("法律依据", "LegalBasis"),        # legal basis
    ("全文", "FullText"),              # full text
]

# Mapping from source header to English name; insertion order is preserved.
column_mapping = dict(_column_pairs)


Select and rename the columns, filter by CausesofAction, then sample the data.

In [None]:
import re

from pyspark.sql.functions import col

# Causes of action to filter and sample: substring searched for in the
# CausesofAction column -> suffix used in the output folder and file names.
causes_of_action = {
    "盗窃": "theft",
    "交通肇事": "traffic_accidents",
    "毒品": "drug_related",
    "诈骗": "fraud"
}

# Fraction passed to DataFrame.sample (0.0006 == 0.06 %).
sample_fraction = 0.0006

# Input file names must contain "<YYYY>_<MM>_judgment.csv"; the year and month
# are reused when naming the output.
file_name_pattern = re.compile(r'(\d{4})_(\d{2})_judgment\.csv')

for folder in folder_names:
    csv_files = list_csv_files(folder)
    for file_path in csv_files:
        # Extract year and month once per file, before any Spark work, so an
        # unexpected file name fails fast instead of after reading the data.
        match = file_name_pattern.search(file_path)
        if not match:
            raise ValueError(f"Month pattern not found in the file path: {file_path}")
        year, month = match.groups()  # e.g. '2013', '01'

        # Read the CSV file into a DataFrame
        df = spark.read.csv(file_path, header=True, inferSchema=True)

        # Select and rename the relevant columns
        df_selected_and_renamed = df.select(
            [col(source).alias(target) for source, target in column_mapping.items()]
        )

        # Keep only 刑事一审 cases: criminal (刑事) in CaseType AND
        # first instance (一审) in TrialProcedure.
        df_criminal_cases = df_selected_and_renamed.filter(col("CaseType").contains("刑事"))
        df_criminal_cases_first_trial = df_criminal_cases.filter(col("TrialProcedure").contains("一审"))

        # Loop through each cause of action, filter, sample, and save
        for cause, output_suffix in causes_of_action.items():
            # Filter the first-instance DataFrame (not the broader criminal
            # one) so the TrialProcedure restriction is actually applied.
            df_filtered = df_criminal_cases_first_trial.filter(col("CausesofAction").contains(cause))

            # No seed is passed, so each run draws a different sample.
            sampled_df = df_filtered.sample(fraction=sample_fraction)

            saved_file_name = f"{year}_{month}_{output_suffix}_judgment_data"

            # Define the full path for saving the DataFrame
            full_path = f"/mnt/processed_data_criminal_case_analysis/finetuning_training_data/{output_suffix}/{saved_file_name}"

            # Save the sample; coalesce(1) reduces it to one partition before
            # the write, and "overwrite" replaces any previous output at this path.
            sampled_df.coalesce(1).write.mode("overwrite").option("header", "true").csv(full_path)
            



# how many 一审刑事案件 are there after sampling?

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit

# Initialize Spark session
spark = SparkSession.builder.appName("CountCases").getOrCreate()

# Base path where the data is saved
base_path = "/mnt/processed_data_criminal_case_analysis"

# Causes of action: substring used when filtering -> output folder suffix
causes_of_action = {
    "盗窃": "theft",
    "交通肇事": "traffic_accidents",
    "毒品": "drug_related",
    "诈骗": "fraud"
}

# Dictionary to hold the count of cases for each cause of action
cases_count = {}

# Iterate over each cause of action to read the saved samples and count the rows
for cause, output_suffix in causes_of_action.items():
    # Glob over every per-file output folder written for this cause of action
    path = f"{base_path}/finetuning_training_data/{output_suffix}/*/*.csv"

    # Only the row count is needed, so schema inference (an extra pass over
    # the data) is skipped; header=True keeps header lines out of the count.
    cases_count[cause] = spark.read.csv(path, header=True).count()

# Print the counts for each cause of action
for cause, count in cases_count.items():
    print(f"{cause}案件数: {count}")


# save the output into one csv for data labeling

In [None]:
from pyspark.sql import SparkSession

# Get (or reuse) the Spark session for this cell
spark = SparkSession.builder.appName("AggregateCases").getOrCreate()

# Where the per-file samples are read from, and where the merged output goes
read_base_path = "/mnt/processed_data_criminal_case_analysis/finetuning_training_data"
save_base_path = "/mnt/processed_data_criminal_case_analysis/finetuning_training_data/aggregated_data"

# Causes of action: substring used when filtering -> output folder suffix
causes_of_action = {
    "盗窃": "theft",
    "交通肇事": "traffic_accidents",
    "毒品": "drug_related",
    "诈骗": "fraud"
}

# Only the folder suffixes are needed here; merge all samples for each one
for output_suffix in causes_of_action.values():
    # Glob matching every sampled CSV written for this cause of action
    read_path = f"{read_base_path}/{output_suffix}/*/*.csv"

    # Destination for the merged output of this cause of action
    save_path = f"{save_base_path}/{output_suffix}.csv"

    # Load all matching samples as one DataFrame
    combined_df = spark.read.csv(read_path, header=True, inferSchema=True)

    # Collapse to a single partition and write with a header row,
    # replacing any earlier output at the same path
    single_part_writer = combined_df.coalesce(1).write.mode("overwrite").option("header", "true")
    single_part_writer.csv(save_path)
