# Goal

Keep only first-instance (一审) drug-related criminal cases and store them in Data Lake Gen2.

Assemble the folder paths and define a helper that lists the CSV files under them.

In [0]:
# Mount point under which the per-year judgment data folders are looked up.
base_path = "/mnt/blob_berkeley_account"

# One "<year>_judgment_data" folder per year, 2016 through 2021 inclusive.
# TODO(review): the original author asked whether a dash is needed in this name — confirm against the mount.
folder_names = ["{}/{}_judgment_data".format(base_path, yr) for yr in range(2016, 2022)]

def list_csv_files(path):
    """
    Recursively collect the paths of all CSV files found under ``path``.

    Entries are taken from ``dbutils.fs.ls(path)`` in listing order. An entry
    whose path ends with ".csv" is recorded; any other entry that reports
    itself as a directory is descended into at that point, so results come
    back in depth-first order.

    Returns a list of path strings (empty when nothing matches).
    """
    csv_paths = []
    for entry in dbutils.fs.ls(path):
        entry_path = entry.path
        if entry_path.endswith(".csv"):
            csv_paths.append(entry_path)
            continue
        if entry.isDir():
            # Splice the sub-tree's matches in at this position.
            csv_paths += list_csv_files(entry_path)
    return csv_paths


Select the relevant columns and rename them to English.

In [0]:
# Source CSV header (Chinese) -> English column name used after renaming.
# Insertion order is the order in which the columns are selected downstream.
column_mapping = dict([
    ("原始链接", "OriginalLink"),
    ("案号", "CaseNumber"),
    ("案件名称", "CaseName"),
    ("法院", "Court"),
    ("所属地区", "Location"),
    ("案件类型", "CaseType"),
    ("审理程序", "TrialProcedure"),
    ("裁判日期", "JudgmentDate"),
    ("公开日期", "PublicationDate"),
    ("当事人", "PartiesInvolved"),
    ("案由", "CausesofAction"),
    ("法律依据", "LegalBasis"),
    ("全文", "FullText"),
])


Select the columns, rename them, keep first-instance (一审) criminal cases whose cause of action matches, then save the filtered data.

In [0]:
from pyspark.sql.functions import col
import re

# Causes of action to keep: substring searched for in CausesofAction -> suffix used in output names
causes_of_action = {
    "毒品": "drug_related",  # Mapping "毒品" to "drug_related"
}

# For every monthly CSV under every yearly folder: keep criminal (刑事),
# first-instance (一审) cases whose cause of action contains one of the
# configured substrings, and write each result as CSV under
# /mnt/processed_data_criminal_case_analysis/<suffix>_full_dataset/.
for folder in folder_names:
    csv_files = list_csv_files(folder)
    for file_path in csv_files:
        # Extract year and month from file_path first: they depend only on the
        # file name, so parse once per file (not once per cause) and fail
        # before paying for the CSV read if the name does not match.
        match = re.search(r'(\d{4})_(\d{2})_judgment\.csv', file_path)
        if not match:
            raise ValueError(f"Month pattern not found in the file path: {file_path}")
        year, month = match.groups()  # Example: ('2016', '01')

        # Read the CSV file into a DataFrame
        # NOTE(review): inferSchema=True costs an extra pass over each file and
        # every filter below is a string match — confirm it is needed. If the
        # full-text column can contain embedded newlines, the reader probably
        # also needs multiLine=True — verify against the raw data.
        df = spark.read.csv(file_path, header=True, inferSchema=True)

        # Select and rename the relevant columns
        df_selected_and_renamed = df.select(
            [col(source_name).alias(english_name) for source_name, english_name in column_mapping.items()]
        )

        # Filter to include only cases that mention "刑事" in CaseType
        df_criminal_cases = df_selected_and_renamed.filter(col("CaseType").contains("刑事"))

        # Keep only first-instance proceedings ("一审" in TrialProcedure)
        df_first_trial = df_criminal_cases.filter(col("TrialProcedure").contains("一审"))

        # Loop through each cause of action, filter and save
        for cause, output_suffix in causes_of_action.items():
            df_filtered = df_first_trial.filter(col("CausesofAction").contains(cause))

            saved_file_name = f"{year}_{month}_{output_suffix}_judgment_data"

            # Define the full path for saving the DataFrame
            full_path = f"/mnt/processed_data_criminal_case_analysis/{output_suffix}_full_dataset/{saved_file_name}"

            # Save the DataFrame (overwrites any previous output for this month)
            df_filtered.write.mode("overwrite").option("header", "true").csv(full_path)


# Test code for a small data set
 

In [0]:
# Check the mount status by listing the processed-data root and the 2013 folder.
for mount_path in (
    "/mnt/processed_data_criminal_case_analysis",
    "/mnt/processed_data_criminal_case_analysis/2013_judgment_data",
):
    display(dbutils.fs.ls(mount_path))

In [0]:
from pyspark.sql.functions import col
import re

# Test run: read from the single 2013 folder under the processed-data mount.
base_path = "/mnt/processed_data_criminal_case_analysis"
folder_names = [f"{base_path}/2013_judgment_data"]

def list_csv_files(path):
    """
    Return every CSV file path under ``path``, searching sub-directories too.

    Duplicates the definition from the first code cell; this later definition
    shadows it once the cell is run. Results follow the depth-first order of
    ``dbutils.fs.ls``: an entry ending in ".csv" is reported, any other entry
    that is a directory is walked in place.
    """
    def _walk(root):
        for node in dbutils.fs.ls(root):
            if node.path.endswith(".csv"):
                yield node.path
            elif node.isDir():
                yield from _walk(node.path)

    return list(_walk(path))

# Causes of action to filter and sample: substring searched for in
# CausesofAction -> suffix used in output folder and file names
causes_of_action = {
    "盗窃": "theft",
    "交通肇事": "traffic_accidents",
    "毒品": "drug_related",
    "诈骗": "fraud"
}

sample_fraction = 0.01
# Fixed seed so re-running the cell draws the same sample from the same input.
sample_seed = 42

# For every CSV under the test folder: keep criminal (刑事) cases, then for each
# cause of action write a ~1% sample as a single-part CSV under
# /mnt/processed_data_criminal_case_analysis/<suffix>/.
for folder in folder_names:
    csv_files = list_csv_files(folder)
    for file_path in csv_files:
        # Extract year and month from file_path first: they depend only on the
        # file name, so parse once per file (not once per cause) and fail
        # before paying for the CSV read if the name does not match.
        match = re.search(r'(\d{4})年(\d{2})月裁判文书数据\.csv', file_path)
        if not match:
            # Include the offending path, matching the message used by the full-dataset cell.
            raise ValueError(f"Month pattern not found in the file path: {file_path}")
        year, month = match.groups()  # e.g. ('2013', '01')

        # Read the CSV file into a DataFrame
        df = spark.read.csv(file_path, header=True, inferSchema=True)

        # Select and rename the relevant columns
        df_selected_and_renamed = df.select(
            [col(source_name).alias(english_name) for source_name, english_name in column_mapping.items()]
        )

        # Filter to include only cases that mention "刑事" in CaseType
        df_criminal_cases = df_selected_and_renamed.filter(col("CaseType").contains("刑事"))

        # Loop through each cause of action, filter, sample, and save
        for cause, output_suffix in causes_of_action.items():
            df_filtered = df_criminal_cases.filter(col("CausesofAction").contains(cause))
            # Sample the filtered DataFrame; the fraction is approximate, not an exact row count
            sampled_df = df_filtered.sample(fraction=sample_fraction, seed=sample_seed)

            saved_file_name = f"{year}_{month}_{output_suffix}_judgment_data"

            # Define the full path for saving the DataFrame
            full_path = f"/mnt/processed_data_criminal_case_analysis/{output_suffix}/{saved_file_name}"

            # Save the DataFrame; coalesce(1) yields a single part file per output folder
            sampled_df.coalesce(1).write.mode("overwrite").option("header", "true").csv(full_path)
            

