In [None]:
import os
import pandas as pd


def _is_valid_imprisonment_length(value):
    """Return True when ``value`` is a non-negative whole number <= 9999.

    ``str.isdecimal`` is used rather than ``str.isdigit`` because the latter
    also accepts characters such as superscript digits, which ``int`` then
    rejects with a ``ValueError``.
    """
    text = str(value)
    return text.isdecimal() and int(text) <= 9999


def drop_misalign_df(df):
    """Return the rows of ``df`` that pass every validity check.

    A row is kept only when all of the following hold:

    * ``public_defender`` is not missing;
    * ``Adcode`` is exactly six digits;
    * ``TotalImprisonmentLengthforCriminalA`` is a whole number no larger
      than 9999;
    * ``TrialProcedure`` contains the text ``一审``.

    The ``ResponseText`` column is removed from the result. The input frame
    is not modified.

    Args:
        df: DataFrame containing the columns named above.

    Returns:
        A new DataFrame holding the surviving rows (original index labels
        preserved) without the ``ResponseText`` column.

    Raises:
        KeyError: If any of the required columns is absent.
    """
    df = df.dropna(subset=['public_defender'])

    # Cast to str first so numeric or mixed-type columns are handled too;
    # na=False turns any remaining missing value into "no match" rather than
    # a NaN that would make the boolean mask unusable.
    adcode_ok = df['Adcode'].astype(str).str.match(r'^\d{6}$', na=False)

    # Build the mask with an explicit bool dtype: Series.apply on an empty
    # column returns an object-dtype result that pandas would not treat as a
    # boolean mask.
    length_ok = pd.Series(
        [
            _is_valid_imprisonment_length(value)
            for value in df['TotalImprisonmentLengthforCriminalA']
        ],
        index=df.index,
        dtype=bool,
    )

    trial_ok = df['TrialProcedure'].astype(str).str.contains('一审', na=False)

    # Combine all conditions so no filter is silently discarded.
    df_filtered = df[adcode_ok & length_ok & trial_ok]

    # Non-inplace drop avoids SettingWithCopyWarning on the filtered slice.
    return df_filtered.drop(columns='ResponseText')

def process_files(base_path, output_folder=None):
    """Clean every CSV file listed directly under ``base_path``.

    Each ``.csv`` entry is read with pandas, filtered with
    ``drop_misalign_df`` and, when any rows survive, written to
    ``output_folder`` under the same file name.

    The module-level counters ``total_rows_count`` and ``rows_dropped_count``
    are updated for every CSV file read, including files whose rows were all
    filtered out (those files produce no output file).

    Args:
        base_path: Directory passed to ``dbutils.fs.ls``.
        output_folder: DBFS directory (without the ``dbfs:`` scheme, e.g.
            ``"/mnt/..."``) that receives the cleaned files. Defaults to the
            ``4_cleaned_data_misalign_exists`` folder used by this pipeline.

    Returns:
        The updated value of the global ``rows_dropped_count``.
    """
    global total_rows_count, rows_dropped_count

    if output_folder is None:
        output_folder = "/mnt/processed_data_criminal_case_analysis/drug_related_data_enrich_cleaning_Apr_15/4_cleaned_data_misalign_exists/"

    # NOTE(review): `dbutils` is not defined in this file; it is presumably
    # the global provided by the Databricks notebook runtime.
    files = dbutils.fs.ls(base_path)

    for file in files:
        if not file.name.endswith(".csv"):
            continue

        # pandas does local file IO, so swap the "dbfs:" scheme for "/dbfs".
        file_path = file.path.replace("dbfs:", "/dbfs")
        df = pd.read_csv(file_path, on_bad_lines='skip')

        df_misalign_drop = drop_misalign_df(df)

        # Count before the empty check so fully-filtered files are reflected
        # in the totals as well.
        total_rows_count += len(df)
        rows_dropped_count += len(df) - len(df_misalign_drop)

        if df_misalign_drop.empty:
            print(f"No data after filtering for {file.name}. Moving to the next file.")
            continue

        # Local ("/dbfs"-prefixed) path of the cleaned CSV file.
        output_file_path = "/dbfs" + os.path.join(output_folder, f"{os.path.basename(file.name)}")

        # to_csv does not create missing directories.
        os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
        df_misalign_drop.to_csv(output_file_path, index=False)

    return rows_dropped_count


base_path = "/mnt/processed_data_criminal_case_analysis/drug_related_data_enrich_cleaning_Apr_15/2_lawyer_presence_data_enrich/"

# Reset the module-level counters that process_files declares as globals.
total_rows_count = 0
failed_fetch_count = 0
na_in_response_count = 0
rows_dropped_count = 0

rows_dropped_count = process_files(base_path)

# Report the run summary, one counter per line.
print(
    f"Total rows processed: {total_rows_count}",
    f"Failed API fetch count: {failed_fetch_count}",
    f"NA in response count: {na_in_response_count}",
    f"Rows dropped: {rows_dropped_count}",
    sep="\n",
)
