# Set Up

In [0]:
%pip install pyahocorasick  # Install known missing dependency first
%pip install git+https://github.com/jiaruisong/chinese_province_city_area_mapper.git

# Function: Location Extraction

In [0]:
import cpca

def location_extraction(location):
    """Parse Chinese place names with cpca and return English-named columns.

    Args:
        location: An iterable of address-like strings (for example court
            names). A single string is also accepted and is treated as a
            one-element list.

    Returns:
        The DataFrame produced by ``cpca.transform`` with its "省", "市",
        "区", "地址" and "adcode" columns renamed to "Province", "City",
        "District", "CourtLevel" and "Adcode". Columns not listed in the
        mapping are passed through unchanged.
    """
    # A bare string is itself iterable, so handing it to cpca.transform
    # directly would parse it one character at a time instead of as a whole.
    if isinstance(location, str):
        location = [location]

    parsed = cpca.transform(location)

    # NOTE(review): "地址" is assumed to hold the text left over once the
    # region names are removed, which for a court name would be the court
    # level suffix -- confirm against the cpca documentation.
    columns_mapping = {
        "省": "Province",
        "市": "City",
        "区": "District",
        "地址": "CourtLevel",
        "adcode": "Adcode",
    }

    return parsed.rename(columns=columns_mapping)



# Test Run

In [0]:
# Smoke test for location_extraction: a mixed batch of court names, followed
# by a batch holding only the made-up entry ("不存在县" reads as
# "nonexistent county") to see what comes back for an unknown place.
court_names = [
    "江苏省高级人民法院", "株洲市芦淞区人民法院",
    "湖南省高级人民法院", "云南省曲靖市中级人民法院",
    "湖南省长沙市中级人民法院", "湖南省长沙市中级人民法院",
    "上海市第二中级人民法院", "南宁市江南区人民法院",
    "不存在县人民法院", "深圳市龙岗区人民法院",
]
court_names_1 = ["不存在县人民法院"]

# Run each batch through the extractor and show the resulting table.
for sample in (court_names, court_names_1):
    df = location_extraction(sample)
    print(df)

# Location Extraction

In [0]:
import pandas as pd
import os


def process_files(base_path, cause):
    """Add location columns to the first-instance rows of each CSV of a cause.

    Lists ``<base_path>/<cause>`` with ``dbutils.fs.ls`` and, for every
    ``.csv`` file found inside each of its sub-directories, keeps the rows
    whose ``TrialProcedure`` contains "一审", appends the columns returned by
    ``location_extraction`` for the ``Court`` column, and writes the result
    to ``<base_path>/<cause>_Location_March_18/<file name>``.

    Args:
        base_path: Directory path without the ``dbfs:`` scheme; it is read
            through ``dbutils`` and written through the ``/dbfs`` mount.
        cause: Name of the sub-folder of ``base_path`` to process; also used
            as the prefix of the output folder name.

    Returns:
        None. Files with no matching rows are reported on stdout and skipped.
    """
    output_folder = f"{base_path}/{cause}_Location_March_18"
    # pandas reads and writes through the local /dbfs mount, not the dbfs: scheme.
    local_output_folder = f"/dbfs{output_folder}"

    for entry in dbutils.fs.ls(os.path.join(base_path, cause)):
        if not entry.isDir():
            continue

        for sub_file in dbutils.fs.ls(entry.path):
            if not sub_file.name.endswith(".csv"):
                continue

            file_path = sub_file.path.replace("dbfs:", "/dbfs")
            df = pd.read_csv(file_path, on_bad_lines='skip')

            # Keep first-instance trials only. Assigning the re-indexed copy
            # (instead of an in-place reset on a slice) avoids pandas'
            # SettingWithCopyWarning and gives a clean 0..n-1 index to align on.
            is_first_instance = df['TrialProcedure'].str.contains('一审', na=False)
            df_filtered = df[is_first_instance].reset_index(drop=True)

            if df_filtered.empty:
                print(f"No data after filtering for {sub_file.name}. Moving to the next file.")
                continue

            # Parse the whole 'Court' column in one call, using the filtered
            # rows so the output really contains first-instance cases only.
            address_data = location_extraction(df_filtered['Court'].tolist())

            # Share the filtered frame's index so concat lines rows up one-to-one.
            address_data.index = df_filtered.index
            df_enriched = pd.concat([df_filtered, address_data], axis=1)

            # The output folder does not exist on a first run; to_csv will
            # not create missing parent directories on its own.
            os.makedirs(local_output_folder, exist_ok=True)
            output_file_path = os.path.join(
                local_output_folder, os.path.basename(sub_file.name)
            )
            df_enriched.to_csv(output_file_path, index=False)


# Root folder that process_files joins with each cause name to find its input.
base_path = "/mnt/processed_data_criminal_case_analysis"
# Cause-of-action folder names to process; extend this list to cover more.
causes_of_action = ["drug_related"]

# Enrich the CSV files of every listed cause, one cause at a time.
for cause in causes_of_action:
    process_files(base_path, cause)
