# Set Up

In [1]:
%pip install pyahocorasick  # Install known missing dependency first
%pip install git+https://github.com/jiaruisong/chinese_province_city_area_mapper.git

Note: you may need to restart the kernel to use updated packages.
Collecting git+https://github.com/jiaruisong/chinese_province_city_area_mapper.git
  Cloning https://github.com/jiaruisong/chinese_province_city_area_mapper.git to /private/var/folders/pv/gnjct8s51217skw1p2lmnj9w0000gn/T/pip-req-build-9wgalnfk
  Running command git clone --filter=blob:none --quiet https://github.com/jiaruisong/chinese_province_city_area_mapper.git /private/var/folders/pv/gnjct8s51217skw1p2lmnj9w0000gn/T/pip-req-build-9wgalnfk
  Resolved https://github.com/jiaruisong/chinese_province_city_area_mapper.git to commit 9c5e2fe74f2c9c6568573043bbf5db051e9c0988
  Preparing metadata (setup.py) ... [?25ldone
Note: you may need to restart the kernel to use updated packages.


# Function: Location Extraction

In [2]:
import cpca

def location_extraction(location):
    """Parse Chinese place names out of court names and return them with English headers.

    The input is handed unchanged to ``cpca.transform``; the table it returns is
    then relabelled so that downstream code can work with English column names.
    Columns that are not listed in the mapping below keep their original names.

    Args:
        location: Collection of court-name strings (the notebook passes plain
            Python lists).

    Returns:
        A new DataFrame with the columns ``省``, ``市``, ``区``, ``地址`` and
        ``adcode`` renamed to ``Province``, ``City``, ``District``,
        ``CourtLevel`` and ``Adcode``. ``CourtLevel`` holds the text that is
        left after the place names are removed (e.g. ``高级人民法院``).

    Note:
        The matching is not strict: in this notebook's sample run the made-up
        name ``不存在县人民法院`` is still resolved to ``重庆市`` / ``县``, so
        results for unknown places should be checked before use.
    """
    parsed = cpca.transform(location)

    # Chinese header produced by cpca -> English header used in this project.
    return parsed.rename(
        columns={
            "省": "Province",
            "市": "City",
            "区": "District",
            "地址": "CourtLevel",
            "adcode": "Adcode",
        }
    )



# Test: location extraction on sample court names

In [3]:
# Sample court names covering province-level, city-level and district-level
# courts, one duplicate entry, and one made-up place ("不存在县").
court_names = [
    "江苏省高级人民法院",
    "株洲市芦淞区人民法院",
    "湖南省高级人民法院",
    "云南省曲靖市中级人民法院",
    "湖南省长沙市中级人民法院",
    "湖南省长沙市中级人民法院",
    "上海市第二中级人民法院",
    "南宁市江南区人民法院",
    "不存在县人民法院",
    "深圳市龙岗区人民法院"
]

# The made-up name on its own, to see how it is parsed in isolation.
court_names_1 = ["不存在县人民法院"]

# Parse the full list first, then the single-name list; `df` ends up holding
# the result of the last batch.
for _sample_batch in (court_names, court_names_1):
    df = location_extraction(_sample_batch)
    print(df)

  Province  City District CourtLevel  Adcode
0      江苏省  None     None     高级人民法院  320000
1      湖南省   株洲市      芦淞区       人民法院  430203
2      湖南省  None     None     高级人民法院  430000
3      云南省   曲靖市     None     中级人民法院  530300
4      湖南省   长沙市     None     中级人民法院  430100
5      湖南省   长沙市     None     中级人民法院  430100
6      上海市  None     None   第二中级人民法院  310000
7  广西壮族自治区   南宁市      江南区       人民法院  450105
8      重庆市     县     None       人民法院  500200
9      广东省   深圳市      龙岗区       人民法院  440307
  Province City District CourtLevel  Adcode
0      重庆市    县     None       人民法院  500200


# Location Extraction - bulk edits

In [4]:
import pandas as pd
import os

def process_files(base_path, cause, output_folder=None):
    """Add location columns to every CSV file in ``base_path`` and save the results.

    Each ``.csv`` file found directly inside ``base_path`` is read with pandas,
    its ``Court`` column is passed to ``location_extraction``, and the columns
    that come back are appended to the right of the original ones. The enriched
    table is written, under the same file name, into ``output_folder``.

    Args:
        base_path: Directory that holds the input CSV files.
        cause: Cause-of-action label supplied by the caller. The function does
            not use it; it is kept so existing calls keep working.
        output_folder: Directory for the enriched CSV files. When omitted, the
            output directory originally hard-coded in this notebook is used.

    Raises:
        KeyError: If an input file has no ``Court`` column.

    Note:
        A later cell of this notebook defines another ``process_files``; once
        that cell runs, it replaces this definition in the kernel.
    """
    if output_folder is None:
        output_folder = "/Users/jiaruisong/Documents/Coding/INFO 288 Big Data and Development_Data/drug_related_full_dataset_type_amount_cleaned_penalty_lawyer_location"

    # exist_ok=True makes re-running the cell safe and removes the gap between
    # "check that the folder exists" and "create it".
    os.makedirs(output_folder, exist_ok=True)

    for filename in os.listdir(base_path):
        if not filename.endswith(".csv"):
            continue

        # Malformed rows are skipped rather than aborting the whole file.
        df = pd.read_csv(os.path.join(base_path, filename), on_bad_lines='skip')

        # One parsed row comes back per court name, in the same order.
        address_data = location_extraction(df['Court'].tolist())

        # Give the parsed rows the input's index so the column-wise concat
        # lines them up with the rows they were derived from.
        address_data.index = df.index
        df_enriched = pd.concat([df, address_data], axis=1)

        df_enriched.to_csv(os.path.join(output_folder, filename), index=False)

# Folder that holds the input CSV files (an absolute path on the author's machine).
base_path = "/Users/jiaruisong/Documents/Coding/INFO 288 Big Data and Development_Data/drug_related_full_dataset_type_amount_cleaned_penalty_lawyer"
# Cause-of-action labels to run; only "drug_related" is listed here.
causes_of_action = ["drug_related"]

# Run the batch once per label. process_files as defined in this cell never
# reads `cause`, so each iteration processes the same base_path.
for cause in causes_of_action:
    process_files(base_path, cause)


# Test run: using a small file

In [None]:
import os
import pandas as pd


def process_files(base_path, cause, output_folder=None):
    """Databricks variant: enrich first-instance rows of each CSV with location columns.

    The entries returned by ``dbutils.fs.ls(base_path)`` are scanned for names
    ending in ``.csv``. Each such file is read through its ``/dbfs`` local path,
    reduced to the rows whose ``TrialProcedure`` contains ``一审``, and the
    ``Court`` values of those rows are passed to ``location_extraction``. The
    returned columns are appended to the filtered rows and the result is saved
    under the same file name in ``output_folder``. Files with no matching rows
    are reported with a message and skipped.

    ``dbutils`` is not defined in this notebook; presumably it is the global
    provided by the Databricks runtime.

    Args:
        base_path: DBFS location to list.
        cause: Cause-of-action label supplied by the caller. The function does
            not use it; it is kept so existing calls keep working.
        output_folder: DBFS-absolute output directory starting with ``/`` and
            without the ``dbfs:`` scheme (e.g. ``/mnt/...``). When omitted, the
            folder originally hard-coded in this notebook is used.

    Raises:
        KeyError: If a file lacks the ``TrialProcedure`` or ``Court`` column.

    Note:
        This definition replaces the ``process_files`` defined in the earlier
        cell once this cell runs.
    """
    if output_folder is None:
        output_folder = "/mnt/processed_data_criminal_case_analysis/drug_related_location_extraction_test_run_March_19"

    # pandas does local file IO, so the DBFS folder is addressed through the
    # "/dbfs" prefix instead of the "dbfs:" scheme.
    local_output_folder = "/dbfs" + output_folder

    files = dbutils.fs.ls(base_path)

    # Without this, the first to_csv call fails when the folder does not exist yet.
    os.makedirs(local_output_folder, exist_ok=True)

    for sub_file in files:
        if not sub_file.name.endswith(".csv"):
            continue

        # Convert the listed "dbfs:" path into its local "/dbfs" form for pandas.
        file_path = sub_file.path.replace("dbfs:", "/dbfs")
        df = pd.read_csv(file_path, on_bad_lines='skip')

        # Keep only rows whose 'TrialProcedure' contains '一审'; missing values
        # count as non-matching.
        df_filtered = df[df['TrialProcedure'].str.contains('一审', na=False)]

        if df_filtered.empty:
            print(
                f"No data after filtering for {sub_file.name}. Moving to the next file.")
            continue

        # One parsed row comes back per court name, in the same order.
        address_data = location_extraction(df_filtered['Court'].tolist())

        # The filtered rows keep their original (gappy) index; copy it so the
        # column-wise concat pairs each parsed row with its source row.
        address_data.index = df_filtered.index
        df_enriched = pd.concat([df_filtered, address_data], axis=1)

        output_file_path = os.path.join(
            local_output_folder, os.path.basename(sub_file.name))
        df_enriched.to_csv(output_file_path, index=False)


# Earlier input location, left commented out:
# base_path = "/mnt/processed_data_criminal_case_analysis/finetuning_training_data/drug_related_sample_training_data/"
# Input for this test run. NOTE(review): this names a single part-*.csv file
# rather than a folder; presumably dbutils.fs.ls returns one entry for it — confirm.
base_path = "/mnt/processed_data_criminal_case_analysis/drug_related_DrugTypeAmount_Penalty_March_19/part-00000-tid-560076838174988988-2cd33208-a73b-45e0-a942-97b7f01f5e87-3229-1-c000.csv"

# Cause-of-action labels to run; only "drug_related" is listed here.
causes_of_action = ["drug_related"]

for cause in causes_of_action:
    # NOTE(review): these counters are reset on every iteration but are not read
    # anywhere in the visible part of this notebook — looks like leftovers; confirm.
    total_rows_count, failed_fetch_count, na_in_response_count = 0, 0, 0
    # process_files as defined in this cell does not read `cause`.
    process_files(base_path, cause)