option 1: spark

In [0]:
# Folder that the Spark alignment cell lists with dbutils.fs.ls.
base_path = "/mnt/processed_data_criminal_case_analysis/drug_related_DrugTypeAmount_Penalty_Location_March_19"

# Folder that the Spark alignment cell writes the aligned CSV output to.
output_path = "/mnt/processed_data_criminal_case_analysis/drug_related_DrugTypeAmount_Penalty_Location_DataframeAlignment_March_20"

from pyspark.sql.functions import lit

# Assuming `common_columns` is a list of all columns identified
def align_dataframe(df, common_columns):
    """Return ``df`` restricted to ``common_columns``, in that order.

    Any name in ``common_columns`` that ``df`` does not have is added as an
    all-null string column before the final select.

    Args:
        df: Spark DataFrame to align.
        common_columns: list of column names the result must contain.

    Returns:
        A Spark DataFrame whose columns are exactly ``common_columns``.
    """
    for name in common_columns:
        if name not in df.columns:
            # A bare lit(None) has NullType, which the CSV writer refuses to
            # serialise; casting gives the placeholder a writable type.
            df = df.withColumn(name, lit(None).cast("string"))
    return df.select(common_columns)

# Target column list for the Spark alignment: align_dataframe adds any of
# these that a file is missing and selects them in exactly this order.
common_columns = [
    "OriginalLink",
    "CaseNumber",
    "CaseName",
    "Court",
    "Location",
    "CaseType",
    "TrialProcedure",
    "JudgmentDate",
    "PublicationDate",
    "PartiesInvolved",
    "CausesofAction",
    "LegalBasis",
    "FullText",
    "drug_a",
    "amount_a",
    "drug_b",
    "amount_b",
    "drug_c",
    "amount_c",
    "ResponseText",
    "Charge1forCriminalA",
    "FineforCriminalA",
    "TotalImprisonmentLengthforCriminalA",
    "SuspendedforCriminalA",
    "Charge2forCriminalA",
    "Charge1forCriminalB",
    "Charge2forCriminalB",
    "FineforCriminalB",
    "TotalImprisonmentLengthforCriminalB",
    "SuspendedforCriminalB",
    "Charge1forCriminalC",
    "Charge2forCriminalC",
    "FineforCriminalC",
    "TotalImprisonmentLengthforCriminalC",
    "SuspendedforCriminalC",
    "Charge1forCriminalD",
    "Charge2forCriminalD",
    "FineforCriminalD",
    "TotalImprisonmentLengthforCriminalD",
    "SuspendedforCriminalD",
    "Charge1forCriminalE",
    "Charge2forCriminalE",
    "FineforCriminalE",
    "TotalImprisonmentLengthforCriminalE",
    "SuspendedforCriminalE",
    "Charge1forCriminalF",
    "Charge2forCriminalF",
    "FineforCriminalF",
    "TotalImprisonmentLengthforCriminalF",
    "SuspendedforCriminalF",
    "Charge3forCriminalC",
    "Province",
    "City",
    "District",
    "CourtLevel",
    "Adcode"
]


In [0]:
# List the input folder; every entry whose name ends in ".csv" is aligned.
sub_files = dbutils.fs.ls(base_path)

# All files share one output path. The first write replaces whatever is
# already there; later writes append. Using "overwrite" for every file would
# leave only the last file's rows on disk.
write_mode = "overwrite"

for sub_file in sub_files:
    if sub_file.name.endswith(".csv"):
        file_path = sub_file.path
        df = spark.read.format("csv") \
            .option("header", "true") \
            .option("inferSchema", "true") \
            .load(file_path)
        df_aligned = align_dataframe(df, common_columns)
        df_aligned.write.mode(write_mode).option("header", "true").csv(output_path)
        write_mode = "append"

        print(f"Aligned dataframe for {file_path}")
        df_aligned.printSchema()
        

option 2: spark sql

In [0]:
# Register the DataFrame as a temp view.
# `df` must already exist in the session (an earlier cell has to have created it).
df.createOrReplaceTempView("cases_view")

# Select rows whose TotalImprisonmentLengthforCriminalA is present but does
# not cast to INT. Every column in the SELECT list is comma-separated: a
# missing comma makes Spark SQL read "colA colB" as "colA AS colB", which
# mislabels colA and silently drops colB from the result.
non_integer_values_query = """
SELECT
    OriginalLink, CaseNumber, CaseName, Court, Location, CaseType, TrialProcedure,
    JudgmentDate, PublicationDate, TotalImprisonmentLengthforCriminalA,
    PartiesInvolved, CausesofAction, LegalBasis, FullText,
    drug_a, amount_a, drug_b, amount_b, ResponseText,
    Charge1forCriminalA, FineforCriminalA, SuspendedforCriminalA, Charge2forCriminalA,
    Charge1forCriminalB, Charge2forCriminalB, FineforCriminalB,
    TotalImprisonmentLengthforCriminalB, SuspendedforCriminalB,
    Province, City, District, CourtLevel, Adcode
FROM cases_view
WHERE CAST(TotalImprisonmentLengthforCriminalA AS INT) IS NULL
AND TotalImprisonmentLengthforCriminalA IS NOT NULL
"""

non_integer_values = spark.sql(non_integer_values_query)

# Show the results
non_integer_values.show()


In [0]:
from pyspark.sql.functions import col, isnan, when, count

# Same check as the SQL cell, written with the DataFrame API.
# `df` is the DataFrame under inspection. This cell looks at
# 'TotalImprisonmentLengthforCriminalB'; to inspect another
# TotalImprisonmentLength column, change the name everywhere it appears below.

# Add an integer-cast copy of the column next to the original.
df_with_cast = df.withColumn(
    "TotalImprisonmentLengthInt",
    col("TotalImprisonmentLengthforCriminalB").cast("int"),
)

# A null cast result alongside a non-null original marks a value that did not
# convert to an integer.
non_integer_rows = df_with_cast.filter(
    col("TotalImprisonmentLengthInt").isNull()
    & col("TotalImprisonmentLengthforCriminalB").isNotNull()
)

# Print up to 100 offending values, untruncated.
non_integer_rows.select("TotalImprisonmentLengthforCriminalB").show(100, truncate=False)
# Alternative (shows every offending row in full):
# non_integer_rows.show(non_integer_rows.count(), truncate=False)


option 3: pandas

In [0]:
# Columns kept by the pandas alignment cell: any other column in a file is
# dropped, and any of these that a file lacks is added as an empty column.
all_columns = [
    "OriginalLink",
    "CaseNumber",
    "CaseName",
    "Court",
    "Location",
    "CaseType",
    "TrialProcedure",
    "JudgmentDate",
    "PublicationDate",
    "PartiesInvolved",
    "CausesofAction",
    "LegalBasis",
    "FullText",
    "drug_a",
    "amount_a",
    "drug_b",
    "amount_b",
    "Charge1forCriminalA",
    "FineforCriminalA",
    "TotalImprisonmentLengthforCriminalA",
    "SuspendedforCriminalA",
    "Charge2forCriminalA",
    "Charge1forCriminalB",
    "Charge2forCriminalB",
    "FineforCriminalB",
    "TotalImprisonmentLengthforCriminalB",
    "SuspendedforCriminalB",
    "Province",
    "City",
    "District",
    "CourtLevel",
    "Adcode"

]


Check that every file has the same set of columns before loading into SQL

In [0]:
import os
import pandas as pd
from collections import Counter





def align_dataframe(base_path, cause):
    """Report where the column set changes between consecutive CSV files.

    Walks the ``*.csv`` entries returned by ``dbutils.fs.ls(base_path)`` in
    listing order, reads each with pandas, and compares its column names
    (ignoring order) with those of the previously read file. On a mismatch the
    current and the previous column lists are printed. The first file is
    compared against a placeholder list, so it is always printed.

    Args:
        base_path: folder to list with ``dbutils.fs.ls``.
        cause: accepted for call compatibility; not used.
    """
    last_columns = ['this is the first time']

    for entry in dbutils.fs.ls(base_path):
        if not entry.name.endswith(".csv"):
            continue

        # pandas reads through the local /dbfs mount, not the dbfs: scheme.
        local_path = entry.path.replace("dbfs:", "/dbfs")
        frame = pd.read_csv(local_path, on_bad_lines='skip')

        current_columns = list(frame.columns)
        if Counter(current_columns) != Counter(last_columns):
            print(current_columns)
            print(last_columns)

        last_columns = current_columns
            


# Folder whose CSV files are compared.
# NOTE(review): this path has no leading "/", unlike the "/mnt/..." paths used
# in other cells — confirm dbutils.fs.ls resolves it to the intended folder.
base_path = "mnt/processed_data_criminal_case_analysis/drug_related_DrugTypeAmount_Penalty_Location_DataframeAlignment_March_20"

# One run per cause of action; the value is passed through as `cause`.
causes_of_action = ["drug_related"]

for cause in causes_of_action:
    align_dataframe(base_path, cause)

In [0]:
import os
import pandas as pd


def align_dataframe(base_path, cause):
    """Align every CSV under ``base_path`` to ``all_columns`` and save a cleaned copy.

    For each ``*.csv`` entry returned by ``dbutils.fs.ls(base_path)`` the file
    is read with pandas and then:

    * reduced to exactly ``all_columns``, in that order (extra columns are
      dropped, missing ones are added empty);
    * stripped of rows whose ``FullText`` is missing or an empty string;
    * has ``TotalImprisonmentLengthforCriminalA`` / ``...B`` values containing
      '无期' replaced with 9998 and values containing '死刑' replaced with 9999.

    The result is written, under the same file name, to the hard-coded output
    folder on the local ``/dbfs`` mount.

    Args:
        base_path: folder to list with ``dbutils.fs.ls``.
        cause: accepted for call compatibility; not used.

    Returns:
        None.
    """
    # DBFS-relative output folder; joined onto the local /dbfs mount for pandas IO.
    output_folder = "mnt/processed_data_criminal_case_analysis/drug_related_DrugTypeAmount_Penalty_Location_DataframeAlignment_March_20"
    local_output_folder = os.path.join("/dbfs", output_folder)
    # DataFrame.to_csv does not create missing parent directories.
    os.makedirs(local_output_folder, exist_ok=True)

    for sub_file in dbutils.fs.ls(base_path):
        if not sub_file.name.endswith(".csv"):
            continue

        # pandas reads through the local /dbfs mount, not the dbfs: scheme.
        file_path = sub_file.path.replace("dbfs:", "/dbfs")
        df = pd.read_csv(file_path, on_bad_lines='skip')

        # One step drops unwanted columns, adds missing ones as empty, and
        # puts every file in the same column order.
        df = df.reindex(columns=all_columns)

        # Keep only rows that actually carry a judgment text. The explicit
        # copy makes the assignments below act on an independent frame.
        has_text = df['FullText'].notna() & (df['FullText'] != '')
        df = df.loc[has_text].copy()

        # Encode life imprisonment ('无期') as 9998 and death penalty ('死刑') as 9999.
        for x in ['A', 'B']:
            column = f'TotalImprisonmentLengthforCriminal{x}'
            # Compare on a string view so numeric or all-empty columns simply
            # produce no matches instead of raising on the .str accessor.
            as_text = df[column].astype(str)
            is_life = as_text.str.contains('无期', na=False)
            # A value mentioning both keeps the life-imprisonment code.
            is_death = as_text.str.contains('死刑', na=False) & ~is_life
            if is_life.any():
                df.loc[is_life, column] = 9998
            if is_death.any():
                df.loc[is_death, column] = 9999

        output_file_path = os.path.join(
            local_output_folder, os.path.basename(sub_file.name))
        df.to_csv(output_file_path, index=False)


# Folder whose CSV files are aligned and cleaned.
base_path = "/mnt/processed_data_criminal_case_analysis/drug_related_DrugTypeAmount_Penalty_Location_March_19"

# One run per cause of action; the value is passed through as `cause`.
causes_of_action = ["drug_related"]

for cause in causes_of_action:
    align_dataframe(base_path, cause)

# data cleaning for drug type

## inspect unique values

In [0]:
import os
import pandas as pd


def align_dataframe(base_path, cause):
    """Print the distinct ``drug_a`` / ``drug_b`` values per file and overall.

    Each ``*.csv`` entry returned by ``dbutils.fs.ls(base_path)`` is read with
    pandas. For each of the two drug columns the distinct values are printed,
    prefixed with the 1-based number of the CSV file; a missing column is
    reported instead. After all files, the set of every value seen is printed.

    Args:
        base_path: folder to list with ``dbutils.fs.ls``.
        cause: accepted for call compatibility; not used.
    """
    seen_values = []
    file_number = 0

    for entry in dbutils.fs.ls(base_path):
        if not entry.name.endswith(".csv"):
            continue

        # pandas reads through the local /dbfs mount, not the dbfs: scheme.
        file_path = entry.path.replace("dbfs:", "/dbfs")
        frame = pd.read_csv(file_path, on_bad_lines='skip')
        file_number += 1

        for suffix in ('a', 'b'):
            if f'drug_{suffix}' not in frame.columns:
                print("key error", suffix, "in file path:", file_path)
                continue
            distinct = frame[f'drug_{suffix}'].unique()
            seen_values.extend(distinct)
            print(f"{file_number}, Unique values for drug_{suffix} in {file_path}: {distinct}")

    # Collapse the per-file values into one overall set.
    print("Overall unique values across all files:", set(seen_values))


# Folder whose CSV files are inspected.
# NOTE(review): this path has no leading "/", unlike the "/mnt/..." paths used
# in other cells — confirm dbutils.fs.ls resolves it to the intended folder.
base_path = "mnt/processed_data_criminal_case_analysis/drug_related_DrugTypeAmount_Penalty_Location_DataframeAlignment_March_20"

# One run per cause of action; the value is passed through as `cause`.
causes_of_action = ["drug_related"]

for cause in causes_of_action:
    align_dataframe(base_path, cause)




# data cleaning for drug amount

## inspect unique values of drug amount

In [0]:
import os
import pandas as pd


def align_dataframe(base_path, cause):
    """Print the distinct ``amount_a`` / ``amount_b`` values per file and overall.

    Each ``*.csv`` entry returned by ``dbutils.fs.ls(base_path)`` is read with
    pandas. For each of the two amount columns the distinct values are
    printed, prefixed with the 1-based number of the CSV file; a missing
    column is reported instead. After all files, the set of every value seen
    is printed.

    Args:
        base_path: folder to list with ``dbutils.fs.ls``.
        cause: accepted for call compatibility; not used.
    """
    # Using dbutils.fs.ls to list directories/files
    files = dbutils.fs.ls(base_path)

    # Collects the distinct values of every file so they can be merged at the end.
    unique_values = []
    file_number = 0

    for sub_file in files:
        if not sub_file.name.endswith(".csv"):
            continue

        # pandas reads through the local /dbfs mount, not the dbfs: scheme.
        file_path = sub_file.path.replace("dbfs:", "/dbfs")
        df = pd.read_csv(file_path, on_bad_lines='skip')

        # Advance once per CSV so the printed prefix identifies the file
        # (without this every line would be labelled 0).
        file_number += 1

        for x in ['a', 'b']:
            try:
                unique_amount_values = df[f'amount_{x}'].unique()
            except KeyError:
                print("key error", x, "in file path:", file_path)
                continue
            unique_values.extend(unique_amount_values)
            print(f"{file_number}, Unique values for amount_{x} in {file_path}: {unique_amount_values}")

    # Find overall unique values across all files
    overall_unique_values = set(unique_values)
    print("Overall unique values across all files:", overall_unique_values)


# Folder whose CSV files are inspected.
# NOTE(review): this path has no leading "/", unlike the "/mnt/..." paths used
# in other cells — confirm dbutils.fs.ls resolves it to the intended folder.
base_path = "mnt/processed_data_criminal_case_analysis/drug_related_DrugTypeAmount_Penalty_Location_DataframeAlignment_March_20"

# One run per cause of action; the value is passed through as `cause`.
causes_of_action = ["drug_related"]

for cause in causes_of_action:
    align_dataframe(base_path, cause)


## drop the '克' / 'g' unit suffix and replace non-numeric values with NA

In [0]:
# replace the values without saving to csv file

import os
import pandas as pd

def align_dataframe(base_path, cause):
    """Print the distinct cleaned ``amount_a`` / ``amount_b`` values per file and overall.

    Each ``*.csv`` entry returned by ``dbutils.fs.ls(base_path)`` is read with
    pandas and both amount columns are passed through ``clean_amount``. The
    cleaned distinct values are printed, prefixed with the 1-based number of
    the CSV file; a missing column is reported instead. After all files, the
    set of every cleaned value seen is printed. Nothing is written back to
    disk.

    Args:
        base_path: folder to list with ``dbutils.fs.ls``.
        cause: accepted for call compatibility; not used.
    """
    # Using dbutils.fs.ls to list directories/files
    files = dbutils.fs.ls(base_path)

    # Collects the cleaned distinct values of every file for the final merge.
    unique_values = []
    file_number = 0

    for sub_file in files:
        if not sub_file.name.endswith(".csv"):
            continue

        # pandas reads through the local /dbfs mount, not the dbfs: scheme.
        file_path = sub_file.path.replace("dbfs:", "/dbfs")
        df = pd.read_csv(file_path, on_bad_lines='skip')

        # Advance once per CSV so the printed prefix identifies the file
        # (without this every line would be labelled 0).
        file_number += 1

        for x in ['a', 'b']:
            column_name = f'amount_{x}'
            try:
                # Clean and transform the amount data (in memory only)
                df[column_name] = df[column_name].apply(clean_amount)
            except KeyError:
                print("Key error", x, "in file path:", file_path)
                continue

            unique_amount_values = df[column_name].unique()
            unique_values.extend(unique_amount_values)
            print(f"{file_number}, Cleaned unique values for {column_name} in {file_path}: {unique_amount_values}")

    # Find overall unique values across all files
    overall_unique_values = set(unique_values)
    print("Overall unique values across all files:", overall_unique_values)

def clean_amount(value):
    """Convert one raw drug-amount cell to a float.

    Missing values are returned unchanged. For strings, every '克' and 'g'
    is removed and surrounding whitespace is trimmed before conversion.

    Args:
        value: raw cell value (string, number, or missing).

    Returns:
        The amount as a ``float``; ``pd.NA`` when the remaining text is not a
        valid number; the original value when it is missing.
    """
    if pd.isna(value):
        return value

    # Only strings carry unit markers; other values go to float() as they are.
    candidate = (
        value.replace('克', '').replace('g', '').strip()
        if isinstance(value, str)
        else value
    )

    try:
        amount = float(candidate)
    except ValueError:
        return pd.NA
    return amount

# Folder whose CSV files are cleaned in memory and inspected.
# NOTE(review): this path has no leading "/", unlike the "/mnt/..." paths used
# in other cells — confirm dbutils.fs.ls resolves it to the intended folder.
base_path = "mnt/processed_data_criminal_case_analysis/drug_related_DrugTypeAmount_Penalty_Location_DataframeAlignment_March_20"
# One run per cause of action; the value is passed through as `cause`.
causes_of_action = ["drug_related"]

for cause in causes_of_action:
    align_dataframe(base_path, cause)


# data cleaning for charge

## inspect unique values

In [0]:
import os
import pandas as pd


def align_dataframe(base_path, cause):
    """Print the distinct charge values per file and per column overall.

    Each ``*.csv`` entry returned by ``dbutils.fs.ls(base_path)`` is read with
    pandas. For ``Charge1``/``Charge2`` of criminals ``A`` and ``B`` the
    distinct values are printed per file; a missing column is reported
    instead. After all files, the merged set of values is printed for every
    column that was found at least once.

    Args:
        base_path: folder to list with ``dbutils.fs.ls``.
        cause: accepted for call compatibility; not used.
    """
    # Same visiting order as a nested loop: criminal outermost, charge number innermost.
    charge_columns = [
        f'Charge{number}forCriminal{criminal}'
        for criminal in ('A', 'B')
        for number in ('1', '2')
    ]
    values_by_column = {}

    for entry in dbutils.fs.ls(base_path):
        if not entry.name.endswith(".csv"):
            continue

        # pandas reads through the local /dbfs mount, not the dbfs: scheme.
        file_path = entry.path.replace("dbfs:", "/dbfs")
        frame = pd.read_csv(file_path, on_bad_lines='skip')

        for column_name in charge_columns:
            if column_name not in frame.columns:
                print(f"Key error: Column {column_name} does not exist in {file_path}")
                continue
            distinct = frame[column_name].unique()
            # Create the per-column set on first sight, then merge into it.
            values_by_column.setdefault(column_name, set()).update(distinct)
            print(f"Unique values for {column_name} in {file_path}: {distinct}")

    for column_name, values in values_by_column.items():
        print(f"Overall unique values for {column_name}: {values}")



# Folder whose CSV files are inspected.
# NOTE(review): this path has no leading "/", unlike the "/mnt/..." paths used
# in other cells — confirm dbutils.fs.ls resolves it to the intended folder.
base_path = "mnt/processed_data_criminal_case_analysis/drug_related_DrugTypeAmount_Penalty_Location_DataframeAlignment_March_20"

# One run per cause of action; the value is passed through as `cause`.
causes_of_action = ["drug_related"]

for cause in causes_of_action:
    align_dataframe(base_path, cause)


# data cleaning for fine

## inspect unique values

In [0]:
import os
import pandas as pd


def align_dataframe(base_path, cause):
    """Print the distinct fine values of criminals A and B for every CSV file.

    Each ``*.csv`` entry returned by ``dbutils.fs.ls(base_path)`` is read with
    pandas; for ``FineforCriminalA`` and ``FineforCriminalB`` the distinct
    values are printed. A column that a file does not have is skipped
    silently.

    Args:
        base_path: folder to list with ``dbutils.fs.ls``.
        cause: accepted for call compatibility; not used.
    """
    for entry in dbutils.fs.ls(base_path):
        if not entry.name.endswith(".csv"):
            continue

        # pandas reads through the local /dbfs mount, not the dbfs: scheme.
        local_path = entry.path.replace("dbfs:", "/dbfs")
        frame = pd.read_csv(local_path, on_bad_lines='skip')

        for criminal in ('A', 'B'):
            fine_column = f'FineforCriminal{criminal}'
            if fine_column in frame.columns:
                print(frame[fine_column].unique())


# Folder whose CSV files are inspected.
# NOTE(review): this path has no leading "/", unlike the "/mnt/..." paths used
# in other cells — confirm dbutils.fs.ls resolves it to the intended folder.
base_path = "mnt/processed_data_criminal_case_analysis/drug_related_DrugTypeAmount_Penalty_Location_DataframeAlignment_March_20"

# One run per cause of action; the value is passed through as `cause`.
causes_of_action = ["drug_related"]

for cause in causes_of_action:
    align_dataframe(base_path, cause)


# data cleaning for suspended

## inspect unique values

In [0]:
import os
import pandas as pd


def align_dataframe(base_path, cause):
    """Print the distinct suspended-sentence values of criminals A and B per CSV file.

    Each ``*.csv`` entry returned by ``dbutils.fs.ls(base_path)`` is read with
    pandas; for ``SuspendedforCriminalA`` and ``SuspendedforCriminalB`` the
    distinct values are printed. A column that a file does not have is
    skipped silently.

    Args:
        base_path: folder to list with ``dbutils.fs.ls``.
        cause: accepted for call compatibility; not used.
    """
    for entry in dbutils.fs.ls(base_path):
        if not entry.name.endswith(".csv"):
            continue

        # pandas reads through the local /dbfs mount, not the dbfs: scheme.
        local_path = entry.path.replace("dbfs:", "/dbfs")
        frame = pd.read_csv(local_path, on_bad_lines='skip')

        for criminal in ('A', 'B'):
            suspended_column = f'SuspendedforCriminal{criminal}'
            if suspended_column in frame.columns:
                print(frame[suspended_column].unique())


# Folder whose CSV files are inspected.
# NOTE(review): this path has no leading "/", unlike the "/mnt/..." paths used
# in other cells — confirm dbutils.fs.ls resolves it to the intended folder.
base_path = "mnt/processed_data_criminal_case_analysis/drug_related_DrugTypeAmount_Penalty_Location_DataframeAlignment_March_20"

# One run per cause of action; the value is passed through as `cause`.
causes_of_action = ["drug_related"]

for cause in causes_of_action:
    align_dataframe(base_path, cause)


## bulk edit: set suspended to 1 or 0.