In [0]:


from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, BooleanType, FloatType, IntegerType

# Create the Spark session for this notebook (or reuse one that already exists)
spark = SparkSession.builder.appName("DrugCrimeAnalysis").getOrCreate()

# Column layout of the input CSV as (column name, Spark type) pairs, in file order.
# Every column is declared nullable when the schema is built below.
# NOTE(review): TotalImprisonmentLengthforCriminalA is FloatType while
# TotalImprisonmentLengthforCriminalB is IntegerType -- confirm the difference is intended.
_column_types = [
    ("OriginalLink", StringType()),
    ("CaseNumber", StringType()),
    ("CaseName", StringType()),
    ("Court", StringType()),
    ("Location", StringType()),
    ("CaseType", StringType()),
    ("TrialProcedure", StringType()),
    ("JudgmentDate", StringType()),
    ("PublicationDate", StringType()),
    ("PartiesInvolved", StringType()),
    ("CausesofAction", StringType()),
    ("LegalBasis", StringType()),
    ("FullText", StringType()),
    ("drug_a", StringType()),
    ("amount_a", FloatType()),
    ("drug_b", StringType()),
    ("amount_b", FloatType()),
    ("Charge1forCriminalA", StringType()),
    ("FineforCriminalA", IntegerType()),
    ("TotalImprisonmentLengthforCriminalA", FloatType()),
    ("SuspendedforCriminalA", BooleanType()),
    ("Charge1forCriminalB", StringType()),
    ("FineforCriminalB", IntegerType()),
    ("TotalImprisonmentLengthforCriminalB", IntegerType()),
    ("SuspendedforCriminalB", BooleanType()),
    ("Charge2forCriminalA", StringType()),
    ("Charge2forCriminalB", StringType()),
    ("Province", StringType()),
    ("City", StringType()),
    ("District", StringType()),
    ("CourtLevel", StringType()),
    ("Adcode", StringType()),
    ("TrimmedType", StringType()),
    ("TextAroundTrimmedPoint", StringType()),
    ("lawyer", BooleanType()),
    ("public_defender", BooleanType()),
]

schema = StructType(
    [StructField(column_name, column_type, True) for column_name, column_type in _column_types]
)

# Read the CSV directory using the schema above; the first line of each file is a header.
# (Drop the `schema` argument to load the data without the predefined schema.)
_input_path = "/mnt/processed_data_criminal_case_analysis/drug_related_data_enrich_cleaning_Apr_15/3_drop_ResponseTexts/"
df = spark.read.csv(_input_path, header=True, schema=schema)

# Preview the first rows
df.show()



In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import Bucketizer

#DEBUG:

# Keep only the rows whose TotalImprisonmentLengthforCriminalA is still non-NULL
# after being cast to float.
df_filtered = df.filter(col("TotalImprisonmentLengthforCriminalA").cast("float").isNotNull())


# Count the number of rows in the filtered DataFrame
row_count_filtered = df_filtered.count()

print(f"Total number of rows with non-NULL TotalImprisonmentLengthforCriminalA: {row_count_filtered}")

# Rows at or below the 9999 cut-off: preview them and save them for inspection.
df_filtered_12000 = df.filter(col("TotalImprisonmentLengthforCriminalA")<=9999.0)
df_filtered_12000.show()

df_filtered_12000.write.csv("/mnt/processed_data_criminal_case_analysis/drug_related_data_enrich_cleaning_Apr_15/inspect_misalignment_12000_temp.csv", header=True)

# The Spark session is deliberately NOT stopped here: the cells that follow still
# query `df`, which belongs to this session, so stopping it at this point would make
# a top-to-bottom run of the notebook fail.


# data cleaning: investigating misalignment

In [0]:
# Rows whose CriminalA sentence length is above the 9999 cut-off:
# preview them, save them to CSV, then report how many there are.
_length_column = "TotalImprisonmentLengthforCriminalA"
_over_cutoff_path = "/mnt/processed_data_criminal_case_analysis/drug_related_data_enrich_cleaning_Apr_15/inspect_misalignment_12000.csv"

df_filtered_12000 = df.where(col(_length_column) > 9999.0)
df_filtered_12000.show()

# Save the selected rows, including a header line
df_filtered_12000.write.option("header", True).csv(_over_cutoff_path)

# Report the size of the selection
count_of_non_numeric_rows = df_filtered_12000.count()
print(f"Number of rows with larger than 9999 on TotalImprisonmentLengthforCriminalA: {count_of_non_numeric_rows}")


In [0]:
#inspect misalignment - 陕西省
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Requires an active SparkSession `spark` and the DataFrame `df` from the load cell.
#
# The load cell's schema declares TotalImprisonmentLengthforCriminalA as FloatType,
# and a float column cannot hold text such as '陕西省'. To make shifted (misaligned)
# values visible, the same input is re-read here with every column typed as a string,
# reusing the column names and order of `df`.
RAW_INPUT_PATH = "/mnt/processed_data_criminal_case_analysis/drug_related_data_enrich_cleaning_Apr_15/3_drop_ResponseTexts/"
all_string_schema = StructType([StructField(name, StringType(), True) for name in df.columns])
df_raw_strings = spark.read.csv(RAW_INPUT_PATH, header=True, schema=all_string_schema)

# Rows whose raw TotalImprisonmentLengthforCriminalA text is exactly '陕西省'
rows_with_issue = df_raw_strings.filter(col("TotalImprisonmentLengthforCriminalA") == '陕西省')

# Show the matching rows without truncating long cell values
rows_with_issue.show(truncate=False)

# Report how many rows matched
count_of_issue_rows = rows_with_issue.count()
print(f"Number of rows with TotalImprisonmentLengthforCriminalA as '陕西省': {count_of_issue_rows}")


In [0]:
# inspect misalignment - non numeric value
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, isnan, regexp_extract
from pyspark.sql.types import IntegerType


# The load cell's schema declares TotalImprisonmentLengthforCriminalA as FloatType,
# and a float column cannot hold arbitrary text. To inspect the values as they appear
# in the file, the same input is re-read here with every column typed as a string,
# reusing the column names and order of `df`.
RAW_INPUT_PATH = "/mnt/processed_data_criminal_case_analysis/drug_related_data_enrich_cleaning_Apr_15/3_drop_ResponseTexts/"
all_string_schema = StructType([StructField(name, StringType(), True) for name in df.columns])
df_raw_strings = spark.read.csv(RAW_INPUT_PATH, header=True, schema=all_string_schema)

# A value counts as numeric when the whole string is an integer or a float, with an
# optional sign and an optional exponent. The pattern is a raw string so that `\d` and
# `\.` reach the regex engine unchanged instead of being treated as (invalid) Python
# string escapes.
NUMERIC_PATTERN = r"^[+-]?((\d+(\.\d*)?)|(\.\d+))([eE][+-]?\d+)?$"

# regexp_extract yields "" when the pattern does not match, so this keeps the rows
# whose value is not a number.
non_numeric_rows = df_raw_strings.filter(
    regexp_extract(col("TotalImprisonmentLengthforCriminalA"), NUMERIC_PATTERN, 0) == ""
)


# Write non-numeric rows to a CSV file
non_numeric_rows.write.csv("/mnt/processed_data_criminal_case_analysis/drug_related_data_enrich_cleaning_Apr_15/inspect_misalignment.csv", header=True)

# If needed, count these rows to see how many there are
count_of_non_numeric_rows = non_numeric_rows.count()
print(f"Number of rows with non-numeric TotalImprisonmentLengthforCriminalA: {count_of_non_numeric_rows}")
