# load data

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, BooleanType, FloatType, IntegerType

# Start (or reuse) the Spark session used by the rest of the notebook
spark = SparkSession.builder.appName("DrugCrimeAnalysis").getOrCreate()

# Columns of the input as (name, Spark type) pairs, listed in schema order.
column_types = [
    ("OriginalLink", StringType()),
    ("CaseNumber", StringType()),
    ("CaseName", StringType()),
    ("Court", StringType()),
    ("Location", StringType()),
    ("CaseType", StringType()),
    ("TrialProcedure", StringType()),
    ("JudgmentDate", StringType()),
    ("PublicationDate", StringType()),
    ("PartiesInvolved", StringType()),
    ("CausesofAction", StringType()),
    ("LegalBasis", StringType()),
    ("FullText", StringType()),
    ("drug_a", StringType()),
    ("amount_a", FloatType()),
    ("Charge1forCriminalA", StringType()),
    ("FineforCriminalA", FloatType()),
    ("TotalImprisonmentLengthforCriminalA", IntegerType()),
    ("SuspendedforCriminalA", BooleanType()),
    ("Province", StringType()),
    ("City", StringType()),
    ("District", StringType()),
    ("CourtLevel", StringType()),
    ("Adcode", StringType()),
    ("lawyer", BooleanType()),
    ("public_defender", BooleanType()),
]

# Every field is declared nullable (third StructField argument is True).
schema = StructType([StructField(name, dtype, True) for name, dtype in column_types])

# Directory holding the cleaned CSV files (misaligned rows still present, per its name)
source_dir = "/mnt/processed_data_criminal_case_analysis/drug_related_data_enrich_cleaning_Apr_15/4_cleaned_data_misalign_exists/"

# Load data with defined schema OR NOT: the `schema=` argument sits on its own
# line so it can be commented out to load without the explicit types.
df = spark.read.csv(
    source_dir,
    header=True,
    schema=schema,
)

df.show()



# Distribution of imprisonment length: cases with vs. without a lawyer

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, floor
import pandas as pd
import matplotlib.pyplot as plt

# Create bins for the imprisonment lengths: floor(length / 3) * 3, i.e. bins of
# width 3 labelled by their lower edge. Re-running this line yields the same column.
df = df.withColumn("ImprisonmentBin", floor(col("TotalImprisonmentLengthforCriminalA") / 3) * 3)

# Convert 'lawyer' from boolean to integer for easier processing, but only on a
# derived frame: the shared `df` must keep its boolean `lawyer` column because
# the representation cell below combines it with `public_defender` using
# boolean operators (& and ~), which do not accept an integer operand.
lawyer_as_int_df = df.withColumn("lawyer", col("lawyer").cast("integer"))

# Group by imprisonment bin and lawyer presence, then count
df_grouped = lawyer_as_int_df.groupBy("ImprisonmentBin", "lawyer").count()

# Collect data to Pandas DataFrame for visualization
pdf = df_grouped.toPandas()

# Pivot the data for plotting. The columns are pinned to [0, 1] so that the
# colour list and legend labels below (both applied by position) always line
# up: a category with no rows becomes an all-zero column, and a group whose
# `lawyer` value is null is left out of the chart.
pivot_pdf = (
    pdf.pivot(index='ImprisonmentBin', columns='lawyer', values='count')
    .reindex(columns=[0, 1], fill_value=0)
    .fillna(0)
)

# Set explicit colors for the bars
colors = ['red', 'blue']  # Red for cases without a lawyer (0), Blue for cases with a lawyer (1)

# Create the grouped bar chart (one pair of bars per imprisonment bin)
pivot_pdf.plot(kind='bar', stacked=False, figsize=(12, 6), color=colors)
plt.title('Histogram of Imprisonment Lengths by Lawyer Presence')
plt.xlabel('Imprisonment Length (months)')
plt.ylabel('Number of Cases')
plt.xticks(rotation=45)
plt.legend(['No Lawyer', 'Lawyer'])
plt.show()


# Distribution of imprisonment length: lawyer, public defender, and no-representation cases

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, floor, when, lit
import pandas as pd
import matplotlib.pyplot as plt

# Normalise both flags to booleans before combining them. `lawyer` may have
# been cast to integer by an earlier cell, and the & / ~ operators below
# require boolean operands; casting a boolean column to boolean is a no-op.
has_lawyer = col("lawyer").cast("boolean")
is_public_defender = col("public_defender").cast("boolean")

# Categorize cases based on the type of legal representation.
# There is no otherwise(): rows where none of the conditions evaluates to true
# (e.g. a null flag) get a null Representation.
df = df.withColumn(
    "Representation",
    when(has_lawyer & ~is_public_defender, "Lawyer")
    .when(has_lawyer & is_public_defender, "Public Defender")
    .when(~has_lawyer, "No Representation")
)

# Create bins for the imprisonment lengths (floor(length / 3) * 3)
df = df.withColumn("ImprisonmentBin", floor(col("TotalImprisonmentLengthforCriminalA") / 3) * 3)

# Group by imprisonment bin and representation type, then count
df_grouped = df.groupBy("ImprisonmentBin", "Representation").count()

# Collect the data to a Pandas DataFrame for visualization
pdf = df_grouped.toPandas()

# Pivot the data for plotting
pivot_pdf = pdf.pivot(index='ImprisonmentBin', columns='Representation', values='count').fillna(0)

# pivot() orders the columns by label (Lawyer, No Representation, Public
# Defender), which does not match the colour list below. Reindex to the order
# the colours are written for; a category with no rows becomes an all-zero
# column and any column outside these three (null Representation) is dropped.
representation_order = ['Lawyer', 'Public Defender', 'No Representation']
pivot_pdf = pivot_pdf.reindex(columns=representation_order, fill_value=0)

# Plotting
# Set the colors for each representation category, in `representation_order`
colors = ['#1f77b4', '#2ca02c', '#d62728']  # Blue for 'Lawyer', Green for 'Public Defender', Red for 'No Representation'

# Create the stacked bar chart
pivot_pdf.plot(kind='bar', stacked=True, color=colors, figsize=(12, 6))

# Customize the plot with a grid, axis labels, and title
plt.title('Number of Cases by Imprisonment Length and Legal Representation', fontsize=14)
plt.xlabel('Imprisonment Length (months)', fontsize=12)
plt.ylabel('Number of Cases', fontsize=12)
plt.xticks(rotation=45)
plt.grid(True, linestyle='--', which='major', color='grey', alpha=0.5)

# Enhance the legend
plt.legend(title='Legal Representation', title_fontsize='13')

# Show the plot
plt.show()

# data cleaning: investigating misalignment

In [0]:
# Rows whose imprisonment length is greater than 9999, kept for manual
# inspection of possible column misalignment.
# NOTE(review): the "12000" in the variable and file names does not match the
# 9999 threshold actually applied; names are kept as-is in case other cells use them.
df_filtered_12000 = df.filter(col("TotalImprisonmentLengthforCriminalA")>9999.0)
df_filtered_12000.show()

# Overwrite any earlier dump: with the default save mode the write fails as
# soon as the output path exists, so this cell could only ever run once.
(
    df_filtered_12000.write
    .mode("overwrite")
    .csv("/mnt/processed_data_criminal_case_analysis/drug_related_data_enrich_cleaning_Apr_15/inspect_misalignment_12000.csv", header=True)
)

# If needed, count these rows to see how many there are.
# (Despite its name, this variable holds the count of rows above the threshold.)
count_of_non_numeric_rows = df_filtered_12000.count()
print(f"Number of rows with larger than 9999 on TotalImprisonmentLengthforCriminalA: {count_of_non_numeric_rows}")


In [0]:
#inspect misalignment - 陕西省
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Assumes the SparkSession and the DataFrame `df` from the load cell already exist.

# Find rows where TotalImprisonmentLengthforCriminalA holds the province name
# '陕西省' (Shaanxi Province) instead of a number.
# The column is cast to string so the comparison is string-to-string: comparing
# an integer-typed column with a non-numeric string makes Spark coerce the
# literal to a number, which can never match (and is a cast error under ANSI mode).
# NOTE(review): if `df` was loaded with the explicit schema (IntegerType for this
# column), non-numeric text was presumably already turned into null at read time,
# so this filter finds nothing — load without the schema to inspect such rows.
rows_with_issue = df.filter(
    col("TotalImprisonmentLengthforCriminalA").cast("string") == '陕西省'
)

# Show the results
rows_with_issue.show(truncate=False)

# If needed, you can also count these rows
count_of_issue_rows = rows_with_issue.count()
print(f"Number of rows with TotalImprisonmentLengthforCriminalA as '陕西省': {count_of_issue_rows}")


In [0]:
# inspect misalignment - non numeric value
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, isnan, regexp_extract
from pyspark.sql.types import IntegerType


# Pattern for a valid number: optional sign, an integer or decimal, optional
# exponent. Written as a raw string so that \d and \. reach the regex engine
# without relying on Python tolerating invalid escape sequences.
numeric_pattern = r"^[+-]?((\d+(\.\d*)?)|(\.\d+))([eE][+-]?\d+)?$"

# Using regexp_extract to find non-numeric values; we consider valid numbers both integers and floats.
# Group 0 is the whole match, so an empty result means the value did not match.
# The column is cast to string explicitly because regexp_extract works on text.
# Null values give a null comparison and are therefore NOT reported.
# NOTE(review): if `df` was loaded with the explicit schema (IntegerType for this
# column), non-numeric text was presumably already nulled at read time, so this
# finds nothing — load without the schema to inspect such rows.
non_numeric_rows = df.filter(
    regexp_extract(col("TotalImprisonmentLengthforCriminalA").cast("string"), numeric_pattern, 0) == ""
)


# Write non-numeric rows to a CSV file. Overwrite any earlier dump: with the
# default save mode the write fails once the output path exists.
(
    non_numeric_rows.write
    .mode("overwrite")
    .csv("/mnt/processed_data_criminal_case_analysis/drug_related_data_enrich_cleaning_Apr_15/inspect_misalignment.csv", header=True)
)

# If needed, count these rows to see how many there are
count_of_non_numeric_rows = non_numeric_rows.count()
print(f"Number of rows with non-numeric TotalImprisonmentLengthforCriminalA: {count_of_non_numeric_rows}")
